summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--arch/x86/include/asm/kvm_host.h17
-rw-r--r--arch/x86/include/uapi/asm/msr-index.h3
-rw-r--r--arch/x86/include/uapi/asm/vmx.h5
-rw-r--r--arch/x86/kvm/emulate.c144
-rw-r--r--arch/x86/kvm/lapic.c73
-rw-r--r--arch/x86/kvm/lapic.h3
-rw-r--r--arch/x86/kvm/mmu.c112
-rw-r--r--arch/x86/kvm/mmu.h17
-rw-r--r--arch/x86/kvm/svm.c4
-rw-r--r--arch/x86/kvm/trace.h20
-rw-r--r--arch/x86/kvm/vmx.c240
-rw-r--r--arch/x86/kvm/x86.c42
-rw-r--r--arch/x86/kvm/x86.h3
-rw-r--r--virt/kvm/kvm_main.c1
14 files changed, 514 insertions, 170 deletions
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index d89c6b828c96..4327af53e544 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -51,7 +51,7 @@
| X86_CR0_NW | X86_CR0_CD | X86_CR0_PG))
#define CR3_L_MODE_RESERVED_BITS 0xFFFFFF0000000000ULL
-#define CR3_PCID_INVD (1UL << 63)
+#define CR3_PCID_INVD BIT_64(63)
#define CR4_RESERVED_BITS \
(~(unsigned long)(X86_CR4_VME | X86_CR4_PVI | X86_CR4_TSD | X86_CR4_DE\
| X86_CR4_PSE | X86_CR4_PAE | X86_CR4_MCE \
@@ -160,6 +160,18 @@ enum {
#define DR7_FIXED_1 0x00000400
#define DR7_VOLATILE 0xffff2bff
+#define PFERR_PRESENT_BIT 0
+#define PFERR_WRITE_BIT 1
+#define PFERR_USER_BIT 2
+#define PFERR_RSVD_BIT 3
+#define PFERR_FETCH_BIT 4
+
+#define PFERR_PRESENT_MASK (1U << PFERR_PRESENT_BIT)
+#define PFERR_WRITE_MASK (1U << PFERR_WRITE_BIT)
+#define PFERR_USER_MASK (1U << PFERR_USER_BIT)
+#define PFERR_RSVD_MASK (1U << PFERR_RSVD_BIT)
+#define PFERR_FETCH_MASK (1U << PFERR_FETCH_BIT)
+
/* apic attention bits */
#define KVM_APIC_CHECK_VAPIC 0
/*
@@ -615,6 +627,8 @@ struct kvm_arch {
#ifdef CONFIG_KVM_MMU_AUDIT
int audit_point;
#endif
+
+ bool boot_vcpu_runs_old_kvmclock;
};
struct kvm_vm_stat {
@@ -753,6 +767,7 @@ struct kvm_x86_ops {
void (*set_virtual_x2apic_mode)(struct kvm_vcpu *vcpu, bool set);
void (*set_apic_access_page_addr)(struct kvm_vcpu *vcpu, hpa_t hpa);
void (*deliver_posted_interrupt)(struct kvm_vcpu *vcpu, int vector);
+ bool (*test_posted_interrupt)(struct kvm_vcpu *vcpu, int vector);
void (*sync_pir_to_irr)(struct kvm_vcpu *vcpu);
int (*set_tss_addr)(struct kvm *kvm, unsigned int addr);
int (*get_tdp_level)(void);
diff --git a/arch/x86/include/uapi/asm/msr-index.h b/arch/x86/include/uapi/asm/msr-index.h
index c8aa65d56027..d0050f25ea80 100644
--- a/arch/x86/include/uapi/asm/msr-index.h
+++ b/arch/x86/include/uapi/asm/msr-index.h
@@ -356,6 +356,9 @@
#define MSR_IA32_UCODE_WRITE 0x00000079
#define MSR_IA32_UCODE_REV 0x0000008b
+#define MSR_IA32_SMM_MONITOR_CTL 0x0000009b
+#define MSR_IA32_SMBASE 0x0000009e
+
#define MSR_IA32_PERF_STATUS 0x00000198
#define MSR_IA32_PERF_CTL 0x00000199
#define MSR_AMD_PSTATE_DEF_BASE 0xc0010064
diff --git a/arch/x86/include/uapi/asm/vmx.h b/arch/x86/include/uapi/asm/vmx.h
index b813bf9da1e2..ff2b8e28883e 100644
--- a/arch/x86/include/uapi/asm/vmx.h
+++ b/arch/x86/include/uapi/asm/vmx.h
@@ -56,6 +56,7 @@
#define EXIT_REASON_MSR_READ 31
#define EXIT_REASON_MSR_WRITE 32
#define EXIT_REASON_INVALID_STATE 33
+#define EXIT_REASON_MSR_LOAD_FAIL 34
#define EXIT_REASON_MWAIT_INSTRUCTION 36
#define EXIT_REASON_MONITOR_INSTRUCTION 39
#define EXIT_REASON_PAUSE_INSTRUCTION 40
@@ -116,10 +117,14 @@
{ EXIT_REASON_APIC_WRITE, "APIC_WRITE" }, \
{ EXIT_REASON_EOI_INDUCED, "EOI_INDUCED" }, \
{ EXIT_REASON_INVALID_STATE, "INVALID_STATE" }, \
+ { EXIT_REASON_MSR_LOAD_FAIL, "MSR_LOAD_FAIL" }, \
{ EXIT_REASON_INVD, "INVD" }, \
{ EXIT_REASON_INVVPID, "INVVPID" }, \
{ EXIT_REASON_INVPCID, "INVPCID" }, \
{ EXIT_REASON_XSAVES, "XSAVES" }, \
{ EXIT_REASON_XRSTORS, "XRSTORS" }
+#define VMX_ABORT_SAVE_GUEST_MSR_FAIL 1
+#define VMX_ABORT_LOAD_HOST_MSR_FAIL 4
+
#endif /* _UAPIVMX_H */
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 169b09d76ddd..ef23c1e5fa9f 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -86,6 +86,7 @@
#define DstAcc (OpAcc << DstShift)
#define DstDI (OpDI << DstShift)
#define DstMem64 (OpMem64 << DstShift)
+#define DstMem16 (OpMem16 << DstShift)
#define DstImmUByte (OpImmUByte << DstShift)
#define DstDX (OpDX << DstShift)
#define DstAccLo (OpAccLo << DstShift)
@@ -169,6 +170,7 @@
#define PrivUD ((u64)1 << 51) /* #UD instead of #GP on CPL > 0 */
#define NearBranch ((u64)1 << 52) /* Near branches */
#define No16 ((u64)1 << 53) /* No 16 bit operand */
+#define IncSP ((u64)1 << 54) /* SP is incremented before ModRM calc */
#define DstXacc (DstAccLo | SrcAccHi | SrcWrite)
@@ -262,6 +264,13 @@ struct instr_dual {
#define EFLG_RESERVED_ZEROS_MASK 0xffc0802a
#define EFLG_RESERVED_ONE_MASK 2
+enum x86_transfer_type {
+ X86_TRANSFER_NONE,
+ X86_TRANSFER_CALL_JMP,
+ X86_TRANSFER_RET,
+ X86_TRANSFER_TASK_SWITCH,
+};
+
static ulong reg_read(struct x86_emulate_ctxt *ctxt, unsigned nr)
{
if (!(ctxt->regs_valid & (1 << nr))) {
@@ -1057,8 +1066,6 @@ static int em_fnstcw(struct x86_emulate_ctxt *ctxt)
asm volatile("fnstcw %0": "+m"(fcw));
ctxt->ops->put_fpu(ctxt);
- /* force 2 byte destination */
- ctxt->dst.bytes = 2;
ctxt->dst.val = fcw;
return X86EMUL_CONTINUE;
@@ -1075,8 +1082,6 @@ static int em_fnstsw(struct x86_emulate_ctxt *ctxt)
asm volatile("fnstsw %0": "+m"(fsw));
ctxt->ops->put_fpu(ctxt);
- /* force 2 byte destination */
- ctxt->dst.bytes = 2;
ctxt->dst.val = fsw;
return X86EMUL_CONTINUE;
@@ -1223,6 +1228,10 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt,
else {
modrm_ea += reg_read(ctxt, base_reg);
adjust_modrm_seg(ctxt, base_reg);
+ /* Increment ESP on POP [ESP] */
+ if ((ctxt->d & IncSP) &&
+ base_reg == VCPU_REGS_RSP)
+ modrm_ea += ctxt->op_bytes;
}
if (index_reg != 4)
modrm_ea += reg_read(ctxt, index_reg) << scale;
@@ -1435,10 +1444,8 @@ static void get_descriptor_table_ptr(struct x86_emulate_ctxt *ctxt,
ops->get_gdt(ctxt, dt);
}
-/* allowed just for 8 bytes segments */
-static int read_segment_descriptor(struct x86_emulate_ctxt *ctxt,
- u16 selector, struct desc_struct *desc,
- ulong *desc_addr_p)
+static int get_descriptor_ptr(struct x86_emulate_ctxt *ctxt,
+ u16 selector, ulong *desc_addr_p)
{
struct desc_ptr dt;
u16 index = selector >> 3;
@@ -1449,8 +1456,34 @@ static int read_segment_descriptor(struct x86_emulate_ctxt *ctxt,
if (dt.size < index * 8 + 7)
return emulate_gp(ctxt, selector & 0xfffc);
- *desc_addr_p = addr = dt.address + index * 8;
- return ctxt->ops->read_std(ctxt, addr, desc, sizeof *desc,
+ addr = dt.address + index * 8;
+
+#ifdef CONFIG_X86_64
+ if (addr >> 32 != 0) {
+ u64 efer = 0;
+
+ ctxt->ops->get_msr(ctxt, MSR_EFER, &efer);
+ if (!(efer & EFER_LMA))
+ addr &= (u32)-1;
+ }
+#endif
+
+ *desc_addr_p = addr;
+ return X86EMUL_CONTINUE;
+}
+
+/* allowed just for 8 bytes segments */
+static int read_segment_descriptor(struct x86_emulate_ctxt *ctxt,
+ u16 selector, struct desc_struct *desc,
+ ulong *desc_addr_p)
+{
+ int rc;
+
+ rc = get_descriptor_ptr(ctxt, selector, desc_addr_p);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+
+ return ctxt->ops->read_std(ctxt, *desc_addr_p, desc, sizeof(*desc),
&ctxt->exception);
}
@@ -1458,16 +1491,13 @@ static int read_segment_descriptor(struct x86_emulate_ctxt *ctxt,
static int write_segment_descriptor(struct x86_emulate_ctxt *ctxt,
u16 selector, struct desc_struct *desc)
{
- struct desc_ptr dt;
- u16 index = selector >> 3;
+ int rc;
ulong addr;
- get_descriptor_table_ptr(ctxt, selector, &dt);
-
- if (dt.size < index * 8 + 7)
- return emulate_gp(ctxt, selector & 0xfffc);
+ rc = get_descriptor_ptr(ctxt, selector, &addr);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
- addr = dt.address + index * 8;
return ctxt->ops->write_std(ctxt, addr, desc, sizeof *desc,
&ctxt->exception);
}
@@ -1475,7 +1505,7 @@ static int write_segment_descriptor(struct x86_emulate_ctxt *ctxt,
/* Does not support long mode */
static int __load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
u16 selector, int seg, u8 cpl,
- bool in_task_switch,
+ enum x86_transfer_type transfer,
struct desc_struct *desc)
{
struct desc_struct seg_desc, old_desc;
@@ -1529,11 +1559,15 @@ static int __load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
return ret;
err_code = selector & 0xfffc;
- err_vec = in_task_switch ? TS_VECTOR : GP_VECTOR;
+ err_vec = (transfer == X86_TRANSFER_TASK_SWITCH) ? TS_VECTOR :
+ GP_VECTOR;
/* can't load system descriptor into segment selector */
- if (seg <= VCPU_SREG_GS && !seg_desc.s)
+ if (seg <= VCPU_SREG_GS && !seg_desc.s) {
+ if (transfer == X86_TRANSFER_CALL_JMP)
+ return X86EMUL_UNHANDLEABLE;
goto exception;
+ }
if (!seg_desc.p) {
err_vec = (seg == VCPU_SREG_SS) ? SS_VECTOR : NP_VECTOR;
@@ -1605,10 +1639,13 @@ static int __load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
if (seg_desc.s) {
/* mark segment as accessed */
- seg_desc.type |= 1;
- ret = write_segment_descriptor(ctxt, selector, &seg_desc);
- if (ret != X86EMUL_CONTINUE)
- return ret;
+ if (!(seg_desc.type & 1)) {
+ seg_desc.type |= 1;
+ ret = write_segment_descriptor(ctxt, selector,
+ &seg_desc);
+ if (ret != X86EMUL_CONTINUE)
+ return ret;
+ }
} else if (ctxt->mode == X86EMUL_MODE_PROT64) {
ret = ctxt->ops->read_std(ctxt, desc_addr+8, &base3,
sizeof(base3), &ctxt->exception);
@@ -1631,7 +1668,8 @@ static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
u16 selector, int seg)
{
u8 cpl = ctxt->ops->cpl(ctxt);
- return __load_segment_descriptor(ctxt, selector, seg, cpl, false, NULL);
+ return __load_segment_descriptor(ctxt, selector, seg, cpl,
+ X86_TRANSFER_NONE, NULL);
}
static void write_register_operand(struct operand *op)
@@ -1828,12 +1866,14 @@ static int em_pop_sreg(struct x86_emulate_ctxt *ctxt)
unsigned long selector;
int rc;
- rc = emulate_pop(ctxt, &selector, ctxt->op_bytes);
+ rc = emulate_pop(ctxt, &selector, 2);
if (rc != X86EMUL_CONTINUE)
return rc;
if (ctxt->modrm_reg == VCPU_SREG_SS)
ctxt->interruptibility = KVM_X86_SHADOW_INT_MOV_SS;
+ if (ctxt->op_bytes > 2)
+ rsp_increment(ctxt, ctxt->op_bytes - 2);
rc = load_segment_descriptor(ctxt, (u16)selector, seg);
return rc;
@@ -2041,7 +2081,8 @@ static int em_jmp_far(struct x86_emulate_ctxt *ctxt)
memcpy(&sel, ctxt->src.valptr + ctxt->op_bytes, 2);
- rc = __load_segment_descriptor(ctxt, sel, VCPU_SREG_CS, cpl, false,
+ rc = __load_segment_descriptor(ctxt, sel, VCPU_SREG_CS, cpl,
+ X86_TRANSFER_CALL_JMP,
&new_desc);
if (rc != X86EMUL_CONTINUE)
return rc;
@@ -2130,7 +2171,8 @@ static int em_ret_far(struct x86_emulate_ctxt *ctxt)
/* Outer-privilege level return is not implemented */
if (ctxt->mode >= X86EMUL_MODE_PROT16 && (cs & 3) > cpl)
return X86EMUL_UNHANDLEABLE;
- rc = __load_segment_descriptor(ctxt, (u16)cs, VCPU_SREG_CS, cpl, false,
+ rc = __load_segment_descriptor(ctxt, (u16)cs, VCPU_SREG_CS, cpl,
+ X86_TRANSFER_RET,
&new_desc);
if (rc != X86EMUL_CONTINUE)
return rc;
@@ -2567,23 +2609,23 @@ static int load_state_from_tss16(struct x86_emulate_ctxt *ctxt,
* it is handled in a context of new task
*/
ret = __load_segment_descriptor(ctxt, tss->ldt, VCPU_SREG_LDTR, cpl,
- true, NULL);
+ X86_TRANSFER_TASK_SWITCH, NULL);
if (ret != X86EMUL_CONTINUE)
return ret;
ret = __load_segment_descriptor(ctxt, tss->es, VCPU_SREG_ES, cpl,
- true, NULL);
+ X86_TRANSFER_TASK_SWITCH, NULL);
if (ret != X86EMUL_CONTINUE)
return ret;
ret = __load_segment_descriptor(ctxt, tss->cs, VCPU_SREG_CS, cpl,
- true, NULL);
+ X86_TRANSFER_TASK_SWITCH, NULL);
if (ret != X86EMUL_CONTINUE)
return ret;
ret = __load_segment_descriptor(ctxt, tss->ss, VCPU_SREG_SS, cpl,
- true, NULL);
+ X86_TRANSFER_TASK_SWITCH, NULL);
if (ret != X86EMUL_CONTINUE)
return ret;
ret = __load_segment_descriptor(ctxt, tss->ds, VCPU_SREG_DS, cpl,
- true, NULL);
+ X86_TRANSFER_TASK_SWITCH, NULL);
if (ret != X86EMUL_CONTINUE)
return ret;
@@ -2705,31 +2747,31 @@ static int load_state_from_tss32(struct x86_emulate_ctxt *ctxt,
* it is handled in a context of new task
*/
ret = __load_segment_descriptor(ctxt, tss->ldt_selector, VCPU_SREG_LDTR,
- cpl, true, NULL);
+ cpl, X86_TRANSFER_TASK_SWITCH, NULL);
if (ret != X86EMUL_CONTINUE)
return ret;
ret = __load_segment_descriptor(ctxt, tss->es, VCPU_SREG_ES, cpl,
- true, NULL);
+ X86_TRANSFER_TASK_SWITCH, NULL);
if (ret != X86EMUL_CONTINUE)
return ret;
ret = __load_segment_descriptor(ctxt, tss->cs, VCPU_SREG_CS, cpl,
- true, NULL);
+ X86_TRANSFER_TASK_SWITCH, NULL);
if (ret != X86EMUL_CONTINUE)
return ret;
ret = __load_segment_descriptor(ctxt, tss->ss, VCPU_SREG_SS, cpl,
- true, NULL);
+ X86_TRANSFER_TASK_SWITCH, NULL);
if (ret != X86EMUL_CONTINUE)
return ret;
ret = __load_segment_descriptor(ctxt, tss->ds, VCPU_SREG_DS, cpl,
- true, NULL);
+ X86_TRANSFER_TASK_SWITCH, NULL);
if (ret != X86EMUL_CONTINUE)
return ret;
ret = __load_segment_descriptor(ctxt, tss->fs, VCPU_SREG_FS, cpl,
- true, NULL);
+ X86_TRANSFER_TASK_SWITCH, NULL);
if (ret != X86EMUL_CONTINUE)
return ret;
ret = __load_segment_descriptor(ctxt, tss->gs, VCPU_SREG_GS, cpl,
- true, NULL);
+ X86_TRANSFER_TASK_SWITCH, NULL);
if (ret != X86EMUL_CONTINUE)
return ret;
@@ -2750,7 +2792,6 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt,
ret = ops->read_std(ctxt, old_tss_base, &tss_seg, sizeof tss_seg,
&ctxt->exception);
if (ret != X86EMUL_CONTINUE)
- /* FIXME: need to provide precise fault address */
return ret;
save_state_to_tss32(ctxt, &tss_seg);
@@ -2759,13 +2800,11 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt,
ret = ops->write_std(ctxt, old_tss_base + eip_offset, &tss_seg.eip,
ldt_sel_offset - eip_offset, &ctxt->exception);
if (ret != X86EMUL_CONTINUE)
- /* FIXME: need to provide precise fault address */
return ret;
ret = ops->read_std(ctxt, new_tss_base, &tss_seg, sizeof tss_seg,
&ctxt->exception);
if (ret != X86EMUL_CONTINUE)
- /* FIXME: need to provide precise fault address */
return ret;
if (old_tss_sel != 0xffff) {
@@ -2776,7 +2815,6 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt,
sizeof tss_seg.prev_task_link,
&ctxt->exception);
if (ret != X86EMUL_CONTINUE)
- /* FIXME: need to provide precise fault address */
return ret;
}
@@ -3015,10 +3053,10 @@ static int em_call_far(struct x86_emulate_ctxt *ctxt)
ops->get_segment(ctxt, &old_cs, &old_desc, NULL, VCPU_SREG_CS);
memcpy(&sel, ctxt->src.valptr + ctxt->op_bytes, 2);
- rc = __load_segment_descriptor(ctxt, sel, VCPU_SREG_CS, cpl, false,
- &new_desc);
+ rc = __load_segment_descriptor(ctxt, sel, VCPU_SREG_CS, cpl,
+ X86_TRANSFER_CALL_JMP, &new_desc);
if (rc != X86EMUL_CONTINUE)
- return X86EMUL_CONTINUE;
+ return rc;
rc = assign_eip_far(ctxt, ctxt->src.val, &new_desc);
if (rc != X86EMUL_CONTINUE)
@@ -3749,7 +3787,7 @@ static const struct opcode group1[] = {
};
static const struct opcode group1A[] = {
- I(DstMem | SrcNone | Mov | Stack, em_pop), N, N, N, N, N, N, N,
+ I(DstMem | SrcNone | Mov | Stack | IncSP, em_pop), N, N, N, N, N, N, N,
};
static const struct opcode group2[] = {
@@ -3865,7 +3903,7 @@ static const struct gprefix pfx_0f_e7 = {
};
static const struct escape escape_d9 = { {
- N, N, N, N, N, N, N, I(DstMem, em_fnstcw),
+ N, N, N, N, N, N, N, I(DstMem16 | Mov, em_fnstcw),
}, {
/* 0xC0 - 0xC7 */
N, N, N, N, N, N, N, N,
@@ -3907,7 +3945,7 @@ static const struct escape escape_db = { {
} };
static const struct escape escape_dd = { {
- N, N, N, N, N, N, N, I(DstMem, em_fnstsw),
+ N, N, N, N, N, N, N, I(DstMem16 | Mov, em_fnstsw),
}, {
/* 0xC0 - 0xC7 */
N, N, N, N, N, N, N, N,
@@ -4871,8 +4909,12 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt)
/* optimisation - avoid slow emulated read if Mov */
rc = segmented_read(ctxt, ctxt->dst.addr.mem,
&ctxt->dst.val, ctxt->dst.bytes);
- if (rc != X86EMUL_CONTINUE)
+ if (rc != X86EMUL_CONTINUE) {
+ if (rc == X86EMUL_PROPAGATE_FAULT &&
+ ctxt->exception.vector == PF_VECTOR)
+ ctxt->exception.error_code |= PFERR_WRITE_MASK;
goto done;
+ }
}
ctxt->dst.orig_val = ctxt->dst.val;
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 4f0c0b954686..a688fbffb34e 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -33,6 +33,7 @@
#include <asm/page.h>
#include <asm/current.h>
#include <asm/apicdef.h>
+#include <asm/delay.h>
#include <linux/atomic.h>
#include <linux/jump_label.h>
#include "kvm_cache_regs.h"
@@ -402,7 +403,7 @@ static inline void apic_set_isr(int vec, struct kvm_lapic *apic)
* because the processor can modify ISR under the hood. Instead
* just set SVI.
*/
- if (unlikely(kvm_apic_vid_enabled(vcpu->kvm)))
+ if (unlikely(kvm_x86_ops->hwapic_isr_update))
kvm_x86_ops->hwapic_isr_update(vcpu->kvm, vec);
else {
++apic->isr_count;
@@ -450,7 +451,7 @@ static inline void apic_clear_isr(int vec, struct kvm_lapic *apic)
* on the other hand isr_count and highest_isr_cache are unused
* and must be left alone.
*/
- if (unlikely(kvm_apic_vid_enabled(vcpu->kvm)))
+ if (unlikely(kvm_x86_ops->hwapic_isr_update))
kvm_x86_ops->hwapic_isr_update(vcpu->kvm,
apic_find_highest_isr(apic));
else {
@@ -1073,25 +1074,72 @@ static void apic_timer_expired(struct kvm_lapic *apic)
{
struct kvm_vcpu *vcpu = apic->vcpu;
wait_queue_head_t *q = &vcpu->wq;
+ struct kvm_timer *ktimer = &apic->lapic_timer;
- /*
- * Note: KVM_REQ_PENDING_TIMER is implicitly checked in
- * vcpu_enter_guest.
- */
if (atomic_read(&apic->lapic_timer.pending))
return;
atomic_inc(&apic->lapic_timer.pending);
- /* FIXME: this code should not know anything about vcpus */
- kvm_make_request(KVM_REQ_PENDING_TIMER, vcpu);
+ kvm_set_pending_timer(vcpu);
if (waitqueue_active(q))
wake_up_interruptible(q);
+
+ if (apic_lvtt_tscdeadline(apic))
+ ktimer->expired_tscdeadline = ktimer->tscdeadline;
+}
+
+/*
+ * On APICv, this test will cause a busy wait
+ * during a higher-priority task.
+ */
+
+static bool lapic_timer_int_injected(struct kvm_vcpu *vcpu)
+{
+ struct kvm_lapic *apic = vcpu->arch.apic;
+ u32 reg = kvm_apic_get_reg(apic, APIC_LVTT);
+
+ if (kvm_apic_hw_enabled(apic)) {
+ int vec = reg & APIC_VECTOR_MASK;
+
+ if (kvm_x86_ops->test_posted_interrupt)
+ return kvm_x86_ops->test_posted_interrupt(vcpu, vec);
+ else {
+ if (apic_test_vector(vec, apic->regs + APIC_ISR))
+ return true;
+ }
+ }
+ return false;
+}
+
+void wait_lapic_expire(struct kvm_vcpu *vcpu)
+{
+ struct kvm_lapic *apic = vcpu->arch.apic;
+ u64 guest_tsc, tsc_deadline;
+
+ if (!kvm_vcpu_has_lapic(vcpu))
+ return;
+
+ if (apic->lapic_timer.expired_tscdeadline == 0)
+ return;
+
+ if (!lapic_timer_int_injected(vcpu))
+ return;
+
+ tsc_deadline = apic->lapic_timer.expired_tscdeadline;
+ apic->lapic_timer.expired_tscdeadline = 0;
+ guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu, native_read_tsc());
+ trace_kvm_wait_lapic_expire(vcpu->vcpu_id, guest_tsc - tsc_deadline);
+
+ /* __delay is delay_tsc whenever the hardware has TSC, thus always. */
+ if (guest_tsc < tsc_deadline)
+ __delay(tsc_deadline - guest_tsc);
}
static void start_apic_timer(struct kvm_lapic *apic)
{
ktime_t now;
+
atomic_set(&apic->lapic_timer.pending, 0);
if (apic_lvtt_period(apic) || apic_lvtt_oneshot(apic)) {
@@ -1137,6 +1185,7 @@ static void start_apic_timer(struct kvm_lapic *apic)
/* lapic timer in tsc deadline mode */
u64 guest_tsc, tscdeadline = apic->lapic_timer.tscdeadline;
u64 ns = 0;
+ ktime_t expire;
struct kvm_vcpu *vcpu = apic->vcpu;
unsigned long this_tsc_khz = vcpu->arch.virtual_tsc_khz;
unsigned long flags;
@@ -1151,8 +1200,10 @@ static void start_apic_timer(struct kvm_lapic *apic)
if (likely(tscdeadline > guest_tsc)) {
ns = (tscdeadline - guest_tsc) * 1000000ULL;
do_div(ns, this_tsc_khz);
+ expire = ktime_add_ns(now, ns);
+ expire = ktime_sub_ns(expire, lapic_timer_advance_ns);
hrtimer_start(&apic->lapic_timer.timer,
- ktime_add_ns(now, ns), HRTIMER_MODE_ABS);
+ expire, HRTIMER_MODE_ABS);
} else
apic_timer_expired(apic);
@@ -1742,7 +1793,9 @@ void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu,
if (kvm_x86_ops->hwapic_irr_update)
kvm_x86_ops->hwapic_irr_update(vcpu,
apic_find_highest_irr(apic));
- kvm_x86_ops->hwapic_isr_update(vcpu->kvm, apic_find_highest_isr(apic));
+ if (unlikely(kvm_x86_ops->hwapic_isr_update))
+ kvm_x86_ops->hwapic_isr_update(vcpu->kvm,
+ apic_find_highest_isr(apic));
kvm_make_request(KVM_REQ_EVENT, vcpu);
kvm_rtc_eoi_tracking_restore_one(vcpu);
}
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index c674fce53cf9..7054437944cd 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -14,6 +14,7 @@ struct kvm_timer {
u32 timer_mode;
u32 timer_mode_mask;
u64 tscdeadline;
+ u64 expired_tscdeadline;
atomic_t pending; /* accumulated triggered timers */
};
@@ -170,4 +171,6 @@ static inline bool kvm_apic_has_events(struct kvm_vcpu *vcpu)
bool kvm_apic_pending_eoi(struct kvm_vcpu *vcpu, int vector);
+void wait_lapic_expire(struct kvm_vcpu *vcpu);
+
#endif
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index f83fc6c5e0ba..97898abe8386 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -63,30 +63,16 @@ enum {
#undef MMU_DEBUG
#ifdef MMU_DEBUG
+static bool dbg = 0;
+module_param(dbg, bool, 0644);
#define pgprintk(x...) do { if (dbg) printk(x); } while (0)
#define rmap_printk(x...) do { if (dbg) printk(x); } while (0)
-
+#define MMU_WARN_ON(x) WARN_ON(x)
#else
-
#define pgprintk(x...) do { } while (0)
#define rmap_printk(x...) do { } while (0)
-
-#endif
-
-#ifdef MMU_DEBUG
-static bool dbg = 0;
-module_param(dbg, bool, 0644);
-#endif
-
-#ifndef MMU_DEBUG
-#define ASSERT(x) do { } while (0)
-#else
-#define ASSERT(x) \
- if (!(x)) { \
- printk(KERN_WARNING "assertion failed %s:%d: %s\n", \
- __FILE__, __LINE__, #x); \
- }
+#define MMU_WARN_ON(x) do { } while (0)
#endif
#define PTE_PREFETCH_NUM 8
@@ -546,6 +532,11 @@ static bool spte_is_bit_cleared(u64 old_spte, u64 new_spte, u64 bit_mask)
return (old_spte & bit_mask) && !(new_spte & bit_mask);
}
+static bool spte_is_bit_changed(u64 old_spte, u64 new_spte, u64 bit_mask)
+{
+ return (old_spte & bit_mask) != (new_spte & bit_mask);
+}
+
/* Rules for using mmu_spte_set:
* Set the sptep from nonpresent to present.
* Note: the sptep being assigned *must* be either not present
@@ -596,6 +587,14 @@ static bool mmu_spte_update(u64 *sptep, u64 new_spte)
if (!shadow_accessed_mask)
return ret;
+ /*
+ * Flush TLB when accessed/dirty bits are changed in the page tables,
+ * to guarantee consistency between TLB and page tables.
+ */
+ if (spte_is_bit_changed(old_spte, new_spte,
+ shadow_accessed_mask | shadow_dirty_mask))
+ ret = true;
+
if (spte_is_bit_cleared(old_spte, new_spte, shadow_accessed_mask))
kvm_set_pfn_accessed(spte_to_pfn(old_spte));
if (spte_is_bit_cleared(old_spte, new_spte, shadow_dirty_mask))
@@ -1536,7 +1535,7 @@ static inline void kvm_mod_used_mmu_pages(struct kvm *kvm, int nr)
static void kvm_mmu_free_page(struct kvm_mmu_page *sp)
{
- ASSERT(is_empty_shadow_page(sp->spt));
+ MMU_WARN_ON(!is_empty_shadow_page(sp->spt));
hlist_del(&sp->hash_link);
list_del(&sp->link);
free_page((unsigned long)sp->spt);
@@ -3041,7 +3040,7 @@ static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu)
for (i = 0; i < 4; ++i) {
hpa_t root = vcpu->arch.mmu.pae_root[i];
- ASSERT(!VALID_PAGE(root));
+ MMU_WARN_ON(VALID_PAGE(root));
spin_lock(&vcpu->kvm->mmu_lock);
make_mmu_pages_available(vcpu);
sp = kvm_mmu_get_page(vcpu, i << (30 - PAGE_SHIFT),
@@ -3079,7 +3078,7 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
if (vcpu->arch.mmu.root_level == PT64_ROOT_LEVEL) {
hpa_t root = vcpu->arch.mmu.root_hpa;
- ASSERT(!VALID_PAGE(root));
+ MMU_WARN_ON(VALID_PAGE(root));
spin_lock(&vcpu->kvm->mmu_lock);
make_mmu_pages_available(vcpu);
@@ -3104,7 +3103,7 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
for (i = 0; i < 4; ++i) {
hpa_t root = vcpu->arch.mmu.pae_root[i];
- ASSERT(!VALID_PAGE(root));
+ MMU_WARN_ON(VALID_PAGE(root));
if (vcpu->arch.mmu.root_level == PT32E_ROOT_LEVEL) {
pdptr = vcpu->arch.mmu.get_pdptr(vcpu, i);
if (!is_present_gpte(pdptr)) {
@@ -3329,8 +3328,7 @@ static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
if (r)
return r;
- ASSERT(vcpu);
- ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa));
+ MMU_WARN_ON(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
gfn = gva >> PAGE_SHIFT;
@@ -3396,8 +3394,7 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
int write = error_code & PFERR_WRITE_MASK;
bool map_writable;
- ASSERT(vcpu);
- ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa));
+ MMU_WARN_ON(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
if (unlikely(error_code & PFERR_RSVD_MASK)) {
r = handle_mmio_page_fault(vcpu, gpa, error_code, true);
@@ -3718,7 +3715,7 @@ static void paging64_init_context_common(struct kvm_vcpu *vcpu,
update_permission_bitmask(vcpu, context, false);
update_last_pte_bitmap(vcpu, context);
- ASSERT(is_pae(vcpu));
+ MMU_WARN_ON(!is_pae(vcpu));
context->page_fault = paging64_page_fault;
context->gva_to_gpa = paging64_gva_to_gpa;
context->sync_page = paging64_sync_page;
@@ -3763,7 +3760,7 @@ static void paging32E_init_context(struct kvm_vcpu *vcpu,
static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
{
- struct kvm_mmu *context = vcpu->arch.walk_mmu;
+ struct kvm_mmu *context = &vcpu->arch.mmu;
context->base_role.word = 0;
context->page_fault = tdp_page_fault;
@@ -3803,11 +3800,12 @@ static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
update_last_pte_bitmap(vcpu, context);
}
-void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context)
+void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu)
{
bool smep = kvm_read_cr4_bits(vcpu, X86_CR4_SMEP);
- ASSERT(vcpu);
- ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
+ struct kvm_mmu *context = &vcpu->arch.mmu;
+
+ MMU_WARN_ON(VALID_PAGE(context->root_hpa));
if (!is_paging(vcpu))
nonpaging_init_context(vcpu, context);
@@ -3818,19 +3816,19 @@ void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context)
else
paging32_init_context(vcpu, context);
- vcpu->arch.mmu.base_role.nxe = is_nx(vcpu);
- vcpu->arch.mmu.base_role.cr4_pae = !!is_pae(vcpu);
- vcpu->arch.mmu.base_role.cr0_wp = is_write_protection(vcpu);
- vcpu->arch.mmu.base_role.smep_andnot_wp
+ context->base_role.nxe = is_nx(vcpu);
+ context->base_role.cr4_pae = !!is_pae(vcpu);
+ context->base_role.cr0_wp = is_write_protection(vcpu);
+ context->base_role.smep_andnot_wp
= smep && !is_write_protection(vcpu);
}
EXPORT_SYMBOL_GPL(kvm_init_shadow_mmu);
-void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context,
- bool execonly)
+void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly)
{
- ASSERT(vcpu);
- ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
+ struct kvm_mmu *context = &vcpu->arch.mmu;
+
+ MMU_WARN_ON(VALID_PAGE(context->root_hpa));
context->shadow_root_level = kvm_x86_ops->get_tdp_level();
@@ -3851,11 +3849,13 @@ EXPORT_SYMBOL_GPL(kvm_init_shadow_ept_mmu);
static void init_kvm_softmmu(struct kvm_vcpu *vcpu)
{
- kvm_init_shadow_mmu(vcpu, vcpu->arch.walk_mmu);
- vcpu->arch.walk_mmu->set_cr3 = kvm_x86_ops->set_cr3;
- vcpu->arch.walk_mmu->get_cr3 = get_cr3;
- vcpu->arch.walk_mmu->get_pdptr = kvm_pdptr_read;
- vcpu->arch.walk_mmu->inject_page_fault = kvm_inject_page_fault;
+ struct kvm_mmu *context = &vcpu->arch.mmu;
+
+ kvm_init_shadow_mmu(vcpu);
+ context->set_cr3 = kvm_x86_ops->set_cr3;
+ context->get_cr3 = get_cr3;
+ context->get_pdptr = kvm_pdptr_read;
+ context->inject_page_fault = kvm_inject_page_fault;
}
static void init_kvm_nested_mmu(struct kvm_vcpu *vcpu)
@@ -3900,17 +3900,15 @@ static void init_kvm_nested_mmu(struct kvm_vcpu *vcpu)
static void init_kvm_mmu(struct kvm_vcpu *vcpu)
{
if (mmu_is_nested(vcpu))
- return init_kvm_nested_mmu(vcpu);
+ init_kvm_nested_mmu(vcpu);
else if (tdp_enabled)
- return init_kvm_tdp_mmu(vcpu);
+ init_kvm_tdp_mmu(vcpu);
else
- return init_kvm_softmmu(vcpu);
+ init_kvm_softmmu(vcpu);
}
void kvm_mmu_reset_context(struct kvm_vcpu *vcpu)
{
- ASSERT(vcpu);
-
kvm_mmu_unload(vcpu);
init_kvm_mmu(vcpu);
}
@@ -4266,8 +4264,6 @@ static int alloc_mmu_pages(struct kvm_vcpu *vcpu)
struct page *page;
int i;
- ASSERT(vcpu);
-
/*
* When emulating 32-bit mode, cr3 is only 32 bits even on x86_64.
* Therefore we need to allocate shadow page tables in the first
@@ -4286,8 +4282,6 @@ static int alloc_mmu_pages(struct kvm_vcpu *vcpu)
int kvm_mmu_create(struct kvm_vcpu *vcpu)
{
- ASSERT(vcpu);
-
vcpu->arch.walk_mmu = &vcpu->arch.mmu;
vcpu->arch.mmu.root_hpa = INVALID_PAGE;
vcpu->arch.mmu.translate_gpa = translate_gpa;
@@ -4298,8 +4292,7 @@ int kvm_mmu_create(struct kvm_vcpu *vcpu)
void kvm_mmu_setup(struct kvm_vcpu *vcpu)
{
- ASSERT(vcpu);
- ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
+ MMU_WARN_ON(VALID_PAGE(vcpu->arch.mmu.root_hpa));
init_kvm_mmu(vcpu);
}
@@ -4309,6 +4302,7 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
struct kvm_memory_slot *memslot;
gfn_t last_gfn;
int i;
+ bool flush = false;
memslot = id_to_memslot(kvm->memslots, slot);
last_gfn = memslot->base_gfn + memslot->npages - 1;
@@ -4325,7 +4319,8 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
for (index = 0; index <= last_index; ++index, ++rmapp) {
if (*rmapp)
- __rmap_write_protect(kvm, rmapp, false);
+ flush |= __rmap_write_protect(kvm, rmapp,
+ false);
if (need_resched() || spin_needbreak(&kvm->mmu_lock))
cond_resched_lock(&kvm->mmu_lock);
@@ -4352,7 +4347,8 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
* instead of PT_WRITABLE_MASK, that means it does not depend
* on PT_WRITABLE_MASK anymore.
*/
- kvm_flush_remote_tlbs(kvm);
+ if (flush)
+ kvm_flush_remote_tlbs(kvm);
}
#define BATCH_ZAP_PAGES 10
@@ -4606,8 +4602,6 @@ EXPORT_SYMBOL_GPL(kvm_mmu_get_spte_hierarchy);
void kvm_mmu_destroy(struct kvm_vcpu *vcpu)
{
- ASSERT(vcpu);
-
kvm_mmu_unload(vcpu);
free_mmu_pages(vcpu);
mmu_free_memory_caches(vcpu);
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index bde8ee725754..c7d65637c851 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -44,18 +44,6 @@
#define PT_DIRECTORY_LEVEL 2
#define PT_PAGE_TABLE_LEVEL 1
-#define PFERR_PRESENT_BIT 0
-#define PFERR_WRITE_BIT 1
-#define PFERR_USER_BIT 2
-#define PFERR_RSVD_BIT 3
-#define PFERR_FETCH_BIT 4
-
-#define PFERR_PRESENT_MASK (1U << PFERR_PRESENT_BIT)
-#define PFERR_WRITE_MASK (1U << PFERR_WRITE_BIT)
-#define PFERR_USER_MASK (1U << PFERR_USER_BIT)
-#define PFERR_RSVD_MASK (1U << PFERR_RSVD_BIT)
-#define PFERR_FETCH_MASK (1U << PFERR_FETCH_BIT)
-
static inline u64 rsvd_bits(int s, int e)
{
return ((1ULL << (e - s + 1)) - 1) << s;
@@ -81,9 +69,8 @@ enum {
};
int handle_mmio_page_fault_common(struct kvm_vcpu *vcpu, u64 addr, bool direct);
-void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context);
-void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context,
- bool execonly);
+void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu);
+void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly);
void update_permission_bitmask(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
bool ept);
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 41dd0387cccb..a17d848c6d42 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -2003,8 +2003,8 @@ static void nested_svm_inject_npf_exit(struct kvm_vcpu *vcpu,
static void nested_svm_init_mmu_context(struct kvm_vcpu *vcpu)
{
- kvm_init_shadow_mmu(vcpu, &vcpu->arch.mmu);
-
+ WARN_ON(mmu_is_nested(vcpu));
+ kvm_init_shadow_mmu(vcpu);
vcpu->arch.mmu.set_cr3 = nested_svm_set_tdp_cr3;
vcpu->arch.mmu.get_cr3 = nested_svm_get_tdp_cr3;
vcpu->arch.mmu.get_pdptr = nested_svm_get_tdp_pdptr;
diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h
index c2a34bb5ad93..587149bd6f76 100644
--- a/arch/x86/kvm/trace.h
+++ b/arch/x86/kvm/trace.h
@@ -914,6 +914,26 @@ TRACE_EVENT(kvm_pvclock_update,
__entry->flags)
);
+TRACE_EVENT(kvm_wait_lapic_expire,
+ TP_PROTO(unsigned int vcpu_id, s64 delta),
+ TP_ARGS(vcpu_id, delta),
+
+ TP_STRUCT__entry(
+ __field( unsigned int, vcpu_id )
+ __field( s64, delta )
+ ),
+
+ TP_fast_assign(
+ __entry->vcpu_id = vcpu_id;
+ __entry->delta = delta;
+ ),
+
+ TP_printk("vcpu %u: delta %lld (%s)",
+ __entry->vcpu_id,
+ __entry->delta,
+ __entry->delta < 0 ? "early" : "late")
+);
+
#endif /* _TRACE_KVM_H */
#undef TRACE_INCLUDE_PATH
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index d4c58d884838..c987374d92c1 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -435,6 +435,11 @@ static int pi_test_and_set_pir(int vector, struct pi_desc *pi_desc)
return test_and_set_bit(vector, (unsigned long *)pi_desc->pir);
}
+static int pi_test_pir(int vector, struct pi_desc *pi_desc)
+{
+ return test_bit(vector, (unsigned long *)pi_desc->pir);
+}
+
struct vcpu_vmx {
struct kvm_vcpu vcpu;
unsigned long host_rsp;
@@ -959,16 +964,6 @@ static inline bool cpu_has_vmx_ept_execute_only(void)
return vmx_capability.ept & VMX_EPT_EXECUTE_ONLY_BIT;
}
-static inline bool cpu_has_vmx_eptp_uncacheable(void)
-{
- return vmx_capability.ept & VMX_EPTP_UC_BIT;
-}
-
-static inline bool cpu_has_vmx_eptp_writeback(void)
-{
- return vmx_capability.ept & VMX_EPTP_WB_BIT;
-}
-
static inline bool cpu_has_vmx_ept_2m_page(void)
{
return vmx_capability.ept & VMX_EPT_2MB_PAGE_BIT;
@@ -5895,7 +5890,9 @@ static __init int hardware_setup(void)
kvm_x86_ops->update_cr8_intercept = NULL;
else {
kvm_x86_ops->hwapic_irr_update = NULL;
+ kvm_x86_ops->hwapic_isr_update = NULL;
kvm_x86_ops->deliver_posted_interrupt = NULL;
+ kvm_x86_ops->test_posted_interrupt = NULL;
kvm_x86_ops->sync_pir_to_irr = vmx_sync_pir_to_irr_dummy;
}
@@ -6143,6 +6140,13 @@ static void nested_vmx_failValid(struct kvm_vcpu *vcpu,
*/
}
+static void nested_vmx_abort(struct kvm_vcpu *vcpu, u32 indicator)
+{
+ /* TODO: not to reset guest simply here. */
+ kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
+ pr_warn("kvm: nested vmx abort, indicator %d\n", indicator);
+}
+
static enum hrtimer_restart vmx_preemption_timer_fn(struct hrtimer *timer)
{
struct vcpu_vmx *vmx =
@@ -6960,6 +6964,13 @@ static int handle_invvpid(struct kvm_vcpu *vcpu)
return 1;
}
+static bool vmx_test_pir(struct kvm_vcpu *vcpu, int vector)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+ return pi_test_pir(vector, &vmx->pi_desc);
+}
+
/*
* The exit handlers return 1 if the exit was handled fully and guest execution
* may resume. Otherwise they set the kvm_run parameter to indicate what needs
@@ -7471,9 +7482,6 @@ static void vmx_hwapic_isr_update(struct kvm *kvm, int isr)
u16 status;
u8 old;
- if (!vmx_vm_has_apicv(kvm))
- return;
-
if (isr == -1)
isr = 0;
@@ -8184,9 +8192,9 @@ static unsigned long nested_ept_get_cr3(struct kvm_vcpu *vcpu)
static void nested_ept_init_mmu_context(struct kvm_vcpu *vcpu)
{
- kvm_init_shadow_ept_mmu(vcpu, &vcpu->arch.mmu,
+ WARN_ON(mmu_is_nested(vcpu));
+ kvm_init_shadow_ept_mmu(vcpu,
nested_vmx_ept_caps & VMX_EPT_EXECUTE_ONLY_BIT);
-
vcpu->arch.mmu.set_cr3 = vmx_set_cr3;
vcpu->arch.mmu.get_cr3 = nested_ept_get_cr3;
vcpu->arch.mmu.inject_page_fault = nested_ept_inject_page_fault;
@@ -8199,6 +8207,18 @@ static void nested_ept_uninit_mmu_context(struct kvm_vcpu *vcpu)
vcpu->arch.walk_mmu = &vcpu->arch.mmu;
}
+static bool nested_vmx_is_page_fault_vmexit(struct vmcs12 *vmcs12,
+ u16 error_code)
+{
+ bool inequality, bit;
+
+ bit = (vmcs12->exception_bitmap & (1u << PF_VECTOR)) != 0;
+ inequality =
+ (error_code & vmcs12->page_fault_error_code_mask) !=
+ vmcs12->page_fault_error_code_match;
+ return inequality ^ bit;
+}
+
static void vmx_inject_page_fault_nested(struct kvm_vcpu *vcpu,
struct x86_exception *fault)
{
@@ -8206,8 +8226,7 @@ static void vmx_inject_page_fault_nested(struct kvm_vcpu *vcpu,
WARN_ON(!is_guest_mode(vcpu));
- /* TODO: also check PFEC_MATCH/MASK, not just EB.PF. */
- if (vmcs12->exception_bitmap & (1u << PF_VECTOR))
+ if (nested_vmx_is_page_fault_vmexit(vmcs12, fault->error_code))
nested_vmx_vmexit(vcpu, to_vmx(vcpu)->exit_reason,
vmcs_read32(VM_EXIT_INTR_INFO),
vmcs_readl(EXIT_QUALIFICATION));
@@ -8286,6 +8305,162 @@ static void vmx_start_preemption_timer(struct kvm_vcpu *vcpu)
ns_to_ktime(preemption_timeout), HRTIMER_MODE_REL);
}
+static int nested_vmx_check_msr_switch(struct kvm_vcpu *vcpu,
+ unsigned long count_field,
+ unsigned long addr_field,
+ int maxphyaddr)
+{
+ u64 count, addr;
+
+ if (vmcs12_read_any(vcpu, count_field, &count) ||
+ vmcs12_read_any(vcpu, addr_field, &addr)) {
+ WARN_ON(1);
+ return -EINVAL;
+ }
+ if (count == 0)
+ return 0;
+ if (!IS_ALIGNED(addr, 16) || addr >> maxphyaddr ||
+ (addr + count * sizeof(struct vmx_msr_entry) - 1) >> maxphyaddr) {
+ pr_warn_ratelimited(
+ "nVMX: invalid MSR switch (0x%lx, %d, %llu, 0x%08llx)",
+ addr_field, maxphyaddr, count, addr);
+ return -EINVAL;
+ }
+ return 0;
+}
+
+static int nested_vmx_check_msr_switch_controls(struct kvm_vcpu *vcpu,
+ struct vmcs12 *vmcs12)
+{
+ int maxphyaddr;
+
+ if (vmcs12->vm_exit_msr_load_count == 0 &&
+ vmcs12->vm_exit_msr_store_count == 0 &&
+ vmcs12->vm_entry_msr_load_count == 0)
+ return 0; /* Fast path */
+ maxphyaddr = cpuid_maxphyaddr(vcpu);
+ if (nested_vmx_check_msr_switch(vcpu, VM_EXIT_MSR_LOAD_COUNT,
+ VM_EXIT_MSR_LOAD_ADDR, maxphyaddr) ||
+ nested_vmx_check_msr_switch(vcpu, VM_EXIT_MSR_STORE_COUNT,
+ VM_EXIT_MSR_STORE_ADDR, maxphyaddr) ||
+ nested_vmx_check_msr_switch(vcpu, VM_ENTRY_MSR_LOAD_COUNT,
+ VM_ENTRY_MSR_LOAD_ADDR, maxphyaddr))
+ return -EINVAL;
+ return 0;
+}
+
+static int nested_vmx_msr_check_common(struct kvm_vcpu *vcpu,
+ struct vmx_msr_entry *e)
+{
+ /* x2APIC MSR accesses are not allowed */
+ if (apic_x2apic_mode(vcpu->arch.apic) && e->index >> 8 == 0x8)
+ return -EINVAL;
+ if (e->index == MSR_IA32_UCODE_WRITE || /* SDM Table 35-2 */
+ e->index == MSR_IA32_UCODE_REV)
+ return -EINVAL;
+ if (e->reserved != 0)
+ return -EINVAL;
+ return 0;
+}
+
+static int nested_vmx_load_msr_check(struct kvm_vcpu *vcpu,
+ struct vmx_msr_entry *e)
+{
+ if (e->index == MSR_FS_BASE ||
+ e->index == MSR_GS_BASE ||
+ e->index == MSR_IA32_SMM_MONITOR_CTL || /* SMM is not supported */
+ nested_vmx_msr_check_common(vcpu, e))
+ return -EINVAL;
+ return 0;
+}
+
+static int nested_vmx_store_msr_check(struct kvm_vcpu *vcpu,
+ struct vmx_msr_entry *e)
+{
+ if (e->index == MSR_IA32_SMBASE || /* SMM is not supported */
+ nested_vmx_msr_check_common(vcpu, e))
+ return -EINVAL;
+ return 0;
+}
+
+/*
+ * Load guest's/host's msr at nested entry/exit.
+ * return 0 for success, entry index for failure.
+ */
+static u32 nested_vmx_load_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count)
+{
+ u32 i;
+ struct vmx_msr_entry e;
+ struct msr_data msr;
+
+ msr.host_initiated = false;
+ for (i = 0; i < count; i++) {
+ if (kvm_read_guest(vcpu->kvm, gpa + i * sizeof(e),
+ &e, sizeof(e))) {
+ pr_warn_ratelimited(
+ "%s cannot read MSR entry (%u, 0x%08llx)\n",
+ __func__, i, gpa + i * sizeof(e));
+ goto fail;
+ }
+ if (nested_vmx_load_msr_check(vcpu, &e)) {
+ pr_warn_ratelimited(
+ "%s check failed (%u, 0x%x, 0x%x)\n",
+ __func__, i, e.index, e.reserved);
+ goto fail;
+ }
+ msr.index = e.index;
+ msr.data = e.value;
+ if (kvm_set_msr(vcpu, &msr)) {
+ pr_warn_ratelimited(
+ "%s cannot write MSR (%u, 0x%x, 0x%llx)\n",
+ __func__, i, e.index, e.value);
+ goto fail;
+ }
+ }
+ return 0;
+fail:
+ return i + 1;
+}
+
+static int nested_vmx_store_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count)
+{
+ u32 i;
+ struct vmx_msr_entry e;
+
+ for (i = 0; i < count; i++) {
+ if (kvm_read_guest(vcpu->kvm,
+ gpa + i * sizeof(e),
+ &e, 2 * sizeof(u32))) {
+ pr_warn_ratelimited(
+ "%s cannot read MSR entry (%u, 0x%08llx)\n",
+ __func__, i, gpa + i * sizeof(e));
+ return -EINVAL;
+ }
+ if (nested_vmx_store_msr_check(vcpu, &e)) {
+ pr_warn_ratelimited(
+ "%s check failed (%u, 0x%x, 0x%x)\n",
+ __func__, i, e.index, e.reserved);
+ return -EINVAL;
+ }
+ if (kvm_get_msr(vcpu, e.index, &e.value)) {
+ pr_warn_ratelimited(
+ "%s cannot read MSR (%u, 0x%x)\n",
+ __func__, i, e.index);
+ return -EINVAL;
+ }
+ if (kvm_write_guest(vcpu->kvm,
+ gpa + i * sizeof(e) +
+ offsetof(struct vmx_msr_entry, value),
+ &e.value, sizeof(e.value))) {
+ pr_warn_ratelimited(
+ "%s cannot write MSR (%u, 0x%x, 0x%llx)\n",
+ __func__, i, e.index, e.value);
+ return -EINVAL;
+ }
+ }
+ return 0;
+}
+
/*
* prepare_vmcs02 is called when the L1 guest hypervisor runs its nested
* L2 guest. L1 has a vmcs for L2 (vmcs12), and this function "merges" it
@@ -8582,6 +8757,7 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
int cpu;
struct loaded_vmcs *vmcs02;
bool ia32e;
+ u32 msr_entry_idx;
if (!nested_vmx_check_permission(vcpu) ||
!nested_vmx_check_vmcs12(vcpu))
@@ -8629,11 +8805,7 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
return 1;
}
- if (vmcs12->vm_entry_msr_load_count > 0 ||
- vmcs12->vm_exit_msr_load_count > 0 ||
- vmcs12->vm_exit_msr_store_count > 0) {
- pr_warn_ratelimited("%s: VMCS MSR_{LOAD,STORE} unsupported\n",
- __func__);
+ if (nested_vmx_check_msr_switch_controls(vcpu, vmcs12)) {
nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
return 1;
}
@@ -8739,10 +8911,21 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
vmx_segment_cache_clear(vmx);
- vmcs12->launch_state = 1;
-
prepare_vmcs02(vcpu, vmcs12);
+ msr_entry_idx = nested_vmx_load_msr(vcpu,
+ vmcs12->vm_entry_msr_load_addr,
+ vmcs12->vm_entry_msr_load_count);
+ if (msr_entry_idx) {
+ leave_guest_mode(vcpu);
+ vmx_load_vmcs01(vcpu);
+ nested_vmx_entry_failure(vcpu, vmcs12,
+ EXIT_REASON_MSR_LOAD_FAIL, msr_entry_idx);
+ return 1;
+ }
+
+ vmcs12->launch_state = 1;
+
if (vmcs12->guest_activity_state == GUEST_ACTIVITY_HLT)
return kvm_emulate_halt(vcpu);
@@ -9172,6 +9355,10 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
kvm_set_dr(vcpu, 7, 0x400);
vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
+
+ if (nested_vmx_load_msr(vcpu, vmcs12->vm_exit_msr_load_addr,
+ vmcs12->vm_exit_msr_load_count))
+ nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_MSR_FAIL);
}
/*
@@ -9193,6 +9380,10 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
prepare_vmcs12(vcpu, vmcs12, exit_reason, exit_intr_info,
exit_qualification);
+ if (nested_vmx_store_msr(vcpu, vmcs12->vm_exit_msr_store_addr,
+ vmcs12->vm_exit_msr_store_count))
+ nested_vmx_abort(vcpu, VMX_ABORT_SAVE_GUEST_MSR_FAIL);
+
vmx_load_vmcs01(vcpu);
if ((exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT)
@@ -9374,6 +9565,7 @@ static struct kvm_x86_ops vmx_x86_ops = {
.hwapic_isr_update = vmx_hwapic_isr_update,
.sync_pir_to_irr = vmx_sync_pir_to_irr,
.deliver_posted_interrupt = vmx_deliver_posted_interrupt,
+ .test_posted_interrupt = vmx_test_pir,
.set_tss_addr = vmx_set_tss_addr,
.get_tdp_level = get_ept_level,
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index c259814200bd..917672f8034a 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -108,6 +108,10 @@ EXPORT_SYMBOL_GPL(kvm_max_guest_tsc_khz);
static u32 tsc_tolerance_ppm = 250;
module_param(tsc_tolerance_ppm, uint, S_IRUGO | S_IWUSR);
+/* lapic timer advance (tscdeadline mode only) in nanoseconds */
+unsigned int lapic_timer_advance_ns = 0;
+module_param(lapic_timer_advance_ns, uint, S_IRUGO | S_IWUSR);
+
static bool backwards_tsc_observed = false;
#define KVM_NR_SHARED_MSRS 16
@@ -492,7 +496,7 @@ int kvm_read_guest_page_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
}
EXPORT_SYMBOL_GPL(kvm_read_guest_page_mmu);
-int kvm_read_nested_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn,
+static int kvm_read_nested_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn,
void *data, int offset, int len, u32 access)
{
return kvm_read_guest_page_mmu(vcpu, vcpu->arch.walk_mmu, gfn,
@@ -643,7 +647,7 @@ static void kvm_put_guest_xcr0(struct kvm_vcpu *vcpu)
}
}
-int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)
+static int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)
{
u64 xcr0 = xcr;
u64 old_xcr0 = vcpu->arch.xcr0;
@@ -1083,6 +1087,15 @@ static void update_pvclock_gtod(struct timekeeper *tk)
}
#endif
+void kvm_set_pending_timer(struct kvm_vcpu *vcpu)
+{
+ /*
+ * Note: KVM_REQ_PENDING_TIMER is implicitly checked in
+ * vcpu_enter_guest. This function is only called from
+ * the physical CPU that is running vcpu.
+ */
+ kvm_make_request(KVM_REQ_PENDING_TIMER, vcpu);
+}
static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock)
{
@@ -1180,7 +1193,7 @@ static atomic_t kvm_guest_has_master_clock = ATOMIC_INIT(0);
#endif
static DEFINE_PER_CPU(unsigned long, cpu_tsc_khz);
-unsigned long max_tsc_khz;
+static unsigned long max_tsc_khz;
static inline u64 nsec_to_cycles(struct kvm_vcpu *vcpu, u64 nsec)
{
@@ -1234,7 +1247,7 @@ static u64 compute_guest_tsc(struct kvm_vcpu *vcpu, s64 kernel_ns)
return tsc;
}
-void kvm_track_tsc_matching(struct kvm_vcpu *vcpu)
+static void kvm_track_tsc_matching(struct kvm_vcpu *vcpu)
{
#ifdef CONFIG_X86_64
bool vcpus_matched;
@@ -1529,7 +1542,8 @@ static void pvclock_update_vm_gtod_copy(struct kvm *kvm)
&ka->master_cycle_now);
ka->use_master_clock = host_tsc_clocksource && vcpus_matched
- && !backwards_tsc_observed;
+ && !backwards_tsc_observed
+ && !ka->boot_vcpu_runs_old_kvmclock;
if (ka->use_master_clock)
atomic_set(&kvm_guest_has_master_clock, 1);
@@ -2161,8 +2175,20 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
case MSR_KVM_SYSTEM_TIME_NEW:
case MSR_KVM_SYSTEM_TIME: {
u64 gpa_offset;
+ struct kvm_arch *ka = &vcpu->kvm->arch;
+
kvmclock_reset(vcpu);
+ if (vcpu->vcpu_id == 0 && !msr_info->host_initiated) {
+ bool tmp = (msr == MSR_KVM_SYSTEM_TIME);
+
+ if (ka->boot_vcpu_runs_old_kvmclock != tmp)
+ set_bit(KVM_REQ_MASTERCLOCK_UPDATE,
+ &vcpu->requests);
+
+ ka->boot_vcpu_runs_old_kvmclock = tmp;
+ }
+
vcpu->arch.time = data;
kvm_make_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu);
@@ -2324,6 +2350,7 @@ int kvm_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
{
return kvm_x86_ops->get_msr(vcpu, msr_index, pdata);
}
+EXPORT_SYMBOL_GPL(kvm_get_msr);
static int get_msr_mtrr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
{
@@ -2738,6 +2765,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
case KVM_CAP_READONLY_MEM:
case KVM_CAP_HYPERV_TIME:
case KVM_CAP_IOAPIC_POLARITY_IGNORED:
+ case KVM_CAP_TSC_DEADLINE_TIMER:
#ifdef CONFIG_KVM_DEVICE_ASSIGNMENT
case KVM_CAP_ASSIGN_DEV_IRQ:
case KVM_CAP_PCI_2_3:
@@ -2776,9 +2804,6 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
case KVM_CAP_TSC_CONTROL:
r = kvm_has_tsc_control;
break;
- case KVM_CAP_TSC_DEADLINE_TIMER:
- r = boot_cpu_has(X86_FEATURE_TSC_DEADLINE_TIMER);
- break;
default:
r = 0;
break;
@@ -6311,6 +6336,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
}
trace_kvm_entry(vcpu->vcpu_id);
+ wait_lapic_expire(vcpu);
kvm_x86_ops->run(vcpu);
/*
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index cc1d61af6140..f5fef1868096 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -147,6 +147,7 @@ static inline void kvm_register_writel(struct kvm_vcpu *vcpu,
void kvm_before_handle_nmi(struct kvm_vcpu *vcpu);
void kvm_after_handle_nmi(struct kvm_vcpu *vcpu);
+void kvm_set_pending_timer(struct kvm_vcpu *vcpu);
int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip);
void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr);
@@ -170,5 +171,7 @@ extern u64 kvm_supported_xcr0(void);
extern unsigned int min_timer_period_us;
+extern unsigned int lapic_timer_advance_ns;
+
extern struct static_key kvm_no_apic_vcpu;
#endif
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 1cc6e2e19982..167e8c14b143 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -1593,6 +1593,7 @@ int kvm_write_guest(struct kvm *kvm, gpa_t gpa, const void *data,
}
return 0;
}
+EXPORT_SYMBOL_GPL(kvm_write_guest);
int kvm_gfn_to_hva_cache_init(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
gpa_t gpa, unsigned long len)