diff options
-rw-r--r-- | arch/x86/include/asm/kvm_host.h | 17 | ||||
-rw-r--r-- | arch/x86/include/uapi/asm/msr-index.h | 3 | ||||
-rw-r--r-- | arch/x86/include/uapi/asm/vmx.h | 5 | ||||
-rw-r--r-- | arch/x86/kvm/emulate.c | 144 | ||||
-rw-r--r-- | arch/x86/kvm/lapic.c | 73 | ||||
-rw-r--r-- | arch/x86/kvm/lapic.h | 3 | ||||
-rw-r--r-- | arch/x86/kvm/mmu.c | 112 | ||||
-rw-r--r-- | arch/x86/kvm/mmu.h | 17 | ||||
-rw-r--r-- | arch/x86/kvm/svm.c | 4 | ||||
-rw-r--r-- | arch/x86/kvm/trace.h | 20 | ||||
-rw-r--r-- | arch/x86/kvm/vmx.c | 240 | ||||
-rw-r--r-- | arch/x86/kvm/x86.c | 42 | ||||
-rw-r--r-- | arch/x86/kvm/x86.h | 3 | ||||
-rw-r--r-- | virt/kvm/kvm_main.c | 1 |
14 files changed, 514 insertions, 170 deletions
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index d89c6b828c96..4327af53e544 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -51,7 +51,7 @@ | X86_CR0_NW | X86_CR0_CD | X86_CR0_PG)) #define CR3_L_MODE_RESERVED_BITS 0xFFFFFF0000000000ULL -#define CR3_PCID_INVD (1UL << 63) +#define CR3_PCID_INVD BIT_64(63) #define CR4_RESERVED_BITS \ (~(unsigned long)(X86_CR4_VME | X86_CR4_PVI | X86_CR4_TSD | X86_CR4_DE\ | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_MCE \ @@ -160,6 +160,18 @@ enum { #define DR7_FIXED_1 0x00000400 #define DR7_VOLATILE 0xffff2bff +#define PFERR_PRESENT_BIT 0 +#define PFERR_WRITE_BIT 1 +#define PFERR_USER_BIT 2 +#define PFERR_RSVD_BIT 3 +#define PFERR_FETCH_BIT 4 + +#define PFERR_PRESENT_MASK (1U << PFERR_PRESENT_BIT) +#define PFERR_WRITE_MASK (1U << PFERR_WRITE_BIT) +#define PFERR_USER_MASK (1U << PFERR_USER_BIT) +#define PFERR_RSVD_MASK (1U << PFERR_RSVD_BIT) +#define PFERR_FETCH_MASK (1U << PFERR_FETCH_BIT) + /* apic attention bits */ #define KVM_APIC_CHECK_VAPIC 0 /* @@ -615,6 +627,8 @@ struct kvm_arch { #ifdef CONFIG_KVM_MMU_AUDIT int audit_point; #endif + + bool boot_vcpu_runs_old_kvmclock; }; struct kvm_vm_stat { @@ -753,6 +767,7 @@ struct kvm_x86_ops { void (*set_virtual_x2apic_mode)(struct kvm_vcpu *vcpu, bool set); void (*set_apic_access_page_addr)(struct kvm_vcpu *vcpu, hpa_t hpa); void (*deliver_posted_interrupt)(struct kvm_vcpu *vcpu, int vector); + bool (*test_posted_interrupt)(struct kvm_vcpu *vcpu, int vector); void (*sync_pir_to_irr)(struct kvm_vcpu *vcpu); int (*set_tss_addr)(struct kvm *kvm, unsigned int addr); int (*get_tdp_level)(void); diff --git a/arch/x86/include/uapi/asm/msr-index.h b/arch/x86/include/uapi/asm/msr-index.h index c8aa65d56027..d0050f25ea80 100644 --- a/arch/x86/include/uapi/asm/msr-index.h +++ b/arch/x86/include/uapi/asm/msr-index.h @@ -356,6 +356,9 @@ #define MSR_IA32_UCODE_WRITE 0x00000079 #define MSR_IA32_UCODE_REV 0x0000008b +#define MSR_IA32_SMM_MONITOR_CTL 0x0000009b +#define MSR_IA32_SMBASE 0x0000009e + #define MSR_IA32_PERF_STATUS 0x00000198 #define MSR_IA32_PERF_CTL 0x00000199 #define MSR_AMD_PSTATE_DEF_BASE 0xc0010064 diff --git a/arch/x86/include/uapi/asm/vmx.h b/arch/x86/include/uapi/asm/vmx.h index b813bf9da1e2..ff2b8e28883e 100644 --- a/arch/x86/include/uapi/asm/vmx.h +++ b/arch/x86/include/uapi/asm/vmx.h @@ -56,6 +56,7 @@ #define EXIT_REASON_MSR_READ 31 #define EXIT_REASON_MSR_WRITE 32 #define EXIT_REASON_INVALID_STATE 33 +#define EXIT_REASON_MSR_LOAD_FAIL 34 #define EXIT_REASON_MWAIT_INSTRUCTION 36 #define EXIT_REASON_MONITOR_INSTRUCTION 39 #define EXIT_REASON_PAUSE_INSTRUCTION 40 @@ -116,10 +117,14 @@ { EXIT_REASON_APIC_WRITE, "APIC_WRITE" }, \ { EXIT_REASON_EOI_INDUCED, "EOI_INDUCED" }, \ { EXIT_REASON_INVALID_STATE, "INVALID_STATE" }, \ + { EXIT_REASON_MSR_LOAD_FAIL, "MSR_LOAD_FAIL" }, \ { EXIT_REASON_INVD, "INVD" }, \ { EXIT_REASON_INVVPID, "INVVPID" }, \ { EXIT_REASON_INVPCID, "INVPCID" }, \ { EXIT_REASON_XSAVES, "XSAVES" }, \ { EXIT_REASON_XRSTORS, "XRSTORS" } +#define VMX_ABORT_SAVE_GUEST_MSR_FAIL 1 +#define VMX_ABORT_LOAD_HOST_MSR_FAIL 4 + #endif /* _UAPIVMX_H */ diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 169b09d76ddd..ef23c1e5fa9f 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -86,6 +86,7 @@ #define DstAcc (OpAcc << DstShift) #define DstDI (OpDI << DstShift) #define DstMem64 (OpMem64 << DstShift) +#define DstMem16 (OpMem16 << DstShift) #define DstImmUByte (OpImmUByte << DstShift) #define DstDX (OpDX << DstShift) #define DstAccLo (OpAccLo << DstShift) @@ -169,6 +170,7 @@ #define PrivUD ((u64)1 << 51) /* #UD instead of #GP on CPL > 0 */ #define NearBranch ((u64)1 << 52) /* Near branches */ #define No16 ((u64)1 << 53) /* No 16 bit operand */ +#define IncSP ((u64)1 << 54) /* SP is incremented before ModRM calc */ #define DstXacc (DstAccLo | SrcAccHi | SrcWrite) @@ -262,6 +264,13 @@ struct instr_dual { #define EFLG_RESERVED_ZEROS_MASK 0xffc0802a #define EFLG_RESERVED_ONE_MASK 2 +enum x86_transfer_type { + X86_TRANSFER_NONE, + X86_TRANSFER_CALL_JMP, + X86_TRANSFER_RET, + X86_TRANSFER_TASK_SWITCH, +}; + static ulong reg_read(struct x86_emulate_ctxt *ctxt, unsigned nr) { if (!(ctxt->regs_valid & (1 << nr))) { @@ -1057,8 +1066,6 @@ static int em_fnstcw(struct x86_emulate_ctxt *ctxt) asm volatile("fnstcw %0": "+m"(fcw)); ctxt->ops->put_fpu(ctxt); - /* force 2 byte destination */ - ctxt->dst.bytes = 2; ctxt->dst.val = fcw; return X86EMUL_CONTINUE; @@ -1075,8 +1082,6 @@ static int em_fnstsw(struct x86_emulate_ctxt *ctxt) asm volatile("fnstsw %0": "+m"(fsw)); ctxt->ops->put_fpu(ctxt); - /* force 2 byte destination */ - ctxt->dst.bytes = 2; ctxt->dst.val = fsw; return X86EMUL_CONTINUE; @@ -1223,6 +1228,10 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt, else { modrm_ea += reg_read(ctxt, base_reg); adjust_modrm_seg(ctxt, base_reg); + /* Increment ESP on POP [ESP] */ + if ((ctxt->d & IncSP) && + base_reg == VCPU_REGS_RSP) + modrm_ea += ctxt->op_bytes; } if (index_reg != 4) modrm_ea += reg_read(ctxt, index_reg) << scale; @@ -1435,10 +1444,8 @@ static void get_descriptor_table_ptr(struct x86_emulate_ctxt *ctxt, ops->get_gdt(ctxt, dt); } -/* allowed just for 8 bytes segments */ -static int read_segment_descriptor(struct x86_emulate_ctxt *ctxt, - u16 selector, struct desc_struct *desc, - ulong *desc_addr_p) +static int get_descriptor_ptr(struct x86_emulate_ctxt *ctxt, + u16 selector, ulong *desc_addr_p) { struct desc_ptr dt; u16 index = selector >> 3; @@ -1449,8 +1456,34 @@ static int read_segment_descriptor(struct x86_emulate_ctxt *ctxt, if (dt.size < index * 8 + 7) return emulate_gp(ctxt, selector & 0xfffc); - *desc_addr_p = addr = dt.address + index * 8; - return ctxt->ops->read_std(ctxt, addr, desc, sizeof *desc, + addr = dt.address + index * 8; + +#ifdef CONFIG_X86_64 + if (addr >> 32 != 0) { + u64 efer = 0; + + ctxt->ops->get_msr(ctxt, MSR_EFER, &efer); + if (!(efer & EFER_LMA)) + addr &= (u32)-1; + } +#endif + + *desc_addr_p = addr; + return X86EMUL_CONTINUE; +} + +/* allowed just for 8 bytes segments */ +static int read_segment_descriptor(struct x86_emulate_ctxt *ctxt, + u16 selector, struct desc_struct *desc, + ulong *desc_addr_p) +{ + int rc; + + rc = get_descriptor_ptr(ctxt, selector, desc_addr_p); + if (rc != X86EMUL_CONTINUE) + return rc; + + return ctxt->ops->read_std(ctxt, *desc_addr_p, desc, sizeof(*desc), &ctxt->exception); } @@ -1458,16 +1491,13 @@ static int read_segment_descriptor(struct x86_emulate_ctxt *ctxt, static int write_segment_descriptor(struct x86_emulate_ctxt *ctxt, u16 selector, struct desc_struct *desc) { - struct desc_ptr dt; - u16 index = selector >> 3; + int rc; ulong addr; - get_descriptor_table_ptr(ctxt, selector, &dt); - - if (dt.size < index * 8 + 7) - return emulate_gp(ctxt, selector & 0xfffc); + rc = get_descriptor_ptr(ctxt, selector, &addr); + if (rc != X86EMUL_CONTINUE) + return rc; - addr = dt.address + index * 8; return ctxt->ops->write_std(ctxt, addr, desc, sizeof *desc, &ctxt->exception); } @@ -1475,7 +1505,7 @@ static int write_segment_descriptor(struct x86_emulate_ctxt *ctxt, /* Does not support long mode */ static int __load_segment_descriptor(struct x86_emulate_ctxt *ctxt, u16 selector, int seg, u8 cpl, - bool in_task_switch, + enum x86_transfer_type transfer, struct desc_struct *desc) { struct desc_struct seg_desc, old_desc; @@ -1529,11 +1559,15 @@ static int __load_segment_descriptor(struct x86_emulate_ctxt *ctxt, return ret; err_code = selector & 0xfffc; - err_vec = in_task_switch ? TS_VECTOR : GP_VECTOR; + err_vec = (transfer == X86_TRANSFER_TASK_SWITCH) ? TS_VECTOR : + GP_VECTOR; /* can't load system descriptor into segment selector */ - if (seg <= VCPU_SREG_GS && !seg_desc.s) + if (seg <= VCPU_SREG_GS && !seg_desc.s) { + if (transfer == X86_TRANSFER_CALL_JMP) + return X86EMUL_UNHANDLEABLE; goto exception; + } if (!seg_desc.p) { err_vec = (seg == VCPU_SREG_SS) ? SS_VECTOR : NP_VECTOR; @@ -1605,10 +1639,13 @@ static int __load_segment_descriptor(struct x86_emulate_ctxt *ctxt, if (seg_desc.s) { /* mark segment as accessed */ - seg_desc.type |= 1; - ret = write_segment_descriptor(ctxt, selector, &seg_desc); - if (ret != X86EMUL_CONTINUE) - return ret; + if (!(seg_desc.type & 1)) { + seg_desc.type |= 1; + ret = write_segment_descriptor(ctxt, selector, + &seg_desc); + if (ret != X86EMUL_CONTINUE) + return ret; + } } else if (ctxt->mode == X86EMUL_MODE_PROT64) { ret = ctxt->ops->read_std(ctxt, desc_addr+8, &base3, sizeof(base3), &ctxt->exception); @@ -1631,7 +1668,8 @@ static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt, u16 selector, int seg) { u8 cpl = ctxt->ops->cpl(ctxt); - return __load_segment_descriptor(ctxt, selector, seg, cpl, false, NULL); + return __load_segment_descriptor(ctxt, selector, seg, cpl, + X86_TRANSFER_NONE, NULL); } static void write_register_operand(struct operand *op) @@ -1828,12 +1866,14 @@ static int em_pop_sreg(struct x86_emulate_ctxt *ctxt) unsigned long selector; int rc; - rc = emulate_pop(ctxt, &selector, ctxt->op_bytes); + rc = emulate_pop(ctxt, &selector, 2); if (rc != X86EMUL_CONTINUE) return rc; if (ctxt->modrm_reg == VCPU_SREG_SS) ctxt->interruptibility = KVM_X86_SHADOW_INT_MOV_SS; + if (ctxt->op_bytes > 2) + rsp_increment(ctxt, ctxt->op_bytes - 2); rc = load_segment_descriptor(ctxt, (u16)selector, seg); return rc; @@ -2041,7 +2081,8 @@ static int em_jmp_far(struct x86_emulate_ctxt *ctxt) memcpy(&sel, ctxt->src.valptr + ctxt->op_bytes, 2); - rc = __load_segment_descriptor(ctxt, sel, VCPU_SREG_CS, cpl, false, + rc = __load_segment_descriptor(ctxt, sel, VCPU_SREG_CS, cpl, + X86_TRANSFER_CALL_JMP, &new_desc); if (rc != X86EMUL_CONTINUE) return rc; @@ -2130,7 +2171,8 @@ static int em_ret_far(struct x86_emulate_ctxt *ctxt) /* Outer-privilege level return is not implemented */ if (ctxt->mode >= X86EMUL_MODE_PROT16 && (cs & 3) > cpl) return X86EMUL_UNHANDLEABLE; - rc = __load_segment_descriptor(ctxt, (u16)cs, VCPU_SREG_CS, cpl, false, + rc = __load_segment_descriptor(ctxt, (u16)cs, VCPU_SREG_CS, cpl, + X86_TRANSFER_RET, &new_desc); if (rc != X86EMUL_CONTINUE) return rc; @@ -2567,23 +2609,23 @@ static int load_state_from_tss16(struct x86_emulate_ctxt *ctxt, * it is handled in a context of new task */ ret = __load_segment_descriptor(ctxt, tss->ldt, VCPU_SREG_LDTR, cpl, - true, NULL); + X86_TRANSFER_TASK_SWITCH, NULL); if (ret != X86EMUL_CONTINUE) return ret; ret = __load_segment_descriptor(ctxt, tss->es, VCPU_SREG_ES, cpl, - true, NULL); + X86_TRANSFER_TASK_SWITCH, NULL); if (ret != X86EMUL_CONTINUE) return ret; ret = __load_segment_descriptor(ctxt, tss->cs, VCPU_SREG_CS, cpl, - true, NULL); + X86_TRANSFER_TASK_SWITCH, NULL); if (ret != X86EMUL_CONTINUE) return ret; ret = __load_segment_descriptor(ctxt, tss->ss, VCPU_SREG_SS, cpl, - true, NULL); + X86_TRANSFER_TASK_SWITCH, NULL); if (ret != X86EMUL_CONTINUE) return ret; ret = __load_segment_descriptor(ctxt, tss->ds, VCPU_SREG_DS, cpl, - true, NULL); + X86_TRANSFER_TASK_SWITCH, NULL); if (ret != X86EMUL_CONTINUE) return ret; @@ -2705,31 +2747,31 @@ static int load_state_from_tss32(struct x86_emulate_ctxt *ctxt, * it is handled in a context of new task */ ret = __load_segment_descriptor(ctxt, tss->ldt_selector, VCPU_SREG_LDTR, - cpl, true, NULL); + cpl, X86_TRANSFER_TASK_SWITCH, NULL); if (ret != X86EMUL_CONTINUE) return ret; ret = __load_segment_descriptor(ctxt, tss->es, VCPU_SREG_ES, cpl, - true, NULL); + X86_TRANSFER_TASK_SWITCH, NULL); if (ret != X86EMUL_CONTINUE) return ret; ret = __load_segment_descriptor(ctxt, tss->cs, VCPU_SREG_CS, cpl, - true, NULL); + X86_TRANSFER_TASK_SWITCH, NULL); if (ret != X86EMUL_CONTINUE) return ret; ret = __load_segment_descriptor(ctxt, tss->ss, VCPU_SREG_SS, cpl, - true, NULL); + X86_TRANSFER_TASK_SWITCH, NULL); if (ret != X86EMUL_CONTINUE) return ret; ret = __load_segment_descriptor(ctxt, tss->ds, VCPU_SREG_DS, cpl, - true, NULL); + X86_TRANSFER_TASK_SWITCH, NULL); if (ret != X86EMUL_CONTINUE) return ret; ret = __load_segment_descriptor(ctxt, tss->fs, VCPU_SREG_FS, cpl, - true, NULL); + X86_TRANSFER_TASK_SWITCH, NULL); if (ret != X86EMUL_CONTINUE) return ret; ret = __load_segment_descriptor(ctxt, tss->gs, VCPU_SREG_GS, cpl, - true, NULL); + X86_TRANSFER_TASK_SWITCH, NULL); if (ret != X86EMUL_CONTINUE) return ret; @@ -2750,7 +2792,6 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt, ret = ops->read_std(ctxt, old_tss_base, &tss_seg, sizeof tss_seg, &ctxt->exception); if (ret != X86EMUL_CONTINUE) - /* FIXME: need to provide precise fault address */ return ret; save_state_to_tss32(ctxt, &tss_seg); @@ -2759,13 +2800,11 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt, ret = ops->write_std(ctxt, old_tss_base + eip_offset, &tss_seg.eip, ldt_sel_offset - eip_offset, &ctxt->exception); if (ret != X86EMUL_CONTINUE) - /* FIXME: need to provide precise fault address */ return ret; ret = ops->read_std(ctxt, new_tss_base, &tss_seg, sizeof tss_seg, &ctxt->exception); if (ret != X86EMUL_CONTINUE) - /* FIXME: need to provide precise fault address */ return ret; if (old_tss_sel != 0xffff) { @@ -2776,7 +2815,6 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt, sizeof tss_seg.prev_task_link, &ctxt->exception); if (ret != X86EMUL_CONTINUE) - /* FIXME: need to provide precise fault address */ return ret; } @@ -3015,10 +3053,10 @@ static int em_call_far(struct x86_emulate_ctxt *ctxt) ops->get_segment(ctxt, &old_cs, &old_desc, NULL, VCPU_SREG_CS); memcpy(&sel, ctxt->src.valptr + ctxt->op_bytes, 2); - rc = __load_segment_descriptor(ctxt, sel, VCPU_SREG_CS, cpl, false, - &new_desc); + rc = __load_segment_descriptor(ctxt, sel, VCPU_SREG_CS, cpl, + X86_TRANSFER_CALL_JMP, &new_desc); if (rc != X86EMUL_CONTINUE) - return X86EMUL_CONTINUE; + return rc; rc = assign_eip_far(ctxt, ctxt->src.val, &new_desc); if (rc != X86EMUL_CONTINUE) @@ -3749,7 +3787,7 @@ static const struct opcode group1[] = { }; static const struct opcode group1A[] = { - I(DstMem | SrcNone | Mov | Stack, em_pop), N, N, N, N, N, N, N, + I(DstMem | SrcNone | Mov | Stack | IncSP, em_pop), N, N, N, N, N, N, N, }; static const struct opcode group2[] = { @@ -3865,7 +3903,7 @@ static const struct gprefix pfx_0f_e7 = { }; static const struct escape escape_d9 = { { - N, N, N, N, N, N, N, I(DstMem, em_fnstcw), + N, N, N, N, N, N, N, I(DstMem16 | Mov, em_fnstcw), }, { /* 0xC0 - 0xC7 */ N, N, N, N, N, N, N, N, @@ -3907,7 +3945,7 @@ static const struct escape escape_db = { { } }; static const struct escape escape_dd = { { - N, N, N, N, N, N, N, I(DstMem, em_fnstsw), + N, N, N, N, N, N, N, I(DstMem16 | Mov, em_fnstsw), }, { /* 0xC0 - 0xC7 */ N, N, N, N, N, N, N, N, @@ -4871,8 +4909,12 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt) /* optimisation - avoid slow emulated read if Mov */ rc = segmented_read(ctxt, ctxt->dst.addr.mem, &ctxt->dst.val, ctxt->dst.bytes); - if (rc != X86EMUL_CONTINUE) + if (rc != X86EMUL_CONTINUE) { + if (rc == X86EMUL_PROPAGATE_FAULT && + ctxt->exception.vector == PF_VECTOR) + ctxt->exception.error_code |= PFERR_WRITE_MASK; goto done; + } } ctxt->dst.orig_val = ctxt->dst.val; diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 4f0c0b954686..a688fbffb34e 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -33,6 +33,7 @@ #include <asm/page.h> #include <asm/current.h> #include <asm/apicdef.h> +#include <asm/delay.h> #include <linux/atomic.h> #include <linux/jump_label.h> #include "kvm_cache_regs.h" @@ -402,7 +403,7 @@ static inline void apic_set_isr(int vec, struct kvm_lapic *apic) * because the processor can modify ISR under the hood. Instead * just set SVI. */ - if (unlikely(kvm_apic_vid_enabled(vcpu->kvm))) + if (unlikely(kvm_x86_ops->hwapic_isr_update)) kvm_x86_ops->hwapic_isr_update(vcpu->kvm, vec); else { ++apic->isr_count; @@ -450,7 +451,7 @@ static inline void apic_clear_isr(int vec, struct kvm_lapic *apic) * on the other hand isr_count and highest_isr_cache are unused * and must be left alone. */ - if (unlikely(kvm_apic_vid_enabled(vcpu->kvm))) + if (unlikely(kvm_x86_ops->hwapic_isr_update)) kvm_x86_ops->hwapic_isr_update(vcpu->kvm, apic_find_highest_isr(apic)); else { @@ -1073,25 +1074,72 @@ static void apic_timer_expired(struct kvm_lapic *apic) { struct kvm_vcpu *vcpu = apic->vcpu; wait_queue_head_t *q = &vcpu->wq; + struct kvm_timer *ktimer = &apic->lapic_timer; - /* - * Note: KVM_REQ_PENDING_TIMER is implicitly checked in - * vcpu_enter_guest. - */ if (atomic_read(&apic->lapic_timer.pending)) return; atomic_inc(&apic->lapic_timer.pending); - /* FIXME: this code should not know anything about vcpus */ - kvm_make_request(KVM_REQ_PENDING_TIMER, vcpu); + kvm_set_pending_timer(vcpu); if (waitqueue_active(q)) wake_up_interruptible(q); + + if (apic_lvtt_tscdeadline(apic)) + ktimer->expired_tscdeadline = ktimer->tscdeadline; +} + +/* + * On APICv, this test will cause a busy wait + * during a higher-priority task. + */ + +static bool lapic_timer_int_injected(struct kvm_vcpu *vcpu) +{ + struct kvm_lapic *apic = vcpu->arch.apic; + u32 reg = kvm_apic_get_reg(apic, APIC_LVTT); + + if (kvm_apic_hw_enabled(apic)) { + int vec = reg & APIC_VECTOR_MASK; + + if (kvm_x86_ops->test_posted_interrupt) + return kvm_x86_ops->test_posted_interrupt(vcpu, vec); + else { + if (apic_test_vector(vec, apic->regs + APIC_ISR)) + return true; + } + } + return false; +} + +void wait_lapic_expire(struct kvm_vcpu *vcpu) +{ + struct kvm_lapic *apic = vcpu->arch.apic; + u64 guest_tsc, tsc_deadline; + + if (!kvm_vcpu_has_lapic(vcpu)) + return; + + if (apic->lapic_timer.expired_tscdeadline == 0) + return; + + if (!lapic_timer_int_injected(vcpu)) + return; + + tsc_deadline = apic->lapic_timer.expired_tscdeadline; + apic->lapic_timer.expired_tscdeadline = 0; + guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu, native_read_tsc()); + trace_kvm_wait_lapic_expire(vcpu->vcpu_id, guest_tsc - tsc_deadline); + + /* __delay is delay_tsc whenever the hardware has TSC, thus always. */ + if (guest_tsc < tsc_deadline) + __delay(tsc_deadline - guest_tsc); } static void start_apic_timer(struct kvm_lapic *apic) { ktime_t now; + atomic_set(&apic->lapic_timer.pending, 0); if (apic_lvtt_period(apic) || apic_lvtt_oneshot(apic)) { @@ -1137,6 +1185,7 @@ static void start_apic_timer(struct kvm_lapic *apic) /* lapic timer in tsc deadline mode */ u64 guest_tsc, tscdeadline = apic->lapic_timer.tscdeadline; u64 ns = 0; + ktime_t expire; struct kvm_vcpu *vcpu = apic->vcpu; unsigned long this_tsc_khz = vcpu->arch.virtual_tsc_khz; unsigned long flags; @@ -1151,8 +1200,10 @@ static void start_apic_timer(struct kvm_lapic *apic) if (likely(tscdeadline > guest_tsc)) { ns = (tscdeadline - guest_tsc) * 1000000ULL; do_div(ns, this_tsc_khz); + expire = ktime_add_ns(now, ns); + expire = ktime_sub_ns(expire, lapic_timer_advance_ns); hrtimer_start(&apic->lapic_timer.timer, - ktime_add_ns(now, ns), HRTIMER_MODE_ABS); + expire, HRTIMER_MODE_ABS); } else apic_timer_expired(apic); @@ -1742,7 +1793,9 @@ void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu, if (kvm_x86_ops->hwapic_irr_update) kvm_x86_ops->hwapic_irr_update(vcpu, apic_find_highest_irr(apic)); - kvm_x86_ops->hwapic_isr_update(vcpu->kvm, apic_find_highest_isr(apic)); + if (unlikely(kvm_x86_ops->hwapic_isr_update)) + kvm_x86_ops->hwapic_isr_update(vcpu->kvm, + apic_find_highest_isr(apic)); kvm_make_request(KVM_REQ_EVENT, vcpu); kvm_rtc_eoi_tracking_restore_one(vcpu); } diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index c674fce53cf9..7054437944cd 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -14,6 +14,7 @@ struct kvm_timer { u32 timer_mode; u32 timer_mode_mask; u64 tscdeadline; + u64 expired_tscdeadline; atomic_t pending; /* accumulated triggered timers */ }; @@ -170,4 +171,6 @@ static inline bool kvm_apic_has_events(struct kvm_vcpu *vcpu) bool kvm_apic_pending_eoi(struct kvm_vcpu *vcpu, int vector); +void wait_lapic_expire(struct kvm_vcpu *vcpu); + #endif diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index f83fc6c5e0ba..97898abe8386 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -63,30 +63,16 @@ enum { #undef MMU_DEBUG #ifdef MMU_DEBUG +static bool dbg = 0; +module_param(dbg, bool, 0644); #define pgprintk(x...) do { if (dbg) printk(x); } while (0) #define rmap_printk(x...) do { if (dbg) printk(x); } while (0) - +#define MMU_WARN_ON(x) WARN_ON(x) #else - #define pgprintk(x...) do { } while (0) #define rmap_printk(x...) do { } while (0) - -#endif - -#ifdef MMU_DEBUG -static bool dbg = 0; -module_param(dbg, bool, 0644); -#endif - -#ifndef MMU_DEBUG -#define ASSERT(x) do { } while (0) -#else -#define ASSERT(x) \ - if (!(x)) { \ - printk(KERN_WARNING "assertion failed %s:%d: %s\n", \ - __FILE__, __LINE__, #x); \ - } +#define MMU_WARN_ON(x) do { } while (0) #endif #define PTE_PREFETCH_NUM 8 @@ -546,6 +532,11 @@ static bool spte_is_bit_cleared(u64 old_spte, u64 new_spte, u64 bit_mask) return (old_spte & bit_mask) && !(new_spte & bit_mask); } +static bool spte_is_bit_changed(u64 old_spte, u64 new_spte, u64 bit_mask) +{ + return (old_spte & bit_mask) != (new_spte & bit_mask); +} + /* Rules for using mmu_spte_set: * Set the sptep from nonpresent to present. * Note: the sptep being assigned *must* be either not present @@ -596,6 +587,14 @@ static bool mmu_spte_update(u64 *sptep, u64 new_spte) if (!shadow_accessed_mask) return ret; + /* + * Flush TLB when accessed/dirty bits are changed in the page tables, + * to guarantee consistency between TLB and page tables. + */ + if (spte_is_bit_changed(old_spte, new_spte, + shadow_accessed_mask | shadow_dirty_mask)) + ret = true; + if (spte_is_bit_cleared(old_spte, new_spte, shadow_accessed_mask)) kvm_set_pfn_accessed(spte_to_pfn(old_spte)); if (spte_is_bit_cleared(old_spte, new_spte, shadow_dirty_mask)) @@ -1536,7 +1535,7 @@ static inline void kvm_mod_used_mmu_pages(struct kvm *kvm, int nr) static void kvm_mmu_free_page(struct kvm_mmu_page *sp) { - ASSERT(is_empty_shadow_page(sp->spt)); + MMU_WARN_ON(!is_empty_shadow_page(sp->spt)); hlist_del(&sp->hash_link); list_del(&sp->link); free_page((unsigned long)sp->spt); @@ -3041,7 +3040,7 @@ static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu) for (i = 0; i < 4; ++i) { hpa_t root = vcpu->arch.mmu.pae_root[i]; - ASSERT(!VALID_PAGE(root)); + MMU_WARN_ON(VALID_PAGE(root)); spin_lock(&vcpu->kvm->mmu_lock); make_mmu_pages_available(vcpu); sp = kvm_mmu_get_page(vcpu, i << (30 - PAGE_SHIFT), @@ -3079,7 +3078,7 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu) if (vcpu->arch.mmu.root_level == PT64_ROOT_LEVEL) { hpa_t root = vcpu->arch.mmu.root_hpa; - ASSERT(!VALID_PAGE(root)); + MMU_WARN_ON(VALID_PAGE(root)); spin_lock(&vcpu->kvm->mmu_lock); make_mmu_pages_available(vcpu); @@ -3104,7 +3103,7 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu) for (i = 0; i < 4; ++i) { hpa_t root = vcpu->arch.mmu.pae_root[i]; - ASSERT(!VALID_PAGE(root)); + MMU_WARN_ON(VALID_PAGE(root)); if (vcpu->arch.mmu.root_level == PT32E_ROOT_LEVEL) { pdptr = vcpu->arch.mmu.get_pdptr(vcpu, i); if (!is_present_gpte(pdptr)) { @@ -3329,8 +3328,7 @@ static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva, if (r) return r; - ASSERT(vcpu); - ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa)); + MMU_WARN_ON(!VALID_PAGE(vcpu->arch.mmu.root_hpa)); gfn = gva >> PAGE_SHIFT; @@ -3396,8 +3394,7 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code, int write = error_code & PFERR_WRITE_MASK; bool map_writable; - ASSERT(vcpu); - ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa)); + MMU_WARN_ON(!VALID_PAGE(vcpu->arch.mmu.root_hpa)); if (unlikely(error_code & PFERR_RSVD_MASK)) { r = handle_mmio_page_fault(vcpu, gpa, error_code, true); @@ -3718,7 +3715,7 @@ static void paging64_init_context_common(struct kvm_vcpu *vcpu, update_permission_bitmask(vcpu, context, false); update_last_pte_bitmap(vcpu, context); - ASSERT(is_pae(vcpu)); + MMU_WARN_ON(!is_pae(vcpu)); context->page_fault = paging64_page_fault; context->gva_to_gpa = paging64_gva_to_gpa; context->sync_page = paging64_sync_page; @@ -3763,7 +3760,7 @@ static void paging32E_init_context(struct kvm_vcpu *vcpu, static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu) { - struct kvm_mmu *context = vcpu->arch.walk_mmu; + struct kvm_mmu *context = &vcpu->arch.mmu; context->base_role.word = 0; context->page_fault = tdp_page_fault; @@ -3803,11 +3800,12 @@ static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu) update_last_pte_bitmap(vcpu, context); } -void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context) +void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu) { bool smep = kvm_read_cr4_bits(vcpu, X86_CR4_SMEP); - ASSERT(vcpu); - ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa)); + struct kvm_mmu *context = &vcpu->arch.mmu; + + MMU_WARN_ON(VALID_PAGE(context->root_hpa)); if (!is_paging(vcpu)) nonpaging_init_context(vcpu, context); @@ -3818,19 +3816,19 @@ void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context) else paging32_init_context(vcpu, context); - vcpu->arch.mmu.base_role.nxe = is_nx(vcpu); - vcpu->arch.mmu.base_role.cr4_pae = !!is_pae(vcpu); - vcpu->arch.mmu.base_role.cr0_wp = is_write_protection(vcpu); - vcpu->arch.mmu.base_role.smep_andnot_wp + context->base_role.nxe = is_nx(vcpu); + context->base_role.cr4_pae = !!is_pae(vcpu); + context->base_role.cr0_wp = is_write_protection(vcpu); + context->base_role.smep_andnot_wp = smep && !is_write_protection(vcpu); } EXPORT_SYMBOL_GPL(kvm_init_shadow_mmu); -void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context, - bool execonly) +void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly) { - ASSERT(vcpu); - ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa)); + struct kvm_mmu *context = &vcpu->arch.mmu; + + MMU_WARN_ON(VALID_PAGE(context->root_hpa)); context->shadow_root_level = kvm_x86_ops->get_tdp_level(); @@ -3851,11 +3849,13 @@ EXPORT_SYMBOL_GPL(kvm_init_shadow_ept_mmu); static void init_kvm_softmmu(struct kvm_vcpu *vcpu) { - kvm_init_shadow_mmu(vcpu, vcpu->arch.walk_mmu); - vcpu->arch.walk_mmu->set_cr3 = kvm_x86_ops->set_cr3; - vcpu->arch.walk_mmu->get_cr3 = get_cr3; - vcpu->arch.walk_mmu->get_pdptr = kvm_pdptr_read; - vcpu->arch.walk_mmu->inject_page_fault = kvm_inject_page_fault; + struct kvm_mmu *context = &vcpu->arch.mmu; + + kvm_init_shadow_mmu(vcpu); + context->set_cr3 = kvm_x86_ops->set_cr3; + context->get_cr3 = get_cr3; + context->get_pdptr = kvm_pdptr_read; + context->inject_page_fault = kvm_inject_page_fault; } static void init_kvm_nested_mmu(struct kvm_vcpu *vcpu) @@ -3900,17 +3900,15 @@ static void init_kvm_nested_mmu(struct kvm_vcpu *vcpu) static void init_kvm_mmu(struct kvm_vcpu *vcpu) { if (mmu_is_nested(vcpu)) - return init_kvm_nested_mmu(vcpu); + init_kvm_nested_mmu(vcpu); else if (tdp_enabled) - return init_kvm_tdp_mmu(vcpu); + init_kvm_tdp_mmu(vcpu); else - return init_kvm_softmmu(vcpu); + init_kvm_softmmu(vcpu); } void kvm_mmu_reset_context(struct kvm_vcpu *vcpu) { - ASSERT(vcpu); - kvm_mmu_unload(vcpu); init_kvm_mmu(vcpu); } @@ -4266,8 +4264,6 @@ static int alloc_mmu_pages(struct kvm_vcpu *vcpu) struct page *page; int i; - ASSERT(vcpu); - /* * When emulating 32-bit mode, cr3 is only 32 bits even on x86_64. * Therefore we need to allocate shadow page tables in the first @@ -4286,8 +4282,6 @@ static int alloc_mmu_pages(struct kvm_vcpu *vcpu) int kvm_mmu_create(struct kvm_vcpu *vcpu) { - ASSERT(vcpu); - vcpu->arch.walk_mmu = &vcpu->arch.mmu; vcpu->arch.mmu.root_hpa = INVALID_PAGE; vcpu->arch.mmu.translate_gpa = translate_gpa; @@ -4298,8 +4292,7 @@ int kvm_mmu_create(struct kvm_vcpu *vcpu) void kvm_mmu_setup(struct kvm_vcpu *vcpu) { - ASSERT(vcpu); - ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa)); + MMU_WARN_ON(VALID_PAGE(vcpu->arch.mmu.root_hpa)); init_kvm_mmu(vcpu); } @@ -4309,6 +4302,7 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot) struct kvm_memory_slot *memslot; gfn_t last_gfn; int i; + bool flush = false; memslot = id_to_memslot(kvm->memslots, slot); last_gfn = memslot->base_gfn + memslot->npages - 1; @@ -4325,7 +4319,8 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot) for (index = 0; index <= last_index; ++index, ++rmapp) { if (*rmapp) - __rmap_write_protect(kvm, rmapp, false); + flush |= __rmap_write_protect(kvm, rmapp, + false); if (need_resched() || spin_needbreak(&kvm->mmu_lock)) cond_resched_lock(&kvm->mmu_lock); @@ -4352,7 +4347,8 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot) * instead of PT_WRITABLE_MASK, that means it does not depend * on PT_WRITABLE_MASK anymore. */ - kvm_flush_remote_tlbs(kvm); + if (flush) + kvm_flush_remote_tlbs(kvm); } #define BATCH_ZAP_PAGES 10 @@ -4606,8 +4602,6 @@ EXPORT_SYMBOL_GPL(kvm_mmu_get_spte_hierarchy); void kvm_mmu_destroy(struct kvm_vcpu *vcpu) { - ASSERT(vcpu); - kvm_mmu_unload(vcpu); free_mmu_pages(vcpu); mmu_free_memory_caches(vcpu); diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index bde8ee725754..c7d65637c851 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -44,18 +44,6 @@ #define PT_DIRECTORY_LEVEL 2 #define PT_PAGE_TABLE_LEVEL 1 -#define PFERR_PRESENT_BIT 0 -#define PFERR_WRITE_BIT 1 -#define PFERR_USER_BIT 2 -#define PFERR_RSVD_BIT 3 -#define PFERR_FETCH_BIT 4 - -#define PFERR_PRESENT_MASK (1U << PFERR_PRESENT_BIT) -#define PFERR_WRITE_MASK (1U << PFERR_WRITE_BIT) -#define PFERR_USER_MASK (1U << PFERR_USER_BIT) -#define PFERR_RSVD_MASK (1U << PFERR_RSVD_BIT) -#define PFERR_FETCH_MASK (1U << PFERR_FETCH_BIT) - static inline u64 rsvd_bits(int s, int e) { return ((1ULL << (e - s + 1)) - 1) << s; @@ -81,9 +69,8 @@ enum { }; int handle_mmio_page_fault_common(struct kvm_vcpu *vcpu, u64 addr, bool direct); -void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context); -void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context, - bool execonly); +void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu); +void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly); void update_permission_bitmask(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, bool ept); diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 41dd0387cccb..a17d848c6d42 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -2003,8 +2003,8 @@ static void nested_svm_inject_npf_exit(struct kvm_vcpu *vcpu, static void nested_svm_init_mmu_context(struct kvm_vcpu *vcpu) { - kvm_init_shadow_mmu(vcpu, &vcpu->arch.mmu); - + WARN_ON(mmu_is_nested(vcpu)); + kvm_init_shadow_mmu(vcpu); vcpu->arch.mmu.set_cr3 = nested_svm_set_tdp_cr3; vcpu->arch.mmu.get_cr3 = nested_svm_get_tdp_cr3; vcpu->arch.mmu.get_pdptr = nested_svm_get_tdp_pdptr; diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h index c2a34bb5ad93..587149bd6f76 100644 --- a/arch/x86/kvm/trace.h +++ b/arch/x86/kvm/trace.h @@ -914,6 +914,26 @@ TRACE_EVENT(kvm_pvclock_update, __entry->flags) ); +TRACE_EVENT(kvm_wait_lapic_expire, + TP_PROTO(unsigned int vcpu_id, s64 delta), + TP_ARGS(vcpu_id, delta), + + TP_STRUCT__entry( + __field( unsigned int, vcpu_id ) + __field( s64, delta ) + ), + + TP_fast_assign( + __entry->vcpu_id = vcpu_id; + __entry->delta = delta; + ), + + TP_printk("vcpu %u: delta %lld (%s)", + __entry->vcpu_id, + __entry->delta, + __entry->delta < 0 ? "early" : "late") +); + #endif /* _TRACE_KVM_H */ #undef TRACE_INCLUDE_PATH diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index d4c58d884838..c987374d92c1 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -435,6 +435,11 @@ static int pi_test_and_set_pir(int vector, struct pi_desc *pi_desc) return test_and_set_bit(vector, (unsigned long *)pi_desc->pir); } +static int pi_test_pir(int vector, struct pi_desc *pi_desc) +{ + return test_bit(vector, (unsigned long *)pi_desc->pir); +} + struct vcpu_vmx { struct kvm_vcpu vcpu; unsigned long host_rsp; @@ -959,16 +964,6 @@ static inline bool cpu_has_vmx_ept_execute_only(void) return vmx_capability.ept & VMX_EPT_EXECUTE_ONLY_BIT; } -static inline bool cpu_has_vmx_eptp_uncacheable(void) -{ - return vmx_capability.ept & VMX_EPTP_UC_BIT; -} - -static inline bool cpu_has_vmx_eptp_writeback(void) -{ - return vmx_capability.ept & VMX_EPTP_WB_BIT; -} - static inline bool cpu_has_vmx_ept_2m_page(void) { return vmx_capability.ept & VMX_EPT_2MB_PAGE_BIT; @@ -5895,7 +5890,9 @@ static __init int hardware_setup(void) kvm_x86_ops->update_cr8_intercept = NULL; else { kvm_x86_ops->hwapic_irr_update = NULL; + kvm_x86_ops->hwapic_isr_update = NULL; kvm_x86_ops->deliver_posted_interrupt = NULL; + kvm_x86_ops->test_posted_interrupt = NULL; kvm_x86_ops->sync_pir_to_irr = vmx_sync_pir_to_irr_dummy; } @@ -6143,6 +6140,13 @@ static void nested_vmx_failValid(struct kvm_vcpu *vcpu, */ } +static void nested_vmx_abort(struct kvm_vcpu *vcpu, u32 indicator) +{ + /* TODO: not to reset guest simply here. */ + kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); + pr_warn("kvm: nested vmx abort, indicator %d\n", indicator); +} + static enum hrtimer_restart vmx_preemption_timer_fn(struct hrtimer *timer) { struct vcpu_vmx *vmx = @@ -6960,6 +6964,13 @@ static int handle_invvpid(struct kvm_vcpu *vcpu) return 1; } +static bool vmx_test_pir(struct kvm_vcpu *vcpu, int vector) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + + return pi_test_pir(vector, &vmx->pi_desc); +} + /* * The exit handlers return 1 if the exit was handled fully and guest execution * may resume. Otherwise they set the kvm_run parameter to indicate what needs @@ -7471,9 +7482,6 @@ static void vmx_hwapic_isr_update(struct kvm *kvm, int isr) u16 status; u8 old; - if (!vmx_vm_has_apicv(kvm)) - return; - if (isr == -1) isr = 0; @@ -8184,9 +8192,9 @@ static unsigned long nested_ept_get_cr3(struct kvm_vcpu *vcpu) static void nested_ept_init_mmu_context(struct kvm_vcpu *vcpu) { - kvm_init_shadow_ept_mmu(vcpu, &vcpu->arch.mmu, + WARN_ON(mmu_is_nested(vcpu)); + kvm_init_shadow_ept_mmu(vcpu, nested_vmx_ept_caps & VMX_EPT_EXECUTE_ONLY_BIT); - vcpu->arch.mmu.set_cr3 = vmx_set_cr3; vcpu->arch.mmu.get_cr3 = nested_ept_get_cr3; vcpu->arch.mmu.inject_page_fault = nested_ept_inject_page_fault; @@ -8199,6 +8207,18 @@ static void nested_ept_uninit_mmu_context(struct kvm_vcpu *vcpu) vcpu->arch.walk_mmu = &vcpu->arch.mmu; } +static bool nested_vmx_is_page_fault_vmexit(struct vmcs12 *vmcs12, + u16 error_code) +{ + bool inequality, bit; + + bit = (vmcs12->exception_bitmap & (1u << PF_VECTOR)) != 0; + inequality = + (error_code & vmcs12->page_fault_error_code_mask) != + vmcs12->page_fault_error_code_match; + return inequality ^ bit; +} + static void vmx_inject_page_fault_nested(struct kvm_vcpu *vcpu, struct x86_exception *fault) { @@ -8206,8 +8226,7 @@ static void vmx_inject_page_fault_nested(struct kvm_vcpu *vcpu, WARN_ON(!is_guest_mode(vcpu)); - /* TODO: also check PFEC_MATCH/MASK, not just EB.PF. */ - if (vmcs12->exception_bitmap & (1u << PF_VECTOR)) + if (nested_vmx_is_page_fault_vmexit(vmcs12, fault->error_code)) nested_vmx_vmexit(vcpu, to_vmx(vcpu)->exit_reason, vmcs_read32(VM_EXIT_INTR_INFO), vmcs_readl(EXIT_QUALIFICATION)); @@ -8286,6 +8305,162 @@ static void vmx_start_preemption_timer(struct kvm_vcpu *vcpu) ns_to_ktime(preemption_timeout), HRTIMER_MODE_REL); } +static int nested_vmx_check_msr_switch(struct kvm_vcpu *vcpu, + unsigned long count_field, + unsigned long addr_field, + int maxphyaddr) +{ + u64 count, addr; + + if (vmcs12_read_any(vcpu, count_field, &count) || + vmcs12_read_any(vcpu, addr_field, &addr)) { + WARN_ON(1); + return -EINVAL; + } + if (count == 0) + return 0; + if (!IS_ALIGNED(addr, 16) || addr >> maxphyaddr || + (addr + count * sizeof(struct vmx_msr_entry) - 1) >> maxphyaddr) { + pr_warn_ratelimited( + "nVMX: invalid MSR switch (0x%lx, %d, %llu, 0x%08llx)", + addr_field, maxphyaddr, count, addr); + return -EINVAL; + } + return 0; +} + +static int nested_vmx_check_msr_switch_controls(struct kvm_vcpu *vcpu, + struct vmcs12 *vmcs12) +{ + int maxphyaddr; + + if (vmcs12->vm_exit_msr_load_count == 0 && + vmcs12->vm_exit_msr_store_count == 0 && + vmcs12->vm_entry_msr_load_count == 0) + return 0; /* Fast path */ + maxphyaddr = cpuid_maxphyaddr(vcpu); + if (nested_vmx_check_msr_switch(vcpu, VM_EXIT_MSR_LOAD_COUNT, + VM_EXIT_MSR_LOAD_ADDR, maxphyaddr) || + nested_vmx_check_msr_switch(vcpu, VM_EXIT_MSR_STORE_COUNT, + VM_EXIT_MSR_STORE_ADDR, maxphyaddr) || + nested_vmx_check_msr_switch(vcpu, VM_ENTRY_MSR_LOAD_COUNT, + VM_ENTRY_MSR_LOAD_ADDR, maxphyaddr)) + return -EINVAL; + return 0; +} + +static int nested_vmx_msr_check_common(struct kvm_vcpu *vcpu, + struct vmx_msr_entry *e) +{ + /* x2APIC MSR accesses are not allowed */ + if (apic_x2apic_mode(vcpu->arch.apic) && e->index >> 8 == 0x8) + return -EINVAL; + if (e->index == MSR_IA32_UCODE_WRITE || /* SDM Table 35-2 */ + e->index == MSR_IA32_UCODE_REV) + return -EINVAL; + if (e->reserved != 0) + return -EINVAL; + return 0; +} + +static int nested_vmx_load_msr_check(struct kvm_vcpu *vcpu, + struct vmx_msr_entry *e) +{ + if (e->index == MSR_FS_BASE || + e->index == MSR_GS_BASE || + e->index == MSR_IA32_SMM_MONITOR_CTL || /* SMM is not supported */ + nested_vmx_msr_check_common(vcpu, e)) + return -EINVAL; + return 0; +} + +static int nested_vmx_store_msr_check(struct kvm_vcpu *vcpu, + struct vmx_msr_entry *e) +{ + if (e->index == MSR_IA32_SMBASE || /* SMM is not supported */ + nested_vmx_msr_check_common(vcpu, e)) + return -EINVAL; + return 0; +} + +/* + * Load guest's/host's msr at nested entry/exit. + * return 0 for success, entry index for failure. + */ +static u32 nested_vmx_load_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count) +{ + u32 i; + struct vmx_msr_entry e; + struct msr_data msr; + + msr.host_initiated = false; + for (i = 0; i < count; i++) { + if (kvm_read_guest(vcpu->kvm, gpa + i * sizeof(e), + &e, sizeof(e))) { + pr_warn_ratelimited( + "%s cannot read MSR entry (%u, 0x%08llx)\n", + __func__, i, gpa + i * sizeof(e)); + goto fail; + } + if (nested_vmx_load_msr_check(vcpu, &e)) { + pr_warn_ratelimited( + "%s check failed (%u, 0x%x, 0x%x)\n", + __func__, i, e.index, e.reserved); + goto fail; + } + msr.index = e.index; + msr.data = e.value; + if (kvm_set_msr(vcpu, &msr)) { + pr_warn_ratelimited( + "%s cannot write MSR (%u, 0x%x, 0x%llx)\n", + __func__, i, e.index, e.value); + goto fail; + } + } + return 0; +fail: + return i + 1; +} + +static int nested_vmx_store_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count) +{ + u32 i; + struct vmx_msr_entry e; + + for (i = 0; i < count; i++) { + if (kvm_read_guest(vcpu->kvm, + gpa + i * sizeof(e), + &e, 2 * sizeof(u32))) { + pr_warn_ratelimited( + "%s cannot read MSR entry (%u, 0x%08llx)\n", + __func__, i, gpa + i * sizeof(e)); + return -EINVAL; + } + if (nested_vmx_store_msr_check(vcpu, &e)) { + pr_warn_ratelimited( + "%s check failed (%u, 0x%x, 0x%x)\n", + __func__, i, e.index, e.reserved); + return -EINVAL; + } + if (kvm_get_msr(vcpu, e.index, &e.value)) { + pr_warn_ratelimited( + "%s cannot read MSR (%u, 0x%x)\n", + __func__, i, e.index); + return -EINVAL; + } + if (kvm_write_guest(vcpu->kvm, + gpa + i * sizeof(e) + + offsetof(struct vmx_msr_entry, value), + &e.value, sizeof(e.value))) { + pr_warn_ratelimited( + "%s cannot write MSR (%u, 0x%x, 0x%llx)\n", + __func__, i, e.index, e.value); + return -EINVAL; + } + } + return 0; +} + /* * prepare_vmcs02 is called when the L1 guest hypervisor runs its nested * L2 guest. L1 has a vmcs for L2 (vmcs12), and this function "merges" it @@ -8582,6 +8757,7 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) int cpu; struct loaded_vmcs *vmcs02; bool ia32e; + u32 msr_entry_idx; if (!nested_vmx_check_permission(vcpu) || !nested_vmx_check_vmcs12(vcpu)) @@ -8629,11 +8805,7 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) return 1; } - if (vmcs12->vm_entry_msr_load_count > 0 || - vmcs12->vm_exit_msr_load_count > 0 || - vmcs12->vm_exit_msr_store_count > 0) { - pr_warn_ratelimited("%s: VMCS MSR_{LOAD,STORE} unsupported\n", - __func__); + if (nested_vmx_check_msr_switch_controls(vcpu, vmcs12)) { nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); return 1; } @@ -8739,10 +8911,21 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) vmx_segment_cache_clear(vmx); - vmcs12->launch_state = 1; - prepare_vmcs02(vcpu, vmcs12); + msr_entry_idx = nested_vmx_load_msr(vcpu, + vmcs12->vm_entry_msr_load_addr, + vmcs12->vm_entry_msr_load_count); + if (msr_entry_idx) { + leave_guest_mode(vcpu); + vmx_load_vmcs01(vcpu); + nested_vmx_entry_failure(vcpu, vmcs12, + EXIT_REASON_MSR_LOAD_FAIL, msr_entry_idx); + return 1; + } + + vmcs12->launch_state = 1; + if (vmcs12->guest_activity_state == GUEST_ACTIVITY_HLT) return kvm_emulate_halt(vcpu); @@ -9172,6 +9355,10 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, kvm_set_dr(vcpu, 7, 0x400); vmcs_write64(GUEST_IA32_DEBUGCTL, 0); + + if (nested_vmx_load_msr(vcpu, vmcs12->vm_exit_msr_load_addr, + vmcs12->vm_exit_msr_load_count)) + nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_MSR_FAIL); } /* @@ -9193,6 +9380,10 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, prepare_vmcs12(vcpu, vmcs12, exit_reason, exit_intr_info, exit_qualification); + if (nested_vmx_store_msr(vcpu, vmcs12->vm_exit_msr_store_addr, + vmcs12->vm_exit_msr_store_count)) + nested_vmx_abort(vcpu, VMX_ABORT_SAVE_GUEST_MSR_FAIL); + vmx_load_vmcs01(vcpu); if ((exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT) @@ -9374,6 +9565,7 @@ static struct kvm_x86_ops vmx_x86_ops = { .hwapic_isr_update = vmx_hwapic_isr_update, .sync_pir_to_irr = vmx_sync_pir_to_irr, .deliver_posted_interrupt = vmx_deliver_posted_interrupt, + .test_posted_interrupt = vmx_test_pir, .set_tss_addr = vmx_set_tss_addr, .get_tdp_level = get_ept_level, diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index c259814200bd..917672f8034a 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -108,6 +108,10 @@ EXPORT_SYMBOL_GPL(kvm_max_guest_tsc_khz); static u32 tsc_tolerance_ppm = 250; module_param(tsc_tolerance_ppm, uint, S_IRUGO | S_IWUSR); +/* lapic timer advance (tscdeadline mode only) in nanoseconds */ +unsigned int lapic_timer_advance_ns = 0; +module_param(lapic_timer_advance_ns, uint, S_IRUGO | S_IWUSR); + static bool backwards_tsc_observed = false; #define KVM_NR_SHARED_MSRS 16 @@ -492,7 +496,7 @@ int kvm_read_guest_page_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, } EXPORT_SYMBOL_GPL(kvm_read_guest_page_mmu); -int kvm_read_nested_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn, +static int kvm_read_nested_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn, void *data, int offset, int len, u32 access) { return kvm_read_guest_page_mmu(vcpu, vcpu->arch.walk_mmu, gfn, @@ -643,7 +647,7 @@ static void kvm_put_guest_xcr0(struct kvm_vcpu *vcpu) } } -int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr) +static int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr) { u64 xcr0 = xcr; u64 old_xcr0 = vcpu->arch.xcr0; @@ -1083,6 +1087,15 @@ static void update_pvclock_gtod(struct timekeeper *tk) } #endif +void kvm_set_pending_timer(struct kvm_vcpu *vcpu) +{ + /* + * Note: KVM_REQ_PENDING_TIMER is implicitly checked in + * vcpu_enter_guest. This function is only called from + * the physical CPU that is running vcpu. + */ + kvm_make_request(KVM_REQ_PENDING_TIMER, vcpu); +} static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock) { @@ -1180,7 +1193,7 @@ static atomic_t kvm_guest_has_master_clock = ATOMIC_INIT(0); #endif static DEFINE_PER_CPU(unsigned long, cpu_tsc_khz); -unsigned long max_tsc_khz; +static unsigned long max_tsc_khz; static inline u64 nsec_to_cycles(struct kvm_vcpu *vcpu, u64 nsec) { @@ -1234,7 +1247,7 @@ static u64 compute_guest_tsc(struct kvm_vcpu *vcpu, s64 kernel_ns) return tsc; } -void kvm_track_tsc_matching(struct kvm_vcpu *vcpu) +static void kvm_track_tsc_matching(struct kvm_vcpu *vcpu) { #ifdef CONFIG_X86_64 bool vcpus_matched; @@ -1529,7 +1542,8 @@ static void pvclock_update_vm_gtod_copy(struct kvm *kvm) &ka->master_cycle_now); ka->use_master_clock = host_tsc_clocksource && vcpus_matched - && !backwards_tsc_observed; + && !backwards_tsc_observed + && !ka->boot_vcpu_runs_old_kvmclock; if (ka->use_master_clock) atomic_set(&kvm_guest_has_master_clock, 1); @@ -2161,8 +2175,20 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_KVM_SYSTEM_TIME_NEW: case MSR_KVM_SYSTEM_TIME: { u64 gpa_offset; + struct kvm_arch *ka = &vcpu->kvm->arch; + kvmclock_reset(vcpu); + if (vcpu->vcpu_id == 0 && !msr_info->host_initiated) { + bool tmp = (msr == MSR_KVM_SYSTEM_TIME); + + if (ka->boot_vcpu_runs_old_kvmclock != tmp) + set_bit(KVM_REQ_MASTERCLOCK_UPDATE, + &vcpu->requests); + + ka->boot_vcpu_runs_old_kvmclock = tmp; + } + vcpu->arch.time = data; kvm_make_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu); @@ -2324,6 +2350,7 @@ int kvm_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) { return kvm_x86_ops->get_msr(vcpu, msr_index, pdata); } +EXPORT_SYMBOL_GPL(kvm_get_msr); static int get_msr_mtrr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) { @@ -2738,6 +2765,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_READONLY_MEM: case KVM_CAP_HYPERV_TIME: case KVM_CAP_IOAPIC_POLARITY_IGNORED: + case KVM_CAP_TSC_DEADLINE_TIMER: #ifdef CONFIG_KVM_DEVICE_ASSIGNMENT case KVM_CAP_ASSIGN_DEV_IRQ: case KVM_CAP_PCI_2_3: @@ -2776,9 +2804,6 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_TSC_CONTROL: r = kvm_has_tsc_control; break; - case KVM_CAP_TSC_DEADLINE_TIMER: - r = boot_cpu_has(X86_FEATURE_TSC_DEADLINE_TIMER); - break; default: r = 0; break; @@ -6311,6 +6336,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) } trace_kvm_entry(vcpu->vcpu_id); + wait_lapic_expire(vcpu); kvm_x86_ops->run(vcpu); /* diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index cc1d61af6140..f5fef1868096 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -147,6 +147,7 @@ static inline void kvm_register_writel(struct kvm_vcpu *vcpu, void kvm_before_handle_nmi(struct kvm_vcpu *vcpu); void kvm_after_handle_nmi(struct kvm_vcpu *vcpu); +void kvm_set_pending_timer(struct kvm_vcpu *vcpu); int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip); void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr); @@ -170,5 +171,7 @@ extern u64 kvm_supported_xcr0(void); extern unsigned int min_timer_period_us; +extern unsigned int lapic_timer_advance_ns; + extern struct static_key kvm_no_apic_vcpu; #endif diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 1cc6e2e19982..167e8c14b143 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -1593,6 +1593,7 @@ int kvm_write_guest(struct kvm *kvm, gpa_t gpa, const void *data, } return 0; } +EXPORT_SYMBOL_GPL(kvm_write_guest); int kvm_gfn_to_hva_cache_init(struct kvm *kvm, struct gfn_to_hva_cache *ghc, gpa_t gpa, unsigned long len) |