From 8e39efd840b8d4eae5ab398b43e20ffaff0010cc Mon Sep 17 00:00:00 2001 From: David Matlack Date: Tue, 10 May 2022 15:40:34 -0700 Subject: KVM: VMX: Print VM-instruction error when it may be helpful Include the value of the "VM-instruction error" field from the current VMCS (if any) in the error message for VMCLEAR and VMPTRLD, since each of these instructions may result in more than one VM-instruction error. Previously, this field was only reported for VMWRITE errors. Signed-off-by: David Matlack [Rebased and refactored code; dropped the error number for INVVPID and INVEPT; reworded commit message.] Signed-off-by: Jim Mattson Message-Id: <20220510224035.1792952-1-jmattson@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'arch/x86/kvm/vmx/vmx.c') diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index f5aeade623d6..673ba5ca0beb 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -392,12 +392,14 @@ noinline void vmwrite_error(unsigned long field, unsigned long value) noinline void vmclear_error(struct vmcs *vmcs, u64 phys_addr) { - vmx_insn_failed("kvm: vmclear failed: %p/%llx\n", vmcs, phys_addr); + vmx_insn_failed("kvm: vmclear failed: %p/%llx err=%u\n", + vmcs, phys_addr, vmcs_read32(VM_INSTRUCTION_ERROR)); } noinline void vmptrld_error(struct vmcs *vmcs, u64 phys_addr) { - vmx_insn_failed("kvm: vmptrld failed: %p/%llx\n", vmcs, phys_addr); + vmx_insn_failed("kvm: vmptrld failed: %p/%llx err=%u\n", + vmcs, phys_addr, vmcs_read32(VM_INSTRUCTION_ERROR)); } noinline void invvpid_error(unsigned long ext, u16 vpid, gva_t gva) -- cgit v1.2.3 From cc07e60b0811eeeca769fb342aa6e13da5977657 Mon Sep 17 00:00:00 2001 From: Jim Mattson Date: Tue, 10 May 2022 15:40:35 -0700 Subject: KVM: VMX: Print VM-instruction error as unsigned Change the printf format character from 'd' to 'u' for the VM-instruction error in vmwrite_error(). Fixes: 6aa8b732ca01 ("[PATCH] kvm: userspace interface") Reported-by: Sean Christopherson Signed-off-by: Jim Mattson Message-Id: <20220510224035.1792952-2-jmattson@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kvm/vmx/vmx.c') diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 673ba5ca0beb..e0d3bea73b28 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -386,7 +386,7 @@ asmlinkage void vmread_error(unsigned long field, bool fault) noinline void vmwrite_error(unsigned long field, unsigned long value) { - vmx_insn_failed("kvm: vmwrite failed: field=%lx val=%lx err=%d\n", + vmx_insn_failed("kvm: vmwrite failed: field=%lx val=%lx err=%u\n", field, value, vmcs_read32(VM_INSTRUCTION_ERROR)); } -- cgit v1.2.3 From 2d61391270a3ceb95b3dd536ea13002e653323b6 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 2 May 2022 00:07:33 +0200 Subject: KVM: x86: Differentiate Soft vs. Hard IRQs vs. reinjected in tracepoint In the IRQ injection tracepoint, differentiate between Hard IRQs and Soft "IRQs", i.e. interrupts that are reinjected after incomplete delivery of a software interrupt from an INTn instruction. Tag reinjected interrupts as such, even though the information is usually redundant since soft interrupts are only ever reinjected by KVM. Though rare in practice, a hard IRQ can be reinjected. Signed-off-by: Sean Christopherson [MSS: change "kvm_inj_virq" event "reinjected" field type to bool] Signed-off-by: Maciej S. Szmigiero Message-Id: <9664d49b3bd21e227caa501cff77b0569bebffe2.1651440202.git.maciej.szmigiero@oracle.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86/kvm/vmx/vmx.c') diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index e1aa14743cdb..9714ae95589f 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -4573,13 +4573,13 @@ static void vmx_enable_nmi_window(struct kvm_vcpu *vcpu) exec_controls_setbit(to_vmx(vcpu), CPU_BASED_NMI_WINDOW_EXITING); } -static void vmx_inject_irq(struct kvm_vcpu *vcpu) +static void vmx_inject_irq(struct kvm_vcpu *vcpu, bool reinjected) { struct vcpu_vmx *vmx = to_vmx(vcpu); uint32_t intr; int irq = vcpu->arch.interrupt.nr; - trace_kvm_inj_virq(irq); + trace_kvm_inj_virq(irq, vcpu->arch.interrupt.soft, reinjected); ++vcpu->stat.irq_injections; if (vmx->rmode.vm86_active) { -- cgit v1.2.3 From 1ad4e5438c67a01620ed67cea959de89f4430515 Mon Sep 17 00:00:00 2001 From: Robert Hoo Date: Tue, 19 Apr 2022 23:34:00 +0800 Subject: KVM: VMX: Detect Tertiary VM-Execution control when setup VMCS config Check VMX features on tertiary execution control in VMCS config setup. Sub-features in tertiary execution control to be enabled are adjusted according to hardware capabilities although no sub-feature is enabled in this patch. EVMCSv1 doesn't support tertiary VM-execution control, so disable it when EVMCSv1 is in use. And define the auxiliary functions for Tertiary control field here, using the new BUILD_CONTROLS_SHADOW(). Reviewed-by: Maxim Levitsky Signed-off-by: Robert Hoo Signed-off-by: Zeng Guang Message-Id: <20220419153400.11642-1-guang.zeng@intel.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 29 ++++++++++++++++++++++++++++- 1 file changed, 28 insertions(+), 1 deletion(-) (limited to 'arch/x86/kvm/vmx/vmx.c') diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 9714ae95589f..9d3d41b21059 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -2412,6 +2412,15 @@ static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt, return 0; } +static __init u64 adjust_vmx_controls64(u64 ctl_opt, u32 msr) +{ + u64 allowed; + + rdmsrl(msr, allowed); + + return ctl_opt & allowed; +} + static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf, struct vmx_capability *vmx_cap) { @@ -2420,6 +2429,7 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf, u32 _pin_based_exec_control = 0; u32 _cpu_based_exec_control = 0; u32 _cpu_based_2nd_exec_control = 0; + u64 _cpu_based_3rd_exec_control = 0; u32 _vmexit_control = 0; u32 _vmentry_control = 0; @@ -2441,7 +2451,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf, opt = CPU_BASED_TPR_SHADOW | CPU_BASED_USE_MSR_BITMAPS | - CPU_BASED_ACTIVATE_SECONDARY_CONTROLS; + CPU_BASED_ACTIVATE_SECONDARY_CONTROLS | + CPU_BASED_ACTIVATE_TERTIARY_CONTROLS; if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PROCBASED_CTLS, &_cpu_based_exec_control) < 0) return -EIO; @@ -2515,6 +2526,13 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf, "1-setting enable VPID VM-execution control\n"); } + if (_cpu_based_exec_control & CPU_BASED_ACTIVATE_TERTIARY_CONTROLS) { + u64 opt3 = 0; + + _cpu_based_3rd_exec_control = adjust_vmx_controls64(opt3, + MSR_IA32_VMX_PROCBASED_CTLS3); + } + min = VM_EXIT_SAVE_DEBUG_CONTROLS | VM_EXIT_ACK_INTR_ON_EXIT; #ifdef CONFIG_X86_64 min |= VM_EXIT_HOST_ADDR_SPACE_SIZE; @@ -2601,6 +2619,7 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf, vmcs_conf->pin_based_exec_ctrl = _pin_based_exec_control; vmcs_conf->cpu_based_exec_ctrl = _cpu_based_exec_control; vmcs_conf->cpu_based_2nd_exec_ctrl = _cpu_based_2nd_exec_control; + vmcs_conf->cpu_based_3rd_exec_ctrl = _cpu_based_3rd_exec_control; vmcs_conf->vmexit_ctrl = _vmexit_control; vmcs_conf->vmentry_ctrl = _vmentry_control; @@ -4222,6 +4241,11 @@ static u32 vmx_exec_control(struct vcpu_vmx *vmx) return exec_control; } +static u64 vmx_tertiary_exec_control(struct vcpu_vmx *vmx) +{ + return vmcs_config.cpu_based_3rd_exec_ctrl; +} + /* * Adjust a single secondary execution control bit to intercept/allow an * instruction in the guest. This is usually done based on whether or not a @@ -4387,6 +4411,9 @@ static void init_vmcs(struct vcpu_vmx *vmx) if (cpu_has_secondary_exec_ctrls()) secondary_exec_controls_set(vmx, vmx_secondary_exec_control(vmx)); + if (cpu_has_tertiary_exec_ctrls()) + tertiary_exec_controls_set(vmx, vmx_tertiary_exec_control(vmx)); + if (enable_apicv && lapic_in_kernel(&vmx->vcpu)) { vmcs_write64(EOI_EXIT_BITMAP0, 0); vmcs_write64(EOI_EXIT_BITMAP1, 0); -- cgit v1.2.3 From 0b85baa5f46de1c6ad6e4b987905df041f2f80f0 Mon Sep 17 00:00:00 2001 From: Robert Hoo Date: Tue, 19 Apr 2022 23:34:41 +0800 Subject: KVM: VMX: Report tertiary_exec_control field in dump_vmcs() Add tertiary_exec_control field report in dump_vmcs(). Meanwhile, reorganize the dump output of VMCS category as follows. Before change: *** Control State *** PinBased=0x000000ff CPUBased=0xb5a26dfa SecondaryExec=0x061037eb EntryControls=0000d1ff ExitControls=002befff After change: *** Control State *** CPUBased=0xb5a26dfa SecondaryExec=0x061037eb TertiaryExec=0x0000000000000010 PinBased=0x000000ff EntryControls=0000d1ff ExitControls=002befff Reviewed-by: Maxim Levitsky Signed-off-by: Robert Hoo Signed-off-by: Zeng Guang Message-Id: <20220419153441.11687-1-guang.zeng@intel.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) (limited to 'arch/x86/kvm/vmx/vmx.c') diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 9d3d41b21059..4238a55c26e6 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -5872,6 +5872,7 @@ void dump_vmcs(struct kvm_vcpu *vcpu) struct vcpu_vmx *vmx = to_vmx(vcpu); u32 vmentry_ctl, vmexit_ctl; u32 cpu_based_exec_ctrl, pin_based_exec_ctrl, secondary_exec_control; + u64 tertiary_exec_control; unsigned long cr4; int efer_slot; @@ -5885,9 +5886,16 @@ void dump_vmcs(struct kvm_vcpu *vcpu) cpu_based_exec_ctrl = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); pin_based_exec_ctrl = vmcs_read32(PIN_BASED_VM_EXEC_CONTROL); cr4 = vmcs_readl(GUEST_CR4); - secondary_exec_control = 0; + if (cpu_has_secondary_exec_ctrls()) secondary_exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); + else + secondary_exec_control = 0; + + if (cpu_has_tertiary_exec_ctrls()) + tertiary_exec_control = vmcs_read64(TERTIARY_VM_EXEC_CONTROL); + else + tertiary_exec_control = 0; pr_err("VMCS %p, last attempted VM-entry on CPU %d\n", vmx->loaded_vmcs->vmcs, vcpu->arch.last_vmentry_cpu); @@ -5987,9 +5995,10 @@ void dump_vmcs(struct kvm_vcpu *vcpu) vmx_dump_msrs("host autoload", &vmx->msr_autoload.host); pr_err("*** Control State ***\n"); - pr_err("PinBased=%08x CPUBased=%08x SecondaryExec=%08x\n", - pin_based_exec_ctrl, cpu_based_exec_ctrl, secondary_exec_control); - pr_err("EntryControls=%08x ExitControls=%08x\n", vmentry_ctl, vmexit_ctl); + pr_err("CPUBased=0x%08x SecondaryExec=0x%08x TertiaryExec=0x%016llx\n", + cpu_based_exec_ctrl, secondary_exec_control, tertiary_exec_control); + pr_err("PinBased=0x%08x EntryControls=%08x ExitControls=%08x\n", + pin_based_exec_ctrl, vmentry_ctl, vmexit_ctl); pr_err("ExceptionBitmap=%08x PFECmask=%08x PFECmatch=%08x\n", vmcs_read32(EXCEPTION_BITMAP), vmcs_read32(PAGE_FAULT_ERROR_CODE_MASK), -- cgit v1.2.3 From f08a06c9a35706349f74b7a18deefe3f89f73e8e Mon Sep 17 00:00:00 2001 From: Zeng Guang Date: Tue, 19 Apr 2022 23:36:04 +0800 Subject: KVM: VMX: Clean up vmx_refresh_apicv_exec_ctrl() Remove the condition check cpu_has_secondary_exec_ctrls(). Calling vmx_refresh_apicv_exec_ctrl() premises secondary controls activated and VMCS fields related to APICv valid as well. If it's invoked in wrong circumstance at the worst case, VMX operation will report VMfailValid error without further harmful impact and just functions as if all the secondary controls were 0. Suggested-by: Sean Christopherson Signed-off-by: Zeng Guang Message-Id: <20220419153604.11786-1-guang.zeng@intel.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) (limited to 'arch/x86/kvm/vmx/vmx.c') diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 4238a55c26e6..3cbe3a38b356 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -4201,16 +4201,15 @@ static void vmx_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu) } pin_controls_set(vmx, vmx_pin_based_exec_ctrl(vmx)); - if (cpu_has_secondary_exec_ctrls()) { - if (kvm_vcpu_apicv_active(vcpu)) - secondary_exec_controls_setbit(vmx, - SECONDARY_EXEC_APIC_REGISTER_VIRT | - SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY); - else - secondary_exec_controls_clearbit(vmx, - SECONDARY_EXEC_APIC_REGISTER_VIRT | - SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY); - } + + if (kvm_vcpu_apicv_active(vcpu)) + secondary_exec_controls_setbit(vmx, + SECONDARY_EXEC_APIC_REGISTER_VIRT | + SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY); + else + secondary_exec_controls_clearbit(vmx, + SECONDARY_EXEC_APIC_REGISTER_VIRT | + SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY); vmx_update_msr_bitmap_x2apic(vcpu); } -- cgit v1.2.3 From d588bb9be1da6aa750aa64875fe57369db983d8b Mon Sep 17 00:00:00 2001 From: Chao Gao Date: Tue, 19 Apr 2022 23:45:10 +0800 Subject: KVM: VMX: enable IPI virtualization With IPI virtualization enabled, the processor emulates writes to APIC registers that would send IPIs. The processor sets the bit corresponding to the vector in target vCPU's PIR and may send a notification (IPI) specified by NDST and NV fields in target vCPU's Posted-Interrupt Descriptor (PID). It is similar to what IOMMU engine does when dealing with posted interrupt from devices. A PID-pointer table is used by the processor to locate the PID of a vCPU with the vCPU's APIC ID. The table size depends on maximum APIC ID assigned for current VM session from userspace. Allocating memory for PID-pointer table is deferred to vCPU creation, because irqchip mode and VM-scope maximum APIC ID is settled at that point. KVM can skip PID-pointer table allocation if !irqchip_in_kernel(). Like VT-d PI, if a vCPU goes to blocked state, VMM needs to switch its notification vector to wakeup vector. This can ensure that when an IPI for blocked vCPUs arrives, VMM can get control and wake up blocked vCPUs. And if a VCPU is preempted, its posted interrupt notification is suppressed. Note that IPI virtualization can only virualize physical-addressing, flat mode, unicast IPIs. Sending other IPIs would still cause a trap-like APIC-write VM-exit and need to be handled by VMM. Signed-off-by: Chao Gao Signed-off-by: Zeng Guang Message-Id: <20220419154510.11938-1-guang.zeng@intel.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 82 +++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 77 insertions(+), 5 deletions(-) (limited to 'arch/x86/kvm/vmx/vmx.c') diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 3cbe3a38b356..f3175dae25aa 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -105,6 +105,9 @@ module_param(fasteoi, bool, S_IRUGO); module_param(enable_apicv, bool, S_IRUGO); +bool __read_mostly enable_ipiv = true; +module_param(enable_ipiv, bool, 0444); + /* * If nested=1, nested virtualization is supported, i.e., guests may use * VMX and be a hypervisor for its own guests. If nested=0, guests may not @@ -2527,7 +2530,7 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf, } if (_cpu_based_exec_control & CPU_BASED_ACTIVATE_TERTIARY_CONTROLS) { - u64 opt3 = 0; + u64 opt3 = TERTIARY_EXEC_IPI_VIRT; _cpu_based_3rd_exec_control = adjust_vmx_controls64(opt3, MSR_IA32_VMX_PROCBASED_CTLS3); @@ -3874,6 +3877,8 @@ static void vmx_update_msr_bitmap_x2apic(struct kvm_vcpu *vcpu) vmx_enable_intercept_for_msr(vcpu, X2APIC_MSR(APIC_TMCCT), MSR_TYPE_RW); vmx_disable_intercept_for_msr(vcpu, X2APIC_MSR(APIC_EOI), MSR_TYPE_W); vmx_disable_intercept_for_msr(vcpu, X2APIC_MSR(APIC_SELF_IPI), MSR_TYPE_W); + if (enable_ipiv) + vmx_disable_intercept_for_msr(vcpu, X2APIC_MSR(APIC_ICR), MSR_TYPE_RW); } } @@ -4202,14 +4207,19 @@ static void vmx_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu) pin_controls_set(vmx, vmx_pin_based_exec_ctrl(vmx)); - if (kvm_vcpu_apicv_active(vcpu)) + if (kvm_vcpu_apicv_active(vcpu)) { secondary_exec_controls_setbit(vmx, SECONDARY_EXEC_APIC_REGISTER_VIRT | SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY); - else + if (enable_ipiv) + tertiary_exec_controls_setbit(vmx, TERTIARY_EXEC_IPI_VIRT); + } else { secondary_exec_controls_clearbit(vmx, SECONDARY_EXEC_APIC_REGISTER_VIRT | SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY); + if (enable_ipiv) + tertiary_exec_controls_clearbit(vmx, TERTIARY_EXEC_IPI_VIRT); + } vmx_update_msr_bitmap_x2apic(vcpu); } @@ -4242,7 +4252,16 @@ static u32 vmx_exec_control(struct vcpu_vmx *vmx) static u64 vmx_tertiary_exec_control(struct vcpu_vmx *vmx) { - return vmcs_config.cpu_based_3rd_exec_ctrl; + u64 exec_control = vmcs_config.cpu_based_3rd_exec_ctrl; + + /* + * IPI virtualization relies on APICv. Disable IPI virtualization if + * APICv is inhibited. + */ + if (!enable_ipiv || !kvm_vcpu_apicv_active(&vmx->vcpu)) + exec_control &= ~TERTIARY_EXEC_IPI_VIRT; + + return exec_control; } /* @@ -4390,10 +4409,42 @@ static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx) return exec_control; } +static inline int vmx_get_pid_table_order(struct kvm *kvm) +{ + return get_order(kvm->arch.max_vcpu_ids * sizeof(*to_kvm_vmx(kvm)->pid_table)); +} + +static int vmx_alloc_ipiv_pid_table(struct kvm *kvm) +{ + struct page *pages; + struct kvm_vmx *kvm_vmx = to_kvm_vmx(kvm); + + if (!irqchip_in_kernel(kvm) || !enable_ipiv) + return 0; + + if (kvm_vmx->pid_table) + return 0; + + pages = alloc_pages(GFP_KERNEL | __GFP_ZERO, vmx_get_pid_table_order(kvm)); + if (!pages) + return -ENOMEM; + + kvm_vmx->pid_table = (void *)page_address(pages); + return 0; +} + +static int vmx_vcpu_precreate(struct kvm *kvm) +{ + return vmx_alloc_ipiv_pid_table(kvm); +} + #define VMX_XSS_EXIT_BITMAP 0 static void init_vmcs(struct vcpu_vmx *vmx) { + struct kvm *kvm = vmx->vcpu.kvm; + struct kvm_vmx *kvm_vmx = to_kvm_vmx(kvm); + if (nested) nested_vmx_set_vmcs_shadowing_bitmap(); @@ -4425,7 +4476,12 @@ static void init_vmcs(struct vcpu_vmx *vmx) vmcs_write64(POSTED_INTR_DESC_ADDR, __pa((&vmx->pi_desc))); } - if (!kvm_pause_in_guest(vmx->vcpu.kvm)) { + if (vmx_can_use_ipiv(&vmx->vcpu)) { + vmcs_write64(PID_POINTER_TABLE, __pa(kvm_vmx->pid_table)); + vmcs_write16(LAST_PID_POINTER_INDEX, kvm->arch.max_vcpu_ids - 1); + } + + if (!kvm_pause_in_guest(kvm)) { vmcs_write32(PLE_GAP, ple_gap); vmx->ple_window = ple_window; vmx->ple_window_dirty = true; @@ -7116,6 +7172,10 @@ static int vmx_vcpu_create(struct kvm_vcpu *vcpu) goto free_vmcs; } + if (vmx_can_use_ipiv(vcpu)) + WRITE_ONCE(to_kvm_vmx(vcpu->kvm)->pid_table[vcpu->vcpu_id], + __pa(&vmx->pi_desc) | PID_TABLE_ENTRY_VALID); + return 0; free_vmcs: @@ -7750,6 +7810,13 @@ static bool vmx_check_apicv_inhibit_reasons(enum kvm_apicv_inhibit reason) return supported & BIT(reason); } +static void vmx_vm_destroy(struct kvm *kvm) +{ + struct kvm_vmx *kvm_vmx = to_kvm_vmx(kvm); + + free_pages((unsigned long)kvm_vmx->pid_table, vmx_get_pid_table_order(kvm)); +} + static struct kvm_x86_ops vmx_x86_ops __initdata = { .name = "kvm_intel", @@ -7761,7 +7828,9 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = { .vm_size = sizeof(struct kvm_vmx), .vm_init = vmx_vm_init, + .vm_destroy = vmx_vm_destroy, + .vcpu_precreate = vmx_vcpu_precreate, .vcpu_create = vmx_vcpu_create, .vcpu_free = vmx_vcpu_free, .vcpu_reset = vmx_vcpu_reset, @@ -8039,6 +8108,9 @@ static __init int hardware_setup(void) if (!enable_apicv) vmx_x86_ops.sync_pir_to_irr = NULL; + if (!enable_apicv || !cpu_has_vmx_ipiv()) + enable_ipiv = false; + if (cpu_has_vmx_tsc_scaling()) kvm_has_tsc_control = true; -- cgit v1.2.3 From 39a4d779546a993c53cea28e659e8edc9f868af0 Mon Sep 17 00:00:00 2001 From: Like Xu Date: Mon, 11 Apr 2022 18:19:32 +0800 Subject: perf/x86/core: Pass "struct kvm_pmu *" to determine the guest values Splitting the logic for determining the guest values is unnecessarily confusing, and potentially fragile. Perf should have full knowledge and control of what values are loaded for the guest. If we change .guest_get_msrs() to take a struct kvm_pmu pointer, then it can generate the full set of guest values by grabbing guest ds_area and pebs_data_cfg. Alternatively, .guest_get_msrs() could take the desired guest MSR values directly (ds_area and pebs_data_cfg), but kvm_pmu is vendor agnostic, so we don't see any reason to not just pass the pointer. Suggested-by: Sean Christopherson Signed-off-by: Like Xu Acked-by: Peter Zijlstra (Intel) Message-Id: <20220411101946.20262-4-likexu@tencent.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86/kvm/vmx/vmx.c') diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index f3175dae25aa..070b02162db6 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -6794,9 +6794,10 @@ static void atomic_switch_perf_msrs(struct vcpu_vmx *vmx) { int i, nr_msrs; struct perf_guest_switch_msr *msrs; + struct kvm_pmu *pmu = vcpu_to_pmu(&vmx->vcpu); /* Note, nr_msrs may be garbage if perf_guest_get_msrs() returns NULL. */ - msrs = perf_guest_get_msrs(&nr_msrs); + msrs = perf_guest_get_msrs(&nr_msrs, (void *)pmu); if (!msrs) return; -- cgit v1.2.3 From 854250329c02c0616a42532d65e81365272326d1 Mon Sep 17 00:00:00 2001 From: Like Xu Date: Mon, 11 Apr 2022 18:19:43 +0800 Subject: KVM: x86/pmu: Disable guest PEBS temporarily in two rare situations The guest PEBS will be disabled when some users try to perf KVM and its user-space through the same PEBS facility OR when the host perf doesn't schedule the guest PEBS counter in a one-to-one mapping manner (neither of these are typical scenarios). The PEBS records in the guest DS buffer are still accurate and the above two restrictions will be checked before each vm-entry only if guest PEBS is deemed to be enabled. Suggested-by: Wei Wang Signed-off-by: Like Xu Acked-by: Peter Zijlstra (Intel) Message-Id: <20220411101946.20262-15-likexu@tencent.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'arch/x86/kvm/vmx/vmx.c') diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 070b02162db6..d5ec0635ccd4 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -6796,6 +6796,10 @@ static void atomic_switch_perf_msrs(struct vcpu_vmx *vmx) struct perf_guest_switch_msr *msrs; struct kvm_pmu *pmu = vcpu_to_pmu(&vmx->vcpu); + pmu->host_cross_mapped_mask = 0; + if (pmu->pebs_enable & pmu->global_ctrl) + intel_pmu_cross_mapped_check(pmu); + /* Note, nr_msrs may be garbage if perf_guest_get_msrs() returns NULL. */ msrs = perf_guest_get_msrs(&nr_msrs, (void *)pmu); if (!msrs) -- cgit v1.2.3 From 59cc99f6e971bb24b40e27f695daab98e2eff4b8 Mon Sep 17 00:00:00 2001 From: Like Xu Date: Mon, 11 Apr 2022 18:19:45 +0800 Subject: KVM: x86/cpuid: Refactor host/guest CPU model consistency check For the same purpose, the leagcy intel_pmu_lbr_is_compatible() can be renamed for reuse by more callers, and remove the comment about LBR use case can be deleted by the way. Signed-off-by: Like Xu Acked-by: Peter Zijlstra (Intel) Message-Id: <20220411101946.20262-17-likexu@tencent.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kvm/vmx/vmx.c') diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index d5ec0635ccd4..403a8834cc79 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -2242,7 +2242,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) if ((data & PMU_CAP_LBR_FMT) != (vmx_get_perf_capabilities() & PMU_CAP_LBR_FMT)) return 1; - if (!intel_pmu_lbr_is_compatible(vcpu)) + if (!cpuid_model_is_consistent(vcpu)) return 1; } ret = kvm_set_msr_common(vcpu, msr_info); -- cgit v1.2.3 From cf8e55fe50df0c0292b389a165daa81193cd39d1 Mon Sep 17 00:00:00 2001 From: Like Xu Date: Mon, 11 Apr 2022 18:19:46 +0800 Subject: KVM: x86/pmu: Expose CPUIDs feature bits PDCM, DS, DTES64 The CPUID features PDCM, DS and DTES64 are required for PEBS feature. KVM would expose CPUID feature PDCM, DS and DTES64 to guest when PEBS is supported in the KVM on the Ice Lake server platforms. Originally-by: Andi Kleen Co-developed-by: Kan Liang Signed-off-by: Kan Liang Co-developed-by: Luwei Kang Signed-off-by: Luwei Kang Signed-off-by: Like Xu Message-Id: <20220411101946.20262-18-likexu@tencent.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'arch/x86/kvm/vmx/vmx.c') diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 403a8834cc79..55a8578255cb 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -2245,6 +2245,17 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) if (!cpuid_model_is_consistent(vcpu)) return 1; } + if (data & PERF_CAP_PEBS_FORMAT) { + if ((data & PERF_CAP_PEBS_MASK) != + (vmx_get_perf_capabilities() & PERF_CAP_PEBS_MASK)) + return 1; + if (!guest_cpuid_has(vcpu, X86_FEATURE_DS)) + return 1; + if (!guest_cpuid_has(vcpu, X86_FEATURE_DTES64)) + return 1; + if (!cpuid_model_is_consistent(vcpu)) + return 1; + } ret = kvm_set_msr_common(vcpu, msr_info); break; @@ -7517,6 +7528,10 @@ static __init void vmx_set_cpu_caps(void) kvm_cpu_cap_clear(X86_FEATURE_INVPCID); if (vmx_pt_mode_is_host_guest()) kvm_cpu_cap_check_and_set(X86_FEATURE_INTEL_PT); + if (vmx_pebs_supported()) { + kvm_cpu_cap_check_and_set(X86_FEATURE_DS); + kvm_cpu_cap_check_and_set(X86_FEATURE_DTES64); + } if (!enable_sgx) { kvm_cpu_cap_clear(X86_FEATURE_SGX); -- cgit v1.2.3 From 938c8745bcf2f732ee928a0b9bd592198a88cfa4 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 24 May 2022 21:56:23 +0800 Subject: KVM: x86: Introduce "struct kvm_caps" to track misc caps/settings Add kvm_caps to hold a variety of capabilites and defaults that aren't handled by kvm_cpu_caps because they aren't CPUID bits in order to reduce the amount of boilerplate code required to add a new feature. The vast majority (all?) of the caps interact with vendor code and are written only during initialization, i.e. should be tagged __read_mostly, declared extern in x86.h, and exported. No functional change intended. Signed-off-by: Sean Christopherson Message-Id: <20220524135624.22988-4-chenyi.qiang@intel.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) (limited to 'arch/x86/kvm/vmx/vmx.c') diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 55a8578255cb..6d631941ac1a 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -1717,7 +1717,7 @@ u64 vmx_get_l2_tsc_multiplier(struct kvm_vcpu *vcpu) nested_cpu_has2(vmcs12, SECONDARY_EXEC_TSC_SCALING)) return vmcs12->tsc_multiplier; - return kvm_default_tsc_scaling_ratio; + return kvm_caps.default_tsc_scaling_ratio; } static void vmx_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) @@ -7544,7 +7544,7 @@ static __init void vmx_set_cpu_caps(void) kvm_cpu_cap_set(X86_FEATURE_UMIP); /* CPUID 0xD.1 */ - supported_xss = 0; + kvm_caps.supported_xss = 0; if (!cpu_has_vmx_xsaves()) kvm_cpu_cap_clear(X86_FEATURE_XSAVES); @@ -7685,9 +7685,9 @@ static int vmx_set_hv_timer(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc, delta_tsc = 0; /* Convert to host delta tsc if tsc scaling is enabled */ - if (vcpu->arch.l1_tsc_scaling_ratio != kvm_default_tsc_scaling_ratio && + if (vcpu->arch.l1_tsc_scaling_ratio != kvm_caps.default_tsc_scaling_ratio && delta_tsc && u64_shl_div_u64(delta_tsc, - kvm_tsc_scaling_ratio_frac_bits, + kvm_caps.tsc_scaling_ratio_frac_bits, vcpu->arch.l1_tsc_scaling_ratio, &delta_tsc)) return -ERANGE; @@ -8064,8 +8064,8 @@ static __init int hardware_setup(void) } if (!cpu_has_vmx_mpx()) - supported_xcr0 &= ~(XFEATURE_MASK_BNDREGS | - XFEATURE_MASK_BNDCSR); + kvm_caps.supported_xcr0 &= ~(XFEATURE_MASK_BNDREGS | + XFEATURE_MASK_BNDCSR); if (!cpu_has_vmx_vpid() || !cpu_has_vmx_invvpid() || !(cpu_has_vmx_invvpid_single() || cpu_has_vmx_invvpid_global())) @@ -8132,11 +8132,11 @@ static __init int hardware_setup(void) enable_ipiv = false; if (cpu_has_vmx_tsc_scaling()) - kvm_has_tsc_control = true; + kvm_caps.has_tsc_control = true; - kvm_max_tsc_scaling_ratio = KVM_VMX_TSC_MULTIPLIER_MAX; - kvm_tsc_scaling_ratio_frac_bits = 48; - kvm_has_bus_lock_exit = cpu_has_vmx_bus_lock_detection(); + kvm_caps.max_tsc_scaling_ratio = KVM_VMX_TSC_MULTIPLIER_MAX; + kvm_caps.tsc_scaling_ratio_frac_bits = 48; + kvm_caps.has_bus_lock_exit = cpu_has_vmx_bus_lock_detection(); set_bit(0, vmx_vpid_bitmap); /* 0 is reserved for host */ @@ -8193,7 +8193,7 @@ static __init int hardware_setup(void) vmx_x86_ops.request_immediate_exit = __kvm_request_immediate_exit; } - kvm_mce_cap_supported |= MCG_LMCE_P; + kvm_caps.supported_mce_cap |= MCG_LMCE_P; if (pt_mode != PT_MODE_SYSTEM && pt_mode != PT_MODE_HOST_GUEST) return -EINVAL; -- cgit v1.2.3 From 2f4073e08f4cc5a41e35d777c240aaadd0257051 Mon Sep 17 00:00:00 2001 From: Tao Xu Date: Tue, 24 May 2022 21:56:24 +0800 Subject: KVM: VMX: Enable Notify VM exit There are cases that malicious virtual machines can cause CPU stuck (due to event windows don't open up), e.g., infinite loop in microcode when nested #AC (CVE-2015-5307). No event window means no event (NMI, SMI and IRQ) can be delivered. It leads the CPU to be unavailable to host or other VMs. VMM can enable notify VM exit that a VM exit generated if no event window occurs in VM non-root mode for a specified amount of time (notify window). Feature enabling: - The new vmcs field SECONDARY_EXEC_NOTIFY_VM_EXITING is introduced to enable this feature. VMM can set NOTIFY_WINDOW vmcs field to adjust the expected notify window. - Add a new KVM capability KVM_CAP_X86_NOTIFY_VMEXIT so that user space can query and enable this feature in per-VM scope. The argument is a 64bit value: bits 63:32 are used for notify window, and bits 31:0 are for flags. Current supported flags: - KVM_X86_NOTIFY_VMEXIT_ENABLED: enable the feature with the notify window provided. - KVM_X86_NOTIFY_VMEXIT_USER: exit to userspace once the exits happen. - It's safe to even set notify window to zero since an internal hardware threshold is added to vmcs.notify_window. VM exit handling: - Introduce a vcpu state notify_window_exits to records the count of notify VM exits and expose it through the debugfs. - Notify VM exit can happen incident to delivery of a vector event. Allow it in KVM. - Exit to userspace unconditionally for handling when VM_CONTEXT_INVALID bit is set. Nested handling - Nested notify VM exits are not supported yet. Keep the same notify window control in vmcs02 as vmcs01, so that L1 can't escape the restriction of notify VM exits through launching L2 VM. Notify VM exit is defined in latest Intel Architecture Instruction Set Extensions Programming Reference, chapter 9.2. Co-developed-by: Xiaoyao Li Signed-off-by: Xiaoyao Li Signed-off-by: Tao Xu Co-developed-by: Chenyi Qiang Signed-off-by: Chenyi Qiang Message-Id: <20220524135624.22988-5-chenyi.qiang@intel.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 40 ++++++++++++++++++++++++++++++++++++++-- 1 file changed, 38 insertions(+), 2 deletions(-) (limited to 'arch/x86/kvm/vmx/vmx.c') diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 6d631941ac1a..2e00890d752a 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -2499,7 +2499,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf, SECONDARY_EXEC_PT_USE_GPA | SECONDARY_EXEC_PT_CONCEAL_VMX | SECONDARY_EXEC_ENABLE_VMFUNC | - SECONDARY_EXEC_BUS_LOCK_DETECTION; + SECONDARY_EXEC_BUS_LOCK_DETECTION | + SECONDARY_EXEC_NOTIFY_VM_EXITING; if (cpu_has_sgx()) opt2 |= SECONDARY_EXEC_ENCLS_EXITING; if (adjust_vmx_controls(min2, opt2, @@ -4417,6 +4418,9 @@ static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx) if (!vcpu->kvm->arch.bus_lock_detection_enabled) exec_control &= ~SECONDARY_EXEC_BUS_LOCK_DETECTION; + if (!kvm_notify_vmexit_enabled(vcpu->kvm)) + exec_control &= ~SECONDARY_EXEC_NOTIFY_VM_EXITING; + return exec_control; } @@ -4498,6 +4502,9 @@ static void init_vmcs(struct vcpu_vmx *vmx) vmx->ple_window_dirty = true; } + if (kvm_notify_vmexit_enabled(kvm)) + vmcs_write32(NOTIFY_WINDOW, kvm->arch.notify_window); + vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, 0); vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, 0); vmcs_write32(CR3_TARGET_COUNT, 0); /* 22.2.1 */ @@ -5784,6 +5791,32 @@ static int handle_bus_lock_vmexit(struct kvm_vcpu *vcpu) return 1; } +static int handle_notify(struct kvm_vcpu *vcpu) +{ + unsigned long exit_qual = vmx_get_exit_qual(vcpu); + bool context_invalid = exit_qual & NOTIFY_VM_CONTEXT_INVALID; + + ++vcpu->stat.notify_window_exits; + + /* + * Notify VM exit happened while executing iret from NMI, + * "blocked by NMI" bit has to be set before next VM entry. + */ + if (enable_vnmi && (exit_qual & INTR_INFO_UNBLOCK_NMI)) + vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, + GUEST_INTR_STATE_NMI); + + if (vcpu->kvm->arch.notify_vmexit_flags & KVM_X86_NOTIFY_VMEXIT_USER || + context_invalid) { + vcpu->run->exit_reason = KVM_EXIT_NOTIFY; + vcpu->run->notify.flags = context_invalid ? + KVM_NOTIFY_CONTEXT_INVALID : 0; + return 0; + } + + return 1; +} + /* * The exit handlers return 1 if the exit was handled fully and guest execution * may resume. Otherwise they set the kvm_run parameter to indicate what needs @@ -5841,6 +5874,7 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = { [EXIT_REASON_PREEMPTION_TIMER] = handle_preemption_timer, [EXIT_REASON_ENCLS] = handle_encls, [EXIT_REASON_BUS_LOCK] = handle_bus_lock_vmexit, + [EXIT_REASON_NOTIFY] = handle_notify, }; static const int kvm_vmx_max_exit_handlers = @@ -6214,7 +6248,8 @@ static int __vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath) exit_reason.basic != EXIT_REASON_EPT_VIOLATION && exit_reason.basic != EXIT_REASON_PML_FULL && exit_reason.basic != EXIT_REASON_APIC_ACCESS && - exit_reason.basic != EXIT_REASON_TASK_SWITCH)) { + exit_reason.basic != EXIT_REASON_TASK_SWITCH && + exit_reason.basic != EXIT_REASON_NOTIFY)) { int ndata = 3; vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; @@ -8137,6 +8172,7 @@ static __init int hardware_setup(void) kvm_caps.max_tsc_scaling_ratio = KVM_VMX_TSC_MULTIPLIER_MAX; kvm_caps.tsc_scaling_ratio_frac_bits = 48; kvm_caps.has_bus_lock_exit = cpu_has_vmx_bus_lock_detection(); + kvm_caps.has_notify_vmexit = cpu_has_notify_vmexit(); set_bit(0, vmx_vpid_bitmap); /* 0 is reserved for host */ -- cgit v1.2.3 From 6ef25aa0a961298278301ae1d88106c701eb73fa Mon Sep 17 00:00:00 2001 From: Like Xu Date: Wed, 1 Jun 2022 11:19:24 +0800 Subject: KVM: x86/pmu: Restrict advanced features based on module enable_pmu Once vPMU is disabled, the KVM would not expose features like: PEBS (via clear kvm_pmu_cap.pebs_ept), legacy LBR and ARCH_LBR, CPUID 0xA leaf, PDCM bit and MSR_IA32_PERF_CAPABILITIES, plus PT_MODE_HOST_GUEST mode. What this group of features has in common is that their use relies on the underlying PMU counter and the host perf_event as a back-end resource requester or sharing part of the irq delivery path. Signed-off-by: Like Xu Message-Id: <20220601031925.59693-2-likexu@tencent.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'arch/x86/kvm/vmx/vmx.c') diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 2e00890d752a..83eeecb4c7f7 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -7568,6 +7568,9 @@ static __init void vmx_set_cpu_caps(void) kvm_cpu_cap_check_and_set(X86_FEATURE_DTES64); } + if (!enable_pmu) + kvm_cpu_cap_clear(X86_FEATURE_PDCM); + if (!enable_sgx) { kvm_cpu_cap_clear(X86_FEATURE_SGX); kvm_cpu_cap_clear(X86_FEATURE_SGX_LC); @@ -8233,7 +8236,7 @@ static __init int hardware_setup(void) if (pt_mode != PT_MODE_SYSTEM && pt_mode != PT_MODE_HOST_GUEST) return -EINVAL; - if (!enable_ept || !cpu_has_vmx_intel_pt()) + if (!enable_ept || !enable_pmu || !cpu_has_vmx_intel_pt()) pt_mode = PT_MODE_SYSTEM; if (pt_mode == PT_MODE_HOST_GUEST) vmx_init_ops.handle_intel_pt_intr = vmx_handle_intel_pt_intr; -- cgit v1.2.3 From f5a81d0eb01e0dfebd175edffa7d0a1bdb74d026 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 27 May 2022 17:06:57 +0000 Subject: KVM: VMX: Sanitize VM-Entry/VM-Exit control pairs at kvm_intel load time Sanitize the VM-Entry/VM-Exit control pairs (load+load or load+clear) during setup instead of checking both controls in a pair at runtime. If only one control is supported, KVM will report the associated feature as not available, but will leave the supported control bit set in the VMCS config, which could lead to corruption of host state. E.g. if only the VM-Entry control is supported and the feature is not dynamically toggled, KVM will set the control in all VMCSes and load zeros without restoring host state. Note, while this is technically a bug fix, practically speaking no sane CPU or VMM would support only one control. KVM's behavior of checking both controls is mostly pedantry. Cc: Chenyi Qiang Cc: Lei Wang Signed-off-by: Sean Christopherson Message-Id: <20220527170658.3571367-2-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) (limited to 'arch/x86/kvm/vmx/vmx.c') diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 83eeecb4c7f7..bc1a0baa5f60 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -2446,6 +2446,23 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf, u64 _cpu_based_3rd_exec_control = 0; u32 _vmexit_control = 0; u32 _vmentry_control = 0; + int i; + + /* + * LOAD/SAVE_DEBUG_CONTROLS are absent because both are mandatory. + * SAVE_IA32_PAT and SAVE_IA32_EFER are absent because KVM always + * intercepts writes to PAT and EFER, i.e. never enables those controls. + */ + struct { + u32 entry_control; + u32 exit_control; + } const vmcs_entry_exit_pairs[] = { + { VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL, VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL }, + { VM_ENTRY_LOAD_IA32_PAT, VM_EXIT_LOAD_IA32_PAT }, + { VM_ENTRY_LOAD_IA32_EFER, VM_EXIT_LOAD_IA32_EFER }, + { VM_ENTRY_LOAD_BNDCFGS, VM_EXIT_CLEAR_BNDCFGS }, + { VM_ENTRY_LOAD_IA32_RTIT_CTL, VM_EXIT_CLEAR_IA32_RTIT_CTL }, + }; memset(vmcs_conf, 0, sizeof(*vmcs_conf)); min = CPU_BASED_HLT_EXITING | @@ -2586,6 +2603,20 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf, &_vmentry_control) < 0) return -EIO; + for (i = 0; i < ARRAY_SIZE(vmcs_entry_exit_pairs); i++) { + u32 n_ctrl = vmcs_entry_exit_pairs[i].entry_control; + u32 x_ctrl = vmcs_entry_exit_pairs[i].exit_control; + + if (!(_vmentry_control & n_ctrl) == !(_vmexit_control & x_ctrl)) + continue; + + pr_warn_once("Inconsistent VM-Entry/VM-Exit pair, entry = %x, exit = %x\n", + _vmentry_control & n_ctrl, _vmexit_control & x_ctrl); + + _vmentry_control &= ~n_ctrl; + _vmexit_control &= ~x_ctrl; + } + /* * Some cpus support VM_{ENTRY,EXIT}_IA32_PERF_GLOBAL_CTRL but they * can't be used due to an errata where VM Exit may incorrectly clear -- cgit v1.2.3 From 3dbec44d9c94d8350a39326561ac40f969c63d16 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 27 May 2022 17:06:58 +0000 Subject: KVM: VMX: Reject kvm_intel if an inconsistent VMCS config is detected Add an on-by-default module param, error_on_inconsistent_vmcs_config, to allow rejecting the load of kvm_intel if an inconsistent VMCS config is detected. Continuing on with an inconsistent, degraded config is undesirable in the vast majority of use cases, e.g. may result in a misconfigured VM, poor performance due to lack of fast MSR switching, or even security issues in the unlikely event the guest is relying on MPX. Practically speaking, an inconsistent VMCS config should never be encountered in a production quality environment, e.g. on bare metal it indicates a silicon defect (or a disturbing lack of validation by the hardware vendor), and in a virtualized machine (KVM as L1) it indicates a buggy/misconfigured L0 VMM/hypervisor. Provide a module param to override the behavior for testing purposes, or in the unlikely scenario that KVM is deployed on a flawed-but-usable CPU or virtual machine. Note, what is or isn't an inconsistency is somewhat subjective, e.g. one might argue that LOAD_EFER without SAVE_EFER is an inconsistency. KVM's unofficial guideline for an "inconsistency" is either scenarios that are completely nonsensical, e.g. the existing checks on having EPT/VPID knobs without EPT/VPID, and/or scenarios that prevent KVM from virtualizing or utilizing a feature, e.g. the unpaired entry/exit controls checks. Other checks that fall into one or both of the covered scenarios could be added in the future, e.g. asserting that a VMCS control exists available if and only if the associated feature is supported in bare metal. Signed-off-by: Sean Christopherson Message-Id: <20220527170658.3571367-3-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) (limited to 'arch/x86/kvm/vmx/vmx.c') diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index bc1a0baa5f60..b959fe24c13b 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -119,6 +119,9 @@ module_param(nested, bool, S_IRUGO); bool __read_mostly enable_pml = 1; module_param_named(pml, enable_pml, bool, S_IRUGO); +static bool __read_mostly error_on_inconsistent_vmcs_config = true; +module_param(error_on_inconsistent_vmcs_config, bool, 0444); + static bool __read_mostly dump_invalid_vmcs = 0; module_param(dump_invalid_vmcs, bool, 0644); @@ -2547,15 +2550,23 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf, CPU_BASED_CR3_STORE_EXITING | CPU_BASED_INVLPG_EXITING); } else if (vmx_cap->ept) { - vmx_cap->ept = 0; pr_warn_once("EPT CAP should not exist if not support " "1-setting enable EPT VM-execution control\n"); + + if (error_on_inconsistent_vmcs_config) + return -EIO; + + vmx_cap->ept = 0; } if (!(_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_VPID) && - vmx_cap->vpid) { - vmx_cap->vpid = 0; + vmx_cap->vpid) { pr_warn_once("VPID CAP should not exist if not support " "1-setting enable VPID VM-execution control\n"); + + if (error_on_inconsistent_vmcs_config) + return -EIO; + + vmx_cap->vpid = 0; } if (_cpu_based_exec_control & CPU_BASED_ACTIVATE_TERTIARY_CONTROLS) { @@ -2613,6 +2624,9 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf, pr_warn_once("Inconsistent VM-Entry/VM-Exit pair, entry = %x, exit = %x\n", _vmentry_control & n_ctrl, _vmexit_control & x_ctrl); + if (error_on_inconsistent_vmcs_config) + return -EIO; + _vmentry_control &= ~n_ctrl; _vmexit_control &= ~x_ctrl; } -- cgit v1.2.3