From 0471a7bd1bca2a47a5f378f2222c5cf39ce94152 Mon Sep 17 00:00:00 2001 From: Lev Kujawski Date: Sat, 21 May 2022 08:15:11 +0000 Subject: KVM: set_msr_mce: Permit guests to ignore single-bit ECC errors Certain guest operating systems (e.g., UNIXWARE) clear bit 0 of MC1_CTL to ignore single-bit ECC data errors. Single-bit ECC data errors are always correctable and thus are safe to ignore because they are informational in nature rather than signaling a loss of data integrity. Prior to this patch, these guests would crash upon writing MC1_CTL, with resultant error messages like the following: error: kvm run failed Operation not permitted EAX=fffffffe EBX=fffffffe ECX=00000404 EDX=ffffffff ESI=ffffffff EDI=00000001 EBP=fffdaba4 ESP=fffdab20 EIP=c01333a5 EFL=00000246 [---Z-P-] CPL=0 II=0 A20=1 SMM=0 HLT=0 ES =0108 00000000 ffffffff 00c09300 DPL=0 DS [-WA] CS =0100 00000000 ffffffff 00c09b00 DPL=0 CS32 [-RA] SS =0108 00000000 ffffffff 00c09300 DPL=0 DS [-WA] DS =0108 00000000 ffffffff 00c09300 DPL=0 DS [-WA] FS =0000 00000000 ffffffff 00c00000 GS =0000 00000000 ffffffff 00c00000 LDT=0118 c1026390 00000047 00008200 DPL=0 LDT TR =0110 ffff5af0 00000067 00008b00 DPL=0 TSS32-busy GDT= ffff5020 000002cf IDT= ffff52f0 000007ff CR0=8001003b CR2=00000000 CR3=0100a000 CR4=00000230 DR0=00000000 DR1=00000000 DR2=00000000 DR3=00000000 DR6=ffff0ff0 DR7=00000400 EFER=0000000000000000 Code=08 89 01 89 51 04 c3 8b 4c 24 08 8b 01 8b 51 04 8b 4c 24 04 <0f> 30 c3 f7 05 a4 6d ff ff 10 00 00 00 74 03 0f 31 c3 33 c0 33 d2 c3 8d 74 26 00 0f 31 c3 Signed-off-by: Lev Kujawski Message-Id: <20220521081511.187388-1-lkujaw@member.fsf.org> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'arch/x86/kvm/x86.c') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index b81ef4f497f4..b5aeed18b9f5 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3234,10 +3234,13 @@ static int set_msr_mce(struct kvm_vcpu *vcpu, struct msr_data *msr_info) /* only 0 or all 1s can be written to IA32_MCi_CTL * some Linux kernels though clear bit 10 in bank 4 to * workaround a BIOS/GART TBL issue on AMD K8s, ignore - * this to avoid an uncatched #GP in the guest + * this to avoid an uncatched #GP in the guest. + * + * UNIXWARE clears bit 0 of MC1_CTL to ignore + * correctable, single-bit ECC data errors. */ if ((offset & 0x3) == 0 && - data != 0 && (data | (1 << 10)) != ~(u64)0) + data != 0 && (data | (1 << 10) | 1) != ~(u64)0) return -1; /* MCi_STATUS */ -- cgit v1.2.3 From a61d7c5432ac5a953bbcec17af031661c2bd201d Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 2 May 2022 00:07:31 +0200 Subject: KVM: x86: Trace re-injected exceptions Trace exceptions that are re-injected, not just those that KVM is injecting for the first time. Debugging re-injection bugs is painful enough as is, not having visibility into what KVM is doing only makes things worse. Delay propagating pending=>injected in the non-reinjection path so that the tracing can properly identify reinjected exceptions. Signed-off-by: Sean Christopherson Reviewed-by: Maxim Levitsky Signed-off-by: Maciej S. Szmigiero Message-Id: <25470690a38b4d2b32b6204875dd35676c65c9f2.1651440202.git.maciej.szmigiero@oracle.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) (limited to 'arch/x86/kvm/x86.c') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 921b1139c303..c9f3ad89bf4c 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -9408,6 +9408,11 @@ int kvm_check_nested_events(struct kvm_vcpu *vcpu) static void kvm_inject_exception(struct kvm_vcpu *vcpu) { + trace_kvm_inj_exception(vcpu->arch.exception.nr, + vcpu->arch.exception.has_error_code, + vcpu->arch.exception.error_code, + vcpu->arch.exception.injected); + if (vcpu->arch.exception.error_code && !is_protmode(vcpu)) vcpu->arch.exception.error_code = false; static_call(kvm_x86_queue_exception)(vcpu); @@ -9465,13 +9470,6 @@ static int inject_pending_event(struct kvm_vcpu *vcpu, bool *req_immediate_exit) /* try to inject new event if pending */ if (vcpu->arch.exception.pending) { - trace_kvm_inj_exception(vcpu->arch.exception.nr, - vcpu->arch.exception.has_error_code, - vcpu->arch.exception.error_code); - - vcpu->arch.exception.pending = false; - vcpu->arch.exception.injected = true; - if (exception_type(vcpu->arch.exception.nr) == EXCPT_FAULT) __kvm_set_rflags(vcpu, kvm_get_rflags(vcpu) | X86_EFLAGS_RF); @@ -9485,6 +9483,10 @@ static int inject_pending_event(struct kvm_vcpu *vcpu, bool *req_immediate_exit) } kvm_inject_exception(vcpu); + + vcpu->arch.exception.pending = false; + vcpu->arch.exception.injected = true; + can_inject = false; } -- cgit v1.2.3 From 2d61391270a3ceb95b3dd536ea13002e653323b6 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 2 May 2022 00:07:33 +0200 Subject: KVM: x86: Differentiate Soft vs. Hard IRQs vs. reinjected in tracepoint In the IRQ injection tracepoint, differentiate between Hard IRQs and Soft "IRQs", i.e. interrupts that are reinjected after incomplete delivery of a software interrupt from an INTn instruction. Tag reinjected interrupts as such, even though the information is usually redundant since soft interrupts are only ever reinjected by KVM. Though rare in practice, a hard IRQ can be reinjected. Signed-off-by: Sean Christopherson [MSS: change "kvm_inj_virq" event "reinjected" field type to bool] Signed-off-by: Maciej S. Szmigiero Message-Id: <9664d49b3bd21e227caa501cff77b0569bebffe2.1651440202.git.maciej.szmigiero@oracle.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86/kvm/x86.c') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index c9f3ad89bf4c..501606e02688 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -9448,7 +9448,7 @@ static int inject_pending_event(struct kvm_vcpu *vcpu, bool *req_immediate_exit) static_call(kvm_x86_inject_nmi)(vcpu); can_inject = false; } else if (vcpu->arch.interrupt.injected) { - static_call(kvm_x86_inject_irq)(vcpu); + static_call(kvm_x86_inject_irq)(vcpu, true); can_inject = false; } } @@ -9539,7 +9539,7 @@ static int inject_pending_event(struct kvm_vcpu *vcpu, bool *req_immediate_exit) goto out; if (r) { kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu), false); - static_call(kvm_x86_inject_irq)(vcpu); + static_call(kvm_x86_inject_irq)(vcpu, false); WARN_ON(static_call(kvm_x86_interrupt_allowed)(vcpu, true) < 0); } if (kvm_cpu_has_injectable_intr(vcpu)) -- cgit v1.2.3 From 1d5e740d518e02cea46325b3d37135bf9c08982a Mon Sep 17 00:00:00 2001 From: Zeng Guang Date: Tue, 19 Apr 2022 23:44:09 +0800 Subject: KVM: Move kvm_arch_vcpu_precreate() under kvm->lock kvm_arch_vcpu_precreate() targets to handle arch specific VM resource to be prepared prior to the actual creation of vCPU. For example, x86 platform may need do per-VM allocation based on max_vcpu_ids at the first vCPU creation. It probably leads to concurrency control on this allocation as multiple vCPU creation could happen simultaneously. From the architectual point of view, it's necessary to execute kvm_arch_vcpu_precreate() under protect of kvm->lock. Currently only arm64, x86 and s390 have non-nop implementations at the stage of vCPU pre-creation. Remove the lock acquiring in s390's design and make sure all architecture can run kvm_arch_vcpu_precreate() safely under kvm->lock without recrusive lock issue. Suggested-by: Sean Christopherson Signed-off-by: Zeng Guang Message-Id: <20220419154409.11842-1-guang.zeng@intel.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kvm/x86.c') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 501606e02688..9c2e2b6d1767 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -11242,7 +11242,7 @@ static int sync_regs(struct kvm_vcpu *vcpu) int kvm_arch_vcpu_precreate(struct kvm *kvm, unsigned int id) { - if (kvm_check_tsc_unstable() && atomic_read(&kvm->online_vcpus) != 0) + if (kvm_check_tsc_unstable() && kvm->created_vcpus) pr_warn_once("kvm: SMP vm created on host with unstable TSC; " "guest TSC will not be reliable\n"); -- cgit v1.2.3 From 35875316384b71d23dc2a45a969732fc8cab16af Mon Sep 17 00:00:00 2001 From: Zeng Guang Date: Tue, 19 Apr 2022 23:44:44 +0800 Subject: KVM: x86: Allow userspace to set maximum VCPU id for VM Introduce new max_vcpu_ids in KVM for x86 architecture. Userspace can assign maximum possible vcpu id for current VM session using KVM_CAP_MAX_VCPU_ID of KVM_ENABLE_CAP ioctl(). This is done for x86 only because the sole use case is to guide memory allocation for PID-pointer table, a structure needed to enable VMX IPI. By default, max_vcpu_ids set as KVM_MAX_VCPU_IDS. Suggested-by: Sean Christopherson Reviewed-by: Maxim Levitsky Signed-off-by: Zeng Guang Message-Id: <20220419154444.11888-1-guang.zeng@intel.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) (limited to 'arch/x86/kvm/x86.c') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 9c2e2b6d1767..25fbb90c7c93 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -6087,6 +6087,20 @@ split_irqchip_unlock: } mutex_unlock(&kvm->lock); break; + case KVM_CAP_MAX_VCPU_ID: + r = -EINVAL; + if (cap->args[0] > KVM_MAX_VCPU_IDS) + break; + + mutex_lock(&kvm->lock); + if (kvm->arch.max_vcpu_ids == cap->args[0]) { + r = 0; + } else if (!kvm->arch.max_vcpu_ids) { + kvm->arch.max_vcpu_ids = cap->args[0]; + r = 0; + } + mutex_unlock(&kvm->lock); + break; default: r = -EINVAL; break; @@ -11246,6 +11260,12 @@ int kvm_arch_vcpu_precreate(struct kvm *kvm, unsigned int id) pr_warn_once("kvm: SMP vm created on host with unstable TSC; " "guest TSC will not be reliable\n"); + if (!kvm->arch.max_vcpu_ids) + kvm->arch.max_vcpu_ids = KVM_MAX_VCPU_IDS; + + if (id >= kvm->arch.max_vcpu_ids) + return -EINVAL; + return 0; } -- cgit v1.2.3 From d588bb9be1da6aa750aa64875fe57369db983d8b Mon Sep 17 00:00:00 2001 From: Chao Gao Date: Tue, 19 Apr 2022 23:45:10 +0800 Subject: KVM: VMX: enable IPI virtualization With IPI virtualization enabled, the processor emulates writes to APIC registers that would send IPIs. The processor sets the bit corresponding to the vector in target vCPU's PIR and may send a notification (IPI) specified by NDST and NV fields in target vCPU's Posted-Interrupt Descriptor (PID). It is similar to what IOMMU engine does when dealing with posted interrupt from devices. A PID-pointer table is used by the processor to locate the PID of a vCPU with the vCPU's APIC ID. The table size depends on maximum APIC ID assigned for current VM session from userspace. Allocating memory for PID-pointer table is deferred to vCPU creation, because irqchip mode and VM-scope maximum APIC ID is settled at that point. KVM can skip PID-pointer table allocation if !irqchip_in_kernel(). Like VT-d PI, if a vCPU goes to blocked state, VMM needs to switch its notification vector to wakeup vector. This can ensure that when an IPI for blocked vCPUs arrives, VMM can get control and wake up blocked vCPUs. And if a VCPU is preempted, its posted interrupt notification is suppressed. Note that IPI virtualization can only virualize physical-addressing, flat mode, unicast IPIs. Sending other IPIs would still cause a trap-like APIC-write VM-exit and need to be handled by VMM. Signed-off-by: Chao Gao Signed-off-by: Zeng Guang Message-Id: <20220419154510.11938-1-guang.zeng@intel.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kvm/x86.c') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 25fbb90c7c93..37fb301f52af 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -11266,7 +11266,7 @@ int kvm_arch_vcpu_precreate(struct kvm *kvm, unsigned int id) if (id >= kvm->arch.max_vcpu_ids) return -EINVAL; - return 0; + return static_call(kvm_x86_vcpu_precreate)(kvm); } int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) -- cgit v1.2.3 From bef6ecca46ac938ffb352d7fa2f6eafd1b6a41be Mon Sep 17 00:00:00 2001 From: Like Xu Date: Mon, 11 Apr 2022 18:19:33 +0800 Subject: KVM: x86/pmu: Set MSR_IA32_MISC_ENABLE_EMON bit when vPMU is enabled On Intel platforms, the software can use the IA32_MISC_ENABLE[7] bit to detect whether the processor supports performance monitoring facility. It depends on the PMU is enabled for the guest, and a software write operation to this available bit will be ignored. The proposal to ignore the toggle in KVM is the way to go and that behavior matches bare metal. Signed-off-by: Like Xu Message-Id: <20220411101946.20262-5-likexu@tencent.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) (limited to 'arch/x86/kvm/x86.c') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 37fb301f52af..68ec5cbeb665 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3558,9 +3558,19 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) vcpu->arch.ia32_tsc_adjust_msr = data; } break; - case MSR_IA32_MISC_ENABLE: + case MSR_IA32_MISC_ENABLE: { + u64 old_val = vcpu->arch.ia32_misc_enable_msr; + u64 pmu_mask = MSR_IA32_MISC_ENABLE_EMON; + + /* + * For a dummy user space, the order of setting vPMU capabilities and + * initialising MSR_IA32_MISC_ENABLE is not strictly guaranteed, so to + * avoid inconsistent functionality we keep the vPMU bits unchanged here. + */ + data &= ~pmu_mask; + data |= old_val & pmu_mask; if (!kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT) && - ((vcpu->arch.ia32_misc_enable_msr ^ data) & MSR_IA32_MISC_ENABLE_MWAIT)) { + ((old_val ^ data) & MSR_IA32_MISC_ENABLE_MWAIT)) { if (!guest_cpuid_has(vcpu, X86_FEATURE_XMM3)) return 1; vcpu->arch.ia32_misc_enable_msr = data; @@ -3569,6 +3579,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) vcpu->arch.ia32_misc_enable_msr = data; } break; + } case MSR_IA32_SMBASE: if (!msr_info->host_initiated) return 1; -- cgit v1.2.3 From c59a1f106f5cd4843c097069ff1bb2ad72103a67 Mon Sep 17 00:00:00 2001 From: Like Xu Date: Mon, 11 Apr 2022 18:19:36 +0800 Subject: KVM: x86/pmu: Add IA32_PEBS_ENABLE MSR emulation for extended PEBS If IA32_PERF_CAPABILITIES.PEBS_BASELINE [bit 14] is set, the IA32_PEBS_ENABLE MSR exists and all architecturally enumerated fixed and general-purpose counters have corresponding bits in IA32_PEBS_ENABLE that enable generation of PEBS records. The general-purpose counter bits start at bit IA32_PEBS_ENABLE[0], and the fixed counter bits start at bit IA32_PEBS_ENABLE[32]. When guest PEBS is enabled, the IA32_PEBS_ENABLE MSR will be added to the perf_guest_switch_msr() and atomically switched during the VMX transitions just like CORE_PERF_GLOBAL_CTRL MSR. Based on whether the platform supports x86_pmu.pebs_ept, it has also refactored the way to add more msrs to arr[] in intel_guest_get_msrs() for extensibility. Originally-by: Andi Kleen Co-developed-by: Kan Liang Signed-off-by: Kan Liang Co-developed-by: Luwei Kang Signed-off-by: Luwei Kang Signed-off-by: Like Xu Acked-by: Peter Zijlstra (Intel) Message-Id: <20220411101946.20262-8-likexu@tencent.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86/kvm/x86.c') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 68ec5cbeb665..12183c790ed1 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1448,6 +1448,7 @@ static const u32 msrs_to_save_all[] = { MSR_ARCH_PERFMON_EVENTSEL0 + 12, MSR_ARCH_PERFMON_EVENTSEL0 + 13, MSR_ARCH_PERFMON_EVENTSEL0 + 14, MSR_ARCH_PERFMON_EVENTSEL0 + 15, MSR_ARCH_PERFMON_EVENTSEL0 + 16, MSR_ARCH_PERFMON_EVENTSEL0 + 17, + MSR_IA32_PEBS_ENABLE, MSR_K7_EVNTSEL0, MSR_K7_EVNTSEL1, MSR_K7_EVNTSEL2, MSR_K7_EVNTSEL3, MSR_K7_PERFCTR0, MSR_K7_PERFCTR1, MSR_K7_PERFCTR2, MSR_K7_PERFCTR3, -- cgit v1.2.3 From 8183a538cd95f72f11871b35726256ec3bcb9439 Mon Sep 17 00:00:00 2001 From: Like Xu Date: Mon, 11 Apr 2022 18:19:39 +0800 Subject: KVM: x86/pmu: Add IA32_DS_AREA MSR emulation to support guest DS When CPUID.01H:EDX.DS[21] is set, the IA32_DS_AREA MSR exists and points to the linear address of the first byte of the DS buffer management area, which is used to manage the PEBS records. When guest PEBS is enabled, the MSR_IA32_DS_AREA MSR will be added to the perf_guest_switch_msr() and switched during the VMX transitions just like CORE_PERF_GLOBAL_CTRL MSR. The WRMSR to IA32_DS_AREA MSR brings a #GP(0) if the source register contains a non-canonical address. Originally-by: Andi Kleen Co-developed-by: Kan Liang Signed-off-by: Kan Liang Signed-off-by: Like Xu Acked-by: Peter Zijlstra (Intel) Message-Id: <20220411101946.20262-11-likexu@tencent.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kvm/x86.c') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 12183c790ed1..ead86072612d 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1448,7 +1448,7 @@ static const u32 msrs_to_save_all[] = { MSR_ARCH_PERFMON_EVENTSEL0 + 12, MSR_ARCH_PERFMON_EVENTSEL0 + 13, MSR_ARCH_PERFMON_EVENTSEL0 + 14, MSR_ARCH_PERFMON_EVENTSEL0 + 15, MSR_ARCH_PERFMON_EVENTSEL0 + 16, MSR_ARCH_PERFMON_EVENTSEL0 + 17, - MSR_IA32_PEBS_ENABLE, + MSR_IA32_PEBS_ENABLE, MSR_IA32_DS_AREA, MSR_K7_EVNTSEL0, MSR_K7_EVNTSEL1, MSR_K7_EVNTSEL2, MSR_K7_EVNTSEL3, MSR_K7_PERFCTR0, MSR_K7_PERFCTR1, MSR_K7_PERFCTR2, MSR_K7_PERFCTR3, -- cgit v1.2.3 From 902caeb6841a64072791b1c18f9f56089566865d Mon Sep 17 00:00:00 2001 From: Like Xu Date: Mon, 11 Apr 2022 18:19:40 +0800 Subject: KVM: x86/pmu: Add PEBS_DATA_CFG MSR emulation to support adaptive PEBS MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If IA32_PERF_CAPABILITIES.PEBS_BASELINE [bit 14] is set, the adaptive PEBS is supported. The PEBS_DATA_CFG MSR and adaptive record enable bits (IA32_PERFEVTSELx.Adaptive_Record and IA32_FIXED_CTR_CTRL. FCx_Adaptive_Record) are also supported. Adaptive PEBS provides software the capability to configure the PEBS records to capture only the data of interest, keeping the record size compact. An overflow of PMCx results in generation of an adaptive PEBS record with state information based on the selections specified in MSR_PEBS_DATA_CFG.By default, the record only contain the Basic group. When guest adaptive PEBS is enabled, the IA32_PEBS_ENABLE MSR will be added to the perf_guest_switch_msr() and switched during the VMX transitions just like CORE_PERF_GLOBAL_CTRL MSR. According to Intel SDM, software is recommended to PEBS Baseline when the following is true. IA32_PERF_CAPABILITIES.PEBS_BASELINE[14] && IA32_PERF_CAPABILITIES.PEBS_FMT[11:8] ≥ 4. Co-developed-by: Luwei Kang Signed-off-by: Luwei Kang Signed-off-by: Like Xu Message-Id: <20220411101946.20262-12-likexu@tencent.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kvm/x86.c') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index ead86072612d..2d9456b4874b 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1448,7 +1448,7 @@ static const u32 msrs_to_save_all[] = { MSR_ARCH_PERFMON_EVENTSEL0 + 12, MSR_ARCH_PERFMON_EVENTSEL0 + 13, MSR_ARCH_PERFMON_EVENTSEL0 + 14, MSR_ARCH_PERFMON_EVENTSEL0 + 15, MSR_ARCH_PERFMON_EVENTSEL0 + 16, MSR_ARCH_PERFMON_EVENTSEL0 + 17, - MSR_IA32_PEBS_ENABLE, MSR_IA32_DS_AREA, + MSR_IA32_PEBS_ENABLE, MSR_IA32_DS_AREA, MSR_PEBS_DATA_CFG, MSR_K7_EVNTSEL0, MSR_K7_EVNTSEL1, MSR_K7_EVNTSEL2, MSR_K7_EVNTSEL3, MSR_K7_PERFCTR0, MSR_K7_PERFCTR1, MSR_K7_PERFCTR2, MSR_K7_PERFCTR3, -- cgit v1.2.3 From d10551738f6adcc3e6040fc846b171e72e94f0e9 Mon Sep 17 00:00:00 2001 From: Like Xu Date: Mon, 11 Apr 2022 18:19:41 +0800 Subject: KVM: x86: Set PEBS_UNAVAIL in IA32_MISC_ENABLE when PEBS is enabled The bit 12 represents "Processor Event Based Sampling Unavailable (RO)" : 1 = PEBS is not supported. 0 = PEBS is supported. A write to this PEBS_UNAVL available bit will bring #GP(0) when guest PEBS is enabled. Some PEBS drivers in guest may care about this bit. Signed-off-by: Like Xu Message-Id: <20220411101946.20262-13-likexu@tencent.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'arch/x86/kvm/x86.c') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 2d9456b4874b..94b92381cc8b 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3561,7 +3561,13 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) break; case MSR_IA32_MISC_ENABLE: { u64 old_val = vcpu->arch.ia32_misc_enable_msr; - u64 pmu_mask = MSR_IA32_MISC_ENABLE_EMON; + u64 pmu_mask = MSR_IA32_MISC_ENABLE_EMON | + MSR_IA32_MISC_ENABLE_PEBS_UNAVAIL; + + /* RO bits */ + if (!msr_info->host_initiated && + ((old_val ^ data) & MSR_IA32_MISC_ENABLE_PEBS_UNAVAIL)) + return 1; /* * For a dummy user space, the order of setting vPMU capabilities and -- cgit v1.2.3 From 968635abd5f5986f3cb6f15602d365cf1b551c5d Mon Sep 17 00:00:00 2001 From: Like Xu Date: Mon, 11 Apr 2022 18:19:44 +0800 Subject: KVM: x86/pmu: Add kvm_pmu_cap to optimize perf_get_x86_pmu_capability The information obtained from the interface perf_get_x86_pmu_capability() doesn't change, so an exported "struct x86_pmu_capability" is introduced for all guests in the KVM, and it's initialized before hardware_setup(). Signed-off-by: Like Xu Message-Id: <20220411101946.20262-16-likexu@tencent.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'arch/x86/kvm/x86.c') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 94b92381cc8b..32218fac4504 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -6667,15 +6667,12 @@ out: static void kvm_init_msr_list(void) { - struct x86_pmu_capability x86_pmu; u32 dummy[2]; unsigned i; BUILD_BUG_ON_MSG(KVM_PMC_MAX_FIXED != 3, "Please update the fixed PMCs in msrs_to_saved_all[]"); - perf_get_x86_pmu_capability(&x86_pmu); - num_msrs_to_save = 0; num_emulated_msrs = 0; num_msr_based_features = 0; @@ -6727,12 +6724,12 @@ static void kvm_init_msr_list(void) break; case MSR_ARCH_PERFMON_PERFCTR0 ... MSR_ARCH_PERFMON_PERFCTR0 + 17: if (msrs_to_save_all[i] - MSR_ARCH_PERFMON_PERFCTR0 >= - min(INTEL_PMC_MAX_GENERIC, x86_pmu.num_counters_gp)) + min(INTEL_PMC_MAX_GENERIC, kvm_pmu_cap.num_counters_gp)) continue; break; case MSR_ARCH_PERFMON_EVENTSEL0 ... MSR_ARCH_PERFMON_EVENTSEL0 + 17: if (msrs_to_save_all[i] - MSR_ARCH_PERFMON_EVENTSEL0 >= - min(INTEL_PMC_MAX_GENERIC, x86_pmu.num_counters_gp)) + min(INTEL_PMC_MAX_GENERIC, kvm_pmu_cap.num_counters_gp)) continue; break; case MSR_IA32_XFD: @@ -11721,6 +11718,8 @@ int kvm_arch_hardware_setup(void *opaque) if (boot_cpu_has(X86_FEATURE_XSAVES)) rdmsrl(MSR_IA32_XSS, host_xss); + kvm_init_pmu_capability(); + r = ops->hardware_setup(); if (r != 0) return r; -- cgit v1.2.3 From d1c88a4020567ba4da52f778bcd9619d87e4ea75 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 25 May 2022 04:39:22 -0400 Subject: KVM: x86: always allow host-initiated writes to PMU MSRs Whenever an MSR is part of KVM_GET_MSR_INDEX_LIST, it has to be always retrievable and settable with KVM_GET_MSR and KVM_SET_MSR. Accept the PMU MSRs unconditionally in intel_is_valid_msr, if the access was host-initiated. Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'arch/x86/kvm/x86.c') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 32218fac4504..c8dfdef9e52f 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3719,7 +3719,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) fallthrough; case MSR_K7_EVNTSEL0 ... MSR_K7_EVNTSEL3: case MSR_P6_EVNTSEL0 ... MSR_P6_EVNTSEL1: - if (kvm_pmu_is_valid_msr(vcpu, msr)) + if (kvm_pmu_is_valid_msr(vcpu, msr, msr_info->host_initiated)) return kvm_pmu_set_msr(vcpu, msr_info); if (pr || data != 0) @@ -3802,7 +3802,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) break; #endif default: - if (kvm_pmu_is_valid_msr(vcpu, msr)) + if (kvm_pmu_is_valid_msr(vcpu, msr, msr_info->host_initiated)) return kvm_pmu_set_msr(vcpu, msr_info); return KVM_MSR_RET_INVALID; } @@ -3882,7 +3882,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) msr_info->data = 0; break; case MSR_F15H_PERF_CTL0 ... MSR_F15H_PERF_CTR5: - if (kvm_pmu_is_valid_msr(vcpu, msr_info->index)) + if (kvm_pmu_is_valid_msr(vcpu, msr_info->index, msr_info->host_initiated)) return kvm_pmu_get_msr(vcpu, msr_info); if (!msr_info->host_initiated) return 1; @@ -3892,7 +3892,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_K7_PERFCTR0 ... MSR_K7_PERFCTR3: case MSR_P6_PERFCTR0 ... MSR_P6_PERFCTR1: case MSR_P6_EVNTSEL0 ... MSR_P6_EVNTSEL1: - if (kvm_pmu_is_valid_msr(vcpu, msr_info->index)) + if (kvm_pmu_is_valid_msr(vcpu, msr_info->index, msr_info->host_initiated)) return kvm_pmu_get_msr(vcpu, msr_info); msr_info->data = 0; break; @@ -4138,7 +4138,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) break; #endif default: - if (kvm_pmu_is_valid_msr(vcpu, msr_info->index)) + if (kvm_pmu_is_valid_msr(vcpu, msr_info->index, msr_info->host_initiated)) return kvm_pmu_get_msr(vcpu, msr_info); return KVM_MSR_RET_INVALID; } -- cgit v1.2.3 From ed2351174e38ad4febbbc0dba802803e6cff8ae0 Mon Sep 17 00:00:00 2001 From: Chenyi Qiang Date: Tue, 24 May 2022 21:56:21 +0800 Subject: KVM: x86: Extend KVM_{G,S}ET_VCPU_EVENTS to support pending triple fault For the triple fault sythesized by KVM, e.g. the RSM path or nested_vmx_abort(), if KVM exits to userspace before the request is serviced, userspace could migrate the VM and lose the triple fault. Extend KVM_{G,S}ET_VCPU_EVENTS to support pending triple fault with a new event KVM_VCPUEVENT_VALID_FAULT_FAULT so that userspace can save and restore the triple fault event. This extension is guarded by a new KVM capability KVM_CAP_TRIPLE_FAULT_EVENT. Note that in the set_vcpu_events path, userspace is able to set/clear the triple fault request through triple_fault.pending field. Signed-off-by: Chenyi Qiang Message-Id: <20220524135624.22988-2-chenyi.qiang@intel.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) (limited to 'arch/x86/kvm/x86.c') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index c8dfdef9e52f..422fbb0d7518 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4296,6 +4296,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_GET_MSR_FEATURES: case KVM_CAP_MSR_PLATFORM_INFO: case KVM_CAP_EXCEPTION_PAYLOAD: + case KVM_CAP_X86_TRIPLE_FAULT_EVENT: case KVM_CAP_SET_GUEST_DEBUG: case KVM_CAP_LAST_CPU: case KVM_CAP_X86_USER_SPACE_MSR: @@ -4942,6 +4943,10 @@ static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu, | KVM_VCPUEVENT_VALID_SMM); if (vcpu->kvm->arch.exception_payload_enabled) events->flags |= KVM_VCPUEVENT_VALID_PAYLOAD; + if (vcpu->kvm->arch.triple_fault_event) { + events->triple_fault.pending = kvm_test_request(KVM_REQ_TRIPLE_FAULT, vcpu); + events->flags |= KVM_VCPUEVENT_VALID_TRIPLE_FAULT; + } memset(&events->reserved, 0, sizeof(events->reserved)); } @@ -4955,7 +4960,8 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu, | KVM_VCPUEVENT_VALID_SIPI_VECTOR | KVM_VCPUEVENT_VALID_SHADOW | KVM_VCPUEVENT_VALID_SMM - | KVM_VCPUEVENT_VALID_PAYLOAD)) + | KVM_VCPUEVENT_VALID_PAYLOAD + | KVM_VCPUEVENT_VALID_TRIPLE_FAULT)) return -EINVAL; if (events->flags & KVM_VCPUEVENT_VALID_PAYLOAD) { @@ -5028,6 +5034,15 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu, } } + if (events->flags & KVM_VCPUEVENT_VALID_TRIPLE_FAULT) { + if (!vcpu->kvm->arch.triple_fault_event) + return -EINVAL; + if (events->triple_fault.pending) + kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); + else + kvm_clear_request(KVM_REQ_TRIPLE_FAULT, vcpu); + } + kvm_make_request(KVM_REQ_EVENT, vcpu); return 0; @@ -6029,6 +6044,10 @@ split_irqchip_unlock: kvm->arch.exception_payload_enabled = cap->args[0]; r = 0; break; + case KVM_CAP_X86_TRIPLE_FAULT_EVENT: + kvm->arch.triple_fault_event = cap->args[0]; + r = 0; + break; case KVM_CAP_X86_USER_SPACE_MSR: kvm->arch.user_space_msr_mask = cap->args[0]; r = 0; -- cgit v1.2.3 From 938c8745bcf2f732ee928a0b9bd592198a88cfa4 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 24 May 2022 21:56:23 +0800 Subject: KVM: x86: Introduce "struct kvm_caps" to track misc caps/settings Add kvm_caps to hold a variety of capabilites and defaults that aren't handled by kvm_cpu_caps because they aren't CPUID bits in order to reduce the amount of boilerplate code required to add a new feature. The vast majority (all?) of the caps interact with vendor code and are written only during initialization, i.e. should be tagged __read_mostly, declared extern in x86.h, and exported. No functional change intended. Signed-off-by: Sean Christopherson Message-Id: <20220524135624.22988-4-chenyi.qiang@intel.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 97 +++++++++++++++++++++++------------------------------- 1 file changed, 42 insertions(+), 55 deletions(-) (limited to 'arch/x86/kvm/x86.c') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 422fbb0d7518..53e5f2ad2422 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -87,8 +87,11 @@ #define MAX_IO_MSRS 256 #define KVM_MAX_MCE_BANKS 32 -u64 __read_mostly kvm_mce_cap_supported = MCG_CTL_P | MCG_SER_P; -EXPORT_SYMBOL_GPL(kvm_mce_cap_supported); + +struct kvm_caps kvm_caps __read_mostly = { + .supported_mce_cap = MCG_CTL_P | MCG_SER_P, +}; +EXPORT_SYMBOL_GPL(kvm_caps); #define ERR_PTR_USR(e) ((void __user *)ERR_PTR(e)) @@ -151,19 +154,6 @@ module_param(min_timer_period_us, uint, S_IRUGO | S_IWUSR); static bool __read_mostly kvmclock_periodic_sync = true; module_param(kvmclock_periodic_sync, bool, S_IRUGO); -bool __read_mostly kvm_has_tsc_control; -EXPORT_SYMBOL_GPL(kvm_has_tsc_control); -u32 __read_mostly kvm_max_guest_tsc_khz; -EXPORT_SYMBOL_GPL(kvm_max_guest_tsc_khz); -u8 __read_mostly kvm_tsc_scaling_ratio_frac_bits; -EXPORT_SYMBOL_GPL(kvm_tsc_scaling_ratio_frac_bits); -u64 __read_mostly kvm_max_tsc_scaling_ratio; -EXPORT_SYMBOL_GPL(kvm_max_tsc_scaling_ratio); -u64 __read_mostly kvm_default_tsc_scaling_ratio; -EXPORT_SYMBOL_GPL(kvm_default_tsc_scaling_ratio); -bool __read_mostly kvm_has_bus_lock_exit; -EXPORT_SYMBOL_GPL(kvm_has_bus_lock_exit); - /* tsc tolerance in parts per million - default to 1/2 of the NTP threshold */ static u32 __read_mostly tsc_tolerance_ppm = 250; module_param(tsc_tolerance_ppm, uint, S_IRUGO | S_IWUSR); @@ -235,8 +225,6 @@ EXPORT_SYMBOL_GPL(enable_apicv); u64 __read_mostly host_xss; EXPORT_SYMBOL_GPL(host_xss); -u64 __read_mostly supported_xss; -EXPORT_SYMBOL_GPL(supported_xss); const struct _kvm_stats_desc kvm_vm_stats_desc[] = { KVM_GENERIC_VM_STATS(), @@ -309,8 +297,6 @@ const struct kvm_stats_header kvm_vcpu_stats_header = { }; u64 __read_mostly host_xcr0; -u64 __read_mostly supported_xcr0; -EXPORT_SYMBOL_GPL(supported_xcr0); static struct kmem_cache *x86_emulator_cache; @@ -2345,12 +2331,12 @@ static int set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale) /* Guest TSC same frequency as host TSC? */ if (!scale) { - kvm_vcpu_write_tsc_multiplier(vcpu, kvm_default_tsc_scaling_ratio); + kvm_vcpu_write_tsc_multiplier(vcpu, kvm_caps.default_tsc_scaling_ratio); return 0; } /* TSC scaling supported? */ - if (!kvm_has_tsc_control) { + if (!kvm_caps.has_tsc_control) { if (user_tsc_khz > tsc_khz) { vcpu->arch.tsc_catchup = 1; vcpu->arch.tsc_always_catchup = 1; @@ -2362,10 +2348,10 @@ static int set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale) } /* TSC scaling required - calculate ratio */ - ratio = mul_u64_u32_div(1ULL << kvm_tsc_scaling_ratio_frac_bits, + ratio = mul_u64_u32_div(1ULL << kvm_caps.tsc_scaling_ratio_frac_bits, user_tsc_khz, tsc_khz); - if (ratio == 0 || ratio >= kvm_max_tsc_scaling_ratio) { + if (ratio == 0 || ratio >= kvm_caps.max_tsc_scaling_ratio) { pr_warn_ratelimited("Invalid TSC scaling ratio - virtual-tsc-khz=%u\n", user_tsc_khz); return -1; @@ -2383,7 +2369,7 @@ static int kvm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz) /* tsc_khz can be zero if TSC calibration fails */ if (user_tsc_khz == 0) { /* set tsc_scaling_ratio to a safe value */ - kvm_vcpu_write_tsc_multiplier(vcpu, kvm_default_tsc_scaling_ratio); + kvm_vcpu_write_tsc_multiplier(vcpu, kvm_caps.default_tsc_scaling_ratio); return -1; } @@ -2460,18 +2446,18 @@ static void kvm_track_tsc_matching(struct kvm_vcpu *vcpu) * (frac) represent the fractional part, ie. ratio represents a fixed * point number (mult + frac * 2^(-N)). * - * N equals to kvm_tsc_scaling_ratio_frac_bits. + * N equals to kvm_caps.tsc_scaling_ratio_frac_bits. */ static inline u64 __scale_tsc(u64 ratio, u64 tsc) { - return mul_u64_u64_shr(tsc, ratio, kvm_tsc_scaling_ratio_frac_bits); + return mul_u64_u64_shr(tsc, ratio, kvm_caps.tsc_scaling_ratio_frac_bits); } u64 kvm_scale_tsc(u64 tsc, u64 ratio) { u64 _tsc = tsc; - if (ratio != kvm_default_tsc_scaling_ratio) + if (ratio != kvm_caps.default_tsc_scaling_ratio) _tsc = __scale_tsc(ratio, tsc); return _tsc; @@ -2498,11 +2484,11 @@ u64 kvm_calc_nested_tsc_offset(u64 l1_offset, u64 l2_offset, u64 l2_multiplier) { u64 nested_offset; - if (l2_multiplier == kvm_default_tsc_scaling_ratio) + if (l2_multiplier == kvm_caps.default_tsc_scaling_ratio) nested_offset = l1_offset; else nested_offset = mul_s64_u64_shr((s64) l1_offset, l2_multiplier, - kvm_tsc_scaling_ratio_frac_bits); + kvm_caps.tsc_scaling_ratio_frac_bits); nested_offset += l2_offset; return nested_offset; @@ -2511,9 +2497,9 @@ EXPORT_SYMBOL_GPL(kvm_calc_nested_tsc_offset); u64 kvm_calc_nested_tsc_multiplier(u64 l1_multiplier, u64 l2_multiplier) { - if (l2_multiplier != kvm_default_tsc_scaling_ratio) + if (l2_multiplier != kvm_caps.default_tsc_scaling_ratio) return mul_u64_u64_shr(l1_multiplier, l2_multiplier, - kvm_tsc_scaling_ratio_frac_bits); + kvm_caps.tsc_scaling_ratio_frac_bits); return l1_multiplier; } @@ -2555,7 +2541,7 @@ static void kvm_vcpu_write_tsc_multiplier(struct kvm_vcpu *vcpu, u64 l1_multipli else vcpu->arch.tsc_scaling_ratio = l1_multiplier; - if (kvm_has_tsc_control) + if (kvm_caps.has_tsc_control) static_call(kvm_x86_write_tsc_multiplier)( vcpu, vcpu->arch.tsc_scaling_ratio); } @@ -2691,7 +2677,7 @@ static inline void adjust_tsc_offset_guest(struct kvm_vcpu *vcpu, static inline void adjust_tsc_offset_host(struct kvm_vcpu *vcpu, s64 adjustment) { - if (vcpu->arch.l1_tsc_scaling_ratio != kvm_default_tsc_scaling_ratio) + if (vcpu->arch.l1_tsc_scaling_ratio != kvm_caps.default_tsc_scaling_ratio) WARN_ON(adjustment < 0); adjustment = kvm_scale_tsc((u64) adjustment, vcpu->arch.l1_tsc_scaling_ratio); @@ -3104,7 +3090,7 @@ static int kvm_guest_time_update(struct kvm_vcpu *v) /* With all the info we got, fill in the values */ - if (kvm_has_tsc_control) + if (kvm_caps.has_tsc_control) tgt_tsc_khz = kvm_scale_tsc(tgt_tsc_khz, v->arch.l1_tsc_scaling_ratio); @@ -3613,7 +3599,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) * IA32_XSS[bit 8]. Guests have to use RDMSR/WRMSR rather than * XSAVES/XRSTORS to save/restore PT MSRs. */ - if (data & ~supported_xss) + if (data & ~kvm_caps.supported_xss) return 1; vcpu->arch.ia32_xss = data; kvm_update_cpuid_runtime(vcpu); @@ -4374,7 +4360,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) break; case KVM_CAP_TSC_CONTROL: case KVM_CAP_VM_TSC_CONTROL: - r = kvm_has_tsc_control; + r = kvm_caps.has_tsc_control; break; case KVM_CAP_X2APIC_API: r = KVM_X2APIC_API_VALID_FLAGS; @@ -4396,7 +4382,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) r = sched_info_on(); break; case KVM_CAP_X86_BUS_LOCK_EXIT: - if (kvm_has_bus_lock_exit) + if (kvm_caps.has_bus_lock_exit) r = KVM_BUS_LOCK_DETECTION_OFF | KVM_BUS_LOCK_DETECTION_EXIT; else @@ -4405,7 +4391,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_XSAVE2: { u64 guest_perm = xstate_get_guest_group_perm(); - r = xstate_required_size(supported_xcr0 & guest_perm, false); + r = xstate_required_size(kvm_caps.supported_xcr0 & guest_perm, false); if (r < sizeof(struct kvm_xsave)) r = sizeof(struct kvm_xsave); break; @@ -4443,7 +4429,7 @@ static int kvm_x86_dev_get_attr(struct kvm_device_attr *attr) switch (attr->attr) { case KVM_X86_XCOMP_GUEST_SUPP: - if (put_user(supported_xcr0, uaddr)) + if (put_user(kvm_caps.supported_xcr0, uaddr)) return -EFAULT; return 0; default: @@ -4520,8 +4506,8 @@ long kvm_arch_dev_ioctl(struct file *filp, } case KVM_X86_GET_MCE_CAP_SUPPORTED: r = -EFAULT; - if (copy_to_user(argp, &kvm_mce_cap_supported, - sizeof(kvm_mce_cap_supported))) + if (copy_to_user(argp, &kvm_caps.supported_mce_cap, + sizeof(kvm_caps.supported_mce_cap))) goto out; r = 0; break; @@ -4805,7 +4791,7 @@ static int kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu *vcpu, r = -EINVAL; if (!bank_num || bank_num > KVM_MAX_MCE_BANKS) goto out; - if (mcg_cap & ~(kvm_mce_cap_supported | 0xff | 0xff0000)) + if (mcg_cap & ~(kvm_caps.supported_mce_cap | 0xff | 0xff0000)) goto out; r = 0; vcpu->arch.mcg_cap = mcg_cap; @@ -5111,7 +5097,8 @@ static int kvm_vcpu_ioctl_x86_set_xsave(struct kvm_vcpu *vcpu, return fpu_copy_uabi_to_guest_fpstate(&vcpu->arch.guest_fpu, guest_xsave->region, - supported_xcr0, &vcpu->arch.pkru); + kvm_caps.supported_xcr0, + &vcpu->arch.pkru); } static void kvm_vcpu_ioctl_x86_get_xcrs(struct kvm_vcpu *vcpu, @@ -5616,8 +5603,8 @@ long kvm_arch_vcpu_ioctl(struct file *filp, r = -EINVAL; user_tsc_khz = (u32)arg; - if (kvm_has_tsc_control && - user_tsc_khz >= kvm_max_guest_tsc_khz) + if (kvm_caps.has_tsc_control && + user_tsc_khz >= kvm_caps.max_guest_tsc_khz) goto out; if (user_tsc_khz == 0) @@ -6061,7 +6048,7 @@ split_irqchip_unlock: (cap->args[0] & KVM_BUS_LOCK_DETECTION_EXIT)) break; - if (kvm_has_bus_lock_exit && + if (kvm_caps.has_bus_lock_exit && cap->args[0] & KVM_BUS_LOCK_DETECTION_EXIT) kvm->arch.bus_lock_detection_enabled = true; r = 0; @@ -6610,8 +6597,8 @@ set_pit2_out: r = -EINVAL; user_tsc_khz = (u32)arg; - if (kvm_has_tsc_control && - user_tsc_khz >= kvm_max_guest_tsc_khz) + if (kvm_caps.has_tsc_control && + user_tsc_khz >= kvm_caps.max_guest_tsc_khz) goto out; if (user_tsc_khz == 0) @@ -8774,7 +8761,7 @@ static void kvm_hyperv_tsc_notifier(void) /* TSC frequency always matches when on Hyper-V */ for_each_present_cpu(cpu) per_cpu(cpu_tsc_khz, cpu) = tsc_khz; - kvm_max_guest_tsc_khz = tsc_khz; + kvm_caps.max_guest_tsc_khz = tsc_khz; list_for_each_entry(kvm, &vm_list, vm_list) { __kvm_start_pvclock_update(kvm); @@ -9036,7 +9023,7 @@ int kvm_arch_init(void *opaque) if (boot_cpu_has(X86_FEATURE_XSAVE)) { host_xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK); - supported_xcr0 = host_xcr0 & KVM_SUPPORTED_XCR0; + kvm_caps.supported_xcr0 = host_xcr0 & KVM_SUPPORTED_XCR0; } if (pi_inject_timer == -1) @@ -11748,13 +11735,13 @@ int kvm_arch_hardware_setup(void *opaque) kvm_register_perf_callbacks(ops->handle_intel_pt_intr); if (!kvm_cpu_cap_has(X86_FEATURE_XSAVES)) - supported_xss = 0; + kvm_caps.supported_xss = 0; #define __kvm_cpu_cap_has(UNUSED_, f) kvm_cpu_cap_has(f) cr4_reserved_bits = __cr4_reserved_bits(__kvm_cpu_cap_has, UNUSED_); #undef __kvm_cpu_cap_has - if (kvm_has_tsc_control) { + if (kvm_caps.has_tsc_control) { /* * Make sure the user can only configure tsc_khz values that * fit into a signed integer. @@ -11762,10 +11749,10 @@ int kvm_arch_hardware_setup(void *opaque) * be 1 on all machines. */ u64 max = min(0x7fffffffULL, - __scale_tsc(kvm_max_tsc_scaling_ratio, tsc_khz)); - kvm_max_guest_tsc_khz = max; + __scale_tsc(kvm_caps.max_tsc_scaling_ratio, tsc_khz)); + kvm_caps.max_guest_tsc_khz = max; } - kvm_default_tsc_scaling_ratio = 1ULL << kvm_tsc_scaling_ratio_frac_bits; + kvm_caps.default_tsc_scaling_ratio = 1ULL << kvm_caps.tsc_scaling_ratio_frac_bits; kvm_init_msr_list(); return 0; } -- cgit v1.2.3 From 2f4073e08f4cc5a41e35d777c240aaadd0257051 Mon Sep 17 00:00:00 2001 From: Tao Xu Date: Tue, 24 May 2022 21:56:24 +0800 Subject: KVM: VMX: Enable Notify VM exit There are cases that malicious virtual machines can cause CPU stuck (due to event windows don't open up), e.g., infinite loop in microcode when nested #AC (CVE-2015-5307). No event window means no event (NMI, SMI and IRQ) can be delivered. It leads the CPU to be unavailable to host or other VMs. VMM can enable notify VM exit that a VM exit generated if no event window occurs in VM non-root mode for a specified amount of time (notify window). Feature enabling: - The new vmcs field SECONDARY_EXEC_NOTIFY_VM_EXITING is introduced to enable this feature. VMM can set NOTIFY_WINDOW vmcs field to adjust the expected notify window. - Add a new KVM capability KVM_CAP_X86_NOTIFY_VMEXIT so that user space can query and enable this feature in per-VM scope. The argument is a 64bit value: bits 63:32 are used for notify window, and bits 31:0 are for flags. Current supported flags: - KVM_X86_NOTIFY_VMEXIT_ENABLED: enable the feature with the notify window provided. - KVM_X86_NOTIFY_VMEXIT_USER: exit to userspace once the exits happen. - It's safe to even set notify window to zero since an internal hardware threshold is added to vmcs.notify_window. VM exit handling: - Introduce a vcpu state notify_window_exits to records the count of notify VM exits and expose it through the debugfs. - Notify VM exit can happen incident to delivery of a vector event. Allow it in KVM. - Exit to userspace unconditionally for handling when VM_CONTEXT_INVALID bit is set. Nested handling - Nested notify VM exits are not supported yet. Keep the same notify window control in vmcs02 as vmcs01, so that L1 can't escape the restriction of notify VM exits through launching L2 VM. Notify VM exit is defined in latest Intel Architecture Instruction Set Extensions Programming Reference, chapter 9.2. Co-developed-by: Xiaoyao Li Signed-off-by: Xiaoyao Li Signed-off-by: Tao Xu Co-developed-by: Chenyi Qiang Signed-off-by: Chenyi Qiang Message-Id: <20220524135624.22988-5-chenyi.qiang@intel.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) (limited to 'arch/x86/kvm/x86.c') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 53e5f2ad2422..a8014233fd57 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -284,7 +284,8 @@ const struct _kvm_stats_desc kvm_vcpu_stats_desc[] = { STATS_DESC_COUNTER(VCPU, nested_run), STATS_DESC_COUNTER(VCPU, directed_yield_attempted), STATS_DESC_COUNTER(VCPU, directed_yield_successful), - STATS_DESC_ICOUNTER(VCPU, guest_mode) + STATS_DESC_ICOUNTER(VCPU, guest_mode), + STATS_DESC_COUNTER(VCPU, notify_window_exits), }; const struct kvm_stats_header kvm_vcpu_stats_header = { @@ -4402,6 +4403,9 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_DISABLE_QUIRKS2: r = KVM_X86_VALID_QUIRKS; break; + case KVM_CAP_X86_NOTIFY_VMEXIT: + r = kvm_caps.has_notify_vmexit; + break; default: break; } @@ -6125,6 +6129,22 @@ split_irqchip_unlock: } mutex_unlock(&kvm->lock); break; + case KVM_CAP_X86_NOTIFY_VMEXIT: + r = -EINVAL; + if ((u32)cap->args[0] & ~KVM_X86_NOTIFY_VMEXIT_VALID_BITS) + break; + if (!kvm_caps.has_notify_vmexit) + break; + if (!((u32)cap->args[0] & KVM_X86_NOTIFY_VMEXIT_ENABLED)) + break; + mutex_lock(&kvm->lock); + if (!kvm->created_vcpus) { + kvm->arch.notify_window = cap->args[0] >> 32; + kvm->arch.notify_vmexit_flags = (u32)cap->args[0]; + r = 0; + } + mutex_unlock(&kvm->lock); + break; default: r = -EINVAL; break; -- cgit v1.2.3 From b9181c8ef35636152facc72f801f27b4264df8c0 Mon Sep 17 00:00:00 2001 From: Like Xu Date: Wed, 1 Jun 2022 11:19:25 +0800 Subject: KVM: x86/pmu: Avoid exposing Intel BTS feature The BTS feature (including the ability to set the BTS and BTINT bits in the DEBUGCTL MSR) is currently unsupported on KVM. But we may try using the BTS facility on a PEBS enabled guest like this: perf record -e branches:u -c 1 -d ls and then we would encounter the following call trace: [] unchecked MSR access error: WRMSR to 0x1d9 (tried to write 0x00000000000003c0) at rIP: 0xffffffff810745e4 (native_write_msr+0x4/0x20) [] Call Trace: [] intel_pmu_enable_bts+0x5d/0x70 [] bts_event_add+0x54/0x70 [] event_sched_in+0xee/0x290 As it lacks any CPUID indicator or perf_capabilities valid bit fields to prompt for this information, the platform would hint the Intel BTS feature unavailable to guest by setting the BTS_UNAVAIL bit in the IA32_MISC_ENABLE. Signed-off-by: Like Xu Message-Id: <20220601031925.59693-3-likexu@tencent.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86/kvm/x86.c') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index a8014233fd57..79efdc19b4c8 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3548,12 +3548,12 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) break; case MSR_IA32_MISC_ENABLE: { u64 old_val = vcpu->arch.ia32_misc_enable_msr; - u64 pmu_mask = MSR_IA32_MISC_ENABLE_EMON | - MSR_IA32_MISC_ENABLE_PEBS_UNAVAIL; + u64 pmu_mask = MSR_IA32_MISC_ENABLE_PMU_RO_MASK | + MSR_IA32_MISC_ENABLE_EMON; /* RO bits */ if (!msr_info->host_initiated && - ((old_val ^ data) & MSR_IA32_MISC_ENABLE_PEBS_UNAVAIL)) + ((old_val ^ data) & MSR_IA32_MISC_ENABLE_PMU_RO_MASK)) return 1; /* -- cgit v1.2.3 From 1cca2f8c501fa01e6c0d2aefbf97f8c3c89c0186 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 26 May 2022 21:08:15 +0000 Subject: KVM: x86: Bug the VM if the emulator accesses a non-existent GPR Bug the VM, i.e. kill it, if the emulator accesses a non-existent GPR, i.e. generates an out-of-bounds GPR index. Continuing on all but gaurantees some form of data corruption in the guest, e.g. even if KVM were to redirect to a dummy register, KVM would be incorrectly read zeros and drop writes. Note, bugging the VM doesn't completely prevent data corruption, e.g. the current round of emulation will complete before the vCPU bails out to userspace. But, the very act of killing the guest can also cause data corruption, e.g. due to lack of file writeback before termination, so taking on additional complexity to cleanly bail out of the emulator isn't justified, the goal is purely to stem the bleeding and alert userspace that something has gone horribly wrong, i.e. to avoid _silent_ data corruption. Signed-off-by: Sean Christopherson Reviewed-by: Kees Cook Reviewed-by: Vitaly Kuznetsov Message-Id: <20220526210817.3428868-7-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'arch/x86/kvm/x86.c') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index e8177d8b378e..a0baf49bb00d 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -7915,7 +7915,16 @@ static int emulator_set_xcr(struct x86_emulate_ctxt *ctxt, u32 index, u64 xcr) return __kvm_set_xcr(emul_to_vcpu(ctxt), index, xcr); } +static void emulator_vm_bugged(struct x86_emulate_ctxt *ctxt) +{ + struct kvm *kvm = emul_to_vcpu(ctxt)->kvm; + + if (!kvm->vm_bugged) + kvm_vm_bugged(kvm); +} + static const struct x86_emulate_ops emulate_ops = { + .vm_bugged = emulator_vm_bugged, .read_gpr = emulator_read_gpr, .write_gpr = emulator_write_gpr, .read_std = emulator_read_std, -- cgit v1.2.3 From ae801e1303e939ad5ebd9f390bdcc57275ada33b Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 14 Jun 2022 23:05:46 +0000 Subject: KVM: x86: Check for in-kernel xAPIC when querying APICv for directed yield Use kvm_vcpu_apicv_active() to check if APICv is active when seeing if a vCPU is a candidate for directed yield due to a pending ACPIv interrupt. This will allow moving apicv_active into kvm_lapic without introducing a potential NULL pointer deref (kvm_vcpu_apicv_active() effectively adds a pre-check on the vCPU having an in-kernel APIC). No functional change intended. Signed-off-by: Sean Christopherson Message-Id: <20220614230548.3852141-4-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86/kvm/x86.c') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index a0baf49bb00d..db35d40bf996 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -12395,7 +12395,8 @@ int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu) bool kvm_arch_dy_has_pending_interrupt(struct kvm_vcpu *vcpu) { - if (vcpu->arch.apicv_active && static_call(kvm_x86_dy_apicv_has_pending_interrupt)(vcpu)) + if (kvm_vcpu_apicv_active(vcpu) && + static_call(kvm_x86_dy_apicv_has_pending_interrupt)(vcpu)) return true; return false; -- cgit v1.2.3 From ce0a58f4756c14d7646cfdf279dbaada9d7712a0 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 14 Jun 2022 23:05:47 +0000 Subject: KVM: x86: Move "apicv_active" into "struct kvm_lapic" Move the per-vCPU apicv_active flag into KVM's local APIC instance. APICv is fully dependent on an in-kernel local APIC, but that's not at all clear when reading the current code due to the flag being stored in the generic kvm_vcpu_arch struct. No functional change intended. Signed-off-by: Sean Christopherson Message-Id: <20220614230548.3852141-5-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'arch/x86/kvm/x86.c') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index db35d40bf996..00e23dc518e0 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -9460,7 +9460,7 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu) if (!lapic_in_kernel(vcpu)) return; - if (vcpu->arch.apicv_active) + if (vcpu->arch.apic->apicv_active) return; if (!vcpu->arch.apic->vapic_addr) @@ -9913,6 +9913,7 @@ void kvm_make_scan_ioapic_request(struct kvm *kvm) void kvm_vcpu_update_apicv(struct kvm_vcpu *vcpu) { + struct kvm_lapic *apic = vcpu->arch.apic; bool activate; if (!lapic_in_kernel(vcpu)) @@ -9923,10 +9924,10 @@ void kvm_vcpu_update_apicv(struct kvm_vcpu *vcpu) activate = kvm_vcpu_apicv_activated(vcpu); - if (vcpu->arch.apicv_active == activate) + if (apic->apicv_active == activate) goto out; - vcpu->arch.apicv_active = activate; + apic->apicv_active = activate; kvm_apic_update_apicv(vcpu); static_call(kvm_x86_refresh_apicv_exec_ctrl)(vcpu); @@ -9936,7 +9937,7 @@ void kvm_vcpu_update_apicv(struct kvm_vcpu *vcpu) * still active when the interrupt got accepted. Make sure * inject_pending_event() is called to check for that. */ - if (!vcpu->arch.apicv_active) + if (!apic->apicv_active) kvm_make_request(KVM_REQ_EVENT, vcpu); out: @@ -11379,7 +11380,7 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) * will ensure the vCPU gets the correct state before VM-Entry. */ if (enable_apicv) { - vcpu->arch.apicv_active = true; + vcpu->arch.apic->apicv_active = true; kvm_make_request(KVM_REQ_APICV_UPDATE, vcpu); } } else -- cgit v1.2.3 From 9fc222967a39d6be96dabb942cc94e4f07ca049c Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Sat, 11 Jun 2022 00:57:49 +0000 Subject: KVM: x86: Give host userspace full control of MSR_IA32_MISC_ENABLES Give userspace full control of the read-only bits in MISC_ENABLES, i.e. do not modify bits on PMU refresh and do not preserve existing bits when userspace writes MISC_ENABLES. With a few exceptions where KVM doesn't expose the necessary controls to userspace _and_ there is a clear cut association with CPUID, e.g. reserved CR4 bits, KVM does not own the vCPU and should not manipulate the vCPU model on behalf of "dummy user space". The argument that KVM is doing userspace a favor because "the order of setting vPMU capabilities and MSR_IA32_MISC_ENABLE is not strictly guaranteed" is specious, as attempting to configure MSRs on behalf of userspace inevitably leads to edge cases precisely because KVM does not prescribe a specific order of initialization. Example #1: intel_pmu_refresh() consumes and modifies the vCPU's MSR_IA32_PERF_CAPABILITIES, and so assumes userspace initializes config MSRs before setting the guest CPUID model. If userspace sets CPUID first, then KVM will mark PEBS as available when arch.perf_capabilities is initialized with a non-zero PEBS format, thus creating a bad vCPU model if userspace later disables PEBS by writing PERF_CAPABILITIES. Example #2: intel_pmu_refresh() does not clear PERF_CAP_PEBS_MASK in MSR_IA32_PERF_CAPABILITIES if there is no vPMU, making KVM inconsistent in its desire to be consistent. Example #3: intel_pmu_refresh() does not clear MSR_IA32_MISC_ENABLE_EMON if KVM_SET_CPUID2 is called multiple times, first with a vPMU, then without a vPMU. While slightly contrived, it's plausible a VMM could reflect KVM's default vCPU and then operate on KVM's copy of CPUID to later clear the vPMU settings, e.g. see KVM's selftests. Example #4: Enumerating an Intel vCPU on an AMD host will not call into intel_pmu_refresh() at any point, and so the BTS and PEBS "unavailable" bits will be left clear, without any way for userspace to set them. Keep the "R" behavior of the bit 7, "EMON available", for the guest. Unlike the BTS and PEBS bits, which are fully "RO", the EMON bit can be written with a different value, but that new value is ignored. Cc: Like Xu Signed-off-by: Sean Christopherson Reported-by: kernel test robot Message-Id: <20220611005755.753273-2-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 24 +++++++++++------------- 1 file changed, 11 insertions(+), 13 deletions(-) (limited to 'arch/x86/kvm/x86.c') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 00e23dc518e0..67bdd7e20534 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3550,21 +3550,17 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) break; case MSR_IA32_MISC_ENABLE: { u64 old_val = vcpu->arch.ia32_misc_enable_msr; - u64 pmu_mask = MSR_IA32_MISC_ENABLE_PMU_RO_MASK | - MSR_IA32_MISC_ENABLE_EMON; - /* RO bits */ - if (!msr_info->host_initiated && - ((old_val ^ data) & MSR_IA32_MISC_ENABLE_PMU_RO_MASK)) - return 1; + if (!msr_info->host_initiated) { + /* RO bits */ + if ((old_val ^ data) & MSR_IA32_MISC_ENABLE_PMU_RO_MASK) + return 1; + + /* R bits, i.e. writes are ignored, but don't fault. */ + data = data & ~MSR_IA32_MISC_ENABLE_EMON; + data |= old_val & MSR_IA32_MISC_ENABLE_EMON; + } - /* - * For a dummy user space, the order of setting vPMU capabilities and - * initialising MSR_IA32_MISC_ENABLE is not strictly guaranteed, so to - * avoid inconsistent functionality we keep the vPMU bits unchanged here. - */ - data &= ~pmu_mask; - data |= old_val & pmu_mask; if (!kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT) && ((old_val ^ data) & MSR_IA32_MISC_ENABLE_MWAIT)) { if (!guest_cpuid_has(vcpu, X86_FEATURE_XMM3)) @@ -11573,6 +11569,8 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) vcpu->arch.smbase = 0x30000; vcpu->arch.msr_misc_features_enables = 0; + vcpu->arch.ia32_misc_enable_msr = MSR_IA32_MISC_ENABLE_PEBS_UNAVAIL | + MSR_IA32_MISC_ENABLE_BTS_UNAVAIL; __kvm_set_xcr(vcpu, 0, XFEATURE_MASK_FP); __kvm_set_msr(vcpu, MSR_IA32_XSS, 0, true); -- cgit v1.2.3 From 545feb96c052809dab5ec04b95f976acca9f9364 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Sat, 11 Jun 2022 00:57:52 +0000 Subject: Revert "KVM: x86: always allow host-initiated writes to PMU MSRs" Revert the hack to allow host-initiated accesses to all "PMU" MSRs, as intel_is_valid_msr() returns true for _all_ MSRs, regardless of whether or not it has a snowball's chance in hell of actually being a PMU MSR. That mostly gets papered over by the actual get/set helpers only handling MSRs that they knows about, except there's the minor detail that kvm_pmu_{g,s}et_msr() eat reads and writes when the PMU is disabled. I.e. KVM will happy allow reads and writes to _any_ MSR if the PMU is disabled, either via module param or capability. This reverts commit d1c88a4020567ba4da52f778bcd9619d87e4ea75. Fixes: d1c88a402056 ("KVM: x86: always allow host-initiated writes to PMU MSRs") Signed-off-by: Sean Christopherson Message-Id: <20220611005755.753273-5-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'arch/x86/kvm/x86.c') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 67bdd7e20534..46301b148f64 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3704,7 +3704,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) fallthrough; case MSR_K7_EVNTSEL0 ... MSR_K7_EVNTSEL3: case MSR_P6_EVNTSEL0 ... MSR_P6_EVNTSEL1: - if (kvm_pmu_is_valid_msr(vcpu, msr, msr_info->host_initiated)) + if (kvm_pmu_is_valid_msr(vcpu, msr)) return kvm_pmu_set_msr(vcpu, msr_info); if (pr || data != 0) @@ -3787,7 +3787,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) break; #endif default: - if (kvm_pmu_is_valid_msr(vcpu, msr, msr_info->host_initiated)) + if (kvm_pmu_is_valid_msr(vcpu, msr)) return kvm_pmu_set_msr(vcpu, msr_info); return KVM_MSR_RET_INVALID; } @@ -3867,7 +3867,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) msr_info->data = 0; break; case MSR_F15H_PERF_CTL0 ... MSR_F15H_PERF_CTR5: - if (kvm_pmu_is_valid_msr(vcpu, msr_info->index, msr_info->host_initiated)) + if (kvm_pmu_is_valid_msr(vcpu, msr_info->index)) return kvm_pmu_get_msr(vcpu, msr_info); if (!msr_info->host_initiated) return 1; @@ -3877,7 +3877,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_K7_PERFCTR0 ... MSR_K7_PERFCTR3: case MSR_P6_PERFCTR0 ... MSR_P6_PERFCTR1: case MSR_P6_EVNTSEL0 ... MSR_P6_EVNTSEL1: - if (kvm_pmu_is_valid_msr(vcpu, msr_info->index, msr_info->host_initiated)) + if (kvm_pmu_is_valid_msr(vcpu, msr_info->index)) return kvm_pmu_get_msr(vcpu, msr_info); msr_info->data = 0; break; @@ -4123,7 +4123,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) break; #endif default: - if (kvm_pmu_is_valid_msr(vcpu, msr_info->index, msr_info->host_initiated)) + if (kvm_pmu_is_valid_msr(vcpu, msr_info->index)) return kvm_pmu_get_msr(vcpu, msr_info); return KVM_MSR_RET_INVALID; } -- cgit v1.2.3 From 157fc497b54fd1dfcdedbca6199adca4bf5ee6ff Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Sat, 11 Jun 2022 00:57:54 +0000 Subject: KVM: x86: Ignore benign host accesses to "unsupported" PEBS and BTS MSRs Ignore host userspace reads and writes of '0' to PEBS and BTS MSRs that KVM reports in the MSR-to-save list, but the MSRs are ultimately unsupported. All MSRs in said list must be writable by userspace, e.g. if userspace sends the list back at KVM without filtering out the MSRs it doesn't need. Fixes: 8183a538cd95 ("KVM: x86/pmu: Add IA32_DS_AREA MSR emulation to support guest DS") Fixes: 902caeb6841a ("KVM: x86/pmu: Add PEBS_DATA_CFG MSR emulation to support adaptive PEBS") Fixes: c59a1f106f5c ("KVM: x86/pmu: Add IA32_PEBS_ENABLE MSR emulation for extended PEBS") Signed-off-by: Sean Christopherson Message-Id: <20220611005755.753273-7-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'arch/x86/kvm/x86.c') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 46301b148f64..e07c5765a551 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3786,6 +3786,16 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) vcpu->arch.guest_fpu.xfd_err = data; break; #endif + case MSR_IA32_PEBS_ENABLE: + case MSR_IA32_DS_AREA: + case MSR_PEBS_DATA_CFG: + if (kvm_pmu_is_valid_msr(vcpu, msr)) + return kvm_pmu_set_msr(vcpu, msr_info); + /* + * Userspace is allowed to write '0' to MSRs that KVM reports + * as to-be-saved, even if an MSRs isn't fully supported. + */ + return !msr_info->host_initiated || data; default: if (kvm_pmu_is_valid_msr(vcpu, msr)) return kvm_pmu_set_msr(vcpu, msr_info); @@ -3866,9 +3876,16 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_DRAM_ENERGY_STATUS: /* DRAM controller */ msr_info->data = 0; break; + case MSR_IA32_PEBS_ENABLE: + case MSR_IA32_DS_AREA: + case MSR_PEBS_DATA_CFG: case MSR_F15H_PERF_CTL0 ... MSR_F15H_PERF_CTR5: if (kvm_pmu_is_valid_msr(vcpu, msr_info->index)) return kvm_pmu_get_msr(vcpu, msr_info); + /* + * Userspace is allowed to read MSRs that KVM reports in + * KVM_GET_MSR_INDEX_LIST, even if an MSR isn't fully supported. + */ if (!msr_info->host_initiated) return 1; msr_info->data = 0; -- cgit v1.2.3 From ff81a90f45ce6b818167c590f7625b3b573defc9 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Sat, 11 Jun 2022 00:57:55 +0000 Subject: KVM: x86: Ignore benign host writes to "unsupported" F15H_PERF_CTL MSRs Ignore host userspace writes of '0' to F15H_PERF_CTL MSRs KVM reports in the MSR-to-save list, but the MSRs are ultimately unsupported. All MSRs in said list must be writable by userspace, e.g. if userspace sends the list back at KVM without filtering out the MSRs it doesn't need. Note, reads of said MSRs already have the desired behavior. Signed-off-by: Sean Christopherson Message-Id: <20220611005755.753273-8-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86/kvm/x86.c') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index e07c5765a551..b70fe8560881 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3789,6 +3789,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_IA32_PEBS_ENABLE: case MSR_IA32_DS_AREA: case MSR_PEBS_DATA_CFG: + case MSR_F15H_PERF_CTL0 ... MSR_F15H_PERF_CTR5: if (kvm_pmu_is_valid_msr(vcpu, msr)) return kvm_pmu_set_msr(vcpu, msr_info); /* -- cgit v1.2.3 From bfbcc81bb82cbbad8bf4e40cea274f42b50674e2 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 8 Jun 2022 22:45:12 +0000 Subject: KVM: x86: Add a quirk for KVM's "MONITOR/MWAIT are NOPs!" behavior Add a quirk for KVM's behavior of emulating intercepted MONITOR/MWAIT instructions a NOPs regardless of whether or not they are supported in guest CPUID. KVM's current behavior was likely motiviated by a certain fruity operating system that expects MONITOR/MWAIT to be supported unconditionally and blindly executes MONITOR/MWAIT without first checking CPUID. And because KVM does NOT advertise MONITOR/MWAIT to userspace, that's effectively the default setup for any VMM that regurgitates KVM_GET_SUPPORTED_CPUID to KVM_SET_CPUID2. Note, this quirk interacts with KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT. The behavior is actually desirable, as userspace VMMs that want to unconditionally hide MONITOR/MWAIT from the guest can leave the MISC_ENABLE quirk enabled. Signed-off-by: Sean Christopherson Message-Id: <20220608224516.3788274-2-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 30 +++++++++++++++++++----------- 1 file changed, 19 insertions(+), 11 deletions(-) (limited to 'arch/x86/kvm/x86.c') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index b70fe8560881..9b9f9f65c548 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2036,13 +2036,6 @@ int kvm_emulate_invd(struct kvm_vcpu *vcpu) } EXPORT_SYMBOL_GPL(kvm_emulate_invd); -int kvm_emulate_mwait(struct kvm_vcpu *vcpu) -{ - pr_warn_once("kvm: MWAIT instruction emulated as NOP!\n"); - return kvm_emulate_as_nop(vcpu); -} -EXPORT_SYMBOL_GPL(kvm_emulate_mwait); - int kvm_handle_invalid_op(struct kvm_vcpu *vcpu) { kvm_queue_exception(vcpu, UD_VECTOR); @@ -2050,11 +2043,26 @@ int kvm_handle_invalid_op(struct kvm_vcpu *vcpu) } EXPORT_SYMBOL_GPL(kvm_handle_invalid_op); -int kvm_emulate_monitor(struct kvm_vcpu *vcpu) + +static int kvm_emulate_monitor_mwait(struct kvm_vcpu *vcpu, const char *insn) { - pr_warn_once("kvm: MONITOR instruction emulated as NOP!\n"); + if (!kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_MWAIT_NEVER_FAULTS) && + !guest_cpuid_has(vcpu, X86_FEATURE_MWAIT)) + return kvm_handle_invalid_op(vcpu); + + pr_warn_once("kvm: %s instruction emulated as NOP!\n", insn); return kvm_emulate_as_nop(vcpu); } +int kvm_emulate_mwait(struct kvm_vcpu *vcpu) +{ + return kvm_emulate_monitor_mwait(vcpu, "MWAIT"); +} +EXPORT_SYMBOL_GPL(kvm_emulate_mwait); + +int kvm_emulate_monitor(struct kvm_vcpu *vcpu) +{ + return kvm_emulate_monitor_mwait(vcpu, "MONITOR"); +} EXPORT_SYMBOL_GPL(kvm_emulate_monitor); static inline bool kvm_vcpu_exit_request(struct kvm_vcpu *vcpu) @@ -3884,8 +3892,8 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) if (kvm_pmu_is_valid_msr(vcpu, msr_info->index)) return kvm_pmu_get_msr(vcpu, msr_info); /* - * Userspace is allowed to read MSRs that KVM reports in - * KVM_GET_MSR_INDEX_LIST, even if an MSR isn't fully supported. + * Userspace is allowed to read MSRs that KVM reports as + * to-be-saved, even if an MSR isn't fully supported. */ if (!msr_info->host_initiated) return 1; -- cgit v1.2.3 From 1c4dc57328bf218e999951824dce75c6125c4f3c Mon Sep 17 00:00:00 2001 From: Ben Gardon Date: Mon, 13 Jun 2022 21:25:20 +0000 Subject: KVM: x86: Fix errant brace in KVM capability handling The braces around the KVM_CAP_XSAVE2 block also surround the KVM_CAP_PMU_CAPABILITY block, likely the result of a merge issue. Simply move the curly brace back to where it belongs. Fixes: ba7bb663f5547 ("KVM: x86: Provide per VM capability for disabling PMU virtualization") Reviewed-by: David Matlack Reviewed-by: Peter Xu Signed-off-by: Ben Gardon Message-Id: <20220613212523.3436117-8-bgardon@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kvm/x86.c') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 9b9f9f65c548..c1b3b2ea8ee0 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4420,10 +4420,10 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) if (r < sizeof(struct kvm_xsave)) r = sizeof(struct kvm_xsave); break; + } case KVM_CAP_PMU_CAPABILITY: r = enable_pmu ? KVM_CAP_PMU_VALID_MASK : 0; break; - } case KVM_CAP_DISABLE_QUIRKS2: r = KVM_X86_VALID_QUIRKS; break; -- cgit v1.2.3 From 084cc29f8bbb034cf30a7ee07a816c115e0c28df Mon Sep 17 00:00:00 2001 From: Ben Gardon Date: Mon, 13 Jun 2022 21:25:21 +0000 Subject: KVM: x86/MMU: Allow NX huge pages to be disabled on a per-vm basis In some cases, the NX hugepage mitigation for iTLB multihit is not needed for all guests on a host. Allow disabling the mitigation on a per-VM basis to avoid the performance hit of NX hugepages on trusted workloads. In order to disable NX hugepages on a VM, ensure that the userspace actor has permission to reboot the system. Since disabling NX hugepages would allow a guest to crash the system, it is similar to reboot permissions. Ideally, KVM would require userspace to prove it has access to KVM's nx_huge_pages module param, e.g. so that userspace can opt out without needing full reboot permissions. But getting access to the module param file info is difficult because it is buried in layers of sysfs and module glue. Requiring CAP_SYS_BOOT is sufficient for all known use cases. Suggested-by: Jim Mattson Reviewed-by: David Matlack Reviewed-by: Peter Xu Signed-off-by: Ben Gardon Message-Id: <20220613212523.3436117-9-bgardon@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) (limited to 'arch/x86/kvm/x86.c') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index c1b3b2ea8ee0..7ce0c6fe166d 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4324,6 +4324,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_SYS_ATTRIBUTES: case KVM_CAP_VAPIC: case KVM_CAP_ENABLE_CAP: + case KVM_CAP_VM_DISABLE_NX_HUGE_PAGES: r = 1; break; case KVM_CAP_EXIT_HYPERCALL: @@ -6184,6 +6185,35 @@ split_irqchip_unlock: } mutex_unlock(&kvm->lock); break; + case KVM_CAP_VM_DISABLE_NX_HUGE_PAGES: + r = -EINVAL; + + /* + * Since the risk of disabling NX hugepages is a guest crashing + * the system, ensure the userspace process has permission to + * reboot the system. + * + * Note that unlike the reboot() syscall, the process must have + * this capability in the root namespace because exposing + * /dev/kvm into a container does not limit the scope of the + * iTLB multihit bug to that container. In other words, + * this must use capable(), not ns_capable(). + */ + if (!capable(CAP_SYS_BOOT)) { + r = -EPERM; + break; + } + + if (cap->args[0]) + break; + + mutex_lock(&kvm->lock); + if (!kvm->created_vcpus) { + kvm->arch.disable_nx_huge_pages = true; + r = 0; + } + mutex_unlock(&kvm->lock); + break; default: r = -EINVAL; break; -- cgit v1.2.3 From 4b903561ec499eef233bf690076cd71b1f8604cf Mon Sep 17 00:00:00 2001 From: Jue Wang Date: Fri, 10 Jun 2022 10:11:30 -0700 Subject: KVM: x86: Add Corrected Machine Check Interrupt (CMCI) emulation to lapic. This patch calculates the number of lvt entries as part of KVM_X86_MCE_SETUP conditioned on the presence of MCG_CMCI_P bit in MCG_CAP and stores result in kvm_lapic. It translats from APIC_LVTx register to index in lapic_lvt_entry enum. It extends the APIC_LVTx macro as well as other lapic write/reset handling etc to support Corrected Machine Check Interrupt. Signed-off-by: Jue Wang Signed-off-by: Paolo Bonzini Message-Id: <20220610171134.772566-5-juew@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86/kvm/x86.c') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 7ce0c6fe166d..6d6d6c6413ba 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4845,6 +4845,8 @@ static int kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu *vcpu, /* Init IA32_MCi_CTL to all 1s */ for (bank = 0; bank < bank_num; bank++) vcpu->arch.mce_banks[bank*4] = ~(u64)0; + vcpu->arch.apic->nr_lvt_entries = + KVM_APIC_MAX_NR_LVT_ENTRIES - !(mcg_cap & MCG_CMCI_P); static_call(kvm_x86_setup_mce)(vcpu); out: -- cgit v1.2.3 From 087acc4e1847993a66f43128e40be0ab2244c98f Mon Sep 17 00:00:00 2001 From: Jue Wang Date: Fri, 10 Jun 2022 10:11:31 -0700 Subject: KVM: x86: Use kcalloc to allocate the mce_banks array. This patch updates the allocation of mce_banks with the array allocation API (kcalloc) as a precedent for the later mci_ctl2_banks to implement per-bank control of Corrected Machine Check Interrupt (CMCI). Suggested-by: Sean Christopherson Signed-off-by: Jue Wang Signed-off-by: Paolo Bonzini Message-Id: <20220610171134.772566-6-juew@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kvm/x86.c') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 6d6d6c6413ba..3c6eb6837b96 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -11447,7 +11447,7 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) goto fail_free_lapic; vcpu->arch.pio_data = page_address(page); - vcpu->arch.mce_banks = kzalloc(KVM_MAX_MCE_BANKS * sizeof(u64) * 4, + vcpu->arch.mce_banks = kcalloc(KVM_MAX_MCE_BANKS * 4, sizeof(u64), GFP_KERNEL_ACCOUNT); if (!vcpu->arch.mce_banks) goto fail_free_pio_data; -- cgit v1.2.3 From 281b52780b57821abc434c0413107d3075881a1f Mon Sep 17 00:00:00 2001 From: Jue Wang Date: Fri, 10 Jun 2022 10:11:32 -0700 Subject: KVM: x86: Add emulation for MSR_IA32_MCx_CTL2 MSRs. This patch adds the emulation of IA32_MCi_CTL2 registers to KVM. A separate mci_ctl2_banks array is used to keep the existing mce_banks register layout intact. In Machine Check Architecture, in addition to MCG_CMCI_P, bit 30 of the per-bank register IA32_MCi_CTL2 controls whether Corrected Machine Check error reporting is enabled. Signed-off-by: Jue Wang Signed-off-by: Paolo Bonzini Message-Id: <20220610171134.772566-7-juew@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 130 +++++++++++++++++++++++++++++++++++++---------------- 1 file changed, 91 insertions(+), 39 deletions(-) (limited to 'arch/x86/kvm/x86.c') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 3c6eb6837b96..f743e4ef2009 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3191,6 +3191,16 @@ static void kvmclock_sync_fn(struct work_struct *work) KVMCLOCK_SYNC_PERIOD); } +/* These helpers are safe iff @msr is known to be an MCx bank MSR. */ +static bool is_mci_control_msr(u32 msr) +{ + return (msr & 3) == 0; +} +static bool is_mci_status_msr(u32 msr) +{ + return (msr & 3) == 1; +} + /* * On AMD, HWCR[McStatusWrEn] controls whether setting MCi_STATUS results in #GP. */ @@ -3209,6 +3219,7 @@ static int set_msr_mce(struct kvm_vcpu *vcpu, struct msr_data *msr_info) unsigned bank_num = mcg_cap & 0xff; u32 msr = msr_info->index; u64 data = msr_info->data; + u32 offset, last_msr; switch (msr) { case MSR_IA32_MCG_STATUS: @@ -3222,35 +3233,53 @@ static int set_msr_mce(struct kvm_vcpu *vcpu, struct msr_data *msr_info) return 1; vcpu->arch.mcg_ctl = data; break; - default: - if (msr >= MSR_IA32_MC0_CTL && - msr < MSR_IA32_MCx_CTL(bank_num)) { - u32 offset = array_index_nospec( - msr - MSR_IA32_MC0_CTL, - MSR_IA32_MCx_CTL(bank_num) - MSR_IA32_MC0_CTL); - - /* only 0 or all 1s can be written to IA32_MCi_CTL - * some Linux kernels though clear bit 10 in bank 4 to - * workaround a BIOS/GART TBL issue on AMD K8s, ignore - * this to avoid an uncatched #GP in the guest. + case MSR_IA32_MC0_CTL2 ... MSR_IA32_MCx_CTL2(KVM_MAX_MCE_BANKS) - 1: + last_msr = MSR_IA32_MCx_CTL2(bank_num) - 1; + if (msr > last_msr) + return 1; + + if (!(mcg_cap & MCG_CMCI_P) && (data || !msr_info->host_initiated)) + return 1; + /* An attempt to write a 1 to a reserved bit raises #GP */ + if (data & ~(MCI_CTL2_CMCI_EN | MCI_CTL2_CMCI_THRESHOLD_MASK)) + return 1; + offset = array_index_nospec(msr - MSR_IA32_MC0_CTL2, + last_msr + 1 - MSR_IA32_MC0_CTL2); + vcpu->arch.mci_ctl2_banks[offset] = data; + break; + case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1: + last_msr = MSR_IA32_MCx_CTL(bank_num) - 1; + if (msr > last_msr) + return 1; + + /* + * Only 0 or all 1s can be written to IA32_MCi_CTL, all other + * values are architecturally undefined. But, some Linux + * kernels clear bit 10 in bank 4 to workaround a BIOS/GART TLB + * issue on AMD K8s, allow bit 10 to be clear when setting all + * other bits in order to avoid an uncaught #GP in the guest. * * UNIXWARE clears bit 0 of MC1_CTL to ignore * correctable, single-bit ECC data errors. - */ - if ((offset & 0x3) == 0 && - data != 0 && (data | (1 << 10) | 1) != ~(u64)0) - return -1; - - /* MCi_STATUS */ - if (!msr_info->host_initiated && - (offset & 0x3) == 1 && data != 0) { - if (!can_set_mci_status(vcpu)) - return -1; - } + */ + if (is_mci_control_msr(msr) && + data != 0 && (data | (1 << 10) | 1) != ~(u64)0) + return 1; - vcpu->arch.mce_banks[offset] = data; - break; - } + /* + * All CPUs allow writing 0 to MCi_STATUS MSRs to clear the MSR. + * AMD-based CPUs allow non-zero values, but if and only if + * HWCR[McStatusWrEn] is set. + */ + if (!msr_info->host_initiated && is_mci_status_msr(msr) && + data != 0 && !can_set_mci_status(vcpu)) + return 1; + + offset = array_index_nospec(msr - MSR_IA32_MC0_CTL, + last_msr + 1 - MSR_IA32_MC0_CTL); + vcpu->arch.mce_banks[offset] = data; + break; + default: return 1; } return 0; @@ -3534,7 +3563,8 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) return 1; } break; - case 0x200 ... 0x2ff: + case 0x200 ... MSR_IA32_MC0_CTL2 - 1: + case MSR_IA32_MCx_CTL2(KVM_MAX_MCE_BANKS) ... 0x2ff: return kvm_mtrr_set_msr(vcpu, msr, data); case MSR_IA32_APICBASE: return kvm_set_apic_base(vcpu, msr_info); @@ -3704,6 +3734,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_IA32_MCG_CTL: case MSR_IA32_MCG_STATUS: case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1: + case MSR_IA32_MC0_CTL2 ... MSR_IA32_MCx_CTL2(KVM_MAX_MCE_BANKS) - 1: return set_msr_mce(vcpu, msr_info); case MSR_K7_PERFCTR0 ... MSR_K7_PERFCTR3: @@ -3819,6 +3850,7 @@ static int get_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, bool host) u64 data; u64 mcg_cap = vcpu->arch.mcg_cap; unsigned bank_num = mcg_cap & 0xff; + u32 offset, last_msr; switch (msr) { case MSR_IA32_P5_MC_ADDR: @@ -3836,16 +3868,27 @@ static int get_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, bool host) case MSR_IA32_MCG_STATUS: data = vcpu->arch.mcg_status; break; - default: - if (msr >= MSR_IA32_MC0_CTL && - msr < MSR_IA32_MCx_CTL(bank_num)) { - u32 offset = array_index_nospec( - msr - MSR_IA32_MC0_CTL, - MSR_IA32_MCx_CTL(bank_num) - MSR_IA32_MC0_CTL); + case MSR_IA32_MC0_CTL2 ... MSR_IA32_MCx_CTL2(KVM_MAX_MCE_BANKS) - 1: + last_msr = MSR_IA32_MCx_CTL2(bank_num) - 1; + if (msr > last_msr) + return 1; - data = vcpu->arch.mce_banks[offset]; - break; - } + if (!(mcg_cap & MCG_CMCI_P) && !host) + return 1; + offset = array_index_nospec(msr - MSR_IA32_MC0_CTL2, + last_msr + 1 - MSR_IA32_MC0_CTL2); + data = vcpu->arch.mci_ctl2_banks[offset]; + break; + case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1: + last_msr = MSR_IA32_MCx_CTL(bank_num) - 1; + if (msr > last_msr) + return 1; + + offset = array_index_nospec(msr - MSR_IA32_MC0_CTL, + last_msr + 1 - MSR_IA32_MC0_CTL); + data = vcpu->arch.mce_banks[offset]; + break; + default: return 1; } *pdata = data; @@ -3949,7 +3992,8 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) break; } case MSR_MTRRcap: - case 0x200 ... 0x2ff: + case 0x200 ... MSR_IA32_MC0_CTL2 - 1: + case MSR_IA32_MCx_CTL2(KVM_MAX_MCE_BANKS) ... 0x2ff: return kvm_mtrr_get_msr(vcpu, msr_info->index, &msr_info->data); case 0xcd: /* fsb frequency */ msr_info->data = 3; @@ -4065,6 +4109,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_IA32_MCG_CTL: case MSR_IA32_MCG_STATUS: case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1: + case MSR_IA32_MC0_CTL2 ... MSR_IA32_MCx_CTL2(KVM_MAX_MCE_BANKS) - 1: return get_msr_mce(vcpu, msr_info->index, &msr_info->data, msr_info->host_initiated); case MSR_IA32_XSS: @@ -4842,9 +4887,12 @@ static int kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu *vcpu, /* Init IA32_MCG_CTL to all 1s */ if (mcg_cap & MCG_CTL_P) vcpu->arch.mcg_ctl = ~(u64)0; - /* Init IA32_MCi_CTL to all 1s */ - for (bank = 0; bank < bank_num; bank++) + /* Init IA32_MCi_CTL to all 1s, IA32_MCi_CTL2 to all 0s */ + for (bank = 0; bank < bank_num; bank++) { vcpu->arch.mce_banks[bank*4] = ~(u64)0; + if (mcg_cap & MCG_CMCI_P) + vcpu->arch.mci_ctl2_banks[bank] = 0; + } vcpu->arch.apic->nr_lvt_entries = KVM_APIC_MAX_NR_LVT_ENTRIES - !(mcg_cap & MCG_CMCI_P); @@ -11449,7 +11497,9 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) vcpu->arch.mce_banks = kcalloc(KVM_MAX_MCE_BANKS * 4, sizeof(u64), GFP_KERNEL_ACCOUNT); - if (!vcpu->arch.mce_banks) + vcpu->arch.mci_ctl2_banks = kcalloc(KVM_MAX_MCE_BANKS, sizeof(u64), + GFP_KERNEL_ACCOUNT); + if (!vcpu->arch.mce_banks || !vcpu->arch.mci_ctl2_banks) goto fail_free_pio_data; vcpu->arch.mcg_cap = KVM_MAX_MCE_BANKS; @@ -11503,6 +11553,7 @@ free_wbinvd_dirty_mask: free_cpumask_var(vcpu->arch.wbinvd_dirty_mask); fail_free_mce_banks: kfree(vcpu->arch.mce_banks); + kfree(vcpu->arch.mci_ctl2_banks); fail_free_pio_data: free_page((unsigned long)vcpu->arch.pio_data); fail_free_lapic: @@ -11548,6 +11599,7 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) kvm_hv_vcpu_uninit(vcpu); kvm_pmu_destroy(vcpu); kfree(vcpu->arch.mce_banks); + kfree(vcpu->arch.mci_ctl2_banks); kvm_free_lapic(vcpu); idx = srcu_read_lock(&vcpu->kvm->srcu); kvm_mmu_destroy(vcpu); -- cgit v1.2.3 From aebc3ca19063d68b76bcaaca81558d4f180c61b0 Mon Sep 17 00:00:00 2001 From: Jue Wang Date: Fri, 10 Jun 2022 10:11:33 -0700 Subject: KVM: x86: Enable CMCI capability by default and handle injected UCNA errors This patch enables MCG_CMCI_P by default in kvm_mce_cap_supported. It reuses ioctl KVM_X86_SET_MCE to implement injection of UnCorrectable No Action required (UCNA) errors, signaled via Corrected Machine Check Interrupt (CMCI). Neither of the CMCI and UCNA emulations depends on hardware. Signed-off-by: Jue Wang Signed-off-by: Paolo Bonzini Message-Id: <20220610171134.772566-8-juew@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 43 ++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 42 insertions(+), 1 deletion(-) (limited to 'arch/x86/kvm/x86.c') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index f743e4ef2009..031678eff28e 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4901,6 +4901,42 @@ out: return r; } +/* + * Validate this is an UCNA (uncorrectable no action) error by checking the + * MCG_STATUS and MCi_STATUS registers: + * - none of the bits for Machine Check Exceptions are set + * - both the VAL (valid) and UC (uncorrectable) bits are set + * MCI_STATUS_PCC - Processor Context Corrupted + * MCI_STATUS_S - Signaled as a Machine Check Exception + * MCI_STATUS_AR - Software recoverable Action Required + */ +static bool is_ucna(struct kvm_x86_mce *mce) +{ + return !mce->mcg_status && + !(mce->status & (MCI_STATUS_PCC | MCI_STATUS_S | MCI_STATUS_AR)) && + (mce->status & MCI_STATUS_VAL) && + (mce->status & MCI_STATUS_UC); +} + +static int kvm_vcpu_x86_set_ucna(struct kvm_vcpu *vcpu, struct kvm_x86_mce *mce, u64* banks) +{ + u64 mcg_cap = vcpu->arch.mcg_cap; + + banks[1] = mce->status; + banks[2] = mce->addr; + banks[3] = mce->misc; + vcpu->arch.mcg_status = mce->mcg_status; + + if (!(mcg_cap & MCG_CMCI_P) || + !(vcpu->arch.mci_ctl2_banks[mce->bank] & MCI_CTL2_CMCI_EN)) + return 0; + + if (lapic_in_kernel(vcpu)) + kvm_apic_local_deliver(vcpu->arch.apic, APIC_LVTCMCI); + + return 0; +} + static int kvm_vcpu_ioctl_x86_set_mce(struct kvm_vcpu *vcpu, struct kvm_x86_mce *mce) { @@ -4910,6 +4946,12 @@ static int kvm_vcpu_ioctl_x86_set_mce(struct kvm_vcpu *vcpu, if (mce->bank >= bank_num || !(mce->status & MCI_STATUS_VAL)) return -EINVAL; + + banks += array_index_nospec(4 * mce->bank, 4 * bank_num); + + if (is_ucna(mce)) + return kvm_vcpu_x86_set_ucna(vcpu, mce, banks); + /* * if IA32_MCG_CTL is not all 1s, the uncorrected error * reporting is disabled @@ -4917,7 +4959,6 @@ static int kvm_vcpu_ioctl_x86_set_mce(struct kvm_vcpu *vcpu, if ((mce->status & MCI_STATUS_UC) && (mcg_cap & MCG_CTL_P) && vcpu->arch.mcg_ctl != ~(u64)0) return 0; - banks += 4 * mce->bank; /* * if IA32_MCi_CTL is not all 1s, the uncorrected error * reporting is disabled for the bank -- cgit v1.2.3 From 8fc9c7a3079e2d2e72dfb2f4e9bb2b1fc67f0ee6 Mon Sep 17 00:00:00 2001 From: Suravee Suthikulpanit Date: Thu, 19 May 2022 05:27:00 -0500 Subject: KVM: x86: Deactivate APICv on vCPU with APIC disabled APICv should be deactivated on vCPU that has APIC disabled. Therefore, call kvm_vcpu_update_apicv() when changing APIC mode, and add additional check for APIC disable mode when determine APICV activation, Suggested-by: Maxim Levitsky Reviewed-by: Maxim Levitsky Signed-off-by: Suravee Suthikulpanit Message-Id: <20220519102709.24125-9-suravee.suthikulpanit@amd.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'arch/x86/kvm/x86.c') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 031678eff28e..47981e97d686 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -10065,7 +10065,9 @@ void kvm_vcpu_update_apicv(struct kvm_vcpu *vcpu) down_read(&vcpu->kvm->arch.apicv_update_lock); preempt_disable(); - activate = kvm_vcpu_apicv_activated(vcpu); + /* Do not activate APICV when APIC is disabled */ + activate = kvm_vcpu_apicv_activated(vcpu) && + (kvm_get_apic_mode(vcpu) != LAPIC_MODE_DISABLED); if (apic->apicv_active == activate) goto out; -- cgit v1.2.3 From f8d8ac215919cf40d3b358846ed8e8e3a3683131 Mon Sep 17 00:00:00 2001 From: Suravee Suthikulpanit Date: Thu, 19 May 2022 05:27:06 -0500 Subject: KVM: x86: Warning APICv inconsistency only when vcpu APIC mode is valid When launching a VM with x2APIC and specify more than 255 vCPUs, the guest kernel can disable x2APIC (e.g. specify nox2apic kernel option). The VM fallbacks to xAPIC mode, and disable the vCPU ID 255 and greater. In this case, APICV is deactivated for the disabled vCPUs. However, the current APICv consistency warning does not account for this case, which results in a warning. Therefore, modify warning logic to report only when vCPU APIC mode is valid. Reviewed-by: Maxim Levitsky Signed-off-by: Suravee Suthikulpanit Message-Id: <20220519102709.24125-15-suravee.suthikulpanit@amd.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86/kvm/x86.c') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 47981e97d686..acd7ce1dcb9f 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -10477,7 +10477,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) * per-VM state, and responsing vCPUs must wait for the update * to complete before servicing KVM_REQ_APICV_UPDATE. */ - WARN_ON_ONCE(kvm_vcpu_apicv_activated(vcpu) != kvm_vcpu_apicv_active(vcpu)); + WARN_ON_ONCE((kvm_vcpu_apicv_activated(vcpu) != kvm_vcpu_apicv_active(vcpu)) && + (kvm_get_apic_mode(vcpu) != LAPIC_MODE_DISABLED)); exit_fastpath = static_call(kvm_x86_vcpu_run)(vcpu); if (likely(exit_fastpath != EXIT_FASTPATH_REENTER_GUEST)) -- cgit v1.2.3 From 39b6b8c35cf37970e07e9081c0e7d1083930b2f7 Mon Sep 17 00:00:00 2001 From: Suravee Suthikulpanit Date: Thu, 19 May 2022 05:27:08 -0500 Subject: KVM: SVM: Add AVIC doorbell tracepoint Add a tracepoint to track number of doorbells being sent to signal a running vCPU to process IRQ after being injected. Reviewed-by: Maxim Levitsky Signed-off-by: Suravee Suthikulpanit Message-Id: <20220519102709.24125-17-suravee.suthikulpanit@amd.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86/kvm/x86.c') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index acd7ce1dcb9f..d88600e41ff8 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -13328,6 +13328,7 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_unaccelerated_access); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_incomplete_ipi); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_ga_log); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_kick_vcpu_slowpath); +EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_doorbell); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_apicv_accept_irq); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_vmgexit_enter); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_vmgexit_exit); -- cgit v1.2.3 From 7a6177d6f344941410faa51f3e46ef242c54b406 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 15 Jun 2022 13:32:29 -0400 Subject: KVM: x86: complete fast IN directly with complete_emulator_pio_in() Use complete_emulator_pio_in() directly when completing fast PIO, there's no need to bounce through emulator_pio_in(): the comment about ECX changing doesn't apply to fast PIO, which isn't used for string I/O. No functional change intended. Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'arch/x86/kvm/x86.c') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index d88600e41ff8..d66a873f4427 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -8871,11 +8871,7 @@ static int complete_fast_pio_in(struct kvm_vcpu *vcpu) /* For size less than 4 we merge, else we zero extend */ val = (vcpu->arch.pio.size < 4) ? kvm_rax_read(vcpu) : 0; - /* - * Since vcpu->arch.pio.count == 1 let emulator_pio_in perform - * the copy and tracing - */ - emulator_pio_in(vcpu, vcpu->arch.pio.size, vcpu->arch.pio.port, &val, 1); + complete_emulator_pio_in(vcpu, &val); kvm_rax_write(vcpu, val); return kvm_skip_emulated_instruction(vcpu); -- cgit v1.2.3 From 0f87ac234d98c68e4b2bfde6a8bcdeef3d3f54b7 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Fri, 22 Oct 2021 06:50:06 -0400 Subject: KVM: x86: inline kernel_pio into its sole caller The caller of kernel_pio already has arguments for most of what kernel_pio fishes out of vcpu->arch.pio. This is the first step towards ensuring that vcpu->arch.pio.* is only used when exiting to userspace. No functional change intended. Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 38 ++++++++++++++++---------------------- 1 file changed, 16 insertions(+), 22 deletions(-) (limited to 'arch/x86/kvm/x86.c') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index d66a873f4427..524a96d26399 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -7575,37 +7575,31 @@ emul_write: return emulator_write_emulated(ctxt, addr, new, bytes, exception); } -static int kernel_pio(struct kvm_vcpu *vcpu, void *pd) -{ - int r = 0, i; - - for (i = 0; i < vcpu->arch.pio.count; i++) { - if (vcpu->arch.pio.in) - r = kvm_io_bus_read(vcpu, KVM_PIO_BUS, vcpu->arch.pio.port, - vcpu->arch.pio.size, pd); - else - r = kvm_io_bus_write(vcpu, KVM_PIO_BUS, - vcpu->arch.pio.port, vcpu->arch.pio.size, - pd); - if (r) - break; - pd += vcpu->arch.pio.size; - } - return r; -} - static int emulator_pio_in_out(struct kvm_vcpu *vcpu, int size, unsigned short port, unsigned int count, bool in) { + void *data = vcpu->arch.pio_data; + unsigned i; + int r; + vcpu->arch.pio.port = port; vcpu->arch.pio.in = in; - vcpu->arch.pio.count = count; + vcpu->arch.pio.count = count; vcpu->arch.pio.size = size; - if (!kernel_pio(vcpu, vcpu->arch.pio_data)) - return 1; + for (i = 0; i < count; i++) { + if (in) + r = kvm_io_bus_read(vcpu, KVM_PIO_BUS, port, size, data); + else + r = kvm_io_bus_write(vcpu, KVM_PIO_BUS, port, size, data); + if (r) + goto userspace_io; + data += size; + } + return 1; +userspace_io: vcpu->run->exit_reason = KVM_EXIT_IO; vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT; vcpu->run->io.size = size; -- cgit v1.2.3 From 35ab3b77a0ae76246dd2666d4f8190c824392fb4 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 15 Jun 2022 11:05:06 -0400 Subject: KVM: x86: drop PIO from unregistered devices KVM protects the device list with SRCU, and therefore different calls to kvm_io_bus_read()/kvm_io_bus_write() can very well see different incarnations of kvm->buses. If userspace unregisters a device while vCPUs are running there is no well-defined result. This patch applies a safe fallback by returning early from emulator_pio_in_out(). This corresponds to returning zeroes from IN, and dropping the writes on the floor for OUT. Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) (limited to 'arch/x86/kvm/x86.c') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 524a96d26399..5a56d39bd81f 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -7593,8 +7593,19 @@ static int emulator_pio_in_out(struct kvm_vcpu *vcpu, int size, r = kvm_io_bus_read(vcpu, KVM_PIO_BUS, port, size, data); else r = kvm_io_bus_write(vcpu, KVM_PIO_BUS, port, size, data); - if (r) - goto userspace_io; + + if (r) { + if (i == 0) + goto userspace_io; + + /* + * Userspace must have unregistered the device while PIO + * was running. Drop writes / read as 0 (the buffer + * was zeroed in __emulator_pio_in). + */ + break; + } + data += size; } return 1; @@ -7606,7 +7617,6 @@ userspace_io: vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE; vcpu->run->io.count = count; vcpu->run->io.port = port; - return 0; } -- cgit v1.2.3 From 30d583fd4e1ecafa8642f9f74e102876b4aeb733 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Fri, 22 Oct 2021 08:07:19 -0400 Subject: KVM: x86: move all vcpu->arch.pio* setup in emulator_pio_in_out() For now, this is basically an excuse to add back the void* argument to the function, while removing some knowledge of vcpu->arch.pio* from its callers. The WARN that vcpu->arch.pio.count is zero is also extended to OUT operations. The vcpu->arch.pio* fields still need to be filled even when the PIO is handled in-kernel as __emulator_pio_in() is always followed by complete_emulator_pio_in(). But after fixing that, it will be possible to to only populate the vcpu->arch.pio* fields on userspace exits. No functional change intended. Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) (limited to 'arch/x86/kvm/x86.c') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 5a56d39bd81f..368d0d4d56ff 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -7576,17 +7576,25 @@ emul_write: } static int emulator_pio_in_out(struct kvm_vcpu *vcpu, int size, - unsigned short port, + unsigned short port, void *data, unsigned int count, bool in) { - void *data = vcpu->arch.pio_data; unsigned i; int r; + WARN_ON_ONCE(vcpu->arch.pio.count); vcpu->arch.pio.port = port; vcpu->arch.pio.in = in; vcpu->arch.pio.count = count; vcpu->arch.pio.size = size; + if (in) { + /* The buffer is only used in complete_emulator_pio_in(). */ + WARN_ON(data); + memset(vcpu->arch.pio_data, 0, size * count); + } else { + memcpy(vcpu->arch.pio_data, data, size * count); + } + data = vcpu->arch.pio_data; for (i = 0; i < count; i++) { if (in) @@ -7623,9 +7631,7 @@ userspace_io: static int __emulator_pio_in(struct kvm_vcpu *vcpu, int size, unsigned short port, unsigned int count) { - WARN_ON(vcpu->arch.pio.count); - memset(vcpu->arch.pio_data, 0, size * count); - return emulator_pio_in_out(vcpu, size, port, count, true); + return emulator_pio_in_out(vcpu, size, port, NULL, count, true); } static void complete_emulator_pio_in(struct kvm_vcpu *vcpu, void *val) @@ -7674,9 +7680,8 @@ static int emulator_pio_out(struct kvm_vcpu *vcpu, int size, { int ret; - memcpy(vcpu->arch.pio_data, val, size * count); - trace_kvm_pio(KVM_PIO_OUT, port, size, count, vcpu->arch.pio_data); - ret = emulator_pio_in_out(vcpu, size, port, count, false); + trace_kvm_pio(KVM_PIO_OUT, port, size, count, val); + ret = emulator_pio_in_out(vcpu, size, port, (void *)val, count, false); if (ret) vcpu->arch.pio.count = 0; -- cgit v1.2.3 From 0c05e10bce5217657bcdaa9787c9ea94e6b384b2 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 15 Jun 2022 10:24:01 -0400 Subject: KVM: x86: wean in-kernel PIO from vcpu->arch.pio* Make emulator_pio_in_out operate directly on the provided buffer as long as PIO is handled inside KVM. For input operations, this means that, in the case of in-kernel PIO, __emulator_pio_in() does not have to be always followed by complete_emulator_pio_in(). This affects emulator_pio_in() and kvm_sev_es_ins(); for the latter, that is why the call moves from advance_sev_es_emulated_ins() to complete_sev_es_emulated_ins(). For output, it means that vcpu->pio.count is never set unnecessarily and there is no need to clear it; but also vcpu->pio.size must not be used in kvm_sev_es_outs(), because it will not be updated for in-kernel OUT. Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 73 ++++++++++++++++++++++++------------------------------ 1 file changed, 32 insertions(+), 41 deletions(-) (limited to 'arch/x86/kvm/x86.c') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 368d0d4d56ff..716ae5c92687 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -7583,19 +7583,6 @@ static int emulator_pio_in_out(struct kvm_vcpu *vcpu, int size, int r; WARN_ON_ONCE(vcpu->arch.pio.count); - vcpu->arch.pio.port = port; - vcpu->arch.pio.in = in; - vcpu->arch.pio.count = count; - vcpu->arch.pio.size = size; - if (in) { - /* The buffer is only used in complete_emulator_pio_in(). */ - WARN_ON(data); - memset(vcpu->arch.pio_data, 0, size * count); - } else { - memcpy(vcpu->arch.pio_data, data, size * count); - } - data = vcpu->arch.pio_data; - for (i = 0; i < count; i++) { if (in) r = kvm_io_bus_read(vcpu, KVM_PIO_BUS, port, size, data); @@ -7608,9 +7595,10 @@ static int emulator_pio_in_out(struct kvm_vcpu *vcpu, int size, /* * Userspace must have unregistered the device while PIO - * was running. Drop writes / read as 0 (the buffer - * was zeroed in __emulator_pio_in). + * was running. Drop writes / read as 0. */ + if (in) + memset(data, 0, size * (count - i)); break; } @@ -7619,6 +7607,16 @@ static int emulator_pio_in_out(struct kvm_vcpu *vcpu, int size, return 1; userspace_io: + vcpu->arch.pio.port = port; + vcpu->arch.pio.in = in; + vcpu->arch.pio.count = count; + vcpu->arch.pio.size = size; + + if (in) + memset(vcpu->arch.pio_data, 0, size * count); + else + memcpy(vcpu->arch.pio_data, data, size * count); + vcpu->run->exit_reason = KVM_EXIT_IO; vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT; vcpu->run->io.size = size; @@ -7629,15 +7627,19 @@ userspace_io: } static int __emulator_pio_in(struct kvm_vcpu *vcpu, int size, - unsigned short port, unsigned int count) + unsigned short port, void *val, unsigned int count) { - return emulator_pio_in_out(vcpu, size, port, NULL, count, true); + int r = emulator_pio_in_out(vcpu, size, port, val, count, true); + if (r) + trace_kvm_pio(KVM_PIO_IN, port, size, count, val); + + return r; } static void complete_emulator_pio_in(struct kvm_vcpu *vcpu, void *val) { int size = vcpu->arch.pio.size; - unsigned count = vcpu->arch.pio.count; + unsigned int count = vcpu->arch.pio.count; memcpy(val, vcpu->arch.pio_data, size * count); trace_kvm_pio(KVM_PIO_IN, vcpu->arch.pio.port, size, count, vcpu->arch.pio_data); vcpu->arch.pio.count = 0; @@ -7654,16 +7656,11 @@ static int emulator_pio_in(struct kvm_vcpu *vcpu, int size, * shenanigans as KVM doesn't support modifying the rep count, * and the emulator ensures @count doesn't overflow the buffer. */ - } else { - int r = __emulator_pio_in(vcpu, size, port, count); - if (!r) - return r; - - /* Results already available, fall through. */ + complete_emulator_pio_in(vcpu, val); + return 1; } - complete_emulator_pio_in(vcpu, val); - return 1; + return __emulator_pio_in(vcpu, size, port, val, count); } static int emulator_pio_in_emulated(struct x86_emulate_ctxt *ctxt, @@ -7678,14 +7675,8 @@ static int emulator_pio_out(struct kvm_vcpu *vcpu, int size, unsigned short port, const void *val, unsigned int count) { - int ret; - trace_kvm_pio(KVM_PIO_OUT, port, size, count, val); - ret = emulator_pio_in_out(vcpu, size, port, (void *)val, count, false); - if (ret) - vcpu->arch.pio.count = 0; - - return ret; + return emulator_pio_in_out(vcpu, size, port, (void *)val, count, false); } static int emulator_pio_out_emulated(struct x86_emulate_ctxt *ctxt, @@ -13245,7 +13236,7 @@ static int kvm_sev_es_outs(struct kvm_vcpu *vcpu, unsigned int size, /* memcpy done already by emulator_pio_out. */ vcpu->arch.sev_pio_count -= count; - vcpu->arch.sev_pio_data += count * vcpu->arch.pio.size; + vcpu->arch.sev_pio_data += count * size; if (!ret) break; @@ -13261,20 +13252,20 @@ static int kvm_sev_es_outs(struct kvm_vcpu *vcpu, unsigned int size, static int kvm_sev_es_ins(struct kvm_vcpu *vcpu, unsigned int size, unsigned int port); -static void advance_sev_es_emulated_ins(struct kvm_vcpu *vcpu) +static void advance_sev_es_emulated_ins(struct kvm_vcpu *vcpu, unsigned count, int size) { - unsigned count = vcpu->arch.pio.count; - complete_emulator_pio_in(vcpu, vcpu->arch.sev_pio_data); vcpu->arch.sev_pio_count -= count; - vcpu->arch.sev_pio_data += count * vcpu->arch.pio.size; + vcpu->arch.sev_pio_data += count * size; } static int complete_sev_es_emulated_ins(struct kvm_vcpu *vcpu) { + unsigned count = vcpu->arch.pio.count; int size = vcpu->arch.pio.size; int port = vcpu->arch.pio.port; - advance_sev_es_emulated_ins(vcpu); + complete_emulator_pio_in(vcpu, vcpu->arch.sev_pio_data); + advance_sev_es_emulated_ins(vcpu, count, size); if (vcpu->arch.sev_pio_count) return kvm_sev_es_ins(vcpu, size, port); return 1; @@ -13286,11 +13277,11 @@ static int kvm_sev_es_ins(struct kvm_vcpu *vcpu, unsigned int size, for (;;) { unsigned int count = min_t(unsigned int, PAGE_SIZE / size, vcpu->arch.sev_pio_count); - if (!__emulator_pio_in(vcpu, size, port, count)) + if (!__emulator_pio_in(vcpu, size, port, vcpu->arch.sev_pio_data, count)) break; /* Emulation done by the kernel. */ - advance_sev_es_emulated_ins(vcpu); + advance_sev_es_emulated_ins(vcpu, count, size); if (!vcpu->arch.sev_pio_count) return 1; } -- cgit v1.2.3 From dc7a4bfde507ffe1d8bef49aba1322f1d20c2cb3 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Fri, 22 Oct 2021 08:01:36 -0400 Subject: KVM: x86: wean fast IN from emulator_pio_in Use __emulator_pio_in() directly for fast PIO instead of bouncing through emulator_pio_in() now that __emulator_pio_in() fills "val" when handling in-kernel PIO. vcpu->arch.pio.count is guaranteed to be '0', so this a pure nop. emulator_pio_in_emulated is now the last caller of emulator_pio_in. No functional change intended. Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kvm/x86.c') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 716ae5c92687..b824ffc63b17 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -8886,7 +8886,7 @@ static int kvm_fast_pio_in(struct kvm_vcpu *vcpu, int size, /* For size less than 4 we merge, else we zero extend */ val = (size < 4) ? kvm_rax_read(vcpu) : 0; - ret = emulator_pio_in(vcpu, size, port, &val, 1); + ret = __emulator_pio_in(vcpu, size, port, &val, 1); if (ret) { kvm_rax_write(vcpu, val); return ret; -- cgit v1.2.3 From f35cee4adb54af59129193d528706b390befb5f4 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Fri, 22 Oct 2021 08:19:48 -0400 Subject: KVM: x86: de-underscorify __emulator_pio_in Now all callers except emulator_pio_in_emulated are using __emulator_pio_in/complete_emulator_pio_in explicitly. Move the "either copy the result or attempt PIO" logic in emulator_pio_in_emulated, and rename __emulator_pio_in to just emulator_pio_in. Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 24 +++++++++--------------- 1 file changed, 9 insertions(+), 15 deletions(-) (limited to 'arch/x86/kvm/x86.c') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index b824ffc63b17..1a017a5a680b 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -7626,8 +7626,8 @@ userspace_io: return 0; } -static int __emulator_pio_in(struct kvm_vcpu *vcpu, int size, - unsigned short port, void *val, unsigned int count) +static int emulator_pio_in(struct kvm_vcpu *vcpu, int size, + unsigned short port, void *val, unsigned int count) { int r = emulator_pio_in_out(vcpu, size, port, val, count, true); if (r) @@ -7645,9 +7645,11 @@ static void complete_emulator_pio_in(struct kvm_vcpu *vcpu, void *val) vcpu->arch.pio.count = 0; } -static int emulator_pio_in(struct kvm_vcpu *vcpu, int size, - unsigned short port, void *val, unsigned int count) +static int emulator_pio_in_emulated(struct x86_emulate_ctxt *ctxt, + int size, unsigned short port, void *val, + unsigned int count) { + struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); if (vcpu->arch.pio.count) { /* * Complete a previous iteration that required userspace I/O. @@ -7660,15 +7662,7 @@ static int emulator_pio_in(struct kvm_vcpu *vcpu, int size, return 1; } - return __emulator_pio_in(vcpu, size, port, val, count); -} - -static int emulator_pio_in_emulated(struct x86_emulate_ctxt *ctxt, - int size, unsigned short port, void *val, - unsigned int count) -{ - return emulator_pio_in(emul_to_vcpu(ctxt), size, port, val, count); - + return emulator_pio_in(vcpu, size, port, val, count); } static int emulator_pio_out(struct kvm_vcpu *vcpu, int size, @@ -8886,7 +8880,7 @@ static int kvm_fast_pio_in(struct kvm_vcpu *vcpu, int size, /* For size less than 4 we merge, else we zero extend */ val = (size < 4) ? kvm_rax_read(vcpu) : 0; - ret = __emulator_pio_in(vcpu, size, port, &val, 1); + ret = emulator_pio_in(vcpu, size, port, &val, 1); if (ret) { kvm_rax_write(vcpu, val); return ret; @@ -13277,7 +13271,7 @@ static int kvm_sev_es_ins(struct kvm_vcpu *vcpu, unsigned int size, for (;;) { unsigned int count = min_t(unsigned int, PAGE_SIZE / size, vcpu->arch.sev_pio_count); - if (!__emulator_pio_in(vcpu, size, port, vcpu->arch.sev_pio_data, count)) + if (!emulator_pio_in(vcpu, size, port, vcpu->arch.sev_pio_data, count)) break; /* Emulation done by the kernel. */ -- cgit v1.2.3 From db209369d48eab1e2c724d90c606752d398ae334 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Fri, 22 Oct 2021 08:47:56 -0400 Subject: KVM: SEV-ES: reuse advance_sev_es_emulated_ins for OUT too complete_emulator_pio_in() only has to be called by complete_sev_es_emulated_ins() now; therefore, all that the function does now is adjust sev_pio_count and sev_pio_data. Which is the same for both IN and OUT. No functional change intended. Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) (limited to 'arch/x86/kvm/x86.c') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 1a017a5a680b..567d13405445 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -13206,6 +13206,12 @@ int kvm_sev_es_mmio_read(struct kvm_vcpu *vcpu, gpa_t gpa, unsigned int bytes, } EXPORT_SYMBOL_GPL(kvm_sev_es_mmio_read); +static void advance_sev_es_emulated_pio(struct kvm_vcpu *vcpu, unsigned count, int size) +{ + vcpu->arch.sev_pio_count -= count; + vcpu->arch.sev_pio_data += count * size; +} + static int kvm_sev_es_outs(struct kvm_vcpu *vcpu, unsigned int size, unsigned int port); @@ -13229,8 +13235,7 @@ static int kvm_sev_es_outs(struct kvm_vcpu *vcpu, unsigned int size, int ret = emulator_pio_out(vcpu, size, port, vcpu->arch.sev_pio_data, count); /* memcpy done already by emulator_pio_out. */ - vcpu->arch.sev_pio_count -= count; - vcpu->arch.sev_pio_data += count * size; + advance_sev_es_emulated_pio(vcpu, count, size); if (!ret) break; @@ -13246,12 +13251,6 @@ static int kvm_sev_es_outs(struct kvm_vcpu *vcpu, unsigned int size, static int kvm_sev_es_ins(struct kvm_vcpu *vcpu, unsigned int size, unsigned int port); -static void advance_sev_es_emulated_ins(struct kvm_vcpu *vcpu, unsigned count, int size) -{ - vcpu->arch.sev_pio_count -= count; - vcpu->arch.sev_pio_data += count * size; -} - static int complete_sev_es_emulated_ins(struct kvm_vcpu *vcpu) { unsigned count = vcpu->arch.pio.count; @@ -13259,7 +13258,7 @@ static int complete_sev_es_emulated_ins(struct kvm_vcpu *vcpu) int port = vcpu->arch.pio.port; complete_emulator_pio_in(vcpu, vcpu->arch.sev_pio_data); - advance_sev_es_emulated_ins(vcpu, count, size); + advance_sev_es_emulated_pio(vcpu, count, size); if (vcpu->arch.sev_pio_count) return kvm_sev_es_ins(vcpu, size, port); return 1; @@ -13275,7 +13274,7 @@ static int kvm_sev_es_ins(struct kvm_vcpu *vcpu, unsigned int size, break; /* Emulation done by the kernel. */ - advance_sev_es_emulated_ins(vcpu, count, size); + advance_sev_es_emulated_pio(vcpu, count, size); if (!vcpu->arch.sev_pio_count) return 1; } -- cgit v1.2.3 From 2368048bf5c2ec4b604ac3431564071e89a0bc71 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 12 May 2022 22:27:14 +0000 Subject: KVM: x86: Signal #GP, not -EPERM, on bad WRMSR(MCi_CTL/STATUS) Return '1', not '-1', when handling an illegal WRMSR to a MCi_CTL or MCi_STATUS MSR. The behavior of "all zeros' or "all ones" for CTL MSRs is architectural, as is the "only zeros" behavior for STATUS MSRs. I.e. the intent is to inject a #GP, not exit to userspace due to an unhandled emulation case. Returning '-1' gets interpreted as -EPERM up the stack and effecitvely kills the guest. Fixes: 890ca9aefa78 ("KVM: Add MCE support") Fixes: 9ffd986c6e4e ("KVM: X86: #GP when guest attempts to write MCi_STATUS register w/o 0") Cc: stable@vger.kernel.org Signed-off-by: Sean Christopherson Reviewed-by: Jim Mattson Link: https://lore.kernel.org/r/20220512222716.4112548-2-seanjc@google.com --- arch/x86/kvm/x86.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86/kvm/x86.c') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 7ce0c6fe166d..891ca973c838 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3239,13 +3239,13 @@ static int set_msr_mce(struct kvm_vcpu *vcpu, struct msr_data *msr_info) */ if ((offset & 0x3) == 0 && data != 0 && (data | (1 << 10) | 1) != ~(u64)0) - return -1; + return 1; /* MCi_STATUS */ if (!msr_info->host_initiated && (offset & 0x3) == 1 && data != 0) { if (!can_set_mci_status(vcpu)) - return -1; + return 1; } vcpu->arch.mce_banks[offset] = data; -- cgit v1.2.3 From f5223a332f3647a0e3725e9b4a102e9659c84ce4 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 12 May 2022 22:27:15 +0000 Subject: KVM: x86: Use explicit case-statements for MCx banks in {g,s}et_msr_mce() Use an explicit case statement to grab the full range of MCx bank MSRs in {g,s}et_msr_mce(), and manually check only the "end" (the number of banks configured by userspace may be less than the max). The "default" trick works, but is a bit odd now, and will be quite odd if/when support for accessing MCx_CTL2 MSRs is added, which has near identical logic. Hoist "offset" to function scope so as to avoid curly braces for the case statement, and because MCx_CTL2 support will need the same variables. Opportunstically clean up the comment about allowing bit 10 to be cleared from bank 4. No functional change intended. Cc: Jue Wang Signed-off-by: Sean Christopherson Reviewed-by: Jim Mattson Link: https://lore.kernel.org/r/20220512222716.4112548-3-seanjc@google.com --- arch/x86/kvm/x86.c | 75 ++++++++++++++++++++++++++++-------------------------- 1 file changed, 39 insertions(+), 36 deletions(-) (limited to 'arch/x86/kvm/x86.c') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 891ca973c838..c18d57027838 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3209,6 +3209,7 @@ static int set_msr_mce(struct kvm_vcpu *vcpu, struct msr_data *msr_info) unsigned bank_num = mcg_cap & 0xff; u32 msr = msr_info->index; u64 data = msr_info->data; + u32 offset, last_msr; switch (msr) { case MSR_IA32_MCG_STATUS: @@ -3222,35 +3223,36 @@ static int set_msr_mce(struct kvm_vcpu *vcpu, struct msr_data *msr_info) return 1; vcpu->arch.mcg_ctl = data; break; - default: - if (msr >= MSR_IA32_MC0_CTL && - msr < MSR_IA32_MCx_CTL(bank_num)) { - u32 offset = array_index_nospec( - msr - MSR_IA32_MC0_CTL, - MSR_IA32_MCx_CTL(bank_num) - MSR_IA32_MC0_CTL); - - /* only 0 or all 1s can be written to IA32_MCi_CTL - * some Linux kernels though clear bit 10 in bank 4 to - * workaround a BIOS/GART TBL issue on AMD K8s, ignore - * this to avoid an uncatched #GP in the guest. - * - * UNIXWARE clears bit 0 of MC1_CTL to ignore - * correctable, single-bit ECC data errors. - */ - if ((offset & 0x3) == 0 && - data != 0 && (data | (1 << 10) | 1) != ~(u64)0) - return 1; + case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1: + last_msr = MSR_IA32_MCx_CTL(bank_num) - 1; + if (msr > last_msr) + return 1; - /* MCi_STATUS */ - if (!msr_info->host_initiated && - (offset & 0x3) == 1 && data != 0) { - if (!can_set_mci_status(vcpu)) - return 1; - } + offset = array_index_nospec(msr - MSR_IA32_MC0_CTL, + last_msr + 1 - MSR_IA32_MC0_CTL); - vcpu->arch.mce_banks[offset] = data; - break; - } + /* + * Only 0 or all 1s can be written to IA32_MCi_CTL, all other + * values are architecturally undefined. But, some Linux + * kernels clear bit 10 in bank 4 to workaround a BIOS/GART TLB + * issue on AMD K8s, allow bit 10 to be clear when setting all + * other bits in order to avoid an uncaught #GP in the guest. + * + * UNIXWARE clears bit 0 of MC1_CTL to ignore correctable, + * single-bit ECC data errors. + */ + if ((offset & 0x3) == 0 && + data != 0 && (data | (1 << 10) | 1) != ~(u64)0) + return 1; + + /* MCi_STATUS */ + if (!msr_info->host_initiated && (offset & 0x3) == 1 && + data != 0 && !can_set_mci_status(vcpu)) + return 1; + + vcpu->arch.mce_banks[offset] = data; + break; + default: return 1; } return 0; @@ -3819,6 +3821,7 @@ static int get_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, bool host) u64 data; u64 mcg_cap = vcpu->arch.mcg_cap; unsigned bank_num = mcg_cap & 0xff; + u32 offset, last_msr; switch (msr) { case MSR_IA32_P5_MC_ADDR: @@ -3836,16 +3839,16 @@ static int get_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, bool host) case MSR_IA32_MCG_STATUS: data = vcpu->arch.mcg_status; break; - default: - if (msr >= MSR_IA32_MC0_CTL && - msr < MSR_IA32_MCx_CTL(bank_num)) { - u32 offset = array_index_nospec( - msr - MSR_IA32_MC0_CTL, - MSR_IA32_MCx_CTL(bank_num) - MSR_IA32_MC0_CTL); + case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1: + last_msr = MSR_IA32_MCx_CTL(bank_num) - 1; + if (msr > last_msr) + return 1; - data = vcpu->arch.mce_banks[offset]; - break; - } + offset = array_index_nospec(msr - MSR_IA32_MC0_CTL, + last_msr + 1 - MSR_IA32_MC0_CTL); + data = vcpu->arch.mce_banks[offset]; + break; + default: return 1; } *pdata = data; -- cgit v1.2.3 From 54ad60ba9d2673293c72a7c1c4f092622a1f8789 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 12 May 2022 22:27:16 +0000 Subject: KVM: x86: Add helpers to identify CTL and STATUS MCi MSRs Add helpers to identify CTL (control) and STATUS MCi MSR types instead of open coding the checks using the offset. Using the offset is perfectly safe, but unintuitive, as understanding what the code does requires knowing that the offset calcuation will not affect the lower three bits. Opportunistically comment the STATUS logic to save readers a trip to Intel's SDM or AMD's APM to understand the "data != 0" check. No functional change intended. Signed-off-by: Sean Christopherson Reviewed-by: Jim Mattson Link: https://lore.kernel.org/r/20220512222716.4112548-4-seanjc@google.com --- arch/x86/kvm/x86.c | 25 +++++++++++++++++++------ 1 file changed, 19 insertions(+), 6 deletions(-) (limited to 'arch/x86/kvm/x86.c') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index c18d57027838..3b7c9c1b4540 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3191,6 +3191,16 @@ static void kvmclock_sync_fn(struct work_struct *work) KVMCLOCK_SYNC_PERIOD); } +/* These helpers are safe iff @msr is known to be an MCx bank MSR. */ +static bool is_mci_control_msr(u32 msr) +{ + return (msr & 3) == 0; +} +static bool is_mci_status_msr(u32 msr) +{ + return (msr & 3) == 1; +} + /* * On AMD, HWCR[McStatusWrEn] controls whether setting MCi_STATUS results in #GP. */ @@ -3228,9 +3238,6 @@ static int set_msr_mce(struct kvm_vcpu *vcpu, struct msr_data *msr_info) if (msr > last_msr) return 1; - offset = array_index_nospec(msr - MSR_IA32_MC0_CTL, - last_msr + 1 - MSR_IA32_MC0_CTL); - /* * Only 0 or all 1s can be written to IA32_MCi_CTL, all other * values are architecturally undefined. But, some Linux @@ -3241,15 +3248,21 @@ static int set_msr_mce(struct kvm_vcpu *vcpu, struct msr_data *msr_info) * UNIXWARE clears bit 0 of MC1_CTL to ignore correctable, * single-bit ECC data errors. */ - if ((offset & 0x3) == 0 && + if (is_mci_control_msr(msr) && data != 0 && (data | (1 << 10) | 1) != ~(u64)0) return 1; - /* MCi_STATUS */ - if (!msr_info->host_initiated && (offset & 0x3) == 1 && + /* + * All CPUs allow writing 0 to MCi_STATUS MSRs to clear the MSR. + * AMD-based CPUs allow non-zero values, but if and only if + * HWCR[McStatusWrEn] is set. + */ + if (!msr_info->host_initiated && is_mci_status_msr(msr) && data != 0 && !can_set_mci_status(vcpu)) return 1; + offset = array_index_nospec(msr - MSR_IA32_MC0_CTL, + last_msr + 1 - MSR_IA32_MC0_CTL); vcpu->arch.mce_banks[offset] = data; break; default: -- cgit v1.2.3 From f83894b24c2a96fcecdff4858755aa79eb756465 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 8 Jul 2022 15:48:10 -0700 Subject: KVM: x86: Fix handling of APIC LVT updates when userspace changes MCG_CAP Add a helper to update KVM's in-kernel local APIC in response to MCG_CAP being changed by userspace to fix multiple bugs. First and foremost, KVM needs to check that there's an in-kernel APIC prior to dereferencing vcpu->arch.apic. Beyond that, any "new" LVT entries need to be masked, and the APIC version register needs to be updated as it reports out the number of LVT entries. Fixes: 4b903561ec49 ("KVM: x86: Add Corrected Machine Check Interrupt (CMCI) emulation to lapic.") Reported-by: syzbot+8cdad6430c24f396f158@syzkaller.appspotmail.com Cc: Siddh Raman Pant Cc: Jue Wang Signed-off-by: Sean Christopherson --- arch/x86/kvm/x86.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86/kvm/x86.c') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index fb37d11dec2d..801c3cfd3db5 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4893,8 +4893,8 @@ static int kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu *vcpu, if (mcg_cap & MCG_CMCI_P) vcpu->arch.mci_ctl2_banks[bank] = 0; } - vcpu->arch.apic->nr_lvt_entries = - KVM_APIC_MAX_NR_LVT_ENTRIES - !(mcg_cap & MCG_CMCI_P); + + kvm_apic_after_set_mcg_cap(vcpu); static_call(kvm_x86_setup_mce)(vcpu); out: -- cgit v1.2.3 From 159e037d2e36d93a7b066228c6543537c25235c8 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Fri, 8 Jul 2022 14:51:47 +0200 Subject: KVM: x86: Fully initialize 'struct kvm_lapic_irq' in kvm_pv_kick_cpu_op() 'vector' and 'trig_mode' fields of 'struct kvm_lapic_irq' are left uninitialized in kvm_pv_kick_cpu_op(). While these fields are normally not needed for APIC_DM_REMRD, they're still referenced by __apic_accept_irq() for trace_kvm_apic_accept_irq(). Fully initialize the structure to avoid consuming random stack memory. Fixes: a183b638b61c ("KVM: x86: make apic_accept_irq tracepoint more generic") Reported-by: syzbot+d6caa905917d353f0d07@syzkaller.appspotmail.com Signed-off-by: Vitaly Kuznetsov Reviewed-by: Sean Christopherson Link: https://lore.kernel.org/r/20220708125147.593975-1-vkuznets@redhat.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/x86.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) (limited to 'arch/x86/kvm/x86.c') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 801c3cfd3db5..759bcc0a3300 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -9340,15 +9340,17 @@ static int kvm_pv_clock_pairing(struct kvm_vcpu *vcpu, gpa_t paddr, */ static void kvm_pv_kick_cpu_op(struct kvm *kvm, int apicid) { - struct kvm_lapic_irq lapic_irq; - - lapic_irq.shorthand = APIC_DEST_NOSHORT; - lapic_irq.dest_mode = APIC_DEST_PHYSICAL; - lapic_irq.level = 0; - lapic_irq.dest_id = apicid; - lapic_irq.msi_redir_hint = false; + /* + * All other fields are unused for APIC_DM_REMRD, but may be consumed by + * common code, e.g. for tracing. Defer initialization to the compiler. + */ + struct kvm_lapic_irq lapic_irq = { + .delivery_mode = APIC_DM_REMRD, + .dest_mode = APIC_DEST_PHYSICAL, + .shorthand = APIC_DEST_NOSHORT, + .dest_id = apicid, + }; - lapic_irq.delivery_mode = APIC_DM_REMRD; kvm_irq_delivery_to_apic(kvm, NULL, &lapic_irq, NULL); } -- cgit v1.2.3 From 6e1d2a3f25d518cc3d9703115c6fbec704a6c5bb Mon Sep 17 00:00:00 2001 From: Hou Wenlong Date: Fri, 1 Jul 2022 17:24:13 +0800 Subject: KVM: x86/mmu: Replace UNMAPPED_GVA with INVALID_GPA for gva_to_gpa() The result of gva_to_gpa() is physical address not virtual address, it is odd that UNMAPPED_GVA macro is used as the result for physical address. Replace UNMAPPED_GVA with INVALID_GPA and drop UNMAPPED_GVA macro. No functional change intended. Signed-off-by: Hou Wenlong Reviewed-by: Sean Christopherson Link: https://lore.kernel.org/r/6104978956449467d3c68f1ad7f2c2f6d771d0ee.1656667239.git.houwenlong.hwl@antgroup.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/x86.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'arch/x86/kvm/x86.c') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 759bcc0a3300..67dcaa670874 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -849,7 +849,7 @@ int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3) */ real_gpa = kvm_translate_gpa(vcpu, mmu, gfn_to_gpa(pdpt_gfn), PFERR_USER_MASK | PFERR_WRITE_MASK, NULL); - if (real_gpa == UNMAPPED_GVA) + if (real_gpa == INVALID_GPA) return 0; /* Note the offset, PDPTRs are 32 byte aligned when using PAE paging. */ @@ -7072,7 +7072,7 @@ static int kvm_read_guest_virt_helper(gva_t addr, void *val, unsigned int bytes, unsigned toread = min(bytes, (unsigned)PAGE_SIZE - offset); int ret; - if (gpa == UNMAPPED_GVA) + if (gpa == INVALID_GPA) return X86EMUL_PROPAGATE_FAULT; ret = kvm_vcpu_read_guest_page(vcpu, gpa >> PAGE_SHIFT, data, offset, toread); @@ -7103,7 +7103,7 @@ static int kvm_fetch_guest_virt(struct x86_emulate_ctxt *ctxt, /* Inline kvm_read_guest_virt_helper for speed. */ gpa_t gpa = mmu->gva_to_gpa(vcpu, mmu, addr, access|PFERR_FETCH_MASK, exception); - if (unlikely(gpa == UNMAPPED_GVA)) + if (unlikely(gpa == INVALID_GPA)) return X86EMUL_PROPAGATE_FAULT; offset = addr & (PAGE_SIZE-1); @@ -7173,7 +7173,7 @@ static int kvm_write_guest_virt_helper(gva_t addr, void *val, unsigned int bytes unsigned towrite = min(bytes, (unsigned)PAGE_SIZE - offset); int ret; - if (gpa == UNMAPPED_GVA) + if (gpa == INVALID_GPA) return X86EMUL_PROPAGATE_FAULT; ret = kvm_vcpu_write_guest(vcpu, gpa, data, towrite); if (ret < 0) { @@ -7284,7 +7284,7 @@ static int vcpu_mmio_gva_to_gpa(struct kvm_vcpu *vcpu, unsigned long gva, *gpa = mmu->gva_to_gpa(vcpu, mmu, gva, access, exception); - if (*gpa == UNMAPPED_GVA) + if (*gpa == INVALID_GPA) return -1; return vcpu_is_mmio_gpa(vcpu, gva, *gpa, write); @@ -7521,7 +7521,7 @@ static int emulator_cmpxchg_emulated(struct x86_emulate_ctxt *ctxt, gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, NULL); - if (gpa == UNMAPPED_GVA || + if (gpa == INVALID_GPA || (gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE) goto emul_write; @@ -8338,7 +8338,7 @@ static bool reexecute_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, * If the mapping is invalid in guest, let cpu retry * it to generate fault. */ - if (gpa == UNMAPPED_GVA) + if (gpa == INVALID_GPA) return true; } @@ -11378,7 +11378,7 @@ int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu, gpa = kvm_mmu_gva_to_gpa_system(vcpu, vaddr, NULL); srcu_read_unlock(&vcpu->kvm->srcu, idx); tr->physical_address = gpa; - tr->valid = gpa != UNMAPPED_GVA; + tr->valid = gpa != INVALID_GPA; tr->writeable = 1; tr->usermode = 0; @@ -12983,7 +12983,7 @@ void kvm_fixup_and_inject_pf_error(struct kvm_vcpu *vcpu, gva_t gva, u16 error_c (PFERR_WRITE_MASK | PFERR_FETCH_MASK | PFERR_USER_MASK); if (!(error_code & PFERR_PRESENT_MASK) || - mmu->gva_to_gpa(vcpu, mmu, gva, access, &fault) != UNMAPPED_GVA) { + mmu->gva_to_gpa(vcpu, mmu, gva, access, &fault) != INVALID_GPA) { /* * If vcpu->arch.walk_mmu->gva_to_gpa succeeded, the page * tables probably do not match the TLB. Just proceed -- cgit v1.2.3 From 43bb9e000ea4c62154c01844771fea25b8b83520 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 11 Jul 2022 22:57:53 +0000 Subject: KVM: x86: Tweak name of MONITOR/MWAIT #UD quirk to make it #UD specific Add a "UD" clause to KVM_X86_QUIRK_MWAIT_NEVER_FAULTS to make it clear that the quirk only controls the #UD behavior of MONITOR/MWAIT. KVM doesn't currently enforce fault checks when MONITOR/MWAIT are supported, but that could change in the future. SVM also has a virtualization hole in that it checks all faults before intercepts, and so "never faults" is already a lie when running on SVM. Fixes: bfbcc81bb82c ("KVM: x86: Add a quirk for KVM's "MONITOR/MWAIT are NOPs!" behavior") Signed-off-by: Sean Christopherson Link: https://lore.kernel.org/r/20220711225753.1073989-4-seanjc@google.com --- arch/x86/kvm/x86.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kvm/x86.c') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 67dcaa670874..db46c060acb5 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2046,7 +2046,7 @@ EXPORT_SYMBOL_GPL(kvm_handle_invalid_op); static int kvm_emulate_monitor_mwait(struct kvm_vcpu *vcpu, const char *insn) { - if (!kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_MWAIT_NEVER_FAULTS) && + if (!kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_MWAIT_NEVER_UD_FAULTS) && !guest_cpuid_has(vcpu, X86_FEATURE_MWAIT)) return kvm_handle_invalid_op(vcpu); -- cgit v1.2.3 From 0bc273266112937cd6560f7d447b6aa9cfd9134a Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 11 Jul 2022 23:27:50 +0000 Subject: KVM: x86: WARN only once if KVM leaves a dangling userspace I/O request Change a WARN_ON() to separate WARN_ON_ONCE() if KVM has an outstanding PIO or MMIO request without an associated callback, i.e. if KVM queued a userspace I/O exit but didn't actually exit to userspace before moving on to something else. Warning on every KVM_RUN risks spamming the kernel if KVM gets into a bad state. Opportunistically split the WARNs so that it's easier to triage failures when a WARN fires. Deliberately do not use KVM_BUG_ON(), i.e. don't kill the VM. While the WARN is all but guaranteed to fire if and only if there's a KVM bug, a dangling I/O request does not present a danger to KVM (that flag is truly truly consumed only in a single emulator path), and any such bug is unlikely to be fatal to the VM (KVM essentially failed to do something it shouldn't have tried to do in the first place). In other words, note the bug, but let the VM keep running. Signed-off-by: Sean Christopherson Reviewed-by: Maxim Levitsky Link: https://lore.kernel.org/r/20220711232750.1092012-4-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/x86.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'arch/x86/kvm/x86.c') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index db46c060acb5..0729e434c51b 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -10849,8 +10849,10 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) r = cui(vcpu); if (r <= 0) goto out; - } else - WARN_ON(vcpu->arch.pio.count || vcpu->mmio_needed); + } else { + WARN_ON_ONCE(vcpu->arch.pio.count); + WARN_ON_ONCE(vcpu->mmio_needed); + } if (kvm_run->immediate_exit) { r = -EINTR; -- cgit v1.2.3 From 277ad7d58611b455662f2e3f7bd24ce5bfeb2fdc Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 12 Jul 2022 02:06:45 +0200 Subject: KVM: x86: Add dedicated helper to get CPUID entry with significant index Add a second CPUID helper, kvm_find_cpuid_entry_index(), to handle KVM queries for CPUID leaves whose index _may_ be significant, and drop the index param from the existing kvm_find_cpuid_entry(). Add a WARN in the inner helper, cpuid_entry2_find(), to detect attempts to retrieve a CPUID entry whose index is significant without explicitly providing an index. Using an explicit magic number and letting callers omit the index avoids confusion by eliminating the myriad cases where KVM specifies '0' as a dummy value. Suggested-by: Paolo Bonzini Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kvm/x86.c') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 0729e434c51b..f389691d8c04 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -11735,7 +11735,7 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) * i.e. it's impossible for kvm_find_cpuid_entry() to find a valid entry * on RESET. But, go through the motions in case that's ever remedied. */ - cpuid_0x1 = kvm_find_cpuid_entry(vcpu, 1, 0); + cpuid_0x1 = kvm_find_cpuid_entry(vcpu, 1); kvm_rdx_write(vcpu, cpuid_0x1 ? cpuid_0x1->eax : 0x600); static_call(kvm_x86_vcpu_reset)(vcpu, init_event); -- cgit v1.2.3 From 94bda2f4cd868046094351b06b87d7d1053e990d Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 15 Jul 2022 23:00:13 +0000 Subject: KVM: x86: Reject loading KVM if host.PAT[0] != WB Reject KVM if entry '0' in the host's IA32_PAT MSR is not programmed to writeback (WB) memtype. KVM subtly relies on IA32_PAT entry '0' to be programmed to WB by leaving the PAT bits in shadow paging and NPT SPTEs as '0'. If something other than WB is in PAT[0], at _best_ guests will suffer very poor performance, and at worst KVM will crash the system by breaking cache-coherency expecations (e.g. using WC for guest memory). Signed-off-by: Sean Christopherson Reviewed-by: Maxim Levitsky Message-Id: <20220715230016.3762909-2-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'arch/x86/kvm/x86.c') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index f389691d8c04..12199c40f2bc 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -9141,6 +9141,7 @@ static struct notifier_block pvclock_gtod_notifier = { int kvm_arch_init(void *opaque) { struct kvm_x86_init_ops *ops = opaque; + u64 host_pat; int r; if (kvm_x86_ops.hardware_enable) { @@ -9179,6 +9180,20 @@ int kvm_arch_init(void *opaque) goto out; } + /* + * KVM assumes that PAT entry '0' encodes WB memtype and simply zeroes + * the PAT bits in SPTEs. Bail if PAT[0] is programmed to something + * other than WB. Note, EPT doesn't utilize the PAT, but don't bother + * with an exception. PAT[0] is set to WB on RESET and also by the + * kernel, i.e. failure indicates a kernel bug or broken firmware. + */ + if (rdmsrl_safe(MSR_IA32_CR_PAT, &host_pat) || + (host_pat & GENMASK(2, 0)) != 6) { + pr_err("kvm: host PAT[0] is not WB\n"); + r = -EIO; + goto out; + } + r = -ENOMEM; x86_emulator_cache = kvm_alloc_emulator_cache(); -- cgit v1.2.3 From 82ffad2ddf5d7b5b9c14c705fed4a7d5f2ec4c38 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 15 Jul 2022 23:00:14 +0000 Subject: KVM: x86: Drop unnecessary goto+label in kvm_arch_init() Return directly if kvm_arch_init() detects an error before doing any real work, jumping through a label obfuscates what's happening and carries the unnecessary risk of leaving 'r' uninitialized. No functional change intended. Signed-off-by: Sean Christopherson Reviewed-by: Maxim Levitsky Message-Id: <20220715230016.3762909-3-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 24 ++++++++---------------- 1 file changed, 8 insertions(+), 16 deletions(-) (limited to 'arch/x86/kvm/x86.c') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 12199c40f2bc..41aa3137665c 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -9146,21 +9146,18 @@ int kvm_arch_init(void *opaque) if (kvm_x86_ops.hardware_enable) { pr_err("kvm: already loaded vendor module '%s'\n", kvm_x86_ops.name); - r = -EEXIST; - goto out; + return -EEXIST; } if (!ops->cpu_has_kvm_support()) { pr_err_ratelimited("kvm: no hardware support for '%s'\n", ops->runtime_ops->name); - r = -EOPNOTSUPP; - goto out; + return -EOPNOTSUPP; } if (ops->disabled_by_bios()) { pr_err_ratelimited("kvm: support for '%s' disabled by bios\n", ops->runtime_ops->name); - r = -EOPNOTSUPP; - goto out; + return -EOPNOTSUPP; } /* @@ -9170,14 +9167,12 @@ int kvm_arch_init(void *opaque) */ if (!boot_cpu_has(X86_FEATURE_FPU) || !boot_cpu_has(X86_FEATURE_FXSR)) { printk(KERN_ERR "kvm: inadequate fpu\n"); - r = -EOPNOTSUPP; - goto out; + return -EOPNOTSUPP; } if (IS_ENABLED(CONFIG_PREEMPT_RT) && !boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) { pr_err("RT requires X86_FEATURE_CONSTANT_TSC\n"); - r = -EOPNOTSUPP; - goto out; + return -EOPNOTSUPP; } /* @@ -9190,21 +9185,19 @@ int kvm_arch_init(void *opaque) if (rdmsrl_safe(MSR_IA32_CR_PAT, &host_pat) || (host_pat & GENMASK(2, 0)) != 6) { pr_err("kvm: host PAT[0] is not WB\n"); - r = -EIO; - goto out; + return -EIO; } - r = -ENOMEM; - x86_emulator_cache = kvm_alloc_emulator_cache(); if (!x86_emulator_cache) { pr_err("kvm: failed to allocate cache for x86 emulator\n"); - goto out; + return -ENOMEM; } user_return_msrs = alloc_percpu(struct kvm_user_return_msrs); if (!user_return_msrs) { printk(KERN_ERR "kvm: failed to allocate percpu kvm_user_return_msrs\n"); + r = -ENOMEM; goto out_free_x86_emulator_cache; } kvm_nr_uret_msrs = 0; @@ -9235,7 +9228,6 @@ out_free_percpu: free_percpu(user_return_msrs); out_free_x86_emulator_cache: kmem_cache_destroy(x86_emulator_cache); -out: return r; } -- cgit v1.2.3 From c33f6f2228fe8517e38941a508e9f905f99ecba9 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 7 Jun 2022 21:35:50 +0000 Subject: KVM: x86: Split kvm_is_valid_cr4() and export only the non-vendor bits Split the common x86 parts of kvm_is_valid_cr4(), i.e. the reserved bits checks, into a separate helper, __kvm_is_valid_cr4(), and export only the inner helper to vendor code in order to prevent nested VMX from calling back into vmx_is_valid_cr4() via kvm_is_valid_cr4(). On SVM, this is a nop as SVM doesn't place any additional restrictions on CR4. On VMX, this is also currently a nop, but only because nested VMX is missing checks on reserved CR4 bits for nested VM-Enter. That bug will be fixed in a future patch, and could simply use kvm_is_valid_cr4() as-is, but nVMX has _another_ bug where VMXON emulation doesn't enforce VMX's restrictions on CR0/CR4. The cleanest and most intuitive way to fix the VMXON bug is to use nested_host_cr{0,4}_valid(). If the CR4 variant routes through kvm_is_valid_cr4(), using nested_host_cr4_valid() won't do the right thing for the VMXON case as vmx_is_valid_cr4() enforces VMX's restrictions if and only if the vCPU is post-VMXON. Cc: stable@vger.kernel.org Signed-off-by: Sean Christopherson Message-Id: <20220607213604.3346000-2-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) (limited to 'arch/x86/kvm/x86.c') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 41aa3137665c..5366f884e9a7 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1081,7 +1081,7 @@ int kvm_emulate_xsetbv(struct kvm_vcpu *vcpu) } EXPORT_SYMBOL_GPL(kvm_emulate_xsetbv); -bool kvm_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) +bool __kvm_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) { if (cr4 & cr4_reserved_bits) return false; @@ -1089,9 +1089,15 @@ bool kvm_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) if (cr4 & vcpu->arch.cr4_guest_rsvd_bits) return false; - return static_call(kvm_x86_is_valid_cr4)(vcpu, cr4); + return true; +} +EXPORT_SYMBOL_GPL(__kvm_is_valid_cr4); + +static bool kvm_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) +{ + return __kvm_is_valid_cr4(vcpu, cr4) && + static_call(kvm_x86_is_valid_cr4)(vcpu, cr4); } -EXPORT_SYMBOL_GPL(kvm_is_valid_cr4); void kvm_post_set_cr4(struct kvm_vcpu *vcpu, unsigned long old_cr4, unsigned long cr4) { -- cgit v1.2.3