From 07ea4ab1f9b83953ff5c3f6ccfb84d581bfe0046 Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Tue, 22 Mar 2022 08:29:06 -0700 Subject: KVM: x86: Fix clang -Wimplicit-fallthrough in do_host_cpuid() Clang warns: arch/x86/kvm/cpuid.c:739:2: error: unannotated fall-through between switch labels [-Werror,-Wimplicit-fallthrough] default: ^ arch/x86/kvm/cpuid.c:739:2: note: insert 'break;' to avoid fall-through default: ^ break; 1 error generated. Clang is a little more pedantic than GCC, which does not warn when falling through to a case that is just break or return. Clang's version is more in line with the kernel's own stance in deprecated.rst, which states that all switch/case blocks must end in either break, fallthrough, continue, goto, or return. Add the missing break to silence the warning. Fixes: f144c49e8c39 ("KVM: x86: synthesize CPUID leaf 0x80000021h if useful") Reported-by: kernel test robot Signed-off-by: Nathan Chancellor Message-Id: <20220322152906.112164-1-nathan@kernel.org> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/cpuid.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch') diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 58b0b4e0263c..a3c87d2882ad 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -735,6 +735,7 @@ static struct kvm_cpuid_entry2 *do_host_cpuid(struct kvm_cpuid_array *array, if (function > READ_ONCE(max_cpuid_80000000)) return entry; } + break; default: break; -- cgit v1.2.3 From 7ec37d1cbe17d8189d9562178d8b29167fe1c31a Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Fri, 25 Mar 2022 14:21:38 +0100 Subject: KVM: x86: Check lapic_in_kernel() before attempting to set a SynIC irq When KVM_CAP_HYPERV_SYNIC{,2} is activated, KVM already checks for irqchip_in_kernel() so normally SynIC irqs should never be set. It is, however, possible for a misbehaving VMM to write to SYNIC/STIMER MSRs causing erroneous behavior. The immediate issue being fixed is that kvm_irq_delivery_to_apic() (kvm_irq_delivery_to_apic_fast()) crashes when called with 'irq.shorthand = APIC_DEST_SELF' and 'src == NULL'. Signed-off-by: Vitaly Kuznetsov Message-Id: <20220325132140.25650-2-vkuznets@redhat.com> Cc: stable@vger.kernel.org Signed-off-by: Paolo Bonzini --- arch/x86/kvm/hyperv.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch') diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index a32f54ab84a2..f715b5a2b0e4 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -449,6 +449,9 @@ static int synic_set_irq(struct kvm_vcpu_hv_synic *synic, u32 sint) struct kvm_lapic_irq irq; int ret, vector; + if (KVM_BUG_ON(!lapic_in_kernel(vcpu), vcpu->kvm)) + return -EINVAL; + if (sint >= ARRAY_SIZE(synic->sint)) return -EINVAL; -- cgit v1.2.3 From 00b5f37189d24ac3ed46cb7f11742094778c46ce Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Fri, 25 Mar 2022 14:21:39 +0100 Subject: KVM: x86: Avoid theoretical NULL pointer dereference in kvm_irq_delivery_to_apic_fast() When kvm_irq_delivery_to_apic_fast() is called with APIC_DEST_SELF shorthand, 'src' must not be NULL. Crash the VM with KVM_BUG_ON() instead of crashing the host. 
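The guard pattern shared by the two fixes above can be illustrated with a minimal user-space C sketch. The vm_bug_on() helper below is a hypothetical stand-in for the kernel's KVM_BUG_ON() (whose real implementation differs); the point is only the shape: check the invariant first, mark the VM bugged once, and bail out instead of dereferencing a NULL pointer.

	#include <stdbool.h>
	#include <stddef.h>
	#include <stdio.h>

	struct vm { bool bugged; };
	struct vcpu { int id; };

	/* Hypothetical stand-in for KVM_BUG_ON(): if cond holds, mark the VM
	 * as bugged (first offense only) and return true so the caller bails. */
	static bool vm_bug_on(bool cond, struct vm *vm)
	{
		if (cond && !vm->bugged) {
			vm->bugged = true;
			fprintf(stderr, "VM bugged by internal error\n");
		}
		return cond;
	}

	static int deliver_self_ipi(struct vm *vm, struct vcpu *src)
	{
		/* Bail before the NULL dereference instead of crashing the host. */
		if (vm_bug_on(src == NULL, vm))
			return -1;
		return src->id; /* safe: src is known non-NULL here */
	}

	int main(void)
	{
		struct vm vm = { .bugged = false };
		return deliver_self_ipi(&vm, NULL) < 0 ? 0 : 1;
	}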
Signed-off-by: Vitaly Kuznetsov Message-Id: <20220325132140.25650-3-vkuznets@redhat.com> Cc: stable@vger.kernel.org Signed-off-by: Paolo Bonzini --- arch/x86/kvm/lapic.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'arch') diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 80a2020c4db4..66b0eb0bda94 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -1024,6 +1024,10 @@ bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src, *r = -1; if (irq->shorthand == APIC_DEST_SELF) { + if (KVM_BUG_ON(!src, kvm)) { + *r = 0; + return true; + } *r = kvm_apic_set_irq(src->vcpu, irq, dest_map); return true; } -- cgit v1.2.3 From b1e34d325397a33d97d845e312d7cf2a8b646b44 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Fri, 25 Mar 2022 14:21:40 +0100 Subject: KVM: x86: Forbid VMM to set SYNIC/STIMER MSRs when SynIC wasn't activated Setting non-zero values to SYNIC/STIMER MSRs activates certain features; this should not happen when KVM_CAP_HYPERV_SYNIC{,2} was not activated. Note, it would've been better to forbid writing anything to SYNIC/STIMER MSRs, including zeroes; however, at least QEMU tries clearing HV_X64_MSR_STIMER0_CONFIG without SynIC. HV_X64_MSR_EOM MSR is somewhat 'special' as writing zero there triggers an action; this also should not happen when SynIC wasn't activated. Signed-off-by: Vitaly Kuznetsov Message-Id: <20220325132140.25650-4-vkuznets@redhat.com> Cc: stable@vger.kernel.org Signed-off-by: Paolo Bonzini --- arch/x86/kvm/hyperv.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index f715b5a2b0e4..4177c17a26bf 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -239,7 +239,7 @@ static int synic_set_msr(struct kvm_vcpu_hv_synic *synic, struct kvm_vcpu *vcpu = hv_synic_to_vcpu(synic); int ret; - if (!synic->active && !host) + if (!synic->active && (!host || data)) return 1; trace_kvm_hv_synic_set_msr(vcpu->vcpu_id, msr, data, host); @@ -285,6 +285,9 @@ static int synic_set_msr(struct kvm_vcpu_hv_synic *synic, case HV_X64_MSR_EOM: { int i; + if (!synic->active) + break; + for (i = 0; i < ARRAY_SIZE(synic->sint); i++) kvm_hv_notify_acked_sint(vcpu, i); break; @@ -664,7 +667,7 @@ static int stimer_set_config(struct kvm_vcpu_hv_stimer *stimer, u64 config, struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu); struct kvm_vcpu_hv_synic *synic = to_hv_synic(vcpu); - if (!synic->active && !host) + if (!synic->active && (!host || config)) return 1; if (unlikely(!host && hv_vcpu->enforce_cpuid && new_config.direct_mode && @@ -693,7 +696,7 @@ static int stimer_set_count(struct kvm_vcpu_hv_stimer *stimer, u64 count, struct kvm_vcpu *vcpu = hv_stimer_to_vcpu(stimer); struct kvm_vcpu_hv_synic *synic = to_hv_synic(vcpu); - if (!synic->active && !host) + if (!synic->active && (!host || count)) return 1; trace_kvm_hv_stimer_set_count(hv_stimer_to_vcpu(stimer)->vcpu_id, -- cgit v1.2.3 From a1a39128faabc9883a7f9e3f8777b3fbd560fa5f Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Fri, 25 Mar 2022 12:42:52 -0400 Subject: KVM: MMU: propagate alloc_workqueue failure If kvm->arch.tdp_mmu_zap_wq cannot be created, the failure has to be propagated up to kvm_mmu_init_vm and kvm_arch_init_vm. kvm_arch_init_vm also has to undo all the initialization, so group all the MMU initialization code at the beginning and handle cleaning up of kvm_page_track_init.
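The cleanup ordering this commit describes is the standard goto-unwind idiom; as a hedged, self-contained sketch (init_a()/init_b() are invented stages standing in for kvm_page_track_init() and kvm_mmu_init_vm(), not the actual KVM functions):

	#include <errno.h>
	#include <stdio.h>

	static int init_a(void)     { return 0; }          /* e.g. page-track init */
	static void cleanup_a(void) { puts("undo a"); }    /* its cleanup */
	static int init_b(void)     { return -ENOMEM; }    /* e.g. MMU init failing */

	static int arch_init(void)
	{
		int ret;

		ret = init_a();
		if (ret)
			goto out;

		ret = init_b();
		if (ret)
			goto out_a;	/* propagate failure, undoing stage a */

		return 0;

	out_a:
		cleanup_a();
	out:
		return ret;
	}

	int main(void)
	{
		printf("arch_init() = %d\n", arch_init());
		return 0;
	}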
Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 2 +- arch/x86/kvm/mmu/mmu.c | 11 +++++++++-- arch/x86/kvm/mmu/tdp_mmu.c | 17 ++++++++++------- arch/x86/kvm/mmu/tdp_mmu.h | 4 ++-- arch/x86/kvm/x86.c | 15 ++++++++++----- 5 files changed, 32 insertions(+), 17 deletions(-) (limited to 'arch') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 0ddc2e67a731..469c7702fad9 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1584,7 +1584,7 @@ void kvm_mmu_module_exit(void); void kvm_mmu_destroy(struct kvm_vcpu *vcpu); int kvm_mmu_create(struct kvm_vcpu *vcpu); -void kvm_mmu_init_vm(struct kvm *kvm); +int kvm_mmu_init_vm(struct kvm *kvm); void kvm_mmu_uninit_vm(struct kvm *kvm); void kvm_mmu_after_set_cpuid(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 51671cb34fb6..857ba93b5c92 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -5768,17 +5768,24 @@ static void kvm_mmu_invalidate_zap_pages_in_memslot(struct kvm *kvm, kvm_mmu_zap_all_fast(kvm); } -void kvm_mmu_init_vm(struct kvm *kvm) +int kvm_mmu_init_vm(struct kvm *kvm) { struct kvm_page_track_notifier_node *node = &kvm->arch.mmu_sp_tracker; + int r; + INIT_LIST_HEAD(&kvm->arch.active_mmu_pages); + INIT_LIST_HEAD(&kvm->arch.zapped_obsolete_pages); + INIT_LIST_HEAD(&kvm->arch.lpage_disallowed_mmu_pages); spin_lock_init(&kvm->arch.mmu_unsync_pages_lock); - kvm_mmu_init_tdp_mmu(kvm); + r = kvm_mmu_init_tdp_mmu(kvm); + if (r < 0) + return r; node->track_write = kvm_mmu_pte_write; node->track_flush_slot = kvm_mmu_invalidate_zap_pages_in_memslot; kvm_page_track_register_notifier(kvm, node); + return 0; } void kvm_mmu_uninit_vm(struct kvm *kvm) diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index e7e7876251b3..4be517a9f22f 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -14,21 +14,24 @@ static bool __read_mostly tdp_mmu_enabled = true; module_param_named(tdp_mmu, tdp_mmu_enabled, bool, 0644); /* Initializes the TDP MMU for the VM, if enabled. */ -bool kvm_mmu_init_tdp_mmu(struct kvm *kvm) +int kvm_mmu_init_tdp_mmu(struct kvm *kvm) { + struct workqueue_struct *wq; + if (!tdp_enabled || !READ_ONCE(tdp_mmu_enabled)) - return false; + return 0; + + wq = alloc_workqueue("kvm", WQ_UNBOUND|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 0); + if (!wq) + return -ENOMEM; /* This should not be changed for the lifetime of the VM. */ kvm->arch.tdp_mmu_enabled = true; - INIT_LIST_HEAD(&kvm->arch.tdp_mmu_roots); spin_lock_init(&kvm->arch.tdp_mmu_pages_lock); INIT_LIST_HEAD(&kvm->arch.tdp_mmu_pages); - kvm->arch.tdp_mmu_zap_wq = - alloc_workqueue("kvm", WQ_UNBOUND|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 0); - - return true; + kvm->arch.tdp_mmu_zap_wq = wq; + return 1; } /* Arbitrarily returns true so that this may be used in if statements. 
*/ diff --git a/arch/x86/kvm/mmu/tdp_mmu.h b/arch/x86/kvm/mmu/tdp_mmu.h index 5e5ef2576c81..647926541e38 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.h +++ b/arch/x86/kvm/mmu/tdp_mmu.h @@ -72,7 +72,7 @@ u64 *kvm_tdp_mmu_fast_pf_get_last_sptep(struct kvm_vcpu *vcpu, u64 addr, u64 *spte); #ifdef CONFIG_X86_64 -bool kvm_mmu_init_tdp_mmu(struct kvm *kvm); +int kvm_mmu_init_tdp_mmu(struct kvm *kvm); void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm); static inline bool is_tdp_mmu_page(struct kvm_mmu_page *sp) { return sp->tdp_mmu_page; } @@ -93,7 +93,7 @@ static inline bool is_tdp_mmu(struct kvm_mmu *mmu) return sp && is_tdp_mmu_page(sp) && sp->root_count; } #else -static inline bool kvm_mmu_init_tdp_mmu(struct kvm *kvm) { return false; } +static inline int kvm_mmu_init_tdp_mmu(struct kvm *kvm) { return 0; } static inline void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm) {} static inline bool is_tdp_mmu_page(struct kvm_mmu_page *sp) { return false; } static inline bool is_tdp_mmu(struct kvm_mmu *mmu) { return false; } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index fe2171b11441..89b6efb7f504 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -11629,12 +11629,13 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) ret = kvm_page_track_init(kvm); if (ret) - return ret; + goto out; + + ret = kvm_mmu_init_vm(kvm); + if (ret) + goto out_page_track; INIT_HLIST_HEAD(&kvm->arch.mask_notifier_list); - INIT_LIST_HEAD(&kvm->arch.active_mmu_pages); - INIT_LIST_HEAD(&kvm->arch.zapped_obsolete_pages); - INIT_LIST_HEAD(&kvm->arch.lpage_disallowed_mmu_pages); INIT_LIST_HEAD(&kvm->arch.assigned_dev_head); atomic_set(&kvm->arch.noncoherent_dma_count, 0); @@ -11666,10 +11667,14 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) kvm_apicv_init(kvm); kvm_hv_init_vm(kvm); - kvm_mmu_init_vm(kvm); kvm_xen_init_vm(kvm); return static_call(kvm_x86_vm_init)(kvm); + +out_page_track: + kvm_page_track_cleanup(kvm); +out: + return ret; } int kvm_arch_post_init_vm(struct kvm *kvm) -- cgit v1.2.3 From a80ced6ea514000d34bf1239d47553de0d1ee89e Mon Sep 17 00:00:00 2001 From: Yi Wang Date: Wed, 9 Mar 2022 19:30:25 +0800 Subject: KVM: SVM: fix panic on out-of-bounds guest IRQ As guest_irq is coming from KVM_IRQFD API call, it may trigger crash in svm_update_pi_irte() due to out-of-bounds: crash> bt PID: 22218 TASK: ffff951a6ad74980 CPU: 73 COMMAND: "vcpu8" #0 [ffffb1ba6707fa40] machine_kexec at ffffffff8565b397 #1 [ffffb1ba6707fa90] __crash_kexec at ffffffff85788a6d #2 [ffffb1ba6707fb58] crash_kexec at ffffffff8578995d #3 [ffffb1ba6707fb70] oops_end at ffffffff85623c0d #4 [ffffb1ba6707fb90] no_context at ffffffff856692c9 #5 [ffffb1ba6707fbf8] exc_page_fault at ffffffff85f95b51 #6 [ffffb1ba6707fc50] asm_exc_page_fault at ffffffff86000ace [exception RIP: svm_update_pi_irte+227] RIP: ffffffffc0761b53 RSP: ffffb1ba6707fd08 RFLAGS: 00010086 RAX: ffffb1ba6707fd78 RBX: ffffb1ba66d91000 RCX: 0000000000000001 RDX: 00003c803f63f1c0 RSI: 000000000000019a RDI: ffffb1ba66db2ab8 RBP: 000000000000019a R8: 0000000000000040 R9: ffff94ca41b82200 R10: ffffffffffffffcf R11: 0000000000000001 R12: 0000000000000001 R13: 0000000000000001 R14: ffffffffffffffcf R15: 000000000000005f ORIG_RAX: ffffffffffffffff CS: 0010 SS: 0018 #7 [ffffb1ba6707fdb8] kvm_irq_routing_update at ffffffffc09f19a1 [kvm] #8 [ffffb1ba6707fde0] kvm_set_irq_routing at ffffffffc09f2133 [kvm] #9 [ffffb1ba6707fe18] kvm_vm_ioctl at ffffffffc09ef544 [kvm] RIP: 00007f143c36488b RSP: 00007f143a4e04b8 RFLAGS: 00000246 RAX: ffffffffffffffda RBX: 00007f05780041d0 RCX: 
00007f143c36488b RDX: 00007f05780041d0 RSI: 000000004008ae6a RDI: 0000000000000020 RBP: 00000000000004e8 R8: 0000000000000008 R9: 00007f05780041e0 R10: 00007f0578004560 R11: 0000000000000246 R12: 00000000000004e0 R13: 000000000000001a R14: 00007f1424001c60 R15: 00007f0578003bc0 ORIG_RAX: 0000000000000010 CS: 0033 SS: 002b VMX already fixed this in commit 3a8b0677fc61 ("KVM: VMX: Do not BUG() on out-of-bounds guest IRQ"), so we can just copy the approach from there to fix this. Co-developed-by: Yi Liu Signed-off-by: Yi Liu Signed-off-by: Yi Wang Message-Id: <20220309113025.44469-1-wang.yi59@zte.com.cn> Cc: stable@vger.kernel.org Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/avic.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c index b37b353ec086..0df0c3ad08f9 100644 --- a/arch/x86/kvm/svm/avic.c +++ b/arch/x86/kvm/svm/avic.c @@ -726,7 +726,7 @@ int avic_pi_update_irte(struct kvm *kvm, unsigned int host_irq, { struct kvm_kernel_irq_routing_entry *e; struct kvm_irq_routing_table *irq_rt; - int idx, ret = -EINVAL; + int idx, ret = 0; if (!kvm_arch_has_assigned_device(kvm) || !irq_remapping_cap(IRQ_POSTING_CAP)) @@ -737,7 +737,13 @@ int avic_pi_update_irte(struct kvm *kvm, unsigned int host_irq, idx = srcu_read_lock(&kvm->irq_srcu); irq_rt = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu); - WARN_ON(guest_irq >= irq_rt->nr_rt_entries); + + if (guest_irq >= irq_rt->nr_rt_entries || + hlist_empty(&irq_rt->map[guest_irq])) { + pr_warn_once("no route for guest_irq %u/%u (broken user space?)\n", + guest_irq, irq_rt->nr_rt_entries); + goto out; + } hlist_for_each_entry(e, &irq_rt->map[guest_irq], link) { struct vcpu_data vcpu_info; -- cgit v1.2.3 From f47e5bbbc92f5d234bbab317523c64a65b6ac4e2 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 25 Mar 2022 23:03:48 +0000 Subject: KVM: x86/mmu: Zap only TDP MMU leafs in zap range and mmu_notifier unmap Re-introduce zapping only leaf SPTEs in kvm_zap_gfn_range() and kvm_tdp_mmu_unmap_gfn_range(), this time without losing a pending TLB flush when processing multiple roots (including nested TDP shadow roots). Dropping the TLB flush resulted in random crashes when running Hyper-V Server 2019 in a guest with KSM enabled in the host (or any source of mmu_notifier invalidations; KSM is just the easiest to force).
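The delta quoted below is an instance of a classic loop-accumulator bug: when several roots are zapped, the pending-flush flag returned for one root must be fed back in, not replaced with false. A minimal sketch of the difference, with invented names:

	#include <stdbool.h>
	#include <stdio.h>

	/* Returns true if this root had SPTEs zapped (flush needed), and
	 * also carries forward any flush already pending from earlier roots. */
	static bool zap_one_root(int root, bool flush)
	{
		bool zapped = (root == 0);	/* pretend only root 0 had mappings */
		return flush || zapped;
	}

	int main(void)
	{
		bool flush = false;
		for (int root = 0; root < 3; root++)
			flush = zap_one_root(root, flush);	/* not: ..., false) */
		/* Had "false" been passed instead of "flush", the flush pending
		 * from root 0 would be lost as soon as root 1 reports nothing
		 * to zap -- exactly the dropped-TLB-flush bug described above. */
		printf("flush needed: %d\n", flush);
		return 0;
	}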
This effectively reverts commits 873dd122172f8cce329113cfb0dfe3d2344d80c0 and fcb93eb6d09dd302cbef22bd95a5858af75e4156, and thus restores commit cf3e26427c08ad9015956293ab389004ac6a338e, plus this delta on top: bool kvm_tdp_mmu_zap_leafs(struct kvm *kvm, int as_id, gfn_t start, gfn_t end, struct kvm_mmu_page *root; for_each_tdp_mmu_root_yield_safe(kvm, root, as_id) - flush = tdp_mmu_zap_leafs(kvm, root, start, end, can_yield, false); + flush = tdp_mmu_zap_leafs(kvm, root, start, end, can_yield, flush); return flush; } Cc: Ben Gardon Signed-off-by: Sean Christopherson Tested-by: Vitaly Kuznetsov Message-Id: <20220325230348.2587437-1-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 4 ++-- arch/x86/kvm/mmu/tdp_mmu.c | 55 ++++++++++++++-------------------------------- arch/x86/kvm/mmu/tdp_mmu.h | 8 +------ 3 files changed, 19 insertions(+), 48 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 857ba93b5c92..333aaf6d6806 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -5849,8 +5849,8 @@ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end) if (is_tdp_mmu_enabled(kvm)) { for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) - flush = kvm_tdp_mmu_zap_gfn_range(kvm, i, gfn_start, - gfn_end, flush); + flush = kvm_tdp_mmu_zap_leafs(kvm, i, gfn_start, + gfn_end, true, flush); } if (flush) diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index 4be517a9f22f..d71d177ae6b8 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -909,10 +909,8 @@ bool kvm_tdp_mmu_zap_sp(struct kvm *kvm, struct kvm_mmu_page *sp) } /* - * Tears down the mappings for the range of gfns, [start, end), and frees the - * non-root pages mapping GFNs strictly within that range. Returns true if - * SPTEs have been cleared and a TLB flush is needed before releasing the - * MMU lock. + * Zap leafs SPTEs for the range of gfns, [start, end). Returns true if SPTEs + * have been cleared and a TLB flush is needed before releasing the MMU lock. * * If can_yield is true, will release the MMU lock and reschedule if the * scheduler needs the CPU or there is contention on the MMU lock. If this @@ -920,42 +918,25 @@ bool kvm_tdp_mmu_zap_sp(struct kvm *kvm, struct kvm_mmu_page *sp) * the caller must ensure it does not supply too large a GFN range, or the * operation can cause a soft lockup. */ -static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root, - gfn_t start, gfn_t end, bool can_yield, bool flush) +static bool tdp_mmu_zap_leafs(struct kvm *kvm, struct kvm_mmu_page *root, + gfn_t start, gfn_t end, bool can_yield, bool flush) { - bool zap_all = (start == 0 && end >= tdp_mmu_max_gfn_host()); struct tdp_iter iter; - /* - * No need to try to step down in the iterator when zapping all SPTEs, - * zapping the top-level non-leaf SPTEs will recurse on their children. - */ - int min_level = zap_all ? root->role.level : PG_LEVEL_4K; - end = min(end, tdp_mmu_max_gfn_host()); lockdep_assert_held_write(&kvm->mmu_lock); rcu_read_lock(); - for_each_tdp_pte_min_level(iter, root, min_level, start, end) { + for_each_tdp_pte_min_level(iter, root, PG_LEVEL_4K, start, end) { if (can_yield && tdp_mmu_iter_cond_resched(kvm, &iter, flush, false)) { flush = false; continue; } - if (!is_shadow_present_pte(iter.old_spte)) - continue; - - /- - * If this is a non-last-level SPTE that covers a larger range - * than should be zapped, continue, and zap the mappings at a - * lower level, except when zapping all SPTEs.
- */ - if (!zap_all && - (iter.gfn < start || - iter.gfn + KVM_PAGES_PER_HPAGE(iter.level) > end) && + if (!is_shadow_present_pte(iter.old_spte) || !is_last_spte(iter.old_spte, iter.level)) continue; @@ -963,17 +944,13 @@ static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root, flush = true; } - /* - * Need to flush before releasing RCU. TODO: do it only if intermediate - * page tables were zapped; there is no need to flush under RCU protection - * if no 'struct kvm_mmu_page' is freed. - */ - if (flush) - kvm_flush_remote_tlbs_with_address(kvm, start, end - start); - rcu_read_unlock(); - return false; + /* + * Because this flow zaps _only_ leaf SPTEs, the caller doesn't need + * to provide RCU protection as no 'struct kvm_mmu_page' will be freed. + */ + return flush; } /* @@ -982,13 +959,13 @@ static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root, * SPTEs have been cleared and a TLB flush is needed before releasing the * MMU lock. */ -bool __kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, int as_id, gfn_t start, - gfn_t end, bool can_yield, bool flush) +bool kvm_tdp_mmu_zap_leafs(struct kvm *kvm, int as_id, gfn_t start, gfn_t end, + bool can_yield, bool flush) { struct kvm_mmu_page *root; for_each_tdp_mmu_root_yield_safe(kvm, root, as_id) - flush = zap_gfn_range(kvm, root, start, end, can_yield, flush); + flush = tdp_mmu_zap_leafs(kvm, root, start, end, can_yield, flush); return flush; } @@ -1236,8 +1213,8 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) bool kvm_tdp_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range, bool flush) { - return __kvm_tdp_mmu_zap_gfn_range(kvm, range->slot->as_id, range->start, - range->end, range->may_block, flush); + return kvm_tdp_mmu_zap_leafs(kvm, range->slot->as_id, range->start, + range->end, range->may_block, flush); } typedef bool (*tdp_handler_t)(struct kvm *kvm, struct tdp_iter *iter, diff --git a/arch/x86/kvm/mmu/tdp_mmu.h b/arch/x86/kvm/mmu/tdp_mmu.h index 647926541e38..c163f7cc23ca 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.h +++ b/arch/x86/kvm/mmu/tdp_mmu.h @@ -15,14 +15,8 @@ __must_check static inline bool kvm_tdp_mmu_get_root(struct kvm_mmu_page *root) void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root, bool shared); -bool __kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, int as_id, gfn_t start, +bool kvm_tdp_mmu_zap_leafs(struct kvm *kvm, int as_id, gfn_t start, gfn_t end, bool can_yield, bool flush); -static inline bool kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, int as_id, - gfn_t start, gfn_t end, bool flush) -{ - return __kvm_tdp_mmu_zap_gfn_range(kvm, as_id, start, end, true, flush); -} - bool kvm_tdp_mmu_zap_sp(struct kvm *kvm, struct kvm_mmu_page *sp); void kvm_tdp_mmu_zap_all(struct kvm *kvm); void kvm_tdp_mmu_invalidate_all_roots(struct kvm *kvm); -- cgit v1.2.3 From df06dae3f2a8bfb83683abf88d3dcde23fc8093d Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 23 Feb 2022 16:53:02 +0000 Subject: KVM: Don't actually set a request when evicting vCPUs for GFN cache invd Don't actually set a request bit in vcpu->requests when making a request purely to force a vCPU to exit the guest. Logging a request but not actually consuming it would cause the vCPU to get stuck in an infinite loop during KVM_RUN because KVM would see the pending request and bail from VM-Enter to service the request. Note, it's currently impossible for KVM to set KVM_REQ_GPC_INVALIDATE as nothing in KVM is wired up to set guest_uses_pa=true. 
But, it'd be all too easy for arch code to introduce use of kvm_gfn_to_pfn_cache_init() without implementing handling of the request, especially since getting test coverage of MMU notifier interaction with specific KVM features usually requires a directed test. Opportunistically rename gfn_to_pfn_cache_invalidate_start()'s wake_vcpus to evict_vcpus. The purpose of the request is to get vCPUs out of guest mode; it's supposed to _avoid_ waking vCPUs that are blocking. Opportunistically rename KVM_REQ_GPC_INVALIDATE to be more specific as to what it wants to accomplish, and to genericize the name so that it can be used for similar but unrelated scenarios, should they arise in the future. Add a comment and documentation to explain why the "no action" request exists. Add compile-time assertions to help detect improper usage. Use the inner assertless helper in the one s390 path that makes requests without a hardcoded request. Cc: David Woodhouse Signed-off-by: Sean Christopherson Message-Id: <20220223165302.3205276-1-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/s390/kvm/kvm-s390.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index ca96f84db2cc..91c0cefc94aa 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -3463,7 +3463,7 @@ void exit_sie(struct kvm_vcpu *vcpu) /* Kick a guest cpu out of SIE to process a request synchronously */ void kvm_s390_sync_request(int req, struct kvm_vcpu *vcpu) { - kvm_make_request(req, vcpu); + __kvm_make_request(req, vcpu); kvm_s390_vcpu_request(vcpu); } -- cgit v1.2.3 From 95b065bf5c431c06c68056a03a5853b660640ecc Mon Sep 17 00:00:00 2001 From: Jim Mattson Date: Mon, 7 Mar 2022 17:24:52 -0800 Subject: KVM: x86/pmu: Use different raw event masks for AMD and Intel The third nybble of AMD's event select overlaps with Intel's IN_TX and IN_TXCP bits. Therefore, we can't use AMD64_RAW_EVENT_MASK on Intel platforms that support TSX. Declare a raw_event_mask in the kvm_pmu structure, initialize it in the vendor-specific pmu_refresh() functions, and use that mask for PERF_TYPE_RAW configurations in reprogram_gp_counter().
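The fix boils down to masking the guest's event selector with a per-vendor raw-event mask rather than hardcoding AMD's. A sketch with illustrative mask values (the real AMD64_RAW_EVENT_MASK and X86_RAW_EVENT_MASK definitions live in arch/x86/include/asm/perf_event.h and differ from these):

	#include <stdint.h>
	#include <stdio.h>

	/* Illustrative only: AMD's mask keeps the extended event-select bits
	 * above bit 31, Intel's does not (they overlap IN_TX/IN_TXCP). */
	#define FAKE_AMD_RAW_EVENT_MASK   0xff000000ffULL
	#define FAKE_INTEL_RAW_EVENT_MASK 0x00000000ffULL

	struct pmu { uint64_t raw_event_mask; };

	static uint64_t raw_config(const struct pmu *pmu, uint64_t eventsel)
	{
		/* Per-vendor mask keeps Intel from misinterpreting bits that
		 * overlap AMD's extended event-select field. */
		return eventsel & pmu->raw_event_mask;
	}

	int main(void)
	{
		struct pmu intel = { .raw_event_mask = FAKE_INTEL_RAW_EVENT_MASK };
		struct pmu amd   = { .raw_event_mask = FAKE_AMD_RAW_EVENT_MASK };
		uint64_t eventsel = 0x120000003cULL;

		printf("intel config: %#llx\n",
		       (unsigned long long)raw_config(&intel, eventsel));
		printf("amd   config: %#llx\n",
		       (unsigned long long)raw_config(&amd, eventsel));
		return 0;
	}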
Fixes: 710c47651431 ("KVM: x86/pmu: Use AMD64_RAW_EVENT_MASK for PERF_TYPE_RAW") Signed-off-by: Jim Mattson Message-Id: <20220308012452.3468611-1-jmattson@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/pmu.c | 3 ++- arch/x86/kvm/svm/pmu.c | 1 + arch/x86/kvm/vmx/pmu_intel.c | 1 + 4 files changed, 5 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 469c7702fad9..47de0226d674 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -511,6 +511,7 @@ struct kvm_pmu { u64 global_ctrl_mask; u64 global_ovf_ctrl_mask; u64 reserved_bits; + u64 raw_event_mask; u8 version; struct kvm_pmc gp_counters[INTEL_PMC_MAX_GENERIC]; struct kvm_pmc fixed_counters[INTEL_PMC_MAX_FIXED]; diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index b1a02993782b..902b6d700215 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -185,6 +185,7 @@ void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel) u32 type = PERF_TYPE_RAW; struct kvm *kvm = pmc->vcpu->kvm; struct kvm_pmu_event_filter *filter; + struct kvm_pmu *pmu = vcpu_to_pmu(pmc->vcpu); bool allow_event = true; if (eventsel & ARCH_PERFMON_EVENTSEL_PIN_CONTROL) @@ -221,7 +222,7 @@ void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel) } if (type == PERF_TYPE_RAW) - config = eventsel & AMD64_RAW_EVENT_MASK; + config = eventsel & pmu->raw_event_mask; if (pmc->current_config == eventsel && pmc_resume_counter(pmc)) return; diff --git a/arch/x86/kvm/svm/pmu.c b/arch/x86/kvm/svm/pmu.c index d4de52409335..e4163f75c11b 100644 --- a/arch/x86/kvm/svm/pmu.c +++ b/arch/x86/kvm/svm/pmu.c @@ -284,6 +284,7 @@ static void amd_pmu_refresh(struct kvm_vcpu *vcpu) pmu->counter_bitmask[KVM_PMC_GP] = ((u64)1 << 48) - 1; pmu->reserved_bits = 0xfffffff000280000ull; + pmu->raw_event_mask = AMD64_RAW_EVENT_MASK; pmu->version = 1; /* not applicable to AMD; but clean them to prevent any fall out */ pmu->counter_bitmask[KVM_PMC_FIXED] = 0; diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c index 4e5b1eeeb77c..da71160a50d6 100644 --- a/arch/x86/kvm/vmx/pmu_intel.c +++ b/arch/x86/kvm/vmx/pmu_intel.c @@ -485,6 +485,7 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu) pmu->counter_bitmask[KVM_PMC_FIXED] = 0; pmu->version = 0; pmu->reserved_bits = 0xffffffff00200000ull; + pmu->raw_event_mask = X86_RAW_EVENT_MASK; entry = kvm_find_cpuid_entry(vcpu, 0xa, 0); if (!entry || !vcpu->kvm->arch.enable_pmu) -- cgit v1.2.3 From 4a9e7b9ea252842bc8b14d495706ac6317fafd5d Mon Sep 17 00:00:00 2001 From: Peter Gonda Date: Fri, 4 Mar 2022 08:10:32 -0800 Subject: KVM: SVM: Fix kvm_cache_regs.h inclusions for is_guest_mode() Include kvm_cache_regs.h to pick up the definition of is_guest_mode(), which is referenced by nested_svm_virtualize_tpr() in svm.h. Remove the include from svm_onhyperv.c, which was needed only because of the missing include in svm.h.
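The underlying rule is that a header must include what it uses; a hedged single-file illustration with stub names (not the real is_guest_mode()/nested_svm_virtualize_tpr()):

	#include <stdbool.h>

	/* Stand-in for kvm_cache_regs.h: the definition relied upon below. */
	static inline bool is_guest_mode_stub(int flags)
	{
		return flags & 1;
	}

	/* Stand-in for svm.h: it references the helper, so in real code it
	 * must #include the header defining it, rather than depending on
	 * whichever .c file happened to include things in the right order. */
	static inline bool virtualize_tpr_stub(int flags)
	{
		return is_guest_mode_stub(flags);
	}

	int main(void)
	{
		return virtualize_tpr_stub(1) ? 0 : 1;
	}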
Fixes: 883b0a91f41ab ("KVM: SVM: Move Nested SVM Implementation to nested.c") Cc: Paolo Bonzini Cc: Sean Christopherson Cc: kvm@vger.kernel.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Peter Gonda Message-Id: <20220304161032.2270688-1-pgonda@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/svm.h | 2 ++ arch/x86/kvm/svm/svm_onhyperv.c | 1 - 2 files changed, 2 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index e37bb3508cfa..f01bf487babd 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -22,6 +22,8 @@ #include #include +#include "kvm_cache_regs.h" + #define __sme_page_pa(x) __sme_set(page_to_pfn(x) << PAGE_SHIFT) #define IOPM_SIZE PAGE_SIZE * 3 diff --git a/arch/x86/kvm/svm/svm_onhyperv.c b/arch/x86/kvm/svm/svm_onhyperv.c index 98aa981c04ec..8cdc62c74a96 100644 --- a/arch/x86/kvm/svm/svm_onhyperv.c +++ b/arch/x86/kvm/svm/svm_onhyperv.c @@ -4,7 +4,6 @@ */ #include -#include "kvm_cache_regs.h" #include -- cgit v1.2.3 From d0d96121d03d6d9cf608d948247a9f24f5a02da9 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 3 Mar 2022 15:41:11 +0000 Subject: KVM: Use enum to track if cached PFN will be used in guest and/or host Replace the guest_uses_pa and kernel_map booleans in the PFN cache code with a unified enum/bitmask. Using explicit names makes it easier to review and audit call sites. Opportunistically add a WARN to prevent passing garbage; instantiating a cache without declaring its usage is either buggy or pointless. Signed-off-by: Sean Christopherson Signed-off-by: David Woodhouse Signed-off-by: Paolo Bonzini Message-Id: <20220303154127.202856-2-dwmw2@infradead.org> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/xen.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c index 4aa0f2b31665..5be1c9227105 100644 --- a/arch/x86/kvm/xen.c +++ b/arch/x86/kvm/xen.c @@ -39,7 +39,7 @@ static int kvm_xen_shared_info_init(struct kvm *kvm, gfn_t gfn) } do { - ret = kvm_gfn_to_pfn_cache_init(kvm, gpc, NULL, false, true, + ret = kvm_gfn_to_pfn_cache_init(kvm, gpc, NULL, KVM_HOST_USES_PFN, gpa, PAGE_SIZE, false); if (ret) goto out; -- cgit v1.2.3 From cf1d88b36ba7e83bdaa50bccc4c47864e8f08cbe Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Thu, 3 Mar 2022 15:41:12 +0000 Subject: KVM: Remove dirty handling from gfn_to_pfn_cache completely It isn't OK to cache the dirty status of a page in internal structures for an indefinite period of time. Any time a vCPU exits the run loop to userspace might be its last; the VMM might do its final check of the dirty log, flush the last remaining dirty pages to the destination and complete a live migration. If we have internal 'dirty' state which doesn't get flushed until the vCPU is finally destroyed on the source after migration is complete, then we have lost data because that will escape the final copy. This problem already exists with the use of kvm_vcpu_unmap() to mark pages dirty in e.g. VMX nesting. Note that the actual Linux MM already considers the page to be dirty since we have a writeable mapping of it. This is just about the KVM dirty logging. For the nesting-style use cases (KVM_GUEST_USES_PFN) we will need to track which gfn_to_pfn_caches have been used and explicitly mark the corresponding pages dirty before returning to userspace.
But we would have needed external tracking of that anyway, rather than walking the full list of GPCs to find those belonging to this vCPU which are dirty. So let's rely *solely* on that external tracking, and keep it simple rather than laying a tempting trap for callers to fall into. Signed-off-by: David Woodhouse Signed-off-by: Paolo Bonzini Message-Id: <20220303154127.202856-3-dwmw2@infradead.org> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/xen.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c index 5be1c9227105..bf6cc25eee76 100644 --- a/arch/x86/kvm/xen.c +++ b/arch/x86/kvm/xen.c @@ -40,7 +40,7 @@ static int kvm_xen_shared_info_init(struct kvm *kvm, gfn_t gfn) do { ret = kvm_gfn_to_pfn_cache_init(kvm, gpc, NULL, KVM_HOST_USES_PFN, - gpa, PAGE_SIZE, false); + gpa, PAGE_SIZE); if (ret) goto out; @@ -1025,8 +1025,7 @@ static int evtchn_set_fn(struct kvm_kernel_irq_routing_entry *e, struct kvm *kvm break; idx = srcu_read_lock(&kvm->srcu); - rc = kvm_gfn_to_pfn_cache_refresh(kvm, gpc, gpc->gpa, - PAGE_SIZE, false); + rc = kvm_gfn_to_pfn_cache_refresh(kvm, gpc, gpc->gpa, PAGE_SIZE); srcu_read_unlock(&kvm->srcu, idx); } while(!rc); -- cgit v1.2.3 From 5b22bbe717d9b96867783cde380c43f6cc28aafd Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Fri, 11 Mar 2022 15:03:41 +0800 Subject: KVM: X86: Change the type of access u32 to u64 Change the type of @access from u32 to u64 for FNAME(walk_addr) and ->gva_to_gpa(). The kinds of accesses are usually combinations of UWX, and VMX/SVM's nested paging adds a new factor of access: is it an access for a guest page table or for a final guest physical address. And SMAP relies on a further factor of supervisor access: explicit or implicit. So @access in FNAME(walk_addr) and ->gva_to_gpa() should include all of this information to do the walk. Although @access (u32) has enough bits to encode all the kinds, this patch extends it to u64: o Extra bits will be in the higher 32 bits, so that we can easily obtain the traditional access mode (UWX) by converting it to u32. o Reuse the value for the access kind defined by SVM's nested paging (PFERR_GUEST_FINAL_MASK and PFERR_GUEST_PAGE_MASK) as @error_code in kvm_handle_page_fault().
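The encoding can be shown in a few lines: the extra access-kind bits sit above bit 31, so truncating the u64 back to u32 recovers the traditional UWX error-code bits. The PFERR_* values below mirror the layout in the diff that follows; the program itself is illustrative only.

	#include <stdint.h>
	#include <stdio.h>

	#define PFERR_WRITE_MASK       (1ULL << 1)
	#define PFERR_USER_MASK        (1ULL << 2)
	#define PFERR_GUEST_FINAL_MASK (1ULL << 32)	/* SVM nested-paging kind */

	int main(void)
	{
		uint64_t access = PFERR_USER_MASK | PFERR_WRITE_MASK |
				  PFERR_GUEST_FINAL_MASK;

		/* Truncation strips the nested paging fault kinds, leaving
		 * only the traditional UWX error-code bits. */
		uint32_t uwx = (uint32_t)access;

		printf("access=%#llx uwx=%#x\n",
		       (unsigned long long)access, uwx);
		return 0;
	}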
Signed-off-by: Lai Jiangshan Message-Id: <20220311070346.45023-2-jiangshanlai@gmail.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 2 +- arch/x86/kvm/mmu.h | 8 +++++--- arch/x86/kvm/mmu/mmu.c | 2 +- arch/x86/kvm/mmu/paging_tmpl.h | 8 ++++---- arch/x86/kvm/x86.c | 24 ++++++++++++------------ 5 files changed, 23 insertions(+), 21 deletions(-) (limited to 'arch') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 47de0226d674..71cfc9fdf946 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -430,7 +430,7 @@ struct kvm_mmu { void (*inject_page_fault)(struct kvm_vcpu *vcpu, struct x86_exception *fault); gpa_t (*gva_to_gpa)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, - gpa_t gva_or_gpa, u32 access, + gpa_t gva_or_gpa, u64 access, struct x86_exception *exception); int (*sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp); diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index bf8dbc4bb12a..74efeaefa8f8 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -214,8 +214,10 @@ static inline int kvm_mmu_do_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, */ static inline u8 permission_fault(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, unsigned pte_access, unsigned pte_pkey, - unsigned pfec) + u64 access) { + /* strip nested paging fault error codes */ + unsigned int pfec = access; int cpl = static_call(kvm_x86_get_cpl)(vcpu); unsigned long rflags = static_call(kvm_x86_get_rflags)(vcpu); @@ -317,12 +319,12 @@ static inline void kvm_update_page_stats(struct kvm *kvm, int level, int count) atomic64_add(count, &kvm->stat.pages[level - 1]); } -gpa_t translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access, +gpa_t translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u64 access, struct x86_exception *exception); static inline gpa_t kvm_translate_gpa(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, - gpa_t gpa, u32 access, + gpa_t gpa, u64 access, struct x86_exception *exception) { if (mmu != &vcpu->arch.nested_mmu) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 333aaf6d6806..7efa916f93ce 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -3703,7 +3703,7 @@ void kvm_mmu_sync_prev_roots(struct kvm_vcpu *vcpu) } static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, - gpa_t vaddr, u32 access, + gpa_t vaddr, u64 access, struct x86_exception *exception) { if (exception) diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h index 252c77805eb9..8621188b46df 100644 --- a/arch/x86/kvm/mmu/paging_tmpl.h +++ b/arch/x86/kvm/mmu/paging_tmpl.h @@ -339,7 +339,7 @@ static inline bool FNAME(is_last_gpte)(struct kvm_mmu *mmu, */ static int FNAME(walk_addr_generic)(struct guest_walker *walker, struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, - gpa_t addr, u32 access) + gpa_t addr, u64 access) { int ret; pt_element_t pte; @@ -347,7 +347,7 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker, gfn_t table_gfn; u64 pt_access, pte_access; unsigned index, accessed_dirty, pte_pkey; - unsigned nested_access; + u64 nested_access; gpa_t pte_gpa; bool have_ad; int offset; @@ -540,7 +540,7 @@ error: } static int FNAME(walk_addr)(struct guest_walker *walker, - struct kvm_vcpu *vcpu, gpa_t addr, u32 access) + struct kvm_vcpu *vcpu, gpa_t addr, u64 access) { return FNAME(walk_addr_generic)(walker, vcpu, vcpu->arch.mmu, addr, access); @@ -988,7 +988,7 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva, hpa_t root_hpa) /* Note, 
@addr is a GPA when gva_to_gpa() translates an L2 GPA to an L1 GPA. */ static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, - gpa_t addr, u32 access, + gpa_t addr, u64 access, struct x86_exception *exception) { struct guest_walker walker; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 89b6efb7f504..5c5be7a63b9f 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -6726,7 +6726,7 @@ void kvm_get_segment(struct kvm_vcpu *vcpu, static_call(kvm_x86_get_segment)(vcpu, var, seg); } -gpa_t translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access, +gpa_t translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u64 access, struct x86_exception *exception) { struct kvm_mmu *mmu = vcpu->arch.mmu; @@ -6746,7 +6746,7 @@ gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva, { struct kvm_mmu *mmu = vcpu->arch.walk_mmu; - u32 access = (static_call(kvm_x86_get_cpl)(vcpu) == 3) ? PFERR_USER_MASK : 0; + u64 access = (static_call(kvm_x86_get_cpl)(vcpu) == 3) ? PFERR_USER_MASK : 0; return mmu->gva_to_gpa(vcpu, mmu, gva, access, exception); } EXPORT_SYMBOL_GPL(kvm_mmu_gva_to_gpa_read); @@ -6756,7 +6756,7 @@ EXPORT_SYMBOL_GPL(kvm_mmu_gva_to_gpa_read); { struct kvm_mmu *mmu = vcpu->arch.walk_mmu; - u32 access = (static_call(kvm_x86_get_cpl)(vcpu) == 3) ? PFERR_USER_MASK : 0; + u64 access = (static_call(kvm_x86_get_cpl)(vcpu) == 3) ? PFERR_USER_MASK : 0; access |= PFERR_FETCH_MASK; return mmu->gva_to_gpa(vcpu, mmu, gva, access, exception); } @@ -6766,7 +6766,7 @@ gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva, { struct kvm_mmu *mmu = vcpu->arch.walk_mmu; - u32 access = (static_call(kvm_x86_get_cpl)(vcpu) == 3) ? PFERR_USER_MASK : 0; + u64 access = (static_call(kvm_x86_get_cpl)(vcpu) == 3) ? PFERR_USER_MASK : 0; access |= PFERR_WRITE_MASK; return mmu->gva_to_gpa(vcpu, mmu, gva, access, exception); } @@ -6782,7 +6782,7 @@ gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva, } static int kvm_read_guest_virt_helper(gva_t addr, void *val, unsigned int bytes, - struct kvm_vcpu *vcpu, u32 access, + struct kvm_vcpu *vcpu, u64 access, struct x86_exception *exception) { struct kvm_mmu *mmu = vcpu->arch.walk_mmu; @@ -6819,7 +6819,7 @@ static int kvm_fetch_guest_virt(struct x86_emulate_ctxt *ctxt, { struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); struct kvm_mmu *mmu = vcpu->arch.walk_mmu; - u32 access = (static_call(kvm_x86_get_cpl)(vcpu) == 3) ? PFERR_USER_MASK : 0; + u64 access = (static_call(kvm_x86_get_cpl)(vcpu) == 3) ? PFERR_USER_MASK : 0; unsigned offset; int ret; @@ -6844,7 +6844,7 @@ int kvm_read_guest_virt(struct kvm_vcpu *vcpu, gva_t addr, void *val, unsigned int bytes, struct x86_exception *exception) { - u32 access = (static_call(kvm_x86_get_cpl)(vcpu) == 3) ? PFERR_USER_MASK : 0; + u64 access = (static_call(kvm_x86_get_cpl)(vcpu) == 3) ? 
PFERR_USER_MASK : 0; /* * FIXME: this should call handle_emulation_failure if X86EMUL_IO_NEEDED @@ -6863,7 +6863,7 @@ static int emulator_read_std(struct x86_emulate_ctxt *ctxt, struct x86_exception *exception, bool system) { struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); - u32 access = 0; + u64 access = 0; if (!system && static_call(kvm_x86_get_cpl)(vcpu) == 3) access |= PFERR_USER_MASK; @@ -6881,7 +6881,7 @@ static int kvm_read_guest_phys_system(struct x86_emulate_ctxt *ctxt, } static int kvm_write_guest_virt_helper(gva_t addr, void *val, unsigned int bytes, - struct kvm_vcpu *vcpu, u32 access, + struct kvm_vcpu *vcpu, u64 access, struct x86_exception *exception) { struct kvm_mmu *mmu = vcpu->arch.walk_mmu; @@ -6915,7 +6915,7 @@ static int emulator_write_std(struct x86_emulate_ctxt *ctxt, gva_t addr, void *v bool system) { struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); - u32 access = PFERR_WRITE_MASK; + u64 access = PFERR_WRITE_MASK; if (!system && static_call(kvm_x86_get_cpl)(vcpu) == 3) access |= PFERR_USER_MASK; @@ -6984,7 +6984,7 @@ static int vcpu_mmio_gva_to_gpa(struct kvm_vcpu *vcpu, unsigned long gva, bool write) { struct kvm_mmu *mmu = vcpu->arch.walk_mmu; - u32 access = ((static_call(kvm_x86_get_cpl)(vcpu) == 3) ? PFERR_USER_MASK : 0) + u64 access = ((static_call(kvm_x86_get_cpl)(vcpu) == 3) ? PFERR_USER_MASK : 0) | (write ? PFERR_WRITE_MASK : 0); /* @@ -12598,7 +12598,7 @@ void kvm_fixup_and_inject_pf_error(struct kvm_vcpu *vcpu, gva_t gva, u16 error_c { struct kvm_mmu *mmu = vcpu->arch.walk_mmu; struct x86_exception fault; - u32 access = error_code & + u64 access = error_code & (PFERR_WRITE_MASK | PFERR_FETCH_MASK | PFERR_USER_MASK); if (!(error_code & PFERR_PRESENT_MASK) || -- cgit v1.2.3 From 94b4a2f1745fe7aaa551d3d6a9424b6616587c72 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Fri, 11 Mar 2022 15:03:42 +0800 Subject: KVM: X86: Fix comments in update_permission_bitmask The commit 09f037aa48f3 ("KVM: MMU: speedup update_permission_bitmask") refactored the code of update_permission_bitmask() and changed the comments. It added a condition into a list to match the new code, so the number/order of the conditions in the comments should be updated too. Signed-off-by: Lai Jiangshan Message-Id: <20220311070346.45023-3-jiangshanlai@gmail.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 7efa916f93ce..9088c50cc6dd 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -4594,8 +4594,8 @@ static void update_permission_bitmask(struct kvm_mmu *mmu, bool ept) * - Page fault in kernel mode * - if CPL = 3 or X86_EFLAGS_AC is clear * - * Here, we cover the first three conditions. - * The fourth is computed dynamically in permission_fault(); + * Here, we cover the first four conditions. + * The fifth is computed dynamically in permission_fault(); * PFERR_RSVD_MASK bit will be set in PFEC if the access is * *not* subject to SMAP restrictions. */ -- cgit v1.2.3 From 8873c1434fac436f68ac2221676bb148e8fdffa2 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Fri, 11 Mar 2022 15:03:43 +0800 Subject: KVM: X86: Rename variable smap to not_smap in permission_fault() The comment above the variable says the bit is set when SMAP is overridden, with the same meaning as in update_permission_bitmask(): the access is not subject to SMAP restrictions. Rename it to reflect the negative implication and to improve code readability.
Signed-off-by: Lai Jiangshan Message-Id: <20220311070346.45023-4-jiangshanlai@gmail.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index 74efeaefa8f8..24d94f6d378d 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -234,9 +234,9 @@ static inline u8 permission_fault(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, * but it will be one in index if SMAP checks are being overridden. * It is important to keep this branchless. */ - unsigned long smap = (cpl - 3) & (rflags & X86_EFLAGS_AC); + unsigned long not_smap = (cpl - 3) & (rflags & X86_EFLAGS_AC); int index = (pfec >> 1) + - (smap >> (X86_EFLAGS_AC_BIT - PFERR_RSVD_BIT + 1)); + (not_smap >> (X86_EFLAGS_AC_BIT - PFERR_RSVD_BIT + 1)); bool fault = (mmu->permissions[index] >> pte_access) & 1; u32 errcode = PFERR_PRESENT_MASK; -- cgit v1.2.3 From 4f4aa80e3b882cbbafdf95ebc018c72b10879dbb Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Fri, 11 Mar 2022 15:03:44 +0800 Subject: KVM: X86: Handle implicit supervisor access with SMAP There are two kinds of implicit supervisor access: implicit supervisor access when CPL = 3, and implicit supervisor access when CPL < 3. Current permission_fault() handles only the first kind for SMAP. But if the access is implicit when SMAP is on, data may be neither read from nor written to any user-mode address regardless of the current CPL. So the second kind should also be supported. The first kind can be detected via CPL and access mode: if it is a supervisor access and CPL = 3, it must be an implicit supervisor access. But it is not possible to detect the second kind without extra information, so this patch adds an artificial PFERR_IMPLICIT_ACCESS into @access. This extra information also works for the first kind, so the logic is changed to use this information for both cases. The value of PFERR_IMPLICIT_ACCESS is deliberately chosen to be bit 48, which is in the most significant 16 bits of a u64 and less likely to be forced to change by future hardware use. This patch removes the call to ->get_cpl(), as the access mode is now determined by @access. Not only does this avoid a function call, it also removes confusion when the permission is checked for nested TDP. Nested TDP shouldn't have SMAP checking, and not even L2's CPL should have any bearing on it. The original code works only because the walk is always a user walk for NPT and the SMAP fault bit is not set for EPT in update_permission_bitmask.
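As a standalone illustration of the new branchless check (mirroring the permission_fault() delta in the diff below; the helper name is invented): explicit supervisor access with EFLAGS.AC = 1 overrides SMAP, while an implicit supervisor access never does.

	#include <stdbool.h>
	#include <stdint.h>
	#include <stdio.h>

	#define X86_EFLAGS_AC          (1UL << 18)
	#define PFERR_IMPLICIT_ACCESS  (1ULL << 48)

	/* Branchless "SMAP does not apply": the OR of the implicit-access
	 * bit forces the comparison to fail for any implicit access. */
	static bool smap_overridden(uint64_t access, unsigned long rflags)
	{
		uint64_t implicit = access & PFERR_IMPLICIT_ACCESS;
		return ((rflags & X86_EFLAGS_AC) | implicit) == X86_EFLAGS_AC;
	}

	int main(void)
	{
		printf("explicit, AC=1: %d\n", smap_overridden(0, X86_EFLAGS_AC));
		printf("implicit, AC=1: %d\n",
		       smap_overridden(PFERR_IMPLICIT_ACCESS, X86_EFLAGS_AC));
		printf("explicit, AC=0: %d\n", smap_overridden(0, 0));
		return 0;
	}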
Signed-off-by: Lai Jiangshan Message-Id: <20220311070346.45023-5-jiangshanlai@gmail.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 2 ++ arch/x86/kvm/mmu.h | 24 +++++++++++------------- arch/x86/kvm/mmu/mmu.c | 4 ++-- arch/x86/kvm/x86.c | 8 ++++++-- 4 files changed, 21 insertions(+), 17 deletions(-) (limited to 'arch') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 71cfc9fdf946..a2224037d232 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -249,6 +249,7 @@ enum x86_intercept_stage; #define PFERR_SGX_BIT 15 #define PFERR_GUEST_FINAL_BIT 32 #define PFERR_GUEST_PAGE_BIT 33 +#define PFERR_IMPLICIT_ACCESS_BIT 48 #define PFERR_PRESENT_MASK (1U << PFERR_PRESENT_BIT) #define PFERR_WRITE_MASK (1U << PFERR_WRITE_BIT) @@ -259,6 +260,7 @@ enum x86_intercept_stage; #define PFERR_SGX_MASK (1U << PFERR_SGX_BIT) #define PFERR_GUEST_FINAL_MASK (1ULL << PFERR_GUEST_FINAL_BIT) #define PFERR_GUEST_PAGE_MASK (1ULL << PFERR_GUEST_PAGE_BIT) +#define PFERR_IMPLICIT_ACCESS (1ULL << PFERR_IMPLICIT_ACCESS_BIT) #define PFERR_NESTED_GUEST_PAGE (PFERR_GUEST_PAGE_MASK | \ PFERR_WRITE_MASK | \ diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index 24d94f6d378d..e6cae6f22683 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -218,25 +218,23 @@ static inline u8 permission_fault(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, { /* strip nested paging fault error codes */ unsigned int pfec = access; - int cpl = static_call(kvm_x86_get_cpl)(vcpu); unsigned long rflags = static_call(kvm_x86_get_rflags)(vcpu); /* - * If CPL < 3, SMAP prevention are disabled if EFLAGS.AC = 1. + * For explicit supervisor accesses, SMAP is disabled if EFLAGS.AC = 1. + * For implicit supervisor accesses, SMAP cannot be overridden. * - * If CPL = 3, SMAP applies to all supervisor-mode data accesses - * (these are implicit supervisor accesses) regardless of the value - * of EFLAGS.AC. + * SMAP works on supervisor accesses only, and not_smap can + * be set or not set when user access with neither has any bearing + * on the result. * - * This computes (cpl < 3) && (rflags & X86_EFLAGS_AC), leaving - * the result in X86_EFLAGS_AC. We then insert it in place of - * the PFERR_RSVD_MASK bit; this bit will always be zero in pfec, - * but it will be one in index if SMAP checks are being overridden. - * It is important to keep this branchless. + * We put the SMAP checking bit in place of the PFERR_RSVD_MASK bit; + * this bit will always be zero in pfec, but it will be one in index + * if SMAP checks are being disabled. 
*/ - unsigned long not_smap = (cpl - 3) & (rflags & X86_EFLAGS_AC); - int index = (pfec >> 1) + - (not_smap >> (X86_EFLAGS_AC_BIT - PFERR_RSVD_BIT + 1)); + u64 implicit_access = access & PFERR_IMPLICIT_ACCESS; + bool not_smap = ((rflags & X86_EFLAGS_AC) | implicit_access) == X86_EFLAGS_AC; + int index = (pfec + (not_smap << PFERR_RSVD_BIT)) >> 1; bool fault = (mmu->permissions[index] >> pte_access) & 1; u32 errcode = PFERR_PRESENT_MASK; diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 9088c50cc6dd..e6ce39a798b8 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -4591,8 +4591,8 @@ static void update_permission_bitmask(struct kvm_mmu *mmu, bool ept) * - X86_CR4_SMAP is set in CR4 * - A user page is accessed * - The access is not a fetch - * - Page fault in kernel mode - * - if CPL = 3 or X86_EFLAGS_AC is clear + * - The access is supervisor mode + * - If implicit supervisor access or X86_EFLAGS_AC is clear * * Here, we cover the first four conditions. * The fifth is computed dynamically in permission_fault(); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 5c5be7a63b9f..a43026bac138 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -6865,7 +6865,9 @@ static int emulator_read_std(struct x86_emulate_ctxt *ctxt, struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); u64 access = 0; - if (!system && static_call(kvm_x86_get_cpl)(vcpu) == 3) + if (system) + access |= PFERR_IMPLICIT_ACCESS; + else if (static_call(kvm_x86_get_cpl)(vcpu) == 3) access |= PFERR_USER_MASK; return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, access, exception); @@ -6917,7 +6919,9 @@ static int emulator_write_std(struct x86_emulate_ctxt *ctxt, gva_t addr, void *v struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); u64 access = PFERR_WRITE_MASK; - if (!system && static_call(kvm_x86_get_cpl)(vcpu) == 3) + if (system) + access |= PFERR_IMPLICIT_ACCESS; + else if (static_call(kvm_x86_get_cpl)(vcpu) == 3) access |= PFERR_USER_MASK; return kvm_write_guest_virt_helper(addr, val, bytes, vcpu, -- cgit v1.2.3 From 7491b7b2e1c57990dcd0f60ed2f3f1c92a145486 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 11 Mar 2022 04:35:15 +0000 Subject: KVM: x86: Make APICv inhibit reasons an enum and cleanup naming Use an enum for the APICv inhibit reasons; there is no meaning behind their values and they most definitely are not "unsigned longs". Rename the various params to "reason" for consistency and clarity (inhibit may be confused as a command, i.e. inhibit APICv, instead of the reason that is getting toggled/checked). No functional change intended.
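The enum-plus-bitmask pattern can be sketched in isolation (invented inhibit names; the real list lives in kvm_host.h as shown in the diff below):

	#include <stdbool.h>
	#include <stdio.h>

	enum apicv_inhibit {	/* values carry no meaning, hence the enum */
		INHIBIT_DISABLE,
		INHIBIT_HYPERV,
		INHIBIT_NESTED,
	};

	#define BIT(n) (1UL << (n))

	/* Per-vendor "is this inhibit reason relevant?" check, analogous to
	 * the avic/vmx_check_apicv_inhibit_reasons() callbacks below. */
	static bool check_inhibit(enum apicv_inhibit reason)
	{
		unsigned long supported = BIT(INHIBIT_DISABLE) |
					  BIT(INHIBIT_HYPERV);

		return supported & BIT(reason);
	}

	int main(void)
	{
		printf("hyperv supported: %d\n", check_inhibit(INHIBIT_HYPERV));
		printf("nested supported: %d\n", check_inhibit(INHIBIT_NESTED));
		return 0;
	}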
Signed-off-by: Sean Christopherson Message-Id: <20220311043517.17027-2-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 25 +++++++++++++------------ arch/x86/kvm/svm/avic.c | 4 ++-- arch/x86/kvm/svm/svm.h | 2 +- arch/x86/kvm/trace.h | 12 ++++++------ arch/x86/kvm/vmx/vmx.c | 4 ++-- arch/x86/kvm/x86.c | 19 +++++++++++-------- 6 files changed, 35 insertions(+), 31 deletions(-) (limited to 'arch') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index a2224037d232..82d1493b56f5 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1042,14 +1042,16 @@ struct kvm_x86_msr_filter { struct msr_bitmap_range ranges[16]; }; -#define APICV_INHIBIT_REASON_DISABLE 0 -#define APICV_INHIBIT_REASON_HYPERV 1 -#define APICV_INHIBIT_REASON_NESTED 2 -#define APICV_INHIBIT_REASON_IRQWIN 3 -#define APICV_INHIBIT_REASON_PIT_REINJ 4 -#define APICV_INHIBIT_REASON_X2APIC 5 -#define APICV_INHIBIT_REASON_BLOCKIRQ 6 -#define APICV_INHIBIT_REASON_ABSENT 7 +enum kvm_apicv_inhibit { + APICV_INHIBIT_REASON_DISABLE, + APICV_INHIBIT_REASON_HYPERV, + APICV_INHIBIT_REASON_NESTED, + APICV_INHIBIT_REASON_IRQWIN, + APICV_INHIBIT_REASON_PIT_REINJ, + APICV_INHIBIT_REASON_X2APIC, + APICV_INHIBIT_REASON_BLOCKIRQ, + APICV_INHIBIT_REASON_ABSENT, +}; struct kvm_arch { unsigned long n_used_mmu_pages; @@ -1403,7 +1405,7 @@ struct kvm_x86_ops { void (*enable_nmi_window)(struct kvm_vcpu *vcpu); void (*enable_irq_window)(struct kvm_vcpu *vcpu); void (*update_cr8_intercept)(struct kvm_vcpu *vcpu, int tpr, int irr); - bool (*check_apicv_inhibit_reasons)(ulong bit); + bool (*check_apicv_inhibit_reasons)(enum kvm_apicv_inhibit reason); void (*refresh_apicv_exec_ctrl)(struct kvm_vcpu *vcpu); void (*hwapic_irr_update)(struct kvm_vcpu *vcpu, int max_irr); void (*hwapic_isr_update)(struct kvm_vcpu *vcpu, int isr); @@ -1798,10 +1800,9 @@ gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva, bool kvm_apicv_activated(struct kvm *kvm); void kvm_vcpu_update_apicv(struct kvm_vcpu *vcpu); void kvm_request_apicv_update(struct kvm *kvm, bool activate, - unsigned long bit); - + enum kvm_apicv_inhibit reason); void __kvm_request_apicv_update(struct kvm *kvm, bool activate, - unsigned long bit); + enum kvm_apicv_inhibit reason); int kvm_emulate_hypercall(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c index 0df0c3ad08f9..a1cf9c31273b 100644 --- a/arch/x86/kvm/svm/avic.c +++ b/arch/x86/kvm/svm/avic.c @@ -828,7 +828,7 @@ out: return ret; } -bool avic_check_apicv_inhibit_reasons(ulong bit) +bool avic_check_apicv_inhibit_reasons(enum kvm_apicv_inhibit reason) { ulong supported = BIT(APICV_INHIBIT_REASON_DISABLE) | BIT(APICV_INHIBIT_REASON_ABSENT) | @@ -839,7 +839,7 @@ bool avic_check_apicv_inhibit_reasons(ulong bit) BIT(APICV_INHIBIT_REASON_X2APIC) | BIT(APICV_INHIBIT_REASON_BLOCKIRQ); - return supported & BIT(bit); + return supported & BIT(reason); } diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index f01bf487babd..55376816a726 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -594,7 +594,7 @@ void __avic_vcpu_put(struct kvm_vcpu *vcpu); void avic_apicv_post_state_restore(struct kvm_vcpu *vcpu); void avic_set_virtual_apic_mode(struct kvm_vcpu *vcpu); void avic_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu); -bool avic_check_apicv_inhibit_reasons(ulong bit); +bool avic_check_apicv_inhibit_reasons(enum kvm_apicv_inhibit reason); void avic_hwapic_irr_update(struct kvm_vcpu *vcpu, int 
max_irr); void avic_hwapic_isr_update(struct kvm_vcpu *vcpu, int max_isr); bool avic_dy_apicv_has_pending_interrupt(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h index 193f5ba930d1..cf3e4838c86a 100644 --- a/arch/x86/kvm/trace.h +++ b/arch/x86/kvm/trace.h @@ -1340,22 +1340,22 @@ TRACE_EVENT(kvm_hv_stimer_cleanup, ); TRACE_EVENT(kvm_apicv_update_request, - TP_PROTO(bool activate, unsigned long bit), - TP_ARGS(activate, bit), + TP_PROTO(bool activate, int reason), + TP_ARGS(activate, reason), TP_STRUCT__entry( __field(bool, activate) - __field(unsigned long, bit) + __field(int, reason) ), TP_fast_assign( __entry->activate = activate; - __entry->bit = bit; + __entry->reason = reason; ), - TP_printk("%s bit=%lu", + TP_printk("%s reason=%u", __entry->activate ? "activate" : "deactivate", - __entry->bit) + __entry->reason) ); TRACE_EVENT(kvm_apicv_accept_irq, diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index e8963f5af618..fb8d5b6d05f7 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -7705,14 +7705,14 @@ static void vmx_hardware_unsetup(void) free_kvm_area(); } -static bool vmx_check_apicv_inhibit_reasons(ulong bit) +static bool vmx_check_apicv_inhibit_reasons(enum kvm_apicv_inhibit reason) { ulong supported = BIT(APICV_INHIBIT_REASON_DISABLE) | BIT(APICV_INHIBIT_REASON_ABSENT) | BIT(APICV_INHIBIT_REASON_HYPERV) | BIT(APICV_INHIBIT_REASON_BLOCKIRQ); - return supported & BIT(bit); + return supported & BIT(reason); } static struct kvm_x86_ops vmx_x86_ops __initdata = { diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index a43026bac138..143815142678 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -9744,24 +9744,25 @@ out: } EXPORT_SYMBOL_GPL(kvm_vcpu_update_apicv); -void __kvm_request_apicv_update(struct kvm *kvm, bool activate, ulong bit) +void __kvm_request_apicv_update(struct kvm *kvm, bool activate, + enum kvm_apicv_inhibit reason) { unsigned long old, new; lockdep_assert_held_write(&kvm->arch.apicv_update_lock); - if (!static_call(kvm_x86_check_apicv_inhibit_reasons)(bit)) + if (!static_call(kvm_x86_check_apicv_inhibit_reasons)(reason)) return; old = new = kvm->arch.apicv_inhibit_reasons; if (activate) - __clear_bit(bit, &new); + __clear_bit(reason, &new); else - __set_bit(bit, &new); + __set_bit(reason, &new); if (!!old != !!new) { - trace_kvm_apicv_update_request(activate, bit); + trace_kvm_apicv_update_request(activate, reason); /* * Kick all vCPUs before setting apicv_inhibit_reasons to avoid * false positives in the sanity check WARN in svm_vcpu_run(). 
@@ -9780,17 +9781,19 @@ void __kvm_request_apicv_update(struct kvm *kvm, bool activate, ulong bit) unsigned long gfn = gpa_to_gfn(APIC_DEFAULT_PHYS_BASE); kvm_zap_gfn_range(kvm, gfn, gfn+1); } - } else + } else { kvm->arch.apicv_inhibit_reasons = new; + } } -void kvm_request_apicv_update(struct kvm *kvm, bool activate, ulong bit) +void kvm_request_apicv_update(struct kvm *kvm, bool activate, + enum kvm_apicv_inhibit reason) { if (!enable_apicv) return; down_write(&kvm->arch.apicv_update_lock); - __kvm_request_apicv_update(kvm, activate, bit); + __kvm_request_apicv_update(kvm, activate, reason); up_write(&kvm->arch.apicv_update_lock); } EXPORT_SYMBOL_GPL(kvm_request_apicv_update); -- cgit v1.2.3 From 320af55a930f30ba49d7cd663280d46705e11383 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 11 Mar 2022 04:35:16 +0000 Subject: KVM: x86: Add wrappers for setting/clearing APICv inhibits Add set/clear wrappers for toggling APICv inhibits to make the call sites more readable, and opportunistically rename the inner helpers to align with the new wrappers and to make them more readable as well. Invert the flag from "activate" to "set"; activate is painfully ambiguous as it's not obvious if the inhibit is being activated, or if APICv is being activated, in which case the inhibit is being deactivated. For the functions that take @set, swap the order of the inhibit reason and @set so that the call sites are visually similar to those that bounce through the wrapper. Signed-off-by: Sean Christopherson Message-Id: <20220311043517.17027-3-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 20 ++++++++++++++++---- arch/x86/kvm/hyperv.c | 10 +++++++--- arch/x86/kvm/i8254.c | 6 ++---- arch/x86/kvm/svm/svm.c | 11 +++++------ arch/x86/kvm/trace.h | 8 ++++---- arch/x86/kvm/x86.c | 30 +++++++++++++++--------------- 6 files changed, 49 insertions(+), 36 deletions(-) (limited to 'arch') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 82d1493b56f5..93a2671a57cf 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1799,10 +1799,22 @@ gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva, bool kvm_apicv_activated(struct kvm *kvm); void kvm_vcpu_update_apicv(struct kvm_vcpu *vcpu); -void kvm_request_apicv_update(struct kvm *kvm, bool activate, - enum kvm_apicv_inhibit reason); -void __kvm_request_apicv_update(struct kvm *kvm, bool activate, - enum kvm_apicv_inhibit reason); +void __kvm_set_or_clear_apicv_inhibit(struct kvm *kvm, + enum kvm_apicv_inhibit reason, bool set); +void kvm_set_or_clear_apicv_inhibit(struct kvm *kvm, + enum kvm_apicv_inhibit reason, bool set); + +static inline void kvm_set_apicv_inhibit(struct kvm *kvm, + enum kvm_apicv_inhibit reason) +{ + kvm_set_or_clear_apicv_inhibit(kvm, reason, true); +} + +static inline void kvm_clear_apicv_inhibit(struct kvm *kvm, + enum kvm_apicv_inhibit reason) +{ + kvm_set_or_clear_apicv_inhibit(kvm, reason, false); +} int kvm_emulate_hypercall(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index 4177c17a26bf..123b677111c5 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -122,9 +122,13 @@ static void synic_update_vector(struct kvm_vcpu_hv_synic *synic, else hv->synic_auto_eoi_used--; - __kvm_request_apicv_update(vcpu->kvm, - !hv->synic_auto_eoi_used, - APICV_INHIBIT_REASON_HYPERV); + /* + * Inhibit APICv if any vCPU is using SynIC's AutoEOI, which relies on + * the hypervisor to manually 
inject IRQs. + */ + __kvm_set_or_clear_apicv_inhibit(vcpu->kvm, + APICV_INHIBIT_REASON_HYPERV, + !!hv->synic_auto_eoi_used); up_write(&vcpu->kvm->arch.apicv_update_lock); } diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c index 0b65a764ed3a..1c83076091af 100644 --- a/arch/x86/kvm/i8254.c +++ b/arch/x86/kvm/i8254.c @@ -305,15 +305,13 @@ void kvm_pit_set_reinject(struct kvm_pit *pit, bool reinject) * So, deactivate APICv when PIT is in reinject mode. */ if (reinject) { - kvm_request_apicv_update(kvm, false, - APICV_INHIBIT_REASON_PIT_REINJ); + kvm_set_apicv_inhibit(kvm, APICV_INHIBIT_REASON_PIT_REINJ); /* The initial state is preserved while ps->reinject == 0. */ kvm_pit_reset_reinject(pit); kvm_register_irq_ack_notifier(kvm, &ps->irq_ack_notifier); kvm_register_irq_mask_notifier(kvm, 0, &pit->mask_notifier); } else { - kvm_request_apicv_update(kvm, true, - APICV_INHIBIT_REASON_PIT_REINJ); + kvm_clear_apicv_inhibit(kvm, APICV_INHIBIT_REASON_PIT_REINJ); kvm_unregister_irq_ack_notifier(kvm, &ps->irq_ack_notifier); kvm_unregister_irq_mask_notifier(kvm, 0, &pit->mask_notifier); } diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 0884c3414a1b..e42586aff5c4 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -2918,7 +2918,7 @@ static int interrupt_window_interception(struct kvm_vcpu *vcpu) * In this case AVIC was temporarily disabled for * requesting the IRQ window and we have to re-enable it. */ - kvm_request_apicv_update(vcpu->kvm, true, APICV_INHIBIT_REASON_IRQWIN); + kvm_clear_apicv_inhibit(vcpu->kvm, APICV_INHIBIT_REASON_IRQWIN); ++vcpu->stat.irq_window_exits; return 1; @@ -3516,7 +3516,7 @@ static void svm_enable_irq_window(struct kvm_vcpu *vcpu) * via AVIC. In such case, we need to temporarily disable AVIC, * and fallback to injecting IRQ via V_IRQ. */ - kvm_request_apicv_update(vcpu->kvm, false, APICV_INHIBIT_REASON_IRQWIN); + kvm_set_apicv_inhibit(vcpu->kvm, APICV_INHIBIT_REASON_IRQWIN); svm_set_vintr(svm); } } @@ -3948,6 +3948,7 @@ static void svm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); struct kvm_cpuid_entry2 *best; + struct kvm *kvm = vcpu->kvm; vcpu->arch.xsaves_enabled = guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) && boot_cpu_has(X86_FEATURE_XSAVE) && @@ -3974,16 +3975,14 @@ static void svm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) * is exposed to the guest, disable AVIC. */ if (guest_cpuid_has(vcpu, X86_FEATURE_X2APIC)) - kvm_request_apicv_update(vcpu->kvm, false, - APICV_INHIBIT_REASON_X2APIC); + kvm_set_apicv_inhibit(kvm, APICV_INHIBIT_REASON_X2APIC); /* * Currently, AVIC does not work with nested virtualization. * So, we disable AVIC when cpuid for SVM is set in the L1 guest. 
*/ if (nested && guest_cpuid_has(vcpu, X86_FEATURE_SVM)) - kvm_request_apicv_update(vcpu->kvm, false, - APICV_INHIBIT_REASON_NESTED); + kvm_set_apicv_inhibit(kvm, APICV_INHIBIT_REASON_NESTED); } init_vmcb_after_set_cpuid(vcpu); } diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h index cf3e4838c86a..105037a251b5 100644 --- a/arch/x86/kvm/trace.h +++ b/arch/x86/kvm/trace.h @@ -1340,17 +1340,17 @@ TRACE_EVENT(kvm_hv_stimer_cleanup, ); TRACE_EVENT(kvm_apicv_update_request, - TP_PROTO(bool activate, int reason), - TP_ARGS(activate, reason), + TP_PROTO(int reason, bool activate), + TP_ARGS(reason, activate), TP_STRUCT__entry( - __field(bool, activate) __field(int, reason) + __field(bool, activate) ), TP_fast_assign( - __entry->activate = activate; __entry->reason = reason; + __entry->activate = activate; ), TP_printk("%s reason=%u", diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 143815142678..ede81264c06f 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -5938,7 +5938,7 @@ int kvm_vm_ioctl_enable_cap(struct kvm *kvm, smp_wmb(); kvm->arch.irqchip_mode = KVM_IRQCHIP_SPLIT; kvm->arch.nr_reserved_ioapic_pins = cap->args[0]; - kvm_request_apicv_update(kvm, true, APICV_INHIBIT_REASON_ABSENT); + kvm_clear_apicv_inhibit(kvm, APICV_INHIBIT_REASON_ABSENT); r = 0; split_irqchip_unlock: mutex_unlock(&kvm->lock); @@ -6335,7 +6335,7 @@ set_identity_unlock: /* Write kvm->irq_routing before enabling irqchip_in_kernel. */ smp_wmb(); kvm->arch.irqchip_mode = KVM_IRQCHIP_KERNEL; - kvm_request_apicv_update(kvm, true, APICV_INHIBIT_REASON_ABSENT); + kvm_clear_apicv_inhibit(kvm, APICV_INHIBIT_REASON_ABSENT); create_irqchip_unlock: mutex_unlock(&kvm->lock); break; @@ -9744,8 +9744,8 @@ out: } EXPORT_SYMBOL_GPL(kvm_vcpu_update_apicv); -void __kvm_request_apicv_update(struct kvm *kvm, bool activate, - enum kvm_apicv_inhibit reason) +void __kvm_set_or_clear_apicv_inhibit(struct kvm *kvm, + enum kvm_apicv_inhibit reason, bool set) { unsigned long old, new; @@ -9756,13 +9756,13 @@ void __kvm_request_apicv_update(struct kvm *kvm, bool activate, old = new = kvm->arch.apicv_inhibit_reasons; - if (activate) - __clear_bit(reason, &new); - else + if (set) __set_bit(reason, &new); + else + __clear_bit(reason, &new); if (!!old != !!new) { - trace_kvm_apicv_update_request(activate, reason); + trace_kvm_apicv_update_request(reason, !set); /* * Kick all vCPUs before setting apicv_inhibit_reasons to avoid * false positives in the sanity check WARN in svm_vcpu_run(). 
@@ -9786,17 +9786,17 @@ void __kvm_request_apicv_update(struct kvm *kvm, bool activate, } } -void kvm_request_apicv_update(struct kvm *kvm, bool activate, - enum kvm_apicv_inhibit reason) +void kvm_set_or_clear_apicv_inhibit(struct kvm *kvm, + enum kvm_apicv_inhibit reason, bool set) { if (!enable_apicv) return; down_write(&kvm->arch.apicv_update_lock); - __kvm_request_apicv_update(kvm, activate, reason); + __kvm_set_or_clear_apicv_inhibit(kvm, reason, set); up_write(&kvm->arch.apicv_update_lock); } -EXPORT_SYMBOL_GPL(kvm_request_apicv_update); +EXPORT_SYMBOL_GPL(kvm_set_or_clear_apicv_inhibit); static void vcpu_scan_ioapic(struct kvm_vcpu *vcpu) { @@ -10944,7 +10944,7 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, static void kvm_arch_vcpu_guestdbg_update_apicv_inhibit(struct kvm *kvm) { - bool inhibit = false; + bool set = false; struct kvm_vcpu *vcpu; unsigned long i; @@ -10952,11 +10952,11 @@ static void kvm_arch_vcpu_guestdbg_update_apicv_inhibit(struct kvm *kvm) kvm_for_each_vcpu(i, vcpu, kvm) { if (vcpu->guest_debug & KVM_GUESTDBG_BLOCKIRQ) { - inhibit = true; + set = true; break; } } - __kvm_request_apicv_update(kvm, !inhibit, APICV_INHIBIT_REASON_BLOCKIRQ); + __kvm_set_or_clear_apicv_inhibit(kvm, APICV_INHIBIT_REASON_BLOCKIRQ, set); up_write(&kvm->arch.apicv_update_lock); } -- cgit v1.2.3 From 4f4c4a3ee53cc20569158a05f99b6c2d1f9c998a Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 11 Mar 2022 04:35:17 +0000 Subject: KVM: x86: Trace all APICv inhibit changes and capture overall status Trace all APICv inhibit changes instead of just those that result in APICv being (un)inhibited, and log the current state. Debugging why APICv isn't working is frustrating as it's hard to see why APICv is still inhibited, and logging only the first inhibition means unnecessary onion peeling. Opportunistically drop the export of the tracepoint; it is not and should not be used by vendor code due to the need to serialize toggling via apicv_update_lock. Note, using the common flow means kvm_apicv_init() switched from atomic to non-atomic bitwise operations. The VM is unreachable at init, so non-atomic is perfectly ok. Signed-off-by: Sean Christopherson Message-Id: <20220311043517.17027-4-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/trace.h | 18 ++++++++++-------- arch/x86/kvm/x86.c | 29 +++++++++++++++++++---------- 2 files changed, 29 insertions(+), 18 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h index 105037a251b5..e3a24b8f04be 100644 --- a/arch/x86/kvm/trace.h +++ b/arch/x86/kvm/trace.h @@ -1339,23 +1339,25 @@ TRACE_EVENT(kvm_hv_stimer_cleanup, __entry->vcpu_id, __entry->timer_index) ); -TRACE_EVENT(kvm_apicv_update_request, - TP_PROTO(int reason, bool activate), - TP_ARGS(reason, activate), +TRACE_EVENT(kvm_apicv_inhibit_changed, + TP_PROTO(int reason, bool set, unsigned long inhibits), + TP_ARGS(reason, set, inhibits), TP_STRUCT__entry( __field(int, reason) - __field(bool, activate) + __field(bool, set) + __field(unsigned long, inhibits) ), TP_fast_assign( __entry->reason = reason; - __entry->activate = activate; + __entry->set = set; + __entry->inhibits = inhibits; ), - TP_printk("%s reason=%u", - __entry->activate ? "activate" : "deactivate", - __entry->reason) + TP_printk("%s reason=%u, inhibits=0x%lx", + __entry->set ?
"set" : "cleared", + __entry->reason, __entry->inhibits) ); TRACE_EVENT(kvm_apicv_accept_irq, diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index ede81264c06f..129003ad3784 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -9062,15 +9062,29 @@ bool kvm_apicv_activated(struct kvm *kvm) } EXPORT_SYMBOL_GPL(kvm_apicv_activated); + +static void set_or_clear_apicv_inhibit(unsigned long *inhibits, + enum kvm_apicv_inhibit reason, bool set) +{ + if (set) + __set_bit(reason, inhibits); + else + __clear_bit(reason, inhibits); + + trace_kvm_apicv_inhibit_changed(reason, set, *inhibits); +} + static void kvm_apicv_init(struct kvm *kvm) { + unsigned long *inhibits = &kvm->arch.apicv_inhibit_reasons; + init_rwsem(&kvm->arch.apicv_update_lock); - set_bit(APICV_INHIBIT_REASON_ABSENT, - &kvm->arch.apicv_inhibit_reasons); + set_or_clear_apicv_inhibit(inhibits, APICV_INHIBIT_REASON_ABSENT, true); + if (!enable_apicv) - set_bit(APICV_INHIBIT_REASON_DISABLE, - &kvm->arch.apicv_inhibit_reasons); + set_or_clear_apicv_inhibit(inhibits, + APICV_INHIBIT_REASON_ABSENT, true); } static void kvm_sched_yield(struct kvm_vcpu *vcpu, unsigned long dest_id) @@ -9756,13 +9770,9 @@ void __kvm_set_or_clear_apicv_inhibit(struct kvm *kvm, old = new = kvm->arch.apicv_inhibit_reasons; - if (set) - __set_bit(reason, &new); - else - __clear_bit(reason, &new); + set_or_clear_apicv_inhibit(&new, reason, set); if (!!old != !!new) { - trace_kvm_apicv_update_request(reason, !set); /* * Kick all vCPUs before setting apicv_inhibit_reasons to avoid * false positives in the sanity check WARN in svm_vcpu_run(). @@ -12945,7 +12955,6 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_pi_irte_update); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_unaccelerated_access); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_incomplete_ipi); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_ga_log); -EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_apicv_update_request); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_apicv_accept_irq); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_vmgexit_enter); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_vmgexit_exit); -- cgit v1.2.3 From 9b026073db2f1ad0e4d8b61c83316c8497981037 Mon Sep 17 00:00:00 2001 From: Jim Mattson Date: Sat, 26 Feb 2022 15:41:31 -0800 Subject: KVM: x86/svm: Clear reserved bits written to PerfEvtSeln MSRs AMD EPYC CPUs never raise a #GP for a WRMSR to a PerfEvtSeln MSR. Some reserved bits are cleared, and some are not. Specifically, on Zen3/Milan, bits 19 and 42 are not cleared. When emulating such a WRMSR, KVM should not synthesize a #GP, regardless of which bits are set. However, undocumented bits should not be passed through to the hardware MSR. So, rather than checking for reserved bits and synthesizing a #GP, just clear the reserved bits. This may seem pedantic, but since KVM currently does not support the "Host/Guest Only" bits (41:40), it is necessary to clear these bits rather than synthesizing #GP, because some popular guests (e.g Linux) will set the "Host Only" bit even on CPUs that don't support EFER.SVME, and they don't expect a #GP. 
For example:

root@Ubuntu1804:~# perf stat -e r26 -a sleep 1

 Performance counter stats for 'system wide':

                 0      r26

       1.001070977 seconds time elapsed

Feb 23 03:59:58 Ubuntu1804 kernel: [  405.379957] unchecked MSR access error: WRMSR to 0xc0010200 (tried to write 0x0000020000130026) at rIP: 0xffffffff9b276a28 (native_write_msr+0x8/0x30)
Feb 23 03:59:58 Ubuntu1804 kernel: [  405.379958] Call Trace:
Feb 23 03:59:58 Ubuntu1804 kernel: [  405.379963]  amd_pmu_disable_event+0x27/0x90

Fixes: ca724305a2b0 ("KVM: x86/vPMU: Implement AMD vPMU code for KVM") Reported-by: Lotus Fenn Signed-off-by: Jim Mattson Reviewed-by: Like Xu Reviewed-by: David Dunn Message-Id: <20220226234131.2167175-1-jmattson@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/pmu.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/svm/pmu.c b/arch/x86/kvm/svm/pmu.c index e4163f75c11b..24eb935b6f85 100644 --- a/arch/x86/kvm/svm/pmu.c +++ b/arch/x86/kvm/svm/pmu.c @@ -262,12 +262,10 @@ static int amd_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) /* MSR_EVNTSELn */ pmc = get_gp_pmc_amd(pmu, msr, PMU_TYPE_EVNTSEL); if (pmc) { - if (data == pmc->eventsel) - return 0; - if (!(data & pmu->reserved_bits)) { + data &= ~pmu->reserved_bits; + if (data != pmc->eventsel) reprogram_gp_counter(pmc, data); - return 0; - } + return 0; } return 1; -- cgit v1.2.3 From 5959ff4ae96eece2f0c3dfde5d27bff70ab1ce56 Mon Sep 17 00:00:00 2001 From: Maxim Levitsky Date: Wed, 2 Mar 2022 12:24:57 +0200 Subject: KVM: x86: mmu: trace kvm_mmu_set_spte after the new SPTE was set It makes more sense to print the new SPTE value than the old value. Signed-off-by: Maxim Levitsky Reviewed-by: Sean Christopherson Message-Id: <20220302102457.588450-1-mlevitsk@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index e6ce39a798b8..8f19ea752704 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -2696,8 +2696,8 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, struct kvm_memory_slot *slot, if (*sptep == spte) { ret = RET_PF_SPURIOUS; } else { - trace_kvm_mmu_set_spte(level, gfn, sptep); flush |= mmu_spte_update(sptep, spte); + trace_kvm_mmu_set_spte(level, gfn, sptep); } if (wrprot) { -- cgit v1.2.3 From e644896f5106aa3f6d7e8c7adf2e4dc0fce53555 Mon Sep 17 00:00:00 2001 From: Like Xu Date: Wed, 9 Mar 2022 16:42:57 +0800 Subject: KVM: x86/pmu: Fix and isolate TSX-specific performance event logic HSW_IN_TX* bits, which are not supported on AMD, are used in generic code. Worse, these bits overlap with AMD EventSelect[11:8] and hence using HSW_IN_TX* bits unconditionally in generic code is resulting in unintentional PMU behavior on AMD. For example, if EventSelect[11:8] is 0x2, pmc_reprogram_counter() wrongly assumes that HSW_IN_TX_CHECKPOINTED is set and thus forces the sampling period to be 0. Also per the SDM, both bits 32 and 33 "may only be set if the processor supports HLE or RTM" and for "IN_TXCP (bit 33): this bit may only be set for IA32_PERFEVTSEL2." Opportunistically eliminate code redundancy, because if the HSW_IN_TX* bit is set in pmc->eventsel, it is already set in attr.config.
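The aliasing is easy to see in a few lines of standalone C (a sketch, not part of the patch; the HSW_IN_TX* values mirror the kernel's perf_event.h definitions, and the 0x2/0x76 event encoding is made up for illustration):

    #include <stdint.h>
    #include <stdio.h>

    #define HSW_IN_TX               (1ULL << 32)    /* Intel-only TSX bit */
    #define HSW_IN_TX_CHECKPOINTED  (1ULL << 33)    /* Intel-only TSX bit */

    int main(void)
    {
            /* AMD eventsel with EventSelect[11:8] = 0x2, i.e. MSR bits 35:32. */
            uint64_t amd_eventsel = (0x2ULL << 32) | 0x76;

            /* Pre-patch generic code misread the AMD event select bits: */
            if (amd_eventsel & HSW_IN_TX_CHECKPOINTED)
                    printf("sample_period wrongly forced to 0\n");
            return 0;
    }

In other words, a perfectly valid AMD event select trips the Intel-only check, which is why the patch gates the HSW_IN_TX* handling on guest_cpuid_is_intel().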
Reported-by: Ravi Bangoria Reported-by: Jim Mattson Fixes: 103af0a98788 ("perf, kvm: Support the in_tx/in_tx_cp modifiers in KVM arch perfmon emulation v5") Co-developed-by: Ravi Bangoria Signed-off-by: Ravi Bangoria Signed-off-by: Like Xu Message-Id: <20220309084257.88931-1-likexu@tencent.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/pmu.c | 15 +++++---------- arch/x86/kvm/vmx/pmu_intel.c | 13 ++++++++++--- 2 files changed, 15 insertions(+), 13 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index 902b6d700215..eca39f56c231 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -96,8 +96,7 @@ static void kvm_perf_overflow(struct perf_event *perf_event, static void pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type, u64 config, bool exclude_user, - bool exclude_kernel, bool intr, - bool in_tx, bool in_tx_cp) + bool exclude_kernel, bool intr) { struct perf_event *event; struct perf_event_attr attr = { @@ -116,16 +115,14 @@ static void pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type, attr.sample_period = get_sample_period(pmc, pmc->counter); - if (in_tx) - attr.config |= HSW_IN_TX; - if (in_tx_cp) { + if ((attr.config & HSW_IN_TX_CHECKPOINTED) && + guest_cpuid_is_intel(pmc->vcpu)) { /* * HSW_IN_TX_CHECKPOINTED is not supported with nonzero * period. Just clear the sample period so at least * allocating the counter doesn't fail. */ attr.sample_period = 0; - attr.config |= HSW_IN_TX_CHECKPOINTED; } event = perf_event_create_kernel_counter(&attr, -1, current, @@ -233,9 +230,7 @@ void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel) pmc_reprogram_counter(pmc, type, config, !(eventsel & ARCH_PERFMON_EVENTSEL_USR), !(eventsel & ARCH_PERFMON_EVENTSEL_OS), - eventsel & ARCH_PERFMON_EVENTSEL_INT, - (eventsel & HSW_IN_TX), - (eventsel & HSW_IN_TX_CHECKPOINTED)); + eventsel & ARCH_PERFMON_EVENTSEL_INT); } EXPORT_SYMBOL_GPL(reprogram_gp_counter); @@ -271,7 +266,7 @@ void reprogram_fixed_counter(struct kvm_pmc *pmc, u8 ctrl, int idx) kvm_x86_ops.pmu_ops->pmc_perf_hw_id(pmc), !(en_field & 0x2), /* exclude user */ !(en_field & 0x1), /* exclude kernel */ - pmi, false, false); + pmi); } EXPORT_SYMBOL_GPL(reprogram_fixed_counter); diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c index da71160a50d6..efa172a7278e 100644 --- a/arch/x86/kvm/vmx/pmu_intel.c +++ b/arch/x86/kvm/vmx/pmu_intel.c @@ -389,6 +389,7 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) struct kvm_pmc *pmc; u32 msr = msr_info->index; u64 data = msr_info->data; + u64 reserved_bits; switch (msr) { case MSR_CORE_PERF_FIXED_CTR_CTRL: @@ -443,7 +444,11 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) } else if ((pmc = get_gp_pmc(pmu, msr, MSR_P6_EVNTSEL0))) { if (data == pmc->eventsel) return 0; - if (!(data & pmu->reserved_bits)) { + reserved_bits = pmu->reserved_bits; + if ((pmc->idx == 2) && + (pmu->raw_event_mask & HSW_IN_TX_CHECKPOINTED)) + reserved_bits ^= HSW_IN_TX_CHECKPOINTED; + if (!(data & reserved_bits)) { reprogram_gp_counter(pmc, data); return 0; } @@ -534,8 +539,10 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu) entry = kvm_find_cpuid_entry(vcpu, 7, 0); if (entry && (boot_cpu_has(X86_FEATURE_HLE) || boot_cpu_has(X86_FEATURE_RTM)) && - (entry->ebx & (X86_FEATURE_HLE|X86_FEATURE_RTM))) - pmu->reserved_bits ^= HSW_IN_TX|HSW_IN_TX_CHECKPOINTED; + (entry->ebx & (X86_FEATURE_HLE|X86_FEATURE_RTM))) { + pmu->reserved_bits ^= HSW_IN_TX; + pmu->raw_event_mask |= 
(HSW_IN_TX|HSW_IN_TX_CHECKPOINTED); + } bitmap_set(pmu->all_valid_pmc_idx, 0, pmu->nr_arch_gp_counters); -- cgit v1.2.3 From a836839cbfe60dc434c5476a7429cf2bae36415d Mon Sep 17 00:00:00 2001 From: Hou Wenlong Date: Wed, 2 Mar 2022 21:15:14 +0800 Subject: KVM: x86/emulator: Emulate RDPID only if it is enabled in guest When RDTSCP is supported but RDPID is not supported in the host, RDPID emulation is available. However, __kvm_get_msr() would only fail when both RDTSCP and RDPID are disabled in the guest, so the emulator wouldn't inject a #UD when RDPID is disabled but RDTSCP is enabled in the guest. Fixes: fb6d4d340e05 ("KVM: x86: emulate RDPID") Signed-off-by: Hou Wenlong Message-Id: <1dfd46ae5b76d3ed87bde3154d51c64ea64c99c1.1646226788.git.houwenlong.hwl@antgroup.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/emulate.c | 4 +++- arch/x86/kvm/kvm_emulate.h | 1 + arch/x86/kvm/x86.c | 6 ++++++ 3 files changed, 10 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index a60b4d20b309..c9f87c079d88 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -3521,8 +3521,10 @@ static int em_rdpid(struct x86_emulate_ctxt *ctxt) { u64 tsc_aux = 0; - if (ctxt->ops->get_msr(ctxt, MSR_TSC_AUX, &tsc_aux)) + if (!ctxt->ops->guest_has_rdpid(ctxt)) return emulate_ud(ctxt); + + ctxt->ops->get_msr(ctxt, MSR_TSC_AUX, &tsc_aux); ctxt->dst.val = tsc_aux; return X86EMUL_CONTINUE; } diff --git a/arch/x86/kvm/kvm_emulate.h b/arch/x86/kvm/kvm_emulate.h index 39eded2426ff..a2a7654d8ace 100644 --- a/arch/x86/kvm/kvm_emulate.h +++ b/arch/x86/kvm/kvm_emulate.h @@ -226,6 +226,7 @@ struct x86_emulate_ops { bool (*guest_has_long_mode)(struct x86_emulate_ctxt *ctxt); bool (*guest_has_movbe)(struct x86_emulate_ctxt *ctxt); bool (*guest_has_fxsr)(struct x86_emulate_ctxt *ctxt); + bool (*guest_has_rdpid)(struct x86_emulate_ctxt *ctxt); void (*set_nmi_mask)(struct x86_emulate_ctxt *ctxt, bool masked); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 129003ad3784..2c131125774d 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -7728,6 +7728,11 @@ static bool emulator_guest_has_fxsr(struct x86_emulate_ctxt *ctxt) return guest_cpuid_has(emul_to_vcpu(ctxt), X86_FEATURE_FXSR); } +static bool emulator_guest_has_rdpid(struct x86_emulate_ctxt *ctxt) +{ + return guest_cpuid_has(emul_to_vcpu(ctxt), X86_FEATURE_RDPID); +} + static ulong emulator_read_gpr(struct x86_emulate_ctxt *ctxt, unsigned reg) { return kvm_register_read_raw(emul_to_vcpu(ctxt), reg); @@ -7810,6 +7815,7 @@ static const struct x86_emulate_ops emulate_ops = { .guest_has_long_mode = emulator_guest_has_long_mode, .guest_has_movbe = emulator_guest_has_movbe, .guest_has_fxsr = emulator_guest_has_fxsr, + .guest_has_rdpid = emulator_guest_has_rdpid, .set_nmi_mask = emulator_set_nmi_mask, .get_hflags = emulator_get_hflags, .exiting_smm = emulator_exiting_smm, -- cgit v1.2.3 From ac8d6cad3c7b39633d5899dc2fa9abec7135e83e Mon Sep 17 00:00:00 2001 From: Hou Wenlong Date: Mon, 7 Mar 2022 20:26:33 +0800 Subject: KVM: x86: Only do MSR filtering when access MSR by rdmsr/wrmsr If MSR access is rejected by MSR filtering, kvm_set_msr()/kvm_get_msr() would return KVM_MSR_RET_FILTERED, and the return value is only handled well for rdmsr/wrmsr. However, some instruction emulation and state transitions also use kvm_set_msr()/kvm_get_msr() to do MSR accesses, but they may trigger unexpected results if the access is rejected, e.g.
RDPID emulation would inject a #UD, but RDPID wouldn't cause an exit when RDPID is supported in hardware and ENABLE_RDTSCP is set. It would also cause failures when loading MSRs at nested entry/exit. Since MSR filtering is based on the MSR bitmap, it is better to do MSR filtering only for rdmsr/wrmsr. Signed-off-by: Hou Wenlong Message-Id: <2b2774154f7532c96a6f04d71c82a8bec7d9e80b.1646655860.git.houwenlong.hwl@antgroup.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/emulate.c | 4 ++-- arch/x86/kvm/kvm_emulate.h | 2 ++ arch/x86/kvm/x86.c | 50 +++++++++++++++++++++++++++++++++------------- 3 files changed, 40 insertions(+), 16 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index c9f87c079d88..be83c9c8482d 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -3625,7 +3625,7 @@ static int em_wrmsr(struct x86_emulate_ctxt *ctxt) msr_data = (u32)reg_read(ctxt, VCPU_REGS_RAX) | ((u64)reg_read(ctxt, VCPU_REGS_RDX) << 32); - r = ctxt->ops->set_msr(ctxt, msr_index, msr_data); + r = ctxt->ops->set_msr_with_filter(ctxt, msr_index, msr_data); if (r == X86EMUL_IO_NEEDED) return r; @@ -3642,7 +3642,7 @@ static int em_rdmsr(struct x86_emulate_ctxt *ctxt) u64 msr_data; int r; - r = ctxt->ops->get_msr(ctxt, msr_index, &msr_data); + r = ctxt->ops->get_msr_with_filter(ctxt, msr_index, &msr_data); if (r == X86EMUL_IO_NEEDED) return r; diff --git a/arch/x86/kvm/kvm_emulate.h b/arch/x86/kvm/kvm_emulate.h index a2a7654d8ace..1cbd46cf71f9 100644 --- a/arch/x86/kvm/kvm_emulate.h +++ b/arch/x86/kvm/kvm_emulate.h @@ -210,6 +210,8 @@ struct x86_emulate_ops { int (*set_dr)(struct x86_emulate_ctxt *ctxt, int dr, ulong value); u64 (*get_smbase)(struct x86_emulate_ctxt *ctxt); void (*set_smbase)(struct x86_emulate_ctxt *ctxt, u64 smbase); + int (*set_msr_with_filter)(struct x86_emulate_ctxt *ctxt, u32 msr_index, u64 data); + int (*get_msr_with_filter)(struct x86_emulate_ctxt *ctxt, u32 msr_index, u64 *pdata); int (*set_msr)(struct x86_emulate_ctxt *ctxt, u32 msr_index, u64 data); int (*get_msr)(struct x86_emulate_ctxt *ctxt, u32 msr_index, u64 *pdata); int (*check_pmc)(struct x86_emulate_ctxt *ctxt, u32 pmc); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 2c131125774d..bc63ba52c4d6 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1748,9 +1748,6 @@ static int __kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data, { struct msr_data msr; - if (!host_initiated && !kvm_msr_allowed(vcpu, index, KVM_MSR_FILTER_WRITE)) - return KVM_MSR_RET_FILTERED; - switch (index) { case MSR_FS_BASE: case MSR_GS_BASE: @@ -1832,9 +1829,6 @@ int __kvm_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data, struct msr_data msr; int ret; - if (!host_initiated && !kvm_msr_allowed(vcpu, index, KVM_MSR_FILTER_READ)) - return KVM_MSR_RET_FILTERED; - switch (index) { case MSR_TSC_AUX: if (!kvm_is_supported_user_return_msr(MSR_TSC_AUX)) @@ -1871,6 +1865,20 @@ static int kvm_get_msr_ignored_check(struct kvm_vcpu *vcpu, return ret; } +static int kvm_get_msr_with_filter(struct kvm_vcpu *vcpu, u32 index, u64 *data) +{ + if (!kvm_msr_allowed(vcpu, index, KVM_MSR_FILTER_READ)) + return KVM_MSR_RET_FILTERED; + return kvm_get_msr_ignored_check(vcpu, index, data, false); +} + +static int kvm_set_msr_with_filter(struct kvm_vcpu *vcpu, u32 index, u64 data) +{ + if (!kvm_msr_allowed(vcpu, index, KVM_MSR_FILTER_WRITE)) + return KVM_MSR_RET_FILTERED; + return kvm_set_msr_ignored_check(vcpu, index, data, false); +} + int kvm_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data) { return
kvm_get_msr_ignored_check(vcpu, index, data, false); @@ -1953,7 +1961,7 @@ int kvm_emulate_rdmsr(struct kvm_vcpu *vcpu) u64 data; int r; - r = kvm_get_msr(vcpu, ecx, &data); + r = kvm_get_msr_with_filter(vcpu, ecx, &data); if (!r) { trace_kvm_msr_read(ecx, data); @@ -1978,7 +1986,7 @@ int kvm_emulate_wrmsr(struct kvm_vcpu *vcpu) u64 data = kvm_read_edx_eax(vcpu); int r; - r = kvm_set_msr(vcpu, ecx, data); + r = kvm_set_msr_with_filter(vcpu, ecx, data); if (!r) { trace_kvm_msr_write(ecx, data); @@ -7631,13 +7639,13 @@ static void emulator_set_segment(struct x86_emulate_ctxt *ctxt, u16 selector, return; } -static int emulator_get_msr(struct x86_emulate_ctxt *ctxt, - u32 msr_index, u64 *pdata) +static int emulator_get_msr_with_filter(struct x86_emulate_ctxt *ctxt, + u32 msr_index, u64 *pdata) { struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); int r; - r = kvm_get_msr(vcpu, msr_index, pdata); + r = kvm_get_msr_with_filter(vcpu, msr_index, pdata); if (r && kvm_msr_user_space(vcpu, msr_index, KVM_EXIT_X86_RDMSR, 0, complete_emulated_rdmsr, r)) { @@ -7648,13 +7656,13 @@ static int emulator_get_msr(struct x86_emulate_ctxt *ctxt, return r; } -static int emulator_set_msr(struct x86_emulate_ctxt *ctxt, - u32 msr_index, u64 data) +static int emulator_set_msr_with_filter(struct x86_emulate_ctxt *ctxt, + u32 msr_index, u64 data) { struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); int r; - r = kvm_set_msr(vcpu, msr_index, data); + r = kvm_set_msr_with_filter(vcpu, msr_index, data); if (r && kvm_msr_user_space(vcpu, msr_index, KVM_EXIT_X86_WRMSR, data, complete_emulated_msr_access, r)) { @@ -7665,6 +7673,18 @@ static int emulator_set_msr(struct x86_emulate_ctxt *ctxt, return r; } +static int emulator_get_msr(struct x86_emulate_ctxt *ctxt, + u32 msr_index, u64 *pdata) +{ + return kvm_get_msr(emul_to_vcpu(ctxt), msr_index, pdata); +} + +static int emulator_set_msr(struct x86_emulate_ctxt *ctxt, + u32 msr_index, u64 data) +{ + return kvm_set_msr(emul_to_vcpu(ctxt), msr_index, data); +} + static u64 emulator_get_smbase(struct x86_emulate_ctxt *ctxt) { struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); @@ -7803,6 +7823,8 @@ static const struct x86_emulate_ops emulate_ops = { .set_dr = emulator_set_dr, .get_smbase = emulator_get_smbase, .set_smbase = emulator_set_smbase, + .set_msr_with_filter = emulator_set_msr_with_filter, + .get_msr_with_filter = emulator_get_msr_with_filter, .set_msr = emulator_set_msr, .get_msr = emulator_get_msr, .check_pmc = emulator_check_pmc, -- cgit v1.2.3 From 0dacc3df898e219fa774f39e5e10d686364e0a27 Mon Sep 17 00:00:00 2001 From: Maxim Levitsky Date: Tue, 22 Mar 2022 19:24:45 +0200 Subject: KVM: x86: SVM: fix avic spec based definitions again Due to a wrong rebase, commit 4a204f7895878 ("KVM: SVM: Allow AVIC support on system w/ physical APIC ID > 255") moved the AVIC spec #defines back to avic.c.
Move them back, and while at it, extend AVIC_DOORBELL_PHYSICAL_ID_MASK to 12 bits as well (it will be used by nested AVIC). Signed-off-by: Maxim Levitsky Message-Id: <20220322172449.235575-5-mlevitsk@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/svm.h | 8 +++++--- arch/x86/kvm/svm/svm.h | 11 ----------- 2 files changed, 5 insertions(+), 14 deletions(-) (limited to 'arch') diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h index 7eb2df5417fb..ab572d8def2b 100644 --- a/arch/x86/include/asm/svm.h +++ b/arch/x86/include/asm/svm.h @@ -222,7 +222,7 @@ struct __attribute__ ((__packed__)) vmcb_control_area { /* AVIC */ -#define AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK (0xFF) +#define AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK (0xFFULL) #define AVIC_LOGICAL_ID_ENTRY_VALID_BIT 31 #define AVIC_LOGICAL_ID_ENTRY_VALID_MASK (1 << 31) @@ -230,9 +230,11 @@ struct __attribute__ ((__packed__)) vmcb_control_area { #define AVIC_PHYSICAL_ID_ENTRY_BACKING_PAGE_MASK (0xFFFFFFFFFFULL << 12) #define AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK (1ULL << 62) #define AVIC_PHYSICAL_ID_ENTRY_VALID_MASK (1ULL << 63) -#define AVIC_PHYSICAL_ID_TABLE_SIZE_MASK (0xFF) +#define AVIC_PHYSICAL_ID_TABLE_SIZE_MASK (0xFFULL) -#define AVIC_DOORBELL_PHYSICAL_ID_MASK (0xFF) +#define AVIC_DOORBELL_PHYSICAL_ID_MASK GENMASK_ULL(11, 0) + +#define VMCB_AVIC_APIC_BAR_MASK 0xFFFFFFFFFF000ULL #define AVIC_UNACCEL_ACCESS_WRITE_MASK 1 #define AVIC_UNACCEL_ACCESS_OFFSET_MASK 0xFF0 diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index 55376816a726..f77a7d2d39dd 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -571,17 +571,6 @@ extern struct kvm_x86_nested_ops svm_nested_ops; /* avic.c */ -#define AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK (0xFF) -#define AVIC_LOGICAL_ID_ENTRY_VALID_BIT 31 -#define AVIC_LOGICAL_ID_ENTRY_VALID_MASK (1 << 31) - -#define AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK GENMASK_ULL(11, 0) -#define AVIC_PHYSICAL_ID_ENTRY_BACKING_PAGE_MASK (0xFFFFFFFFFFULL << 12) -#define AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK (1ULL << 62) -#define AVIC_PHYSICAL_ID_ENTRY_VALID_MASK (1ULL << 63) - -#define VMCB_AVIC_APIC_BAR_MASK 0xFFFFFFFFFF000ULL - int avic_ga_log_notifier(u32 ga_tag); void avic_vm_destroy(struct kvm *kvm); int avic_vm_init(struct kvm *kvm); -- cgit v1.2.3 From bb2aa78e9a90769993d8cefd9920f81355853a98 Mon Sep 17 00:00:00 2001 From: Maxim Levitsky Date: Tue, 22 Mar 2022 19:24:46 +0200 Subject: KVM: x86: SVM: move tsc ratio definitions to svm.h Another piece of the SVM spec that should be in the header file. Signed-off-by: Maxim Levitsky Message-Id: <20220322172449.235575-6-mlevitsk@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/svm.h | 6 ++++++ arch/x86/kvm/svm/svm.c | 15 +++++---------- 2 files changed, 11 insertions(+), 10 deletions(-) (limited to 'arch') diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h index ab572d8def2b..f70a5108d464 100644 --- a/arch/x86/include/asm/svm.h +++ b/arch/x86/include/asm/svm.h @@ -221,6 +221,12 @@ struct __attribute__ ((__packed__)) vmcb_control_area { #define SVM_NESTED_CTL_SEV_ES_ENABLE BIT(2) +#define SVM_TSC_RATIO_RSVD 0xffffff0000000000ULL +#define SVM_TSC_RATIO_MIN 0x0000000000000001ULL +#define SVM_TSC_RATIO_MAX 0x000000ffffffffffULL +#define SVM_TSC_RATIO_DEFAULT 0x0100000000ULL + + /* AVIC */ #define AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK (0xFFULL) #define AVIC_LOGICAL_ID_ENTRY_VALID_BIT 31 diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index
e42586aff5c4..3b8d82a9832c 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -72,10 +72,6 @@ MODULE_DEVICE_TABLE(x86cpu, svm_cpu_id); #define DEBUGCTL_RESERVED_BITS (~(0x3fULL)) -#define TSC_RATIO_RSVD 0xffffff0000000000ULL -#define TSC_RATIO_MIN 0x0000000000000001ULL -#define TSC_RATIO_MAX 0x000000ffffffffffULL - static bool erratum_383_found __read_mostly; u32 msrpm_offsets[MSRPM_OFFSETS] __read_mostly; @@ -87,7 +83,6 @@ u32 msrpm_offsets[MSRPM_OFFSETS] __read_mostly; static uint64_t osvw_len = 4, osvw_status; static DEFINE_PER_CPU(u64, current_tsc_ratio); -#define TSC_RATIO_DEFAULT 0x0100000000ULL static const struct svm_direct_access_msrs { u32 index; /* Index of the MSR */ @@ -480,7 +475,7 @@ static void svm_hardware_disable(void) { /* Make sure we clean up behind us */ if (tsc_scaling) - wrmsrl(MSR_AMD64_TSC_RATIO, TSC_RATIO_DEFAULT); + wrmsrl(MSR_AMD64_TSC_RATIO, SVM_TSC_RATIO_DEFAULT); cpu_svm_disable(); @@ -526,8 +521,8 @@ static int svm_hardware_enable(void) * Set the default value, even if we don't use TSC scaling * to avoid having stale value in the msr */ - wrmsrl(MSR_AMD64_TSC_RATIO, TSC_RATIO_DEFAULT); - __this_cpu_write(current_tsc_ratio, TSC_RATIO_DEFAULT); + wrmsrl(MSR_AMD64_TSC_RATIO, SVM_TSC_RATIO_DEFAULT); + __this_cpu_write(current_tsc_ratio, SVM_TSC_RATIO_DEFAULT); } @@ -2723,7 +2718,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) break; } - if (data & TSC_RATIO_RSVD) + if (data & SVM_TSC_RATIO_RSVD) return 1; svm->tsc_ratio_msr = data; @@ -4765,7 +4760,7 @@ static __init int svm_hardware_setup(void) } else { pr_info("TSC scaling supported\n"); kvm_has_tsc_control = true; - kvm_max_tsc_scaling_ratio = TSC_RATIO_MAX; + kvm_max_tsc_scaling_ratio = SVM_TSC_RATIO_MAX; kvm_tsc_scaling_ratio_frac_bits = 32; } } -- cgit v1.2.3 From f37b735e31f418f9a607ad6ed9fda75482a8d385 Mon Sep 17 00:00:00 2001 From: Maxim Levitsky Date: Tue, 22 Mar 2022 19:24:47 +0200 Subject: kvm: x86: SVM: remove unused defines Remove some unused #defines from svm.c Signed-off-by: Maxim Levitsky Message-Id: <20220322172449.235575-7-mlevitsk@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/svm.c | 8 -------- 1 file changed, 8 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 3b8d82a9832c..cff941989431 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -62,14 +62,6 @@ MODULE_DEVICE_TABLE(x86cpu, svm_cpu_id); #define SEG_TYPE_LDT 2 #define SEG_TYPE_BUSY_TSS16 3 -#define SVM_FEATURE_LBRV (1 << 1) -#define SVM_FEATURE_SVML (1 << 2) -#define SVM_FEATURE_TSC_RATE (1 << 4) -#define SVM_FEATURE_VMCB_CLEAN (1 << 5) -#define SVM_FEATURE_FLUSH_ASID (1 << 6) -#define SVM_FEATURE_DECODE_ASSIST (1 << 7) -#define SVM_FEATURE_PAUSE_FILTER (1 << 10) - #define DEBUGCTL_RESERVED_BITS (~(0x3fULL)) static bool erratum_383_found __read_mostly; -- cgit v1.2.3 From 880993138396f8f0be620c425d08f84490c35251 Mon Sep 17 00:00:00 2001 From: Maxim Levitsky Date: Tue, 22 Mar 2022 19:24:48 +0200 Subject: KVM: x86: SVM: fix tsc scaling when the host doesn't support it It was decided that when TSC scaling is not supported, the virtual MSR_AMD64_TSC_RATIO should still have the default '1.0' value. However, in this case, kvm_max_tsc_scaling_ratio is not set, which breaks various assumptions. Fix this by always calculating kvm_max_tsc_scaling_ratio regardless of host support. For consistency, do the same for VMX.
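For readers unfamiliar with the ratio format, a small sketch (not from the patch) of the fixed-point encoding these assignments rely on: a TSC scaling ratio is an integer with kvm_tsc_scaling_ratio_frac_bits fractional bits, so the default '1.0' multiplier is simply 1 shifted left by that many bits:

    #include <stdint.h>
    #include <stdio.h>

    /* "1.0" in a fixed-point format with frac_bits fractional bits. */
    static uint64_t default_tsc_ratio(unsigned int frac_bits)
    {
            return 1ULL << frac_bits;
    }

    int main(void)
    {
            /* SVM: 8.32 format -> 0x100000000, aka SVM_TSC_RATIO_DEFAULT. */
            printf("SVM: %#llx\n", (unsigned long long)default_tsc_ratio(32));
            /* VMX: 16.48 format -> 0x1000000000000. */
            printf("VMX: %#llx\n", (unsigned long long)default_tsc_ratio(48));
            return 0;
    }

This is why kvm_default_tsc_scaling_ratio can only be computed once kvm_tsc_scaling_ratio_frac_bits is guaranteed to be set, which the patch now does unconditionally.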
Suggested-by: Paolo Bonzini Signed-off-by: Maxim Levitsky Message-Id: <20220322172449.235575-8-mlevitsk@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/svm.c | 4 ++-- arch/x86/kvm/vmx/vmx.c | 7 +++---- arch/x86/kvm/x86.c | 4 +--- 3 files changed, 6 insertions(+), 9 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index cff941989431..bd4c64b362d2 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -4752,10 +4752,10 @@ static __init int svm_hardware_setup(void) } else { pr_info("TSC scaling supported\n"); kvm_has_tsc_control = true; - kvm_max_tsc_scaling_ratio = SVM_TSC_RATIO_MAX; - kvm_tsc_scaling_ratio_frac_bits = 32; } } + kvm_max_tsc_scaling_ratio = SVM_TSC_RATIO_MAX; + kvm_tsc_scaling_ratio_frac_bits = 32; tsc_aux_uret_slot = kvm_add_user_return_msr(MSR_TSC_AUX); diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index fb8d5b6d05f7..1bbd3e97694c 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -7980,12 +7980,11 @@ static __init int hardware_setup(void) if (!enable_apicv) vmx_x86_ops.sync_pir_to_irr = NULL; - if (cpu_has_vmx_tsc_scaling()) { + if (cpu_has_vmx_tsc_scaling()) kvm_has_tsc_control = true; - kvm_max_tsc_scaling_ratio = KVM_VMX_TSC_MULTIPLIER_MAX; - kvm_tsc_scaling_ratio_frac_bits = 48; - } + kvm_max_tsc_scaling_ratio = KVM_VMX_TSC_MULTIPLIER_MAX; + kvm_tsc_scaling_ratio_frac_bits = 48; kvm_has_bus_lock_exit = cpu_has_vmx_bus_lock_detection(); set_bit(0, vmx_vpid_bitmap); /* 0 is reserved for host */ diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index bc63ba52c4d6..498cd4aaa2cb 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -11602,10 +11602,8 @@ int kvm_arch_hardware_setup(void *opaque) u64 max = min(0x7fffffffULL, __scale_tsc(kvm_max_tsc_scaling_ratio, tsc_khz)); kvm_max_guest_tsc_khz = max; - - kvm_default_tsc_scaling_ratio = 1ULL << kvm_tsc_scaling_ratio_frac_bits; } - + kvm_default_tsc_scaling_ratio = 1ULL << kvm_tsc_scaling_ratio_frac_bits; kvm_init_msr_list(); return 0; } -- cgit v1.2.3 From b76edfe91a87a5cca19cdca09bf6d08f08d4d6e9 Mon Sep 17 00:00:00 2001 From: Zhenzhong Duan Date: Fri, 11 Mar 2022 18:26:42 +0800 Subject: KVM: x86: cleanup enter_rmode() vmx_set_efer() sets uret->data, but in fact, if the value of uret->data is ever used, vmx_setup_uret_msrs() will have rewritten it with the value returned by update_transition_efer(). uret->data is consumed if and only if uret->load_into_hardware is true, and vmx_setup_uret_msrs() takes care of (a) updating uret->data before setting uret->load_into_hardware to true and (b) setting uret->load_into_hardware to false if uret->data isn't updated. Opportunistically use "vmx" directly instead of redoing to_vmx(). Signed-off-by: Zhenzhong Duan Message-Id: <20220311102643.807507-2-zhenzhong.duan@intel.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 1bbd3e97694c..2d122416b1e8 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -2866,21 +2866,17 @@ static void enter_rmode(struct kvm_vcpu *vcpu) int vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer) { struct vcpu_vmx *vmx = to_vmx(vcpu); - struct vmx_uret_msr *msr = vmx_find_uret_msr(vmx, MSR_EFER); /* Nothing to do if hardware doesn't support EFER.
*/ - if (!msr) + if (!vmx_find_uret_msr(vmx, MSR_EFER)) return 0; vcpu->arch.efer = efer; - if (efer & EFER_LMA) { - vm_entry_controls_setbit(to_vmx(vcpu), VM_ENTRY_IA32E_MODE); - msr->data = efer; - } else { - vm_entry_controls_clearbit(to_vmx(vcpu), VM_ENTRY_IA32E_MODE); + if (efer & EFER_LMA) + vm_entry_controls_setbit(vmx, VM_ENTRY_IA32E_MODE); + else + vm_entry_controls_clearbit(vmx, VM_ENTRY_IA32E_MODE); - msr->data = efer & ~EFER_LME; - } vmx_setup_uret_msrs(vmx); return 0; } -- cgit v1.2.3 From 4335edbbc1284259b17590f76a21fd4a162b4305 Mon Sep 17 00:00:00 2001 From: Zhenzhong Duan Date: Fri, 11 Mar 2022 18:26:43 +0800 Subject: KVM: x86: Remove redundant vm_entry_controls_clearbit() call When emulating exit from long mode, EFER_LMA is cleared with vmx_set_efer(). This will already unset the VM_ENTRY_IA32E_MODE control bit as requested by SDM, so there is no need to unset VM_ENTRY_IA32E_MODE again in exit_lmode() explicitly. In case EFER isn't supported by hardware, long mode isn't supported, so exit_lmode() cannot be reached. Note that, thanks to the shadow controls mechanism, this change doesn't eliminate vmread or vmwrite. Signed-off-by: Zhenzhong Duan Message-Id: <20220311102643.807507-3-zhenzhong.duan@intel.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 1 - 1 file changed, 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 2d122416b1e8..04d170c4b61e 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -2902,7 +2902,6 @@ static void enter_lmode(struct kvm_vcpu *vcpu) static void exit_lmode(struct kvm_vcpu *vcpu) { - vm_entry_controls_clearbit(to_vmx(vcpu), VM_ENTRY_IA32E_MODE); vmx_set_efer(vcpu, vcpu->arch.efer & ~EFER_LMA); } -- cgit v1.2.3 From 2a8859f373b0a86f0ece8ec8312607eacf12485d Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Tue, 29 Mar 2022 12:56:24 -0400 Subject: KVM: x86/mmu: do compare-and-exchange of gPTE via the user address FNAME(cmpxchg_gpte) is an inefficient mess. It is at least decent if it can go through get_user_pages_fast(), but if it cannot then it tries to use memremap(); that is not just terribly slow, it is also wrong because it assumes that the VM_PFNMAP VMA is contiguous. The right way to do it would be to do the same thing as hva_to_pfn_remapped() does since commit add6a0cd1c5b ("KVM: MMU: try to fix up page faults before giving up", 2016-07-05), using follow_pte() and fixup_user_fault() to determine the correct address to use for memremap(). To do this, one could for example extract hva_to_pfn() for use outside virt/kvm/kvm_main.c. But really there is no reason to do that either, because there is already a perfectly valid address to do the cmpxchg() on, only it is a userspace address. That means doing user_access_begin()/user_access_end() and writing the code in assembly to handle exceptions correctly. Worse, the guest PTE can be 8-byte even on i686 so there is the extra complication of using cmpxchg8b to account for. But at least it is an efficient mess. (Thanks to Linus for suggesting improvement on the inline assembly). 
Reported-by: Qiuhao Li Reported-by: Gaoning Pan Reported-by: Yongkang Jia Reported-by: syzbot+6cde2282daa792c49ab8@syzkaller.appspotmail.com Debugged-by: Tadeusz Struk Tested-by: Maxim Levitsky Cc: stable@vger.kernel.org Fixes: bd53cb35a3e9 ("X86/KVM: Handle PFNs outside of kernel reach when touching GPTEs") Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/paging_tmpl.h | 74 +++++++++++++++++++----------------------- 1 file changed, 34 insertions(+), 40 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h index 8621188b46df..01fee5f67ac3 100644 --- a/arch/x86/kvm/mmu/paging_tmpl.h +++ b/arch/x86/kvm/mmu/paging_tmpl.h @@ -34,9 +34,8 @@ #define PT_HAVE_ACCESSED_DIRTY(mmu) true #ifdef CONFIG_X86_64 #define PT_MAX_FULL_LEVELS PT64_ROOT_MAX_LEVEL - #define CMPXCHG cmpxchg + #define CMPXCHG "cmpxchgq" #else - #define CMPXCHG cmpxchg64 #define PT_MAX_FULL_LEVELS 2 #endif #elif PTTYPE == 32 @@ -52,7 +51,7 @@ #define PT_GUEST_DIRTY_SHIFT PT_DIRTY_SHIFT #define PT_GUEST_ACCESSED_SHIFT PT_ACCESSED_SHIFT #define PT_HAVE_ACCESSED_DIRTY(mmu) true - #define CMPXCHG cmpxchg + #define CMPXCHG "cmpxchgl" #elif PTTYPE == PTTYPE_EPT #define pt_element_t u64 #define guest_walker guest_walkerEPT @@ -65,7 +64,9 @@ #define PT_GUEST_DIRTY_SHIFT 9 #define PT_GUEST_ACCESSED_SHIFT 8 #define PT_HAVE_ACCESSED_DIRTY(mmu) ((mmu)->ept_ad) - #define CMPXCHG cmpxchg64 + #ifdef CONFIG_X86_64 + #define CMPXCHG "cmpxchgq" + #endif #define PT_MAX_FULL_LEVELS PT64_ROOT_MAX_LEVEL #else #error Invalid PTTYPE value @@ -147,43 +148,36 @@ static int FNAME(cmpxchg_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, pt_element_t __user *ptep_user, unsigned index, pt_element_t orig_pte, pt_element_t new_pte) { - int npages; - pt_element_t ret; - pt_element_t *table; - struct page *page; - - npages = get_user_pages_fast((unsigned long)ptep_user, 1, FOLL_WRITE, &page); - if (likely(npages == 1)) { - table = kmap_atomic(page); - ret = CMPXCHG(&table[index], orig_pte, new_pte); - kunmap_atomic(table); - - kvm_release_page_dirty(page); - } else { - struct vm_area_struct *vma; - unsigned long vaddr = (unsigned long)ptep_user & PAGE_MASK; - unsigned long pfn; - unsigned long paddr; - - mmap_read_lock(current->mm); - vma = find_vma_intersection(current->mm, vaddr, vaddr + PAGE_SIZE); - if (!vma || !(vma->vm_flags & VM_PFNMAP)) { - mmap_read_unlock(current->mm); - return -EFAULT; - } - pfn = ((vaddr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; - paddr = pfn << PAGE_SHIFT; - table = memremap(paddr, PAGE_SIZE, MEMREMAP_WB); - if (!table) { - mmap_read_unlock(current->mm); - return -EFAULT; - } - ret = CMPXCHG(&table[index], orig_pte, new_pte); - memunmap(table); - mmap_read_unlock(current->mm); - } + signed char r; - return (ret != orig_pte); + if (!user_access_begin(ptep_user, sizeof(pt_element_t))) + return -EFAULT; + +#ifdef CMPXCHG + asm volatile("1:" LOCK_PREFIX CMPXCHG " %[new], %[ptr]\n" + "setnz %b[r]\n" + "2:" + _ASM_EXTABLE_TYPE_REG(1b, 2b, EX_TYPE_EFAULT_REG, %k[r]) + : [ptr] "+m" (*ptep_user), + [old] "+a" (orig_pte), + [r] "=q" (r) + : [new] "r" (new_pte) + : "memory"); +#else + asm volatile("1:" LOCK_PREFIX "cmpxchg8b %[ptr]\n" + "setnz %b[r]\n" + "2:" + _ASM_EXTABLE_TYPE_REG(1b, 2b, EX_TYPE_EFAULT_REG, %k[r]) + : [ptr] "+m" (*ptep_user), + [old] "+A" (orig_pte), + [r] "=q" (r) + : [new_lo] "b" ((u32)new_pte), + [new_hi] "c" ((u32)(new_pte >> 32)) + : "memory"); +#endif + + user_access_end(); + return r; } static bool FNAME(prefetch_invalid_gpte)(struct kvm_vcpu *vcpu, -- 
cgit v1.2.3 From c15e0ae42c8e5a61e9aca8aac920517cf7b3e94e Mon Sep 17 00:00:00 2001 From: Li RongQing Date: Wed, 9 Mar 2022 16:35:44 +0800 Subject: KVM: x86: fix sending PV IPI If apic_id is less than min, and (max - apic_id) is greater than KVM_IPI_CLUSTER_SIZE, then the third check condition is satisfied but the new apic_id does not fit the bitmask. In this case __send_ipi_mask should send the IPI. This is mostly theoretical, but it can happen if the apic_ids on three iterations of the loop are for example 1, KVM_IPI_CLUSTER_SIZE, 0. Fixes: aaffcfd1e82 ("KVM: X86: Implement PV IPIs in linux guest") Signed-off-by: Li RongQing Message-Id: <1646814944-51801-1-git-send-email-lirongqing@baidu.com> Cc: stable@vger.kernel.org Signed-off-by: Paolo Bonzini --- arch/x86/kernel/kvm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index d77481ecb0d5..ed8a13ac4ab2 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -517,7 +517,7 @@ static void __send_ipi_mask(const struct cpumask *mask, int vector) } else if (apic_id < min && max - apic_id < KVM_IPI_CLUSTER_SIZE) { ipi_bitmap <<= min - apic_id; min = apic_id; - } else if (apic_id < min + KVM_IPI_CLUSTER_SIZE) { + } else if (apic_id > min && apic_id < min + KVM_IPI_CLUSTER_SIZE) { max = apic_id < max ? max : apic_id; } else { ret = kvm_hypercall4(KVM_HC_SEND_IPI, (unsigned long)ipi_bitmap, -- cgit v1.2.3
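To see the off-by-one concretely, here is a standalone sketch (not kernel code) replaying the commit's example; it assumes KVM_IPI_CLUSTER_SIZE == 128, i.e. 2 * BITS_PER_LONG on x86_64, with window base min == 1 after APIC IDs 1 and 128 have been accumulated and APIC ID 0 arriving third:

    #include <stdio.h>

    #define KVM_IPI_CLUSTER_SIZE 128

    int main(void)
    {
            int min = 1;            /* window base after apic_ids 1 and 128 */
            int max = 128;          /* highest apic_id seen so far */
            int apic_id = 0;

            /* apic_id < min, but max - apic_id == 128 is not < 128, so the
             * "shift the window down" branch is not taken either. */

            if (apic_id < min + KVM_IPI_CLUSTER_SIZE)       /* old check */
                    printf("old: apic_id %d wrongly kept; bit %d is below the bitmap\n",
                           apic_id, apic_id - min);

            if (apic_id > min && apic_id < min + KVM_IPI_CLUSTER_SIZE) /* fixed */
                    printf("never reached for apic_id 0\n");
            else
                    printf("new: flush the pending IPIs via hypercall, restart window\n");
            return 0;
    }

With the old check, bit apic_id - min is -1, i.e. a write below the bitmap; the added apic_id > min clause forces the code into the else branch, which issues the KVM_HC_SEND_IPI hypercall for the accumulated bitmap and starts a fresh window at apic_id 0.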