From f1a9761fbb00639c5e73835f7f373ef834bb867f Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Wed, 16 Mar 2022 00:55:37 +0000 Subject: KVM: x86: Allow userspace to opt out of hypercall patching KVM handles the VMCALL/VMMCALL instructions very strangely. Even though both of these instructions really should #UD when executed on the wrong vendor's hardware (i.e. VMCALL on SVM, VMMCALL on VMX), KVM replaces the guest's instruction with the appropriate instruction for the vendor. Nonetheless, older guest kernels without commit c1118b3602c2 ("x86: kvm: use alternatives for VMCALL vs. VMMCALL if kernel text is read-only") do not patch in the appropriate instruction using alternatives, likely motivating KVM's intervention. Add a quirk allowing userspace to opt out of hypercall patching. If the quirk is disabled, KVM synthesizes a #UD in the guest. Signed-off-by: Oliver Upton Message-Id: <20220316005538.2282772-2-oupton@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86/include/asm/kvm_host.h') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 93a2671a57cf..d9a825182b84 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1987,6 +1987,7 @@ int memslot_rmap_alloc(struct kvm_memory_slot *slot, unsigned long npages); KVM_X86_QUIRK_CD_NW_CLEARED | \ KVM_X86_QUIRK_LAPIC_MMIO_HOLE | \ KVM_X86_QUIRK_OUT_7E_INC_RIP | \ - KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT) + KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT | \ + KVM_X86_QUIRK_FIX_HYPERCALL_INSN) #endif /* _ASM_X86_KVM_HOST_H */ -- cgit v1.2.3 From a795cd43c5b5819b8ee94690f22caa5e2e4d8620 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Thu, 3 Mar 2022 15:41:13 +0000 Subject: KVM: x86/xen: Use gfn_to_pfn_cache for runstate area Signed-off-by: David Woodhouse Signed-off-by: Paolo Bonzini Message-Id: <20220303154127.202856-4-dwmw2@infradead.org> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'arch/x86/include/asm/kvm_host.h') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index d9a825182b84..3263147a166f 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -608,10 +608,9 @@ struct kvm_vcpu_xen { u32 current_runstate; bool vcpu_info_set; bool vcpu_time_info_set; - bool runstate_set; struct gfn_to_hva_cache vcpu_info_cache; struct gfn_to_hva_cache vcpu_time_info_cache; - struct gfn_to_hva_cache runstate_cache; + struct gfn_to_pfn_cache runstate_cache; u64 last_steal; u64 runstate_entry_time; u64 runstate_times[4]; -- cgit v1.2.3 From 916d3608df8265a2e3f6214200dbb0b39a5ca383 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Thu, 3 Mar 2022 15:41:14 +0000 Subject: KVM: x86: Use gfn_to_pfn_cache for pv_time Add a new kvm_setup_guest_pvclock() which parallels the existing kvm_setup_pvclock_page(). The latter will be removed once we convert all users to the gfn_to_pfn_cache version. Using the new cache, we can potentially let kvm_set_guest_paused() set the PVCLOCK_GUEST_STOPPED bit directly rather than having to delegate to the vCPU via KVM_REQ_CLOCK_UPDATE. But not yet. 
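With the pvclock page permanently mapped via the pfn cache, the update becomes a plain store sequence through gpc->khva following the usual pvclock version protocol, instead of a kvm_write_guest_cached() copy. A minimal sketch of that write side (illustrative only, not the exact code added by this patch; the helper name is invented):

    static void write_guest_pvclock_sketch(struct gfn_to_pfn_cache *gpc,
                                           struct pvclock_vcpu_time_info *hv_clock)
    {
        struct pvclock_vcpu_time_info *guest = gpc->khva;

        /* Make the version odd: tells the guest an update is in flight.
         * (Assumes hv_clock->version is even on entry.) */
        guest->version = hv_clock->version | 1;
        smp_wmb();

        guest->tsc_timestamp = hv_clock->tsc_timestamp;
        guest->system_time = hv_clock->system_time;
        guest->tsc_to_system_mul = hv_clock->tsc_to_system_mul;
        guest->tsc_shift = hv_clock->tsc_shift;
        guest->flags = hv_clock->flags;

        smp_wmb();
        /* Back to an even version: the snapshot is consistent again. */
        guest->version = hv_clock->version + 2;
    }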
Signed-off-by: David Woodhouse Signed-off-by: Paolo Bonzini Message-Id: <20220303154127.202856-5-dwmw2@infradead.org> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'arch/x86/include/asm/kvm_host.h') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 3263147a166f..5a0ff6a07431 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -751,8 +751,7 @@ struct kvm_vcpu_arch { gpa_t time; struct pvclock_vcpu_time_info hv_clock; unsigned int hw_tsc_khz; - struct gfn_to_hva_cache pv_time; - bool pv_time_enabled; + struct gfn_to_pfn_cache pv_time; /* set guest stopped flag in pvclock flags field */ bool pvclock_set_guest_stopped_request; -- cgit v1.2.3 From 7caf9571563eade546976d600dd023674b1c3185 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Thu, 3 Mar 2022 15:41:15 +0000 Subject: KVM: x86/xen: Use gfn_to_pfn_cache for vcpu_info Currently, the fast path of kvm_xen_set_evtchn_fast() doesn't set the index bits in the target vCPU's evtchn_pending_sel, because it only has a userspace virtual address with which to do so. It just sets them in the kernel, and kvm_xen_has_interrupt() then completes the delivery to the actual vcpu_info structure when the vCPU runs. Using a gfn_to_pfn_cache allows kvm_xen_set_evtchn_fast() to do the full delivery in the common case. Clean up the fallback case too, by moving the deferred delivery out into a separate kvm_xen_inject_pending_events() function which isn't ever called in atomic contexts as __kvm_xen_has_interrupt() is. Signed-off-by: David Woodhouse Signed-off-by: Paolo Bonzini Message-Id: <20220303154127.202856-6-dwmw2@infradead.org> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'arch/x86/include/asm/kvm_host.h') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 5a0ff6a07431..fce4dca16e5b 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -606,9 +606,8 @@ struct kvm_vcpu_hv { struct kvm_vcpu_xen { u64 hypercall_rip; u32 current_runstate; - bool vcpu_info_set; bool vcpu_time_info_set; - struct gfn_to_hva_cache vcpu_info_cache; + struct gfn_to_pfn_cache vcpu_info_cache; struct gfn_to_hva_cache vcpu_time_info_cache; struct gfn_to_pfn_cache runstate_cache; u64 last_steal; -- cgit v1.2.3 From 69d413cfcf77920bb4d7c4bbfbf305781fa53a0e Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Thu, 3 Mar 2022 15:41:16 +0000 Subject: KVM: x86/xen: Use gfn_to_pfn_cache for vcpu_time_info This switches the final pvclock to kvm_setup_pvclock_pfncache() and now the old kvm_setup_pvclock_page() can be removed. 
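All of these conversions share one access pattern on the consumer side: validate the cache under its read lock, refresh it outside the lock when it has been invalidated, then use the kernel mapping. Schematically (a simplified sketch; the real helpers live in virt/kvm/pfncache.c and their exact signatures may differ):

    unsigned long flags;

    read_lock_irqsave(&gpc->lock, flags);
    while (!kvm_gfn_to_pfn_cache_check(kvm, gpc, gpc->gpa, PAGE_SIZE)) {
        read_unlock_irqrestore(&gpc->lock, flags);

        /* Re-map the page; fails if the gfn is no longer backed. */
        if (kvm_gfn_to_pfn_cache_refresh(kvm, gpc, gpc->gpa, PAGE_SIZE))
            return;   /* caller falls back or retries later */

        read_lock_irqsave(&gpc->lock, flags);
    }

    /* gpc->khva is stable until the read lock is dropped. */
    read_unlock_irqrestore(&gpc->lock, flags);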
Signed-off-by: David Woodhouse Signed-off-by: Paolo Bonzini Message-Id: <20220303154127.202856-7-dwmw2@infradead.org> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'arch/x86/include/asm/kvm_host.h') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index fce4dca16e5b..7808491ebba8 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -606,9 +606,8 @@ struct kvm_vcpu_hv { struct kvm_vcpu_xen { u64 hypercall_rip; u32 current_runstate; - bool vcpu_time_info_set; struct gfn_to_pfn_cache vcpu_info_cache; - struct gfn_to_hva_cache vcpu_time_info_cache; + struct gfn_to_pfn_cache vcpu_time_info_cache; struct gfn_to_pfn_cache runstate_cache; u64 last_steal; u64 runstate_entry_time; -- cgit v1.2.3 From 2fd6df2f2b47d4301b1ee0fe9d627d1c061a5988 Mon Sep 17 00:00:00 2001 From: Joao Martins Date: Thu, 3 Mar 2022 15:41:19 +0000 Subject: KVM: x86/xen: intercept EVTCHNOP_send from guests Userspace registers a sending @port to either deliver to an @eventfd or directly back to a local event channel port. After binding events the guest or host may wish to bind those events to a particular vcpu. This is usually done for unbound and interdomain events. Update requests are handled via the KVM_XEN_EVTCHN_UPDATE flag. Unregistered ports are handled by the emulator. Co-developed-by: Ankur Arora Co-developed-by: David Woodhouse Signed-off-by: Joao Martins Signed-off-by: Ankur Arora Signed-off-by: David Woodhouse Signed-off-by: Paolo Bonzini Message-Id: <20220303154127.202856-10-dwmw2@infradead.org> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86/include/asm/kvm_host.h') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 7808491ebba8..b20f7d99d702 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1024,6 +1024,7 @@ struct kvm_xen { bool long_mode; u8 upcall_vector; struct gfn_to_pfn_cache shinfo_cache; + struct idr evtchn_ports; }; enum kvm_irqchip_mode { -- cgit v1.2.3 From 942c2490c23f2800ad8143f5eb84a79b859aa743 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Thu, 3 Mar 2022 15:41:21 +0000 Subject: KVM: x86/xen: Add KVM_XEN_VCPU_ATTR_TYPE_VCPU_ID In order to intercept hypercalls such as VCPUOP_set_singleshot_timer, we need to be aware of the Xen CPU numbering. This looks a lot like the Hyper-V handling of vpidx, for obvious reasons.
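On the VMM side this is just one more vCPU attribute; setting it looks roughly like this (illustrative userspace sketch, error handling omitted; xen_vcpu_id is a placeholder for the VMM's own bookkeeping):

    struct kvm_xen_vcpu_attr va = {
        .type = KVM_XEN_VCPU_ATTR_TYPE_VCPU_ID,
        .u.vcpu_id = xen_vcpu_id,   /* the Xen / ACPI vCPU number */
    };

    ioctl(vcpu_fd, KVM_XEN_VCPU_SET_ATTR, &va);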
Signed-off-by: David Woodhouse Signed-off-by: Paolo Bonzini Message-Id: <20220303154127.202856-12-dwmw2@infradead.org> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86/include/asm/kvm_host.h') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index b20f7d99d702..9e3408542b41 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -613,6 +613,7 @@ struct kvm_vcpu_xen { u64 runstate_entry_time; u64 runstate_times[4]; unsigned long evtchn_pending_sel; + u32 vcpu_id; /* The Xen / ACPI vCPU ID */ }; struct kvm_vcpu_arch { -- cgit v1.2.3 From 536395260582be7443b0b35b0bbb89ffe3947f62 Mon Sep 17 00:00:00 2001 From: Joao Martins Date: Thu, 3 Mar 2022 15:41:22 +0000 Subject: KVM: x86/xen: handle PV timers oneshot mode If the guest has offloaded the timer virq, handle the following hypercalls for programming the timer:

    VCPUOP_set_singleshot_timer
    VCPUOP_stop_singleshot_timer
    set_timer_op(timestamp_ns)

The event channel corresponding to the timer virq is then used to inject events once timer deadlines are met. For now we back the PV timer with an hrtimer. [ dwmw2: Add save/restore, 32-bit compat mode, immediate delivery, don't check timer in kvm_vcpu_has_event() ] Signed-off-by: Joao Martins Signed-off-by: David Woodhouse Signed-off-by: Paolo Bonzini Message-Id: <20220303154127.202856-13-dwmw2@infradead.org> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'arch/x86/include/asm/kvm_host.h') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 9e3408542b41..8193d5285544 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -614,6 +614,10 @@ struct kvm_vcpu_xen { u64 runstate_times[4]; unsigned long evtchn_pending_sel; u32 vcpu_id; /* The Xen / ACPI vCPU ID */ + u32 timer_virq; + u64 timer_expires; /* In guest epoch */ + atomic_t timer_pending; + struct hrtimer timer; }; struct kvm_vcpu_arch { -- cgit v1.2.3 From 28d1629f751c4a5f9437fbaa0ee4ed81d1a8e587 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Thu, 3 Mar 2022 15:41:23 +0000 Subject: KVM: x86/xen: Kernel acceleration for XENVER_version Turns out this is a fast path for PV guests because they use it to trigger the event channel upcall. So letting it bounce all the way up to userspace is not great.
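Guests issue HYPERVISOR_xen_version(XENVER_version, NULL) as a cheap way to force delivery of any pending event channel upcall, which is why it is hot. To answer it in the kernel, userspace only needs to tell KVM what value to return; the shape is roughly as follows (a sketch, assuming the new attribute is set through KVM_XEN_HVM_SET_ATTR; the attribute and field names are best-effort, not verified against the uapi header):

    struct kvm_xen_hvm_attr ha = {
        .type = KVM_XEN_ATTR_TYPE_XEN_VERSION,
        /* XENVER_version encoding: major << 16 | minor */
        .u.xen_version = (4 << 16) | 16,
    };

    ioctl(vm_fd, KVM_XEN_HVM_SET_ATTR, &ha);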
Signed-off-by: David Woodhouse Signed-off-by: Paolo Bonzini Message-Id: <20220303154127.202856-14-dwmw2@infradead.org> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86/include/asm/kvm_host.h') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 8193d5285544..f3fba9d8ddc6 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1026,6 +1026,7 @@ struct msr_bitmap_range { /* Xen emulation context */ struct kvm_xen { + u32 xen_version; bool long_mode; u8 upcall_vector; struct gfn_to_pfn_cache shinfo_cache; -- cgit v1.2.3 From fde0451be8fb3208d4d146b8602d99ee8139e515 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Thu, 3 Mar 2022 15:41:24 +0000 Subject: KVM: x86/xen: Support per-vCPU event channel upcall via local APIC Windows uses a per-vCPU vector, and it's delivered via the local APIC basically like an MSI (with associated EOI), unlike the traditional guest-wide vector which is just magically asserted by Xen (and in the KVM case by kvm_xen_has_interrupt() / kvm_cpu_get_extint()). Now that the kernel is able to raise event channel events for itself, being able to do so for Windows guests is also going to be useful. Signed-off-by: David Woodhouse Signed-off-by: Paolo Bonzini Message-Id: <20220303154127.202856-15-dwmw2@infradead.org> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86/include/asm/kvm_host.h') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index f3fba9d8ddc6..998caf7a3ce9 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -606,6 +606,7 @@ struct kvm_vcpu_hv { struct kvm_vcpu_xen { u64 hypercall_rip; u32 current_runstate; + u8 upcall_vector; struct gfn_to_pfn_cache vcpu_info_cache; struct gfn_to_pfn_cache vcpu_time_info_cache; struct gfn_to_pfn_cache runstate_cache; -- cgit v1.2.3 From 1a65105a5aba9f7c6ea68cf687e569d055a94685 Mon Sep 17 00:00:00 2001 From: Boris Ostrovsky Date: Thu, 3 Mar 2022 15:41:26 +0000 Subject: KVM: x86/xen: handle PV spinlocks slowpath Add support for the SCHEDOP_poll hypercall. This implementation is optimized for polling for a single channel, which is what Linux does. Polling for multiple channels is not especially efficient (and has not been tested). The PV spinlocks slow path uses this hypercall, and explicitly crashes if it's not supported.
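For reference, the guest side of that slow path boils down to polling the lock's event channel (simplified from the Linux guest's xen_poll_irq(); the port variable is a placeholder for the per-lock kicker port):

    evtchn_port_t port = lock_kicker_port;   /* hypothetical: channel the lock holder kicks */
    struct sched_poll poll = {
        .nr_ports = 1,
        .timeout = 0,       /* no timeout: block until kicked */
    };

    set_xen_guest_handle(poll.ports, &port);
    HYPERVISOR_sched_op(SCHEDOP_poll, &poll);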
[ dwmw2: Rework to use kvm_vcpu_halt(), not supported for 32-bit guests ] Signed-off-by: Boris Ostrovsky Signed-off-by: David Woodhouse Signed-off-by: Paolo Bonzini Message-Id: <20220303154127.202856-17-dwmw2@infradead.org> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch/x86/include/asm/kvm_host.h') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 998caf7a3ce9..5370744b789c 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -619,6 +619,8 @@ struct kvm_vcpu_xen { u64 timer_expires; /* In guest epoch */ atomic_t timer_pending; struct hrtimer timer; + int poll_evtchn; + struct timer_list poll_timer; }; struct kvm_vcpu_arch { @@ -1032,6 +1034,7 @@ struct kvm_xen { u8 upcall_vector; struct gfn_to_pfn_cache shinfo_cache; struct idr evtchn_ports; + unsigned long poll_mask[BITS_TO_LONGS(KVM_MAX_VCPUS)]; }; enum kvm_irqchip_mode { -- cgit v1.2.3 From ffbb61d09fc56c85e28b110494f3788d0ed4d1f8 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Fri, 25 Feb 2022 14:53:02 +0000 Subject: KVM: x86: Accept KVM_[GS]ET_TSC_KHZ as a VM ioctl. This sets the default TSC frequency for subsequently created vCPUs. Signed-off-by: David Woodhouse Message-Id: <20220225145304.36166-2-dwmw2@infradead.org> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86/include/asm/kvm_host.h') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 5370744b789c..fac990cc189d 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1126,6 +1126,8 @@ struct kvm_arch { u64 cur_tsc_generation; int nr_vcpus_matched_tsc; + u32 default_tsc_khz; + seqcount_raw_spinlock_t pvclock_sc; bool use_master_clock; u64 master_kernel_ns; -- cgit v1.2.3 From d5fa597ed87047117dc62ce1d38ba1d007442359 Mon Sep 17 00:00:00 2001 From: Maxim Levitsky Date: Tue, 22 Mar 2022 19:40:49 +0200 Subject: KVM: x86: allow per cpu apicv inhibit reasons Add optional callback .vcpu_get_apicv_inhibit_reasons returning extra inhibit reasons that prevent APICv from working on this vCPU. 
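The check side then combines the VM-wide inhibits with whatever the vendor callback reports for this vCPU; kvm_vcpu_apicv_activated(), declared below, is essentially (sketch):

    bool kvm_vcpu_apicv_activated(struct kvm_vcpu *vcpu)
    {
        unsigned long vm_reasons = READ_ONCE(vcpu->kvm->arch.apicv_inhibit_reasons);
        unsigned long vcpu_reasons =
                static_call(kvm_x86_vcpu_get_apicv_inhibit_reasons)(vcpu);

        /* The callback is optional; the static call returns 0 when a
         * vendor does not implement it. */
        return (vm_reasons | vcpu_reasons) == 0;
    }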
Suggested-by: Paolo Bonzini Signed-off-by: Maxim Levitsky Message-Id: <20220322174050.241850-6-mlevitsk@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'arch/x86/include/asm/kvm_host.h') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index fac990cc189d..676705ad1e23 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1507,6 +1507,11 @@ struct kvm_x86_ops { int (*complete_emulated_msr)(struct kvm_vcpu *vcpu, int err); void (*vcpu_deliver_sipi_vector)(struct kvm_vcpu *vcpu, u8 vector); + + /* + * Returns vCPU specific APICv inhibit reasons + */ + unsigned long (*vcpu_get_apicv_inhibit_reasons)(struct kvm_vcpu *vcpu); }; struct kvm_x86_nested_ops { @@ -1807,6 +1812,7 @@ gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva, struct x86_exception *exception); bool kvm_apicv_activated(struct kvm *kvm); +bool kvm_vcpu_apicv_activated(struct kvm_vcpu *vcpu); void kvm_vcpu_update_apicv(struct kvm_vcpu *vcpu); void __kvm_set_or_clear_apicv_inhibit(struct kvm *kvm, enum kvm_apicv_inhibit reason, bool set); -- cgit v1.2.3 From 8176472563fb1a233f46f3f0441738c82afd88da Mon Sep 17 00:00:00 2001 From: Peng Hao Date: Mon, 28 Feb 2022 11:07:49 +0800 Subject: kvm: x86: Adjust the location of pkru_mask of kvm_mmu to reduce memory Move the pkru_mask field so it immediately follows direct_map, restoring 8-byte alignment. This reduces the size of struct kvm_mmu by 8 bytes. Signed-off-by: Peng Hao Message-Id: <20220228030749.88353-1-flyingpeng@tencent.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) (limited to 'arch/x86/include/asm/kvm_host.h') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index f1dfa066068d..e9c9a23d6623 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -443,14 +443,6 @@ struct kvm_mmu { u8 shadow_root_level; u8 ept_ad; bool direct_map; - struct kvm_mmu_root_info prev_roots[KVM_MMU_NUM_PREV_ROOTS]; - - /* - * Bitmap; bit set = permission fault - * Byte index: page fault error code [4:1] - * Bit index: pte permissions in ACC_* format - */ - u8 permissions[16]; /* * The pkru_mask indicates if protection key checks are needed. It @@ -460,6 +452,15 @@ struct kvm_mmu { */ u32 pkru_mask; + struct kvm_mmu_root_info prev_roots[KVM_MMU_NUM_PREV_ROOTS]; + + /* + * Bitmap; bit set = permission fault + * Byte index: page fault error code [4:1] + * Bit index: pte permissions in ACC_* format + */ + u8 permissions[16]; + u64 *pae_root; u64 *pml4_root; u64 *pml5_root; -- cgit v1.2.3 From fdc298da866165ec0288fe227982f1aa0467bf5c Mon Sep 17 00:00:00 2001 From: Like Xu Date: Tue, 29 Mar 2022 23:50:51 +0000 Subject: KVM: x86: Move kvm_ops_static_call_update() to x86.c kvm_ops_static_call_update() is defined in kvm_host.h. That's completely unnecessary; it should have exactly one caller, kvm_arch_hardware_setup(). Move the helper to x86.c and have it do the actual memcpy() of the ops in addition to the static call updates. This will also allow for cleanly giving kvm_pmu_ops static_call treatment.
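After the move, x86.c owns both the copy of the ops and the static call updates in one place; the helper ends up roughly as follows (a sketch reconstructed from the hunk removed below plus the memcpy() described above; the renamed helper and the #include target, asm/kvm-x86-ops.h, are assumptions):

    static inline void kvm_ops_update(struct kvm_x86_init_ops *ops)
    {
        memcpy(&kvm_x86_ops, ops->runtime_ops, sizeof(kvm_x86_ops));

    #define __KVM_X86_OP(func) \
        static_call_update(kvm_x86_##func, kvm_x86_ops.func);
    #define KVM_X86_OP(func) \
        WARN_ON(!kvm_x86_ops.func); __KVM_X86_OP(func)
    #define KVM_X86_OP_OPTIONAL __KVM_X86_OP
    #define KVM_X86_OP_OPTIONAL_RET0(func) \
        static_call_update(kvm_x86_##func, (void *)kvm_x86_ops.func ? : \
                           (void *)__static_call_return0);
    #include <asm/kvm-x86-ops.h>
    #undef __KVM_X86_OP
    }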
Suggested-by: Sean Christopherson Signed-off-by: Like Xu [sean: Move memcpy() into the helper and rename accordingly] Signed-off-by: Sean Christopherson Message-Id: <20220329235054.3534728-2-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 14 -------------- 1 file changed, 14 deletions(-) (limited to 'arch/x86/include/asm/kvm_host.h') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index e9c9a23d6623..25d1aac74c55 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1563,20 +1563,6 @@ extern struct kvm_x86_ops kvm_x86_ops; #define KVM_X86_OP_OPTIONAL_RET0 KVM_X86_OP #include -static inline void kvm_ops_static_call_update(void) -{ -#define __KVM_X86_OP(func) \ - static_call_update(kvm_x86_##func, kvm_x86_ops.func); -#define KVM_X86_OP(func) \ - WARN_ON(!kvm_x86_ops.func); __KVM_X86_OP(func) -#define KVM_X86_OP_OPTIONAL __KVM_X86_OP -#define KVM_X86_OP_OPTIONAL_RET0(func) \ - static_call_update(kvm_x86_##func, (void *)kvm_x86_ops.func ? : \ - (void *)__static_call_return0); -#include -#undef __KVM_X86_OP -} - #define __KVM_HAVE_ARCH_VM_ALLOC static inline struct kvm *kvm_arch_alloc_vm(void) { -- cgit v1.2.3 From 34886e796c413273908b036df57cc9ae731badae Mon Sep 17 00:00:00 2001 From: Like Xu Date: Tue, 29 Mar 2022 23:50:53 +0000 Subject: KVM: x86: Move .pmu_ops to kvm_x86_init_ops and tag as __initdata The pmu_ops should be moved to kvm_x86_init_ops and tagged as __initdata. That'll save those precious few bytes, and more importantly make the original ops unreachable, i.e. make it harder to sneak in post-init modification bugs. Suggested-by: Sean Christopherson Signed-off-by: Like Xu Reviewed-by: Sean Christopherson Signed-off-by: Sean Christopherson Message-Id: <20220329235054.3534728-4-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'arch/x86/include/asm/kvm_host.h') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 25d1aac74c55..e1c695f72b8b 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1465,8 +1465,6 @@ struct kvm_x86_ops { int cpu_dirty_log_size; void (*update_cpu_dirty_logging)(struct kvm_vcpu *vcpu); - /* pmu operations of sub-arch */ - const struct kvm_pmu_ops *pmu_ops; const struct kvm_x86_nested_ops *nested_ops; void (*vcpu_blocking)(struct kvm_vcpu *vcpu); @@ -1542,6 +1540,7 @@ struct kvm_x86_init_ops { unsigned int (*handle_intel_pt_intr)(void); struct kvm_x86_ops *runtime_ops; + struct kvm_pmu_ops *pmu_ops; }; struct kvm_arch_async_pf { -- cgit v1.2.3 From 6819af7597d87d40769b47bb377472877a6b56c0 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 3 Mar 2022 20:20:17 -0800 Subject: KVM: x86: Clean up and document nested #PF workaround Replace the per-vendor hack-a-fix for KVM's #PF => #PF => #DF workaround with an explicit, common workaround in kvm_inject_emulated_page_fault(). Aside from being a hack, the current approach is brittle and incomplete, e.g. nSVM's KVM_SET_NESTED_STATE fails to set ->inject_page_fault(), and nVMX fails to apply the workaround when VMX is intercepting #PF due to allow_smaller_maxphyaddr=1. 
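Conceptually, the common workaround keys off the new nested op added below: before injecting the emulated #PF into L2, ask the vendor nested code whether L1 wants to intercept it (sketch, not the literal patch):

    /* In kvm_inject_emulated_page_fault(), roughly: */
    if (is_guest_mode(vcpu) &&
        kvm_x86_ops.nested_ops->handle_page_fault_workaround &&
        kvm_x86_ops.nested_ops->handle_page_fault_workaround(vcpu, fault))
        return;   /* reflected to L1; don't inject into L2 and risk a #DF */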
Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86/include/asm/kvm_host.h') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index e1c695f72b8b..e46bd289e5df 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1516,6 +1516,8 @@ struct kvm_x86_ops { struct kvm_x86_nested_ops { void (*leave_nested)(struct kvm_vcpu *vcpu); int (*check_events)(struct kvm_vcpu *vcpu); + bool (*handle_page_fault_workaround)(struct kvm_vcpu *vcpu, + struct x86_exception *fault); bool (*hv_timer_pending)(struct kvm_vcpu *vcpu); void (*triple_fault)(struct kvm_vcpu *vcpu); int (*get_state)(struct kvm_vcpu *vcpu, -- cgit v1.2.3 From e5ed0fb01004f93ddf8a0c632cbc8a3f1ea5b518 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Fri, 11 Feb 2022 06:50:11 -0500 Subject: KVM: x86/mmu: split cpu_role from mmu_role Snapshot the state of the processor registers that govern page walk into a new field of struct kvm_mmu. This is a more natural representation than having it *mostly* in mmu_role but not exclusively; the delta right now is represented in other fields, such as root_level. The nested MMU now has only the CPU role; and in fact the new function kvm_calc_cpu_role is analogous to the previous kvm_calc_nested_mmu_role, except that it has role.base.direct equal to !CR0.PG. For a walk-only MMU, "direct" has no meaning, but we set it to !CR0.PG so that role.ext.cr0_pg can go away in a future patch. Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86/include/asm/kvm_host.h') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index e46bd289e5df..50edf52a3ef6 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -438,6 +438,7 @@ struct kvm_mmu { struct kvm_mmu_page *sp); void (*invlpg)(struct kvm_vcpu *vcpu, gva_t gva, hpa_t root_hpa); struct kvm_mmu_root_info root; + union kvm_mmu_role cpu_role; union kvm_mmu_role mmu_role; u8 root_level; u8 shadow_root_level; -- cgit v1.2.3 From ec283cb1dcb934d1a861e5e25ce843e033ee4ab7 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 10 Feb 2022 07:32:51 -0500 Subject: KVM: x86/mmu: remove ept_ad field The ept_ad field is used during page walk to determine if the guest PTEs have accessed and dirty bits. In the MMU role, the ad_disabled bit represents whether the *shadow* PTEs have the bits, so it would be incorrect to replace PT_HAVE_ACCESSED_DIRTY with just !mmu->mmu_role.base.ad_disabled. However, the similar field in the CPU mode, ad_disabled, is initialized correctly: to the opposite value of ept_ad for shadow EPT, and zero for non-EPT guest paging modes (which always have A/D bits). It is therefore possible to compute PT_HAVE_ACCESSED_DIRTY from the CPU mode, like other page-format fields; it just has to be inverted to account for the different polarity. In fact, now that the CPU mode is distinct from the MMU roles, it would even be possible to remove PT_HAVE_ACCESSED_DIRTY macro altogether, and use !mmu->cpu_role.base.ad_disabled instead. 
I am not doing this because the macro has a small effect in terms of dead code elimination:

      text	 data	bss	   dec	  hex
    103544	16665	112	120321	1d601	# as of this patch
    103746	16665	112	120523	1d6cb	# without PT_HAVE_ACCESSED_DIRTY

Reviewed-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 1 - 1 file changed, 1 deletion(-) (limited to 'arch/x86/include/asm/kvm_host.h') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 50edf52a3ef6..a299236cfde5 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -442,7 +442,6 @@ struct kvm_mmu { union kvm_mmu_role cpu_role; union kvm_mmu_role mmu_role; u8 root_level; u8 shadow_root_level; - u8 ept_ad; bool direct_map; /* -- cgit v1.2.3 From 7a458f0e1ba150a6ea012171a43c4b947f1d825d Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Mon, 14 Feb 2022 08:46:24 -0500 Subject: KVM: x86/mmu: remove extended bits from mmu_role, rename field mmu_role represents the role of the root of the page tables. It does not need any extended bits, as those govern only KVM's page table walking; the is_* functions used for page table walking always use the CPU role. ext.valid is not present anymore in the MMU role, but an all-zero MMU role is impossible because the level field is never zero in the MMU role. So just zap the whole mmu_role in order to force invalidation after CPUID is updated. While making this change, which requires touching almost every occurrence of "mmu_role", rename it to "root_role". Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/include/asm/kvm_host.h') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index a299236cfde5..c81221d03a1b 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -439,7 +439,7 @@ struct kvm_mmu { void (*invlpg)(struct kvm_vcpu *vcpu, gva_t gva, hpa_t root_hpa); struct kvm_mmu_root_info root; union kvm_mmu_role cpu_role; - union kvm_mmu_role mmu_role; + union kvm_mmu_page_role root_role; u8 root_level; u8 shadow_root_level; bool direct_map; -- cgit v1.2.3 From 7a7ae8292391c4d53c4340e606bf48776c3449e7 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 10 Feb 2022 07:38:32 -0500 Subject: KVM: x86/mmu: rename kvm_mmu_role union It is quite confusing that the "full" union is called kvm_mmu_role but is used for the "cpu_role" field of struct kvm_mmu. Rename it to kvm_cpu_role. Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86/include/asm/kvm_host.h') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index c81221d03a1b..6bc5550ae530 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -281,7 +281,7 @@ struct kvm_kernel_irq_routing_entry; /* * kvm_mmu_page_role tracks the properties of a shadow page (where shadow page * also includes TDP pages) to determine whether or not a page can be used in - * the given MMU context. This is a subset of the overall kvm_mmu_role to + * the given MMU context. This is a subset of the overall kvm_cpu_role to * minimize the size of kvm_memory_slot.arch.gfn_track, i.e. allows allocating * 2 bytes per gfn instead of 4 bytes per gfn.
* @@ -378,7 +378,7 @@ union kvm_mmu_extended_role { }; }; -union kvm_mmu_role { +union kvm_cpu_role { u64 as_u64; struct { union kvm_mmu_page_role base; -- cgit v1.2.3 From faf729621c9609367a0714f5383df67fdd8d021c Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 10 Feb 2022 07:38:51 -0500 Subject: KVM: x86/mmu: remove redundant bits from extended role Before the separation of the CPU and the MMU role, CR0.PG was not available in the base MMU role, because two-dimensional paging always used direct=1 in the MMU role. However, now that the raw role is snapshotted in mmu->cpu_role, the value of CR0.PG always matches both !cpu_role.base.direct and cpu_role.base.level > 0. There is no need to store it again in union kvm_mmu_extended_role; instead, write an is_cr0_pg accessor by hand that takes care of the conversion. Use cpu_role.base.level since the future of the direct field is unclear. Likewise, CR4.PAE is now always present in the CPU role as !cpu_role.base.has_4_byte_gpte. The inversion makes certain tests on the MMU role easier, and is easily hidden by the is_cr4_pae accessor when operating on the CPU role. Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'arch/x86/include/asm/kvm_host.h') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 6bc5550ae530..52ceeadbed28 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -367,8 +367,6 @@ union kvm_mmu_extended_role { struct { unsigned int valid:1; unsigned int execonly:1; - unsigned int cr0_pg:1; - unsigned int cr4_pae:1; unsigned int cr4_pse:1; unsigned int cr4_pke:1; unsigned int cr4_smap:1; -- cgit v1.2.3 From a972e29c1d6c957242a23b42f355b1fdf721cbd4 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 10 Feb 2022 07:41:19 -0500 Subject: KVM: x86/mmu: replace shadow_root_level with root_role.level root_role.level is always the same value as shadow_root_level:

 - it's kvm_mmu_get_tdp_level(vcpu) when going through init_kvm_tdp_mmu
 - it's the level argument when going through kvm_init_shadow_ept_mmu
 - it's assigned directly from new_role.base.level when going through shadow_mmu_init_context

Remove the duplication and get the level directly from the role. Reviewed-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 1 - 1 file changed, 1 deletion(-) (limited to 'arch/x86/include/asm/kvm_host.h') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 52ceeadbed28..35056341799d 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -439,7 +439,6 @@ struct kvm_mmu { union kvm_cpu_role cpu_role; union kvm_mmu_page_role root_role; u8 root_level; - u8 shadow_root_level; bool direct_map; /* -- cgit v1.2.3 From 4d25502aa12ef1fb01e599cbfd341a8d436f4b8b Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 10 Feb 2022 07:42:22 -0500 Subject: KVM: x86/mmu: replace root_level with cpu_role.base.level Remove another duplicate field of struct kvm_mmu. This time it's the root level for page table walking; the separate field is always initialized as cpu_role.base.level, so its users can look up the CPU mode directly instead.
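The conversion is mechanical: every reader of the removed field switches to the snapshotted role, e.g. (sketch):

    /* before */ int level = mmu->root_level;
    /* after  */ int level = mmu->cpu_role.base.level;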
Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 1 - 1 file changed, 1 deletion(-) (limited to 'arch/x86/include/asm/kvm_host.h') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 35056341799d..aea39fe03a99 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -438,7 +438,6 @@ struct kvm_mmu { struct kvm_mmu_root_info root; union kvm_cpu_role cpu_role; union kvm_mmu_page_role root_role; - u8 root_level; bool direct_map; /* -- cgit v1.2.3 From 347a0d0ded16a2e59c35b43ace7ad2b53fb6df57 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 10 Feb 2022 08:00:56 -0500 Subject: KVM: x86/mmu: replace direct_map with root_role.direct direct_map is always equal to the direct field of the root page's role:

 - for shadow paging, direct_map is true if CR0.PG=0 and root_role.direct is copied from cpu_role.base.direct
 - for TDP, it is always true and root_role.direct is also always true
 - for shadow TDP, it is always false and root_role.direct is also always false

Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 1 - 1 file changed, 1 deletion(-) (limited to 'arch/x86/include/asm/kvm_host.h') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index aea39fe03a99..752a6d2357ce 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -438,7 +438,6 @@ struct kvm_mmu { struct kvm_mmu_root_info root; union kvm_cpu_role cpu_role; union kvm_mmu_page_role root_role; - bool direct_map; /* * The pkru_mask indicates if protection key checks are needed. It -- cgit v1.2.3 From 84e5ffd045f33e4fa32370135436d987478d0bf7 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Wed, 20 Apr 2022 21:12:04 +0800 Subject: KVM: X86/MMU: Fix shadowing 5-level NPT for 4-level NPT L1 guest When shadowing 5-level NPT for a 4-level NPT L1 guest, the root_sp is allocated with role.level = 5 and the guest pagetable's root gfn. And root_sp->spt[0] is also allocated with the same gfn and the same role except role.level = 4. Luckily, they are different shadow pages, but only root_sp->spt[0] is the real translation of the guest pagetable. Here comes a problem: if the guest switches from gCR4_LA57=0 to gCR4_LA57=1 (or vice versa) and uses the same gfn as the root page for nested NPT before and after the switch, the host (hCR4_LA57=1) might reuse the same root_sp for the guest even though gCR4_LA57 has changed. The guest will see unexpected pages mapped, and L2 may exploit the bug to hurt L1. It is lucky that the problem can't hurt L0. Three special cases need to be handled: The root_sp should sometimes behave as if role.direct=1: its contents are not backed by gptes, and root_sp->gfns is meaningless. (For a normal high level sp in shadow paging, sp->gfns is often unused and kept zero, but it could be relevant and meaningful if sp->gfns is used because they are backed by concrete gptes.) In this case, the root_sp is just a portal contributing root_sp->spt[0]; root_sp->gfns should not be used, and root_sp->spt[0] should not be dropped if gpte[0] of the guest root pagetable changes. Such a root_sp should not be accounted either. So add role.passthrough to distinguish the shadow pages in the hash when gCR4_LA57 is toggled, and fix the above special cases by using it in kvm_mmu_page_{get|set}_gfn() and sp_has_gptes().
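The helper named above then makes the special-casing explicit; sp_has_gptes() is essentially (sketch):

    static bool sp_has_gptes(struct kvm_mmu_page *sp)
    {
        if (sp->role.direct)
            return false;

        /* A passthrough root (e.g. a 5-level root shadowing a 4-level
         * guest) has no gptes behind it either. */
        if (sp->role.passthrough)
            return false;

        return true;
    }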
Signed-off-by: Lai Jiangshan Message-Id: <20220420131204.2850-3-jiangshanlai@gmail.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'arch/x86/include/asm/kvm_host.h') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 752a6d2357ce..f164c6c1514a 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -285,7 +285,7 @@ struct kvm_kernel_irq_routing_entry; * minimize the size of kvm_memory_slot.arch.gfn_track, i.e. allows allocating * 2 bytes per gfn instead of 4 bytes per gfn. * - * Indirect upper-level shadow pages are tracked for write-protection via + * Upper-level shadow pages having gptes are tracked for write-protection via * gfn_track. As above, gfn_track is a 16 bit counter, so KVM must not create * more than 2^16-1 upper-level shadow pages at a single gfn, otherwise * gfn_track will overflow and explosions will ensue. @@ -331,7 +331,8 @@ union kvm_mmu_page_role { unsigned smap_andnot_wp:1; unsigned ad_disabled:1; unsigned guest_mode:1; - unsigned :6; + unsigned passthrough:1; + unsigned :5; /* * This is left at the top of the word so that -- cgit v1.2.3 From 1075d41efd598d3fd4d52a1e1116b20979975135 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Sat, 23 Apr 2022 03:47:49 +0000 Subject: KVM: x86/mmu: Expand and clean up page fault stats Expand and clean up the page fault stats. The current stats are at best incomplete, and at worst misleading. Differentiate between faults that are actually fixed vs those that result in an MMIO SPTE being created, track faults that are spurious, faults that trigger emulation, faults that are fixed in the fast path, and last but not least, track the number of faults that are taken. Note, the number of faults that require emulation for write-protected shadow pages can roughly be calculated by subtracting the number of MMIO SPTEs created from the overall number of faults that trigger emulation. Signed-off-by: Sean Christopherson Message-Id: <20220423034752.1161007-10-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'arch/x86/include/asm/kvm_host.h') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index f164c6c1514a..c5fb4115176d 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1269,7 +1269,12 @@ struct kvm_vm_stat { struct kvm_vcpu_stat { struct kvm_vcpu_stat_generic generic; + u64 pf_taken; u64 pf_fixed; + u64 pf_emulate; + u64 pf_spurious; + u64 pf_fast; + u64 pf_mmio_spte_created; u64 pf_guest; u64 tlb_flush; u64 invlpg; -- cgit v1.2.3
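As a usage note for the stats in the last commit, the write-protection emulation count called out in its message is simple consumer-side arithmetic over the new fields (hypothetical tooling code, not part of the patch):

    /* Faults emulated because of write-protected shadow pages, roughly: */
    u64 pf_wp_emulated = stat.pf_emulate - stat.pf_mmio_spte_created;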