Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/torvalds/linux.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorPaolo Bonzini <pbonzini@redhat.com>2022-06-09 18:38:12 +0300
committerPaolo Bonzini <pbonzini@redhat.com>2022-06-09 18:38:12 +0300
commite15f5e6fa6ca1b3baf087314b2541afa935d00e7 (patch)
treef2b136922cb3ebd89da3a36742600ec30b4e4e69 /arch/x86/kvm/vmx
parente0f3f46e42064a51573914766897b4ab95d943e3 (diff)
parentb172862241b4849985c3e0e86cfb05d61e4a841d (diff)
Merge branch 'kvm-5.20-early'
s390: * add an interface to provide a hypervisor dump for secure guests * improve selftests to show tests x86: * Intel IPI virtualization * Allow getting/setting pending triple fault with KVM_GET/SET_VCPU_EVENTS * PEBS virtualization * Simplify PMU emulation by just using PERF_TYPE_RAW events * More accurate event reinjection on SVM (avoid retrying instructions) * Allow getting/setting the state of the speaker port data bit * Rewrite gfn-pfn cache refresh * Refuse starting the module if VM-Entry/VM-Exit controls are inconsistent * "Notify" VM exit
Diffstat (limited to 'arch/x86/kvm/vmx')
-rw-r--r--arch/x86/kvm/vmx/capabilities.h57
-rw-r--r--arch/x86/kvm/vmx/evmcs.c2
-rw-r--r--arch/x86/kvm/vmx/evmcs.h1
-rw-r--r--arch/x86/kvm/vmx/nested.c12
-rw-r--r--arch/x86/kvm/vmx/pmu_intel.c210
-rw-r--r--arch/x86/kvm/vmx/posted_intr.c15
-rw-r--r--arch/x86/kvm/vmx/posted_intr.h2
-rw-r--r--arch/x86/kvm/vmx/vmcs.h1
-rw-r--r--arch/x86/kvm/vmx/vmx.c291
-rw-r--r--arch/x86/kvm/vmx/vmx.h66
10 files changed, 511 insertions, 146 deletions
diff --git a/arch/x86/kvm/vmx/capabilities.h b/arch/x86/kvm/vmx/capabilities.h
index 3f430e218375..069d8d298e1d 100644
--- a/arch/x86/kvm/vmx/capabilities.h
+++ b/arch/x86/kvm/vmx/capabilities.h
@@ -6,6 +6,8 @@
#include "lapic.h"
#include "x86.h"
+#include "pmu.h"
+#include "cpuid.h"
extern bool __read_mostly enable_vpid;
extern bool __read_mostly flexpriority_enabled;
@@ -13,6 +15,7 @@ extern bool __read_mostly enable_ept;
extern bool __read_mostly enable_unrestricted_guest;
extern bool __read_mostly enable_ept_ad_bits;
extern bool __read_mostly enable_pml;
+extern bool __read_mostly enable_ipiv;
extern int __read_mostly pt_mode;
#define PT_MODE_SYSTEM 0
@@ -59,6 +62,7 @@ struct vmcs_config {
u32 pin_based_exec_ctrl;
u32 cpu_based_exec_ctrl;
u32 cpu_based_2nd_exec_ctrl;
+ u64 cpu_based_3rd_exec_ctrl;
u32 vmexit_ctrl;
u32 vmentry_ctrl;
struct nested_vmx_msrs nested;
@@ -94,20 +98,17 @@ static inline bool cpu_has_vmx_posted_intr(void)
static inline bool cpu_has_load_ia32_efer(void)
{
- return (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_EFER) &&
- (vmcs_config.vmexit_ctrl & VM_EXIT_LOAD_IA32_EFER);
+ return vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_EFER;
}
static inline bool cpu_has_load_perf_global_ctrl(void)
{
- return (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL) &&
- (vmcs_config.vmexit_ctrl & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL);
+ return vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL;
}
static inline bool cpu_has_vmx_mpx(void)
{
- return (vmcs_config.vmexit_ctrl & VM_EXIT_CLEAR_BNDCFGS) &&
- (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_BNDCFGS);
+ return vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_BNDCFGS;
}
static inline bool cpu_has_vmx_tpr_shadow(void)
@@ -131,6 +132,12 @@ static inline bool cpu_has_secondary_exec_ctrls(void)
CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
}
+static inline bool cpu_has_tertiary_exec_ctrls(void)
+{
+ return vmcs_config.cpu_based_exec_ctrl &
+ CPU_BASED_ACTIVATE_TERTIARY_CONTROLS;
+}
+
static inline bool cpu_has_vmx_virtualize_apic_accesses(void)
{
return vmcs_config.cpu_based_2nd_exec_ctrl &
@@ -276,6 +283,11 @@ static inline bool cpu_has_vmx_apicv(void)
cpu_has_vmx_posted_intr();
}
+static inline bool cpu_has_vmx_ipiv(void)
+{
+ return vmcs_config.cpu_based_3rd_exec_ctrl & TERTIARY_EXEC_IPI_VIRT;
+}
+
static inline bool cpu_has_vmx_flexpriority(void)
{
return cpu_has_vmx_tpr_shadow() &&
@@ -363,7 +375,6 @@ static inline bool cpu_has_vmx_intel_pt(void)
rdmsrl(MSR_IA32_VMX_MISC, vmx_msr);
return (vmx_msr & MSR_IA32_VMX_MISC_INTEL_PT) &&
(vmcs_config.cpu_based_2nd_exec_ctrl & SECONDARY_EXEC_PT_USE_GPA) &&
- (vmcs_config.vmexit_ctrl & VM_EXIT_CLEAR_IA32_RTIT_CTL) &&
(vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_RTIT_CTL);
}
@@ -385,23 +396,31 @@ static inline bool vmx_pt_mode_is_host_guest(void)
return pt_mode == PT_MODE_HOST_GUEST;
}
+static inline bool vmx_pebs_supported(void)
+{
+ return boot_cpu_has(X86_FEATURE_PEBS) && kvm_pmu_cap.pebs_ept;
+}
+
static inline u64 vmx_get_perf_capabilities(void)
{
- u64 perf_cap = 0;
+ u64 perf_cap = PMU_CAP_FW_WRITES;
+ u64 host_perf_cap = 0;
if (!enable_pmu)
- return perf_cap;
+ return 0;
if (boot_cpu_has(X86_FEATURE_PDCM))
- rdmsrl(MSR_IA32_PERF_CAPABILITIES, perf_cap);
+ rdmsrl(MSR_IA32_PERF_CAPABILITIES, host_perf_cap);
- perf_cap &= PMU_CAP_LBR_FMT;
+ perf_cap |= host_perf_cap & PMU_CAP_LBR_FMT;
- /*
- * Since counters are virtualized, KVM would support full
- * width counting unconditionally, even if the host lacks it.
- */
- return PMU_CAP_FW_WRITES | perf_cap;
+ if (vmx_pebs_supported()) {
+ perf_cap |= host_perf_cap & PERF_CAP_PEBS_MASK;
+ if ((perf_cap & PERF_CAP_PEBS_FORMAT) < 4)
+ perf_cap &= ~PERF_CAP_PEBS_BASELINE;
+ }
+
+ return perf_cap;
}
static inline u64 vmx_supported_debugctl(void)
@@ -417,4 +436,10 @@ static inline u64 vmx_supported_debugctl(void)
return debugctl;
}
+static inline bool cpu_has_notify_vmexit(void)
+{
+ return vmcs_config.cpu_based_2nd_exec_ctrl &
+ SECONDARY_EXEC_NOTIFY_VM_EXITING;
+}
+
#endif /* __KVM_X86_VMX_CAPS_H */
diff --git a/arch/x86/kvm/vmx/evmcs.c b/arch/x86/kvm/vmx/evmcs.c
index 87e3dc10edf4..6a61b1ae7942 100644
--- a/arch/x86/kvm/vmx/evmcs.c
+++ b/arch/x86/kvm/vmx/evmcs.c
@@ -297,8 +297,10 @@ const unsigned int nr_evmcs_1_fields = ARRAY_SIZE(vmcs_field_to_evmcs_1);
#if IS_ENABLED(CONFIG_HYPERV)
__init void evmcs_sanitize_exec_ctrls(struct vmcs_config *vmcs_conf)
{
+ vmcs_conf->cpu_based_exec_ctrl &= ~EVMCS1_UNSUPPORTED_EXEC_CTRL;
vmcs_conf->pin_based_exec_ctrl &= ~EVMCS1_UNSUPPORTED_PINCTRL;
vmcs_conf->cpu_based_2nd_exec_ctrl &= ~EVMCS1_UNSUPPORTED_2NDEXEC;
+ vmcs_conf->cpu_based_3rd_exec_ctrl = 0;
vmcs_conf->vmexit_ctrl &= ~EVMCS1_UNSUPPORTED_VMEXIT_CTRL;
vmcs_conf->vmentry_ctrl &= ~EVMCS1_UNSUPPORTED_VMENTRY_CTRL;
diff --git a/arch/x86/kvm/vmx/evmcs.h b/arch/x86/kvm/vmx/evmcs.h
index 8d70f9aea94b..f886a8ff0342 100644
--- a/arch/x86/kvm/vmx/evmcs.h
+++ b/arch/x86/kvm/vmx/evmcs.h
@@ -50,6 +50,7 @@ DECLARE_STATIC_KEY_FALSE(enable_evmcs);
*/
#define EVMCS1_UNSUPPORTED_PINCTRL (PIN_BASED_POSTED_INTR | \
PIN_BASED_VMX_PREEMPTION_TIMER)
+#define EVMCS1_UNSUPPORTED_EXEC_CTRL (CPU_BASED_ACTIVATE_TERTIARY_CONTROLS)
#define EVMCS1_UNSUPPORTED_2NDEXEC \
(SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | \
SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | \
diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index f5cb18e00e78..7d8cd0ebcc75 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -2133,6 +2133,8 @@ static u64 nested_vmx_calc_efer(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
static void prepare_vmcs02_constant_state(struct vcpu_vmx *vmx)
{
+ struct kvm *kvm = vmx->vcpu.kvm;
+
/*
* If vmcs02 hasn't been initialized, set the constant vmcs02 state
* according to L0's settings (vmcs12 is irrelevant here). Host
@@ -2175,6 +2177,9 @@ static void prepare_vmcs02_constant_state(struct vcpu_vmx *vmx)
if (cpu_has_vmx_encls_vmexit())
vmcs_write64(ENCLS_EXITING_BITMAP, INVALID_GPA);
+ if (kvm_notify_vmexit_enabled(kvm))
+ vmcs_write32(NOTIFY_WINDOW, kvm->arch.notify_window);
+
/*
* Set the MSR load/store lists to match L0's settings. Only the
* addresses are constant (for vmcs02), the counts can change based
@@ -2548,7 +2553,7 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
vmx_get_l2_tsc_multiplier(vcpu));
vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset);
- if (kvm_has_tsc_control)
+ if (kvm_caps.has_tsc_control)
vmcs_write64(TSC_MULTIPLIER, vcpu->arch.tsc_scaling_ratio);
nested_vmx_transition_tlb_flush(vcpu, vmcs12, true);
@@ -4610,7 +4615,7 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason,
vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr);
vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr);
vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset);
- if (kvm_has_tsc_control)
+ if (kvm_caps.has_tsc_control)
vmcs_write64(TSC_MULTIPLIER, vcpu->arch.tsc_scaling_ratio);
if (vmx->nested.l1_tpr_threshold != -1)
@@ -6112,6 +6117,9 @@ static bool nested_vmx_l1_wants_exit(struct kvm_vcpu *vcpu,
SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE);
case EXIT_REASON_ENCLS:
return nested_vmx_exit_handled_encls(vcpu, vmcs12);
+ case EXIT_REASON_NOTIFY:
+ /* Notify VM exit is not exposed to L1 */
+ return false;
default:
return true;
}
diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c
index 37e9eb32e3d9..422f0a6562ac 100644
--- a/arch/x86/kvm/vmx/pmu_intel.c
+++ b/arch/x86/kvm/vmx/pmu_intel.c
@@ -37,23 +37,35 @@ static int fixed_pmc_events[] = {1, 0, 7};
static void reprogram_fixed_counters(struct kvm_pmu *pmu, u64 data)
{
+ struct kvm_pmc *pmc;
+ u8 old_fixed_ctr_ctrl = pmu->fixed_ctr_ctrl;
int i;
+ pmu->fixed_ctr_ctrl = data;
for (i = 0; i < pmu->nr_arch_fixed_counters; i++) {
u8 new_ctrl = fixed_ctrl_field(data, i);
- u8 old_ctrl = fixed_ctrl_field(pmu->fixed_ctr_ctrl, i);
- struct kvm_pmc *pmc;
-
- pmc = get_fixed_pmc(pmu, MSR_CORE_PERF_FIXED_CTR0 + i);
+ u8 old_ctrl = fixed_ctrl_field(old_fixed_ctr_ctrl, i);
if (old_ctrl == new_ctrl)
continue;
+ pmc = get_fixed_pmc(pmu, MSR_CORE_PERF_FIXED_CTR0 + i);
+
__set_bit(INTEL_PMC_IDX_FIXED + i, pmu->pmc_in_use);
- reprogram_fixed_counter(pmc, new_ctrl, i);
+ reprogram_counter(pmc);
}
+}
- pmu->fixed_ctr_ctrl = data;
+static struct kvm_pmc *intel_pmc_idx_to_pmc(struct kvm_pmu *pmu, int pmc_idx)
+{
+ if (pmc_idx < INTEL_PMC_IDX_FIXED) {
+ return get_gp_pmc(pmu, MSR_P6_EVNTSEL0 + pmc_idx,
+ MSR_P6_EVNTSEL0);
+ } else {
+ u32 idx = pmc_idx - INTEL_PMC_IDX_FIXED;
+
+ return get_fixed_pmc(pmu, idx + MSR_CORE_PERF_FIXED_CTR0);
+ }
}
/* function is called when global control register has been updated. */
@@ -61,14 +73,18 @@ static void global_ctrl_changed(struct kvm_pmu *pmu, u64 data)
{
int bit;
u64 diff = pmu->global_ctrl ^ data;
+ struct kvm_pmc *pmc;
pmu->global_ctrl = data;
- for_each_set_bit(bit, (unsigned long *)&diff, X86_PMC_IDX_MAX)
- reprogram_counter(pmu, bit);
+ for_each_set_bit(bit, (unsigned long *)&diff, X86_PMC_IDX_MAX) {
+ pmc = intel_pmc_idx_to_pmc(pmu, bit);
+ if (pmc)
+ reprogram_counter(pmc);
+ }
}
-static unsigned int intel_pmc_perf_hw_id(struct kvm_pmc *pmc)
+static bool intel_hw_event_available(struct kvm_pmc *pmc)
{
struct kvm_pmu *pmu = pmc_to_pmu(pmc);
u8 event_select = pmc->eventsel & ARCH_PERFMON_EVENTSEL_EVENT;
@@ -82,15 +98,12 @@ static unsigned int intel_pmc_perf_hw_id(struct kvm_pmc *pmc)
/* disable event that reported as not present by cpuid */
if ((i < 7) && !(pmu->available_event_types & (1 << i)))
- return PERF_COUNT_HW_MAX + 1;
+ return false;
break;
}
- if (i == ARRAY_SIZE(intel_arch_events))
- return PERF_COUNT_HW_MAX;
-
- return intel_arch_events[i].event_type;
+ return true;
}
/* check if a PMC is enabled by comparing it with globl_ctrl bits. */
@@ -98,19 +111,10 @@ static bool intel_pmc_is_enabled(struct kvm_pmc *pmc)
{
struct kvm_pmu *pmu = pmc_to_pmu(pmc);
- return test_bit(pmc->idx, (unsigned long *)&pmu->global_ctrl);
-}
-
-static struct kvm_pmc *intel_pmc_idx_to_pmc(struct kvm_pmu *pmu, int pmc_idx)
-{
- if (pmc_idx < INTEL_PMC_IDX_FIXED)
- return get_gp_pmc(pmu, MSR_P6_EVNTSEL0 + pmc_idx,
- MSR_P6_EVNTSEL0);
- else {
- u32 idx = pmc_idx - INTEL_PMC_IDX_FIXED;
+ if (pmu->version < 2)
+ return true;
- return get_fixed_pmc(pmu, idx + MSR_CORE_PERF_FIXED_CTR0);
- }
+ return test_bit(pmc->idx, (unsigned long *)&pmu->global_ctrl);
}
static bool intel_is_valid_rdpmc_ecx(struct kvm_vcpu *vcpu, unsigned int idx)
@@ -167,16 +171,6 @@ static inline struct kvm_pmc *get_fw_gp_pmc(struct kvm_pmu *pmu, u32 msr)
return get_gp_pmc(pmu, msr, MSR_IA32_PMC0);
}
-bool intel_pmu_lbr_is_compatible(struct kvm_vcpu *vcpu)
-{
- /*
- * As a first step, a guest could only enable LBR feature if its
- * cpu model is the same as the host because the LBR registers
- * would be pass-through to the guest and they're model specific.
- */
- return boot_cpu_data.x86_model == guest_cpuid_model(vcpu);
-}
-
bool intel_pmu_lbr_is_enabled(struct kvm_vcpu *vcpu)
{
struct x86_pmu_lbr *lbr = vcpu_to_lbr_records(vcpu);
@@ -202,27 +196,45 @@ static bool intel_pmu_is_valid_lbr_msr(struct kvm_vcpu *vcpu, u32 index)
return ret;
}
-static bool intel_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr)
+static bool intel_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr, bool host_initiated)
{
struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
- int ret;
+ u64 perf_capabilities = vcpu->arch.perf_capabilities;
switch (msr) {
case MSR_CORE_PERF_FIXED_CTR_CTRL:
case MSR_CORE_PERF_GLOBAL_STATUS:
case MSR_CORE_PERF_GLOBAL_CTRL:
case MSR_CORE_PERF_GLOBAL_OVF_CTRL:
- ret = pmu->version > 1;
+ if (host_initiated)
+ return true;
+ return pmu->version > 1;
+ break;
+ case MSR_IA32_PEBS_ENABLE:
+ if (host_initiated)
+ return true;
+ return perf_capabilities & PERF_CAP_PEBS_FORMAT;
+ break;
+ case MSR_IA32_DS_AREA:
+ if (host_initiated)
+ return true;
+ return guest_cpuid_has(vcpu, X86_FEATURE_DS);
+ break;
+ case MSR_PEBS_DATA_CFG:
+ if (host_initiated)
+ return true;
+ return (perf_capabilities & PERF_CAP_PEBS_BASELINE) &&
+ ((perf_capabilities & PERF_CAP_PEBS_FORMAT) > 3);
break;
default:
- ret = get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0) ||
+ if (host_initiated)
+ return true;
+ return get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0) ||
get_gp_pmc(pmu, msr, MSR_P6_EVNTSEL0) ||
get_fixed_pmc(pmu, msr) || get_fw_gp_pmc(pmu, msr) ||
intel_pmu_is_valid_lbr_msr(vcpu, msr);
break;
}
-
- return ret;
}
static struct kvm_pmc *intel_msr_idx_to_pmc(struct kvm_vcpu *vcpu, u32 msr)
@@ -361,6 +373,15 @@ static int intel_pmu_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
case MSR_CORE_PERF_GLOBAL_OVF_CTRL:
msr_info->data = 0;
return 0;
+ case MSR_IA32_PEBS_ENABLE:
+ msr_info->data = pmu->pebs_enable;
+ return 0;
+ case MSR_IA32_DS_AREA:
+ msr_info->data = pmu->ds_area;
+ return 0;
+ case MSR_PEBS_DATA_CFG:
+ msr_info->data = pmu->pebs_data_cfg;
+ return 0;
default:
if ((pmc = get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0)) ||
(pmc = get_gp_pmc(pmu, msr, MSR_IA32_PMC0))) {
@@ -395,7 +416,7 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
case MSR_CORE_PERF_FIXED_CTR_CTRL:
if (pmu->fixed_ctr_ctrl == data)
return 0;
- if (!(data & 0xfffffffffffff444ull)) {
+ if (!(data & pmu->fixed_ctr_ctrl_mask)) {
reprogram_fixed_counters(pmu, data);
return 0;
}
@@ -421,6 +442,29 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
return 0;
}
break;
+ case MSR_IA32_PEBS_ENABLE:
+ if (pmu->pebs_enable == data)
+ return 0;
+ if (!(data & pmu->pebs_enable_mask)) {
+ pmu->pebs_enable = data;
+ return 0;
+ }
+ break;
+ case MSR_IA32_DS_AREA:
+ if (msr_info->host_initiated && data && !guest_cpuid_has(vcpu, X86_FEATURE_DS))
+ return 1;
+ if (is_noncanonical_address(data, vcpu))
+ return 1;
+ pmu->ds_area = data;
+ return 0;
+ case MSR_PEBS_DATA_CFG:
+ if (pmu->pebs_data_cfg == data)
+ return 0;
+ if (!(data & pmu->pebs_data_cfg_mask)) {
+ pmu->pebs_data_cfg = data;
+ return 0;
+ }
+ break;
default:
if ((pmc = get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0)) ||
(pmc = get_gp_pmc(pmu, msr, MSR_IA32_PMC0))) {
@@ -445,7 +489,8 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
(pmu->raw_event_mask & HSW_IN_TX_CHECKPOINTED))
reserved_bits ^= HSW_IN_TX_CHECKPOINTED;
if (!(data & reserved_bits)) {
- reprogram_gp_counter(pmc, data);
+ pmc->eventsel = data;
+ reprogram_counter(pmc);
return 0;
}
} else if (intel_pmu_handle_lbr_msrs_access(vcpu, msr_info, false))
@@ -474,11 +519,11 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu)
{
struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
struct lbr_desc *lbr_desc = vcpu_to_lbr_desc(vcpu);
-
- struct x86_pmu_capability x86_pmu;
struct kvm_cpuid_entry2 *entry;
union cpuid10_eax eax;
union cpuid10_edx edx;
+ u64 counter_mask;
+ int i;
pmu->nr_arch_gp_counters = 0;
pmu->nr_arch_fixed_counters = 0;
@@ -487,6 +532,11 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu)
pmu->version = 0;
pmu->reserved_bits = 0xffffffff00200000ull;
pmu->raw_event_mask = X86_RAW_EVENT_MASK;
+ pmu->fixed_ctr_ctrl_mask = ~0ull;
+ pmu->pebs_enable_mask = ~0ull;
+ pmu->pebs_data_cfg_mask = ~0ull;
+
+ vcpu->arch.ia32_misc_enable_msr |= MSR_IA32_MISC_ENABLE_PMU_RO_MASK;
entry = kvm_find_cpuid_entry(vcpu, 0xa, 0);
if (!entry || !vcpu->kvm->arch.enable_pmu)
@@ -498,13 +548,15 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu)
if (!pmu->version)
return;
- perf_get_x86_pmu_capability(&x86_pmu);
+ vcpu->arch.ia32_misc_enable_msr |= MSR_IA32_MISC_ENABLE_EMON;
pmu->nr_arch_gp_counters = min_t(int, eax.split.num_counters,
- x86_pmu.num_counters_gp);
- eax.split.bit_width = min_t(int, eax.split.bit_width, x86_pmu.bit_width_gp);
+ kvm_pmu_cap.num_counters_gp);
+ eax.split.bit_width = min_t(int, eax.split.bit_width,
+ kvm_pmu_cap.bit_width_gp);
pmu->counter_bitmask[KVM_PMC_GP] = ((u64)1 << eax.split.bit_width) - 1;
- eax.split.mask_length = min_t(int, eax.split.mask_length, x86_pmu.events_mask_len);
+ eax.split.mask_length = min_t(int, eax.split.mask_length,
+ kvm_pmu_cap.events_mask_len);
pmu->available_event_types = ~entry->ebx &
((1ull << eax.split.mask_length) - 1);
@@ -514,17 +566,19 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu)
pmu->nr_arch_fixed_counters =
min3(ARRAY_SIZE(fixed_pmc_events),
(size_t) edx.split.num_counters_fixed,
- (size_t) x86_pmu.num_counters_fixed);
- edx.split.bit_width_fixed = min_t(int,
- edx.split.bit_width_fixed, x86_pmu.bit_width_fixed);
+ (size_t)kvm_pmu_cap.num_counters_fixed);
+ edx.split.bit_width_fixed = min_t(int, edx.split.bit_width_fixed,
+ kvm_pmu_cap.bit_width_fixed);
pmu->counter_bitmask[KVM_PMC_FIXED] =
((u64)1 << edx.split.bit_width_fixed) - 1;
setup_fixed_pmc_eventsel(pmu);
}
- pmu->global_ctrl = ((1ull << pmu->nr_arch_gp_counters) - 1) |
- (((1ull << pmu->nr_arch_fixed_counters) - 1) << INTEL_PMC_IDX_FIXED);
- pmu->global_ctrl_mask = ~pmu->global_ctrl;
+ for (i = 0; i < pmu->nr_arch_fixed_counters; i++)
+ pmu->fixed_ctr_ctrl_mask &= ~(0xbull << (i * 4));
+ counter_mask = ~(((1ull << pmu->nr_arch_gp_counters) - 1) |
+ (((1ull << pmu->nr_arch_fixed_counters) - 1) << INTEL_PMC_IDX_FIXED));
+ pmu->global_ctrl_mask = counter_mask;
pmu->global_ovf_ctrl_mask = pmu->global_ctrl_mask
& ~(MSR_CORE_PERF_GLOBAL_OVF_CTRL_OVF_BUF |
MSR_CORE_PERF_GLOBAL_OVF_CTRL_COND_CHGD);
@@ -546,15 +600,33 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu)
INTEL_PMC_MAX_GENERIC, pmu->nr_arch_fixed_counters);
nested_vmx_pmu_refresh(vcpu,
- intel_is_valid_msr(vcpu, MSR_CORE_PERF_GLOBAL_CTRL));
+ intel_is_valid_msr(vcpu, MSR_CORE_PERF_GLOBAL_CTRL, false));
- if (intel_pmu_lbr_is_compatible(vcpu))
+ if (cpuid_model_is_consistent(vcpu))
x86_perf_get_lbr(&lbr_desc->records);
else
lbr_desc->records.nr = 0;
if (lbr_desc->records.nr)
bitmap_set(pmu->all_valid_pmc_idx, INTEL_PMC_IDX_FIXED_VLBR, 1);
+
+ if (vcpu->arch.perf_capabilities & PERF_CAP_PEBS_FORMAT) {
+ vcpu->arch.ia32_misc_enable_msr &= ~MSR_IA32_MISC_ENABLE_PEBS_UNAVAIL;
+ if (vcpu->arch.perf_capabilities & PERF_CAP_PEBS_BASELINE) {
+ pmu->pebs_enable_mask = counter_mask;
+ pmu->reserved_bits &= ~ICL_EVENTSEL_ADAPTIVE;
+ for (i = 0; i < pmu->nr_arch_fixed_counters; i++) {
+ pmu->fixed_ctr_ctrl_mask &=
+ ~(1ULL << (INTEL_PMC_IDX_FIXED + i * 4));
+ }
+ pmu->pebs_data_cfg_mask = ~0xff00000full;
+ } else {
+ pmu->pebs_enable_mask =
+ ~((1ull << pmu->nr_arch_gp_counters) - 1);
+ }
+ } else {
+ vcpu->arch.perf_capabilities &= ~PERF_CAP_PEBS_MASK;
+ }
}
static void intel_pmu_init(struct kvm_vcpu *vcpu)
@@ -719,8 +791,28 @@ static void intel_pmu_cleanup(struct kvm_vcpu *vcpu)
intel_pmu_release_guest_lbr_event(vcpu);
}
+void intel_pmu_cross_mapped_check(struct kvm_pmu *pmu)
+{
+ struct kvm_pmc *pmc = NULL;
+ int bit;
+
+ for_each_set_bit(bit, (unsigned long *)&pmu->global_ctrl,
+ X86_PMC_IDX_MAX) {
+ pmc = intel_pmc_idx_to_pmc(pmu, bit);
+
+ if (!pmc || !pmc_speculative_in_use(pmc) ||
+ !intel_pmc_is_enabled(pmc))
+ continue;
+
+ if (pmc->perf_event && pmc->idx != pmc->perf_event->hw.idx) {
+ pmu->host_cross_mapped_mask |=
+ BIT_ULL(pmc->perf_event->hw.idx);
+ }
+ }
+}
+
struct kvm_pmu_ops intel_pmu_ops __initdata = {
- .pmc_perf_hw_id = intel_pmc_perf_hw_id,
+ .hw_event_available = intel_hw_event_available,
.pmc_is_enabled = intel_pmc_is_enabled,
.pmc_idx_to_pmc = intel_pmc_idx_to_pmc,
.rdpmc_ecx_to_pmc = intel_rdpmc_ecx_to_pmc,
diff --git a/arch/x86/kvm/vmx/posted_intr.c b/arch/x86/kvm/vmx/posted_intr.c
index 07e5fcf5a5aa..237a1f40f939 100644
--- a/arch/x86/kvm/vmx/posted_intr.c
+++ b/arch/x86/kvm/vmx/posted_intr.c
@@ -177,11 +177,24 @@ static void pi_enable_wakeup_handler(struct kvm_vcpu *vcpu)
local_irq_restore(flags);
}
+static bool vmx_needs_pi_wakeup(struct kvm_vcpu *vcpu)
+{
+ /*
+ * The default posted interrupt vector does nothing when
+ * invoked outside guest mode. Return whether a blocked vCPU
+ * can be the target of posted interrupts, as is the case when
+ * using either IPI virtualization or VT-d PI, so that the
+ * notification vector is switched to the one that calls
+ * back to the pi_wakeup_handler() function.
+ */
+ return vmx_can_use_ipiv(vcpu) || vmx_can_use_vtd_pi(vcpu->kvm);
+}
+
void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu)
{
struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
- if (!vmx_can_use_vtd_pi(vcpu->kvm))
+ if (!vmx_needs_pi_wakeup(vcpu))
return;
if (kvm_vcpu_is_blocking(vcpu) && !vmx_interrupt_blocked(vcpu))
diff --git a/arch/x86/kvm/vmx/posted_intr.h b/arch/x86/kvm/vmx/posted_intr.h
index 9a45d5c9f116..26992076552e 100644
--- a/arch/x86/kvm/vmx/posted_intr.h
+++ b/arch/x86/kvm/vmx/posted_intr.h
@@ -5,6 +5,8 @@
#define POSTED_INTR_ON 0
#define POSTED_INTR_SN 1
+#define PID_TABLE_ENTRY_VALID 1
+
/* Posted-Interrupt Descriptor */
struct pi_desc {
u32 pir[8]; /* Posted interrupt requested */
diff --git a/arch/x86/kvm/vmx/vmcs.h b/arch/x86/kvm/vmx/vmcs.h
index 2b9d7a7e83f7..ac290a44a693 100644
--- a/arch/x86/kvm/vmx/vmcs.h
+++ b/arch/x86/kvm/vmx/vmcs.h
@@ -50,6 +50,7 @@ struct vmcs_controls_shadow {
u32 pin;
u32 exec;
u32 secondary_exec;
+ u64 tertiary_exec;
};
/*
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 553dd2317b9c..5e14e4c40007 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -105,6 +105,9 @@ module_param(fasteoi, bool, S_IRUGO);
module_param(enable_apicv, bool, S_IRUGO);
+bool __read_mostly enable_ipiv = true;
+module_param(enable_ipiv, bool, 0444);
+
/*
* If nested=1, nested virtualization is supported, i.e., guests may use
* VMX and be a hypervisor for its own guests. If nested=0, guests may not
@@ -116,6 +119,9 @@ module_param(nested, bool, S_IRUGO);
bool __read_mostly enable_pml = 1;
module_param_named(pml, enable_pml, bool, S_IRUGO);
+static bool __read_mostly error_on_inconsistent_vmcs_config = true;
+module_param(error_on_inconsistent_vmcs_config, bool, 0444);
+
static bool __read_mostly dump_invalid_vmcs = 0;
module_param(dump_invalid_vmcs, bool, 0644);
@@ -386,18 +392,20 @@ asmlinkage void vmread_error(unsigned long field, bool fault)
noinline void vmwrite_error(unsigned long field, unsigned long value)
{
- vmx_insn_failed("kvm: vmwrite failed: field=%lx val=%lx err=%d\n",
+ vmx_insn_failed("kvm: vmwrite failed: field=%lx val=%lx err=%u\n",
field, value, vmcs_read32(VM_INSTRUCTION_ERROR));
}
noinline void vmclear_error(struct vmcs *vmcs, u64 phys_addr)
{
- vmx_insn_failed("kvm: vmclear failed: %p/%llx\n", vmcs, phys_addr);
+ vmx_insn_failed("kvm: vmclear failed: %p/%llx err=%u\n",
+ vmcs, phys_addr, vmcs_read32(VM_INSTRUCTION_ERROR));
}
noinline void vmptrld_error(struct vmcs *vmcs, u64 phys_addr)
{
- vmx_insn_failed("kvm: vmptrld failed: %p/%llx\n", vmcs, phys_addr);
+ vmx_insn_failed("kvm: vmptrld failed: %p/%llx err=%u\n",
+ vmcs, phys_addr, vmcs_read32(VM_INSTRUCTION_ERROR));
}
noinline void invvpid_error(unsigned long ext, u16 vpid, gva_t gva)
@@ -1712,7 +1720,7 @@ u64 vmx_get_l2_tsc_multiplier(struct kvm_vcpu *vcpu)
nested_cpu_has2(vmcs12, SECONDARY_EXEC_TSC_SCALING))
return vmcs12->tsc_multiplier;
- return kvm_default_tsc_scaling_ratio;
+ return kvm_caps.default_tsc_scaling_ratio;
}
static void vmx_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
@@ -2237,7 +2245,18 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
if ((data & PMU_CAP_LBR_FMT) !=
(vmx_get_perf_capabilities() & PMU_CAP_LBR_FMT))
return 1;
- if (!intel_pmu_lbr_is_compatible(vcpu))
+ if (!cpuid_model_is_consistent(vcpu))
+ return 1;
+ }
+ if (data & PERF_CAP_PEBS_FORMAT) {
+ if ((data & PERF_CAP_PEBS_MASK) !=
+ (vmx_get_perf_capabilities() & PERF_CAP_PEBS_MASK))
+ return 1;
+ if (!guest_cpuid_has(vcpu, X86_FEATURE_DS))
+ return 1;
+ if (!guest_cpuid_has(vcpu, X86_FEATURE_DTES64))
+ return 1;
+ if (!cpuid_model_is_consistent(vcpu))
return 1;
}
ret = kvm_set_msr_common(vcpu, msr_info);
@@ -2410,6 +2429,15 @@ static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt,
return 0;
}
+static __init u64 adjust_vmx_controls64(u64 ctl_opt, u32 msr)
+{
+ u64 allowed;
+
+ rdmsrl(msr, allowed);
+
+ return ctl_opt & allowed;
+}
+
static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf,
struct vmx_capability *vmx_cap)
{
@@ -2418,8 +2446,26 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf,
u32 _pin_based_exec_control = 0;
u32 _cpu_based_exec_control = 0;
u32 _cpu_based_2nd_exec_control = 0;
+ u64 _cpu_based_3rd_exec_control = 0;
u32 _vmexit_control = 0;
u32 _vmentry_control = 0;
+ int i;
+
+ /*
+ * LOAD/SAVE_DEBUG_CONTROLS are absent because both are mandatory.
+ * SAVE_IA32_PAT and SAVE_IA32_EFER are absent because KVM always
+ * intercepts writes to PAT and EFER, i.e. never enables those controls.
+ */
+ struct {
+ u32 entry_control;
+ u32 exit_control;
+ } const vmcs_entry_exit_pairs[] = {
+ { VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL, VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL },
+ { VM_ENTRY_LOAD_IA32_PAT, VM_EXIT_LOAD_IA32_PAT },
+ { VM_ENTRY_LOAD_IA32_EFER, VM_EXIT_LOAD_IA32_EFER },
+ { VM_ENTRY_LOAD_BNDCFGS, VM_EXIT_CLEAR_BNDCFGS },
+ { VM_ENTRY_LOAD_IA32_RTIT_CTL, VM_EXIT_CLEAR_IA32_RTIT_CTL },
+ };
memset(vmcs_conf, 0, sizeof(*vmcs_conf));
min = CPU_BASED_HLT_EXITING |
@@ -2439,7 +2485,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf,
opt = CPU_BASED_TPR_SHADOW |
CPU_BASED_USE_MSR_BITMAPS |
- CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
+ CPU_BASED_ACTIVATE_SECONDARY_CONTROLS |
+ CPU_BASED_ACTIVATE_TERTIARY_CONTROLS;
if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PROCBASED_CTLS,
&_cpu_based_exec_control) < 0)
return -EIO;
@@ -2472,7 +2519,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf,
SECONDARY_EXEC_PT_USE_GPA |
SECONDARY_EXEC_PT_CONCEAL_VMX |
SECONDARY_EXEC_ENABLE_VMFUNC |
- SECONDARY_EXEC_BUS_LOCK_DETECTION;
+ SECONDARY_EXEC_BUS_LOCK_DETECTION |
+ SECONDARY_EXEC_NOTIFY_VM_EXITING;
if (cpu_has_sgx())
opt2 |= SECONDARY_EXEC_ENCLS_EXITING;
if (adjust_vmx_controls(min2, opt2,
@@ -2502,15 +2550,30 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf,
CPU_BASED_CR3_STORE_EXITING |
CPU_BASED_INVLPG_EXITING);
} else if (vmx_cap->ept) {
- vmx_cap->ept = 0;
pr_warn_once("EPT CAP should not exist if not support "
"1-setting enable EPT VM-execution control\n");
+
+ if (error_on_inconsistent_vmcs_config)
+ return -EIO;
+
+ vmx_cap->ept = 0;
}
if (!(_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_VPID) &&
- vmx_cap->vpid) {
- vmx_cap->vpid = 0;
+ vmx_cap->vpid) {
pr_warn_once("VPID CAP should not exist if not support "
"1-setting enable VPID VM-execution control\n");
+
+ if (error_on_inconsistent_vmcs_config)
+ return -EIO;
+
+ vmx_cap->vpid = 0;
+ }
+
+ if (_cpu_based_exec_control & CPU_BASED_ACTIVATE_TERTIARY_CONTROLS) {
+ u64 opt3 = TERTIARY_EXEC_IPI_VIRT;
+
+ _cpu_based_3rd_exec_control = adjust_vmx_controls64(opt3,
+ MSR_IA32_VMX_PROCBASED_CTLS3);
}
min = VM_EXIT_SAVE_DEBUG_CONTROLS | VM_EXIT_ACK_INTR_ON_EXIT;
@@ -2551,6 +2614,23 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf,
&_vmentry_control) < 0)
return -EIO;
+ for (i = 0; i < ARRAY_SIZE(vmcs_entry_exit_pairs); i++) {
+ u32 n_ctrl = vmcs_entry_exit_pairs[i].entry_control;
+ u32 x_ctrl = vmcs_entry_exit_pairs[i].exit_control;
+
+ if (!(_vmentry_control & n_ctrl) == !(_vmexit_control & x_ctrl))
+ continue;
+
+ pr_warn_once("Inconsistent VM-Entry/VM-Exit pair, entry = %x, exit = %x\n",
+ _vmentry_control & n_ctrl, _vmexit_control & x_ctrl);
+
+ if (error_on_inconsistent_vmcs_config)
+ return -EIO;
+
+ _vmentry_control &= ~n_ctrl;
+ _vmexit_control &= ~x_ctrl;
+ }
+
/*
* Some cpus support VM_{ENTRY,EXIT}_IA32_PERF_GLOBAL_CTRL but they
* can't be used due to an errata where VM Exit may incorrectly clear
@@ -2599,6 +2679,7 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf,
vmcs_conf->pin_based_exec_ctrl = _pin_based_exec_control;
vmcs_conf->cpu_based_exec_ctrl = _cpu_based_exec_control;
vmcs_conf->cpu_based_2nd_exec_ctrl = _cpu_based_2nd_exec_control;
+ vmcs_conf->cpu_based_3rd_exec_ctrl = _cpu_based_3rd_exec_control;
vmcs_conf->vmexit_ctrl = _vmexit_control;
vmcs_conf->vmentry_ctrl = _vmentry_control;
@@ -3853,6 +3934,8 @@ static void vmx_update_msr_bitmap_x2apic(struct kvm_vcpu *vcpu)
vmx_enable_intercept_for_msr(vcpu, X2APIC_MSR(APIC_TMCCT), MSR_TYPE_RW);
vmx_disable_intercept_for_msr(vcpu, X2APIC_MSR(APIC_EOI), MSR_TYPE_W);
vmx_disable_intercept_for_msr(vcpu, X2APIC_MSR(APIC_SELF_IPI), MSR_TYPE_W);
+ if (enable_ipiv)
+ vmx_disable_intercept_for_msr(vcpu, X2APIC_MSR(APIC_ICR), MSR_TYPE_RW);
}
}
@@ -4180,15 +4263,19 @@ static void vmx_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu)
}
pin_controls_set(vmx, vmx_pin_based_exec_ctrl(vmx));
- if (cpu_has_secondary_exec_ctrls()) {
- if (kvm_vcpu_apicv_active(vcpu))
- secondary_exec_controls_setbit(vmx,
- SECONDARY_EXEC_APIC_REGISTER_VIRT |
- SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
- else
- secondary_exec_controls_clearbit(vmx,
- SECONDARY_EXEC_APIC_REGISTER_VIRT |
- SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
+
+ if (kvm_vcpu_apicv_active(vcpu)) {
+ secondary_exec_controls_setbit(vmx,
+ SECONDARY_EXEC_APIC_REGISTER_VIRT |
+ SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
+ if (enable_ipiv)
+ tertiary_exec_controls_setbit(vmx, TERTIARY_EXEC_IPI_VIRT);
+ } else {
+ secondary_exec_controls_clearbit(vmx,
+ SECONDARY_EXEC_APIC_REGISTER_VIRT |
+ SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
+ if (enable_ipiv)
+ tertiary_exec_controls_clearbit(vmx, TERTIARY_EXEC_IPI_VIRT);
}
vmx_update_msr_bitmap_x2apic(vcpu);
@@ -4220,6 +4307,20 @@ static u32 vmx_exec_control(struct vcpu_vmx *vmx)
return exec_control;
}
+static u64 vmx_tertiary_exec_control(struct vcpu_vmx *vmx)
+{
+ u64 exec_control = vmcs_config.cpu_based_3rd_exec_ctrl;
+
+ /*
+ * IPI virtualization relies on APICv. Disable IPI virtualization if
+ * APICv is inhibited.
+ */
+ if (!enable_ipiv || !kvm_vcpu_apicv_active(&vmx->vcpu))
+ exec_control &= ~TERTIARY_EXEC_IPI_VIRT;
+
+ return exec_control;
+}
+
/*
* Adjust a single secondary execution control bit to intercept/allow an
* instruction in the guest. This is usually done based on whether or not a
@@ -4362,13 +4463,48 @@ static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx)
if (!vcpu->kvm->arch.bus_lock_detection_enabled)
exec_control &= ~SECONDARY_EXEC_BUS_LOCK_DETECTION;
+ if (!kvm_notify_vmexit_enabled(vcpu->kvm))
+ exec_control &= ~SECONDARY_EXEC_NOTIFY_VM_EXITING;
+
return exec_control;
}
+static inline int vmx_get_pid_table_order(struct kvm *kvm)
+{
+ return get_order(kvm->arch.max_vcpu_ids * sizeof(*to_kvm_vmx(kvm)->pid_table));
+}
+
+static int vmx_alloc_ipiv_pid_table(struct kvm *kvm)
+{
+ struct page *pages;
+ struct kvm_vmx *kvm_vmx = to_kvm_vmx(kvm);
+
+ if (!irqchip_in_kernel(kvm) || !enable_ipiv)
+ return 0;
+
+ if (kvm_vmx->pid_table)
+ return 0;
+
+ pages = alloc_pages(GFP_KERNEL | __GFP_ZERO, vmx_get_pid_table_order(kvm));
+ if (!pages)
+ return -ENOMEM;
+
+ kvm_vmx->pid_table = (void *)page_address(pages);
+ return 0;
+}
+
+static int vmx_vcpu_precreate(struct kvm *kvm)
+{
+ return vmx_alloc_ipiv_pid_table(kvm);
+}
+
#define VMX_XSS_EXIT_BITMAP 0
static void init_vmcs(struct vcpu_vmx *vmx)
{
+ struct kvm *kvm = vmx->vcpu.kvm;
+ struct kvm_vmx *kvm_vmx = to_kvm_vmx(kvm);
+
if (nested)
nested_vmx_set_vmcs_shadowing_bitmap();
@@ -4385,6 +4521,9 @@ static void init_vmcs(struct vcpu_vmx *vmx)
if (cpu_has_secondary_exec_ctrls())
secondary_exec_controls_set(vmx, vmx_secondary_exec_control(vmx));
+ if (cpu_has_tertiary_exec_ctrls())
+ tertiary_exec_controls_set(vmx, vmx_tertiary_exec_control(vmx));
+
if (enable_apicv && lapic_in_kernel(&vmx->vcpu)) {
vmcs_write64(EOI_EXIT_BITMAP0, 0);
vmcs_write64(EOI_EXIT_BITMAP1, 0);
@@ -4397,12 +4536,20 @@ static void init_vmcs(struct vcpu_vmx *vmx)
vmcs_write64(POSTED_INTR_DESC_ADDR, __pa((&vmx->pi_desc)));
}
- if (!kvm_pause_in_guest(vmx->vcpu.kvm)) {
+ if (vmx_can_use_ipiv(&vmx->vcpu)) {
+ vmcs_write64(PID_POINTER_TABLE, __pa(kvm_vmx->pid_table));
+ vmcs_write16(LAST_PID_POINTER_INDEX, kvm->arch.max_vcpu_ids - 1);
+ }
+
+ if (!kvm_pause_in_guest(kvm)) {
vmcs_write32(PLE_GAP, ple_gap);
vmx->ple_window = ple_window;
vmx->ple_window_dirty = true;
}
+ if (kvm_notify_vmexit_enabled(kvm))
+ vmcs_write32(NOTIFY_WINDOW, kvm->arch.notify_window);
+
vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, 0);
vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, 0);
vmcs_write32(CR3_TARGET_COUNT, 0); /* 22.2.1 */
@@ -4571,13 +4718,13 @@ static void vmx_enable_nmi_window(struct kvm_vcpu *vcpu)
exec_controls_setbit(to_vmx(vcpu), CPU_BASED_NMI_WINDOW_EXITING);
}
-static void vmx_inject_irq(struct kvm_vcpu *vcpu)
+static void vmx_inject_irq(struct kvm_vcpu *vcpu, bool reinjected)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
uint32_t intr;
int irq = vcpu->arch.interrupt.nr;
- trace_kvm_inj_virq(irq);
+ trace_kvm_inj_virq(irq, vcpu->arch.interrupt.soft, reinjected);
++vcpu->stat.irq_injections;
if (vmx->rmode.vm86_active) {
@@ -5689,6 +5836,32 @@ static int handle_bus_lock_vmexit(struct kvm_vcpu *vcpu)
return 1;
}
+static int handle_notify(struct kvm_vcpu *vcpu)
+{
+ unsigned long exit_qual = vmx_get_exit_qual(vcpu);
+ bool context_invalid = exit_qual & NOTIFY_VM_CONTEXT_INVALID;
+
+ ++vcpu->stat.notify_window_exits;
+
+ /*
+ * Notify VM exit happened while executing iret from NMI,
+ * "blocked by NMI" bit has to be set before next VM entry.
+ */
+ if (enable_vnmi && (exit_qual & INTR_INFO_UNBLOCK_NMI))
+ vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
+ GUEST_INTR_STATE_NMI);
+
+ if (vcpu->kvm->arch.notify_vmexit_flags & KVM_X86_NOTIFY_VMEXIT_USER ||
+ context_invalid) {
+ vcpu->run->exit_reason = KVM_EXIT_NOTIFY;
+ vcpu->run->notify.flags = context_invalid ?
+ KVM_NOTIFY_CONTEXT_INVALID : 0;
+ return 0;
+ }
+
+ return 1;
+}
+
/*
* The exit handlers return 1 if the exit was handled fully and guest execution
* may resume. Otherwise they set the kvm_run parameter to indicate what needs
@@ -5746,6 +5919,7 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
[EXIT_REASON_PREEMPTION_TIMER] = handle_preemption_timer,
[EXIT_REASON_ENCLS] = handle_encls,
[EXIT_REASON_BUS_LOCK] = handle_bus_lock_vmexit,
+ [EXIT_REASON_NOTIFY] = handle_notify,
};
static const int kvm_vmx_max_exit_handlers =
@@ -5843,6 +6017,7 @@ void dump_vmcs(struct kvm_vcpu *vcpu)
struct vcpu_vmx *vmx = to_vmx(vcpu);
u32 vmentry_ctl, vmexit_ctl;
u32 cpu_based_exec_ctrl, pin_based_exec_ctrl, secondary_exec_control;
+ u64 tertiary_exec_control;
unsigned long cr4;
int efer_slot;
@@ -5856,9 +6031,16 @@ void dump_vmcs(struct kvm_vcpu *vcpu)
cpu_based_exec_ctrl = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
pin_based_exec_ctrl = vmcs_read32(PIN_BASED_VM_EXEC_CONTROL);
cr4 = vmcs_readl(GUEST_CR4);
- secondary_exec_control = 0;
+
if (cpu_has_secondary_exec_ctrls())
secondary_exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
+ else
+ secondary_exec_control = 0;
+
+ if (cpu_has_tertiary_exec_ctrls())
+ tertiary_exec_control = vmcs_read64(TERTIARY_VM_EXEC_CONTROL);
+ else
+ tertiary_exec_control = 0;
pr_err("VMCS %p, last attempted VM-entry on CPU %d\n",
vmx->loaded_vmcs->vmcs, vcpu->arch.last_vmentry_cpu);
@@ -5958,9 +6140,10 @@ void dump_vmcs(struct kvm_vcpu *vcpu)
vmx_dump_msrs("host autoload", &vmx->msr_autoload.host);
pr_err("*** Control State ***\n");
- pr_err("PinBased=%08x CPUBased=%08x SecondaryExec=%08x\n",
- pin_based_exec_ctrl, cpu_based_exec_ctrl, secondary_exec_control);
- pr_err("EntryControls=%08x ExitControls=%08x\n", vmentry_ctl, vmexit_ctl);
+ pr_err("CPUBased=0x%08x SecondaryExec=0x%08x TertiaryExec=0x%016llx\n",
+ cpu_based_exec_ctrl, secondary_exec_control, tertiary_exec_control);
+ pr_err("PinBased=0x%08x EntryControls=%08x ExitControls=%08x\n",
+ pin_based_exec_ctrl, vmentry_ctl, vmexit_ctl);
pr_err("ExceptionBitmap=%08x PFECmask=%08x PFECmatch=%08x\n",
vmcs_read32(EXCEPTION_BITMAP),
vmcs_read32(PAGE_FAULT_ERROR_CODE_MASK),
@@ -6110,7 +6293,8 @@ static int __vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
exit_reason.basic != EXIT_REASON_EPT_VIOLATION &&
exit_reason.basic != EXIT_REASON_PML_FULL &&
exit_reason.basic != EXIT_REASON_APIC_ACCESS &&
- exit_reason.basic != EXIT_REASON_TASK_SWITCH)) {
+ exit_reason.basic != EXIT_REASON_TASK_SWITCH &&
+ exit_reason.basic != EXIT_REASON_NOTIFY)) {
int ndata = 3;
vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
@@ -6702,9 +6886,14 @@ static void atomic_switch_perf_msrs(struct vcpu_vmx *vmx)
{
int i, nr_msrs;
struct perf_guest_switch_msr *msrs;
+ struct kvm_pmu *pmu = vcpu_to_pmu(&vmx->vcpu);
+
+ pmu->host_cross_mapped_mask = 0;
+ if (pmu->pebs_enable & pmu->global_ctrl)
+ intel_pmu_cross_mapped_check(pmu);
/* Note, nr_msrs may be garbage if perf_guest_get_msrs() returns NULL. */
- msrs = perf_guest_get_msrs(&nr_msrs);
+ msrs = perf_guest_get_msrs(&nr_msrs, (void *)pmu);
if (!msrs)
return;
@@ -7080,6 +7269,10 @@ static int vmx_vcpu_create(struct kvm_vcpu *vcpu)
goto free_vmcs;
}
+ if (vmx_can_use_ipiv(vcpu))
+ WRITE_ONCE(to_kvm_vmx(vcpu->kvm)->pid_table[vcpu->vcpu_id],
+ __pa(&vmx->pi_desc) | PID_TABLE_ENTRY_VALID);
+
return 0;
free_vmcs:
@@ -7416,6 +7609,13 @@ static __init void vmx_set_cpu_caps(void)
kvm_cpu_cap_clear(X86_FEATURE_INVPCID);
if (vmx_pt_mode_is_host_guest())
kvm_cpu_cap_check_and_set(X86_FEATURE_INTEL_PT);
+ if (vmx_pebs_supported()) {
+ kvm_cpu_cap_check_and_set(X86_FEATURE_DS);
+ kvm_cpu_cap_check_and_set(X86_FEATURE_DTES64);
+ }
+
+ if (!enable_pmu)
+ kvm_cpu_cap_clear(X86_FEATURE_PDCM);
if (!enable_sgx) {
kvm_cpu_cap_clear(X86_FEATURE_SGX);
@@ -7428,7 +7628,7 @@ static __init void vmx_set_cpu_caps(void)
kvm_cpu_cap_set(X86_FEATURE_UMIP);
/* CPUID 0xD.1 */
- supported_xss = 0;
+ kvm_caps.supported_xss = 0;
if (!cpu_has_vmx_xsaves())
kvm_cpu_cap_clear(X86_FEATURE_XSAVES);
@@ -7569,9 +7769,9 @@ static int vmx_set_hv_timer(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc,
delta_tsc = 0;
/* Convert to host delta tsc if tsc scaling is enabled */
- if (vcpu->arch.l1_tsc_scaling_ratio != kvm_default_tsc_scaling_ratio &&
+ if (vcpu->arch.l1_tsc_scaling_ratio != kvm_caps.default_tsc_scaling_ratio &&
delta_tsc && u64_shl_div_u64(delta_tsc,
- kvm_tsc_scaling_ratio_frac_bits,
+ kvm_caps.tsc_scaling_ratio_frac_bits,
vcpu->arch.l1_tsc_scaling_ratio, &delta_tsc))
return -ERANGE;
@@ -7716,6 +7916,13 @@ static bool vmx_check_apicv_inhibit_reasons(enum kvm_apicv_inhibit reason)
return supported & BIT(reason);
}
+static void vmx_vm_destroy(struct kvm *kvm)
+{
+ struct kvm_vmx *kvm_vmx = to_kvm_vmx(kvm);
+
+ free_pages((unsigned long)kvm_vmx->pid_table, vmx_get_pid_table_order(kvm));
+}
+
static struct kvm_x86_ops vmx_x86_ops __initdata = {
.name = "kvm_intel",
@@ -7727,7 +7934,9 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = {
.vm_size = sizeof(struct kvm_vmx),
.vm_init = vmx_vm_init,
+ .vm_destroy = vmx_vm_destroy,
+ .vcpu_precreate = vmx_vcpu_precreate,
.vcpu_create = vmx_vcpu_create,
.vcpu_free = vmx_vcpu_free,
.vcpu_reset = vmx_vcpu_reset,
@@ -7941,8 +8150,8 @@ static __init int hardware_setup(void)
}
if (!cpu_has_vmx_mpx())
- supported_xcr0 &= ~(XFEATURE_MASK_BNDREGS |
- XFEATURE_MASK_BNDCSR);
+ kvm_caps.supported_xcr0 &= ~(XFEATURE_MASK_BNDREGS |
+ XFEATURE_MASK_BNDCSR);
if (!cpu_has_vmx_vpid() || !cpu_has_vmx_invvpid() ||
!(cpu_has_vmx_invvpid_single() || cpu_has_vmx_invvpid_global()))
@@ -8005,12 +8214,16 @@ static __init int hardware_setup(void)
if (!enable_apicv)
vmx_x86_ops.sync_pir_to_irr = NULL;
+ if (!enable_apicv || !cpu_has_vmx_ipiv())
+ enable_ipiv = false;
+
if (cpu_has_vmx_tsc_scaling())
- kvm_has_tsc_control = true;
+ kvm_caps.has_tsc_control = true;
- kvm_max_tsc_scaling_ratio = KVM_VMX_TSC_MULTIPLIER_MAX;
- kvm_tsc_scaling_ratio_frac_bits = 48;
- kvm_has_bus_lock_exit = cpu_has_vmx_bus_lock_detection();
+ kvm_caps.max_tsc_scaling_ratio = KVM_VMX_TSC_MULTIPLIER_MAX;
+ kvm_caps.tsc_scaling_ratio_frac_bits = 48;
+ kvm_caps.has_bus_lock_exit = cpu_has_vmx_bus_lock_detection();
+ kvm_caps.has_notify_vmexit = cpu_has_notify_vmexit();
set_bit(0, vmx_vpid_bitmap); /* 0 is reserved for host */
@@ -8067,11 +8280,11 @@ static __init int hardware_setup(void)
vmx_x86_ops.request_immediate_exit = __kvm_request_immediate_exit;
}
- kvm_mce_cap_supported |= MCG_LMCE_P;
+ kvm_caps.supported_mce_cap |= MCG_LMCE_P;
if (pt_mode != PT_MODE_SYSTEM && pt_mode != PT_MODE_HOST_GUEST)
return -EINVAL;
- if (!enable_ept || !cpu_has_vmx_intel_pt())
+ if (!enable_ept || !enable_pmu || !cpu_has_vmx_intel_pt())
pt_mode = PT_MODE_SYSTEM;
if (pt_mode == PT_MODE_HOST_GUEST)
vmx_init_ops.handle_intel_pt_intr = vmx_handle_intel_pt_intr;
diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
index b98c7e96697a..71bcb486e73f 100644
--- a/arch/x86/kvm/vmx/vmx.h
+++ b/arch/x86/kvm/vmx/vmx.h
@@ -94,7 +94,7 @@ union vmx_exit_reason {
#define vcpu_to_lbr_desc(vcpu) (&to_vmx(vcpu)->lbr_desc)
#define vcpu_to_lbr_records(vcpu) (&to_vmx(vcpu)->lbr_desc.records)
-bool intel_pmu_lbr_is_compatible(struct kvm_vcpu *vcpu);
+void intel_pmu_cross_mapped_check(struct kvm_pmu *pmu);
bool intel_pmu_lbr_is_enabled(struct kvm_vcpu *vcpu);
int intel_pmu_create_guest_lbr_event(struct kvm_vcpu *vcpu);
@@ -366,6 +366,8 @@ struct kvm_vmx {
unsigned int tss_addr;
bool ept_identity_pagetable_done;
gpa_t ept_identity_map_addr;
+ /* Posted Interrupt Descriptor (PID) table for IPI virtualization */
+ u64 *pid_table;
};
bool nested_vmx_allowed(struct kvm_vcpu *vcpu);
@@ -456,35 +458,36 @@ static inline u8 vmx_get_rvi(void)
return vmcs_read16(GUEST_INTR_STATUS) & 0xff;
}
-#define BUILD_CONTROLS_SHADOW(lname, uname) \
-static inline void lname##_controls_set(struct vcpu_vmx *vmx, u32 val) \
-{ \
- if (vmx->loaded_vmcs->controls_shadow.lname != val) { \
- vmcs_write32(uname, val); \
- vmx->loaded_vmcs->controls_shadow.lname = val; \
- } \
-} \
-static inline u32 __##lname##_controls_get(struct loaded_vmcs *vmcs) \
-{ \
- return vmcs->controls_shadow.lname; \
-} \
-static inline u32 lname##_controls_get(struct vcpu_vmx *vmx) \
-{ \
- return __##lname##_controls_get(vmx->loaded_vmcs); \
-} \
-static inline void lname##_controls_setbit(struct vcpu_vmx *vmx, u32 val) \
-{ \
- lname##_controls_set(vmx, lname##_controls_get(vmx) | val); \
-} \
-static inline void lname##_controls_clearbit(struct vcpu_vmx *vmx, u32 val) \
-{ \
- lname##_controls_set(vmx, lname##_controls_get(vmx) & ~val); \
+#define BUILD_CONTROLS_SHADOW(lname, uname, bits) \
+static inline void lname##_controls_set(struct vcpu_vmx *vmx, u##bits val) \
+{ \
+ if (vmx->loaded_vmcs->controls_shadow.lname != val) { \
+ vmcs_write##bits(uname, val); \
+ vmx->loaded_vmcs->controls_shadow.lname = val; \
+ } \
+} \
+static inline u##bits __##lname##_controls_get(struct loaded_vmcs *vmcs) \
+{ \
+ return vmcs->controls_shadow.lname; \
+} \
+static inline u##bits lname##_controls_get(struct vcpu_vmx *vmx) \
+{ \
+ return __##lname##_controls_get(vmx->loaded_vmcs); \
+} \
+static inline void lname##_controls_setbit(struct vcpu_vmx *vmx, u##bits val) \
+{ \
+ lname##_controls_set(vmx, lname##_controls_get(vmx) | val); \
+} \
+static inline void lname##_controls_clearbit(struct vcpu_vmx *vmx, u##bits val) \
+{ \
+ lname##_controls_set(vmx, lname##_controls_get(vmx) & ~val); \
}
-BUILD_CONTROLS_SHADOW(vm_entry, VM_ENTRY_CONTROLS)
-BUILD_CONTROLS_SHADOW(vm_exit, VM_EXIT_CONTROLS)
-BUILD_CONTROLS_SHADOW(pin, PIN_BASED_VM_EXEC_CONTROL)
-BUILD_CONTROLS_SHADOW(exec, CPU_BASED_VM_EXEC_CONTROL)
-BUILD_CONTROLS_SHADOW(secondary_exec, SECONDARY_VM_EXEC_CONTROL)
+BUILD_CONTROLS_SHADOW(vm_entry, VM_ENTRY_CONTROLS, 32)
+BUILD_CONTROLS_SHADOW(vm_exit, VM_EXIT_CONTROLS, 32)
+BUILD_CONTROLS_SHADOW(pin, PIN_BASED_VM_EXEC_CONTROL, 32)
+BUILD_CONTROLS_SHADOW(exec, CPU_BASED_VM_EXEC_CONTROL, 32)
+BUILD_CONTROLS_SHADOW(secondary_exec, SECONDARY_VM_EXEC_CONTROL, 32)
+BUILD_CONTROLS_SHADOW(tertiary_exec, TERTIARY_VM_EXEC_CONTROL, 64)
/*
* VMX_REGS_LAZY_LOAD_SET - The set of registers that will be updated in the
@@ -580,4 +583,9 @@ static inline int vmx_get_instr_info_reg2(u32 vmx_instr_info)
return (vmx_instr_info >> 28) & 0xf;
}
+static inline bool vmx_can_use_ipiv(struct kvm_vcpu *vcpu)
+{
+ return lapic_in_kernel(vcpu) && enable_ipiv;
+}
+
#endif /* __KVM_X86_VMX_H */