diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
index f171ab3d0d37cc91987c5e9929036f633d4b2d12..b299d2d57085d646972b65641991684a741ea925 100644
--- a/arch/arm64/include/asm/kvm_host.h
+++ b/arch/arm64/include/asm/kvm_host.h
@@ -407,11 +407,13 @@ struct kvm_vcpu_arch {
 	struct {
 		u64 last_steal;
 		gpa_t base;
+		u64 avg_steal;
 	} steal;
 
 	/* Guest PV sched state */
 	struct {
 		bool pv_unhalted;
+		bool preempted;
 		gpa_t base;
 	} pvsched;
 
@@ -645,12 +647,14 @@ long kvm_hypercall_pvsched_features(struct kvm_vcpu *vcpu);
 void kvm_update_pvsched_preempted(struct kvm_vcpu *vcpu, u32 preempted);
 long kvm_pvsched_kick_vcpu(struct kvm_vcpu *vcpu);
 
+extern bool pv_preempted_enable;
 static inline void kvm_arm_pvsched_vcpu_init(struct kvm_vcpu_arch *vcpu_arch)
 {
 	vcpu_arch->pvsched.base = GPA_INVALID;
+	vcpu_arch->pvsched.preempted = false;
 }
 
-static inline bool kvm_arm_is_pvsched_enabled(struct kvm_vcpu_arch *vcpu_arch)
+static inline bool kvm_arm_is_pvsched_valid(struct kvm_vcpu_arch *vcpu_arch)
 {
 	return (vcpu_arch->pvsched.base != GPA_INVALID);
 }
diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c
index 240edaa9eb50b955e9c322c0f5035d0dc049dd41..c37f6238c901f75c9532a047636087412859267f 100644
--- a/arch/arm64/kvm/arm.c
+++ b/arch/arm64/kvm/arm.c
@@ -84,6 +84,15 @@ unsigned int twedel = 0;
 module_param(twedel, uint, S_IRUGO | S_IWUSR);
 #endif
 
+static const struct kernel_param_ops pv_preempted_enable_ops = {
+	.set = param_set_bool,
+	.get = param_get_bool,
+};
+
+bool pv_preempted_enable = true;
+MODULE_PARM_DESC(pv_preempted_enable, "Enable PV preempted state updates");
+module_param_cb(pv_preempted_enable, &pv_preempted_enable_ops, &pv_preempted_enable, 0644);
+
 static int vcpu_req_reload_wfi_traps(const char *val, const struct kernel_param *kp);
 
 static const struct kernel_param_ops force_wfi_trap_ops = {
@@ -94,6 +103,13 @@ static const struct kernel_param_ops force_wfi_trap_ops = {
 bool force_wfi_trap;
 module_param_cb(force_wfi_trap, &force_wfi_trap_ops, &force_wfi_trap, 0644);
 
+/*
+ * Set guest_steal_time_thresh to 0 to effectively disable this feature.
+ * The default of 1024 has worked well in real workloads.
+ */
+static unsigned long __read_mostly guest_steal_time_thresh = 1024;
+module_param(guest_steal_time_thresh, ulong, 0644);
+
 static int vcpu_req_reload_wfi_traps(const char *val, const struct kernel_param *kp)
 {
 	struct kvm *kvm;
@@ -575,8 +591,20 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 	if (vcpu_has_ptrauth(vcpu))
 		vcpu_ptrauth_disable(vcpu);
 
-	if (kvm_arm_is_pvsched_enabled(&vcpu->arch))
-		kvm_update_pvsched_preempted(vcpu, 0);
+	/*
+	 * When pv_preempted_enable is switched from enabled to disabled, the
+	 * preempted state is no longer updated in kvm_arch_vcpu_put/load.
+	 * Clear it here for every vCPU so that no vCPU is left permanently
+	 * marked as preempted.
+	 */
+	if (kvm_arm_is_pvsched_valid(&vcpu->arch)) {
+		if (pv_preempted_enable)
+			kvm_update_pvsched_preempted(vcpu, 0);
+		else {
+			if (vcpu->arch.pvsched.preempted)
+				kvm_update_pvsched_preempted(vcpu, 0);
+		}
+	}
 
 #ifdef CONFIG_KVM_HISI_VIRT
 	kvm_hisi_dvmbm_load(vcpu);
@@ -600,8 +628,17 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
 
 	vcpu->cpu = -1;
 
-	if (kvm_arm_is_pvsched_enabled(&vcpu->arch))
-		kvm_update_pvsched_preempted(vcpu, 1);
+	if (kvm_arm_is_pvsched_valid(&vcpu->arch) && pv_preempted_enable) {
+		if (vcpu->arch.steal.avg_steal < guest_steal_time_thresh) {
+			kvm_update_pvsched_preempted(vcpu, 0);
+			trace_kvm_arm_set_vcpu_preempted(vcpu->vcpu_id,
+				vcpu->arch.steal.avg_steal, guest_steal_time_thresh, 0);
+		} else {
+			kvm_update_pvsched_preempted(vcpu, 1);
+			trace_kvm_arm_set_vcpu_preempted(vcpu->vcpu_id,
+				vcpu->arch.steal.avg_steal, guest_steal_time_thresh, 1);
+		}
+	}
 
 #ifdef CONFIG_KVM_HISI_VIRT
 	kvm_hisi_dvmbm_put(vcpu);
diff --git a/arch/arm64/kvm/pvsched.c b/arch/arm64/kvm/pvsched.c
index dc1768815467b41289a7ad5ff1231163f90896f4..9693415226d138ebd9efbf8c5f833aaad27a1675 100644
--- a/arch/arm64/kvm/pvsched.c
+++ b/arch/arm64/kvm/pvsched.c
@@ -34,6 +34,8 @@ void kvm_update_pvsched_preempted(struct kvm_vcpu *vcpu, u32 preempted)
 	srcu_read_unlock(&kvm->srcu, idx);
 
 	pagefault_enable();
+
+	vcpu->arch.pvsched.preempted = !!preempted;
 }
 
 long kvm_pvsched_kick_vcpu(struct kvm_vcpu *vcpu)
diff --git a/arch/arm64/kvm/pvtime.c b/arch/arm64/kvm/pvtime.c
index 920ac43077ad3b48023c917bdd2eb8fb925e22dc..9b6448e1b300e364e0b334971253a6a51f749cc2 100644
--- a/arch/arm64/kvm/pvtime.c
+++ b/arch/arm64/kvm/pvtime.c
@@ -19,6 +19,14 @@ void kvm_update_stolen_time(struct kvm_vcpu *vcpu)
 	u64 steal = 0;
 	int idx;
 
+	/*
+	 * Workloads change over time, so keep avg_steal as a decaying average
+	 * that weighs recent steal time more heavily than older samples.
+	 */
+	vcpu->arch.steal.avg_steal +=
+		READ_ONCE(current->sched_info.run_delay) - vcpu->arch.steal.last_steal;
+	vcpu->arch.steal.avg_steal /= 2;
+
 	if (base == GPA_INVALID)
 		return;
 
diff --git a/include/trace/events/kvm.h b/include/trace/events/kvm.h
index 167e26659a0ba9384f9f458834b2c0bd976828ab..458a2a554f4d297cbdabf3cef95f98939d4ddd73 100644
--- a/include/trace/events/kvm.h
+++ b/include/trace/events/kvm.h
@@ -504,6 +504,30 @@ TRACE_EVENT(kvm_test_age_hva,
 	TP_printk("mmu notifier test age hva: %#016lx", __entry->hva)
 );
 
+TRACE_EVENT(kvm_arm_set_vcpu_preempted,
+	TP_PROTO(unsigned int vcpu_id, u64 avg_steal, unsigned long thresh,
+		 unsigned int update_preempted_value),
+	TP_ARGS(vcpu_id, avg_steal, thresh, update_preempted_value),
+
+	TP_STRUCT__entry(
+		__field(unsigned int, vcpu_id)
+		__field(unsigned long long, avg_steal)
+		__field(unsigned long, thresh)
+		__field(unsigned int, update_preempted_value)
+	),
+
+	TP_fast_assign(
+		__entry->vcpu_id = vcpu_id;
+		__entry->avg_steal = avg_steal;
+		__entry->thresh = thresh;
+		__entry->update_preempted_value = update_preempted_value;
+	),
+
+	TP_printk("vcpu:%u avg steal time:%llu thresh:%lu update_preempted_value:%u",
+		  __entry->vcpu_id, __entry->avg_steal, __entry->thresh,
+		  __entry->update_preempted_value)
+);
+
 #endif /* _TRACE_KVM_MAIN_H */
 
 /* This part must be outside protection */
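Illustrative sketch (not part of the patch): the pvtime.c hunk keeps avg_steal as a decaying average, adding the latest run_delay delta and halving the sum on every update, and kvm_arch_vcpu_put() compares that average against guest_steal_time_thresh before reporting the vCPU as preempted. The small userspace program below mimics that arithmetic; the delta values and the GUEST_STEAL_TIME_THRESH constant are invented for illustration and only mirror the module parameter's default of 1024.

/*
 * Userspace sketch (not kernel code) of the decaying steal-time average:
 * each round adds the new steal-time delta and halves the running sum, so
 * older samples lose half their weight per update.  Deltas are made up.
 */
#include <stdio.h>

#define GUEST_STEAL_TIME_THRESH 1024ULL	/* mirrors guest_steal_time_thresh default */

int main(void)
{
	unsigned long long avg_steal = 0;
	/* hypothetical per-interval steal-time deltas */
	unsigned long long deltas[] = { 0, 4096, 4096, 512, 0, 0, 0 };
	unsigned int i;

	for (i = 0; i < sizeof(deltas) / sizeof(deltas[0]); i++) {
		avg_steal += deltas[i];	/* same update as kvm_update_stolen_time() */
		avg_steal /= 2;

		/* same comparison as kvm_arch_vcpu_put() before updating preempted */
		printf("step %u: avg_steal=%llu -> report preempted=%d\n",
		       i, avg_steal,
		       avg_steal < GUEST_STEAL_TIME_THRESH ? 0 : 1);
	}
	return 0;
}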