diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
index f171ab3d0d37cc91987c5e9929036f633d4b2d12..b299d2d57085d646972b65641991684a741ea925 100644
--- a/arch/arm64/include/asm/kvm_host.h
+++ b/arch/arm64/include/asm/kvm_host.h
@@ -407,11 +407,13 @@ struct kvm_vcpu_arch {
 	struct {
 		u64 last_steal;
 		gpa_t base;
+		u64 avg_steal;
 	} steal;
 
 	/* Guest PV sched state */
 	struct {
 		bool pv_unhalted;
+		bool preempted;
 		gpa_t base;
 	} pvsched;
 
@@ -645,12 +647,14 @@ long kvm_hypercall_pvsched_features(struct kvm_vcpu *vcpu);
 void kvm_update_pvsched_preempted(struct kvm_vcpu *vcpu, u32 preempted);
 long kvm_pvsched_kick_vcpu(struct kvm_vcpu *vcpu);
 
+extern bool pv_preempted_enable;
 static inline void kvm_arm_pvsched_vcpu_init(struct kvm_vcpu_arch *vcpu_arch)
 {
 	vcpu_arch->pvsched.base = GPA_INVALID;
+	vcpu_arch->pvsched.preempted = false;
 }
 
-static inline bool kvm_arm_is_pvsched_enabled(struct kvm_vcpu_arch *vcpu_arch)
+static inline bool kvm_arm_is_pvsched_valid(struct kvm_vcpu_arch *vcpu_arch)
 {
 	return (vcpu_arch->pvsched.base != GPA_INVALID);
 }
diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c
index 240edaa9eb50b955e9c322c0f5035d0dc049dd41..c37f6238c901f75c9532a047636087412859267f 100644
--- a/arch/arm64/kvm/arm.c
+++ b/arch/arm64/kvm/arm.c
@@ -84,6 +84,15 @@ unsigned int twedel = 0;
 module_param(twedel, uint, S_IRUGO | S_IWUSR);
 #endif
 
+static const struct kernel_param_ops pv_preempted_enable_ops = {
+	.set = param_set_bool,
+	.get = param_get_bool,
+};
+
+bool pv_preempted_enable = true;
+MODULE_PARM_DESC(pv_preempted_enable, "Enable PV preempted state updates");
+module_param_cb(pv_preempted_enable, &pv_preempted_enable_ops, &pv_preempted_enable, 0644);
+
 static int vcpu_req_reload_wfi_traps(const char *val, const struct kernel_param *kp);
 
 static const struct kernel_param_ops force_wfi_trap_ops = {
@@ -94,6 +103,13 @@ static const struct kernel_param_ops force_wfi_trap_ops = {
 bool force_wfi_trap;
 module_param_cb(force_wfi_trap, &force_wfi_trap_ops, &force_wfi_trap, 0644);
 
+/*
+ * Set guest_steal_time_thresh to 0 to effectively disable this feature.
+ * The default of 1024 has worked well in real workloads.
+ */
+static unsigned long __read_mostly guest_steal_time_thresh = 1024;
+module_param(guest_steal_time_thresh, ulong, 0644);
+
 static int vcpu_req_reload_wfi_traps(const char *val, const struct kernel_param *kp)
 {
 	struct kvm *kvm;
@@ -575,8 +591,20 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 	if (vcpu_has_ptrauth(vcpu))
 		vcpu_ptrauth_disable(vcpu);
 
-	if (kvm_arm_is_pvsched_enabled(&vcpu->arch))
-		kvm_update_pvsched_preempted(vcpu, 0);
+	/*
+	 * When pv_preempted_enable is switched from enabled to disabled, the
+	 * preempted state is no longer updated in kvm_arch_vcpu_put/load.
+	 * Clear it here for every vCPU so that no vCPU is left permanently
+	 * marked as preempted.
+	 */
+	if (kvm_arm_is_pvsched_valid(&vcpu->arch)) {
+		if (pv_preempted_enable)
+			kvm_update_pvsched_preempted(vcpu, 0);
+		else {
+			if (vcpu->arch.pvsched.preempted)
+				kvm_update_pvsched_preempted(vcpu, 0);
+		}
+	}
 
 #ifdef CONFIG_KVM_HISI_VIRT
 	kvm_hisi_dvmbm_load(vcpu);
@@ -600,8 +628,17 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
 
 	vcpu->cpu = -1;
 
-	if (kvm_arm_is_pvsched_enabled(&vcpu->arch))
-		kvm_update_pvsched_preempted(vcpu, 1);
+	if (kvm_arm_is_pvsched_valid(&vcpu->arch) && pv_preempted_enable) {
+		if (vcpu->arch.steal.avg_steal < guest_steal_time_thresh) {
+			kvm_update_pvsched_preempted(vcpu, 0);
+			trace_kvm_arm_set_vcpu_preempted(vcpu->vcpu_id,
+				vcpu->arch.steal.avg_steal, guest_steal_time_thresh, 0);
+		} else {
+			kvm_update_pvsched_preempted(vcpu, 1);
+			trace_kvm_arm_set_vcpu_preempted(vcpu->vcpu_id,
+				vcpu->arch.steal.avg_steal, guest_steal_time_thresh, 1);
+		}
+	}
 
 #ifdef CONFIG_KVM_HISI_VIRT
 	kvm_hisi_dvmbm_put(vcpu);
diff --git a/arch/arm64/kvm/pvsched.c b/arch/arm64/kvm/pvsched.c
index dc1768815467b41289a7ad5ff1231163f90896f4..9693415226d138ebd9efbf8c5f833aaad27a1675 100644
--- a/arch/arm64/kvm/pvsched.c
+++ b/arch/arm64/kvm/pvsched.c
@@ -34,6 +34,8 @@ void kvm_update_pvsched_preempted(struct kvm_vcpu *vcpu, u32 preempted)
 	srcu_read_unlock(&kvm->srcu, idx);
 
 	pagefault_enable();
+
+	vcpu->arch.pvsched.preempted = !!preempted;
 }
 
 long kvm_pvsched_kick_vcpu(struct kvm_vcpu *vcpu)
diff --git a/arch/arm64/kvm/pvtime.c b/arch/arm64/kvm/pvtime.c
index 920ac43077ad3b48023c917bdd2eb8fb925e22dc..9b6448e1b300e364e0b334971253a6a51f749cc2 100644
--- a/arch/arm64/kvm/pvtime.c
+++ b/arch/arm64/kvm/pvtime.c
@@ -19,6 +19,14 @@ void kvm_update_stolen_time(struct kvm_vcpu *vcpu)
 	u64 steal = 0;
 	int idx;
 
+	/*
+	 * Workloads change over time, so keep avg_steal as a decaying average
+	 * that weighs recent steal time more heavily than older samples.
+	 */
+	vcpu->arch.steal.avg_steal +=
+		READ_ONCE(current->sched_info.run_delay) - vcpu->arch.steal.last_steal;
+	vcpu->arch.steal.avg_steal /= 2;
+
 	if (base == GPA_INVALID)
 		return;
 
diff --git a/include/trace/events/kvm.h b/include/trace/events/kvm.h
index 167e26659a0ba9384f9f458834b2c0bd976828ab..458a2a554f4d297cbdabf3cef95f98939d4ddd73 100644
--- a/include/trace/events/kvm.h
+++ b/include/trace/events/kvm.h
@@ -504,6 +504,30 @@ TRACE_EVENT(kvm_test_age_hva,
 	TP_printk("mmu notifier test age hva: %#016lx", __entry->hva)
 );
 
+TRACE_EVENT(kvm_arm_set_vcpu_preempted,
+	TP_PROTO(unsigned int vcpu_id, u64 avg_steal, unsigned long thresh,
+		 unsigned int update_preempted_value),
+	TP_ARGS(vcpu_id, avg_steal, thresh, update_preempted_value),
+
+	TP_STRUCT__entry(
+		__field(unsigned int, vcpu_id)
+		__field(unsigned long long, avg_steal)
+		__field(unsigned long, thresh)
+		__field(unsigned int, update_preempted_value)
+	),
+
+	TP_fast_assign(
+		__entry->vcpu_id = vcpu_id;
+		__entry->avg_steal = avg_steal;
+		__entry->thresh = thresh;
+		__entry->update_preempted_value = update_preempted_value;
+	),
+
+	TP_printk("vcpu:%u avg steal time:%llu thresh:%lu update_preempted_value:%u",
+		  __entry->vcpu_id, __entry->avg_steal, __entry->thresh,
+		  __entry->update_preempted_value)
+);
+
 #endif /* _TRACE_KVM_MAIN_H */
 
 /* This part must be outside protection */
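Illustrative sketch (not part of the patch): the pvtime.c hunk keeps avg_steal as a decaying average, adding the latest run_delay delta and halving the sum on every update, and kvm_arch_vcpu_put() compares that average against guest_steal_time_thresh before reporting the vCPU as preempted. The small userspace program below mimics that arithmetic; the delta values and the GUEST_STEAL_TIME_THRESH constant are invented for illustration and only mirror the module parameter's default of 1024.

/*
 * Userspace sketch (not kernel code) of the decaying steal-time average:
 * each round adds the new steal-time delta and halves the running sum, so
 * older samples lose half their weight per update.  Deltas are made up.
 */
#include <stdio.h>

#define GUEST_STEAL_TIME_THRESH 1024ULL	/* mirrors guest_steal_time_thresh default */

int main(void)
{
	unsigned long long avg_steal = 0;
	/* hypothetical per-interval steal-time deltas */
	unsigned long long deltas[] = { 0, 4096, 4096, 512, 0, 0, 0 };
	unsigned int i;

	for (i = 0; i < sizeof(deltas) / sizeof(deltas[0]); i++) {
		avg_steal += deltas[i];	/* same update as kvm_update_stolen_time() */
		avg_steal /= 2;

		/* same comparison as kvm_arch_vcpu_put() before updating preempted */
		printf("step %u: avg_steal=%llu -> report preempted=%d\n",
		       i, avg_steal,
		       avg_steal < GUEST_STEAL_TIME_THRESH ? 0 : 1);
	}
	return 0;
}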