From c15e40a796bbc4ee855d85c0578ddd4d960dd113 Mon Sep 17 00:00:00 2001 From: Bibo Mao Date: Mon, 13 Nov 2023 17:51:40 +0800 Subject: [PATCH 1/3] loongarch/kvm: Remove SW timer switch when vcpu is halt polling LoongArch inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I8I8NK ------------------------------------------ This patches removes SW timer switch during vcpu block stage. VM uses HW timer rather than SW PV timer on LoongArch system, it can check pending HW timer interrupt status directly, rather than switch to SW timer and check injected SW timer interrupt. When SW timer is not used in vcpu halt-polling mode, the relative SW timer handling before entering guest can be removed also. Timer emulation is simpler than before, SW timer emuation is only used in vcpu thread context switch. Signed-off-by: Bibo Mao --- arch/loongarch/include/asm/kvm_host.h | 3 +- arch/loongarch/kvm/emulate.c | 15 +------ arch/loongarch/kvm/kvmcpu.h | 1 - arch/loongarch/kvm/loongarch.c | 50 +++++++-------------- arch/loongarch/kvm/timer.c | 65 +++++++++------------------ 5 files changed, 41 insertions(+), 93 deletions(-) diff --git a/arch/loongarch/include/asm/kvm_host.h b/arch/loongarch/include/asm/kvm_host.h index d4af44d481c3..510d01f1b998 100644 --- a/arch/loongarch/include/asm/kvm_host.h +++ b/arch/loongarch/include/asm/kvm_host.h @@ -244,6 +244,7 @@ struct kvm_vcpu_arch { u64 perf_ctrl[4]; u64 perf_cntr[4]; + int blocking; }; static inline unsigned long readl_sw_gcsr(struct loongarch_csrs *csr, int reg) @@ -325,8 +326,6 @@ static inline void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot) {} static inline void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen) {} static inline void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu) {} -static inline void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu) {} -static inline void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu) {} static inline void kvm_arch_vcpu_block_finish(struct 
kvm_vcpu *vcpu) {} extern int kvm_enter_guest(struct kvm_run *run, struct kvm_vcpu *vcpu); diff --git a/arch/loongarch/kvm/emulate.c b/arch/loongarch/kvm/emulate.c index 01bafcb3abb3..6a68dd57ecd1 100644 --- a/arch/loongarch/kvm/emulate.c +++ b/arch/loongarch/kvm/emulate.c @@ -24,20 +24,9 @@ int _kvm_emu_idle(struct kvm_vcpu *vcpu) { ++vcpu->stat.idle_exits; trace_kvm_exit(vcpu, KVM_TRACE_EXIT_IDLE); - if (!vcpu->arch.irq_pending) { - kvm_save_timer(vcpu); - kvm_vcpu_block(vcpu); - - /* - * We we are runnable, then definitely go off to user space to - * check if any I/O interrupts are pending. - */ - if (kvm_check_request(KVM_REQ_UNHALT, vcpu)) { - kvm_clear_request(KVM_REQ_UNHALT, vcpu); - vcpu->run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN; - } - } + kvm_vcpu_block(vcpu); + kvm_clear_request(KVM_REQ_UNHALT, vcpu); return EMULATE_DONE; } diff --git a/arch/loongarch/kvm/kvmcpu.h b/arch/loongarch/kvm/kvmcpu.h index 78f83a1a8400..7bcaaa254d16 100644 --- a/arch/loongarch/kvm/kvmcpu.h +++ b/arch/loongarch/kvm/kvmcpu.h @@ -99,7 +99,6 @@ void kvm_restore_lasx_upper(struct kvm_vcpu *cpu); void kvm_lose_hw_perf(struct kvm_vcpu *vcpu); void kvm_restore_hw_perf(struct kvm_vcpu *vcpu); -void kvm_acquire_timer(struct kvm_vcpu *vcpu); void kvm_reset_timer(struct kvm_vcpu *vcpu); void kvm_init_timer(struct kvm_vcpu *vcpu, unsigned long hz); void kvm_restore_timer(struct kvm_vcpu *vcpu); diff --git a/arch/loongarch/kvm/loongarch.c b/arch/loongarch/kvm/loongarch.c index 01e28c05ac11..7a4c7fcf5bef 100644 --- a/arch/loongarch/kvm/loongarch.c +++ b/arch/loongarch/kvm/loongarch.c @@ -246,7 +246,6 @@ int kvm_arch_hardware_enable(void) */ gcfg |= KVM_GCFG_GCI_SECURE; gcfg |= KVM_GCFG_MATC_ROOT; - gcfg |= KVM_GCFG_TIT; kvm_write_csr_gcfg(gcfg); kvm_flush_tlb_all(); @@ -457,9 +456,6 @@ static int _kvm_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu) local_irq_disable(); - if (ret == RESUME_GUEST) - kvm_acquire_timer(vcpu); - if (!(ret & RESUME_HOST)) { _kvm_deliver_intr(vcpu); /* 
Only check for signals if not already exiting to userspace */ @@ -652,7 +648,6 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) smp_store_mb(vcpu->mode, IN_GUEST_MODE); cpu = smp_processor_id(); - kvm_acquire_timer(vcpu); /* Check if we have any exceptions/interrupts pending */ _kvm_deliver_intr(vcpu); @@ -698,23 +693,6 @@ int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu, return -ENOIOCTLCMD; } -/** - * kvm_migrate_count() - Migrate timer. - * @vcpu: Virtual CPU. - * - * Migrate hrtimer to the current CPU by cancelling and restarting it - * if it was running prior to being cancelled. - * - * Must be called when the VCPU is migrated to a different CPU to ensure that - * timer expiry during guest execution interrupts the guest and causes the - * interrupt to be delivered in a timely manner. - */ -static void kvm_migrate_count(struct kvm_vcpu *vcpu) -{ - if (hrtimer_cancel(&vcpu->arch.swtimer)) - hrtimer_restart(&vcpu->arch.swtimer); -} - static int _kvm_vcpu_load(struct kvm_vcpu *vcpu, int cpu) { struct kvm_context *context; @@ -822,22 +800,22 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) local_irq_save(flags); vcpu->cpu = cpu; - if (vcpu->arch.last_sched_cpu != cpu) { - kvm_debug("[%d->%d]KVM VCPU[%d] switch\n", - vcpu->arch.last_sched_cpu, cpu, vcpu->vcpu_id); - /* - * Migrate the timer interrupt to the current CPU so that it - * always interrupts the guest and synchronously triggers a - * guest timer interrupt. 
- */ - kvm_migrate_count(vcpu); - } /* restore guest state to registers */ _kvm_vcpu_load(vcpu, cpu); local_irq_restore(flags); } +void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu) +{ + vcpu->arch.blocking = 1; +} + +void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu) +{ + vcpu->arch.blocking = 0; +} + static int _kvm_vcpu_put(struct kvm_vcpu *vcpu, int cpu) { struct loongarch_csrs *csr = vcpu->arch.csr; @@ -1712,9 +1690,15 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu) { - return _kvm_pending_timer(vcpu) || + int ret; + + /* protect from TOD sync and vcpu_load/put */ + preempt_disable(); + ret = _kvm_pending_timer(vcpu) || kvm_read_hw_gcsr(KVM_CSR_ESTAT) & (1 << (KVM_INT_TIMER - KVM_INT_START)); + preempt_enable(); + return ret; } int kvm_arch_vcpu_dump_regs(struct kvm_vcpu *vcpu) diff --git a/arch/loongarch/kvm/timer.c b/arch/loongarch/kvm/timer.c index 04e156e82965..8d0ddbe56aee 100644 --- a/arch/loongarch/kvm/timer.c +++ b/arch/loongarch/kvm/timer.c @@ -97,6 +97,12 @@ void kvm_restore_timer(struct kvm_vcpu *vcpu) return; } + /* + * Freeze the soft-timer and sync the guest stable timer with it. We do + * this with interrupts disabled to avoid latency. 
+ */ + hrtimer_cancel(&vcpu->arch.swtimer); + /* * set remainder tick value if not expired */ @@ -113,8 +119,7 @@ void kvm_restore_timer(struct kvm_vcpu *vcpu) delta = 0; /* * inject timer here though sw timer should inject timer - * interrupt async already, since sw timer may be cancelled - * during injecting intr async in function kvm_acquire_timer + * interrupt async already */ _kvm_queue_irq(vcpu, LARCH_INT_TIMER); } @@ -122,31 +127,6 @@ void kvm_restore_timer(struct kvm_vcpu *vcpu) kvm_write_gcsr_timertick(delta); } -/* - * - * Restore hard timer state and enable guest to access timer registers - * without trap - * - * it is called with irq disabled - */ -void kvm_acquire_timer(struct kvm_vcpu *vcpu) -{ - unsigned long cfg; - - cfg = kvm_read_csr_gcfg(); - if (!(cfg & CSR_GCFG_TIT)) - return; - - /* enable guest access to hard timer */ - kvm_write_csr_gcfg(cfg & ~CSR_GCFG_TIT); - - /* - * Freeze the soft-timer and sync the guest stable timer with it. We do - * this with interrupts disabled to avoid latency. - */ - hrtimer_cancel(&vcpu->arch.swtimer); -} - /* * Save guest timer state and switch to software emulation of guest * timer. 
The hard timer must already be in use, so preemption should be @@ -168,14 +148,17 @@ static void _kvm_save_timer(struct kvm_vcpu *vcpu) * HRTIMER_MODE_PINNED is suggested since vcpu may run in * the same physical cpu in next time */ - hrtimer_cancel(&vcpu->arch.swtimer); hrtimer_start(&vcpu->arch.swtimer, expire, HRTIMER_MODE_ABS_PINNED); - } else + } else if (vcpu->arch.blocking) { /* - * inject timer interrupt so that hall polling can dectect - * and exit + * Inject timer interrupt so that hall polling can dectect and exit + * kvm_queue_irq is not enough, hrtimer had better be used since vcpu + * is halt-polling and scheduled out already */ - _kvm_queue_irq(vcpu, LARCH_INT_TIMER); + expire = ktime_add_ns(ktime_get(), 10); // 10ns is enough here + vcpu->arch.expire = expire; + hrtimer_start(&vcpu->arch.swtimer, expire, HRTIMER_MODE_ABS_PINNED); + } } /* @@ -185,20 +168,14 @@ static void _kvm_save_timer(struct kvm_vcpu *vcpu) void kvm_save_timer(struct kvm_vcpu *vcpu) { struct loongarch_csrs *csr = vcpu->arch.csr; - unsigned long cfg; preempt_disable(); - cfg = kvm_read_csr_gcfg(); - if (!(cfg & CSR_GCFG_TIT)) { - /* disable guest use of hard timer */ - kvm_write_csr_gcfg(cfg | CSR_GCFG_TIT); - - /* save hard timer state */ - kvm_save_hw_gcsr(csr, LOONGARCH_CSR_TCFG); - kvm_save_hw_gcsr(csr, LOONGARCH_CSR_TVAL); - if (kvm_read_sw_gcsr(csr, LOONGARCH_CSR_TCFG) & CSR_TCFG_EN) - _kvm_save_timer(vcpu); - } + + /* save hard timer state */ + kvm_save_hw_gcsr(csr, LOONGARCH_CSR_TCFG); + kvm_save_hw_gcsr(csr, LOONGARCH_CSR_TVAL); + if (kvm_read_sw_gcsr(csr, LOONGARCH_CSR_TCFG) & CSR_TCFG_EN) + _kvm_save_timer(vcpu); /* save timer-related state to vCPU context */ kvm_save_hw_gcsr(csr, LOONGARCH_CSR_ESTAT); -- Gitee From 1bddc4cd7d7185cf6ffce08f679da05bab720168 Mon Sep 17 00:00:00 2001 From: Bibo Mao Date: Thu, 16 Nov 2023 14:36:54 +0800 Subject: [PATCH 2/3] loongarch/kvm: Fix oneshot timer emulation LoongArch inclusion category: bugfix bugzilla: 
https://gitee.com/openeuler/kernel/issues/I8I3QU ------------------------------------------ When oneshot timer is fired, CSR TVAL will be -1 rather than 0. It is not remaining timer ticks for expired time. There needs special handing for this situation. With this patch, runltp with version ltp20230516 passes to run in vm. Signed-off-by: Bibo Mao --- arch/loongarch/kvm/timer.c | 58 ++++++++++++++++++++++++++++++-------- 1 file changed, 46 insertions(+), 12 deletions(-) diff --git a/arch/loongarch/kvm/timer.c b/arch/loongarch/kvm/timer.c index 8d0ddbe56aee..4e16f3bc5335 100644 --- a/arch/loongarch/kvm/timer.c +++ b/arch/loongarch/kvm/timer.c @@ -84,6 +84,7 @@ void kvm_restore_timer(struct kvm_vcpu *vcpu) struct loongarch_csrs *csr = vcpu->arch.csr; ktime_t expire, now; unsigned long cfg, delta, period; + unsigned long ticks, estat; /* * Set guest stable timer cfg csr @@ -103,20 +104,44 @@ void kvm_restore_timer(struct kvm_vcpu *vcpu) */ hrtimer_cancel(&vcpu->arch.swtimer); + /* + * From LoongArch Reference Manual Volume 1 Chapter 7.6.2 + * If oneshot timer is fired, CSR TVAL will be -1, there are two + * conditions: + * a) timer is fired during exiting to host + * b) timer is fired and vm is handling timer irq, host should not + * inject timer irq to avoid spurious timer interrupt + */ + ticks = kvm_read_sw_gcsr(csr, LOONGARCH_CSR_TVAL); + estat = kvm_read_sw_gcsr(csr, LOONGARCH_CSR_ESTAT); + if (!(cfg & CSR_TCFG_PERIOD) && (ticks > cfg)) { + /* + * Writing 0 to LOONGARCH_CSR_TVAL will inject timer irq + * and set CSR TVAL with -1 + * + * Writing CSR_TINTCLR_TI to LOONGARCH_CSR_TINTCLR will clear + * timer interrupt, and set CSR TVAL keeps unchanged with -1, + * it avoids spurious timer interrupt + */ + kvm_write_gcsr_timertick(0); + if ((estat & CPU_TIMER) == 0) + kvm_gcsr_write(CSR_TINTCLR_TI, LOONGARCH_CSR_TINTCLR); + return; + } + /* * set remainder tick value if not expired */ now = ktime_get(); expire = vcpu->arch.expire; + delta = 0; if (ktime_before(now, 
expire)) delta = ktime_to_tick(vcpu, ktime_sub(expire, now)); - else { - if (cfg & CSR_TCFG_PERIOD) { - period = cfg & CSR_TCFG_VAL; - delta = ktime_to_tick(vcpu, ktime_sub(now, expire)); - delta = period - (delta % period); - } else - delta = 0; + else if (cfg & CSR_TCFG_PERIOD) { + period = cfg & CSR_TCFG_VAL; + delta = ktime_to_tick(vcpu, ktime_sub(now, expire)); + delta = period - (delta % period); + /* * inject timer here though sw timer should inject timer * interrupt async already @@ -134,20 +159,29 @@ void kvm_restore_timer(struct kvm_vcpu *vcpu) */ static void _kvm_save_timer(struct kvm_vcpu *vcpu) { - unsigned long ticks, delta; + unsigned long ticks, delta, cfg; ktime_t expire; struct loongarch_csrs *csr = vcpu->arch.csr; ticks = kvm_read_sw_gcsr(csr, LOONGARCH_CSR_TVAL); - delta = tick_to_ns(vcpu, ticks); - expire = ktime_add_ns(ktime_get(), delta); - vcpu->arch.expire = expire; - if (ticks) { + cfg = kvm_read_sw_gcsr(csr, LOONGARCH_CSR_TCFG); + + /* + * From LoongArch Reference Manual Volume 1 Chapter 7.6.2 + * If period timer is fired, CSR TVAL will be reloaded from CSR TCFG + * If oneshot timer is fired, CSR TVAL will be -1 + * Here judge one shot timer fired by checking whether TVAL is larger + * than TCFG + */ + if (ticks < cfg) { /* * Update hrtimer to use new timeout * HRTIMER_MODE_PINNED is suggested since vcpu may run in * the same physical cpu in next time */ + delta = tick_to_ns(vcpu, ticks); + expire = ktime_add_ns(ktime_get(), delta); + vcpu->arch.expire = expire; hrtimer_start(&vcpu->arch.swtimer, expire, HRTIMER_MODE_ABS_PINNED); } else if (vcpu->arch.blocking) { /* -- Gitee From 9f05b474aaaccf4a845c3774a1a677609d476aed Mon Sep 17 00:00:00 2001 From: Bibo Mao Date: Mon, 13 Nov 2023 20:16:23 +0800 Subject: [PATCH 3/3] loongarch/kvm: disable hugepage during migration process LoongArch inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I8I4BU ------------------------------------------ When VM is migrating, it will 
record dirty pages and copy new pages to the destination VM. With huge pages, a dirty huge page is 32M, which causes the VM to fail to migrate. Therefore, huge pages are disabled during VM migration.
- * @out_buddy: New PTE for @gpa's buddy (written on success unless - * NULL). * * Perform fast path GPA fault handling, doing all that can be done without * calling into KVM. This handles marking old pages young (for idle page @@ -1026,8 +1037,7 @@ static bool fault_supports_huge_mapping(struct kvm_memory_slot *memslot, * read-only page, in which case KVM must be consulted. */ static int kvm_map_page_fast(struct kvm_vcpu *vcpu, unsigned long gpa, - bool write, - pte_t *out_entry, pte_t *out_buddy) + bool write) { struct kvm *kvm = vcpu->kvm; gfn_t gfn = gpa >> PAGE_SHIFT; @@ -1035,6 +1045,7 @@ static int kvm_map_page_fast(struct kvm_vcpu *vcpu, unsigned long gpa, kvm_pfn_t pfn = 0; /* silence bogus GCC warning */ bool pfn_valid = false; int ret = 0; + struct kvm_memory_slot *slot; spin_lock(&kvm->mmu_lock); @@ -1058,6 +1069,18 @@ static int kvm_map_page_fast(struct kvm_vcpu *vcpu, unsigned long gpa, goto out; } + if (kvm_pte_huge(*ptep)) { + /* + * Do not set write permission when dirty logging is + * enabled for HugePages + */ + slot = gfn_to_memslot(kvm, gfn); + if (slot->flags & KVM_MEM_LOG_DIRTY_PAGES) { + ret = -EFAULT; + goto out; + } + } + /* Track dirtying of writeable pages */ set_pte(ptep, pte_mkdirty(*ptep)); pfn = pte_pfn(*ptep); @@ -1072,11 +1095,6 @@ static int kvm_map_page_fast(struct kvm_vcpu *vcpu, unsigned long gpa, kvm_set_pfn_dirty(pfn); } - if (out_entry) - *out_entry = *ptep; - if (out_buddy) - *out_buddy = *ptep_buddy(ptep); - out: spin_unlock(&kvm->mmu_lock); if (pfn_valid) @@ -1084,14 +1102,35 @@ static int kvm_map_page_fast(struct kvm_vcpu *vcpu, unsigned long gpa, return ret; } +/* + * Split huge page + */ +static pte_t *kvm_split_huge(struct kvm_vcpu *vcpu, pte_t *ptep, gfn_t gfn, + struct vm_area_struct *vma, unsigned long hva) +{ + int i; + pte_t val, *child; + struct kvm_mmu_memory_cache *memcache; + + memcache = &vcpu->arch.mmu_page_cache; + child = kvm_mmu_memory_cache_alloc(memcache); + val = kvm_pte_mksmall(*ptep); + for (i = 0; 
i < PTRS_PER_PTE; i++) { + kvm_set_pte(child + i, val); + pte_val(val) += PAGE_SIZE; + } + + /* The later kvm_flush_tlb_gpa() will flush hugepage tlb */ + pte_val(val) = (unsigned long)child; + kvm_set_pte(ptep, val); + return child + (gfn & (PTRS_PER_PTE - 1)); +} + /** * kvm_map_page() - Map a guest physical page. * @vcpu: VCPU pointer. * @gpa: Guest physical address of fault. * @write: Whether the fault was due to a write. - * @out_entry: New PTE for @gpa (written on success unless NULL). - * @out_buddy: New PTE for @gpa's buddy (written on success unless - * NULL). * * Handle GPA faults by creating a new GPA mapping (or updating an existing * one). @@ -1109,8 +1148,7 @@ static int kvm_map_page_fast(struct kvm_vcpu *vcpu, unsigned long gpa, * as an MMIO access. */ static int kvm_map_page(struct kvm_vcpu *vcpu, unsigned long gpa, - bool write, - pte_t *out_entry, pte_t *out_buddy) + bool write) { struct kvm *kvm = vcpu->kvm; struct kvm_mmu_memory_cache *memcache = &vcpu->arch.mmu_page_cache; @@ -1134,8 +1172,7 @@ static int kvm_map_page(struct kvm_vcpu *vcpu, unsigned long gpa, /* Try the fast path to handle old / clean pages */ srcu_idx = srcu_read_lock(&kvm->srcu); if ((exccode != KVM_EXCCODE_TLBRI) && (exccode != KVM_EXCCODE_TLBXI)) { - err = kvm_map_page_fast(vcpu, gpa, write, out_entry, - out_buddy); + err = kvm_map_page_fast(vcpu, gpa, write); if (!err) goto out; } @@ -1156,8 +1193,9 @@ static int kvm_map_page(struct kvm_vcpu *vcpu, unsigned long gpa, } vma_pagesize = vma_kernel_pagesize(vma); + if ((vma_pagesize == PMD_SIZE) && + !fault_supports_huge_mapping(memslot, hva, write)) { - if (fault_supports_huge_mapping(memslot, hva, vma_pagesize)) { force_pte = true; vma_pagesize = PAGE_SIZE; ++vcpu->stat.huge_dec_exits; @@ -1227,7 +1265,7 @@ static int kvm_map_page(struct kvm_vcpu *vcpu, unsigned long gpa, * aligned and that the block is contained within the memslot. 
*/ ++vcpu->stat.huge_thp_exits; - if (fault_supports_huge_mapping(memslot, hva, PMD_SIZE) && + if (fault_supports_huge_mapping(memslot, hva, write) && transparent_hugepage_adjust(&pfn, &gpa)) { ++vcpu->stat.huge_adjust_exits; vma_pagesize = PMD_SIZE; @@ -1272,13 +1310,12 @@ static int kvm_map_page(struct kvm_vcpu *vcpu, unsigned long gpa, /* Ensure page tables are allocated */ ptep = kvm_pte_for_gpa(kvm, memcache, vma, hva, gpa); + if (ptep && kvm_pte_huge(*ptep) && write) + ptep = kvm_split_huge(vcpu, ptep, gfn, vma, hva); + set_pte(ptep, new_pte); err = 0; - if (out_entry) - *out_entry = new_pte; - if (out_buddy) - *out_buddy = *ptep_buddy(&new_pte); } spin_unlock(&kvm->mmu_lock); @@ -1294,7 +1331,7 @@ int kvm_handle_mm_fault(struct kvm_vcpu *vcpu, unsigned long badv, { int ret; - ret = kvm_map_page(vcpu, badv, write, NULL, NULL); + ret = kvm_map_page(vcpu, badv, write); if (ret) return ret; -- Gitee