diff --git a/Documentation/features/vm/TLB/arch-support.txt b/Documentation/features/vm/TLB/arch-support.txt
index 30f75a79ce014a12b0fd53a18979cfff557f7f2f..1ccf586c7810841593aec987e339b625a82c6745 100644
--- a/Documentation/features/vm/TLB/arch-support.txt
+++ b/Documentation/features/vm/TLB/arch-support.txt
@@ -9,7 +9,7 @@
     |       alpha: | TODO |
     |         arc: | TODO |
     |         arm: | TODO |
-    |       arm64: | TODO |
+    |       arm64: |  ok  |
     |         c6x: |  ..  |
     |        csky: | TODO |
     |       h8300: |  ..  |
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 16620146c49a9f1cedd208e3f74abc99b2ab5673..139d98a7a12d0391e824ff1520cfbae7ee08a293 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -78,6 +78,7 @@ config ARM64
 	select ARCH_SUPPORTS_ATOMIC_RMW
 	select ARCH_SUPPORTS_INT128 if CC_HAS_INT128 && (GCC_VERSION >= 50000 || CC_IS_CLANG)
 	select ARCH_SUPPORTS_NUMA_BALANCING
+	select ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
 	select ARCH_WANT_COMPAT_IPC_PARSE_VERSION if COMPAT
 	select ARCH_WANT_DEFAULT_BPF_JIT
 	select ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT
diff --git a/arch/arm64/include/asm/tlbbatch.h b/arch/arm64/include/asm/tlbbatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..fedb0b87b8db45dbb5228f41c587efbcff9ef004
--- /dev/null
+++ b/arch/arm64/include/asm/tlbbatch.h
@@ -0,0 +1,12 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ARCH_ARM64_TLBBATCH_H
+#define _ARCH_ARM64_TLBBATCH_H
+
+struct arch_tlbflush_unmap_batch {
+	/*
+	 * On arm64, HW can do TLB shootdown, so we don't
+	 * need to record a cpumask for sending IPIs.
+	 */
+};
+
+#endif /* _ARCH_ARM64_TLBBATCH_H */
diff --git a/arch/arm64/include/asm/tlbflush.h b/arch/arm64/include/asm/tlbflush.h
index 36f02892e1df809085551f9a753a32ae91366a5f..4c28c6c4acba9b3a53cb8ddb2faf8cb2be529bd3 100644
--- a/arch/arm64/include/asm/tlbflush.h
+++ b/arch/arm64/include/asm/tlbflush.h
@@ -254,17 +254,23 @@ static inline void flush_tlb_mm(struct mm_struct *mm)
 	dsb(ish);
 }
 
-static inline void flush_tlb_page_nosync(struct vm_area_struct *vma,
-					 unsigned long uaddr)
+static inline void __flush_tlb_page_nosync(struct mm_struct *mm,
+					   unsigned long uaddr)
 {
 	unsigned long addr;
 
 	dsb(ishst);
-	addr = __TLBI_VADDR(uaddr, ASID(vma->vm_mm));
+	addr = __TLBI_VADDR(uaddr, ASID(mm));
 	__tlbi(vale1is, addr);
 	__tlbi_user(vale1is, addr);
 }
 
+static inline void flush_tlb_page_nosync(struct vm_area_struct *vma,
+					 unsigned long uaddr)
+{
+	return __flush_tlb_page_nosync(vma->vm_mm, uaddr);
+}
+
 static inline void flush_tlb_page(struct vm_area_struct *vma,
 				  unsigned long uaddr)
 {
@@ -272,6 +278,38 @@ static inline void flush_tlb_page(struct vm_area_struct *vma,
 	dsb(ish);
 }
 
+static inline bool arch_tlbbatch_should_defer(struct mm_struct *mm)
+{
+#ifdef CONFIG_ARM64_WORKAROUND_REPEAT_TLBI
+	/*
+	 * TLB flush deferral is not required on systems which are affected by
+	 * ARM64_WORKAROUND_REPEAT_TLBI, as the __tlbi()/__tlbi_user() implementation
+	 * will have two consecutive TLBI instructions with a dsb(ish) in between,
+	 * defeating the purpose (i.e. saving the overall 'dsb ish' cost).
+	 */
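+	if (unlikely(cpus_have_const_cap(ARM64_WORKAROUND_REPEAT_TLBI)))
+		return false;
+#endif
+	return true;
+}
+
+static inline void arch_tlbbatch_add_pending(struct arch_tlbflush_unmap_batch *batch,
+					     struct mm_struct *mm,
+					     unsigned long uaddr)
+{
+	__flush_tlb_page_nosync(mm, uaddr);
+}
+
+static inline void arch_flush_tlb_batched_pending(struct mm_struct *mm)
+{
+	dsb(ish);
+}
+
+static inline void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch)
+{
+	dsb(ish);
+}
+
 /*
  * This is meant to avoid soft lock-ups on large TLB flushing ranges and not
  * necessarily a performance improvement.

The arm64 helpers above split a TLB shootdown into two phases: arch_tlbbatch_add_pending() broadcasts a per-page TLBI VALE1IS without waiting for it, and arch_tlbbatch_flush() issues the single dsb(ish) that waits for everything queued so far. A minimal sketch of the resulting contract, as seen from generic code (illustrative only; batched_unmap_example() and its page loop are made up, not part of the patch):

static void batched_unmap_example(struct arch_tlbflush_unmap_batch *batch,
				  struct mm_struct *mm,
				  unsigned long *uaddrs, int nr)
{
	int i;

	for (i = 0; i < nr; i++) {
		/* The caller has already cleared the PTE for uaddrs[i]. */
		/* Broadcast the invalidation; no DSB is issued yet. */
		arch_tlbbatch_add_pending(batch, mm, uaddrs[i]);
	}

	/* A single dsb(ish) waits for every TLBI queued above. */
	arch_tlbbatch_flush(batch);
}

Relative to calling flush_tlb_page() once per page, this trades nr 'dsb ish' barriers for one.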
diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index fa952eadbc2eb72e6b072d26c1c6b1b8dd1412f6..1f1e5add0b974810f0c5e98b537b8c488b832d3a 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -239,6 +239,18 @@ static inline void flush_tlb_page(struct vm_area_struct *vma, unsigned long a)
 	flush_tlb_mm_range(vma->vm_mm, a, a + PAGE_SIZE, PAGE_SHIFT, false);
 }
 
+static inline bool arch_tlbbatch_should_defer(struct mm_struct *mm)
+{
+	bool should_defer = false;
+
+	/* If remote CPUs need to be flushed then defer batch the flush */
+	if (cpumask_any_but(mm_cpumask(mm), get_cpu()) < nr_cpu_ids)
+		should_defer = true;
+	put_cpu();
+
+	return should_defer;
+}
+
 static inline u64 inc_mm_tlb_gen(struct mm_struct *mm)
 {
 	/*
@@ -250,13 +262,19 @@ static inline u64 inc_mm_tlb_gen(struct mm_struct *mm)
 	return atomic64_inc_return(&mm->context.tlb_gen);
 }
 
-static inline void arch_tlbbatch_add_mm(struct arch_tlbflush_unmap_batch *batch,
-					struct mm_struct *mm)
+static inline void arch_tlbbatch_add_pending(struct arch_tlbflush_unmap_batch *batch,
+					     struct mm_struct *mm,
+					     unsigned long uaddr)
 {
 	inc_mm_tlb_gen(mm);
 	cpumask_or(&batch->cpumask, &batch->cpumask, mm_cpumask(mm));
 }
 
+static inline void arch_flush_tlb_batched_pending(struct mm_struct *mm)
+{
+	flush_tlb_mm(mm);
+}
+
 extern void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch);
 
 #endif /* !MODULE */
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index e3eaf458787a8891897247ca84e7507f1d0a564d..d1be389c04680e7469ecad7ce4a68830de092a3f 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -598,7 +598,7 @@
 	 * moving a PROT_NONE or PROT_NUMA mapped page.
 	 */
 	atomic_t tlb_flush_pending;
-#ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
+#if defined(CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH) && !defined(CONFIG_ARM64)
 	/* See flush_tlb_batched_pending() */
 	bool tlb_flush_batched;
 #endif
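@@ -620,6 +620,8 @@
 
 #if defined(CONFIG_X86_64)
 	KABI_USE(1, struct mm_struct_extend *mm_extend)
+#elif defined(CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH) && defined(CONFIG_ARM64)
+	KABI_USE(1, bool tlb_flush_batched)
 #else
 	KABI_RESERVE(1)
 #endif

The #elif above parks the new arm64 tlb_flush_batched flag in an existing kABI reserve slot rather than growing struct mm_struct. A standalone sketch of why the layout is preserved, assuming this tree's KABI_RESERVE()/KABI_USE() follow the common union-with-reserved-slot scheme (the expansions shown in the comments are an assumption, not copied from this tree):

#include <assert.h>
#include <stdbool.h>

/* Roughly what KABI_RESERVE(1) leaves behind. */
struct mm_before {
	unsigned long kabi_reserved1;
};

/* Roughly what KABI_USE(1, bool tlb_flush_batched) expands to. */
struct mm_after {
	union {
		bool tlb_flush_batched;
		unsigned long kabi_reserved1;
	};
};

int main(void)
{
	/* The new member overlays the reserved word, so size and offsets hold. */
	assert(sizeof(struct mm_before) == sizeof(struct mm_after));
	return 0;
}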
diff --git a/include/linux/mm_types_task.h b/include/linux/mm_types_task.h
index c1bc6731125cbbeb99b2bbe0d4757fad6bf5031f..e293d7037bfaea199d1b767e489ac58e9d02b098 100644
--- a/include/linux/mm_types_task.h
+++ b/include/linux/mm_types_task.h
@@ -74,11 +74,11 @@ struct page_frag {
 
 /* Track pages that require TLB flushes */
 struct tlbflush_unmap_batch {
-#ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
+#if defined(CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH) && !defined(CONFIG_ARM64)
 	/*
 	 * The arch code makes the following promise: generic code can modify a
-	 * PTE, then call arch_tlbbatch_add_mm() (which internally provides all
-	 * needed barriers), then call arch_tlbbatch_flush(), and the entries
+	 * PTE, then call arch_tlbbatch_add_pending() (which internally provides
+	 * all needed barriers), then call arch_tlbbatch_flush(), and the entries
 	 * will be flushed on all CPUs by the time that arch_tlbbatch_flush()
 	 * returns.
 	 */
@@ -96,4 +96,27 @@ struct tlbflush_unmap_batch {
 #endif
 };
 
+#if defined(CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH) && defined(CONFIG_ARM64)
+struct tlbflush_unmap_batch_arm64 {
+	/*
+	 * The arch code makes the following promise: generic code can modify a
+	 * PTE, then call arch_tlbbatch_add_pending() (which internally provides
+	 * all needed barriers), then call arch_tlbbatch_flush(), and the entries
+	 * will be flushed on all CPUs by the time that arch_tlbbatch_flush()
+	 * returns.
+	 */
+	struct arch_tlbflush_unmap_batch arch;
+
+	/* True if a flush is needed. */
+	bool flush_required;
+
+	/*
+	 * If true then the PTE was dirty when unmapped. The entry must be
+	 * flushed before IO is initiated or a stale TLB entry potentially
+	 * allows an update without redirtying the page.
+	 */
+	bool writable;
+};
+#endif
+
 #endif /* _LINUX_MM_TYPES_TASK_H */
diff --git a/include/linux/sched.h b/include/linux/sched.h
index fdd3324cc858b1b5b4ce302bddd6ba3e1dc0607d..5e413d309e7723973831b582da02f8ca173a2ce4 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -685,6 +685,9 @@ struct task_struct_resvd {
 #ifdef CONFIG_MMU
 	struct timer_list oom_reaper_timer;
 #endif
+#if defined(CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH) && defined(CONFIG_ARM64)
+	struct tlbflush_unmap_batch_arm64 tlb_ubc;
+#endif
 };
 
 struct task_struct {
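On arm64 the per-task batch state lives in task_struct_resvd (reached via current->_resvd) instead of task_struct proper, again to keep the kABI layout of task_struct stable; the DEFINE_TLB_UBC() macro added to mm/rmap.c below hides that difference from the generic code. A sketch of how a caller stays arch-neutral (illustrative only; example_flush() is made up and mirrors try_to_unmap_flush()):

/*
 * The same body works on arm64 (current->_resvd->tlb_ubc, of type
 * struct tlbflush_unmap_batch_arm64) and on x86 (current->tlb_ubc,
 * of type struct tlbflush_unmap_batch).
 */
static void example_flush(void)
{
	DEFINE_TLB_UBC(tlb_ubc);	/* picks the right type and location */

	if (!tlb_ubc->flush_required)
		return;

	arch_tlbbatch_flush(&tlb_ubc->arch);
	tlb_ubc->flush_required = false;
	tlb_ubc->writable = false;
}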
diff --git a/mm/rmap.c b/mm/rmap.c
index 3e12d26d8c55e6e0ec0c8a784b4e3758a161af7a..b9990209747146986727bcaf8569d0c53b8fa07c 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -596,6 +596,14 @@ void page_unlock_anon_vma_read(struct anon_vma *anon_vma)
 }
 
 #ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
+
+#ifdef CONFIG_ARM64
+#define DEFINE_TLB_UBC(name)	struct tlbflush_unmap_batch_arm64 *name = \
+	&current->_resvd->tlb_ubc
+#else
+#define DEFINE_TLB_UBC(name)	struct tlbflush_unmap_batch *name = &current->tlb_ubc
+#endif
+
 /*
  * Flush TLB entries for recently unmapped pages from remote CPUs. It is
  * important if a PTE was dirty when it was unmapped that it's flushed
@@ -604,7 +612,7 @@ void page_unlock_anon_vma_read(struct anon_vma *anon_vma)
  */
 void try_to_unmap_flush(void)
 {
-	struct tlbflush_unmap_batch *tlb_ubc = &current->tlb_ubc;
+	DEFINE_TLB_UBC(tlb_ubc);
 
 	if (!tlb_ubc->flush_required)
 		return;
@@ -617,17 +625,18 @@ void try_to_unmap_flush(void)
 /* Flush iff there are potentially writable TLB entries that can race with IO */
 void try_to_unmap_flush_dirty(void)
 {
-	struct tlbflush_unmap_batch *tlb_ubc = &current->tlb_ubc;
+	DEFINE_TLB_UBC(tlb_ubc);
 
 	if (tlb_ubc->writable)
 		try_to_unmap_flush();
 }
 
-static void set_tlb_ubc_flush_pending(struct mm_struct *mm, bool writable)
+static void set_tlb_ubc_flush_pending(struct mm_struct *mm, bool writable,
+				      unsigned long uaddr)
 {
-	struct tlbflush_unmap_batch *tlb_ubc = &current->tlb_ubc;
+	DEFINE_TLB_UBC(tlb_ubc);
 
-	arch_tlbbatch_add_mm(&tlb_ubc->arch, mm);
+	arch_tlbbatch_add_pending(&tlb_ubc->arch, mm, uaddr);
 	tlb_ubc->flush_required = true;
 
 	/*
@@ -652,17 +661,10 @@ static void set_tlb_ubc_flush_pending(struct mm_struct *mm, bool writable)
  */
 static bool should_defer_flush(struct mm_struct *mm, enum ttu_flags flags)
 {
-	bool should_defer = false;
-
 	if (!(flags & TTU_BATCH_FLUSH))
 		return false;
 
-	/* If remote CPUs need to be flushed then defer batch the flush */
-	if (cpumask_any_but(mm_cpumask(mm), get_cpu()) < nr_cpu_ids)
-		should_defer = true;
-	put_cpu();
-
-	return should_defer;
+	return arch_tlbbatch_should_defer(mm);
 }
 
 /*
@@ -683,7 +685,7 @@ static bool should_defer_flush(struct mm_struct *mm, enum ttu_flags flags)
 void flush_tlb_batched_pending(struct mm_struct *mm)
 {
 	if (data_race(mm->tlb_flush_batched)) {
-		flush_tlb_mm(mm);
+		arch_flush_tlb_batched_pending(mm);
 
 		/*
 		 * Do not allow the compiler to re-order the clearing of
@@ -694,7 +696,8 @@ void flush_tlb_batched_pending(struct mm_struct *mm)
 	}
 }
 #else
-static void set_tlb_ubc_flush_pending(struct mm_struct *mm, bool writable)
+static void set_tlb_ubc_flush_pending(struct mm_struct *mm, bool writable,
+				      unsigned long uaddr)
 {
 }
 
@@ -1573,7 +1576,7 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
 			 */
 			pteval = ptep_get_and_clear(mm, address, pvmw.pte);
 
-			set_tlb_ubc_flush_pending(mm, pte_dirty(pteval));
+			set_tlb_ubc_flush_pending(mm, pte_dirty(pteval), address);
 		} else {
 			pteval = ptep_clear_flush(vma, address, pvmw.pte);
 		}
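Taken together, the mm/rmap.c changes thread the unmapped page's address down to the architecture: try_to_unmap_one() clears the PTE and calls set_tlb_ubc_flush_pending(mm, pte_dirty(pteval), address), which queues the invalidation via arch_tlbbatch_add_pending(); later the batch is paid for by a single try_to_unmap_flush(). A condensed sketch of that flow (illustrative only; reclaim_one_page_example() is made up, and locking and error handling are omitted):

static void reclaim_one_page_example(struct mm_struct *mm,
				     struct vm_area_struct *vma,
				     unsigned long address, pte_t *ptep)
{
	pte_t pteval;

	if (should_defer_flush(mm, TTU_BATCH_FLUSH)) {
		/* Clear the PTE, then queue the deferred invalidation. */
		pteval = ptep_get_and_clear(mm, address, ptep);
		set_tlb_ubc_flush_pending(mm, pte_dirty(pteval), address);
	} else {
		/* Fallback: flush synchronously, page by page. */
		pteval = ptep_clear_flush(vma, address, ptep);
	}

	/* ... once the whole batch is assembled, the reclaim path calls: */
	try_to_unmap_flush();
}

Recording pte_dirty(pteval) in the batch's writable flag is what later lets try_to_unmap_flush_dirty() force the flush before I/O is started on a dirty page.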