From 19c9610e5a02a23e40cf94b3f8038f2d6d1763ac Mon Sep 17 00:00:00 2001
From: Anshuman Khandual
Date: Mon, 21 Aug 2023 17:55:33 +0800
Subject: [PATCH 1/4] mm/tlbbatch: introduce arch_tlbbatch_should_defer()

maillist inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I7U78A
CVE: NA

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/next/linux-next.git/commit/?id=76b1e6fa09c2317547ed423a0870bc28299d6f55

-------------------------------------------

Patch series "arm64: support batched/deferred tlb shootdown during page
reclamation/migration", v11.

Though ARM64 has hardware to do tlb shootdown, the hardware broadcasting
is not free.  A simple micro benchmark shows that even on a Snapdragon 888
with only 8 cores, the overhead of ptep_clear_flush is large even when
paging out one page mapped by only one process:

    5.36%  a.out  [kernel.kallsyms]  [k] ptep_clear_flush

When pages are mapped by multiple processes or the HW has more CPUs, the
cost becomes even higher due to the bad scalability of tlb shootdown.
The same benchmark results in 16.99% CPU consumption on an ARM64 server
with around 100 cores, according to the test on patch 4/4.

This patchset leverages the existing BATCHED_UNMAP_TLB_FLUSH by
1. only sending tlbi instructions in the first stage - arch_tlbbatch_add_mm()
2. waiting for the completion of the tlbi by dsb while doing the tlbbatch
   sync in arch_tlbbatch_flush()

Testing on Snapdragon shows the overhead of ptep_clear_flush is removed by
the patchset.  The micro benchmark becomes 5% faster even for one page
mapped by a single process on Snapdragon 888.

Since BATCHED_UNMAP_TLB_FLUSH is implemented only on x86, the patchset
first does some renaming/extension of the current implementation
(patches 1-3), then adds the support on arm64 (patch 4).

This patch (of 4):

The entire scheme of deferred TLB flush in the reclaim path rests on the
fact that the cost to refill TLB entries is less than flushing out
individual entries by sending IPIs to remote CPUs.  But an architecture
can have different ways to evaluate that.  Hence, apart from checking
TTU_BATCH_FLUSH in the TTU flags, the rest of the decision should be
architecture specific.
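For orientation, this is what the two stages end up looking like on arm64
once the full series is applied (excerpted from patch 4/4; the comments
below are editorial, and on x86 stage one instead records which CPUs will
need an IPI):

	/* stage 1: issue a per-page tlbi, but do not wait for it */
	static inline void arch_tlbbatch_add_pending(struct arch_tlbflush_unmap_batch *batch,
						     struct mm_struct *mm,
						     unsigned long uaddr)
	{
		__flush_tlb_page_nosync(mm, uaddr);
	}

	/* stage 2: one barrier waits for every queued invalidation to complete */
	static inline void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch)
	{
		dsb(ish);
	}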
[yangyicong@hisilicon.com: rebase and fix incorrect return value type]
Link: https://lkml.kernel.org/r/20230717131004.12662-1-yangyicong@huawei.com
Link: https://lkml.kernel.org/r/20230717131004.12662-2-yangyicong@huawei.com
Signed-off-by: Anshuman Khandual
[https://lore.kernel.org/linuxppc-dev/20171101101735.2318-2-khandual@linux.vnet.ibm.com/]
Signed-off-by: Yicong Yang
Reviewed-by: Kefeng Wang
Reviewed-by: Anshuman Khandual
Reviewed-by: Barry Song
Reviewed-by: Xin Hao
Tested-by: Punit Agrawal
Reviewed-by: Catalin Marinas
Cc: Arnd Bergmann
Cc: Darren Hart
Cc: Jonathan Cameron
Cc: Jonathan Corbet
Cc: lipeifeng
Cc: Mark Rutland
Cc: Peter Zijlstra
Cc: Ryan Roberts
Cc: Steven Miao
Cc: Will Deacon
Cc: Zeng Tao
Cc: Barry Song
Cc: Mel Gorman
Cc: Nadav Amit
Signed-off-by: Andrew Morton
Signed-off-by: Jinjiang Tu
---
 arch/x86/include/asm/tlbflush.h | 12 ++++++++++++
 mm/rmap.c                       |  9 +--------
 2 files changed, 13 insertions(+), 8 deletions(-)

diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index fa952eadbc2e..c1d8df34c6c6 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -239,6 +239,18 @@ static inline void flush_tlb_page(struct vm_area_struct *vma, unsigned long a)
 	flush_tlb_mm_range(vma->vm_mm, a, a + PAGE_SIZE, PAGE_SHIFT, false);
 }
 
+static inline bool arch_tlbbatch_should_defer(struct mm_struct *mm)
+{
+	bool should_defer = false;
+
+	/* If remote CPUs need to be flushed then defer batch the flush */
+	if (cpumask_any_but(mm_cpumask(mm), get_cpu()) < nr_cpu_ids)
+		should_defer = true;
+	put_cpu();
+
+	return should_defer;
+}
+
 static inline u64 inc_mm_tlb_gen(struct mm_struct *mm)
 {
 	/*
diff --git a/mm/rmap.c b/mm/rmap.c
index 3e12d26d8c55..d4803e04b00d 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -652,17 +652,10 @@ static void set_tlb_ubc_flush_pending(struct mm_struct *mm, bool writable)
  */
 static bool should_defer_flush(struct mm_struct *mm, enum ttu_flags flags)
 {
-	bool should_defer = false;
-
 	if (!(flags & TTU_BATCH_FLUSH))
 		return false;
 
-	/* If remote CPUs need to be flushed then defer batch the flush */
-	if (cpumask_any_but(mm_cpumask(mm), get_cpu()) < nr_cpu_ids)
-		should_defer = true;
-	put_cpu();
-
-	return should_defer;
+	return arch_tlbbatch_should_defer(mm);
 }
 
 /*
-- 
Gitee

From e03b7ff7222a7923b4bd17c1f83a673580b3685a Mon Sep 17 00:00:00 2001
From: Barry Song
Date: Mon, 21 Aug 2023 17:55:34 +0800
Subject: [PATCH 2/4] mm/tlbbatch: rename and extend some functions

maillist inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I7U78A
CVE: NA

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/next/linux-next.git/commit/?id=653f88d06395a9b4b63e5835ca3ed74df802e786

-------------------------------------------

This patch does some preparation work to extend the batched TLB flush to
arm64, including:

- Extend set_tlb_ubc_flush_pending() and arch_tlbbatch_add_mm() to accept
  an additional address argument; architectures like arm64 may need it
  for tlbi.

- Rename arch_tlbbatch_add_mm() to arch_tlbbatch_add_pending() to match
  what it now does: we don't need to handle the mm on architectures like
  arm64, so add_mm is no longer an accurate name.  add_pending makes
  sense for both, since on x86 we're pending the TLB flush operations
  while on arm64 we're pending the synchronization operations.

This intends no functional changes on x86.
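The net effect on the only existing (x86) implementation is just the
rename plus the extra address argument, sketched here as prototypes (the
body is unchanged, see the diff below):

	/* before */
	static inline void arch_tlbbatch_add_mm(struct arch_tlbflush_unmap_batch *batch,
						struct mm_struct *mm);

	/* after: uaddr is unused on x86 but lets arm64 issue a per-address tlbi */
	static inline void arch_tlbbatch_add_pending(struct arch_tlbflush_unmap_batch *batch,
						     struct mm_struct *mm,
						     unsigned long uaddr);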
Link: https://lkml.kernel.org/r/20230717131004.12662-3-yangyicong@huawei.com
Tested-by: Yicong Yang
Tested-by: Xin Hao
Tested-by: Punit Agrawal
Signed-off-by: Barry Song
Signed-off-by: Yicong Yang
Reviewed-by: Kefeng Wang
Reviewed-by: Xin Hao
Reviewed-by: Anshuman Khandual
Reviewed-by: Catalin Marinas
Cc: Jonathan Corbet
Cc: Nadav Amit
Cc: Mel Gorman
Cc: Anshuman Khandual
Cc: Arnd Bergmann
Cc: Barry Song
Cc: Darren Hart
Cc: Jonathan Cameron
Cc: lipeifeng
Cc: Mark Rutland
Cc: Peter Zijlstra
Cc: Ryan Roberts
Cc: Steven Miao
Cc: Will Deacon
Cc: Zeng Tao
Signed-off-by: Andrew Morton
Conflicts:
	mm/rmap.c
Signed-off-by: Jinjiang Tu
---
 arch/x86/include/asm/tlbflush.h |  5 +++--
 include/linux/mm_types_task.h   |  4 ++--
 mm/rmap.c                       | 10 ++++++----
 3 files changed, 11 insertions(+), 8 deletions(-)

diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index c1d8df34c6c6..1404f43a5986 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -262,8 +262,9 @@ static inline u64 inc_mm_tlb_gen(struct mm_struct *mm)
 	return atomic64_inc_return(&mm->context.tlb_gen);
 }
 
-static inline void arch_tlbbatch_add_mm(struct arch_tlbflush_unmap_batch *batch,
-					struct mm_struct *mm)
+static inline void arch_tlbbatch_add_pending(struct arch_tlbflush_unmap_batch *batch,
+					     struct mm_struct *mm,
+					     unsigned long uaddr)
 {
 	inc_mm_tlb_gen(mm);
 	cpumask_or(&batch->cpumask, &batch->cpumask, mm_cpumask(mm));
diff --git a/include/linux/mm_types_task.h b/include/linux/mm_types_task.h
index c1bc6731125c..78bbded3b13f 100644
--- a/include/linux/mm_types_task.h
+++ b/include/linux/mm_types_task.h
@@ -77,8 +77,8 @@ struct tlbflush_unmap_batch {
 #ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
 	/*
 	 * The arch code makes the following promise: generic code can modify a
-	 * PTE, then call arch_tlbbatch_add_mm() (which internally provides all
-	 * needed barriers), then call arch_tlbbatch_flush(), and the entries
+	 * PTE, then call arch_tlbbatch_add_pending() (which internally provides
+	 * all needed barriers), then call arch_tlbbatch_flush(), and the entries
 	 * will be flushed on all CPUs by the time that arch_tlbbatch_flush()
 	 * returns.
 	 */
diff --git a/mm/rmap.c b/mm/rmap.c
index d4803e04b00d..38eafa951fe0 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -623,11 +623,12 @@ void try_to_unmap_flush_dirty(void)
 		try_to_unmap_flush();
 }
 
-static void set_tlb_ubc_flush_pending(struct mm_struct *mm, bool writable)
+static void set_tlb_ubc_flush_pending(struct mm_struct *mm, bool writable,
+				       unsigned long uaddr)
 {
 	struct tlbflush_unmap_batch *tlb_ubc = &current->tlb_ubc;
 
-	arch_tlbbatch_add_mm(&tlb_ubc->arch, mm);
+	arch_tlbbatch_add_pending(&tlb_ubc->arch, mm, uaddr);
 	tlb_ubc->flush_required = true;
 
 	/*
@@ -687,7 +688,8 @@ void flush_tlb_batched_pending(struct mm_struct *mm)
 	}
 }
 #else
-static void set_tlb_ubc_flush_pending(struct mm_struct *mm, bool writable)
+static void set_tlb_ubc_flush_pending(struct mm_struct *mm, bool writable,
+				       unsigned long uaddr)
 {
 }
 
@@ -1566,7 +1568,7 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
 				 */
 				pteval = ptep_get_and_clear(mm, address, pvmw.pte);
 
-				set_tlb_ubc_flush_pending(mm, pte_dirty(pteval));
+				set_tlb_ubc_flush_pending(mm, pte_dirty(pteval), address);
 			} else {
 				pteval = ptep_clear_flush(vma, address, pvmw.pte);
 			}
-- 
Gitee

From 98a397d8182f3b779ee146840c4bbdd9d56201ae Mon Sep 17 00:00:00 2001
From: Yicong Yang
Date: Mon, 21 Aug 2023 17:55:35 +0800
Subject: [PATCH 3/4] mm/tlbbatch: introduce arch_flush_tlb_batched_pending()

maillist inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I7U78A
CVE: NA

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/next/linux-next.git/commit/?id=4fd62a4892a3f576f424eb86e4eda510038934a9

-------------------------------------------

Currently we flush the mm in flush_tlb_batched_pending() to avoid a race
between reclaim, which unmaps pages via the batched TLB flush, and
mprotect/munmap/etc.  Other architectures like arm64 may only need a
synchronization barrier (dsb) here rather than a full mm flush.  So add
arch_flush_tlb_batched_pending() to allow an arch-specific implementation.
This intends no functional changes on x86, which still does a full mm
flush.
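Both implementations end up tiny; for comparison, the x86 fallback added
by this patch next to the arm64 version added in patch 4/4 (comments below
are editorial):

	/* x86: a pending batched flush still needs a full mm flush */
	static inline void arch_flush_tlb_batched_pending(struct mm_struct *mm)
	{
		flush_tlb_mm(mm);
	}

	/* arm64: the already-issued tlbi instructions only need a barrier to complete */
	static inline void arch_flush_tlb_batched_pending(struct mm_struct *mm)
	{
		dsb(ish);
	}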
Link: https://lkml.kernel.org/r/20230717131004.12662-4-yangyicong@huawei.com
Signed-off-by: Yicong Yang
Reviewed-by: Catalin Marinas
Cc: Anshuman Khandual
Cc: Anshuman Khandual
Cc: Arnd Bergmann
Cc: Barry Song
Cc: Barry Song
Cc: Darren Hart
Cc: Jonathan Cameron
Cc: Jonathan Corbet
Cc: Kefeng Wang
Cc: lipeifeng
Cc: Mark Rutland
Cc: Mel Gorman
Cc: Nadav Amit
Cc: Peter Zijlstra
Cc: Punit Agrawal
Cc: Ryan Roberts
Cc: Steven Miao
Cc: Will Deacon
Cc: Xin Hao
Cc: Zeng Tao
Signed-off-by: Andrew Morton
Conflicts:
	mm/rmap.c
	arch/x86/include/asm/tlbflush.h
Signed-off-by: Jinjiang Tu
---
 arch/x86/include/asm/tlbflush.h | 5 +++++
 mm/rmap.c                       | 2 +-
 2 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index 1404f43a5986..1f1e5add0b97 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -270,6 +270,11 @@ static inline void arch_tlbbatch_add_pending(struct arch_tlbflush_unmap_batch *b
 	cpumask_or(&batch->cpumask, &batch->cpumask, mm_cpumask(mm));
 }
 
+static inline void arch_flush_tlb_batched_pending(struct mm_struct *mm)
+{
+	flush_tlb_mm(mm);
+}
+
 extern void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch);
 
 #endif /* !MODULE */
diff --git a/mm/rmap.c b/mm/rmap.c
index 38eafa951fe0..816db3edc116 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -677,7 +677,7 @@ static bool should_defer_flush(struct mm_struct *mm, enum ttu_flags flags)
 void flush_tlb_batched_pending(struct mm_struct *mm)
 {
 	if (data_race(mm->tlb_flush_batched)) {
-		flush_tlb_mm(mm);
+		arch_flush_tlb_batched_pending(mm);
 
 		/*
 		 * Do not allow the compiler to re-order the clearing of
-- 
Gitee

From 2e5df9c01966719a91652834032d72a09442d7c1 Mon Sep 17 00:00:00 2001
From: Barry Song
Date: Mon, 21 Aug 2023 17:55:36 +0800
Subject: [PATCH 4/4] arm64: support batched/deferred tlb shootdown during
 page reclamation/migration

maillist inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I7U78A
CVE: NA

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/next/linux-next.git/commit/?id=f443875e380307071e832bc79fe90ae0382c1aa2

-------------------------------------------

On x86, batched and deferred tlb shootdown has led to a 90% performance
increase in tlb shootdown.  On arm64, HW can do tlb shootdown without
software IPIs, but a synchronous tlbi is still quite expensive.

Even running a very simple program which requires swapout can prove this
is true:

#include <sys/mman.h>
#include <string.h>
#include <stdlib.h>
#include <unistd.h>

int main()
{
#define SIZE (1 * 1024 * 1024)
	volatile unsigned char *p = mmap(NULL, SIZE, PROT_READ | PROT_WRITE,
					 MAP_SHARED | MAP_ANONYMOUS, -1, 0);

	memset(p, 0x88, SIZE);

	for (int k = 0; k < 10000; k++) {
		/* swap in */
		for (int i = 0; i < SIZE; i += 4096) {
			(void)p[i];
		}

		/* swap out */
		madvise(p, SIZE, MADV_PAGEOUT);
	}
}

Perf result on snapdragon 888 with 8 cores, using zRAM as the swap block
device:

~ # perf record taskset -c 4 ./a.out
[ perf record: Woken up 10 times to write data ]
[ perf record: Captured and wrote 2.297 MB perf.data (60084 samples) ]
~ # perf report
# To display the perf.data header info, please use --header/--header-only options.
# To display the perf.data header info, please use --header/--header-only options.
#
#
# Total Lost Samples: 0
#
# Samples: 60K of event 'cycles'
# Event count (approx.): 35706225414
#
# Overhead  Command  Shared Object      Symbol
# ........  .......  .................  ......
#
    21.07%  a.out  [kernel.kallsyms]  [k] _raw_spin_unlock_irq
     8.23%  a.out  [kernel.kallsyms]  [k] _raw_spin_unlock_irqrestore
     6.67%  a.out  [kernel.kallsyms]  [k] filemap_map_pages
     6.16%  a.out  [kernel.kallsyms]  [k] __zram_bvec_write
     5.36%  a.out  [kernel.kallsyms]  [k] ptep_clear_flush
     3.71%  a.out  [kernel.kallsyms]  [k] _raw_spin_lock
     3.49%  a.out  [kernel.kallsyms]  [k] memset64
     1.63%  a.out  [kernel.kallsyms]  [k] clear_page
     1.42%  a.out  [kernel.kallsyms]  [k] _raw_spin_unlock
     1.26%  a.out  [kernel.kallsyms]  [k] mod_zone_state.llvm.8525150236079521930
     1.23%  a.out  [kernel.kallsyms]  [k] xas_load
     1.15%  a.out  [kernel.kallsyms]  [k] zram_slot_lock

ptep_clear_flush() takes 5.36% of the CPU time in this micro-benchmark,
which swaps a page mapped by only one process in and out.  If the page is
mapped by multiple processes, typically more than 100 on a phone, the
overhead would be much higher, as we have to run the tlb flush 100 times
for one single page.  Plus, the tlb flush overhead increases with the
number of CPU cores due to the bad scalability of tlb shootdown in HW, so
ARM64 servers should expect much higher overhead.

Further perf annotate shows that 95% of the cpu time of ptep_clear_flush
is actually spent in the final dsb() waiting for the completion of the
tlb flush.  This gives us a very good chance to leverage the existing
batched tlb infrastructure in the kernel.  The minimal modification is to
only send an async tlbi in the first stage and to issue the dsb in the
second stage, where we have to sync.

With the simple micro benchmark above, the elapsed time of the program
decreases by around 5%.

Typical elapsed time w/o patch:
~ # time taskset -c 4 ./a.out
0.21user 14.34system 0:14.69elapsed

w/ patch:
~ # time taskset -c 4 ./a.out
0.22user 13.45system 0:13.80elapsed

Also tested with the benchmark in the commit on a Kunpeng920 arm64 server
and observed an improvement of around 12.5% with the command
`time ./swap_bench`.

        w/o             w/
real    0m13.460s       0m11.771s
user    0m0.248s        0m0.279s
sys     0m12.039s       0m11.458s

Originally we noticed a 16.99% overhead of ptep_clear_flush(), which has
been eliminated by this patch:

[root@localhost yang]# perf record -- ./swap_bench && perf report
[...]
16.99%  swap_bench  [kernel.kallsyms]  [k] ptep_clear_flush

It has been tested on 4, 8 and 128 CPU platforms and shows to be
beneficial on large systems, but may not bring an improvement on small
systems such as a 4 CPU platform.

This patch also improves the performance of page migration.  Using
pmbench and migrating the pages of pmbench between node 0 and node 1 for
100 times over 1G of memory, this patch decreases the time used by around
20% (from 18.338318910 sec to 13.981866350 sec) by saving the time spent
in ptep_clear_flush().

Link: https://lkml.kernel.org/r/20230717131004.12662-5-yangyicong@huawei.com
Tested-by: Yicong Yang
Tested-by: Xin Hao
Tested-by: Punit Agrawal
Signed-off-by: Barry Song
Signed-off-by: Yicong Yang
Reviewed-by: Kefeng Wang
Reviewed-by: Xin Hao
Reviewed-by: Anshuman Khandual
Reviewed-by: Catalin Marinas
Cc: Anshuman Khandual
Cc: Jonathan Corbet
Cc: Nadav Amit
Cc: Mel Gorman
Cc: Anshuman Khandual
Cc: Arnd Bergmann
Cc: Barry Song
Cc: Darren Hart
Cc: Jonathan Cameron
Cc: lipeifeng
Cc: Mark Rutland
Cc: Peter Zijlstra
Cc: Ryan Roberts
Cc: Steven Miao
Cc: Will Deacon
Cc: Zeng Tao
Signed-off-by: Andrew Morton
Conflicts:
	Documentation/features/vm/TLB/arch-support.txt
	arch/arm64/Kconfig

Fix KABI breakage for mm_struct->tlb_flush_batched and task_struct->tlb_ubc.
Signed-off-by: Jinjiang Tu
---
 .../features/vm/TLB/arch-support.txt |  2 +-
 arch/arm64/Kconfig                   |  1 +
 arch/arm64/include/asm/tlbbatch.h    | 12 +++++
 arch/arm64/include/asm/tlbflush.h    | 44 +++++++++++++++++--
 include/linux/mm_types.h             |  4 +-
 include/linux/mm_types_task.h        | 25 ++++++++++-
 include/linux/sched.h                |  3 ++
 mm/rmap.c                            | 14 ++++--
 8 files changed, 96 insertions(+), 9 deletions(-)
 create mode 100644 arch/arm64/include/asm/tlbbatch.h

diff --git a/Documentation/features/vm/TLB/arch-support.txt b/Documentation/features/vm/TLB/arch-support.txt
index 30f75a79ce01..1ccf586c7810 100644
--- a/Documentation/features/vm/TLB/arch-support.txt
+++ b/Documentation/features/vm/TLB/arch-support.txt
@@ -9,7 +9,7 @@
     |       alpha: | TODO |
     |         arc: | TODO |
     |         arm: | TODO |
-    |       arm64: | TODO |
+    |       arm64: |  ok  |
     |         c6x: |  ..  |
     |        csky: | TODO |
     |       h8300: |  ..  |
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 16620146c49a..139d98a7a12d 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -78,6 +78,7 @@ config ARM64
 	select ARCH_SUPPORTS_ATOMIC_RMW
 	select ARCH_SUPPORTS_INT128 if CC_HAS_INT128 && (GCC_VERSION >= 50000 || CC_IS_CLANG)
 	select ARCH_SUPPORTS_NUMA_BALANCING
+	select ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
 	select ARCH_WANT_COMPAT_IPC_PARSE_VERSION if COMPAT
 	select ARCH_WANT_DEFAULT_BPF_JIT
 	select ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT
diff --git a/arch/arm64/include/asm/tlbbatch.h b/arch/arm64/include/asm/tlbbatch.h
new file mode 100644
index 000000000000..fedb0b87b8db
--- /dev/null
+++ b/arch/arm64/include/asm/tlbbatch.h
@@ -0,0 +1,12 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ARCH_ARM64_TLBBATCH_H
+#define _ARCH_ARM64_TLBBATCH_H
+
+struct arch_tlbflush_unmap_batch {
+	/*
+	 * For arm64, HW can do tlb shootdown, so we don't
+	 * need to record cpumask for sending IPI
+	 */
+};
+
+#endif /* _ARCH_ARM64_TLBBATCH_H */
diff --git a/arch/arm64/include/asm/tlbflush.h b/arch/arm64/include/asm/tlbflush.h
index 36f02892e1df..4c28c6c4acba 100644
--- a/arch/arm64/include/asm/tlbflush.h
+++ b/arch/arm64/include/asm/tlbflush.h
@@ -254,17 +254,23 @@ static inline void flush_tlb_mm(struct mm_struct *mm)
 	dsb(ish);
 }
 
-static inline void flush_tlb_page_nosync(struct vm_area_struct *vma,
-					 unsigned long uaddr)
+static inline void __flush_tlb_page_nosync(struct mm_struct *mm,
+					   unsigned long uaddr)
 {
 	unsigned long addr;
 
 	dsb(ishst);
-	addr = __TLBI_VADDR(uaddr, ASID(vma->vm_mm));
+	addr = __TLBI_VADDR(uaddr, ASID(mm));
 	__tlbi(vale1is, addr);
 	__tlbi_user(vale1is, addr);
 }
 
+static inline void flush_tlb_page_nosync(struct vm_area_struct *vma,
+					 unsigned long uaddr)
+{
+	return __flush_tlb_page_nosync(vma->vm_mm, uaddr);
+}
+
 static inline void flush_tlb_page(struct vm_area_struct *vma,
 				  unsigned long uaddr)
 {
@@ -272,6 +278,38 @@ static inline void flush_tlb_page(struct vm_area_struct *vma,
 	dsb(ish);
 }
 
+static inline bool arch_tlbbatch_should_defer(struct mm_struct *mm)
+{
+#ifdef CONFIG_ARM64_WORKAROUND_REPEAT_TLBI
+	/*
+	 * TLB flush deferral is not required on systems which are affected by
+	 * ARM64_WORKAROUND_REPEAT_TLBI, as __tlbi()/__tlbi_user() implementation
+	 * will have two consecutive TLBI instructions with a dsb(ish) in between
+	 * defeating the purpose (i.e save overall 'dsb ish' cost).
+	 */
+	if (unlikely(cpus_have_const_cap(ARM64_WORKAROUND_REPEAT_TLBI)))
+		return false;
+#endif
+	return true;
+}
+
+static inline void arch_tlbbatch_add_pending(struct arch_tlbflush_unmap_batch *batch,
+					     struct mm_struct *mm,
+					     unsigned long uaddr)
+{
+	__flush_tlb_page_nosync(mm, uaddr);
+}
+
+static inline void arch_flush_tlb_batched_pending(struct mm_struct *mm)
+{
+	dsb(ish);
+}
+
+static inline void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch)
+{
+	dsb(ish);
+}
+
 /*
  * This is meant to avoid soft lock-ups on large TLB flushing ranges and not
  * necessarily a performance improvement.
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index e3eaf458787a..d1be389c0468 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -598,7 +598,7 @@ struct mm_struct {
 		 * moving a PROT_NONE or PROT_NUMA mapped page.
 		 */
 		atomic_t tlb_flush_pending;
-#ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
+#if defined(CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH) && !defined(CONFIG_ARM64)
 		/* See flush_tlb_batched_pending() */
 		bool tlb_flush_batched;
 #endif
@@ -620,6 +620,8 @@ struct mm_struct {
 
 #if defined(CONFIG_X86_64)
 		KABI_USE(1, struct mm_struct_extend *mm_extend)
+#elif defined(CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH) && defined(CONFIG_ARM64)
+		KABI_USE(1, bool tlb_flush_batched)
 #else
 		KABI_RESERVE(1)
 #endif
diff --git a/include/linux/mm_types_task.h b/include/linux/mm_types_task.h
index 78bbded3b13f..e293d7037bfa 100644
--- a/include/linux/mm_types_task.h
+++ b/include/linux/mm_types_task.h
@@ -74,7 +74,7 @@ struct page_frag {
 
 /* Track pages that require TLB flushes */
 struct tlbflush_unmap_batch {
-#ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
+#if defined(CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH) && !defined(CONFIG_ARM64)
 	/*
 	 * The arch code makes the following promise: generic code can modify a
 	 * PTE, then call arch_tlbbatch_add_pending() (which internally provides
@@ -96,4 +96,27 @@ struct tlbflush_unmap_batch {
 #endif
 };
 
+#if defined(CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH) && defined(CONFIG_ARM64)
+struct tlbflush_unmap_batch_arm64 {
+	/*
+	 * The arch code makes the following promise: generic code can modify a
+	 * PTE, then call arch_tlbbatch_add_pending() (which internally provides
+	 * all needed barriers), then call arch_tlbbatch_flush(), and the entries
+	 * will be flushed on all CPUs by the time that arch_tlbbatch_flush()
+	 * returns.
+	 */
+	struct arch_tlbflush_unmap_batch arch;
+
+	/* True if a flush is needed. */
+	bool flush_required;
+
+	/*
+	 * If true then the PTE was dirty when unmapped. The entry must be
+	 * flushed before IO is initiated or a stale TLB entry potentially
+	 * allows an update without redirtying the page.
+	 */
+	bool writable;
+};
+#endif
+
 #endif /* _LINUX_MM_TYPES_TASK_H */
diff --git a/include/linux/sched.h b/include/linux/sched.h
index fdd3324cc858..5e413d309e77 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -685,6 +685,9 @@ struct task_struct_resvd {
 #ifdef CONFIG_MMU
 	struct timer_list oom_reaper_timer;
 #endif
+#if defined(CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH) && defined(CONFIG_ARM64)
+	struct tlbflush_unmap_batch_arm64 tlb_ubc;
+#endif
 };
 
 struct task_struct {
diff --git a/mm/rmap.c b/mm/rmap.c
index 816db3edc116..150803a7ffb5 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -596,6 +596,14 @@ void page_unlock_anon_vma_read(struct anon_vma *anon_vma)
 }
 
 #ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
+
+#ifdef CONFIG_ARM64
+#define DEFINE_TLB_UBC(name)	struct tlbflush_unmap_batch_arm64 *name = \
+				&current->_resvd->tlb_ubc
+#else
+#define DEFINE_TLB_UBC(name)	struct tlbflush_unmap_batch *name = &current->tlb_ubc
+#endif
+
 /*
  * Flush TLB entries for recently unmapped pages from remote CPUs. It is
  * important if a PTE was dirty when it was unmapped that it's flushed
@@ -604,7 +612,7 @@ void page_unlock_anon_vma_read(struct anon_vma *anon_vma)
  */
 void try_to_unmap_flush(void)
 {
-	struct tlbflush_unmap_batch *tlb_ubc = &current->tlb_ubc;
+	DEFINE_TLB_UBC(tlb_ubc);
 
 	if (!tlb_ubc->flush_required)
 		return;
@@ -617,7 +625,7 @@ void try_to_unmap_flush(void)
 /* Flush iff there are potentially writable TLB entries that can race with IO */
 void try_to_unmap_flush_dirty(void)
 {
-	struct tlbflush_unmap_batch *tlb_ubc = &current->tlb_ubc;
+	DEFINE_TLB_UBC(tlb_ubc);
 
 	if (tlb_ubc->writable)
 		try_to_unmap_flush();
@@ -626,7 +634,7 @@ void try_to_unmap_flush_dirty(void)
 static void set_tlb_ubc_flush_pending(struct mm_struct *mm, bool writable,
 				      unsigned long uaddr)
 {
-	struct tlbflush_unmap_batch *tlb_ubc = &current->tlb_ubc;
+	DEFINE_TLB_UBC(tlb_ubc);
 
 	arch_tlbbatch_add_pending(&tlb_ubc->arch, mm, uaddr);
 	tlb_ubc->flush_required = true;
-- 
Gitee