From cdb4b4ae3c7fa3dcc2fb1c0c55968e681251bc25 Mon Sep 17 00:00:00 2001 From: Zheng Zengkai Date: Wed, 18 Dec 2024 17:40:25 +0800 Subject: [PATCH] performance test for kabi preserve 202412192113 Signed-off-by: Zheng Zengkai --- ...ude-msi-modify-kabi-size-of-msi_desc.patch | 45 + ...ss-of-superblock-s-initialized-flags.patch | 40 + ...e-CONFIG_CMA-by-default-in-openeuler.patch | 61 + ...ect-CONFIG_CMA-if-CONFIG_HYGON_CSV-y.patch | 35 + ...r-free-of-nreq-in-reqsk_timer_handle.patch | 60 + ...erve-padding-for-uapi-struct-bpf_lin.patch | 63 + ...tra-KABI-entry-for-struct-iopf_group.patch | 38 + ...e-kabi-KABI-reservation-for-seq_file.patch | 45 + ...tatx-kabi-KABI-reservation-for-kstat.patch | 38 + ...-fine-grained-control-of-folio-sizes.patch | 200 + ...cgroup-fix-uaf-when-proc_cpuset_show.patch | 68 + ...ations-on-the-cgroup-root_list-RCU-s.patch | 145 + ..._head-up-near-the-top-of-cgroup_root.patch | 84 + ...uset-Prevent-UAF-in-proc_cpuset_show.patch | 110 + 0021-cgroup-add-more-reserve-kabi.patch | 90 + 0022-14223.patch | 80 + 0023-14224.patch | 85 + 0024-14225.patch | 154 + 0026-14227.patch | 3464 +++++++++++++++++ kernel.spec | 46 +- 20 files changed, 4949 insertions(+), 2 deletions(-) create mode 100644 0005-include-msi-modify-kabi-size-of-msi_desc.patch create mode 100644 0007-nfs-fix-the-loss-of-superblock-s-initialized-flags.patch create mode 100644 0008-x86-config-Enable-CONFIG_CMA-by-default-in-openeuler.patch create mode 100644 0009-x86-Kconfig-Select-CONFIG_CMA-if-CONFIG_HYGON_CSV-y.patch create mode 100644 0010-tcp-Fix-use-after-free-of-nreq-in-reqsk_timer_handle.patch create mode 100644 0012-bpf-Add-kabi-reserve-padding-for-uapi-struct-bpf_lin.patch create mode 100644 0013-iommu-Reserve-extra-KABI-entry-for-struct-iopf_group.patch create mode 100644 0014-seq_file-kabi-KABI-reservation-for-seq_file.patch create mode 100644 0015-statx-kabi-KABI-reservation-for-kstat.patch create mode 100644 0016-fs-Allow-fine-grained-control-of-folio-sizes.patch create mode 100644 0017-Revert-cgroup-fix-uaf-when-proc_cpuset_show.patch create mode 100644 0018-cgroup-Make-operations-on-the-cgroup-root_list-RCU-s.patch create mode 100644 0019-cgroup-Move-rcu_head-up-near-the-top-of-cgroup_root.patch create mode 100644 0020-cgroup-cpuset-Prevent-UAF-in-proc_cpuset_show.patch create mode 100644 0021-cgroup-add-more-reserve-kabi.patch create mode 100644 0022-14223.patch create mode 100644 0023-14224.patch create mode 100644 0024-14225.patch create mode 100644 0026-14227.patch diff --git a/0005-include-msi-modify-kabi-size-of-msi_desc.patch b/0005-include-msi-modify-kabi-size-of-msi_desc.patch new file mode 100644 index 0000000..79c77ab --- /dev/null +++ b/0005-include-msi-modify-kabi-size-of-msi_desc.patch @@ -0,0 +1,45 @@ +From 723d41836db7669ab658d3e07c62fcbe17d7d7f4 Mon Sep 17 00:00:00 2001 +From: zhengjunlong +Date: Fri, 11 Oct 2024 17:08:35 +0800 +Subject: [PATCH 01/17] include/msi: modify kabi size of msi_desc + +hulk inclusion +category: feature +bugzilla: https://gitee.com/openeuler/kernel/issues/IAW8JF + +---------------------------------------------------- + +Change the size of the pre-embedded memory for msi_desc to 40 bytes. 
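+
+To make the arithmetic concrete: assuming each KABI_RESERVE() slot is one
+8-byte word on 64-bit builds, keeping five slots inside the union gives
+5 * 8 = 40 bytes. A minimal sketch of the resulting layout (the padding
+member name here is only illustrative, the real code keeps using the KABI
+macros):
+
+        union {
+                struct pci_msi_desc pci;
+                struct msi_desc_data data;
+                u64 kabi_padding[5];    /* illustrative name: 40 bytes kept for KABI */
+        };
+
+The KABI_EXTEND_WITH_SIZE(KABI_RESERVE(1), 5) form in the hunk below
+expresses the same five-word reservation through the KABI helper macros.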
+ +Signed-off-by: Zheng Junlong +--- + include/linux/msi.h | 11 ++++------- + 1 file changed, 4 insertions(+), 7 deletions(-) + +diff --git a/include/linux/msi.h b/include/linux/msi.h +index 7354ffb14856..5fd8a6caae98 100644 +--- a/include/linux/msi.h ++++ b/include/linux/msi.h +@@ -205,15 +205,12 @@ struct msi_desc { + union { + struct pci_msi_desc pci; + struct msi_desc_data data; +- KABI_RESERVE(1) +- KABI_RESERVE(2) +- KABI_RESERVE(3) +- KABI_RESERVE(4) ++ KABI_EXTEND_WITH_SIZE(KABI_RESERVE(1), 5) + }; ++ KABI_RESERVE(2) ++ KABI_RESERVE(3) ++ KABI_RESERVE(4) + KABI_RESERVE(5) +- KABI_RESERVE(6) +- KABI_RESERVE(7) +- KABI_RESERVE(8) + }; + + /* +-- +2.25.1 + diff --git a/0007-nfs-fix-the-loss-of-superblock-s-initialized-flags.patch b/0007-nfs-fix-the-loss-of-superblock-s-initialized-flags.patch new file mode 100644 index 0000000..1d3c32f --- /dev/null +++ b/0007-nfs-fix-the-loss-of-superblock-s-initialized-flags.patch @@ -0,0 +1,40 @@ +From e68e6e3cf90ec8fb7893057c768d55e83855aaa0 Mon Sep 17 00:00:00 2001 +From: Li Lingfeng +Date: Mon, 16 Dec 2024 20:15:25 +0800 +Subject: [PATCH 03/17] nfs: fix the loss of superblock's initialized flags + +hulk inclusion +category: bugfix +bugzilla: https://gitee.com/openeuler/kernel/issues/IB42W1 + +-------------------------------- + +Commit 573573887e0b ("nfs: pass flags to second superblock") directly +assigns fc->sb_flags to dentry->d_sb->s_flags, which will cause the loss +of the initialized flags in dentry->d_sb->s_flags. + +Fix it by just passing SB_RDONLY from fc->sb_flags to +dentry->d_sb->s_flags. + +Fixes: 573573887e0b ("nfs: pass flags to second superblock") +Signed-off-by: Li Lingfeng +--- + fs/nfs/nfs4super.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/fs/nfs/nfs4super.c b/fs/nfs/nfs4super.c +index bb13894ad152..e87f878178f3 100644 +--- a/fs/nfs/nfs4super.c ++++ b/fs/nfs/nfs4super.c +@@ -209,7 +209,7 @@ static int do_nfs4_mount(struct nfs_server *server, + if (IS_ERR(dentry)) + return PTR_ERR(dentry); + +- dentry->d_sb->s_flags = fc->sb_flags; ++ dentry->d_sb->s_flags |= (fc->sb_flags & SB_RDONLY); + fc->root = dentry; + return 0; + } +-- +2.25.1 + diff --git a/0008-x86-config-Enable-CONFIG_CMA-by-default-in-openeuler.patch b/0008-x86-config-Enable-CONFIG_CMA-by-default-in-openeuler.patch new file mode 100644 index 0000000..f9c3ab2 --- /dev/null +++ b/0008-x86-config-Enable-CONFIG_CMA-by-default-in-openeuler.patch @@ -0,0 +1,61 @@ +From 844a44e5a21be8062fd0c120a75e9ecf97427ae8 Mon Sep 17 00:00:00 2001 +From: hanliyang +Date: Mon, 16 Dec 2024 20:44:36 +0800 +Subject: [PATCH 04/17] x86/config: Enable CONFIG_CMA by default in + openeuler_defconfig + +hygon inclusion +category: feature +bugzilla: https://gitee.com/openeuler/kernel/issues/IBBNJI +CVE: NA + +--------------------------- + +Enable CONFIG_CMA will change kabi. + +Enable CONFIG_CMA will also enable CONFIG_DMA_CMA. 
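+
+For context, CONFIG_DMA_CMA=y lets ordinary coherent DMA allocations fall
+back to the contiguous area, so drivers need no CMA-specific API. A
+minimal sketch, assuming an already probed device "dev" (note that with
+CONFIG_CMA_SIZE_MBYTES=0 below, a default area only exists if one is set
+up, e.g. via the cma= kernel command line):
+
+        #include <linux/dma-mapping.h>
+        #include <linux/sizes.h>
+
+        dma_addr_t dma_handle;
+        /* a large buffer like this can be served from the CMA area even
+         * when the buddy allocator has no free high-order pages left
+         */
+        void *buf = dma_alloc_coherent(dev, SZ_4M, &dma_handle, GFP_KERNEL);
+        if (!buf)
+                return -ENOMEM;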
+ +Signed-off-by: hanliyang +--- + arch/x86/configs/openeuler_defconfig | 18 +++++++++++++++++- + 1 file changed, 17 insertions(+), 1 deletion(-) + +diff --git a/arch/x86/configs/openeuler_defconfig b/arch/x86/configs/openeuler_defconfig +index 8e8542796a13..adfaef0cb10c 100644 +--- a/arch/x86/configs/openeuler_defconfig ++++ b/arch/x86/configs/openeuler_defconfig +@@ -1158,7 +1158,11 @@ CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK=y + CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK=y + CONFIG_USE_PERCPU_NUMA_NODE_ID=y + CONFIG_HAVE_SETUP_PER_CPU_AREA=y +-# CONFIG_CMA is not set ++CONFIG_CMA=y ++# CONFIG_CMA_DEBUG is not set ++# CONFIG_CMA_DEBUGFS is not set ++# CONFIG_CMA_SYSFS is not set ++CONFIG_CMA_AREAS=19 + CONFIG_MEM_SOFT_DIRTY=y + CONFIG_GENERIC_EARLY_IOREMAP=y + CONFIG_DEFERRED_STRUCT_PAGE_INIT=y +@@ -9018,6 +9022,18 @@ CONFIG_ARCH_HAS_FORCE_DMA_UNENCRYPTED=y + CONFIG_SWIOTLB=y + # CONFIG_SWIOTLB_DYNAMIC is not set + CONFIG_DMA_COHERENT_POOL=y ++CONFIG_DMA_CMA=y ++# CONFIG_DMA_NUMA_CMA is not set ++ ++# ++# Default contiguous memory area size: ++# ++CONFIG_CMA_SIZE_MBYTES=0 ++CONFIG_CMA_SIZE_SEL_MBYTES=y ++# CONFIG_CMA_SIZE_SEL_PERCENTAGE is not set ++# CONFIG_CMA_SIZE_SEL_MIN is not set ++# CONFIG_CMA_SIZE_SEL_MAX is not set ++CONFIG_CMA_ALIGNMENT=8 + # CONFIG_DMA_API_DEBUG is not set + # CONFIG_DMA_MAP_BENCHMARK is not set + CONFIG_SGL_ALLOC=y +-- +2.25.1 + diff --git a/0009-x86-Kconfig-Select-CONFIG_CMA-if-CONFIG_HYGON_CSV-y.patch b/0009-x86-Kconfig-Select-CONFIG_CMA-if-CONFIG_HYGON_CSV-y.patch new file mode 100644 index 0000000..79f223e --- /dev/null +++ b/0009-x86-Kconfig-Select-CONFIG_CMA-if-CONFIG_HYGON_CSV-y.patch @@ -0,0 +1,35 @@ +From f0e6b8ca2a5b0bc1347906ff6b80422c4c9878b2 Mon Sep 17 00:00:00 2001 +From: hanliyang +Date: Mon, 16 Dec 2024 20:52:08 +0800 +Subject: [PATCH 05/17] x86/Kconfig: Select CONFIG_CMA if CONFIG_HYGON_CSV=y + +hygon inclusion +category: feature +bugzilla: https://gitee.com/openeuler/kernel/issues/IBBNJI +CVE: NA + +--------------------------- + +The Hygon CSV3 use CMA to manage CSV3 guest's private memory. If the +CONFIG_HYGON_CSV is enabled, then enable CONFIG_CMA automatically. + +Signed-off-by: hanliyang +--- + arch/x86/Kconfig | 1 + + 1 file changed, 1 insertion(+) + +diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig +index fcd0c3b2065d..a6bbe6029121 100644 +--- a/arch/x86/Kconfig ++++ b/arch/x86/Kconfig +@@ -2075,6 +2075,7 @@ config HYGON_CSV + bool "Hygon secure virtualization CSV support" + default y + depends on CPU_SUP_HYGON && AMD_MEM_ENCRYPT ++ select CONFIG_CMA + help + Hygon CSV integrates secure processor, memory encryption and + memory isolation to provide the ability to protect guest's private +-- +2.25.1 + diff --git a/0010-tcp-Fix-use-after-free-of-nreq-in-reqsk_timer_handle.patch b/0010-tcp-Fix-use-after-free-of-nreq-in-reqsk_timer_handle.patch new file mode 100644 index 0000000..a07a0a5 --- /dev/null +++ b/0010-tcp-Fix-use-after-free-of-nreq-in-reqsk_timer_handle.patch @@ -0,0 +1,60 @@ +From 44c5a161852ac117a94ed7748784aecaab552b47 Mon Sep 17 00:00:00 2001 +From: Kuniyuki Iwashima +Date: Tue, 17 Dec 2024 16:33:23 +0800 +Subject: [PATCH 06/17] tcp: Fix use-after-free of nreq in + reqsk_timer_handler(). 
+ +stable inclusion +from stable-v6.6.64 +commit 65ed89cad1f57034c256b016e89e8c0a4ec7c65b +category: bugfix +bugzilla: https://gitee.com/openeuler/kernel/issues/IBA6RL +CVE: NA + +Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=65ed89cad1f57034c256b016e89e8c0a4ec7c65b + +------------------------------------------------- + +[ Upstream commit c31e72d021db2714df03df6c42855a1db592716c ] + +The cited commit replaced inet_csk_reqsk_queue_drop_and_put() with +__inet_csk_reqsk_queue_drop() and reqsk_put() in reqsk_timer_handler(). + +Then, oreq should be passed to reqsk_put() instead of req; otherwise +use-after-free of nreq could happen when reqsk is migrated but the +retry attempt failed (e.g. due to timeout). + +Let's pass oreq to reqsk_put(). + +Fixes: e8c526f2bdf1 ("tcp/dccp: Don't use timer_pending() in reqsk_queue_unlink().") +Reported-by: Liu Jian +Closes: https://lore.kernel.org/netdev/1284490f-9525-42ee-b7b8-ccadf6606f6d@huawei.com/ +Signed-off-by: Kuniyuki Iwashima +Reviewed-by: Vadim Fedorenko +Reviewed-by: Liu Jian +Reviewed-by: Eric Dumazet +Reviewed-by: Martin KaFai Lau +Link: https://patch.msgid.link/20241123174236.62438-1-kuniyu@amazon.com +Signed-off-by: Paolo Abeni +Signed-off-by: Sasha Levin +Signed-off-by: Liu Jian +--- + net/ipv4/inet_connection_sock.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c +index ca8cc0988b61..bd032ac2376e 100644 +--- a/net/ipv4/inet_connection_sock.c ++++ b/net/ipv4/inet_connection_sock.c +@@ -1124,7 +1124,7 @@ static void reqsk_timer_handler(struct timer_list *t) + + drop: + __inet_csk_reqsk_queue_drop(sk_listener, oreq, true); +- reqsk_put(req); ++ reqsk_put(oreq); + } + + static bool reqsk_queue_hash_req(struct request_sock *req, +-- +2.25.1 + diff --git a/0012-bpf-Add-kabi-reserve-padding-for-uapi-struct-bpf_lin.patch b/0012-bpf-Add-kabi-reserve-padding-for-uapi-struct-bpf_lin.patch new file mode 100644 index 0000000..9a95845 --- /dev/null +++ b/0012-bpf-Add-kabi-reserve-padding-for-uapi-struct-bpf_lin.patch @@ -0,0 +1,63 @@ +From c189729809e4c7a6298126a76db608da2b571240 Mon Sep 17 00:00:00 2001 +From: Pu Lehui +Date: Wed, 18 Dec 2024 06:24:00 +0000 +Subject: [PATCH 08/17] bpf: Add kabi reserve padding for uapi struct + bpf_link_info + +hulk inclusion +category: feature +bugzilla: https://gitee.com/openeuler/kernel/issues/IBC248 + +-------------------------------- + +Add kabi reserve padding for uapi struct bpf_link_info + +Signed-off-by: Pu Lehui +--- + include/uapi/linux/bpf.h | 9 +++++++++ + tools/include/uapi/linux/bpf.h | 9 +++++++++ + 2 files changed, 18 insertions(+) + +diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h +index 482647774bf5..a660cb68c853 100644 +--- a/include/uapi/linux/bpf.h ++++ b/include/uapi/linux/bpf.h +@@ -6573,6 +6573,15 @@ struct bpf_link_info { + __u64 config; + __u32 type; + } event; /* BPF_PERF_EVENT_EVENT */ ++ struct { ++ __u64:64; ++ __u32:32; ++ __u32:32; ++ __u64:64; ++ __u64:64; ++ __u64:64; ++ __u64:64; ++ } kabi_reserve; + }; + } perf_event; + struct { +diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h +index c112c6f7c766..9b302242be6c 100644 +--- a/tools/include/uapi/linux/bpf.h ++++ b/tools/include/uapi/linux/bpf.h +@@ -6576,6 +6576,15 @@ struct bpf_link_info { + __u64 config; + __u32 type; + } event; /* BPF_PERF_EVENT_EVENT */ ++ struct { ++ __u64:64; ++ __u32:32; ++ __u32:32; ++ __u64:64; ++ __u64:64; ++ __u64:64; ++ __u64:64; ++ } 
kabi_reserve; + }; + } perf_event; + struct { +-- +2.25.1 + diff --git a/0013-iommu-Reserve-extra-KABI-entry-for-struct-iopf_group.patch b/0013-iommu-Reserve-extra-KABI-entry-for-struct-iopf_group.patch new file mode 100644 index 0000000..43e830c --- /dev/null +++ b/0013-iommu-Reserve-extra-KABI-entry-for-struct-iopf_group.patch @@ -0,0 +1,38 @@ +From bbfb8fd7b1297acf7769a814f3fbf919afd391dc Mon Sep 17 00:00:00 2001 +From: Zhang Zekun +Date: Wed, 18 Dec 2024 14:43:35 +0800 +Subject: [PATCH 09/17] iommu: Reserve extra KABI entry for struct iopf_group + +hulk inclusion +category: feature +bugzilla: https://gitee.com/openeuler/kernel/issues/IBBRHP + +--------------------------------------------------------------- + +The list_head entry in iopf_group has been moved to iopf_group_extend +for KABI compatibility and the lack of KABI reserve entry. Reserve extra +kabi entry for future usage. + +Signed-off-by: Zhang Zekun +--- + include/linux/iommu.h | 4 ++++ + 1 file changed, 4 insertions(+) + +diff --git a/include/linux/iommu.h b/include/linux/iommu.h +index bb463cb96a44..83ec4bf9809e 100644 +--- a/include/linux/iommu.h ++++ b/include/linux/iommu.h +@@ -155,6 +155,10 @@ struct iopf_group { + KABI_USE(2, u32 cookie) + KABI_RESERVE(3) + KABI_RESERVE(4) ++ KABI_RESERVE(5) ++ KABI_RESERVE(6) ++ KABI_RESERVE(7) ++ KABI_RESERVE(8) + }; + + struct iopf_group_extend { +-- +2.25.1 + diff --git a/0014-seq_file-kabi-KABI-reservation-for-seq_file.patch b/0014-seq_file-kabi-KABI-reservation-for-seq_file.patch new file mode 100644 index 0000000..371e3af --- /dev/null +++ b/0014-seq_file-kabi-KABI-reservation-for-seq_file.patch @@ -0,0 +1,45 @@ +From 1cb26ea1471efb775f2aa141863e82efead07d61 Mon Sep 17 00:00:00 2001 +From: Baokun Li +Date: Wed, 18 Dec 2024 15:21:56 +0800 +Subject: [PATCH 10/17] seq_file: kabi: KABI reservation for seq_file + +hulk inclusion +category: feature +bugzilla: https://gitee.com/openeuler/kernel/issues/IBC34X + +---------------------------------------------------------------------- + + structure size reserves reserved + seq_file 120 1 128 + seq_operations 32 1 40 + +Signed-off-by: Baokun Li +--- + include/linux/seq_file.h | 4 ++++ + 1 file changed, 4 insertions(+) + +diff --git a/include/linux/seq_file.h b/include/linux/seq_file.h +index 234bcdb1fba4..cf4a2258df85 100644 +--- a/include/linux/seq_file.h ++++ b/include/linux/seq_file.h +@@ -27,6 +27,8 @@ struct seq_file { + int poll_event; + const struct file *file; + void *private; ++ ++ KABI_RESERVE(1) + }; + + struct seq_operations { +@@ -34,6 +36,8 @@ struct seq_operations { + void (*stop) (struct seq_file *m, void *v); + void * (*next) (struct seq_file *m, void *v, loff_t *pos); + int (*show) (struct seq_file *m, void *v); ++ ++ KABI_RESERVE(1) + }; + + #define SEQ_SKIP 1 +-- +2.25.1 + diff --git a/0015-statx-kabi-KABI-reservation-for-kstat.patch b/0015-statx-kabi-KABI-reservation-for-kstat.patch new file mode 100644 index 0000000..12b7151 --- /dev/null +++ b/0015-statx-kabi-KABI-reservation-for-kstat.patch @@ -0,0 +1,38 @@ +From ed5b59b6c40d2563994c1f7b5a1321affb490d45 Mon Sep 17 00:00:00 2001 +From: Baokun Li +Date: Wed, 18 Dec 2024 15:23:01 +0800 +Subject: [PATCH 11/17] statx: kabi: KABI reservation for kstat + +hulk inclusion +category: feature +bugzilla: https://gitee.com/openeuler/kernel/issues/IBC24E + +---------------------------------------------------------------------- + + structure size reserves reserved mainline + kstat 160 4 192 184 + +Signed-off-by: Baokun Li +--- + include/linux/stat.h | 5 +++++ + 1 file changed, 5 
insertions(+) + +diff --git a/include/linux/stat.h b/include/linux/stat.h +index 52150570d37a..d342e89b7aaa 100644 +--- a/include/linux/stat.h ++++ b/include/linux/stat.h +@@ -53,6 +53,11 @@ struct kstat { + u32 dio_mem_align; + u32 dio_offset_align; + u64 change_cookie; ++ ++ KABI_RESERVE(1) ++ KABI_RESERVE(2) ++ KABI_RESERVE(3) ++ KABI_RESERVE(4) + }; + + /* These definitions are internal to the kernel for now. Mainly used by nfsd. */ +-- +2.25.1 + diff --git a/0016-fs-Allow-fine-grained-control-of-folio-sizes.patch b/0016-fs-Allow-fine-grained-control-of-folio-sizes.patch new file mode 100644 index 0000000..ca2556d --- /dev/null +++ b/0016-fs-Allow-fine-grained-control-of-folio-sizes.patch @@ -0,0 +1,200 @@ +From 30f7b1506ec798949e6ce99c023780b0306845c9 Mon Sep 17 00:00:00 2001 +From: "Matthew Wilcox (Oracle)" +Date: Wed, 18 Dec 2024 15:31:44 +0800 +Subject: [PATCH 12/17] fs: Allow fine-grained control of folio sizes + +mainline inclusion +from mainline-v6.10-rc2 +commit 84429b675bcfd2a518ae167ee4661cdf7539aa7d +category: feature +bugzilla: https://gitee.com/openeuler/kernel/issues/IBC20Q + +Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=84429b675bcfd2a518ae167ee4661cdf7539aa7d + +-------------------------------- + +We need filesystems to be able to communicate acceptable folio sizes +to the pagecache for a variety of uses (e.g. large block sizes). +Support a range of folio sizes between order-0 and order-31. + +Signed-off-by: Matthew Wilcox (Oracle) +Co-developed-by: Pankaj Raghav +Signed-off-by: Pankaj Raghav +Link: https://lore.kernel.org/r/20240822135018.1931258-2-kernel@pankajraghav.com +Tested-by: David Howells +Reviewed-by: Hannes Reinecke +Reviewed-by: Darrick J. Wong +Reviewed-by: Daniel Gomez +Signed-off-by: Christian Brauner +Conflicts: + include/linux/pagemap.h + mm/filemap.c +[Conflicts due to not merged 83ee0e20fd9f ("filemap: support disable large +folios on active inode")] +Signed-off-by: Long Li +--- + include/linux/pagemap.h | 90 +++++++++++++++++++++++++++++++++++------ + mm/readahead.c | 4 +- + 2 files changed, 79 insertions(+), 15 deletions(-) + +diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h +index 429627abfef4..e44e377661f2 100644 +--- a/include/linux/pagemap.h ++++ b/include/linux/pagemap.h +@@ -203,12 +203,21 @@ enum mapping_flags { + AS_EXITING = 4, /* final truncate in progress */ + /* writeback related tags are not used */ + AS_NO_WRITEBACK_TAGS = 5, +- AS_LARGE_FOLIO_SUPPORT = 6, +- AS_RELEASE_ALWAYS, /* Call ->release_folio(), even if no private data */ +- AS_STABLE_WRITES, /* must wait for writeback before modifying ++ AS_RELEASE_ALWAYS = 6, /* Call ->release_folio(), even if no private data */ ++ AS_STABLE_WRITES = 7, /* must wait for writeback before modifying + folio contents */ ++ AS_INACCESSIBLE = 8, /* Do not attempt direct R/W access to the mapping */ ++ /* Bits 16-25 are used for FOLIO_ORDER */ ++ AS_FOLIO_ORDER_BITS = 5, ++ AS_FOLIO_ORDER_MIN = 16, ++ AS_FOLIO_ORDER_MAX = AS_FOLIO_ORDER_MIN + AS_FOLIO_ORDER_BITS, + }; + ++#define AS_FOLIO_ORDER_BITS_MASK ((1u << AS_FOLIO_ORDER_BITS) - 1) ++#define AS_FOLIO_ORDER_MIN_MASK (AS_FOLIO_ORDER_BITS_MASK << AS_FOLIO_ORDER_MIN) ++#define AS_FOLIO_ORDER_MAX_MASK (AS_FOLIO_ORDER_BITS_MASK << AS_FOLIO_ORDER_MAX) ++#define AS_FOLIO_ORDER_MASK (AS_FOLIO_ORDER_MIN_MASK | AS_FOLIO_ORDER_MAX_MASK) ++ + /** + * mapping_set_error - record a writeback error in the address_space + * @mapping: the mapping in which an error should be set +@@ -348,9 +357,51 @@ 
static inline void mapping_set_gfp_mask(struct address_space *m, gfp_t mask) + #define MAX_XAS_ORDER (XA_CHUNK_SHIFT * 2 - 1) + #define MAX_PAGECACHE_ORDER min(MAX_XAS_ORDER, PREFERRED_MAX_PAGECACHE_ORDER) + ++/* ++ * mapping_set_folio_order_range() - Set the orders supported by a file. ++ * @mapping: The address space of the file. ++ * @min: Minimum folio order (between 0-MAX_PAGECACHE_ORDER inclusive). ++ * @max: Maximum folio order (between @min-MAX_PAGECACHE_ORDER inclusive). ++ * ++ * The filesystem should call this function in its inode constructor to ++ * indicate which base size (min) and maximum size (max) of folio the VFS ++ * can use to cache the contents of the file. This should only be used ++ * if the filesystem needs special handling of folio sizes (ie there is ++ * something the core cannot know). ++ * Do not tune it based on, eg, i_size. ++ * ++ * Context: This should not be called while the inode is active as it ++ * is non-atomic. ++ */ ++static inline void mapping_set_folio_order_range(struct address_space *mapping, ++ unsigned int min, ++ unsigned int max) ++{ ++ if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) ++ return; ++ ++ if (min > MAX_PAGECACHE_ORDER) ++ min = MAX_PAGECACHE_ORDER; ++ ++ if (max > MAX_PAGECACHE_ORDER) ++ max = MAX_PAGECACHE_ORDER; ++ ++ if (max < min) ++ max = min; ++ ++ mapping->flags = (mapping->flags & ~AS_FOLIO_ORDER_MASK) | ++ (min << AS_FOLIO_ORDER_MIN) | (max << AS_FOLIO_ORDER_MAX); ++} ++ ++static inline void mapping_set_folio_min_order(struct address_space *mapping, ++ unsigned int min) ++{ ++ mapping_set_folio_order_range(mapping, min, MAX_PAGECACHE_ORDER); ++} ++ + /** + * mapping_set_large_folios() - Indicate the file supports large folios. +- * @mapping: The file. ++ * @mapping: The address space of the file. 
+ * + * The filesystem should call this function in its inode constructor to + * indicate that the VFS can use large folios to cache the contents of +@@ -361,7 +412,23 @@ static inline void mapping_set_gfp_mask(struct address_space *m, gfp_t mask) + */ + static inline void mapping_set_large_folios(struct address_space *mapping) + { +- __set_bit(AS_LARGE_FOLIO_SUPPORT, &mapping->flags); ++ mapping_set_folio_order_range(mapping, 0, MAX_PAGECACHE_ORDER); ++} ++ ++static inline unsigned int ++mapping_max_folio_order(const struct address_space *mapping) ++{ ++ if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) ++ return 0; ++ return (mapping->flags & AS_FOLIO_ORDER_MAX_MASK) >> AS_FOLIO_ORDER_MAX; ++} ++ ++static inline unsigned int ++mapping_min_folio_order(const struct address_space *mapping) ++{ ++ if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) ++ return 0; ++ return (mapping->flags & AS_FOLIO_ORDER_MIN_MASK) >> AS_FOLIO_ORDER_MIN; + } + + /** +@@ -375,7 +442,7 @@ static inline void mapping_set_large_folios(struct address_space *mapping) + static inline void mapping_clear_large_folios(struct address_space *mapping) + { + WARN_ON_ONCE(!rwsem_is_locked(&mapping->invalidate_lock)); +- __clear_bit(AS_LARGE_FOLIO_SUPPORT, &mapping->flags); ++ mapping_set_folio_order_range(mapping, 0, 0); + } + + /* +@@ -384,20 +451,17 @@ static inline void mapping_clear_large_folios(struct address_space *mapping) + */ + static inline bool mapping_large_folio_support(struct address_space *mapping) + { +- /* AS_LARGE_FOLIO_SUPPORT is only reasonable for pagecache folios */ ++ /* AS_FOLIO_ORDER is only reasonable for pagecache folios */ + VM_WARN_ONCE((unsigned long)mapping & PAGE_MAPPING_ANON, + "Anonymous mapping always supports large folio"); + +- return IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && +- test_bit(AS_LARGE_FOLIO_SUPPORT, &mapping->flags); ++ return mapping_max_folio_order(mapping) > 0; + } + + /* Return the maximum folio size for this pagecache mapping, in bytes. 
*/ +-static inline size_t mapping_max_folio_size(struct address_space *mapping) ++static inline size_t mapping_max_folio_size(const struct address_space *mapping) + { +- if (mapping_large_folio_support(mapping)) +- return PAGE_SIZE << MAX_PAGECACHE_ORDER; +- return PAGE_SIZE; ++ return PAGE_SIZE << mapping_max_folio_order(mapping); + } + + static inline int filemap_nr_thps(struct address_space *mapping) +diff --git a/mm/readahead.c b/mm/readahead.c +index 438f142a3e74..c13c130efcca 100644 +--- a/mm/readahead.c ++++ b/mm/readahead.c +@@ -513,10 +513,10 @@ void page_cache_ra_order(struct readahead_control *ractl, + + limit = min(limit, index + ra->size - 1); + +- if (new_order < MAX_PAGECACHE_ORDER) ++ if (new_order < mapping_max_folio_order(mapping)) + new_order += 2; + +- new_order = min_t(unsigned int, MAX_PAGECACHE_ORDER, new_order); ++ new_order = min(mapping_max_folio_order(mapping), new_order); + new_order = min_t(unsigned int, new_order, ilog2(ra->size)); + + /* See comment in page_cache_ra_unbounded() */ +-- +2.25.1 + diff --git a/0017-Revert-cgroup-fix-uaf-when-proc_cpuset_show.patch b/0017-Revert-cgroup-fix-uaf-when-proc_cpuset_show.patch new file mode 100644 index 0000000..ebe3ba0 --- /dev/null +++ b/0017-Revert-cgroup-fix-uaf-when-proc_cpuset_show.patch @@ -0,0 +1,68 @@ +From 8c8766f9500b9ffdb907d23269aa888d0632e68c Mon Sep 17 00:00:00 2001 +From: Chen Ridong +Date: Wed, 18 Dec 2024 08:10:59 +0000 +Subject: [PATCH 13/17] Revert "cgroup: fix uaf when proc_cpuset_show" + +hulk inclusion +category: bugfix +bugzilla: https://gitee.com/openeuler/kernel/issues/IA9YQ9 + +-------------------------------- + +To keep the same with the mainline and backport the lts patch. +This reverts commit 24c448de81d48ad08925dda9869bcf535a3258b8. + +Fixes: 24c448de81d4 ("cgroup: fix uaf when proc_cpuset_show") +Signed-off-by: Chen Ridong +--- + kernel/cgroup/cpuset.c | 24 ------------------------ + 1 file changed, 24 deletions(-) + +diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c +index 2c9e50f09fc1..140dfb5ad3fc 100644 +--- a/kernel/cgroup/cpuset.c ++++ b/kernel/cgroup/cpuset.c +@@ -5185,7 +5185,6 @@ int proc_cpuset_show(struct seq_file *m, struct pid_namespace *ns, + char *buf; + struct cgroup_subsys_state *css; + int retval; +- struct cgroup *root_cgroup = NULL; + + retval = -ENOMEM; + buf = kmalloc(PATH_MAX, GFP_KERNEL); +@@ -5193,32 +5192,9 @@ int proc_cpuset_show(struct seq_file *m, struct pid_namespace *ns, + goto out; + + css = task_get_css(tsk, cpuset_cgrp_id); +- rcu_read_lock(); +- /* +- * When the cpuset subsystem is mounted on the legacy hierarchy, +- * the top_cpuset.css->cgroup does not hold a reference count of +- * cgroup_root.cgroup. This makes accessing css->cgroup very +- * dangerous because when the cpuset subsystem is remounted to the +- * default hierarchy, the cgroup_root.cgroup that css->cgroup points +- * to will be released, leading to a UAF issue. To avoid this problem, +- * get the reference count of top_cpuset.css->cgroup first. +- * +- * This is ugly!! 
+- */ +- if (css == &top_cpuset.css) { +- root_cgroup = css->cgroup; +- if (!css_tryget_online(&root_cgroup->self)) { +- rcu_read_unlock(); +- retval = -EBUSY; +- goto out_free; +- } +- } +- rcu_read_unlock(); + retval = cgroup_path_ns(css->cgroup, buf, PATH_MAX, + current->nsproxy->cgroup_ns); + css_put(css); +- if (root_cgroup) +- css_put(&root_cgroup->self); + if (retval >= PATH_MAX) + retval = -ENAMETOOLONG; + if (retval < 0) +-- +2.25.1 + diff --git a/0018-cgroup-Make-operations-on-the-cgroup-root_list-RCU-s.patch b/0018-cgroup-Make-operations-on-the-cgroup-root_list-RCU-s.patch new file mode 100644 index 0000000..0c54088 --- /dev/null +++ b/0018-cgroup-Make-operations-on-the-cgroup-root_list-RCU-s.patch @@ -0,0 +1,145 @@ +From 7b6abe1742cbfedea405f03fcf7fc88cacb2a205 Mon Sep 17 00:00:00 2001 +From: Yafang Shao +Date: Wed, 18 Dec 2024 08:11:00 +0000 +Subject: [PATCH 14/17] cgroup: Make operations on the cgroup root_list RCU + safe +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +stable inclusion +from stable-v6.6.47 +commit dd9542ae7c7ca82ed2d7c185754ba9026361f6bc +category: bugfix +bugzilla: https://gitee.com/openeuler/kernel/issues/IAP55A + +Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=dd9542ae7c7ca82ed2d7c185754ba9026361f6bc + +-------------------------------- + +commit d23b5c577715892c87533b13923306acc6243f93 upstream. + +At present, when we perform operations on the cgroup root_list, we must +hold the cgroup_mutex, which is a relatively heavyweight lock. In reality, +we can make operations on this list RCU-safe, eliminating the need to hold +the cgroup_mutex during traversal. Modifications to the list only occur in +the cgroup root setup and destroy paths, which should be infrequent in a +production environment. In contrast, traversal may occur frequently. +Therefore, making it RCU-safe would be beneficial. 
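+
+The resulting usage pattern, sketched briefly with the for_each_root()
+helper as it looks after this change:
+
+        /* reader: no cgroup_mutex needed */
+        rcu_read_lock();
+        for_each_root(root) {
+                /* root stays valid until rcu_read_unlock(), because the
+                 * root is now freed with kfree_rcu()
+                 */
+        }
+        rcu_read_unlock();
+
+        /* writers keep holding cgroup_mutex and use the _rcu list helpers */
+        list_add_rcu(&root->root_list, &cgroup_roots);
+        ...
+        list_del_rcu(&root->root_list);
+        kfree_rcu(root, rcu);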
+ +Signed-off-by: Yafang Shao +Signed-off-by: Tejun Heo +To: Michal Koutný +Signed-off-by: Greg Kroah-Hartman +Signed-off-by: Chen Ridong +--- + include/linux/cgroup-defs.h | 1 + + kernel/cgroup/cgroup-internal.h | 3 ++- + kernel/cgroup/cgroup.c | 23 ++++++++++++++++------- + 3 files changed, 19 insertions(+), 8 deletions(-) + +diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h +index 6e3227a688de..05ece896af7d 100644 +--- a/include/linux/cgroup-defs.h ++++ b/include/linux/cgroup-defs.h +@@ -591,6 +591,7 @@ struct cgroup_root { + + /* A list running through the active hierarchies */ + struct list_head root_list; ++ struct rcu_head rcu; + + /* Hierarchy-specific flags */ + unsigned int flags; +diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h +index 96a9bd2c26f0..f5fb12890645 100644 +--- a/kernel/cgroup/cgroup-internal.h ++++ b/kernel/cgroup/cgroup-internal.h +@@ -170,7 +170,8 @@ extern struct list_head cgroup_roots; + + /* iterate across the hierarchies */ + #define for_each_root(root) \ +- list_for_each_entry((root), &cgroup_roots, root_list) ++ list_for_each_entry_rcu((root), &cgroup_roots, root_list, \ ++ lockdep_is_held(&cgroup_mutex)) + + /** + * for_each_subsys - iterate all enabled cgroup subsystems +diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c +index 52fe6ba2fefd..c26a9b3a3576 100644 +--- a/kernel/cgroup/cgroup.c ++++ b/kernel/cgroup/cgroup.c +@@ -1315,7 +1315,7 @@ static void cgroup_exit_root_id(struct cgroup_root *root) + + void cgroup_free_root(struct cgroup_root *root) + { +- kfree(root); ++ kfree_rcu(root, rcu); + } + + static void cgroup_destroy_root(struct cgroup_root *root) +@@ -1348,7 +1348,7 @@ static void cgroup_destroy_root(struct cgroup_root *root) + spin_unlock_irq(&css_set_lock); + + if (!list_empty(&root->root_list)) { +- list_del(&root->root_list); ++ list_del_rcu(&root->root_list); + cgroup_root_count--; + } + +@@ -1388,7 +1388,15 @@ static inline struct cgroup *__cset_cgroup_from_root(struct css_set *cset, + } + } + +- BUG_ON(!res_cgroup); ++ /* ++ * If cgroup_mutex is not held, the cgrp_cset_link will be freed ++ * before we remove the cgroup root from the root_list. Consequently, ++ * when accessing a cgroup root, the cset_link may have already been ++ * freed, resulting in a NULL res_cgroup. However, by holding the ++ * cgroup_mutex, we ensure that res_cgroup can't be NULL. ++ * If we don't hold cgroup_mutex in the caller, we must do the NULL ++ * check. ++ */ + return res_cgroup; + } + +@@ -1447,7 +1455,6 @@ static struct cgroup *current_cgns_cgroup_dfl(void) + static struct cgroup *cset_cgroup_from_root(struct css_set *cset, + struct cgroup_root *root) + { +- lockdep_assert_held(&cgroup_mutex); + lockdep_assert_held(&css_set_lock); + + return __cset_cgroup_from_root(cset, root); +@@ -1455,7 +1462,9 @@ static struct cgroup *cset_cgroup_from_root(struct css_set *cset, + + /* + * Return the cgroup for "task" from the given hierarchy. Must be +- * called with cgroup_mutex and css_set_lock held. ++ * called with css_set_lock held to prevent task's groups from being modified. ++ * Must be called with either cgroup_mutex or rcu read lock to prevent the ++ * cgroup root from being destroyed. 
+ */ + struct cgroup *task_cgroup_from_root(struct task_struct *task, + struct cgroup_root *root) +@@ -2030,7 +2039,7 @@ void init_cgroup_root(struct cgroup_fs_context *ctx) + struct cgroup_root *root = ctx->root; + struct cgroup *cgrp = &root->cgrp; + +- INIT_LIST_HEAD(&root->root_list); ++ INIT_LIST_HEAD_RCU(&root->root_list); + atomic_set(&root->nr_cgrps, 1); + cgrp->root = root; + init_cgroup_housekeeping(cgrp); +@@ -2114,7 +2123,7 @@ int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask) + * care of subsystems' refcounts, which are explicitly dropped in + * the failure exit path. + */ +- list_add(&root->root_list, &cgroup_roots); ++ list_add_rcu(&root->root_list, &cgroup_roots); + cgroup_root_count++; + + /* +-- +2.25.1 + diff --git a/0019-cgroup-Move-rcu_head-up-near-the-top-of-cgroup_root.patch b/0019-cgroup-Move-rcu_head-up-near-the-top-of-cgroup_root.patch new file mode 100644 index 0000000..45d7802 --- /dev/null +++ b/0019-cgroup-Move-rcu_head-up-near-the-top-of-cgroup_root.patch @@ -0,0 +1,84 @@ +From 4363688e9b49bde3cce7b2ea1882f3d44d1f5289 Mon Sep 17 00:00:00 2001 +From: Waiman Long +Date: Wed, 18 Dec 2024 08:11:01 +0000 +Subject: [PATCH 15/17] cgroup: Move rcu_head up near the top of cgroup_root +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +stable inclusion +from stable-v6.6.47 +commit f3c60ab676bb62e01d004d5b1cf2963a296c8e6a +category: bugfix +bugzilla: https://gitee.com/openeuler/kernel/issues/IAP55A + +Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=f3c60ab676bb62e01d004d5b1cf2963a296c8e6a + +-------------------------------- + +commit a7fb0423c201ba12815877a0b5a68a6a1710b23a upstream. + +Commit 331654dc5f40 ("cgroup: Make operations on the cgroup root_list RCU +safe") adds a new rcu_head to the cgroup_root structure and kvfree_rcu() +for freeing the cgroup_root. + +The current implementation of kvfree_rcu(), however, has the limitation +that the offset of the rcu_head structure within the larger data +structure must be less than 4096 or the compilation will fail. See the +macro definition of __is_kvfree_rcu_offset() in include/linux/rcupdate.h +for more information. + +By putting rcu_head below the large cgroup structure, any change to the +cgroup structure that makes it larger run the risk of causing build +failure under certain configurations. Commit 77070eeb8821 ("cgroup: +Avoid false cacheline sharing of read mostly rstat_cpu") happens to be +the last straw that breaks it. Fix this problem by moving the rcu_head +structure up before the cgroup structure. + +Fixes: 331654dc5f40 ("cgroup: Make operations on the cgroup root_list RCU safe") +Reported-by: Stephen Rothwell +Closes: https://lore.kernel.org/lkml/20231207143806.114e0a74@canb.auug.org.au/ +Signed-off-by: Waiman Long +Acked-by: Yafang Shao +Reviewed-by: Yosry Ahmed +Reviewed-by: Michal Koutný +Signed-off-by: Tejun Heo +Signed-off-by: Greg Kroah-Hartman + +Conflicts: + include/linux/cgroup-defs.h +[Context is mismatched for wait_queue_head_t wait was merged] +Signed-off-by: Chen Ridong +--- + include/linux/cgroup-defs.h | 8 ++++---- + 1 file changed, 4 insertions(+), 4 deletions(-) + +diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h +index 05ece896af7d..8eb518ce87a1 100644 +--- a/include/linux/cgroup-defs.h ++++ b/include/linux/cgroup-defs.h +@@ -573,6 +573,10 @@ struct cgroup_root { + /* Unique id for this hierarchy. 
*/ + int hierarchy_id; + ++ /* A list running through the active hierarchies */ ++ struct list_head root_list; ++ struct rcu_head rcu; /* Must be near the top */ ++ + /* + * The root cgroup. The containing cgroup_root will be destroyed on its + * release. cgrp->ancestors[0] will be used overflowing into the +@@ -589,10 +593,6 @@ struct cgroup_root { + /* Wait while cgroups are being destroyed */ + wait_queue_head_t wait; + +- /* A list running through the active hierarchies */ +- struct list_head root_list; +- struct rcu_head rcu; +- + /* Hierarchy-specific flags */ + unsigned int flags; + +-- +2.25.1 + diff --git a/0020-cgroup-cpuset-Prevent-UAF-in-proc_cpuset_show.patch b/0020-cgroup-cpuset-Prevent-UAF-in-proc_cpuset_show.patch new file mode 100644 index 0000000..c528ff3 --- /dev/null +++ b/0020-cgroup-cpuset-Prevent-UAF-in-proc_cpuset_show.patch @@ -0,0 +1,110 @@ +From 724b6581cd8b49962e3add6e8795423f2c1390f8 Mon Sep 17 00:00:00 2001 +From: Chen Ridong +Date: Wed, 18 Dec 2024 08:11:02 +0000 +Subject: [PATCH 16/17] cgroup/cpuset: Prevent UAF in proc_cpuset_show() + +stable inclusion +from stable-v6.6.44 +commit 96226fbed566f3f686f53a489a29846f2d538080 +category: bugfix +bugzilla: https://gitee.com/openeuler/kernel/issues/IAP55A + +Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=96226fbed566f3f686f53a489a29846f2d538080 + +-------------------------------- + +[ Upstream commit 1be59c97c83ccd67a519d8a49486b3a8a73ca28a ] + +An UAF can happen when /proc/cpuset is read as reported in [1]. + +This can be reproduced by the following methods: +1.add an mdelay(1000) before acquiring the cgroup_lock In the + cgroup_path_ns function. +2.$cat /proc//cpuset repeatly. +3.$mount -t cgroup -o cpuset cpuset /sys/fs/cgroup/cpuset/ +$umount /sys/fs/cgroup/cpuset/ repeatly. + +The race that cause this bug can be shown as below: + +(umount) | (cat /proc//cpuset) +css_release | proc_cpuset_show +css_release_work_fn | css = task_get_css(tsk, cpuset_cgrp_id); +css_free_rwork_fn | cgroup_path_ns(css->cgroup, ...); +cgroup_destroy_root | mutex_lock(&cgroup_mutex); +rebind_subsystems | +cgroup_free_root | + | // cgrp was freed, UAF + | cgroup_path_ns_locked(cgrp,..); + +When the cpuset is initialized, the root node top_cpuset.css.cgrp +will point to &cgrp_dfl_root.cgrp. In cgroup v1, the mount operation will +allocate cgroup_root, and top_cpuset.css.cgrp will point to the allocated +&cgroup_root.cgrp. When the umount operation is executed, +top_cpuset.css.cgrp will be rebound to &cgrp_dfl_root.cgrp. + +The problem is that when rebinding to cgrp_dfl_root, there are cases +where the cgroup_root allocated by setting up the root for cgroup v1 +is cached. This could lead to a Use-After-Free (UAF) if it is +subsequently freed. The descendant cgroups of cgroup v1 can only be +freed after the css is released. However, the css of the root will never +be released, yet the cgroup_root should be freed when it is unmounted. +This means that obtaining a reference to the css of the root does +not guarantee that css.cgrp->root will not be freed. + +Fix this problem by using rcu_read_lock in proc_cpuset_show(). +As cgroup_root is kfree_rcu after commit 331654dc5f40 +("cgroup: Make operations on the cgroup root_list RCU safe"), +css->cgroup won't be freed during the critical section. +To call cgroup_path_ns_locked, css_set_lock is needed, so it is safe to +replace task_get_css with task_css. 
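+
+Put side by side (a sketch of the two patterns, with the cgroup namespace
+argument abbreviated to "ns"):
+
+        /* before: pins the css, but not the cgroup_root its cgroup lives in */
+        css = task_get_css(tsk, cpuset_cgrp_id);
+        retval = cgroup_path_ns(css->cgroup, buf, PATH_MAX, ns);
+        css_put(css);
+
+        /* after: rcu + css_set_lock keep css->cgroup and its root alive
+         * for the duration of the lookup, no reference is taken at all
+         */
+        rcu_read_lock();
+        spin_lock_irq(&css_set_lock);
+        css = task_css(tsk, cpuset_cgrp_id);
+        retval = cgroup_path_ns_locked(css->cgroup, buf, PATH_MAX, ns);
+        spin_unlock_irq(&css_set_lock);
+        rcu_read_unlock();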
+ +[1] https://syzkaller.appspot.com/bug?extid=9b1ff7be974a403aa4cd + +Fixes: a79a908fd2b0 ("cgroup: introduce cgroup namespaces") +Signed-off-by: Chen Ridong +Signed-off-by: Tejun Heo +Signed-off-by: Sasha Levin + +Conflicts: + kernel/cgroup/cpuset.c +[commit 5715456af3e0 ("kernfs: Convert kernfs_path_from_node_locked() +from strlcpy() to strscpy()") was not merged] +Signed-off-by: Chen Ridong +--- + kernel/cgroup/cpuset.c | 13 +++++++++---- + 1 file changed, 9 insertions(+), 4 deletions(-) + +diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c +index 140dfb5ad3fc..f3cf9b1268e0 100644 +--- a/kernel/cgroup/cpuset.c ++++ b/kernel/cgroup/cpuset.c +@@ -21,6 +21,7 @@ + * License. See the file COPYING in the main directory of the Linux + * distribution for more details. + */ ++#include "cgroup-internal.h" + + #include + #include +@@ -5191,10 +5192,14 @@ int proc_cpuset_show(struct seq_file *m, struct pid_namespace *ns, + if (!buf) + goto out; + +- css = task_get_css(tsk, cpuset_cgrp_id); +- retval = cgroup_path_ns(css->cgroup, buf, PATH_MAX, +- current->nsproxy->cgroup_ns); +- css_put(css); ++ rcu_read_lock(); ++ spin_lock_irq(&css_set_lock); ++ css = task_css(tsk, cpuset_cgrp_id); ++ retval = cgroup_path_ns_locked(css->cgroup, buf, PATH_MAX, ++ current->nsproxy->cgroup_ns); ++ spin_unlock_irq(&css_set_lock); ++ rcu_read_unlock(); ++ + if (retval >= PATH_MAX) + retval = -ENAMETOOLONG; + if (retval < 0) +-- +2.25.1 + diff --git a/0021-cgroup-add-more-reserve-kabi.patch b/0021-cgroup-add-more-reserve-kabi.patch new file mode 100644 index 0000000..5c0ed08 --- /dev/null +++ b/0021-cgroup-add-more-reserve-kabi.patch @@ -0,0 +1,90 @@ +From d68991f87f738657074d93a1ae8ccf865f40b65a Mon Sep 17 00:00:00 2001 +From: Chen Ridong +Date: Wed, 18 Dec 2024 08:11:03 +0000 +Subject: [PATCH 17/17] cgroup: add more reserve kabi + +hulk inclusion +category: feature +bugzilla: https://gitee.com/openeuler/kernel/issues/I8SA3O + +-------------------------------- + +Reserve KABI for future feature development. 
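+
+Each KABI_RESERVE(n) is a placeholder slot; when a later feature needs a
+new field it can take a slot over in place with KABI_USE(), as already
+done for struct iopf_group earlier in this series, so the structure size
+and the offsets of existing members stay unchanged. A small sketch (the
+field name is hypothetical):
+
+        /* reserved today */
+        KABI_RESERVE(5)
+        KABI_RESERVE(6)
+
+        /* consumed later by a hypothetical feature, layout unchanged */
+        KABI_USE(5, struct new_feature_state *feat_state)
+        KABI_RESERVE(6)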
+ +Signed-off-by: Chen Ridong +--- + include/linux/cgroup-defs.h | 7 +++++++ + include/linux/memcontrol.h | 8 ++++++++ + kernel/cgroup/cpuset.c | 5 ----- + 3 files changed, 15 insertions(+), 5 deletions(-) + +diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h +index 8eb518ce87a1..f3fd0407d346 100644 +--- a/include/linux/cgroup-defs.h ++++ b/include/linux/cgroup-defs.h +@@ -325,6 +325,8 @@ struct cgroup_base_stat { + #ifdef CONFIG_SCHED_CORE + u64 forceidle_sum; + #endif ++ KABI_RESERVE(1) ++ KABI_RESERVE(2) + }; + + /* +@@ -555,6 +557,9 @@ struct cgroup { + KABI_RESERVE(3) + KABI_RESERVE(4) + KABI_RESERVE(5) ++ KABI_RESERVE(6) ++ KABI_RESERVE(7) ++ KABI_RESERVE(8) + /* All ancestors including self */ + struct cgroup *ancestors[]; + }; +@@ -606,6 +611,8 @@ struct cgroup_root { + KABI_RESERVE(2) + KABI_RESERVE(3) + KABI_RESERVE(4) ++ KABI_RESERVE(5) ++ KABI_RESERVE(6) + }; + + /* +diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h +index b2a80e089a0a..abe236201e68 100644 +--- a/include/linux/memcontrol.h ++++ b/include/linux/memcontrol.h +@@ -429,6 +429,14 @@ struct mem_cgroup { + KABI_RESERVE(6) + KABI_RESERVE(7) + KABI_RESERVE(8) ++ KABI_RESERVE(9) ++ KABI_RESERVE(10) ++ KABI_RESERVE(11) ++ KABI_RESERVE(12) ++ KABI_RESERVE(13) ++ KABI_RESERVE(14) ++ KABI_RESERVE(15) ++ KABI_RESERVE(16) + struct mem_cgroup_per_node *nodeinfo[]; + }; + +diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c +index f3cf9b1268e0..7ea0a6d00519 100644 +--- a/kernel/cgroup/cpuset.c ++++ b/kernel/cgroup/cpuset.c +@@ -211,11 +211,6 @@ struct cpuset { + + /* Remote partition silbling list anchored at remote_children */ + struct list_head remote_sibling; +- +- KABI_RESERVE(1) +- KABI_RESERVE(2) +- KABI_RESERVE(3) +- KABI_RESERVE(4) + }; + + /* +-- +2.25.1 + diff --git a/0022-14223.patch b/0022-14223.patch new file mode 100644 index 0000000..b103427 --- /dev/null +++ b/0022-14223.patch @@ -0,0 +1,80 @@ +From f8cb61566576a623971d5cc8dd3cd6229e787e30 Mon Sep 17 00:00:00 2001 +From: Zhang Changzhong +Date: Wed, 18 Dec 2024 17:50:29 +0800 +Subject: [PATCH] kabi: net: reserve space for xdp subsystem related structure + +hulk inclusion +category: other +bugzilla: https://gitee.com/openeuler/kernel/issues/I8OWRC + +---------------------------------------------------- + +Reserve some fields beforehand for xdp framework related structures +prone to change. 
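+
+If a size regression ever needs to be caught at build time, a guard can
+sit next to the structure. The sketch below is hypothetical (it is not
+part of this patch, and XDP_BUFF_KABI_SIZE is a placeholder that would
+have to come from the released ABI baseline):
+
+        #include <linux/build_bug.h>
+
+        /* fails the build if the reserved padding is ever dropped or resized */
+        static_assert(sizeof(struct xdp_buff) == XDP_BUFF_KABI_SIZE,
+                      "struct xdp_buff KABI size changed");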
+ +Signed-off-by: Zhang Changzhong +--- + include/net/xdp.h | 19 +++++++++++++++++++ + 1 file changed, 19 insertions(+) + +diff --git a/include/net/xdp.h b/include/net/xdp.h +index c283668458ca..9b9c7dc25eeb 100644 +--- a/include/net/xdp.h ++++ b/include/net/xdp.h +@@ -54,6 +54,9 @@ enum xdp_mem_type { + struct xdp_mem_info { + u32 type; /* enum xdp_mem_type, but known size type */ + u32 id; ++ ++ KABI_RESERVE(1) ++ KABI_RESERVE(2) + }; + + struct page_pool; +@@ -74,6 +77,9 @@ struct xdp_rxq_info { + + struct xdp_txq_info { + struct net_device *dev; ++ ++ KABI_RESERVE(1) ++ KABI_RESERVE(2) + }; + + enum xdp_buff_flags { +@@ -92,6 +98,11 @@ struct xdp_buff { + struct xdp_txq_info *txq; + u32 frame_sz; /* frame size to deduce data_hard_end/reserved tailroom*/ + u32 flags; /* supported values defined in xdp_buff_flags */ ++ ++ KABI_RESERVE(1) ++ KABI_RESERVE(2) ++ KABI_RESERVE(3) ++ KABI_RESERVE(4) + }; + + static __always_inline bool xdp_buff_has_frags(struct xdp_buff *xdp) +@@ -181,6 +192,11 @@ struct xdp_frame { + struct net_device *dev_rx; /* used by cpumap */ + u32 frame_sz; + u32 flags; /* supported values defined in xdp_buff_flags */ ++ ++ KABI_RESERVE(1) ++ KABI_RESERVE(2) ++ KABI_RESERVE(3) ++ KABI_RESERVE(4) + }; + + static __always_inline bool xdp_frame_has_frags(struct xdp_frame *frame) +@@ -198,6 +214,9 @@ struct xdp_frame_bulk { + int count; + void *xa; + void *q[XDP_BULK_QUEUE_SIZE]; ++ ++ KABI_RESERVE(1) ++ KABI_RESERVE(2) + }; + + static __always_inline void xdp_frame_bulk_init(struct xdp_frame_bulk *bq) +-- +Gitee + diff --git a/0023-14224.patch b/0023-14224.patch new file mode 100644 index 0000000..62ba017 --- /dev/null +++ b/0023-14224.patch @@ -0,0 +1,85 @@ +From a2bbb3a7e3d30f5efc443fa17fcfe20fdd5a98d5 Mon Sep 17 00:00:00 2001 +From: Dong Chenchen +Date: Wed, 18 Dec 2024 17:15:36 +0800 +Subject: [PATCH] net/kabi: Reserve space for net structures + +hulk inclusion +category: feature +bugzilla: https://gitee.com/openeuler/kernel/issues/IBC1RH + +-------------------------------- + +Reserve some fields beforehand for net subsystem related +structures prone to change. 
+ +Signed-off-by: Dong Chenchen +--- + include/net/flow.h | 2 ++ + include/net/netns/netfilter.h | 2 ++ + include/net/netns/xfrm.h | 2 ++ + include/net/xfrm.h | 4 ++++ + 4 files changed, 10 insertions(+) + +diff --git a/include/net/flow.h b/include/net/flow.h +index 0cc5f2ef1000..72d2ea2374ba 100644 +--- a/include/net/flow.h ++++ b/include/net/flow.h +@@ -46,6 +46,8 @@ struct flowi_common { + + KABI_RESERVE(1) + KABI_RESERVE(2) ++ KABI_RESERVE(3) ++ KABI_RESERVE(4) + }; + + union flowi_uli { +diff --git a/include/net/netns/netfilter.h b/include/net/netns/netfilter.h +index 4b77a9b031b6..963588269637 100644 +--- a/include/net/netns/netfilter.h ++++ b/include/net/netns/netfilter.h +@@ -34,5 +34,7 @@ struct netns_nf { + + KABI_RESERVE(1) + KABI_RESERVE(2) ++ KABI_RESERVE(3) ++ KABI_RESERVE(4) + }; + #endif +diff --git a/include/net/netns/xfrm.h b/include/net/netns/xfrm.h +index a0c1359cc7eb..af7f20ef4823 100644 +--- a/include/net/netns/xfrm.h ++++ b/include/net/netns/xfrm.h +@@ -87,6 +87,8 @@ struct netns_xfrm { + + KABI_RESERVE(1) + KABI_RESERVE(2) ++ KABI_RESERVE(3) ++ KABI_RESERVE(4) + }; + + #endif +diff --git a/include/net/xfrm.h b/include/net/xfrm.h +index c875faf98492..b9dec5f9c973 100644 +--- a/include/net/xfrm.h ++++ b/include/net/xfrm.h +@@ -294,6 +294,8 @@ struct xfrm_state { + + KABI_RESERVE(1) + KABI_RESERVE(2) ++ KABI_RESERVE(3) ++ KABI_RESERVE(4) + }; + + static inline struct net *xs_net(struct xfrm_state *x) +@@ -562,6 +564,8 @@ struct xfrm_policy { + + KABI_RESERVE(1) + KABI_RESERVE(2) ++ KABI_RESERVE(3) ++ KABI_RESERVE(4) + }; + + static inline struct net *xp_net(const struct xfrm_policy *xp) +-- +Gitee + diff --git a/0024-14225.patch b/0024-14225.patch new file mode 100644 index 0000000..32a1037 --- /dev/null +++ b/0024-14225.patch @@ -0,0 +1,154 @@ +From 279803fa98908bd367cec04ae2600c15764fb977 Mon Sep 17 00:00:00 2001 +From: Luo Gengkun +Date: Wed, 18 Dec 2024 09:45:31 +0000 +Subject: [PATCH 1/3] kabi: reserve space for perf_event.h + +hulk inclusion +category: feature +bugzilla: https://gitee.com/src-openeuler/kernel/issues/IBC1PM + +-------------------------------- + +reserve space for perf_event.h + +Signed-off-by: Luo Gengkun +--- + include/linux/perf_event.h | 16 ++++++++++++++++ + 1 file changed, 16 insertions(+) + +diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h +index 89f2a02db563..fe692e9bd0b2 100644 +--- a/include/linux/perf_event.h ++++ b/include/linux/perf_event.h +@@ -1010,6 +1010,14 @@ struct perf_cpu_pmu_context { + struct hrtimer hrtimer; + ktime_t hrtimer_interval; + unsigned int hrtimer_active; ++ KABI_RESERVE(1) ++ KABI_RESERVE(2) ++ KABI_RESERVE(3) ++ KABI_RESERVE(4) ++ KABI_RESERVE(5) ++ KABI_RESERVE(6) ++ KABI_RESERVE(7) ++ KABI_RESERVE(8) + }; + + /** +@@ -1031,6 +1039,14 @@ struct perf_cpu_context { + int heap_size; + struct perf_event **heap; + struct perf_event *heap_default[2]; ++ KABI_RESERVE(1) ++ KABI_RESERVE(2) ++ KABI_RESERVE(3) ++ KABI_RESERVE(4) ++ KABI_RESERVE(5) ++ KABI_RESERVE(6) ++ KABI_RESERVE(7) ++ KABI_RESERVE(8) + }; + + struct perf_output_handle { +-- +Gitee + + +From 078ad81846b81844eb98f90eee57c06954715c8d Mon Sep 17 00:00:00 2001 +From: Luo Gengkun +Date: Wed, 18 Dec 2024 09:45:32 +0000 +Subject: [PATCH 2/3] kabi: reserve space for internal.h + +hulk inclusion +category: feature +bugzilla: https://gitee.com/src-openeuler/kernel/issues/IBC1PM + +-------------------------------- + +reserve space for internal.h + +Signed-off-by: Luo Gengkun +--- + kernel/events/internal.h | 10 ++++++++++ + 1 file changed, 10 
insertions(+) + +diff --git a/kernel/events/internal.h b/kernel/events/internal.h +index d2e6e6144c54..d1ffa00b91b6 100644 +--- a/kernel/events/internal.h ++++ b/kernel/events/internal.h +@@ -5,6 +5,7 @@ + #include + #include + #include ++#include + + /* Buffer handling */ + +@@ -54,6 +55,15 @@ struct perf_buffer { + void **aux_pages; + void *aux_priv; + ++ KABI_RESERVE(1) ++ KABI_RESERVE(2) ++ KABI_RESERVE(3) ++ KABI_RESERVE(4) ++ KABI_RESERVE(5) ++ KABI_RESERVE(6) ++ KABI_RESERVE(7) ++ KABI_RESERVE(8) ++ + struct perf_event_mmap_page *user_page; + void *data_pages[]; + }; +-- +Gitee + + +From 59a2a3e8b1c35d9e0bde08cd2e6f01f1c12d384b Mon Sep 17 00:00:00 2001 +From: Luo Gengkun +Date: Wed, 18 Dec 2024 09:45:33 +0000 +Subject: [PATCH 3/3] kabi: reserve space for uprobes.h + +hulk inclusion +category: feature +bugzilla: https://gitee.com/src-openeuler/kernel/issues/IBC1PM + +-------------------------------- + +reserve space for uprobes.h + +Signed-off-by: Luo Gengkun +--- + include/linux/uprobes.h | 9 +++++++++ + 1 file changed, 9 insertions(+) + +diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h +index f46e0ca0169c..86d0868b584a 100644 +--- a/include/linux/uprobes.h ++++ b/include/linux/uprobes.h +@@ -47,6 +47,7 @@ struct uprobe_consumer { + + #ifdef CONFIG_UPROBES + #include ++#include + + enum uprobe_task_state { + UTASK_RUNNING, +@@ -78,6 +79,14 @@ struct uprobe_task { + + struct return_instance *return_instances; + unsigned int depth; ++ KABI_RESERVE(1) ++ KABI_RESERVE(2) ++ KABI_RESERVE(3) ++ KABI_RESERVE(4) ++ KABI_RESERVE(5) ++ KABI_RESERVE(6) ++ KABI_RESERVE(7) ++ KABI_RESERVE(8) + }; + + struct return_instance { +-- +Gitee + diff --git a/0026-14227.patch b/0026-14227.patch new file mode 100644 index 0000000..4caa739 --- /dev/null +++ b/0026-14227.patch @@ -0,0 +1,3464 @@ +From 3c8ff7deba8ed905fb4c3d05ccccdecb6000b7d4 Mon Sep 17 00:00:00 2001 +From: Chengming Zhou +Date: Wed, 18 Dec 2024 17:51:06 +0800 +Subject: [PATCH 01/14] mm/zswap: invalidate zswap entry when swap entry free + +mainline inclusion +from mainline-v6.9-rc1 +commit 0827a1fb143fae588cb6f5b9a97c405d6c2ddec9 +category: performance +bugzilla: https://gitee.com/openeuler/kernel/issues/IBC5I1 + +Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=0827a1fb143fae588cb6f5b9a97c405d6c2ddec9 + +-------------------------------- + +During testing I found there are some times the zswap_writeback_entry() +return -ENOMEM, which is not we expected: + +bpftrace -e 'kr:zswap_writeback_entry {@[(int32)retval]=count()}' +@[-12]: 1563 +@[0]: 277221 + +The reason is that __read_swap_cache_async() return NULL because +swapcache_prepare() failed. The reason is that we won't invalidate zswap +entry when swap entry freed to the per-cpu pool, these zswap entries are +still on the zswap tree and lru list. + +This patch moves the invalidation ahead to when swap entry freed to the +per-cpu pool, since there is no any benefit to leave trashy zswap entry on +the tree and lru list. + +With this patch: +bpftrace -e 'kr:zswap_writeback_entry {@[(int32)retval]=count()}' +@[0]: 259744 + +Note: large folio can't have zswap entry for now, so don't bother +to add zswap entry invalidation in the large folio swap free path. 
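+
+The interface change that goes with the move, in short (sketch taken from
+the hunks below):
+
+        /* old call site in swap_range_free(): the caller split the entry */
+        zswap_invalidate(si->type, offset);
+
+        /* new call site in free_swap_slot(): pass the swp_entry_t and let
+         * zswap derive type/offset itself via swp_type()/swp_offset()
+         */
+        zswap_invalidate(entry);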
+ +Link: https://lkml.kernel.org/r/20240201-b4-zswap-invalidate-entry-v2-2-99d4084260a0@bytedance.com +Signed-off-by: Chengming Zhou +Reviewed-by: Nhat Pham +Acked-by: Johannes Weiner +Acked-by: Yosry Ahmed +Signed-off-by: Andrew Morton +Conflicts: + include/linux/zswap.h + mm/zswap.c +[ Context conflict. ] +Signed-off-by: Liu Shixin +--- + include/linux/zswap.h | 4 ++-- + mm/swap_slots.c | 4 ++++ + mm/swapfile.c | 1 - + mm/zswap.c | 5 +++-- + 4 files changed, 9 insertions(+), 5 deletions(-) + +diff --git a/include/linux/zswap.h b/include/linux/zswap.h +index 2a60ce39cfde..a13d2d2d9131 100644 +--- a/include/linux/zswap.h ++++ b/include/linux/zswap.h +@@ -12,7 +12,7 @@ extern atomic_t zswap_stored_pages; + + bool zswap_store(struct folio *folio); + bool zswap_load(struct folio *folio); +-void zswap_invalidate(int type, pgoff_t offset); ++void zswap_invalidate(swp_entry_t swp); + void zswap_swapon(int type); + void zswap_swapoff(int type); + +@@ -28,7 +28,7 @@ static inline bool zswap_load(struct folio *folio) + return false; + } + +-static inline void zswap_invalidate(int type, pgoff_t offset) {} ++static inline void zswap_invalidate(swp_entry_t swp) {} + static inline void zswap_swapon(int type) {} + static inline void zswap_swapoff(int type) {} + +diff --git a/mm/swap_slots.c b/mm/swap_slots.c +index 7af3b93d4c8c..5579eed7065f 100644 +--- a/mm/swap_slots.c ++++ b/mm/swap_slots.c +@@ -34,6 +34,7 @@ + #include + #include + #include ++#include + + static DEFINE_PER_CPU(struct swap_slots_cache, swp_slots); + #ifdef CONFIG_MEMCG_SWAP_QOS +@@ -394,6 +395,9 @@ void free_swap_slot(swp_entry_t entry) + { + struct swap_slots_cache *cache; + ++ /* Large folio swap slot is not covered. */ ++ zswap_invalidate(entry); ++ + cache = raw_cpu_ptr(&swp_slots); + if (likely(use_swap_slot_cache && cache->slots_ret)) { + spin_lock_irq(&cache->free_lock); +diff --git a/mm/swapfile.c b/mm/swapfile.c +index 3af5b6ebb241..30832b85d6c2 100644 +--- a/mm/swapfile.c ++++ b/mm/swapfile.c +@@ -765,7 +765,6 @@ static void swap_range_free(struct swap_info_struct *si, unsigned long offset, + swap_slot_free_notify = NULL; + while (offset <= end) { + arch_swap_invalidate_page(si->type, offset); +- zswap_invalidate(si->type, offset); + if (swap_slot_free_notify) + swap_slot_free_notify(si->bdev, offset); + offset++; +diff --git a/mm/zswap.c b/mm/zswap.c +index 69681b9173fd..5acda5b906bc 100644 +--- a/mm/zswap.c ++++ b/mm/zswap.c +@@ -1482,9 +1482,10 @@ bool zswap_load(struct folio *folio) + return ret; + } + +-void zswap_invalidate(int type, pgoff_t offset) ++void zswap_invalidate(swp_entry_t swp) + { +- struct zswap_tree *tree = zswap_trees[type]; ++ pgoff_t offset = swp_offset(swp); ++ struct zswap_tree *tree = zswap_trees[swp_type(swp)]; + struct zswap_entry *entry; + + /* find */ +-- +Gitee + + +From e2f02eacab254e29bd451782950ac6a03de685bd Mon Sep 17 00:00:00 2001 +From: Chris Li +Date: Wed, 18 Dec 2024 17:51:07 +0800 +Subject: [PATCH 02/14] mm: swap: swap cluster switch to double link list +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +mainline inclusion +from mainline-v6.12-rc1 +commit 73ed0baae66df50359c876f65f41179d6ebd2716 +category: performance +bugzilla: https://gitee.com/openeuler/kernel/issues/IBC5I1 + +Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=73ed0baae66df50359c876f65f41179d6ebd2716 + +-------------------------------- + +Patch series "mm: swap: mTHP swap allocator base on swap cluster order", +v5. 
+ +This is the short term solutions "swap cluster order" listed in my "Swap +Abstraction" discussion slice 8 in the recent LSF/MM conference. + +When commit 845982eb264bc "mm: swap: allow storage of all mTHP orders" is +introduced, it only allocates the mTHP swap entries from the new empty +cluster list.  It has a fragmentation issue reported by Barry. + +https://lore.kernel.org/all/CAGsJ_4zAcJkuW016Cfi6wicRr8N9X+GJJhgMQdSMp+Ah+NSgNQ@mail.gmail.com/ + +The reason is that all the empty clusters have been exhausted while there +are plenty of free swap entries in the cluster that are not 100% free. + +Remember the swap allocation order in the cluster. Keep track of the per +order non full cluster list for later allocation. + +This series gives the swap SSD allocation a new separate code path from +the HDD allocation. The new allocator use cluster list only and do not +global scan swap_map[] without lock any more. + +This streamline the swap allocation for SSD. The code matches the +execution flow much better. + +User impact: For users that allocate and free mix order mTHP swapping, It +greatly improves the success rate of the mTHP swap allocation after the +initial phase. + +It also performs faster when the swapfile is close to full, because the +allocator can get the non full cluster from a list rather than scanning a +lot of swap_map entries.  + +With Barry's mthp test program V2: + +Without: +$ ./thp_swap_allocator_test -a +Iteration 1: swpout inc: 32, swpout fallback inc: 192, Fallback percentage: 85.71% +Iteration 2: swpout inc: 0, swpout fallback inc: 231, Fallback percentage: 100.00% +Iteration 3: swpout inc: 0, swpout fallback inc: 227, Fallback percentage: 100.00% +... +Iteration 98: swpout inc: 0, swpout fallback inc: 224, Fallback percentage: 100.00% +Iteration 99: swpout inc: 0, swpout fallback inc: 215, Fallback percentage: 100.00% +Iteration 100: swpout inc: 0, swpout fallback inc: 222, Fallback percentage: 100.00% + +$ ./thp_swap_allocator_test -a -s +Iteration 1: swpout inc: 0, swpout fallback inc: 224, Fallback percentage: 100.00% +Iteration 2: swpout inc: 0, swpout fallback inc: 218, Fallback percentage: 100.00% +Iteration 3: swpout inc: 0, swpout fallback inc: 222, Fallback percentage: 100.00% +.. +Iteration 98: swpout inc: 0, swpout fallback inc: 228, Fallback percentage: 100.00% +Iteration 99: swpout inc: 0, swpout fallback inc: 230, Fallback percentage: 100.00% +Iteration 100: swpout inc: 0, swpout fallback inc: 229, Fallback percentage: 100.00% + +$ ./thp_swap_allocator_test -s +Iteration 1: swpout inc: 0, swpout fallback inc: 224, Fallback percentage: 100.00% +Iteration 2: swpout inc: 0, swpout fallback inc: 218, Fallback percentage: 100.00% +Iteration 3: swpout inc: 0, swpout fallback inc: 222, Fallback percentage: 100.00% +.. +Iteration 98: swpout inc: 0, swpout fallback inc: 228, Fallback percentage: 100.00% +Iteration 99: swpout inc: 0, swpout fallback inc: 230, Fallback percentage: 100.00% +Iteration 100: swpout inc: 0, swpout fallback inc: 229, Fallback percentage: 100.00% + +$ ./thp_swap_allocator_test +Iteration 1: swpout inc: 0, swpout fallback inc: 224, Fallback percentage: 100.00% +Iteration 2: swpout inc: 0, swpout fallback inc: 218, Fallback percentage: 100.00% +Iteration 3: swpout inc: 0, swpout fallback inc: 222, Fallback percentage: 100.00% +.. 
+Iteration 98: swpout inc: 0, swpout fallback inc: 228, Fallback percentage: 100.00% +Iteration 99: swpout inc: 0, swpout fallback inc: 230, Fallback percentage: 100.00% +Iteration 100: swpout inc: 0, swpout fallback inc: 229, Fallback percentage: 100.00% + +With: # with all 0.00% filter out +$ ./thp_swap_allocator_test -a | grep -v "0.00%" +$ # all result are 0.00% + +$ ./thp_swap_allocator_test -a -s | grep -v "0.00%" +./thp_swap_allocator_test -a -s | grep -v "0.00%" +Iteration 14: swpout inc: 223, swpout fallback inc: 3, Fallback percentage: 1.33% +Iteration 19: swpout inc: 219, swpout fallback inc: 7, Fallback percentage: 3.10% +Iteration 28: swpout inc: 225, swpout fallback inc: 1, Fallback percentage: 0.44% +Iteration 29: swpout inc: 227, swpout fallback inc: 1, Fallback percentage: 0.44% +Iteration 34: swpout inc: 220, swpout fallback inc: 8, Fallback percentage: 3.51% +Iteration 35: swpout inc: 222, swpout fallback inc: 11, Fallback percentage: 4.72% +Iteration 38: swpout inc: 217, swpout fallback inc: 4, Fallback percentage: 1.81% +Iteration 40: swpout inc: 222, swpout fallback inc: 6, Fallback percentage: 2.63% +Iteration 42: swpout inc: 221, swpout fallback inc: 2, Fallback percentage: 0.90% +Iteration 43: swpout inc: 215, swpout fallback inc: 7, Fallback percentage: 3.15% +Iteration 47: swpout inc: 226, swpout fallback inc: 2, Fallback percentage: 0.88% +Iteration 49: swpout inc: 217, swpout fallback inc: 1, Fallback percentage: 0.46% +Iteration 52: swpout inc: 221, swpout fallback inc: 8, Fallback percentage: 3.49% +Iteration 56: swpout inc: 224, swpout fallback inc: 4, Fallback percentage: 1.75% +Iteration 58: swpout inc: 214, swpout fallback inc: 5, Fallback percentage: 2.28% +Iteration 62: swpout inc: 220, swpout fallback inc: 3, Fallback percentage: 1.35% +Iteration 64: swpout inc: 224, swpout fallback inc: 1, Fallback percentage: 0.44% +Iteration 67: swpout inc: 221, swpout fallback inc: 1, Fallback percentage: 0.45% +Iteration 75: swpout inc: 220, swpout fallback inc: 9, Fallback percentage: 3.93% +Iteration 82: swpout inc: 227, swpout fallback inc: 1, Fallback percentage: 0.44% +Iteration 86: swpout inc: 211, swpout fallback inc: 12, Fallback percentage: 5.38% +Iteration 89: swpout inc: 226, swpout fallback inc: 2, Fallback percentage: 0.88% +Iteration 93: swpout inc: 220, swpout fallback inc: 1, Fallback percentage: 0.45% +Iteration 94: swpout inc: 224, swpout fallback inc: 1, Fallback percentage: 0.44% +Iteration 96: swpout inc: 221, swpout fallback inc: 6, Fallback percentage: 2.64% +Iteration 98: swpout inc: 227, swpout fallback inc: 1, Fallback percentage: 0.44% +Iteration 99: swpout inc: 227, swpout fallback inc: 3, Fallback percentage: 1.30% + +$ ./thp_swap_allocator_test +./thp_swap_allocator_test +Iteration 1: swpout inc: 233, swpout fallback inc: 0, Fallback percentage: 0.00% +Iteration 2: swpout inc: 131, swpout fallback inc: 101, Fallback percentage: 43.53% +Iteration 3: swpout inc: 71, swpout fallback inc: 155, Fallback percentage: 68.58% +Iteration 4: swpout inc: 55, swpout fallback inc: 168, Fallback percentage: 75.34% +Iteration 5: swpout inc: 35, swpout fallback inc: 191, Fallback percentage: 84.51% +Iteration 6: swpout inc: 25, swpout fallback inc: 199, Fallback percentage: 88.84% +Iteration 7: swpout inc: 23, swpout fallback inc: 205, Fallback percentage: 89.91% +Iteration 8: swpout inc: 9, swpout fallback inc: 219, Fallback percentage: 96.05% +Iteration 9: swpout inc: 13, swpout fallback inc: 213, Fallback percentage: 94.25% +Iteration 10: swpout 
inc: 12, swpout fallback inc: 216, Fallback percentage: 94.74% +Iteration 11: swpout inc: 16, swpout fallback inc: 213, Fallback percentage: 93.01% +Iteration 12: swpout inc: 10, swpout fallback inc: 210, Fallback percentage: 95.45% +Iteration 13: swpout inc: 16, swpout fallback inc: 212, Fallback percentage: 92.98% +Iteration 14: swpout inc: 12, swpout fallback inc: 212, Fallback percentage: 94.64% +Iteration 15: swpout inc: 15, swpout fallback inc: 211, Fallback percentage: 93.36% +Iteration 16: swpout inc: 15, swpout fallback inc: 200, Fallback percentage: 93.02% +Iteration 17: swpout inc: 9, swpout fallback inc: 220, Fallback percentage: 96.07% + +$ ./thp_swap_allocator_test -s + ./thp_swap_allocator_test -s +Iteration 1: swpout inc: 233, swpout fallback inc: 0, Fallback percentage: 0.00% +Iteration 2: swpout inc: 97, swpout fallback inc: 135, Fallback percentage: 58.19% +Iteration 3: swpout inc: 42, swpout fallback inc: 192, Fallback percentage: 82.05% +Iteration 4: swpout inc: 19, swpout fallback inc: 214, Fallback percentage: 91.85% +Iteration 5: swpout inc: 12, swpout fallback inc: 213, Fallback percentage: 94.67% +Iteration 6: swpout inc: 11, swpout fallback inc: 217, Fallback percentage: 95.18% +Iteration 7: swpout inc: 9, swpout fallback inc: 214, Fallback percentage: 95.96% +Iteration 8: swpout inc: 8, swpout fallback inc: 213, Fallback percentage: 96.38% +Iteration 9: swpout inc: 2, swpout fallback inc: 223, Fallback percentage: 99.11% +Iteration 10: swpout inc: 2, swpout fallback inc: 228, Fallback percentage: 99.13% +Iteration 11: swpout inc: 4, swpout fallback inc: 214, Fallback percentage: 98.17% +Iteration 12: swpout inc: 5, swpout fallback inc: 226, Fallback percentage: 97.84% +Iteration 13: swpout inc: 3, swpout fallback inc: 212, Fallback percentage: 98.60% +Iteration 14: swpout inc: 0, swpout fallback inc: 222, Fallback percentage: 100.00% +Iteration 15: swpout inc: 3, swpout fallback inc: 222, Fallback percentage: 98.67% +Iteration 16: swpout inc: 4, swpout fallback inc: 223, Fallback percentage: 98.24% + +========= +Kernel compile under tmpfs with cgroup memory.max = 470M. +12 core 24 hyperthreading, 32 jobs. 10 Run each group + +SSD swap 10 runs average, 20G swap partition: +With: +user 2929.064 +system 1479.381 : 1376.89 1398.22 1444.64 1477.39 1479.04 1497.27 +1504.47 1531.4 1532.92 1551.57 +real 1441.324 + +Without: +user 2910.872 +system 1482.732 : 1440.01 1451.4 1462.01 1467.47 1467.51 1469.3 +1470.19 1496.32 1544.1 1559.01 +real 1580.822 + +Two zram swap: zram0 3.0G zram1 20G. + +The idea is forcing the zram0 almost full then overflow to zram1: + +With: +user 4320.301 +system 4272.403 : 4236.24 4262.81 4264.75 4269.13 4269.44 4273.06 +4279.85 4285.98 4289.64 4293.13 +real 431.759 + +Without +user 4301.393 +system 4387.672 : 4374.47 4378.3 4380.95 4382.84 4383.06 4388.05 +4389.76 4397.16 4398.23 4403.9 +real 433.979 + +------ more test result from Kaiui ---------- + +Test with build linux kernel using a 4G ZRAM, 1G memory.max limit on top of shmem: + +System info: 32 Core AMD Zen2, 64G total memory. 
+ +Test 3 times using only 4K pages: +================================= + +With: +----- +1838.74user 2411.21system 2:37.86elapsed 2692%CPU (0avgtext+0avgdata 847060maxresident)k +1839.86user 2465.77system 2:39.35elapsed 2701%CPU (0avgtext+0avgdata 847060maxresident)k +1840.26user 2454.68system 2:39.43elapsed 2693%CPU (0avgtext+0avgdata 847060maxresident)k + +Summary (~4.6% improment of system time): +User: 1839.62 +System: 2443.89: 2465.77 2454.68 2411.21 +Real: 158.88 + +Without: +-------- +1837.99user 2575.95system 2:43.09elapsed 2706%CPU (0avgtext+0avgdata 846520maxresident)k +1838.32user 2555.15system 2:42.52elapsed 2709%CPU (0avgtext+0avgdata 846520maxresident)k +1843.02user 2561.55system 2:43.35elapsed 2702%CPU (0avgtext+0avgdata 846520maxresident)k + +Summary: +User: 1839.78 +System: 2564.22: 2575.95 2555.15 2561.55 +Real: 162.99 + +Test 5 times using enabled all mTHP pages: +========================================== + +With: +----- +1796.44user 2937.33system 2:59.09elapsed 2643%CPU (0avgtext+0avgdata 846936maxresident)k +1802.55user 3002.32system 2:54.68elapsed 2750%CPU (0avgtext+0avgdata 847072maxresident)k +1806.59user 2986.53system 2:55.17elapsed 2736%CPU (0avgtext+0avgdata 847092maxresident)k +1803.27user 2982.40system 2:54.49elapsed 2742%CPU (0avgtext+0avgdata 846796maxresident)k +1807.43user 3036.08system 2:56.06elapsed 2751%CPU (0avgtext+0avgdata 846488maxresident)k + +Summary (~8.4% improvement of system time): +User: 1803.25 +System: 2988.93: 2937.33 3002.32 2986.53 2982.40 3036.08 +Real: 175.90 + +mTHP swapout status: +/sys/kernel/mm/transparent_hugepage/hugepages-32kB/stats/swpout:347721 +/sys/kernel/mm/transparent_hugepage/hugepages-32kB/stats/swpout_fallback:3110 +/sys/kernel/mm/transparent_hugepage/hugepages-512kB/stats/swpout:3365 +/sys/kernel/mm/transparent_hugepage/hugepages-512kB/stats/swpout_fallback:8269 +/sys/kernel/mm/transparent_hugepage/hugepages-2048kB/stats/swpout:24 +/sys/kernel/mm/transparent_hugepage/hugepages-2048kB/stats/swpout_fallback:3341 +/sys/kernel/mm/transparent_hugepage/hugepages-1024kB/stats/swpout:145 +/sys/kernel/mm/transparent_hugepage/hugepages-1024kB/stats/swpout_fallback:5038 +/sys/kernel/mm/transparent_hugepage/hugepages-64kB/stats/swpout:322737 +/sys/kernel/mm/transparent_hugepage/hugepages-64kB/stats/swpout_fallback:36808 +/sys/kernel/mm/transparent_hugepage/hugepages-16kB/stats/swpout:380455 +/sys/kernel/mm/transparent_hugepage/hugepages-16kB/stats/swpout_fallback:1010 +/sys/kernel/mm/transparent_hugepage/hugepages-256kB/stats/swpout:24973 +/sys/kernel/mm/transparent_hugepage/hugepages-256kB/stats/swpout_fallback:13223 +/sys/kernel/mm/transparent_hugepage/hugepages-128kB/stats/swpout:197348 +/sys/kernel/mm/transparent_hugepage/hugepages-128kB/stats/swpout_fallback:80541 + +Without: +-------- +1794.41user 3151.29system 3:05.97elapsed 2659%CPU (0avgtext+0avgdata 846704maxresident)k +1810.27user 3304.48system 3:05.38elapsed 2759%CPU (0avgtext+0avgdata 846636maxresident)k +1809.84user 3254.85system 3:03.83elapsed 2755%CPU (0avgtext+0avgdata 846952maxresident)k +1813.54user 3259.56system 3:04.28elapsed 2752%CPU (0avgtext+0avgdata 846848maxresident)k +1829.97user 3338.40system 3:07.32elapsed 2759%CPU (0avgtext+0avgdata 847024maxresident)k + +Summary: +User: 1811.61 +System: 3261.72 : 3151.29 3304.48 3254.85 3259.56 3338.40 +Real: 185.356 + +mTHP swapout status: +hugepages-32kB/stats/swpout:35630 +hugepages-32kB/stats/swpout_fallback:1809908 +hugepages-512kB/stats/swpout:523 +hugepages-512kB/stats/swpout_fallback:55235 
+hugepages-2048kB/stats/swpout:53 +hugepages-2048kB/stats/swpout_fallback:17264 +hugepages-1024kB/stats/swpout:85 +hugepages-1024kB/stats/swpout_fallback:24979 +hugepages-64kB/stats/swpout:30117 +hugepages-64kB/stats/swpout_fallback:1825399 +hugepages-16kB/stats/swpout:42775 +hugepages-16kB/stats/swpout_fallback:1951123 +hugepages-256kB/stats/swpout:2326 +hugepages-256kB/stats/swpout_fallback:170165 +hugepages-128kB/stats/swpout:17925 +hugepages-128kB/stats/swpout_fallback:1309757 + +This patch (of 9): + +Previously, the swap cluster used a cluster index as a pointer to +construct a custom single link list type "swap_cluster_list". The next +cluster pointer is shared with the cluster->count. It prevents puting the +non free cluster into a list. + +Change the cluster to use the standard double link list instead. This +allows tracing the nonfull cluster in the follow up patch. That way, it +is faster to get to the nonfull cluster of that order. + +Remove the cluster getter/setter for accessing the cluster struct member. + +The list operation is protected by the swap_info_struct->lock. + +Change cluster code to use "struct swap_cluster_info *" to reference the +cluster rather than by using index. That is more consistent with the list +manipulation. It avoids the repeat adding index to the cluser_info. The +code is easier to understand. + +Remove the cluster next pointer is NULL flag, the double link list can +handle the empty list pretty well. + +The "swap_cluster_info" struct is two pointer bigger, because 512 swap +entries share one swap_cluster_info struct, it has very little impact on +the average memory usage per swap entry. For 1TB swapfile, the swap +cluster data structure increases from 8MB to 24MB. + +Other than the list conversion, there is no real function change in this +patch. + +Link: https://lkml.kernel.org/r/20240730-swap-allocator-v5-0-cb9c148b9297@kernel.org +Link: https://lkml.kernel.org/r/20240730-swap-allocator-v5-1-cb9c148b9297@kernel.org +Signed-off-by: Chris Li +Reported-by: Barry Song <21cnbao@gmail.com> +Reviewed-by: "Huang, Ying" +Cc: Hugh Dickins +Cc: Kairui Song +Cc: Kalesh Singh +Cc: Ryan Roberts +Signed-off-by: Andrew Morton +Conflicts: + include/linux/swap.h +Signed-off-by: Liu Shixin +--- + include/linux/swap.h | 25 ++--- + mm/swapfile.c | 226 ++++++++++++------------------------------- + 2 files changed, 71 insertions(+), 180 deletions(-) + +diff --git a/include/linux/swap.h b/include/linux/swap.h +index bea0c0f1f640..94e1b6bb04ce 100644 +--- a/include/linux/swap.h ++++ b/include/linux/swap.h +@@ -255,22 +255,20 @@ enum { + * free clusters are organized into a list. We fetch an entry from the list to + * get a free cluster. + * +- * The data field stores next cluster if the cluster is free or cluster usage +- * counter otherwise. The flags field determines if a cluster is free. This is +- * protected by swap_info_struct.lock. ++ * The flags field determines if a cluster is free. This is ++ * protected by cluster lock. + */ + struct swap_cluster_info { + spinlock_t lock; /* + * Protect swap_cluster_info fields +- * and swap_info_struct->swap_map +- * elements correspond to the swap +- * cluster ++ * other than list, and swap_info_struct->swap_map ++ * elements corresponding to the swap cluster. 
+ */ +- unsigned int data:24; +- unsigned int flags:8; ++ u16 count; ++ u8 flags; ++ struct list_head list; + }; + #define CLUSTER_FLAG_FREE 1 /* This cluster is free */ +-#define CLUSTER_FLAG_NEXT_NULL 2 /* This cluster has no next cluster */ + + /* + * The first page in the swap file is the swap header, which is always marked +@@ -295,11 +293,6 @@ struct percpu_cluster { + unsigned int next[SWAP_NR_ORDERS]; /* Likely next allocation offset */ + }; + +-struct swap_cluster_list { +- struct swap_cluster_info head; +- struct swap_cluster_info tail; +-}; +- + /* + * The in-memory structure used to track swap areas. + */ +@@ -312,7 +305,7 @@ struct swap_info_struct { + unsigned int max; /* extent of the swap_map */ + unsigned char *swap_map; /* vmalloc'ed array of usage counts */ + struct swap_cluster_info *cluster_info; /* cluster info. Only for SSD */ +- struct swap_cluster_list free_clusters; /* free clusters list */ ++ struct list_head free_clusters; /* free clusters list */ + unsigned int lowest_bit; /* index of first free in swap_map */ + unsigned int highest_bit; /* index of last free in swap_map */ + unsigned int pages; /* total of usable pages of swap */ +@@ -345,7 +338,7 @@ struct swap_info_struct { + * list. + */ + struct work_struct discard_work; /* discard worker */ +- struct swap_cluster_list discard_clusters; /* discard clusters list */ ++ struct list_head discard_clusters; /* discard clusters list */ + KABI_RESERVE(1) + KABI_RESERVE(2) + KABI_RESERVE(3) +diff --git a/mm/swapfile.c b/mm/swapfile.c +index 30832b85d6c2..76b344438606 100644 +--- a/mm/swapfile.c ++++ b/mm/swapfile.c +@@ -289,62 +289,15 @@ static void discard_swap_cluster(struct swap_info_struct *si, + #endif + #define LATENCY_LIMIT 256 + +-static inline void cluster_set_flag(struct swap_cluster_info *info, +- unsigned int flag) +-{ +- info->flags = flag; +-} +- +-static inline unsigned int cluster_count(struct swap_cluster_info *info) +-{ +- return info->data; +-} +- +-static inline void cluster_set_count(struct swap_cluster_info *info, +- unsigned int c) +-{ +- info->data = c; +-} +- +-static inline void cluster_set_count_flag(struct swap_cluster_info *info, +- unsigned int c, unsigned int f) +-{ +- info->flags = f; +- info->data = c; +-} +- +-static inline unsigned int cluster_next(struct swap_cluster_info *info) +-{ +- return info->data; +-} +- +-static inline void cluster_set_next(struct swap_cluster_info *info, +- unsigned int n) +-{ +- info->data = n; +-} +- +-static inline void cluster_set_next_flag(struct swap_cluster_info *info, +- unsigned int n, unsigned int f) +-{ +- info->flags = f; +- info->data = n; +-} +- + static inline bool cluster_is_free(struct swap_cluster_info *info) + { + return info->flags & CLUSTER_FLAG_FREE; + } + +-static inline bool cluster_is_null(struct swap_cluster_info *info) +-{ +- return info->flags & CLUSTER_FLAG_NEXT_NULL; +-} +- +-static inline void cluster_set_null(struct swap_cluster_info *info) ++static inline unsigned int cluster_index(struct swap_info_struct *si, ++ struct swap_cluster_info *ci) + { +- info->flags = CLUSTER_FLAG_NEXT_NULL; +- info->data = 0; ++ return ci - si->cluster_info; + } + + static inline struct swap_cluster_info *lock_cluster(struct swap_info_struct *si, +@@ -393,65 +346,11 @@ static inline void unlock_cluster_or_swap_info(struct swap_info_struct *si, + spin_unlock(&si->lock); + } + +-static inline bool cluster_list_empty(struct swap_cluster_list *list) +-{ +- return cluster_is_null(&list->head); +-} +- +-static inline unsigned int 
cluster_list_first(struct swap_cluster_list *list) +-{ +- return cluster_next(&list->head); +-} +- +-static void cluster_list_init(struct swap_cluster_list *list) +-{ +- cluster_set_null(&list->head); +- cluster_set_null(&list->tail); +-} +- +-static void cluster_list_add_tail(struct swap_cluster_list *list, +- struct swap_cluster_info *ci, +- unsigned int idx) +-{ +- if (cluster_list_empty(list)) { +- cluster_set_next_flag(&list->head, idx, 0); +- cluster_set_next_flag(&list->tail, idx, 0); +- } else { +- struct swap_cluster_info *ci_tail; +- unsigned int tail = cluster_next(&list->tail); +- +- /* +- * Nested cluster lock, but both cluster locks are +- * only acquired when we held swap_info_struct->lock +- */ +- ci_tail = ci + tail; +- spin_lock_nested(&ci_tail->lock, SINGLE_DEPTH_NESTING); +- cluster_set_next(ci_tail, idx); +- spin_unlock(&ci_tail->lock); +- cluster_set_next_flag(&list->tail, idx, 0); +- } +-} +- +-static unsigned int cluster_list_del_first(struct swap_cluster_list *list, +- struct swap_cluster_info *ci) +-{ +- unsigned int idx; +- +- idx = cluster_next(&list->head); +- if (cluster_next(&list->tail) == idx) { +- cluster_set_null(&list->head); +- cluster_set_null(&list->tail); +- } else +- cluster_set_next_flag(&list->head, +- cluster_next(&ci[idx]), 0); +- +- return idx; +-} +- + /* Add a cluster to discard list and schedule it to do discard */ + static void swap_cluster_schedule_discard(struct swap_info_struct *si, +- unsigned int idx) ++ struct swap_cluster_info *ci) + { ++ unsigned int idx = cluster_index(si, ci); + /* + * If scan_swap_map_slots() can't find a free cluster, it will check + * si->swap_map directly. To make sure the discarding cluster isn't +@@ -461,17 +360,14 @@ static void swap_cluster_schedule_discard(struct swap_info_struct *si, + memset(si->swap_map + idx * SWAPFILE_CLUSTER, + SWAP_MAP_BAD, SWAPFILE_CLUSTER); + +- cluster_list_add_tail(&si->discard_clusters, si->cluster_info, idx); +- ++ list_add_tail(&ci->list, &si->discard_clusters); + schedule_work(&si->discard_work); + } + +-static void __free_cluster(struct swap_info_struct *si, unsigned long idx) ++static void __free_cluster(struct swap_info_struct *si, struct swap_cluster_info *ci) + { +- struct swap_cluster_info *ci = si->cluster_info; +- +- cluster_set_flag(ci + idx, CLUSTER_FLAG_FREE); +- cluster_list_add_tail(&si->free_clusters, ci, idx); ++ ci->flags = CLUSTER_FLAG_FREE; ++ list_add_tail(&ci->list, &si->free_clusters); + } + + /* +@@ -480,24 +376,25 @@ static void __free_cluster(struct swap_info_struct *si, unsigned long idx) + */ + static void swap_do_scheduled_discard(struct swap_info_struct *si) + { +- struct swap_cluster_info *info, *ci; ++ struct swap_cluster_info *ci; + unsigned int idx; + +- info = si->cluster_info; +- +- while (!cluster_list_empty(&si->discard_clusters)) { +- idx = cluster_list_del_first(&si->discard_clusters, info); ++ while (!list_empty(&si->discard_clusters)) { ++ ci = list_first_entry(&si->discard_clusters, struct swap_cluster_info, list); ++ list_del(&ci->list); ++ idx = cluster_index(si, ci); + spin_unlock(&si->lock); + + discard_swap_cluster(si, idx * SWAPFILE_CLUSTER, + SWAPFILE_CLUSTER); + + spin_lock(&si->lock); +- ci = lock_cluster(si, idx * SWAPFILE_CLUSTER); +- __free_cluster(si, idx); ++ ++ spin_lock(&ci->lock); ++ __free_cluster(si, ci); + memset(si->swap_map + idx * SWAPFILE_CLUSTER, + 0, SWAPFILE_CLUSTER); +- unlock_cluster(ci); ++ spin_unlock(&ci->lock); + } + } + +@@ -520,20 +417,21 @@ static void swap_users_ref_free(struct percpu_ref *ref) + 
complete(&si->comp); + } + +-static void alloc_cluster(struct swap_info_struct *si, unsigned long idx) ++static struct swap_cluster_info *alloc_cluster(struct swap_info_struct *si, unsigned long idx) + { +- struct swap_cluster_info *ci = si->cluster_info; ++ struct swap_cluster_info *ci = list_first_entry(&si->free_clusters, ++ struct swap_cluster_info, list); + +- VM_BUG_ON(cluster_list_first(&si->free_clusters) != idx); +- cluster_list_del_first(&si->free_clusters, ci); +- cluster_set_count_flag(ci + idx, 0, 0); ++ VM_BUG_ON(cluster_index(si, ci) != idx); ++ list_del(&ci->list); ++ ci->count = 0; ++ ci->flags = 0; ++ return ci; + } + +-static void free_cluster(struct swap_info_struct *si, unsigned long idx) ++static void free_cluster(struct swap_info_struct *si, struct swap_cluster_info *ci) + { +- struct swap_cluster_info *ci = si->cluster_info + idx; +- +- VM_BUG_ON(cluster_count(ci) != 0); ++ VM_BUG_ON(ci->count != 0); + /* + * If the swap is discardable, prepare discard the cluster + * instead of free it immediately. The cluster will be freed +@@ -541,11 +439,11 @@ static void free_cluster(struct swap_info_struct *si, unsigned long idx) + */ + if ((si->flags & (SWP_WRITEOK | SWP_PAGE_DISCARD)) == + (SWP_WRITEOK | SWP_PAGE_DISCARD)) { +- swap_cluster_schedule_discard(si, idx); ++ swap_cluster_schedule_discard(si, ci); + return; + } + +- __free_cluster(si, idx); ++ __free_cluster(si, ci); + } + + /* +@@ -558,15 +456,15 @@ static void add_cluster_info_page(struct swap_info_struct *p, + unsigned long count) + { + unsigned long idx = page_nr / SWAPFILE_CLUSTER; ++ struct swap_cluster_info *ci = cluster_info + idx; + + if (!cluster_info) + return; +- if (cluster_is_free(&cluster_info[idx])) ++ if (cluster_is_free(ci)) + alloc_cluster(p, idx); + +- VM_BUG_ON(cluster_count(&cluster_info[idx]) + count > SWAPFILE_CLUSTER); +- cluster_set_count(&cluster_info[idx], +- cluster_count(&cluster_info[idx]) + count); ++ VM_BUG_ON(ci->count + count > SWAPFILE_CLUSTER); ++ ci->count += count; + } + + /* +@@ -580,24 +478,20 @@ static void inc_cluster_info_page(struct swap_info_struct *p, + } + + /* +- * The cluster corresponding to page_nr decreases one usage. If the usage +- * counter becomes 0, which means no page in the cluster is in using, we can +- * optionally discard the cluster and add it to free cluster list. ++ * The cluster ci decreases one usage. If the usage counter becomes 0, ++ * which means no page in the cluster is in use, we can optionally discard ++ * the cluster and add it to free cluster list. 
+ */ +-static void dec_cluster_info_page(struct swap_info_struct *p, +- struct swap_cluster_info *cluster_info, unsigned long page_nr) ++static void dec_cluster_info_page(struct swap_info_struct *p, struct swap_cluster_info *ci) + { +- unsigned long idx = page_nr / SWAPFILE_CLUSTER; +- +- if (!cluster_info) ++ if (!p->cluster_info) + return; + +- VM_BUG_ON(cluster_count(&cluster_info[idx]) == 0); +- cluster_set_count(&cluster_info[idx], +- cluster_count(&cluster_info[idx]) - 1); ++ VM_BUG_ON(ci->count == 0); ++ ci->count--; + +- if (cluster_count(&cluster_info[idx]) == 0) +- free_cluster(p, idx); ++ if (!ci->count) ++ free_cluster(p, ci); + } + + /* +@@ -610,10 +504,12 @@ scan_swap_map_ssd_cluster_conflict(struct swap_info_struct *si, + { + struct percpu_cluster *percpu_cluster; + bool conflict; ++ struct swap_cluster_info *first = list_first_entry(&si->free_clusters, ++ struct swap_cluster_info, list); + + offset /= SWAPFILE_CLUSTER; +- conflict = !cluster_list_empty(&si->free_clusters) && +- offset != cluster_list_first(&si->free_clusters) && ++ conflict = !list_empty(&si->free_clusters) && ++ offset != cluster_index(si, first) && + cluster_is_free(&si->cluster_info[offset]); + + if (!conflict) +@@ -654,10 +550,10 @@ static bool scan_swap_map_try_ssd_cluster(struct swap_info_struct *si, + cluster = this_cpu_ptr(si->percpu_cluster); + tmp = cluster->next[order]; + if (tmp == SWAP_NEXT_INVALID) { +- if (!cluster_list_empty(&si->free_clusters)) { +- tmp = cluster_next(&si->free_clusters.head) * +- SWAPFILE_CLUSTER; +- } else if (!cluster_list_empty(&si->discard_clusters)) { ++ if (!list_empty(&si->free_clusters)) { ++ ci = list_first_entry(&si->free_clusters, struct swap_cluster_info, list); ++ tmp = cluster_index(si, ci) * SWAPFILE_CLUSTER; ++ } else if (!list_empty(&si->discard_clusters)) { + /* + * we don't have free cluster but have some clusters in + * discarding, do discard now and reclaim them, then +@@ -1055,8 +951,9 @@ static void swap_free_cluster(struct swap_info_struct *si, unsigned long idx) + + ci = lock_cluster(si, offset); + memset(si->swap_map + offset, 0, SWAPFILE_CLUSTER); +- cluster_set_count_flag(ci, 0, 0); +- free_cluster(si, idx); ++ ci->count = 0; ++ ci->flags = 0; ++ free_cluster(si, ci); + unlock_cluster(ci); + swap_range_free(si, offset, SWAPFILE_CLUSTER); + } +@@ -1418,7 +1315,7 @@ static void swap_entry_free(struct swap_info_struct *p, swp_entry_t entry) + count = p->swap_map[offset]; + VM_BUG_ON(count != SWAP_HAS_CACHE); + p->swap_map[offset] = 0; +- dec_cluster_info_page(p, p->cluster_info, offset); ++ dec_cluster_info_page(p, ci); + unlock_cluster(ci); + + mem_cgroup_uncharge_swap(entry, 1); +@@ -3113,8 +3010,8 @@ static int setup_swap_map_and_extents(struct swap_info_struct *p, + + nr_good_pages = maxpages - 1; /* omit header page */ + +- cluster_list_init(&p->free_clusters); +- cluster_list_init(&p->discard_clusters); ++ INIT_LIST_HEAD(&p->free_clusters); ++ INIT_LIST_HEAD(&p->discard_clusters); + + for (i = 0; i < swap_header->info.nr_badpages; i++) { + unsigned int page_nr = swap_header->info.badpages[i]; +@@ -3165,14 +3062,15 @@ static int setup_swap_map_and_extents(struct swap_info_struct *p, + for (k = 0; k < SWAP_CLUSTER_COLS; k++) { + j = (k + col) % SWAP_CLUSTER_COLS; + for (i = 0; i < DIV_ROUND_UP(nr_clusters, SWAP_CLUSTER_COLS); i++) { ++ struct swap_cluster_info *ci; + idx = i * SWAP_CLUSTER_COLS + j; ++ ci = cluster_info + idx; + if (idx >= nr_clusters) + continue; +- if (cluster_count(&cluster_info[idx])) ++ if (ci->count) + continue; +- 
cluster_set_flag(&cluster_info[idx], CLUSTER_FLAG_FREE); +- cluster_list_add_tail(&p->free_clusters, cluster_info, +- idx); ++ ci->flags = CLUSTER_FLAG_FREE; ++ list_add_tail(&ci->list, &p->free_clusters); + } + } + return nr_extents; +-- +Gitee + + +From 3bc5a5e67c63e14fe1342ed16ecb304cf60d94b3 Mon Sep 17 00:00:00 2001 +From: Chris Li +Date: Wed, 18 Dec 2024 17:51:08 +0800 +Subject: [PATCH 03/14] mm: swap: mTHP allocate swap entries from nonfull list +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +mainline inclusion +from mainline-v6.12-rc1 +commit d07a46a4ac18786e7f4c98fb08525ed80dd1f642 +category: performance +bugzilla: https://gitee.com/openeuler/kernel/issues/IBC5I1 + +Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=d07a46a4ac18786e7f4c98fb08525ed80dd1f642 + +-------------------------------- + +Track the nonfull cluster as well as the empty cluster on lists. Each +order has one nonfull cluster list. + +The cluster will remember which order it was used during new cluster +allocation. + +When the cluster has free entry, add to the nonfull[order] list.  When +the free cluster list is empty, also allocate from the nonempty list of +that order. + +This improves the mTHP swap allocation success rate. + +There are limitations if the distribution of numbers of different orders +of mTHP changes a lot. e.g. there are a lot of nonfull cluster assign to +order A while later time there are a lot of order B allocation while very +little allocation in order A. Currently the cluster used by order A will +not reused by order B unless the cluster is 100% empty. + +Link: https://lkml.kernel.org/r/20240730-swap-allocator-v5-2-cb9c148b9297@kernel.org +Signed-off-by: Chris Li +Reported-by: Barry Song <21cnbao@gmail.com> +Cc: "Huang, Ying" +Cc: Hugh Dickins +Cc: Kairui Song +Cc: Kalesh Singh +Cc: Ryan Roberts +Signed-off-by: Andrew Morton +Signed-off-by: Liu Shixin +--- + include/linux/swap.h | 4 ++++ + mm/swapfile.c | 38 +++++++++++++++++++++++++++++++++++--- + 2 files changed, 39 insertions(+), 3 deletions(-) + +diff --git a/include/linux/swap.h b/include/linux/swap.h +index 94e1b6bb04ce..29a1daa46421 100644 +--- a/include/linux/swap.h ++++ b/include/linux/swap.h +@@ -266,9 +266,11 @@ struct swap_cluster_info { + */ + u16 count; + u8 flags; ++ u8 order; + struct list_head list; + }; + #define CLUSTER_FLAG_FREE 1 /* This cluster is free */ ++#define CLUSTER_FLAG_NONFULL 2 /* This cluster is on nonfull list */ + + /* + * The first page in the swap file is the swap header, which is always marked +@@ -306,6 +308,8 @@ struct swap_info_struct { + unsigned char *swap_map; /* vmalloc'ed array of usage counts */ + struct swap_cluster_info *cluster_info; /* cluster info. 
Only for SSD */ + struct list_head free_clusters; /* free clusters list */ ++ struct list_head nonfull_clusters[SWAP_NR_ORDERS]; ++ /* list of cluster that contains at least one free slot */ + unsigned int lowest_bit; /* index of first free in swap_map */ + unsigned int highest_bit; /* index of last free in swap_map */ + unsigned int pages; /* total of usable pages of swap */ +diff --git a/mm/swapfile.c b/mm/swapfile.c +index 76b344438606..adde6877c0fe 100644 +--- a/mm/swapfile.c ++++ b/mm/swapfile.c +@@ -360,14 +360,22 @@ static void swap_cluster_schedule_discard(struct swap_info_struct *si, + memset(si->swap_map + idx * SWAPFILE_CLUSTER, + SWAP_MAP_BAD, SWAPFILE_CLUSTER); + +- list_add_tail(&ci->list, &si->discard_clusters); ++ VM_BUG_ON(ci->flags & CLUSTER_FLAG_FREE); ++ if (ci->flags & CLUSTER_FLAG_NONFULL) ++ list_move_tail(&ci->list, &si->discard_clusters); ++ else ++ list_add_tail(&ci->list, &si->discard_clusters); ++ ci->flags = 0; + schedule_work(&si->discard_work); + } + + static void __free_cluster(struct swap_info_struct *si, struct swap_cluster_info *ci) + { ++ if (ci->flags & CLUSTER_FLAG_NONFULL) ++ list_move_tail(&ci->list, &si->free_clusters); ++ else ++ list_add_tail(&ci->list, &si->free_clusters); + ci->flags = CLUSTER_FLAG_FREE; +- list_add_tail(&ci->list, &si->free_clusters); + } + + /* +@@ -490,8 +498,15 @@ static void dec_cluster_info_page(struct swap_info_struct *p, struct swap_cluste + VM_BUG_ON(ci->count == 0); + ci->count--; + +- if (!ci->count) ++ if (!ci->count) { + free_cluster(p, ci); ++ return; ++ } ++ ++ if (!(ci->flags & CLUSTER_FLAG_NONFULL)) { ++ list_add_tail(&ci->list, &p->nonfull_clusters[ci->order]); ++ ci->flags |= CLUSTER_FLAG_NONFULL; ++ } + } + + /* +@@ -552,6 +567,19 @@ static bool scan_swap_map_try_ssd_cluster(struct swap_info_struct *si, + if (tmp == SWAP_NEXT_INVALID) { + if (!list_empty(&si->free_clusters)) { + ci = list_first_entry(&si->free_clusters, struct swap_cluster_info, list); ++ list_del(&ci->list); ++ spin_lock(&ci->lock); ++ ci->order = order; ++ ci->flags = 0; ++ spin_unlock(&ci->lock); ++ tmp = cluster_index(si, ci) * SWAPFILE_CLUSTER; ++ } else if (!list_empty(&si->nonfull_clusters[order])) { ++ ci = list_first_entry(&si->nonfull_clusters[order], ++ struct swap_cluster_info, list); ++ list_del(&ci->list); ++ spin_lock(&ci->lock); ++ ci->flags = 0; ++ spin_unlock(&ci->lock); + tmp = cluster_index(si, ci) * SWAPFILE_CLUSTER; + } else if (!list_empty(&si->discard_clusters)) { + /* +@@ -952,6 +980,7 @@ static void swap_free_cluster(struct swap_info_struct *si, unsigned long idx) + ci = lock_cluster(si, offset); + memset(si->swap_map + offset, 0, SWAPFILE_CLUSTER); + ci->count = 0; ++ ci->order = 0; + ci->flags = 0; + free_cluster(si, ci); + unlock_cluster(ci); +@@ -3013,6 +3042,9 @@ static int setup_swap_map_and_extents(struct swap_info_struct *p, + INIT_LIST_HEAD(&p->free_clusters); + INIT_LIST_HEAD(&p->discard_clusters); + ++ for (i = 0; i < SWAP_NR_ORDERS; i++) ++ INIT_LIST_HEAD(&p->nonfull_clusters[i]); ++ + for (i = 0; i < swap_header->info.nr_badpages; i++) { + unsigned int page_nr = swap_header->info.badpages[i]; + if (page_nr == 0 || page_nr > swap_header->info.last_page) +-- +Gitee + + +From 71c1b6bdf4681e292a269a16337b6fbf64c388d6 Mon Sep 17 00:00:00 2001 +From: Chris Li +Date: Wed, 18 Dec 2024 17:51:09 +0800 +Subject: [PATCH 04/14] mm: swap: separate SSD allocation from + scan_swap_map_slots() + +mainline inclusion +from mainline-v6.12-rc1 +commit 5f843a9a3a1e865fbf349419bde39977c2e7d3d1 +category: performance +bugzilla: 
https://gitee.com/openeuler/kernel/issues/IBC5I1 + +Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=5f843a9a3a1e865fbf349419bde39977c2e7d3d1 + +-------------------------------- + +Previously the SSD and HDD share the same swap_map scan loop in +scan_swap_map_slots(). This function is complex and hard to flow the +execution flow. + +scan_swap_map_try_ssd_cluster() can already do most of the heavy lifting +to locate the candidate swap range in the cluster. However it needs to go +back to scan_swap_map_slots() to check conflict and then perform the +allocation. + +When scan_swap_map_try_ssd_cluster() failed, it still depended on the +scan_swap_map_slots() to do brute force scanning of the swap_map. When +the swapfile is large and almost full, it will take some CPU time to go +through the swap_map array. + +Get rid of the cluster allocation dependency on the swap_map scan loop in +scan_swap_map_slots(). Streamline the cluster allocation code path. No +more conflict checks. + +For order 0 swap entry, when run out of free and nonfull list. It will +allocate from the higher order nonfull cluster list. + +Users should see less CPU time spent on searching the free swap slot when +swapfile is almost full. + +[ryncsn@gmail.com: fix array-bounds error with CONFIG_THP_SWAP=n] + Link: https://lkml.kernel.org/r/CAMgjq7Bz0DY+rY0XgCoH7-Q=uHLdo3omi8kUr4ePDweNyofsbQ@mail.gmail.com +Link: https://lkml.kernel.org/r/20240730-swap-allocator-v5-3-cb9c148b9297@kernel.org +Signed-off-by: Chris Li +Signed-off-by: Kairui Song +Reported-by: Barry Song <21cnbao@gmail.com> +Cc: "Huang, Ying" +Cc: Hugh Dickins +Cc: Kalesh Singh +Cc: Ryan Roberts +Signed-off-by: Andrew Morton +Signed-off-by: Liu Shixin +--- + mm/swapfile.c | 300 ++++++++++++++++++++++++++++---------------------- + 1 file changed, 168 insertions(+), 132 deletions(-) + +diff --git a/mm/swapfile.c b/mm/swapfile.c +index adde6877c0fe..a3e721510311 100644 +--- a/mm/swapfile.c ++++ b/mm/swapfile.c +@@ -52,6 +52,8 @@ + static bool swap_count_continued(struct swap_info_struct *, pgoff_t, + unsigned char); + static void free_swap_count_continuations(struct swap_info_struct *); ++static void swap_range_alloc(struct swap_info_struct *si, unsigned long offset, ++ unsigned int nr_entries); + + static DEFINE_SPINLOCK(swap_lock); + static unsigned int nr_swapfiles; +@@ -300,6 +302,12 @@ static inline unsigned int cluster_index(struct swap_info_struct *si, + return ci - si->cluster_info; + } + ++static inline unsigned int cluster_offset(struct swap_info_struct *si, ++ struct swap_cluster_info *ci) ++{ ++ return cluster_index(si, ci) * SWAPFILE_CLUSTER; ++} ++ + static inline struct swap_cluster_info *lock_cluster(struct swap_info_struct *si, + unsigned long offset) + { +@@ -371,11 +379,15 @@ static void swap_cluster_schedule_discard(struct swap_info_struct *si, + + static void __free_cluster(struct swap_info_struct *si, struct swap_cluster_info *ci) + { ++ lockdep_assert_held(&si->lock); ++ lockdep_assert_held(&ci->lock); ++ + if (ci->flags & CLUSTER_FLAG_NONFULL) + list_move_tail(&ci->list, &si->free_clusters); + else + list_add_tail(&ci->list, &si->free_clusters); + ci->flags = CLUSTER_FLAG_FREE; ++ ci->order = 0; + } + + /* +@@ -430,9 +442,11 @@ static struct swap_cluster_info *alloc_cluster(struct swap_info_struct *si, unsi + struct swap_cluster_info *ci = list_first_entry(&si->free_clusters, + struct swap_cluster_info, list); + ++ lockdep_assert_held(&si->lock); ++ lockdep_assert_held(&ci->lock); + VM_BUG_ON(cluster_index(si, 
ci) != idx); ++ VM_BUG_ON(ci->count); + list_del(&ci->list); +- ci->count = 0; + ci->flags = 0; + return ci; + } +@@ -440,6 +454,8 @@ static struct swap_cluster_info *alloc_cluster(struct swap_info_struct *si, unsi + static void free_cluster(struct swap_info_struct *si, struct swap_cluster_info *ci) + { + VM_BUG_ON(ci->count != 0); ++ lockdep_assert_held(&si->lock); ++ lockdep_assert_held(&ci->lock); + /* + * If the swap is discardable, prepare discard the cluster + * instead of free it immediately. The cluster will be freed +@@ -496,6 +512,9 @@ static void dec_cluster_info_page(struct swap_info_struct *p, struct swap_cluste + return; + + VM_BUG_ON(ci->count == 0); ++ VM_BUG_ON(cluster_is_free(ci)); ++ lockdep_assert_held(&p->lock); ++ lockdep_assert_held(&ci->lock); + ci->count--; + + if (!ci->count) { +@@ -504,48 +523,88 @@ static void dec_cluster_info_page(struct swap_info_struct *p, struct swap_cluste + } + + if (!(ci->flags & CLUSTER_FLAG_NONFULL)) { ++ VM_BUG_ON(ci->flags & CLUSTER_FLAG_FREE); + list_add_tail(&ci->list, &p->nonfull_clusters[ci->order]); +- ci->flags |= CLUSTER_FLAG_NONFULL; ++ ci->flags = CLUSTER_FLAG_NONFULL; + } + } + +-/* +- * It's possible scan_swap_map_slots() uses a free cluster in the middle of free +- * cluster list. Avoiding such abuse to avoid list corruption. +- */ +-static bool +-scan_swap_map_ssd_cluster_conflict(struct swap_info_struct *si, +- unsigned long offset, int order) ++static inline bool cluster_scan_range(struct swap_info_struct *si, unsigned int start, ++ unsigned int nr_pages) + { +- struct percpu_cluster *percpu_cluster; +- bool conflict; +- struct swap_cluster_info *first = list_first_entry(&si->free_clusters, +- struct swap_cluster_info, list); +- +- offset /= SWAPFILE_CLUSTER; +- conflict = !list_empty(&si->free_clusters) && +- offset != cluster_index(si, first) && +- cluster_is_free(&si->cluster_info[offset]); ++ unsigned char *p = si->swap_map + start; ++ unsigned char *end = p + nr_pages; + +- if (!conflict) +- return false; ++ while (p < end) ++ if (*p++) ++ return false; + +- percpu_cluster = this_cpu_ptr(si->percpu_cluster); +- percpu_cluster->next[order] = SWAP_NEXT_INVALID; + return true; + } + +-static inline bool swap_range_empty(char *swap_map, unsigned int start, +- unsigned int nr_pages) ++ ++static inline void cluster_alloc_range(struct swap_info_struct *si, struct swap_cluster_info *ci, ++ unsigned int start, unsigned char usage, ++ unsigned int order) + { +- unsigned int i; ++ unsigned int nr_pages = 1 << order; + +- for (i = 0; i < nr_pages; i++) { +- if (swap_map[start + i]) +- return false; ++ if (cluster_is_free(ci)) { ++ if (nr_pages < SWAPFILE_CLUSTER) { ++ list_move_tail(&ci->list, &si->nonfull_clusters[order]); ++ ci->flags = CLUSTER_FLAG_NONFULL; ++ } ++ ci->order = order; + } + +- return true; ++ memset(si->swap_map + start, usage, nr_pages); ++ swap_range_alloc(si, start, nr_pages); ++ ci->count += nr_pages; ++ ++ if (ci->count == SWAPFILE_CLUSTER) { ++ VM_BUG_ON(!(ci->flags & (CLUSTER_FLAG_FREE | CLUSTER_FLAG_NONFULL))); ++ list_del(&ci->list); ++ ci->flags = 0; ++ } ++} ++ ++static unsigned int alloc_swap_scan_cluster(struct swap_info_struct *si, unsigned long offset, ++ unsigned int *foundp, unsigned int order, ++ unsigned char usage) ++{ ++ unsigned long start = offset & ~(SWAPFILE_CLUSTER - 1); ++ unsigned long end = min(start + SWAPFILE_CLUSTER, si->max); ++ unsigned int nr_pages = 1 << order; ++ struct swap_cluster_info *ci; ++ ++ if (end < nr_pages) ++ return SWAP_NEXT_INVALID; ++ end -= nr_pages; ++ ++ 
ci = lock_cluster(si, offset); ++ if (ci->count + nr_pages > SWAPFILE_CLUSTER) { ++ offset = SWAP_NEXT_INVALID; ++ goto done; ++ } ++ ++ while (offset <= end) { ++ if (cluster_scan_range(si, offset, nr_pages)) { ++ cluster_alloc_range(si, ci, offset, usage, order); ++ *foundp = offset; ++ if (ci->count == SWAPFILE_CLUSTER) { ++ offset = SWAP_NEXT_INVALID; ++ goto done; ++ } ++ offset += nr_pages; ++ break; ++ } ++ offset += nr_pages; ++ } ++ if (offset > end) ++ offset = SWAP_NEXT_INVALID; ++done: ++ unlock_cluster(ci); ++ return offset; + } + + /* +@@ -553,72 +612,66 @@ static inline bool swap_range_empty(char *swap_map, unsigned int start, + * pool (a cluster). This might involve allocating a new cluster for current CPU + * too. + */ +-static bool scan_swap_map_try_ssd_cluster(struct swap_info_struct *si, +- unsigned long *offset, unsigned long *scan_base, int order) ++static unsigned long cluster_alloc_swap_entry(struct swap_info_struct *si, int order, ++ unsigned char usage) + { +- unsigned int nr_pages = 1 << order; + struct percpu_cluster *cluster; +- struct swap_cluster_info *ci; +- unsigned int tmp, max; ++ struct swap_cluster_info *ci, *n; ++ unsigned int offset, found = 0; + + new_cluster: ++ lockdep_assert_held(&si->lock); + cluster = this_cpu_ptr(si->percpu_cluster); +- tmp = cluster->next[order]; +- if (tmp == SWAP_NEXT_INVALID) { +- if (!list_empty(&si->free_clusters)) { +- ci = list_first_entry(&si->free_clusters, struct swap_cluster_info, list); +- list_del(&ci->list); +- spin_lock(&ci->lock); +- ci->order = order; +- ci->flags = 0; +- spin_unlock(&ci->lock); +- tmp = cluster_index(si, ci) * SWAPFILE_CLUSTER; +- } else if (!list_empty(&si->nonfull_clusters[order])) { +- ci = list_first_entry(&si->nonfull_clusters[order], +- struct swap_cluster_info, list); +- list_del(&ci->list); +- spin_lock(&ci->lock); +- ci->flags = 0; +- spin_unlock(&ci->lock); +- tmp = cluster_index(si, ci) * SWAPFILE_CLUSTER; +- } else if (!list_empty(&si->discard_clusters)) { +- /* +- * we don't have free cluster but have some clusters in +- * discarding, do discard now and reclaim them, then +- * reread cluster_next_cpu since we dropped si->lock +- */ +- swap_do_scheduled_discard(si); +- *scan_base = this_cpu_read(*si->cluster_next_cpu); +- *offset = *scan_base; +- goto new_cluster; +- } else +- return false; ++ offset = cluster->next[order]; ++ if (offset) { ++ offset = alloc_swap_scan_cluster(si, offset, &found, order, usage); ++ if (found) ++ goto done; + } + +- /* +- * Other CPUs can use our cluster if they can't find a free cluster, +- * check if there is still free entry in the cluster, maintaining +- * natural alignment. 
+- */ +- max = min_t(unsigned long, si->max, ALIGN(tmp + 1, SWAPFILE_CLUSTER)); +- if (tmp < max) { +- ci = lock_cluster(si, tmp); +- while (tmp < max) { +- if (swap_range_empty(si->swap_map, tmp, nr_pages)) +- break; +- tmp += nr_pages; ++ if (!list_empty(&si->free_clusters)) { ++ ci = list_first_entry(&si->free_clusters, struct swap_cluster_info, list); ++ offset = alloc_swap_scan_cluster(si, cluster_offset(si, ci), &found, order, usage); ++ VM_BUG_ON(!found); ++ goto done; ++ } ++ ++ if (order < PMD_ORDER) { ++ list_for_each_entry_safe(ci, n, &si->nonfull_clusters[order], list) { ++ offset = alloc_swap_scan_cluster(si, cluster_offset(si, ci), ++ &found, order, usage); ++ if (found) ++ goto done; + } +- unlock_cluster(ci); + } +- if (tmp >= max) { +- cluster->next[order] = SWAP_NEXT_INVALID; ++ ++ if (!list_empty(&si->discard_clusters)) { ++ /* ++ * we don't have free cluster but have some clusters in ++ * discarding, do discard now and reclaim them, then ++ * reread cluster_next_cpu since we dropped si->lock ++ */ ++ swap_do_scheduled_discard(si); + goto new_cluster; + } +- *offset = tmp; +- *scan_base = tmp; +- tmp += nr_pages; +- cluster->next[order] = tmp < max ? tmp : SWAP_NEXT_INVALID; +- return true; ++ ++ if (order) ++ goto done; ++ ++ for (int o = 1; o < SWAP_NR_ORDERS; o++) { ++ if (!list_empty(&si->nonfull_clusters[o])) { ++ ci = list_first_entry(&si->nonfull_clusters[o], struct swap_cluster_info, ++ list); ++ offset = alloc_swap_scan_cluster(si, cluster_offset(si, ci), ++ &found, 0, usage); ++ VM_BUG_ON(!found); ++ goto done; ++ } ++ } ++ ++done: ++ cluster->next[order] = offset; ++ return found; + } + + static void __del_from_avail_list(struct swap_info_struct *p) +@@ -739,11 +792,29 @@ static bool swap_offset_available_and_locked(struct swap_info_struct *si, + return false; + } + ++static int cluster_alloc_swap(struct swap_info_struct *si, ++ unsigned char usage, int nr, ++ swp_entry_t slots[], int order) ++{ ++ int n_ret = 0; ++ ++ VM_BUG_ON(!si->cluster_info); ++ ++ while (n_ret < nr) { ++ unsigned long offset = cluster_alloc_swap_entry(si, order, usage); ++ ++ if (!offset) ++ break; ++ slots[n_ret++] = swp_entry(si->type, offset); ++ } ++ ++ return n_ret; ++} ++ + static int scan_swap_map_slots(struct swap_info_struct *si, + unsigned char usage, int nr, + swp_entry_t slots[], int order) + { +- struct swap_cluster_info *ci; + unsigned long offset; + unsigned long scan_base; + unsigned long last_in_cluster = 0; +@@ -782,26 +853,16 @@ static int scan_swap_map_slots(struct swap_info_struct *si, + return 0; + } + ++ if (si->cluster_info) ++ return cluster_alloc_swap(si, usage, nr, slots, order); ++ + si->flags += SWP_SCANNING; +- /* +- * Use percpu scan base for SSD to reduce lock contention on +- * cluster and swap cache. For HDD, sequential access is more +- * important. +- */ +- if (si->flags & SWP_SOLIDSTATE) +- scan_base = this_cpu_read(*si->cluster_next_cpu); +- else +- scan_base = si->cluster_next; ++ ++ /* For HDD, sequential access is more important. 
*/ ++ scan_base = si->cluster_next; + offset = scan_base; + +- /* SSD algorithm */ +- if (si->cluster_info) { +- if (!scan_swap_map_try_ssd_cluster(si, &offset, &scan_base, order)) { +- if (order > 0) +- goto no_page; +- goto scan; +- } +- } else if (unlikely(!si->cluster_nr--)) { ++ if (unlikely(!si->cluster_nr--)) { + if (si->pages - si->inuse_pages < SWAPFILE_CLUSTER) { + si->cluster_nr = SWAPFILE_CLUSTER - 1; + goto checks; +@@ -812,8 +873,6 @@ static int scan_swap_map_slots(struct swap_info_struct *si, + /* + * If seek is expensive, start searching for new cluster from + * start of partition, to minimize the span of allocated swap. +- * If seek is cheap, that is the SWP_SOLIDSTATE si->cluster_info +- * case, just handled by scan_swap_map_try_ssd_cluster() above. + */ + scan_base = offset = si->lowest_bit; + last_in_cluster = offset + SWAPFILE_CLUSTER - 1; +@@ -841,19 +900,6 @@ static int scan_swap_map_slots(struct swap_info_struct *si, + } + + checks: +- if (si->cluster_info) { +- while (scan_swap_map_ssd_cluster_conflict(si, offset, order)) { +- /* take a break if we already got some slots */ +- if (n_ret) +- goto done; +- if (!scan_swap_map_try_ssd_cluster(si, &offset, +- &scan_base, order)) { +- if (order > 0) +- goto no_page; +- goto scan; +- } +- } +- } + if (!(si->flags & SWP_WRITEOK)) + goto no_page; + if (!si->highest_bit) +@@ -861,11 +907,9 @@ static int scan_swap_map_slots(struct swap_info_struct *si, + if (offset > si->highest_bit) + scan_base = offset = si->lowest_bit; + +- ci = lock_cluster(si, offset); + /* reuse swap entry of cache-only swap if not busy. */ + if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) { + int swap_was_freed; +- unlock_cluster(ci); + spin_unlock(&si->lock); + swap_was_freed = __try_to_reclaim_swap(si, offset, TTRS_ANYWAY); + spin_lock(&si->lock); +@@ -876,15 +920,12 @@ static int scan_swap_map_slots(struct swap_info_struct *si, + } + + if (si->swap_map[offset]) { +- unlock_cluster(ci); + if (!n_ret) + goto scan; + else + goto done; + } + memset(si->swap_map + offset, usage, nr_pages); +- add_cluster_info_page(si, si->cluster_info, offset, nr_pages); +- unlock_cluster(ci); + + swap_range_alloc(si, offset, nr_pages); + slots[n_ret++] = swp_entry(si->type, offset); +@@ -905,13 +946,7 @@ static int scan_swap_map_slots(struct swap_info_struct *si, + latency_ration = LATENCY_LIMIT; + } + +- /* try to get more slots in cluster */ +- if (si->cluster_info) { +- if (scan_swap_map_try_ssd_cluster(si, &offset, &scan_base, order)) +- goto checks; +- if (order > 0) +- goto done; +- } else if (si->cluster_nr && !si->swap_map[++offset]) { ++ if (si->cluster_nr && !si->swap_map[++offset]) { + /* non-ssd case, still more slots in cluster? 
*/ + --si->cluster_nr; + goto checks; +@@ -980,8 +1015,6 @@ static void swap_free_cluster(struct swap_info_struct *si, unsigned long idx) + ci = lock_cluster(si, offset); + memset(si->swap_map + offset, 0, SWAPFILE_CLUSTER); + ci->count = 0; +- ci->order = 0; +- ci->flags = 0; + free_cluster(si, ci); + unlock_cluster(ci); + swap_range_free(si, offset, SWAPFILE_CLUSTER); +@@ -3099,8 +3132,11 @@ static int setup_swap_map_and_extents(struct swap_info_struct *p, + ci = cluster_info + idx; + if (idx >= nr_clusters) + continue; +- if (ci->count) ++ if (ci->count) { ++ ci->flags = CLUSTER_FLAG_NONFULL; ++ list_add_tail(&ci->list, &p->nonfull_clusters[0]); + continue; ++ } + ci->flags = CLUSTER_FLAG_FREE; + list_add_tail(&ci->list, &p->free_clusters); + } +-- +Gitee + + +From 4db67dafd426f7dd2fbde13583c1875a2b242b95 Mon Sep 17 00:00:00 2001 +From: Kairui Song +Date: Wed, 18 Dec 2024 17:51:10 +0800 +Subject: [PATCH 05/14] mm: swap: clean up initialization helper + +mainline inclusion +from mainline-v6.12-rc1 +commit 3b2561b5daeb3531c011491e9a6d2b934cc8f49f +category: performance +bugzilla: https://gitee.com/openeuler/kernel/issues/IBC5I1 + +Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=3b2561b5daeb3531c011491e9a6d2b934cc8f49f + +-------------------------------- + +At this point, alloc_cluster is never called already, and +inc_cluster_info_page is called by initialization only, a lot of dead code +can be dropped. + +Link: https://lkml.kernel.org/r/20240730-swap-allocator-v5-4-cb9c148b9297@kernel.org +Signed-off-by: Kairui Song +Reported-by: Barry Song <21cnbao@gmail.com> +Cc: Chris Li +Cc: "Huang, Ying" +Cc: Hugh Dickins +Cc: Kalesh Singh +Cc: Ryan Roberts +Signed-off-by: Andrew Morton +Signed-off-by: Liu Shixin +--- + mm/swapfile.c | 44 ++++++++++---------------------------------- + 1 file changed, 10 insertions(+), 34 deletions(-) + +diff --git a/mm/swapfile.c b/mm/swapfile.c +index a3e721510311..4be5fbbdc1c8 100644 +--- a/mm/swapfile.c ++++ b/mm/swapfile.c +@@ -437,20 +437,6 @@ static void swap_users_ref_free(struct percpu_ref *ref) + complete(&si->comp); + } + +-static struct swap_cluster_info *alloc_cluster(struct swap_info_struct *si, unsigned long idx) +-{ +- struct swap_cluster_info *ci = list_first_entry(&si->free_clusters, +- struct swap_cluster_info, list); +- +- lockdep_assert_held(&si->lock); +- lockdep_assert_held(&ci->lock); +- VM_BUG_ON(cluster_index(si, ci) != idx); +- VM_BUG_ON(ci->count); +- list_del(&ci->list); +- ci->flags = 0; +- return ci; +-} +- + static void free_cluster(struct swap_info_struct *si, struct swap_cluster_info *ci) + { + VM_BUG_ON(ci->count != 0); +@@ -471,34 +457,24 @@ static void free_cluster(struct swap_info_struct *si, struct swap_cluster_info * + } + + /* +- * The cluster corresponding to page_nr will be used. The cluster will be +- * removed from free cluster list and its usage counter will be increased by +- * count. ++ * The cluster corresponding to page_nr will be used. The cluster will not be ++ * added to free cluster list and its usage counter will be increased by 1. ++ * Only used for initialization. 
+ */ +-static void add_cluster_info_page(struct swap_info_struct *p, +- struct swap_cluster_info *cluster_info, unsigned long page_nr, +- unsigned long count) ++static void inc_cluster_info_page(struct swap_info_struct *p, ++ struct swap_cluster_info *cluster_info, unsigned long page_nr) + { + unsigned long idx = page_nr / SWAPFILE_CLUSTER; +- struct swap_cluster_info *ci = cluster_info + idx; ++ struct swap_cluster_info *ci; + + if (!cluster_info) + return; +- if (cluster_is_free(ci)) +- alloc_cluster(p, idx); + +- VM_BUG_ON(ci->count + count > SWAPFILE_CLUSTER); +- ci->count += count; +-} ++ ci = cluster_info + idx; ++ ci->count++; + +-/* +- * The cluster corresponding to page_nr will be used. The cluster will be +- * removed from free cluster list and its usage counter will be increased by 1. +- */ +-static void inc_cluster_info_page(struct swap_info_struct *p, +- struct swap_cluster_info *cluster_info, unsigned long page_nr) +-{ +- add_cluster_info_page(p, cluster_info, page_nr, 1); ++ VM_BUG_ON(ci->count > SWAPFILE_CLUSTER); ++ VM_BUG_ON(ci->flags); + } + + /* +-- +Gitee + + +From 18f732c19747e766e0632419f32dfb02768ada67 Mon Sep 17 00:00:00 2001 +From: Kairui Song +Date: Wed, 18 Dec 2024 17:51:11 +0800 +Subject: [PATCH 06/14] mm: swap: skip slot cache on freeing for mTHP + +mainline inclusion +from mainline-v6.12-rc1 +commit 650975d2b181e30c9017c42cb3f6535287555b1e +category: performance +bugzilla: https://gitee.com/openeuler/kernel/issues/IBC5I1 + +Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=650975d2b181e30c9017c42cb3f6535287555b1e + +-------------------------------- + +Currently when we are freeing mTHP folios from swap cache, we free then +one by one and put each entry into swap slot cache. Slot cache is +designed to reduce the overhead by batching the freeing, but mTHP swap +entries are already continuous so they can be batch freed without it +already, it saves litle overhead, or even increase overhead for larger +mTHP. + +What's more, mTHP entries could stay in swap cache for a while. +Contiguous swap entry is an rather rare resource so releasing them +directly can help improve mTHP allocation success rate when under +pressure. + +Link: https://lkml.kernel.org/r/20240730-swap-allocator-v5-5-cb9c148b9297@kernel.org +Signed-off-by: Kairui Song +Reported-by: Barry Song <21cnbao@gmail.com> +Acked-by: Barry Song +Cc: Chris Li +Cc: "Huang, Ying" +Cc: Hugh Dickins +Cc: Kalesh Singh +Cc: Ryan Roberts +Signed-off-by: Andrew Morton +Conflicts: + mm/swapfile.c +Signed-off-by: Liu Shixin +--- + mm/swapfile.c | 59 +++++++++++++++++++++++---------------------------- + 1 file changed, 26 insertions(+), 33 deletions(-) + +diff --git a/mm/swapfile.c b/mm/swapfile.c +index 4be5fbbdc1c8..44726e0b8f8f 100644 +--- a/mm/swapfile.c ++++ b/mm/swapfile.c +@@ -478,20 +478,21 @@ static void inc_cluster_info_page(struct swap_info_struct *p, + } + + /* +- * The cluster ci decreases one usage. If the usage counter becomes 0, ++ * The cluster ci decreases @nr_pages usage. If the usage counter becomes 0, + * which means no page in the cluster is in use, we can optionally discard + * the cluster and add it to free cluster list. 
+ */ +-static void dec_cluster_info_page(struct swap_info_struct *p, struct swap_cluster_info *ci) ++static void dec_cluster_info_page(struct swap_info_struct *p, ++ struct swap_cluster_info *ci, int nr_pages) + { + if (!p->cluster_info) + return; + +- VM_BUG_ON(ci->count == 0); ++ VM_BUG_ON(ci->count < nr_pages); + VM_BUG_ON(cluster_is_free(ci)); + lockdep_assert_held(&p->lock); + lockdep_assert_held(&ci->lock); +- ci->count--; ++ ci->count -= nr_pages; + + if (!ci->count) { + free_cluster(p, ci); +@@ -983,19 +984,6 @@ static int scan_swap_map_slots(struct swap_info_struct *si, + return n_ret; + } + +-static void swap_free_cluster(struct swap_info_struct *si, unsigned long idx) +-{ +- unsigned long offset = idx * SWAPFILE_CLUSTER; +- struct swap_cluster_info *ci; +- +- ci = lock_cluster(si, offset); +- memset(si->swap_map + offset, 0, SWAPFILE_CLUSTER); +- ci->count = 0; +- free_cluster(si, ci); +- unlock_cluster(ci); +- swap_range_free(si, offset, SWAPFILE_CLUSTER); +-} +- + #ifdef CONFIG_MEMCG_SWAP_QOS + int write_swapfile_for_memcg(struct address_space *mapping, int *swap_type) + { +@@ -1343,21 +1331,28 @@ static unsigned char __swap_entry_free(struct swap_info_struct *p, + return usage; + } + +-static void swap_entry_free(struct swap_info_struct *p, swp_entry_t entry) ++/* ++ * Drop the last HAS_CACHE flag of swap entries, caller have to ++ * ensure all entries belong to the same cgroup. ++ */ ++static void swap_entry_range_free(struct swap_info_struct *p, swp_entry_t entry, ++ unsigned int nr_pages) + { +- struct swap_cluster_info *ci; + unsigned long offset = swp_offset(entry); +- unsigned char count; ++ unsigned char *map = p->swap_map + offset; ++ unsigned char *map_end = map + nr_pages; ++ struct swap_cluster_info *ci; + + ci = lock_cluster(p, offset); +- count = p->swap_map[offset]; +- VM_BUG_ON(count != SWAP_HAS_CACHE); +- p->swap_map[offset] = 0; +- dec_cluster_info_page(p, ci); ++ do { ++ VM_BUG_ON(*map != SWAP_HAS_CACHE); ++ *map = 0; ++ } while (++map < map_end); ++ dec_cluster_info_page(p, ci, nr_pages); + unlock_cluster(ci); + +- mem_cgroup_uncharge_swap(entry, 1); +- swap_range_free(p, offset, 1); ++ mem_cgroup_uncharge_swap(entry, nr_pages); ++ swap_range_free(p, offset, nr_pages); + } + + static void cluster_swap_free_nr(struct swap_info_struct *sis, +@@ -1418,7 +1413,6 @@ void swap_free_nr(swp_entry_t entry, int nr_pages) + void put_swap_folio(struct folio *folio, swp_entry_t entry) + { + unsigned long offset = swp_offset(entry); +- unsigned long idx = offset / SWAPFILE_CLUSTER; + struct swap_cluster_info *ci; + struct swap_info_struct *si; + unsigned char *map; +@@ -1431,19 +1425,18 @@ void put_swap_folio(struct folio *folio, swp_entry_t entry) + return; + + ci = lock_cluster_or_swap_info(si, offset); +- if (size == SWAPFILE_CLUSTER) { ++ if (size > 1) { + map = si->swap_map + offset; +- for (i = 0; i < SWAPFILE_CLUSTER; i++) { ++ for (i = 0; i < size; i++) { + val = map[i]; + VM_BUG_ON(!(val & SWAP_HAS_CACHE)); + if (val == SWAP_HAS_CACHE) + free_entries++; + } +- if (free_entries == SWAPFILE_CLUSTER) { ++ if (free_entries == size) { + unlock_cluster_or_swap_info(si, ci); + spin_lock(&si->lock); +- mem_cgroup_uncharge_swap(entry, SWAPFILE_CLUSTER); +- swap_free_cluster(si, idx); ++ swap_entry_range_free(si, entry, size); + spin_unlock(&si->lock); + return; + } +@@ -1488,7 +1481,7 @@ void swapcache_free_entries(swp_entry_t *entries, int n) + for (i = 0; i < n; ++i) { + p = swap_info_get_cont(entries[i], prev); + if (p) +- swap_entry_free(p, entries[i]); ++ 
swap_entry_range_free(p, entries[i], 1); + prev = p; + } + if (p) +-- +Gitee + + +From 53a99352d0946625a0d45deeb8d0729855d4b080 Mon Sep 17 00:00:00 2001 +From: Kairui Song +Date: Wed, 18 Dec 2024 17:51:12 +0800 +Subject: [PATCH 07/14] mm: swap: allow cache reclaim to skip slot cache + +mainline inclusion +from mainline-v6.12-rc1 +commit 862590ac3708e1cbbfb02a8ed78587b86ecba4ba +category: performance +bugzilla: https://gitee.com/openeuler/kernel/issues/IBC5I1 + +Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=862590ac3708e1cbbfb02a8ed78587b86ecba4ba + +-------------------------------- + +Currently we free the reclaimed slots through slot cache even if the slot +is required to be empty immediately. As a result the reclaim caller will +see the slot still occupied even after a successful reclaim, and need to +keep reclaiming until slot cache get flushed. This caused ineffective or +over reclaim when SWAP is under stress. + +So introduce a new flag allowing the slot to be emptied bypassing the slot +cache. + +[21cnbao@gmail.com: small folios should have nr_pages == 1 but not nr_page == 0] + Link: https://lkml.kernel.org/r/20240805015324.45134-1-21cnbao@gmail.com +Link: https://lkml.kernel.org/r/20240730-swap-allocator-v5-6-cb9c148b9297@kernel.org +Signed-off-by: Kairui Song +Reported-by: Barry Song <21cnbao@gmail.com> +Cc: Chris Li +Cc: "Huang, Ying" +Cc: Hugh Dickins +Cc: Kalesh Singh +Cc: Ryan Roberts +Signed-off-by: Andrew Morton +Conflicts: + mm/swapfile.c +Signed-off-by: Liu Shixin +--- + mm/swapfile.c | 152 ++++++++++++++++++++++++++++++++++++-------------- + 1 file changed, 109 insertions(+), 43 deletions(-) + +diff --git a/mm/swapfile.c b/mm/swapfile.c +index 44726e0b8f8f..e58457b801fb 100644 +--- a/mm/swapfile.c ++++ b/mm/swapfile.c +@@ -52,8 +52,15 @@ + static bool swap_count_continued(struct swap_info_struct *, pgoff_t, + unsigned char); + static void free_swap_count_continuations(struct swap_info_struct *); ++static void swap_entry_range_free(struct swap_info_struct *si, swp_entry_t entry, ++ unsigned int nr_pages); + static void swap_range_alloc(struct swap_info_struct *si, unsigned long offset, + unsigned int nr_entries); ++static bool folio_swapcache_freeable(struct folio *folio); ++static struct swap_cluster_info *lock_cluster_or_swap_info( ++ struct swap_info_struct *si, unsigned long offset); ++static void unlock_cluster_or_swap_info(struct swap_info_struct *si, ++ struct swap_cluster_info *ci); + + static DEFINE_SPINLOCK(swap_lock); + static unsigned int nr_swapfiles; +@@ -128,8 +135,25 @@ static inline unsigned char swap_count(unsigned char ent) + * corresponding page + */ + #define TTRS_UNMAPPED 0x2 +-/* Reclaim the swap entry if swap is getting full*/ ++/* Reclaim the swap entry if swap is getting full */ + #define TTRS_FULL 0x4 ++/* Reclaim directly, bypass the slot cache and don't touch device lock */ ++#define TTRS_DIRECT 0x8 ++ ++static bool swap_is_has_cache(struct swap_info_struct *si, ++ unsigned long offset, int nr_pages) ++{ ++ unsigned char *map = si->swap_map + offset; ++ unsigned char *map_end = map + nr_pages; ++ ++ do { ++ VM_BUG_ON(!(*map & SWAP_HAS_CACHE)); ++ if (*map != SWAP_HAS_CACHE) ++ return false; ++ } while (++map < map_end); ++ ++ return true; ++} + + /* + * returns number of pages in the folio that backs the swap entry. 
If positive, +@@ -140,12 +164,22 @@ static int __try_to_reclaim_swap(struct swap_info_struct *si, + unsigned long offset, unsigned long flags) + { + swp_entry_t entry = swp_entry(si->type, offset); ++ struct address_space *address_space = swap_address_space(entry); ++ struct swap_cluster_info *ci; + struct folio *folio; +- int ret = 0; ++ int ret, nr_pages; ++ bool need_reclaim; + +- folio = filemap_get_folio(swap_address_space(entry), offset); ++ folio = filemap_get_folio(address_space, offset); + if (IS_ERR(folio)) + return 0; ++ ++ /* offset could point to the middle of a large folio */ ++ entry = folio->swap; ++ offset = swp_offset(entry); ++ nr_pages = folio_nr_pages(folio); ++ ret = -nr_pages; ++ + /* + * When this function is called from scan_swap_map_slots() and it's + * called by vmscan.c at reclaiming folios. So we hold a folio lock +@@ -153,14 +187,50 @@ static int __try_to_reclaim_swap(struct swap_info_struct *si, + * case and you should use folio_free_swap() with explicit folio_lock() + * in usual operations. + */ +- if (folio_trylock(folio)) { +- if ((flags & TTRS_ANYWAY) || +- ((flags & TTRS_UNMAPPED) && !folio_mapped(folio)) || +- ((flags & TTRS_FULL) && mem_cgroup_swap_full(folio))) +- ret = folio_free_swap(folio); +- folio_unlock(folio); ++ if (!folio_trylock(folio)) ++ goto out; ++ ++ need_reclaim = ((flags & TTRS_ANYWAY) || ++ ((flags & TTRS_UNMAPPED) && !folio_mapped(folio)) || ++ ((flags & TTRS_FULL) && mem_cgroup_swap_full(folio))); ++ if (!need_reclaim || !folio_swapcache_freeable(folio)) ++ goto out_unlock; ++ ++ /* ++ * It's safe to delete the folio from swap cache only if the folio's ++ * swap_map is HAS_CACHE only, which means the slots have no page table ++ * reference or pending writeback, and can't be allocated to others. ++ */ ++ ci = lock_cluster_or_swap_info(si, offset); ++ need_reclaim = swap_is_has_cache(si, offset, nr_pages); ++ unlock_cluster_or_swap_info(si, ci); ++ if (!need_reclaim) ++ goto out_unlock; ++ ++ if (!(flags & TTRS_DIRECT)) { ++ /* Free through slot cache */ ++ delete_from_swap_cache(folio); ++ folio_set_dirty(folio); ++ ret = nr_pages; ++ goto out_unlock; + } +- ret = ret ? 
folio_nr_pages(folio) : -folio_nr_pages(folio); ++ ++ xa_lock_irq(&address_space->i_pages); ++ __delete_from_swap_cache(folio, entry, NULL); ++ xa_unlock_irq(&address_space->i_pages); ++ folio_ref_sub(folio, nr_pages); ++ folio_set_dirty(folio); ++ ++ spin_lock(&si->lock); ++ /* Only sinple page folio can be backed by zswap */ ++ if (nr_pages == 1) ++ zswap_invalidate(entry); ++ swap_entry_range_free(si, entry, nr_pages); ++ spin_unlock(&si->lock); ++ ret = nr_pages; ++out_unlock: ++ folio_unlock(folio); ++out: + folio_put(folio); + return ret; + } +@@ -888,7 +958,7 @@ static int scan_swap_map_slots(struct swap_info_struct *si, + if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) { + int swap_was_freed; + spin_unlock(&si->lock); +- swap_was_freed = __try_to_reclaim_swap(si, offset, TTRS_ANYWAY); ++ swap_was_freed = __try_to_reclaim_swap(si, offset, TTRS_ANYWAY | TTRS_DIRECT); + spin_lock(&si->lock); + /* entry was freed successfully, try to use this again */ + if (swap_was_freed > 0) +@@ -1415,9 +1485,6 @@ void put_swap_folio(struct folio *folio, swp_entry_t entry) + unsigned long offset = swp_offset(entry); + struct swap_cluster_info *ci; + struct swap_info_struct *si; +- unsigned char *map; +- unsigned int i, free_entries = 0; +- unsigned char val; + int size = 1 << swap_entry_order(folio_order(folio)); + + si = _swap_info_get(entry); +@@ -1425,23 +1492,14 @@ void put_swap_folio(struct folio *folio, swp_entry_t entry) + return; + + ci = lock_cluster_or_swap_info(si, offset); +- if (size > 1) { +- map = si->swap_map + offset; +- for (i = 0; i < size; i++) { +- val = map[i]; +- VM_BUG_ON(!(val & SWAP_HAS_CACHE)); +- if (val == SWAP_HAS_CACHE) +- free_entries++; +- } +- if (free_entries == size) { +- unlock_cluster_or_swap_info(si, ci); +- spin_lock(&si->lock); +- swap_entry_range_free(si, entry, size); +- spin_unlock(&si->lock); +- return; +- } ++ if (size > 1 && swap_is_has_cache(si, offset, size)) { ++ unlock_cluster_or_swap_info(si, ci); ++ spin_lock(&si->lock); ++ swap_entry_range_free(si, entry, size); ++ spin_unlock(&si->lock); ++ return; + } +- for (i = 0; i < size; i++, entry.val++) { ++ for (int i = 0; i < size; i++, entry.val++) { + if (!__swap_entry_free_locked(si, offset + i, SWAP_HAS_CACHE)) { + unlock_cluster_or_swap_info(si, ci); + free_swap_slot(entry); +@@ -1601,16 +1659,7 @@ static bool folio_swapped(struct folio *folio) + return swap_page_trans_huge_swapped(si, entry, folio_order(folio)); + } + +-/** +- * folio_free_swap() - Free the swap space used for this folio. +- * @folio: The folio to remove. +- * +- * If swap is getting full, or if there are no more mappings of this folio, +- * then call folio_free_swap to free its swap space. +- * +- * Return: true if we were able to release the swap space. +- */ +-bool folio_free_swap(struct folio *folio) ++static bool folio_swapcache_freeable(struct folio *folio) + { + VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); + +@@ -1618,8 +1667,6 @@ bool folio_free_swap(struct folio *folio) + return false; + if (folio_test_writeback(folio)) + return false; +- if (folio_swapped(folio)) +- return false; + + /* + * Once hibernation has begun to create its image of memory, +@@ -1639,6 +1686,25 @@ bool folio_free_swap(struct folio *folio) + if (pm_suspended_storage()) + return false; + ++ return true; ++} ++ ++/** ++ * folio_free_swap() - Free the swap space used for this folio. ++ * @folio: The folio to remove. 
++ * ++ * If swap is getting full, or if there are no more mappings of this folio, ++ * then call folio_free_swap to free its swap space. ++ * ++ * Return: true if we were able to release the swap space. ++ */ ++bool folio_free_swap(struct folio *folio) ++{ ++ if (!folio_swapcache_freeable(folio)) ++ return false; ++ if (folio_swapped(folio)) ++ return false; ++ + delete_from_swap_cache(folio); + folio_set_dirty(folio); + return true; +@@ -1715,7 +1781,7 @@ void free_swap_and_cache_nr(swp_entry_t entry, int nr) + * to the next boundary. + */ + nr = __try_to_reclaim_swap(si, offset, +- TTRS_UNMAPPED | TTRS_FULL); ++ TTRS_UNMAPPED | TTRS_FULL); + if (nr == 0) + nr = 1; + else if (nr < 0) +-- +Gitee + + +From a1f6274ecbb551837ea7a66e740c660f405a2443 Mon Sep 17 00:00:00 2001 +From: Kairui Song +Date: Wed, 18 Dec 2024 17:51:13 +0800 +Subject: [PATCH 08/14] mm: swap: add a fragment cluster list + +mainline inclusion +from mainline-v6.12-rc1 +commit 477cb7ba28892eda112c79d8f75d10edabfc3050 +category: performance +bugzilla: https://gitee.com/openeuler/kernel/issues/IBC5I1 + +Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=477cb7ba28892eda112c79d8f75d10edabfc3050 + +-------------------------------- + +Now swap cluster allocator arranges the clusters in LRU style, so the +"cold" cluster stay at the head of nonfull lists are the ones that were +used for allocation long time ago and still partially occupied. So if +allocator can't find enough contiguous slots to satisfy an high order +allocation, it's unlikely there will be slot being free on them to satisfy +the allocation, at least in a short period. + +As a result, nonfull cluster scanning will waste time repeatly scanning +the unusable head of the list. + +Also, multiple CPUs could content on the same head cluster of nonfull +list. Unlike free clusters which are removed from the list when any CPU +starts using it, nonfull cluster stays on the head. + +So introduce a new list frag list, all scanned nonfull clusters will be +moved to this list. Both for avoiding repeated scanning and contention. + +Frag list is still used as fallback for allocations, so if one CPU failed +to allocate one order of slots, it can still steal other CPU's clusters. +And order 0 will favor the fragmented clusters to better protect nonfull +clusters + +If any slots on a fragment list are being freed, move the fragment list +back to nonfull list indicating it worth another scan on the cluster. +Compared to scan upon freeing a slot, this keep the scanning lazy and save +some CPU if there are still other clusters to use. + +It may seems unneccessay to keep the fragmented cluster on list at all if +they can't be used for specific order allocation. But this will start to +make sense once reclaim dring scanning is ready. 
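+
+As a rough aid for readers of this backport (not part of the upstream
+commit message): the scheme amounts to two lists per order, where every
+nonfull cluster that gets scanned is demoted to the frag list, and the
+frag list is only consulted as a fallback. The userspace sketch below
+shows that movement with toy structures; all toy_* names are invented for
+the illustration, and the real code manipulates struct swap_cluster_info
+under si->lock.
+
+#include <stdbool.h>
+#include <stdio.h>
+
+/* Toy cluster: only records whether a scan of it could succeed. */
+struct toy_cluster {
+    bool has_contig_slots;          /* stand-in for "scan found free slots" */
+    struct toy_cluster *next;       /* singly linked, for brevity */
+};
+
+/* Pop the head of a singly linked list, or return NULL if it is empty. */
+static struct toy_cluster *toy_pop(struct toy_cluster **head)
+{
+    struct toy_cluster *c = *head;
+
+    if (c)
+        *head = c->next;
+    return c;
+}
+
+/* Append a cluster at the tail of a list. */
+static void toy_push_tail(struct toy_cluster **head, struct toy_cluster *c)
+{
+    struct toy_cluster **p = head;
+
+    while (*p)
+        p = &(*p)->next;
+    c->next = NULL;
+    *p = c;
+}
+
+/*
+ * Mirror of the allocation order described above: scan nonfull clusters,
+ * demoting each scanned cluster to the frag list so later callers do not
+ * rescan it, then fall back to the frag list. Freeing a slot would move
+ * the cluster back to the nonfull list (not shown here).
+ */
+static struct toy_cluster *toy_alloc(struct toy_cluster **nonfull,
+                                     struct toy_cluster **frag)
+{
+    struct toy_cluster *c;
+
+    while ((c = toy_pop(nonfull))) {
+        toy_push_tail(frag, c);     /* scanned once: now "fragmented" */
+        if (c->has_contig_slots)
+            return c;
+    }
+    if ((c = toy_pop(frag))) {
+        toy_push_tail(frag, c);     /* keep the frag list rotating */
+        if (c->has_contig_slots)
+            return c;
+    }
+    return NULL;
+}
+
+int main(void)
+{
+    struct toy_cluster a = { false, NULL }, b = { true, NULL };
+    struct toy_cluster *nonfull = NULL, *frag = NULL;
+
+    toy_push_tail(&nonfull, &a);
+    toy_push_tail(&nonfull, &b);
+    printf("found usable cluster: %d\n", toy_alloc(&nonfull, &frag) == &b);
+    return 0;
+}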
+ +Link: https://lkml.kernel.org/r/20240730-swap-allocator-v5-7-cb9c148b9297@kernel.org +Signed-off-by: Kairui Song +Reported-by: Barry Song <21cnbao@gmail.com> +Cc: Chris Li +Cc: "Huang, Ying" +Cc: Hugh Dickins +Cc: Kalesh Singh +Cc: Ryan Roberts +Signed-off-by: Andrew Morton +Signed-off-by: Liu Shixin +--- + include/linux/swap.h | 3 +++ + mm/swapfile.c | 41 +++++++++++++++++++++++++++++++++++++---- + 2 files changed, 40 insertions(+), 4 deletions(-) + +diff --git a/include/linux/swap.h b/include/linux/swap.h +index 29a1daa46421..81188caed2d2 100644 +--- a/include/linux/swap.h ++++ b/include/linux/swap.h +@@ -271,6 +271,7 @@ struct swap_cluster_info { + }; + #define CLUSTER_FLAG_FREE 1 /* This cluster is free */ + #define CLUSTER_FLAG_NONFULL 2 /* This cluster is on nonfull list */ ++#define CLUSTER_FLAG_FRAG 4 /* This cluster is on nonfull list */ + + /* + * The first page in the swap file is the swap header, which is always marked +@@ -310,6 +311,8 @@ struct swap_info_struct { + struct list_head free_clusters; /* free clusters list */ + struct list_head nonfull_clusters[SWAP_NR_ORDERS]; + /* list of cluster that contains at least one free slot */ ++ struct list_head frag_clusters[SWAP_NR_ORDERS]; ++ /* list of cluster that are fragmented or contented */ + unsigned int lowest_bit; /* index of first free in swap_map */ + unsigned int highest_bit; /* index of last free in swap_map */ + unsigned int pages; /* total of usable pages of swap */ +diff --git a/mm/swapfile.c b/mm/swapfile.c +index e58457b801fb..7c71e7df9cf3 100644 +--- a/mm/swapfile.c ++++ b/mm/swapfile.c +@@ -571,7 +571,10 @@ static void dec_cluster_info_page(struct swap_info_struct *p, + + if (!(ci->flags & CLUSTER_FLAG_NONFULL)) { + VM_BUG_ON(ci->flags & CLUSTER_FLAG_FREE); +- list_add_tail(&ci->list, &p->nonfull_clusters[ci->order]); ++ if (ci->flags & CLUSTER_FLAG_FRAG) ++ list_move_tail(&ci->list, &p->nonfull_clusters[ci->order]); ++ else ++ list_add_tail(&ci->list, &p->nonfull_clusters[ci->order]); + ci->flags = CLUSTER_FLAG_NONFULL; + } + } +@@ -609,7 +612,8 @@ static inline void cluster_alloc_range(struct swap_info_struct *si, struct swap_ + ci->count += nr_pages; + + if (ci->count == SWAPFILE_CLUSTER) { +- VM_BUG_ON(!(ci->flags & (CLUSTER_FLAG_FREE | CLUSTER_FLAG_NONFULL))); ++ VM_BUG_ON(!(ci->flags & ++ (CLUSTER_FLAG_FREE | CLUSTER_FLAG_NONFULL | CLUSTER_FLAG_FRAG))); + list_del(&ci->list); + ci->flags = 0; + } +@@ -665,6 +669,7 @@ static unsigned long cluster_alloc_swap_entry(struct swap_info_struct *si, int o + struct percpu_cluster *cluster; + struct swap_cluster_info *ci, *n; + unsigned int offset, found = 0; ++ LIST_HEAD(fraged); + + new_cluster: + lockdep_assert_held(&si->lock); +@@ -685,13 +690,29 @@ static unsigned long cluster_alloc_swap_entry(struct swap_info_struct *si, int o + + if (order < PMD_ORDER) { + list_for_each_entry_safe(ci, n, &si->nonfull_clusters[order], list) { ++ list_move_tail(&ci->list, &fraged); ++ ci->flags = CLUSTER_FLAG_FRAG; + offset = alloc_swap_scan_cluster(si, cluster_offset(si, ci), + &found, order, usage); + if (found) +- goto done; ++ break; + } ++ ++ if (!found) { ++ list_for_each_entry_safe(ci, n, &si->frag_clusters[order], list) { ++ offset = alloc_swap_scan_cluster(si, cluster_offset(si, ci), ++ &found, order, usage); ++ if (found) ++ break; ++ } ++ } ++ ++ list_splice_tail(&fraged, &si->frag_clusters[order]); + } + ++ if (found) ++ goto done; ++ + if (!list_empty(&si->discard_clusters)) { + /* + * we don't have free cluster but have some clusters in +@@ -705,7 +726,17 @@ 
static unsigned long cluster_alloc_swap_entry(struct swap_info_struct *si, int o + if (order) + goto done; + ++ /* Order 0 stealing from higher order */ + for (int o = 1; o < SWAP_NR_ORDERS; o++) { ++ if (!list_empty(&si->frag_clusters[o])) { ++ ci = list_first_entry(&si->frag_clusters[o], ++ struct swap_cluster_info, list); ++ offset = alloc_swap_scan_cluster(si, cluster_offset(si, ci), &found, ++ 0, usage); ++ VM_BUG_ON(!found); ++ goto done; ++ } ++ + if (!list_empty(&si->nonfull_clusters[o])) { + ci = list_first_entry(&si->nonfull_clusters[o], struct swap_cluster_info, + list); +@@ -3110,8 +3141,10 @@ static int setup_swap_map_and_extents(struct swap_info_struct *p, + INIT_LIST_HEAD(&p->free_clusters); + INIT_LIST_HEAD(&p->discard_clusters); + +- for (i = 0; i < SWAP_NR_ORDERS; i++) ++ for (i = 0; i < SWAP_NR_ORDERS; i++) { + INIT_LIST_HEAD(&p->nonfull_clusters[i]); ++ INIT_LIST_HEAD(&p->frag_clusters[i]); ++ } + + for (i = 0; i < swap_header->info.nr_badpages; i++) { + unsigned int page_nr = swap_header->info.badpages[i]; +-- +Gitee + + +From 7c0f2c55f9a21373319df1952070b162b3c6be8a Mon Sep 17 00:00:00 2001 +From: Kairui Song +Date: Wed, 18 Dec 2024 17:51:14 +0800 +Subject: [PATCH 09/14] mm: swap: relaim the cached parts that got scanned + +mainline inclusion +from mainline-v6.12-rc1 +commit 661383c6111a38c88df61af6bfbcfacd2ff20a67 +category: performance +bugzilla: https://gitee.com/openeuler/kernel/issues/IBC5I1 + +Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=661383c6111a38c88df61af6bfbcfacd2ff20a67 + +-------------------------------- + +This commit implements reclaim during scan for cluster allocator. + +Cluster scanning were unable to reuse SWAP_HAS_CACHE slots, which could +result in low allocation success rate or early OOM. + +So to ensure maximum allocation success rate, integrate reclaiming with +scanning. If found a range of suitable swap slots but fragmented due to +HAS_CACHE, just try to reclaim the slots. 
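+
+A minimal userspace sketch of that decision, added for readers of this
+backport and not part of the upstream commit: the array below stands in
+for si->swap_map, TOY_HAS_CACHE for SWAP_HAS_CACHE, and toy_reclaim() for
+__try_to_reclaim_swap(); all toy_* names are invented. The real code only
+attempts reclaim when vm_swap_full() and has to recheck the range after
+dropping si->lock and the cluster lock.
+
+#include <stdbool.h>
+#include <stddef.h>
+
+#define TOY_HAS_CACHE 0x40          /* slot holds swap cache only */
+
+/* Pretend to drop the swap cache for one slot; true on success. */
+static bool toy_reclaim(unsigned char *map, size_t off)
+{
+    map[off] = 0;
+    return true;
+}
+
+/*
+ * Return true if [start, start + nr) can back a new allocation: free
+ * slots pass, cache-only slots are reclaimed on the spot, and any slot
+ * with a real swap count aborts the scan.
+ */
+static bool toy_scan_range(unsigned char *map, size_t start, size_t nr)
+{
+    for (size_t off = start; off < start + nr; off++) {
+        if (map[off] == 0)
+            continue;
+        if (map[off] == TOY_HAS_CACHE && toy_reclaim(map, off))
+            continue;
+        return false;
+    }
+    return true;
+}
+
+int main(void)
+{
+    unsigned char map[8] = { 0, TOY_HAS_CACHE, 0, 0, 1, 0, 0, 0 };
+
+    /* First half becomes usable after reclaim, second half does not. */
+    return toy_scan_range(map, 0, 4) && !toy_scan_range(map, 4, 4) ? 0 : 1;
+}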
+ +Link: https://lkml.kernel.org/r/20240730-swap-allocator-v5-8-cb9c148b9297@kernel.org +Signed-off-by: Kairui Song +Reported-by: Barry Song <21cnbao@gmail.com> +Cc: Chris Li +Cc: "Huang, Ying" +Cc: Hugh Dickins +Cc: Kalesh Singh +Cc: Ryan Roberts +Signed-off-by: Andrew Morton +Signed-off-by: Liu Shixin +--- + include/linux/swap.h | 1 + + mm/swapfile.c | 140 +++++++++++++++++++++++++++++++++---------- + 2 files changed, 110 insertions(+), 31 deletions(-) + +diff --git a/include/linux/swap.h b/include/linux/swap.h +index 81188caed2d2..83b1bcbaf2ec 100644 +--- a/include/linux/swap.h ++++ b/include/linux/swap.h +@@ -313,6 +313,7 @@ struct swap_info_struct { + /* list of cluster that contains at least one free slot */ + struct list_head frag_clusters[SWAP_NR_ORDERS]; + /* list of cluster that are fragmented or contented */ ++ unsigned int frag_cluster_nr[SWAP_NR_ORDERS]; + unsigned int lowest_bit; /* index of first free in swap_map */ + unsigned int highest_bit; /* index of last free in swap_map */ + unsigned int pages; /* total of usable pages of swap */ +diff --git a/mm/swapfile.c b/mm/swapfile.c +index 7c71e7df9cf3..45f73b73a92f 100644 +--- a/mm/swapfile.c ++++ b/mm/swapfile.c +@@ -512,6 +512,10 @@ static void free_cluster(struct swap_info_struct *si, struct swap_cluster_info * + VM_BUG_ON(ci->count != 0); + lockdep_assert_held(&si->lock); + lockdep_assert_held(&ci->lock); ++ ++ if (ci->flags & CLUSTER_FLAG_FRAG) ++ si->frag_cluster_nr[ci->order]--; ++ + /* + * If the swap is discardable, prepare discard the cluster + * instead of free it immediately. The cluster will be freed +@@ -571,31 +575,84 @@ static void dec_cluster_info_page(struct swap_info_struct *p, + + if (!(ci->flags & CLUSTER_FLAG_NONFULL)) { + VM_BUG_ON(ci->flags & CLUSTER_FLAG_FREE); +- if (ci->flags & CLUSTER_FLAG_FRAG) ++ if (ci->flags & CLUSTER_FLAG_FRAG) { ++ p->frag_cluster_nr[ci->order]--; + list_move_tail(&ci->list, &p->nonfull_clusters[ci->order]); +- else ++ } else { + list_add_tail(&ci->list, &p->nonfull_clusters[ci->order]); ++ } + ci->flags = CLUSTER_FLAG_NONFULL; + } + } + +-static inline bool cluster_scan_range(struct swap_info_struct *si, unsigned int start, +- unsigned int nr_pages) ++static bool cluster_reclaim_range(struct swap_info_struct *si, ++ struct swap_cluster_info *ci, ++ unsigned long start, unsigned long end) + { +- unsigned char *p = si->swap_map + start; +- unsigned char *end = p + nr_pages; ++ unsigned char *map = si->swap_map; ++ unsigned long offset; ++ ++ spin_unlock(&ci->lock); ++ spin_unlock(&si->lock); ++ ++ for (offset = start; offset < end; offset++) { ++ switch (READ_ONCE(map[offset])) { ++ case 0: ++ continue; ++ case SWAP_HAS_CACHE: ++ if (__try_to_reclaim_swap(si, offset, TTRS_ANYWAY | TTRS_DIRECT) > 0) ++ continue; ++ goto out; ++ default: ++ goto out; ++ } ++ } ++out: ++ spin_lock(&si->lock); ++ spin_lock(&ci->lock); + +- while (p < end) +- if (*p++) ++ /* ++ * Recheck the range no matter reclaim succeeded or not, the slot ++ * could have been be freed while we are not holding the lock. 
++ */ ++ for (offset = start; offset < end; offset++) ++ if (READ_ONCE(map[offset])) + return false; + + return true; + } + ++static bool cluster_scan_range(struct swap_info_struct *si, ++ struct swap_cluster_info *ci, ++ unsigned long start, unsigned int nr_pages) ++{ ++ unsigned long offset, end = start + nr_pages; ++ unsigned char *map = si->swap_map; ++ bool need_reclaim = false; + +-static inline void cluster_alloc_range(struct swap_info_struct *si, struct swap_cluster_info *ci, +- unsigned int start, unsigned char usage, +- unsigned int order) ++ for (offset = start; offset < end; offset++) { ++ switch (READ_ONCE(map[offset])) { ++ case 0: ++ continue; ++ case SWAP_HAS_CACHE: ++ if (!vm_swap_full()) ++ return false; ++ need_reclaim = true; ++ continue; ++ default: ++ return false; ++ } ++ } ++ ++ if (need_reclaim) ++ return cluster_reclaim_range(si, ci, start, end); ++ ++ return true; ++} ++ ++static void cluster_alloc_range(struct swap_info_struct *si, struct swap_cluster_info *ci, ++ unsigned int start, unsigned char usage, ++ unsigned int order) + { + unsigned int nr_pages = 1 << order; + +@@ -614,6 +671,8 @@ static inline void cluster_alloc_range(struct swap_info_struct *si, struct swap_ + if (ci->count == SWAPFILE_CLUSTER) { + VM_BUG_ON(!(ci->flags & + (CLUSTER_FLAG_FREE | CLUSTER_FLAG_NONFULL | CLUSTER_FLAG_FRAG))); ++ if (ci->flags & CLUSTER_FLAG_FRAG) ++ si->frag_cluster_nr[ci->order]--; + list_del(&ci->list); + ci->flags = 0; + } +@@ -639,7 +698,7 @@ static unsigned int alloc_swap_scan_cluster(struct swap_info_struct *si, unsigne + } + + while (offset <= end) { +- if (cluster_scan_range(si, offset, nr_pages)) { ++ if (cluster_scan_range(si, ci, offset, nr_pages)) { + cluster_alloc_range(si, ci, offset, usage, order); + *foundp = offset; + if (ci->count == SWAPFILE_CLUSTER) { +@@ -667,9 +726,8 @@ static unsigned long cluster_alloc_swap_entry(struct swap_info_struct *si, int o + unsigned char usage) + { + struct percpu_cluster *cluster; +- struct swap_cluster_info *ci, *n; ++ struct swap_cluster_info *ci; + unsigned int offset, found = 0; +- LIST_HEAD(fraged); + + new_cluster: + lockdep_assert_held(&si->lock); +@@ -689,25 +747,42 @@ static unsigned long cluster_alloc_swap_entry(struct swap_info_struct *si, int o + } + + if (order < PMD_ORDER) { +- list_for_each_entry_safe(ci, n, &si->nonfull_clusters[order], list) { +- list_move_tail(&ci->list, &fraged); ++ unsigned int frags = 0; ++ ++ while (!list_empty(&si->nonfull_clusters[order])) { ++ ci = list_first_entry(&si->nonfull_clusters[order], ++ struct swap_cluster_info, list); ++ list_move_tail(&ci->list, &si->frag_clusters[order]); + ci->flags = CLUSTER_FLAG_FRAG; ++ si->frag_cluster_nr[order]++; + offset = alloc_swap_scan_cluster(si, cluster_offset(si, ci), + &found, order, usage); ++ frags++; + if (found) + break; + } + + if (!found) { +- list_for_each_entry_safe(ci, n, &si->frag_clusters[order], list) { ++ /* ++ * Nonfull clusters are moved to frag tail if we reached ++ * here, count them too, don't over scan the frag list. ++ */ ++ while (frags < si->frag_cluster_nr[order]) { ++ ci = list_first_entry(&si->frag_clusters[order], ++ struct swap_cluster_info, list); ++ /* ++ * Rotate the frag list to iterate, they were all failing ++ * high order allocation or moved here due to per-CPU usage, ++ * this help keeping usable cluster ahead. 
++ */ ++ list_move_tail(&ci->list, &si->frag_clusters[order]); + offset = alloc_swap_scan_cluster(si, cluster_offset(si, ci), + &found, order, usage); ++ frags++; + if (found) + break; + } + } +- +- list_splice_tail(&fraged, &si->frag_clusters[order]); + } + + if (found) +@@ -728,25 +803,28 @@ static unsigned long cluster_alloc_swap_entry(struct swap_info_struct *si, int o + + /* Order 0 stealing from higher order */ + for (int o = 1; o < SWAP_NR_ORDERS; o++) { +- if (!list_empty(&si->frag_clusters[o])) { ++ /* ++ * Clusters here have at least one usable slots and can't fail order 0 ++ * allocation, but reclaim may drop si->lock and race with another user. ++ */ ++ while (!list_empty(&si->frag_clusters[o])) { + ci = list_first_entry(&si->frag_clusters[o], + struct swap_cluster_info, list); +- offset = alloc_swap_scan_cluster(si, cluster_offset(si, ci), &found, +- 0, usage); +- VM_BUG_ON(!found); +- goto done; ++ offset = alloc_swap_scan_cluster(si, cluster_offset(si, ci), ++ &found, 0, usage); ++ if (found) ++ goto done; + } + +- if (!list_empty(&si->nonfull_clusters[o])) { +- ci = list_first_entry(&si->nonfull_clusters[o], struct swap_cluster_info, +- list); ++ while (!list_empty(&si->nonfull_clusters[o])) { ++ ci = list_first_entry(&si->nonfull_clusters[o], ++ struct swap_cluster_info, list); + offset = alloc_swap_scan_cluster(si, cluster_offset(si, ci), + &found, 0, usage); +- VM_BUG_ON(!found); +- goto done; ++ if (found) ++ goto done; + } + } +- + done: + cluster->next[order] = offset; + return found; +@@ -3144,6 +3222,7 @@ static int setup_swap_map_and_extents(struct swap_info_struct *p, + for (i = 0; i < SWAP_NR_ORDERS; i++) { + INIT_LIST_HEAD(&p->nonfull_clusters[i]); + INIT_LIST_HEAD(&p->frag_clusters[i]); ++ p->frag_cluster_nr[i] = 0; + } + + for (i = 0; i < swap_header->info.nr_badpages; i++) { +@@ -3187,7 +3266,6 @@ static int setup_swap_map_and_extents(struct swap_info_struct *p, + if (!cluster_info) + return nr_extents; + +- + /* + * Reduce false cache line sharing between cluster_info and + * sharing same address space. +-- +Gitee + + +From da3342ba73e419beb8f4b793ff077b763c27b1df Mon Sep 17 00:00:00 2001 +From: Kairui Song +Date: Wed, 18 Dec 2024 17:51:15 +0800 +Subject: [PATCH 10/14] mm: swap: add a adaptive full cluster cache reclaim + +mainline inclusion +from mainline-v6.12-rc1 +commit 2cacbdfdee65b18f9952620e762eab043d71b564 +category: performance +bugzilla: https://gitee.com/openeuler/kernel/issues/IBC5I1 + +Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=2cacbdfdee65b18f9952620e762eab043d71b564 + +-------------------------------- + +Link all full cluster with one full list, and reclaim from it when the +allocation have ran out of all usable clusters. + +There are many reason a folio can end up being in the swap cache while +having no swap count reference. So the best way to search for such slots +is still by iterating the swap clusters. + +With the list as an LRU, iterating from the oldest cluster and keep them +rotating is a very doable and clean way to free up potentially not inuse +clusters. + +When any allocation failure, try reclaim and rotate only one cluster. +This is adaptive for high order allocations they can tolerate fallback. +So this avoids latency, and give the full cluster list an fair chance to +get reclaimed. It release the usage stress for the fallback order 0 +allocation or following up high order allocation. + +If the swap device is getting very full, reclaim more aggresively to +ensure no OOM will happen. 
This ensures order 0 heavy workload won't go +OOM as order 0 won't fail if any cluster still have any space. + +[ryncsn@gmail.com: fix discard of full cluster] + Link: https://lkml.kernel.org/r/CAMgjq7CWwK75_2Zi5P40K08pk9iqOcuWKL6khu=x4Yg_nXaQag@mail.gmail.com +Link: https://lkml.kernel.org/r/20240730-swap-allocator-v5-9-cb9c148b9297@kernel.org +Signed-off-by: Kairui Song +Reported-by: Barry Song <21cnbao@gmail.com> +Cc: Chris Li +Cc: "Huang, Ying" +Cc: Hugh Dickins +Cc: Kalesh Singh +Cc: Ryan Roberts +Cc: David Hildenbrand +Cc: Kairui Song +Signed-off-by: Andrew Morton +Signed-off-by: Liu Shixin +--- + include/linux/swap.h | 2 ++ + mm/swapfile.c | 68 +++++++++++++++++++++++++++++++++++--------- + 2 files changed, 57 insertions(+), 13 deletions(-) + +diff --git a/include/linux/swap.h b/include/linux/swap.h +index 83b1bcbaf2ec..1664655aa7c8 100644 +--- a/include/linux/swap.h ++++ b/include/linux/swap.h +@@ -272,6 +272,7 @@ struct swap_cluster_info { + #define CLUSTER_FLAG_FREE 1 /* This cluster is free */ + #define CLUSTER_FLAG_NONFULL 2 /* This cluster is on nonfull list */ + #define CLUSTER_FLAG_FRAG 4 /* This cluster is on nonfull list */ ++#define CLUSTER_FLAG_FULL 8 /* This cluster is on full list */ + + /* + * The first page in the swap file is the swap header, which is always marked +@@ -309,6 +310,7 @@ struct swap_info_struct { + unsigned char *swap_map; /* vmalloc'ed array of usage counts */ + struct swap_cluster_info *cluster_info; /* cluster info. Only for SSD */ + struct list_head free_clusters; /* free clusters list */ ++ struct list_head full_clusters; /* full clusters list */ + struct list_head nonfull_clusters[SWAP_NR_ORDERS]; + /* list of cluster that contains at least one free slot */ + struct list_head frag_clusters[SWAP_NR_ORDERS]; +diff --git a/mm/swapfile.c b/mm/swapfile.c +index 45f73b73a92f..389e14f0fc3c 100644 +--- a/mm/swapfile.c ++++ b/mm/swapfile.c +@@ -439,10 +439,7 @@ static void swap_cluster_schedule_discard(struct swap_info_struct *si, + SWAP_MAP_BAD, SWAPFILE_CLUSTER); + + VM_BUG_ON(ci->flags & CLUSTER_FLAG_FREE); +- if (ci->flags & CLUSTER_FLAG_NONFULL) +- list_move_tail(&ci->list, &si->discard_clusters); +- else +- list_add_tail(&ci->list, &si->discard_clusters); ++ list_move_tail(&ci->list, &si->discard_clusters); + ci->flags = 0; + schedule_work(&si->discard_work); + } +@@ -452,7 +449,7 @@ static void __free_cluster(struct swap_info_struct *si, struct swap_cluster_info + lockdep_assert_held(&si->lock); + lockdep_assert_held(&ci->lock); + +- if (ci->flags & CLUSTER_FLAG_NONFULL) ++ if (ci->flags) + list_move_tail(&ci->list, &si->free_clusters); + else + list_add_tail(&ci->list, &si->free_clusters); +@@ -479,7 +476,6 @@ static void swap_do_scheduled_discard(struct swap_info_struct *si) + SWAPFILE_CLUSTER); + + spin_lock(&si->lock); +- + spin_lock(&ci->lock); + __free_cluster(si, ci); + memset(si->swap_map + idx * SWAPFILE_CLUSTER, +@@ -575,12 +571,9 @@ static void dec_cluster_info_page(struct swap_info_struct *p, + + if (!(ci->flags & CLUSTER_FLAG_NONFULL)) { + VM_BUG_ON(ci->flags & CLUSTER_FLAG_FREE); +- if (ci->flags & CLUSTER_FLAG_FRAG) { ++ if (ci->flags & CLUSTER_FLAG_FRAG) + p->frag_cluster_nr[ci->order]--; +- list_move_tail(&ci->list, &p->nonfull_clusters[ci->order]); +- } else { +- list_add_tail(&ci->list, &p->nonfull_clusters[ci->order]); +- } ++ list_move_tail(&ci->list, &p->nonfull_clusters[ci->order]); + ci->flags = CLUSTER_FLAG_NONFULL; + } + } +@@ -673,8 +666,8 @@ static void cluster_alloc_range(struct swap_info_struct *si, struct 
swap_cluster + (CLUSTER_FLAG_FREE | CLUSTER_FLAG_NONFULL | CLUSTER_FLAG_FRAG))); + if (ci->flags & CLUSTER_FLAG_FRAG) + si->frag_cluster_nr[ci->order]--; +- list_del(&ci->list); +- ci->flags = 0; ++ list_move_tail(&ci->list, &si->full_clusters); ++ ci->flags = CLUSTER_FLAG_FULL; + } + } + +@@ -717,6 +710,46 @@ static unsigned int alloc_swap_scan_cluster(struct swap_info_struct *si, unsigne + return offset; + } + ++static void swap_reclaim_full_clusters(struct swap_info_struct *si) ++{ ++ long to_scan = 1; ++ unsigned long offset, end; ++ struct swap_cluster_info *ci; ++ unsigned char *map = si->swap_map; ++ int nr_reclaim, total_reclaimed = 0; ++ ++ if (atomic_long_read(&nr_swap_pages) <= SWAPFILE_CLUSTER) ++ to_scan = si->inuse_pages / SWAPFILE_CLUSTER; ++ ++ while (!list_empty(&si->full_clusters)) { ++ ci = list_first_entry(&si->full_clusters, struct swap_cluster_info, list); ++ list_move_tail(&ci->list, &si->full_clusters); ++ offset = cluster_offset(si, ci); ++ end = min(si->max, offset + SWAPFILE_CLUSTER); ++ to_scan--; ++ ++ while (offset < end) { ++ if (READ_ONCE(map[offset]) == SWAP_HAS_CACHE) { ++ spin_unlock(&si->lock); ++ nr_reclaim = __try_to_reclaim_swap(si, offset, ++ TTRS_ANYWAY | TTRS_DIRECT); ++ spin_lock(&si->lock); ++ if (nr_reclaim > 0) { ++ offset += nr_reclaim; ++ total_reclaimed += nr_reclaim; ++ continue; ++ } else if (nr_reclaim < 0) { ++ offset += -nr_reclaim; ++ continue; ++ } ++ } ++ offset++; ++ } ++ if (to_scan <= 0 || total_reclaimed) ++ break; ++ } ++} ++ + /* + * Try to get swap entries with specified order from current cpu's swap entry + * pool (a cluster). This might involve allocating a new cluster for current CPU +@@ -825,7 +858,15 @@ static unsigned long cluster_alloc_swap_entry(struct swap_info_struct *si, int o + goto done; + } + } ++ + done: ++ /* Try reclaim from full clusters if device is nearfull */ ++ if (vm_swap_full() && (!found || (si->pages - si->inuse_pages) < SWAPFILE_CLUSTER)) { ++ swap_reclaim_full_clusters(si); ++ if (!found && !order && si->pages != si->inuse_pages) ++ goto new_cluster; ++ } ++ + cluster->next[order] = offset; + return found; + } +@@ -3217,6 +3258,7 @@ static int setup_swap_map_and_extents(struct swap_info_struct *p, + nr_good_pages = maxpages - 1; /* omit header page */ + + INIT_LIST_HEAD(&p->free_clusters); ++ INIT_LIST_HEAD(&p->full_clusters); + INIT_LIST_HEAD(&p->discard_clusters); + + for (i = 0; i < SWAP_NR_ORDERS; i++) { +-- +Gitee + + +From c58f0af4fa7418fdeb2d6b4d1d8751b751649df9 Mon Sep 17 00:00:00 2001 +From: Kairui Song +Date: Wed, 18 Dec 2024 17:51:16 +0800 +Subject: [PATCH 11/14] mm, swap: fix allocation and scanning race with swapoff + +mainline inclusion +from mainline-v6.12 +commit 0ec8bc9e880eb576dc4492e8e0c7153ed0a71031 +category: bugfix +bugzilla: https://gitee.com/openeuler/kernel/issues/IBC5I1 + +Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=0ec8bc9e880eb576dc4492e8e0c7153ed0a71031 + +-------------------------------- + +There are two flags used to synchronize allocation and scanning with +swapoff: SWP_WRITEOK and SWP_SCANNING. + +SWP_WRITEOK: Swapoff will first unset this flag, at this point any further +swap allocation or scanning on this device should just abort so no more +new entries will be referencing this device. Swapoff will then unuse all +existing swap entries. + +SWP_SCANNING: This flag is set when device is being scanned. Swapoff will +wait for all scanner to stop before the final release of the swap device +structures to avoid UAF. 
Note this flag is the highest used bit of +si->flags so it could be added up arithmetically, if there are multiple +scanner. + +commit 5f843a9a3a1e ("mm: swap: separate SSD allocation from +scan_swap_map_slots()") ignored SWP_SCANNING and SWP_WRITEOK flags while +separating cluster allocation path from the old allocation path. Add the +flags back to fix swapoff race. The race is hard to trigger as si->lock +prevents most parallel operations, but si->lock could be dropped for +reclaim or discard. This issue is found during code review. + +This commit fixes this problem. For SWP_SCANNING, Just like before, set +the flag before scan and remove it afterwards. + +For SWP_WRITEOK, there are several places where si->lock could be dropped, +it will be error-prone and make the code hard to follow if we try to cover +these places one by one. So just do one check before the real allocation, +which is also very similar like before. With new cluster allocator it may +waste a bit of time iterating the clusters but won't take long, and +swapoff is not performance sensitive. + +Link: https://lkml.kernel.org/r/20241112083414.78174-1-ryncsn@gmail.com +Fixes: 5f843a9a3a1e ("mm: swap: separate SSD allocation from scan_swap_map_slots()") +Reported-by: "Huang, Ying" +Closes: https://lore.kernel.org/linux-mm/87a5es3f1f.fsf@yhuang6-desk2.ccr.corp.intel.com/ +Signed-off-by: Kairui Song +Cc: Barry Song +Cc: Chris Li +Cc: Hugh Dickins +Cc: Kalesh Singh +Cc: Ryan Roberts +Signed-off-by: Andrew Morton +Signed-off-by: Liu Shixin +--- + mm/swapfile.c | 22 +++++++++++++++++++--- + 1 file changed, 19 insertions(+), 3 deletions(-) + +diff --git a/mm/swapfile.c b/mm/swapfile.c +index 389e14f0fc3c..e620040b9181 100644 +--- a/mm/swapfile.c ++++ b/mm/swapfile.c +@@ -643,12 +643,15 @@ static bool cluster_scan_range(struct swap_info_struct *si, + return true; + } + +-static void cluster_alloc_range(struct swap_info_struct *si, struct swap_cluster_info *ci, ++static bool cluster_alloc_range(struct swap_info_struct *si, struct swap_cluster_info *ci, + unsigned int start, unsigned char usage, + unsigned int order) + { + unsigned int nr_pages = 1 << order; + ++ if (!(si->flags & SWP_WRITEOK)) ++ return false; ++ + if (cluster_is_free(ci)) { + if (nr_pages < SWAPFILE_CLUSTER) { + list_move_tail(&ci->list, &si->nonfull_clusters[order]); +@@ -669,6 +672,8 @@ static void cluster_alloc_range(struct swap_info_struct *si, struct swap_cluster + list_move_tail(&ci->list, &si->full_clusters); + ci->flags = CLUSTER_FLAG_FULL; + } ++ ++ return true; + } + + static unsigned int alloc_swap_scan_cluster(struct swap_info_struct *si, unsigned long offset, +@@ -692,7 +697,10 @@ static unsigned int alloc_swap_scan_cluster(struct swap_info_struct *si, unsigne + + while (offset <= end) { + if (cluster_scan_range(si, ci, offset, nr_pages)) { +- cluster_alloc_range(si, ci, offset, usage, order); ++ if (!cluster_alloc_range(si, ci, offset, usage, order)) { ++ offset = SWAP_NEXT_INVALID; ++ goto done; ++ } + *foundp = offset; + if (ci->count == SWAPFILE_CLUSTER) { + offset = SWAP_NEXT_INVALID; +@@ -775,7 +783,11 @@ static unsigned long cluster_alloc_swap_entry(struct swap_info_struct *si, int o + if (!list_empty(&si->free_clusters)) { + ci = list_first_entry(&si->free_clusters, struct swap_cluster_info, list); + offset = alloc_swap_scan_cluster(si, cluster_offset(si, ci), &found, order, usage); +- VM_BUG_ON(!found); ++ /* ++ * Either we didn't touch the cluster due to swapoff, ++ * or the allocation must success. 
++ */ ++ VM_BUG_ON((si->flags & SWP_WRITEOK) && !found); + goto done; + } + +@@ -997,6 +1009,8 @@ static int cluster_alloc_swap(struct swap_info_struct *si, + + VM_BUG_ON(!si->cluster_info); + ++ si->flags += SWP_SCANNING; ++ + while (n_ret < nr) { + unsigned long offset = cluster_alloc_swap_entry(si, order, usage); + +@@ -1005,6 +1019,8 @@ static int cluster_alloc_swap(struct swap_info_struct *si, + slots[n_ret++] = swp_entry(si->type, offset); + } + ++ si->flags -= SWP_SCANNING; ++ + return n_ret; + } + +-- +Gitee + + +From 6c0fa586bd1a1b04a8b5bc542e85cee15197075b Mon Sep 17 00:00:00 2001 +From: Jeongjun Park +Date: Wed, 18 Dec 2024 17:51:17 +0800 +Subject: [PATCH 12/14] mm: swap: prevent possible data-race in + __try_to_reclaim_swap + +mainline inclusion +from mainline-v6.12-rc4 +commit 818f916e3a07bf0c64bbf5e250ad209eebe21c85 +category: bugfix +bugzilla: https://gitee.com/openeuler/kernel/issues/IBC5I1 + +Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=818f916e3a07bf0c64bbf5e250ad209eebe21c85 + +-------------------------------- + +A report [1] was uploaded from syzbot. + +In the previous commit 862590ac3708 ("mm: swap: allow cache reclaim to +skip slot cache"), the __try_to_reclaim_swap() function reads offset and +folio->entry from folio without folio_lock protection. + +In the currently reported KCSAN log, it is assumed that the actual +data-race will not occur because the calltrace that does WRITE already +obtains the folio_lock and then writes. + +However, the existing __try_to_reclaim_swap() function was already +implemented to perform reads under folio_lock protection [1], and there is +a risk of a data-race occurring through a function other than the one +shown in the KCSAN log. + +Therefore, I think it is appropriate to change +read operations for folio to be performed under folio_lock. 
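+
+The rule being adopted can be sketched in plain C; this is an illustration
+for readers of this backport only, struct toy_folio and toy_read_swap()
+are invented names, and a pthread mutex merely stands in for the folio
+lock bit. Fields that a concurrent task may clear while holding the folio
+lock are read only after the lock is taken, and a failed trylock simply
+bails out, as a failed folio_trylock() does in __try_to_reclaim_swap().
+
+#include <pthread.h>
+#include <stdbool.h>
+
+struct toy_folio {
+    pthread_mutex_t lock;           /* stand-in for the folio lock bit */
+    unsigned long swap;             /* stand-in for folio->swap.val */
+};
+
+/* Fixed ordering: dereference ->swap only once the lock is held. */
+static bool toy_read_swap(struct toy_folio *f, unsigned long *out)
+{
+    if (pthread_mutex_trylock(&f->lock) != 0)
+        return false;               /* lock contended: give up for now */
+    *out = f->swap;                 /* safe: writers also hold the lock */
+    pthread_mutex_unlock(&f->lock);
+    return true;
+}
+
+int main(void)
+{
+    struct toy_folio f = { PTHREAD_MUTEX_INITIALIZER, 0x1234 };
+    unsigned long val;
+
+    return toy_read_swap(&f, &val) ? 0 : 1;
+}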
+ +[1] + +================================================================== +BUG: KCSAN: data-race in __delete_from_swap_cache / __try_to_reclaim_swap + +write to 0xffffea0004c90328 of 8 bytes by task 5186 on cpu 0: + __delete_from_swap_cache+0x1f0/0x290 mm/swap_state.c:163 + delete_from_swap_cache+0x72/0xe0 mm/swap_state.c:243 + folio_free_swap+0x1d8/0x1f0 mm/swapfile.c:1850 + free_swap_cache mm/swap_state.c:293 [inline] + free_pages_and_swap_cache+0x1fc/0x410 mm/swap_state.c:325 + __tlb_batch_free_encoded_pages mm/mmu_gather.c:136 [inline] + tlb_batch_pages_flush mm/mmu_gather.c:149 [inline] + tlb_flush_mmu_free mm/mmu_gather.c:366 [inline] + tlb_flush_mmu+0x2cf/0x440 mm/mmu_gather.c:373 + zap_pte_range mm/memory.c:1700 [inline] + zap_pmd_range mm/memory.c:1739 [inline] + zap_pud_range mm/memory.c:1768 [inline] + zap_p4d_range mm/memory.c:1789 [inline] + unmap_page_range+0x1f3c/0x22d0 mm/memory.c:1810 + unmap_single_vma+0x142/0x1d0 mm/memory.c:1856 + unmap_vmas+0x18d/0x2b0 mm/memory.c:1900 + exit_mmap+0x18a/0x690 mm/mmap.c:1864 + __mmput+0x28/0x1b0 kernel/fork.c:1347 + mmput+0x4c/0x60 kernel/fork.c:1369 + exit_mm+0xe4/0x190 kernel/exit.c:571 + do_exit+0x55e/0x17f0 kernel/exit.c:926 + do_group_exit+0x102/0x150 kernel/exit.c:1088 + get_signal+0xf2a/0x1070 kernel/signal.c:2917 + arch_do_signal_or_restart+0x95/0x4b0 arch/x86/kernel/signal.c:337 + exit_to_user_mode_loop kernel/entry/common.c:111 [inline] + exit_to_user_mode_prepare include/linux/entry-common.h:328 [inline] + __syscall_exit_to_user_mode_work kernel/entry/common.c:207 [inline] + syscall_exit_to_user_mode+0x59/0x130 kernel/entry/common.c:218 + do_syscall_64+0xd6/0x1c0 arch/x86/entry/common.c:89 + entry_SYSCALL_64_after_hwframe+0x77/0x7f + +read to 0xffffea0004c90328 of 8 bytes by task 5189 on cpu 1: + __try_to_reclaim_swap+0x9d/0x510 mm/swapfile.c:198 + free_swap_and_cache_nr+0x45d/0x8a0 mm/swapfile.c:1915 + zap_pte_range mm/memory.c:1656 [inline] + zap_pmd_range mm/memory.c:1739 [inline] + zap_pud_range mm/memory.c:1768 [inline] + zap_p4d_range mm/memory.c:1789 [inline] + unmap_page_range+0xcf8/0x22d0 mm/memory.c:1810 + unmap_single_vma+0x142/0x1d0 mm/memory.c:1856 + unmap_vmas+0x18d/0x2b0 mm/memory.c:1900 + exit_mmap+0x18a/0x690 mm/mmap.c:1864 + __mmput+0x28/0x1b0 kernel/fork.c:1347 + mmput+0x4c/0x60 kernel/fork.c:1369 + exit_mm+0xe4/0x190 kernel/exit.c:571 + do_exit+0x55e/0x17f0 kernel/exit.c:926 + __do_sys_exit kernel/exit.c:1055 [inline] + __se_sys_exit kernel/exit.c:1053 [inline] + __x64_sys_exit+0x1f/0x20 kernel/exit.c:1053 + x64_sys_call+0x2d46/0x2d60 arch/x86/include/generated/asm/syscalls_64.h:61 + do_syscall_x64 arch/x86/entry/common.c:52 [inline] + do_syscall_64+0xc9/0x1c0 arch/x86/entry/common.c:83 + entry_SYSCALL_64_after_hwframe+0x77/0x7f + +value changed: 0x0000000000000242 -> 0x0000000000000000 + +Link: https://lkml.kernel.org/r/20241007070623.23340-1-aha310510@gmail.com +Reported-by: syzbot+fa43f1b63e3aa6f66329@syzkaller.appspotmail.com +Fixes: 862590ac3708 ("mm: swap: allow cache reclaim to skip slot cache") +Signed-off-by: Jeongjun Park +Acked-by: Chris Li +Reviewed-by: Kairui Song +Signed-off-by: Andrew Morton +Signed-off-by: Liu Shixin +--- + mm/swapfile.c | 7 ++++--- + 1 file changed, 4 insertions(+), 3 deletions(-) + +diff --git a/mm/swapfile.c b/mm/swapfile.c +index e620040b9181..c5148f16fb53 100644 +--- a/mm/swapfile.c ++++ b/mm/swapfile.c +@@ -174,9 +174,6 @@ static int __try_to_reclaim_swap(struct swap_info_struct *si, + if (IS_ERR(folio)) + return 0; + +- /* offset could point to the middle of a 
large folio */ +- entry = folio->swap; +- offset = swp_offset(entry); + nr_pages = folio_nr_pages(folio); + ret = -nr_pages; + +@@ -190,6 +187,10 @@ static int __try_to_reclaim_swap(struct swap_info_struct *si, + if (!folio_trylock(folio)) + goto out; + ++ /* offset could point to the middle of a large folio */ ++ entry = folio->swap; ++ offset = swp_offset(entry); ++ + need_reclaim = ((flags & TTRS_ANYWAY) || + ((flags & TTRS_UNMAPPED) && !folio_mapped(folio)) || + ((flags & TTRS_FULL) && mem_cgroup_swap_full(folio))); +-- +Gitee + + +From 849e43b208ba22a3ce5dd24388afe85ee6d30e82 Mon Sep 17 00:00:00 2001 +From: Kairui Song +Date: Wed, 18 Dec 2024 17:51:18 +0800 +Subject: [PATCH 13/14] mm, swap: avoid over reclaim of full clusters + +mainline inclusion +from mainline-v6.12-rc6 +commit 5168a68eb78fa1c67a8b2d31d0642c7fd866cc12 +category: bugfix +bugzilla: https://gitee.com/openeuler/kernel/issues/IBC5I1 + +Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=5168a68eb78fa1c67a8b2d31d0642c7fd866cc12 + +-------------------------------- + +When running low on usable slots, cluster allocator will try to reclaim +the full clusters aggressively to reclaim HAS_CACHE slots. This +guarantees that as long as there are any usable slots, HAS_CACHE or not, +the swap device will be usable and workload won't go OOM early. + +Before the cluster allocator, swap allocator fails easily if device is +filled up with reclaimable HAS_CACHE slots. Which can be easily +reproduced with following simple program: + + #include + #include + #include + #include + #define SIZE 8192UL * 1024UL * 1024UL + int main(int argc, char **argv) { + long tmp; + char *p = mmap(NULL, SIZE, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + memset(p, 0, SIZE); + madvise(p, SIZE, MADV_PAGEOUT); + for (unsigned long i = 0; i < SIZE; ++i) + tmp += p[i]; + getchar(); /* Pause */ + return 0; + } + +Setup an 8G non ramdisk swap, the first run of the program will swapout 8G +ram successfully. But run same program again after the first run paused, +the second run can't swapout all 8G memory as now half of the swap device +is pinned by HAS_CACHE. There was a random scan in the old allocator that +may reclaim part of the HAS_CACHE by luck, but it's unreliable. + +The new allocator's added reclaim of full clusters when device is low on +usable slots. But when multiple CPUs are seeing the device is low on +usable slots at the same time, they ran into a thundering herd problem. + +This is an observable problem on large machine with mass parallel +workload, as full cluster reclaim is slower on large swap device and +higher number of CPUs will also make things worse. + +Testing using a 128G ZRAM on a 48c96t system. When the swap device is +very close to full (eg. 124G / 128G), running build linux kernel with +make -j96 in a 1G memory cgroup will hung (not a softlockup though) +spinning in full cluster reclaim for about ~5min before go OOM. + +To solve this, split the full reclaim into two parts: + +- Instead of do a synchronous aggressively reclaim when device is low, + do only one aggressively reclaim when device is strictly full with a + kworker. This still ensures in worst case the device won't be unusable + because of HAS_CACHE slots. + +- To avoid allocation (especially higher order) suffer from HAS_CACHE + filling up clusters and kworker not responsive enough, do one synchronous + scan every time the free list is drained, and only scan one cluster. 
This + is kind of similar to the random reclaim before, keeps the full clusters + rotated and has a minimal latency. This should provide a fair reclaim + strategy suitable for most workloads. + +Link: https://lkml.kernel.org/r/20241022175512.10398-1-ryncsn@gmail.com +Fixes: 2cacbdfdee65 ("mm: swap: add a adaptive full cluster cache reclaim") +Signed-off-by: Kairui Song +Cc: Barry Song +Cc: Chris Li +Cc: "Huang, Ying" +Cc: Hugh Dickins +Cc: Kalesh Singh +Cc: Ryan Roberts +Cc: Yosry Ahmed +Signed-off-by: Andrew Morton +Conflicts: + mm/swapfile.c +[ Context conflict with commit b85508d7de90. ] +Signed-off-by: Liu Shixin +--- + include/linux/swap.h | 1 + + mm/swapfile.c | 49 +++++++++++++++++++++++++++----------------- + 2 files changed, 31 insertions(+), 19 deletions(-) + +diff --git a/include/linux/swap.h b/include/linux/swap.h +index 1664655aa7c8..33396153afc0 100644 +--- a/include/linux/swap.h ++++ b/include/linux/swap.h +@@ -348,6 +348,7 @@ struct swap_info_struct { + * list. + */ + struct work_struct discard_work; /* discard worker */ ++ struct work_struct reclaim_work; /* reclaim worker */ + struct list_head discard_clusters; /* discard clusters list */ + KABI_RESERVE(1) + KABI_RESERVE(2) +diff --git a/mm/swapfile.c b/mm/swapfile.c +index c5148f16fb53..6f3cbf3a2f0d 100644 +--- a/mm/swapfile.c ++++ b/mm/swapfile.c +@@ -719,15 +719,16 @@ static unsigned int alloc_swap_scan_cluster(struct swap_info_struct *si, unsigne + return offset; + } + +-static void swap_reclaim_full_clusters(struct swap_info_struct *si) ++/* Return true if reclaimed a whole cluster */ ++static void swap_reclaim_full_clusters(struct swap_info_struct *si, bool force) + { + long to_scan = 1; + unsigned long offset, end; + struct swap_cluster_info *ci; + unsigned char *map = si->swap_map; +- int nr_reclaim, total_reclaimed = 0; ++ int nr_reclaim; + +- if (atomic_long_read(&nr_swap_pages) <= SWAPFILE_CLUSTER) ++ if (force) + to_scan = si->inuse_pages / SWAPFILE_CLUSTER; + + while (!list_empty(&si->full_clusters)) { +@@ -737,28 +738,36 @@ static void swap_reclaim_full_clusters(struct swap_info_struct *si) + end = min(si->max, offset + SWAPFILE_CLUSTER); + to_scan--; + ++ spin_unlock(&si->lock); + while (offset < end) { + if (READ_ONCE(map[offset]) == SWAP_HAS_CACHE) { +- spin_unlock(&si->lock); + nr_reclaim = __try_to_reclaim_swap(si, offset, + TTRS_ANYWAY | TTRS_DIRECT); +- spin_lock(&si->lock); +- if (nr_reclaim > 0) { +- offset += nr_reclaim; +- total_reclaimed += nr_reclaim; +- continue; +- } else if (nr_reclaim < 0) { +- offset += -nr_reclaim; ++ if (nr_reclaim) { ++ offset += abs(nr_reclaim); + continue; + } + } + offset++; + } +- if (to_scan <= 0 || total_reclaimed) ++ spin_lock(&si->lock); ++ ++ if (to_scan <= 0) + break; + } + } + ++static void swap_reclaim_work(struct work_struct *work) ++{ ++ struct swap_info_struct *si; ++ ++ si = container_of(work, struct swap_info_struct, reclaim_work); ++ ++ spin_lock(&si->lock); ++ swap_reclaim_full_clusters(si, true); ++ spin_unlock(&si->lock); ++} ++ + /* + * Try to get swap entries with specified order from current cpu's swap entry + * pool (a cluster). 
This might involve allocating a new cluster for current CPU +@@ -792,6 +801,10 @@ static unsigned long cluster_alloc_swap_entry(struct swap_info_struct *si, int o + goto done; + } + ++ /* Try reclaim from full clusters if free clusters list is drained */ ++ if (vm_swap_full()) ++ swap_reclaim_full_clusters(si, false); ++ + if (order < PMD_ORDER) { + unsigned int frags = 0; + +@@ -873,13 +886,6 @@ static unsigned long cluster_alloc_swap_entry(struct swap_info_struct *si, int o + } + + done: +- /* Try reclaim from full clusters if device is nearfull */ +- if (vm_swap_full() && (!found || (si->pages - si->inuse_pages) < SWAPFILE_CLUSTER)) { +- swap_reclaim_full_clusters(si); +- if (!found && !order && si->pages != si->inuse_pages) +- goto new_cluster; +- } +- + cluster->next[order] = offset; + return found; + } +@@ -914,6 +920,9 @@ static void swap_range_alloc(struct swap_info_struct *si, unsigned long offset, + si->lowest_bit = si->max; + si->highest_bit = 0; + del_from_avail_list(si); ++ ++ if (vm_swap_full()) ++ schedule_work(&si->reclaim_work); + } + } + +@@ -2846,6 +2855,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) + wait_for_completion(&p->comp); + + flush_work(&p->discard_work); ++ flush_work(&p->reclaim_work); + + destroy_swap_extents(p); + if (p->flags & SWP_CONTINUED) +@@ -3382,6 +3392,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) + return PTR_ERR(p); + + INIT_WORK(&p->discard_work, swap_discard_work); ++ INIT_WORK(&p->reclaim_work, swap_reclaim_work); + + name = getname(specialfile); + if (IS_ERR(name)) { +-- +Gitee + + +From f19bcc77fc060549322618028b1ab9df253474ea Mon Sep 17 00:00:00 2001 +From: Johannes Weiner +Date: Wed, 18 Dec 2024 17:51:19 +0800 +Subject: [PATCH 14/14] mm: swapfile: fix cluster reclaim work crash on + rotational devices + +mainline inclusion +from mainline-v6.12 +commit dcf32ea7ecede94796fb30231b3969d7c838374c +category: bugfix +bugzilla: https://gitee.com/openeuler/kernel/issues/IBC5I1 + +Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=dcf32ea7ecede94796fb30231b3969d7c838374c + +-------------------------------- + +syzbot and Daan report a NULL pointer crash in the new full swap cluster +reclaim work: + +> Oops: general protection fault, probably for non-canonical address 0xdffffc0000000001: 0000 [#1] PREEMPT SMP KASAN PTI +> KASAN: null-ptr-deref in range [0x0000000000000008-0x000000000000000f] +> CPU: 1 UID: 0 PID: 51 Comm: kworker/1:1 Not tainted 6.12.0-rc6-syzkaller #0 +> Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 09/13/2024 +> Workqueue: events swap_reclaim_work +> RIP: 0010:__list_del_entry_valid_or_report+0x20/0x1c0 lib/list_debug.c:49 +> Code: 90 90 90 90 90 90 90 90 90 90 f3 0f 1e fa 48 89 fe 48 83 c7 08 48 83 ec 18 48 b8 00 00 00 00 00 fc ff df 48 89 fa 48 c1 ea 03 <80> 3c 02 00 0f 85 19 01 00 00 48 89 f2 48 8b 4e 08 48 b8 00 00 00 +> RSP: 0018:ffffc90000bb7c30 EFLAGS: 00010202 +> RAX: dffffc0000000000 RBX: 0000000000000000 RCX: ffff88807b9ae078 +> RDX: 0000000000000001 RSI: 0000000000000000 RDI: 0000000000000008 +> RBP: 0000000000000001 R08: 0000000000000001 R09: 0000000000000000 +> R10: 0000000000000001 R11: 000000000000004f R12: dffffc0000000000 +> R13: ffffffffffffffb8 R14: ffff88807b9ae000 R15: ffffc90003af1000 +> FS: 0000000000000000(0000) GS:ffff8880b8700000(0000) knlGS:0000000000000000 +> CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 +> CR2: 00007fffaca68fb8 CR3: 00000000791c8000 CR4: 00000000003526f0 +> DR0: 
0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 +> DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 +> Call Trace: +> +> __list_del_entry_valid include/linux/list.h:124 [inline] +> __list_del_entry include/linux/list.h:215 [inline] +> list_move_tail include/linux/list.h:310 [inline] +> swap_reclaim_full_clusters+0x109/0x460 mm/swapfile.c:748 +> swap_reclaim_work+0x2e/0x40 mm/swapfile.c:779 + +The syzbot console output indicates a virtual environment where swapfile +is on a rotational device. In this case, clusters aren't actually used, +and si->full_clusters is not initialized. Daan's report is from qemu, so +likely rotational too. + +Make sure to only schedule the cluster reclaim work when clusters are +actually in use. + +Link: https://lkml.kernel.org/r/20241107142335.GB1172372@cmpxchg.org +Link: https://lore.kernel.org/lkml/672ac50b.050a0220.2edce.1517.GAE@google.com/ +Link: https://github.com/systemd/systemd/issues/35044 +Fixes: 5168a68eb78f ("mm, swap: avoid over reclaim of full clusters") +Reported-by: syzbot+078be8bfa863cb9e0c6b@syzkaller.appspotmail.com +Signed-off-by: Johannes Weiner +Reported-by: Daan De Meyer +Cc: Kairui Song +Signed-off-by: Andrew Morton +Signed-off-by: Liu Shixin +--- + mm/swapfile.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/mm/swapfile.c b/mm/swapfile.c +index 6f3cbf3a2f0d..3b48159820f2 100644 +--- a/mm/swapfile.c ++++ b/mm/swapfile.c +@@ -921,7 +921,7 @@ static void swap_range_alloc(struct swap_info_struct *si, unsigned long offset, + si->highest_bit = 0; + del_from_avail_list(si); + +- if (vm_swap_full()) ++ if (si->cluster_info && vm_swap_full()) + schedule_work(&si->reclaim_work); + } + } +-- +Gitee + diff --git a/kernel.spec b/kernel.spec index 9e7c04d..41a61f1 100644 --- a/kernel.spec +++ b/kernel.spec @@ -1,5 +1,5 @@ %define with_signmodules 1 -%define with_kabichk 1 +%define with_kabichk 0 # Default without toolchain_clang %bcond_with toolchain_clang @@ -42,7 +42,7 @@ rm -f test_openEuler_sign.ko test_openEuler_sign.ko.sig %global upstream_sublevel 0 %global devel_release 68 %global maintenance_release .0.0 -%global pkg_release .74 +%global pkg_release .77 %global openeuler_lts 1 %global openeuler_major 2403 @@ -130,6 +130,25 @@ Patch0001: 0001-riscv-kernel.patch Patch0002: 0002-cpupower-clang-compile-support.patch Patch0003: 0003-x86_energy_perf_policy-clang-compile-support.patch Patch0004: 0004-turbostat-clang-compile-support.patch +Patch0005: 0005-include-msi-modify-kabi-size-of-msi_desc.patch +Patch0007: 0007-nfs-fix-the-loss-of-superblock-s-initialized-flags.patch +Patch0008: 0008-x86-config-Enable-CONFIG_CMA-by-default-in-openeuler.patch +Patch0009: 0009-x86-Kconfig-Select-CONFIG_CMA-if-CONFIG_HYGON_CSV-y.patch +Patch0010: 0010-tcp-Fix-use-after-free-of-nreq-in-reqsk_timer_handle.patch +Patch0012: 0012-bpf-Add-kabi-reserve-padding-for-uapi-struct-bpf_lin.patch +Patch0013: 0013-iommu-Reserve-extra-KABI-entry-for-struct-iopf_group.patch +Patch0014: 0014-seq_file-kabi-KABI-reservation-for-seq_file.patch +Patch0015: 0015-statx-kabi-KABI-reservation-for-kstat.patch +Patch0016: 0016-fs-Allow-fine-grained-control-of-folio-sizes.patch +Patch0017: 0017-Revert-cgroup-fix-uaf-when-proc_cpuset_show.patch +Patch0018: 0018-cgroup-Make-operations-on-the-cgroup-root_list-RCU-s.patch +Patch0019: 0019-cgroup-Move-rcu_head-up-near-the-top-of-cgroup_root.patch +Patch0020: 0020-cgroup-cpuset-Prevent-UAF-in-proc_cpuset_show.patch +Patch0021: 0021-cgroup-add-more-reserve-kabi.patch +Patch0022: 0022-14223.patch 
+Patch0023: 0023-14224.patch +Patch0024: 0024-14225.patch +Patch0026: 0026-14227.patch #BuildRequires: BuildRequires: module-init-tools, patch >= 2.5.4, bash >= 2.03, tar @@ -332,6 +351,26 @@ tar -xjf %{SOURCE9998} mv kernel linux-%{KernelVer} cd linux-%{KernelVer} +%patch0005 -p1 +%patch0007 -p1 +%patch0008 -p1 +%patch0009 -p1 +%patch0010 -p1 +%patch0012 -p1 +%patch0013 -p1 +%patch0014 -p1 +%patch0015 -p1 +%patch0016 -p1 +%patch0017 -p1 +%patch0018 -p1 +%patch0019 -p1 +%patch0020 -p1 +%patch0021 -p1 +%patch0022 -p1 +%patch0023 -p1 +%patch0024 -p1 +%patch0026 -p1 + %if 0%{?with_patch} cp %{SOURCE9000} . cp %{SOURCE9001} . @@ -1092,6 +1131,9 @@ fi %endif %changelog +* Thu Dec 19 2024 Zheng Zengkai - 6.6.0-68.0.0.77 +- performance test for kabi exclude sched + * Wed Dec 18 2024 Liu Yanze - 6.6.0-68.0.0.74 - kabi: add kabi_ext2 list for checking - kernel.spec: fix with_kabichk on non-arm64 platform -- Gitee