From cdb4b4ae3c7fa3dcc2fb1c0c55968e681251bc25 Mon Sep 17 00:00:00 2001 From: Zheng Zengkai Date: Wed, 18 Dec 2024 17:40:25 +0800 Subject: [PATCH] performance test for kabi preserve 202412192113 Signed-off-by: Zheng Zengkai --- ...ude-msi-modify-kabi-size-of-msi_desc.patch | 45 + ...ss-of-superblock-s-initialized-flags.patch | 40 + ...e-CONFIG_CMA-by-default-in-openeuler.patch | 61 + ...ect-CONFIG_CMA-if-CONFIG_HYGON_CSV-y.patch | 35 + ...r-free-of-nreq-in-reqsk_timer_handle.patch | 60 + ...erve-padding-for-uapi-struct-bpf_lin.patch | 63 + ...tra-KABI-entry-for-struct-iopf_group.patch | 38 + ...e-kabi-KABI-reservation-for-seq_file.patch | 45 + ...tatx-kabi-KABI-reservation-for-kstat.patch | 38 + ...-fine-grained-control-of-folio-sizes.patch | 200 + ...cgroup-fix-uaf-when-proc_cpuset_show.patch | 68 + ...ations-on-the-cgroup-root_list-RCU-s.patch | 145 + ..._head-up-near-the-top-of-cgroup_root.patch | 84 + ...uset-Prevent-UAF-in-proc_cpuset_show.patch | 110 + 0021-cgroup-add-more-reserve-kabi.patch | 90 + 0022-14223.patch | 80 + 0023-14224.patch | 85 + 0024-14225.patch | 154 + 0026-14227.patch | 3464 +++++++++++++++++ kernel.spec | 46 +- 20 files changed, 4949 insertions(+), 2 deletions(-) create mode 100644 0005-include-msi-modify-kabi-size-of-msi_desc.patch create mode 100644 0007-nfs-fix-the-loss-of-superblock-s-initialized-flags.patch create mode 100644 0008-x86-config-Enable-CONFIG_CMA-by-default-in-openeuler.patch create mode 100644 0009-x86-Kconfig-Select-CONFIG_CMA-if-CONFIG_HYGON_CSV-y.patch create mode 100644 0010-tcp-Fix-use-after-free-of-nreq-in-reqsk_timer_handle.patch create mode 100644 0012-bpf-Add-kabi-reserve-padding-for-uapi-struct-bpf_lin.patch create mode 100644 0013-iommu-Reserve-extra-KABI-entry-for-struct-iopf_group.patch create mode 100644 0014-seq_file-kabi-KABI-reservation-for-seq_file.patch create mode 100644 0015-statx-kabi-KABI-reservation-for-kstat.patch create mode 100644 0016-fs-Allow-fine-grained-control-of-folio-sizes.patch create mode 100644 0017-Revert-cgroup-fix-uaf-when-proc_cpuset_show.patch create mode 100644 0018-cgroup-Make-operations-on-the-cgroup-root_list-RCU-s.patch create mode 100644 0019-cgroup-Move-rcu_head-up-near-the-top-of-cgroup_root.patch create mode 100644 0020-cgroup-cpuset-Prevent-UAF-in-proc_cpuset_show.patch create mode 100644 0021-cgroup-add-more-reserve-kabi.patch create mode 100644 0022-14223.patch create mode 100644 0023-14224.patch create mode 100644 0024-14225.patch create mode 100644 0026-14227.patch diff --git a/0005-include-msi-modify-kabi-size-of-msi_desc.patch b/0005-include-msi-modify-kabi-size-of-msi_desc.patch new file mode 100644 index 0000000..79c77ab --- /dev/null +++ b/0005-include-msi-modify-kabi-size-of-msi_desc.patch @@ -0,0 +1,45 @@ +From 723d41836db7669ab658d3e07c62fcbe17d7d7f4 Mon Sep 17 00:00:00 2001 +From: zhengjunlong +Date: Fri, 11 Oct 2024 17:08:35 +0800 +Subject: [PATCH 01/17] include/msi: modify kabi size of msi_desc + +hulk inclusion +category: feature +bugzilla: https://gitee.com/openeuler/kernel/issues/IAW8JF + +---------------------------------------------------- + +Change the size of the pre-embedded memory for msi_desc to 40 bytes. 
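+
+To make the arithmetic concrete: assuming each KABI_RESERVE() slot is one
+8-byte word on 64-bit builds, keeping five slots inside the union gives
+5 * 8 = 40 bytes. A minimal sketch of the resulting layout (the padding
+member name here is only illustrative, the real code keeps using the KABI
+macros):
+
+        union {
+                struct pci_msi_desc pci;
+                struct msi_desc_data data;
+                u64 kabi_padding[5];    /* illustrative name: 40 bytes kept for KABI */
+        };
+
+The KABI_EXTEND_WITH_SIZE(KABI_RESERVE(1), 5) form in the hunk below
+expresses the same five-word reservation through the KABI helper macros.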
+ +Signed-off-by: Zheng Junlong +--- + include/linux/msi.h | 11 ++++------- + 1 file changed, 4 insertions(+), 7 deletions(-) + +diff --git a/include/linux/msi.h b/include/linux/msi.h +index 7354ffb14856..5fd8a6caae98 100644 +--- a/include/linux/msi.h ++++ b/include/linux/msi.h +@@ -205,15 +205,12 @@ struct msi_desc { + union { + struct pci_msi_desc pci; + struct msi_desc_data data; +- KABI_RESERVE(1) +- KABI_RESERVE(2) +- KABI_RESERVE(3) +- KABI_RESERVE(4) ++ KABI_EXTEND_WITH_SIZE(KABI_RESERVE(1), 5) + }; ++ KABI_RESERVE(2) ++ KABI_RESERVE(3) ++ KABI_RESERVE(4) + KABI_RESERVE(5) +- KABI_RESERVE(6) +- KABI_RESERVE(7) +- KABI_RESERVE(8) + }; + + /* +-- +2.25.1 + diff --git a/0007-nfs-fix-the-loss-of-superblock-s-initialized-flags.patch b/0007-nfs-fix-the-loss-of-superblock-s-initialized-flags.patch new file mode 100644 index 0000000..1d3c32f --- /dev/null +++ b/0007-nfs-fix-the-loss-of-superblock-s-initialized-flags.patch @@ -0,0 +1,40 @@ +From e68e6e3cf90ec8fb7893057c768d55e83855aaa0 Mon Sep 17 00:00:00 2001 +From: Li Lingfeng +Date: Mon, 16 Dec 2024 20:15:25 +0800 +Subject: [PATCH 03/17] nfs: fix the loss of superblock's initialized flags + +hulk inclusion +category: bugfix +bugzilla: https://gitee.com/openeuler/kernel/issues/IB42W1 + +-------------------------------- + +Commit 573573887e0b ("nfs: pass flags to second superblock") directly +assigns fc->sb_flags to dentry->d_sb->s_flags, which will cause the loss +of the initialized flags in dentry->d_sb->s_flags. + +Fix it by just passing SB_RDONLY from fc->sb_flags to +dentry->d_sb->s_flags. + +Fixes: 573573887e0b ("nfs: pass flags to second superblock") +Signed-off-by: Li Lingfeng +--- + fs/nfs/nfs4super.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/fs/nfs/nfs4super.c b/fs/nfs/nfs4super.c +index bb13894ad152..e87f878178f3 100644 +--- a/fs/nfs/nfs4super.c ++++ b/fs/nfs/nfs4super.c +@@ -209,7 +209,7 @@ static int do_nfs4_mount(struct nfs_server *server, + if (IS_ERR(dentry)) + return PTR_ERR(dentry); + +- dentry->d_sb->s_flags = fc->sb_flags; ++ dentry->d_sb->s_flags |= (fc->sb_flags & SB_RDONLY); + fc->root = dentry; + return 0; + } +-- +2.25.1 + diff --git a/0008-x86-config-Enable-CONFIG_CMA-by-default-in-openeuler.patch b/0008-x86-config-Enable-CONFIG_CMA-by-default-in-openeuler.patch new file mode 100644 index 0000000..f9c3ab2 --- /dev/null +++ b/0008-x86-config-Enable-CONFIG_CMA-by-default-in-openeuler.patch @@ -0,0 +1,61 @@ +From 844a44e5a21be8062fd0c120a75e9ecf97427ae8 Mon Sep 17 00:00:00 2001 +From: hanliyang +Date: Mon, 16 Dec 2024 20:44:36 +0800 +Subject: [PATCH 04/17] x86/config: Enable CONFIG_CMA by default in + openeuler_defconfig + +hygon inclusion +category: feature +bugzilla: https://gitee.com/openeuler/kernel/issues/IBBNJI +CVE: NA + +--------------------------- + +Enable CONFIG_CMA will change kabi. + +Enable CONFIG_CMA will also enable CONFIG_DMA_CMA. 
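+
+For context, CONFIG_DMA_CMA=y lets ordinary coherent DMA allocations fall
+back to the contiguous area, so drivers need no CMA-specific API. A
+minimal sketch, assuming an already probed device "dev" (note that with
+CONFIG_CMA_SIZE_MBYTES=0 below, a default area only exists if one is set
+up, e.g. via the cma= kernel command line):
+
+        #include <linux/dma-mapping.h>
+        #include <linux/sizes.h>
+
+        dma_addr_t dma_handle;
+        /* a large buffer like this can be served from the CMA area even
+         * when the buddy allocator has no free high-order pages left
+         */
+        void *buf = dma_alloc_coherent(dev, SZ_4M, &dma_handle, GFP_KERNEL);
+        if (!buf)
+                return -ENOMEM;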
+ +Signed-off-by: hanliyang +--- + arch/x86/configs/openeuler_defconfig | 18 +++++++++++++++++- + 1 file changed, 17 insertions(+), 1 deletion(-) + +diff --git a/arch/x86/configs/openeuler_defconfig b/arch/x86/configs/openeuler_defconfig +index 8e8542796a13..adfaef0cb10c 100644 +--- a/arch/x86/configs/openeuler_defconfig ++++ b/arch/x86/configs/openeuler_defconfig +@@ -1158,7 +1158,11 @@ CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK=y + CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK=y + CONFIG_USE_PERCPU_NUMA_NODE_ID=y + CONFIG_HAVE_SETUP_PER_CPU_AREA=y +-# CONFIG_CMA is not set ++CONFIG_CMA=y ++# CONFIG_CMA_DEBUG is not set ++# CONFIG_CMA_DEBUGFS is not set ++# CONFIG_CMA_SYSFS is not set ++CONFIG_CMA_AREAS=19 + CONFIG_MEM_SOFT_DIRTY=y + CONFIG_GENERIC_EARLY_IOREMAP=y + CONFIG_DEFERRED_STRUCT_PAGE_INIT=y +@@ -9018,6 +9022,18 @@ CONFIG_ARCH_HAS_FORCE_DMA_UNENCRYPTED=y + CONFIG_SWIOTLB=y + # CONFIG_SWIOTLB_DYNAMIC is not set + CONFIG_DMA_COHERENT_POOL=y ++CONFIG_DMA_CMA=y ++# CONFIG_DMA_NUMA_CMA is not set ++ ++# ++# Default contiguous memory area size: ++# ++CONFIG_CMA_SIZE_MBYTES=0 ++CONFIG_CMA_SIZE_SEL_MBYTES=y ++# CONFIG_CMA_SIZE_SEL_PERCENTAGE is not set ++# CONFIG_CMA_SIZE_SEL_MIN is not set ++# CONFIG_CMA_SIZE_SEL_MAX is not set ++CONFIG_CMA_ALIGNMENT=8 + # CONFIG_DMA_API_DEBUG is not set + # CONFIG_DMA_MAP_BENCHMARK is not set + CONFIG_SGL_ALLOC=y +-- +2.25.1 + diff --git a/0009-x86-Kconfig-Select-CONFIG_CMA-if-CONFIG_HYGON_CSV-y.patch b/0009-x86-Kconfig-Select-CONFIG_CMA-if-CONFIG_HYGON_CSV-y.patch new file mode 100644 index 0000000..79f223e --- /dev/null +++ b/0009-x86-Kconfig-Select-CONFIG_CMA-if-CONFIG_HYGON_CSV-y.patch @@ -0,0 +1,35 @@ +From f0e6b8ca2a5b0bc1347906ff6b80422c4c9878b2 Mon Sep 17 00:00:00 2001 +From: hanliyang +Date: Mon, 16 Dec 2024 20:52:08 +0800 +Subject: [PATCH 05/17] x86/Kconfig: Select CONFIG_CMA if CONFIG_HYGON_CSV=y + +hygon inclusion +category: feature +bugzilla: https://gitee.com/openeuler/kernel/issues/IBBNJI +CVE: NA + +--------------------------- + +The Hygon CSV3 use CMA to manage CSV3 guest's private memory. If the +CONFIG_HYGON_CSV is enabled, then enable CONFIG_CMA automatically. + +Signed-off-by: hanliyang +--- + arch/x86/Kconfig | 1 + + 1 file changed, 1 insertion(+) + +diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig +index fcd0c3b2065d..a6bbe6029121 100644 +--- a/arch/x86/Kconfig ++++ b/arch/x86/Kconfig +@@ -2075,6 +2075,7 @@ config HYGON_CSV + bool "Hygon secure virtualization CSV support" + default y + depends on CPU_SUP_HYGON && AMD_MEM_ENCRYPT ++ select CONFIG_CMA + help + Hygon CSV integrates secure processor, memory encryption and + memory isolation to provide the ability to protect guest's private +-- +2.25.1 + diff --git a/0010-tcp-Fix-use-after-free-of-nreq-in-reqsk_timer_handle.patch b/0010-tcp-Fix-use-after-free-of-nreq-in-reqsk_timer_handle.patch new file mode 100644 index 0000000..a07a0a5 --- /dev/null +++ b/0010-tcp-Fix-use-after-free-of-nreq-in-reqsk_timer_handle.patch @@ -0,0 +1,60 @@ +From 44c5a161852ac117a94ed7748784aecaab552b47 Mon Sep 17 00:00:00 2001 +From: Kuniyuki Iwashima +Date: Tue, 17 Dec 2024 16:33:23 +0800 +Subject: [PATCH 06/17] tcp: Fix use-after-free of nreq in + reqsk_timer_handler(). 
+ +stable inclusion +from stable-v6.6.64 +commit 65ed89cad1f57034c256b016e89e8c0a4ec7c65b +category: bugfix +bugzilla: https://gitee.com/openeuler/kernel/issues/IBA6RL +CVE: NA + +Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=65ed89cad1f57034c256b016e89e8c0a4ec7c65b + +------------------------------------------------- + +[ Upstream commit c31e72d021db2714df03df6c42855a1db592716c ] + +The cited commit replaced inet_csk_reqsk_queue_drop_and_put() with +__inet_csk_reqsk_queue_drop() and reqsk_put() in reqsk_timer_handler(). + +Then, oreq should be passed to reqsk_put() instead of req; otherwise +use-after-free of nreq could happen when reqsk is migrated but the +retry attempt failed (e.g. due to timeout). + +Let's pass oreq to reqsk_put(). + +Fixes: e8c526f2bdf1 ("tcp/dccp: Don't use timer_pending() in reqsk_queue_unlink().") +Reported-by: Liu Jian +Closes: https://lore.kernel.org/netdev/1284490f-9525-42ee-b7b8-ccadf6606f6d@huawei.com/ +Signed-off-by: Kuniyuki Iwashima +Reviewed-by: Vadim Fedorenko +Reviewed-by: Liu Jian +Reviewed-by: Eric Dumazet +Reviewed-by: Martin KaFai Lau +Link: https://patch.msgid.link/20241123174236.62438-1-kuniyu@amazon.com +Signed-off-by: Paolo Abeni +Signed-off-by: Sasha Levin +Signed-off-by: Liu Jian +--- + net/ipv4/inet_connection_sock.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c +index ca8cc0988b61..bd032ac2376e 100644 +--- a/net/ipv4/inet_connection_sock.c ++++ b/net/ipv4/inet_connection_sock.c +@@ -1124,7 +1124,7 @@ static void reqsk_timer_handler(struct timer_list *t) + + drop: + __inet_csk_reqsk_queue_drop(sk_listener, oreq, true); +- reqsk_put(req); ++ reqsk_put(oreq); + } + + static bool reqsk_queue_hash_req(struct request_sock *req, +-- +2.25.1 + diff --git a/0012-bpf-Add-kabi-reserve-padding-for-uapi-struct-bpf_lin.patch b/0012-bpf-Add-kabi-reserve-padding-for-uapi-struct-bpf_lin.patch new file mode 100644 index 0000000..9a95845 --- /dev/null +++ b/0012-bpf-Add-kabi-reserve-padding-for-uapi-struct-bpf_lin.patch @@ -0,0 +1,63 @@ +From c189729809e4c7a6298126a76db608da2b571240 Mon Sep 17 00:00:00 2001 +From: Pu Lehui +Date: Wed, 18 Dec 2024 06:24:00 +0000 +Subject: [PATCH 08/17] bpf: Add kabi reserve padding for uapi struct + bpf_link_info + +hulk inclusion +category: feature +bugzilla: https://gitee.com/openeuler/kernel/issues/IBC248 + +-------------------------------- + +Add kabi reserve padding for uapi struct bpf_link_info + +Signed-off-by: Pu Lehui +--- + include/uapi/linux/bpf.h | 9 +++++++++ + tools/include/uapi/linux/bpf.h | 9 +++++++++ + 2 files changed, 18 insertions(+) + +diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h +index 482647774bf5..a660cb68c853 100644 +--- a/include/uapi/linux/bpf.h ++++ b/include/uapi/linux/bpf.h +@@ -6573,6 +6573,15 @@ struct bpf_link_info { + __u64 config; + __u32 type; + } event; /* BPF_PERF_EVENT_EVENT */ ++ struct { ++ __u64:64; ++ __u32:32; ++ __u32:32; ++ __u64:64; ++ __u64:64; ++ __u64:64; ++ __u64:64; ++ } kabi_reserve; + }; + } perf_event; + struct { +diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h +index c112c6f7c766..9b302242be6c 100644 +--- a/tools/include/uapi/linux/bpf.h ++++ b/tools/include/uapi/linux/bpf.h +@@ -6576,6 +6576,15 @@ struct bpf_link_info { + __u64 config; + __u32 type; + } event; /* BPF_PERF_EVENT_EVENT */ ++ struct { ++ __u64:64; ++ __u32:32; ++ __u32:32; ++ __u64:64; ++ __u64:64; ++ __u64:64; ++ __u64:64; ++ } 
kabi_reserve; + }; + } perf_event; + struct { +-- +2.25.1 + diff --git a/0013-iommu-Reserve-extra-KABI-entry-for-struct-iopf_group.patch b/0013-iommu-Reserve-extra-KABI-entry-for-struct-iopf_group.patch new file mode 100644 index 0000000..43e830c --- /dev/null +++ b/0013-iommu-Reserve-extra-KABI-entry-for-struct-iopf_group.patch @@ -0,0 +1,38 @@ +From bbfb8fd7b1297acf7769a814f3fbf919afd391dc Mon Sep 17 00:00:00 2001 +From: Zhang Zekun +Date: Wed, 18 Dec 2024 14:43:35 +0800 +Subject: [PATCH 09/17] iommu: Reserve extra KABI entry for struct iopf_group + +hulk inclusion +category: feature +bugzilla: https://gitee.com/openeuler/kernel/issues/IBBRHP + +--------------------------------------------------------------- + +The list_head entry in iopf_group has been moved to iopf_group_extend +for KABI compatibility and the lack of KABI reserve entry. Reserve extra +kabi entry for future usage. + +Signed-off-by: Zhang Zekun +--- + include/linux/iommu.h | 4 ++++ + 1 file changed, 4 insertions(+) + +diff --git a/include/linux/iommu.h b/include/linux/iommu.h +index bb463cb96a44..83ec4bf9809e 100644 +--- a/include/linux/iommu.h ++++ b/include/linux/iommu.h +@@ -155,6 +155,10 @@ struct iopf_group { + KABI_USE(2, u32 cookie) + KABI_RESERVE(3) + KABI_RESERVE(4) ++ KABI_RESERVE(5) ++ KABI_RESERVE(6) ++ KABI_RESERVE(7) ++ KABI_RESERVE(8) + }; + + struct iopf_group_extend { +-- +2.25.1 + diff --git a/0014-seq_file-kabi-KABI-reservation-for-seq_file.patch b/0014-seq_file-kabi-KABI-reservation-for-seq_file.patch new file mode 100644 index 0000000..371e3af --- /dev/null +++ b/0014-seq_file-kabi-KABI-reservation-for-seq_file.patch @@ -0,0 +1,45 @@ +From 1cb26ea1471efb775f2aa141863e82efead07d61 Mon Sep 17 00:00:00 2001 +From: Baokun Li +Date: Wed, 18 Dec 2024 15:21:56 +0800 +Subject: [PATCH 10/17] seq_file: kabi: KABI reservation for seq_file + +hulk inclusion +category: feature +bugzilla: https://gitee.com/openeuler/kernel/issues/IBC34X + +---------------------------------------------------------------------- + + structure size reserves reserved + seq_file 120 1 128 + seq_operations 32 1 40 + +Signed-off-by: Baokun Li +--- + include/linux/seq_file.h | 4 ++++ + 1 file changed, 4 insertions(+) + +diff --git a/include/linux/seq_file.h b/include/linux/seq_file.h +index 234bcdb1fba4..cf4a2258df85 100644 +--- a/include/linux/seq_file.h ++++ b/include/linux/seq_file.h +@@ -27,6 +27,8 @@ struct seq_file { + int poll_event; + const struct file *file; + void *private; ++ ++ KABI_RESERVE(1) + }; + + struct seq_operations { +@@ -34,6 +36,8 @@ struct seq_operations { + void (*stop) (struct seq_file *m, void *v); + void * (*next) (struct seq_file *m, void *v, loff_t *pos); + int (*show) (struct seq_file *m, void *v); ++ ++ KABI_RESERVE(1) + }; + + #define SEQ_SKIP 1 +-- +2.25.1 + diff --git a/0015-statx-kabi-KABI-reservation-for-kstat.patch b/0015-statx-kabi-KABI-reservation-for-kstat.patch new file mode 100644 index 0000000..12b7151 --- /dev/null +++ b/0015-statx-kabi-KABI-reservation-for-kstat.patch @@ -0,0 +1,38 @@ +From ed5b59b6c40d2563994c1f7b5a1321affb490d45 Mon Sep 17 00:00:00 2001 +From: Baokun Li +Date: Wed, 18 Dec 2024 15:23:01 +0800 +Subject: [PATCH 11/17] statx: kabi: KABI reservation for kstat + +hulk inclusion +category: feature +bugzilla: https://gitee.com/openeuler/kernel/issues/IBC24E + +---------------------------------------------------------------------- + + structure size reserves reserved mainline + kstat 160 4 192 184 + +Signed-off-by: Baokun Li +--- + include/linux/stat.h | 5 +++++ + 1 file changed, 5 
insertions(+) + +diff --git a/include/linux/stat.h b/include/linux/stat.h +index 52150570d37a..d342e89b7aaa 100644 +--- a/include/linux/stat.h ++++ b/include/linux/stat.h +@@ -53,6 +53,11 @@ struct kstat { + u32 dio_mem_align; + u32 dio_offset_align; + u64 change_cookie; ++ ++ KABI_RESERVE(1) ++ KABI_RESERVE(2) ++ KABI_RESERVE(3) ++ KABI_RESERVE(4) + }; + + /* These definitions are internal to the kernel for now. Mainly used by nfsd. */ +-- +2.25.1 + diff --git a/0016-fs-Allow-fine-grained-control-of-folio-sizes.patch b/0016-fs-Allow-fine-grained-control-of-folio-sizes.patch new file mode 100644 index 0000000..ca2556d --- /dev/null +++ b/0016-fs-Allow-fine-grained-control-of-folio-sizes.patch @@ -0,0 +1,200 @@ +From 30f7b1506ec798949e6ce99c023780b0306845c9 Mon Sep 17 00:00:00 2001 +From: "Matthew Wilcox (Oracle)" +Date: Wed, 18 Dec 2024 15:31:44 +0800 +Subject: [PATCH 12/17] fs: Allow fine-grained control of folio sizes + +mainline inclusion +from mainline-v6.10-rc2 +commit 84429b675bcfd2a518ae167ee4661cdf7539aa7d +category: feature +bugzilla: https://gitee.com/openeuler/kernel/issues/IBC20Q + +Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=84429b675bcfd2a518ae167ee4661cdf7539aa7d + +-------------------------------- + +We need filesystems to be able to communicate acceptable folio sizes +to the pagecache for a variety of uses (e.g. large block sizes). +Support a range of folio sizes between order-0 and order-31. + +Signed-off-by: Matthew Wilcox (Oracle) +Co-developed-by: Pankaj Raghav +Signed-off-by: Pankaj Raghav +Link: https://lore.kernel.org/r/20240822135018.1931258-2-kernel@pankajraghav.com +Tested-by: David Howells +Reviewed-by: Hannes Reinecke +Reviewed-by: Darrick J. Wong +Reviewed-by: Daniel Gomez +Signed-off-by: Christian Brauner +Conflicts: + include/linux/pagemap.h + mm/filemap.c +[Conflicts due to not merged 83ee0e20fd9f ("filemap: support disable large +folios on active inode")] +Signed-off-by: Long Li +--- + include/linux/pagemap.h | 90 +++++++++++++++++++++++++++++++++++------ + mm/readahead.c | 4 +- + 2 files changed, 79 insertions(+), 15 deletions(-) + +diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h +index 429627abfef4..e44e377661f2 100644 +--- a/include/linux/pagemap.h ++++ b/include/linux/pagemap.h +@@ -203,12 +203,21 @@ enum mapping_flags { + AS_EXITING = 4, /* final truncate in progress */ + /* writeback related tags are not used */ + AS_NO_WRITEBACK_TAGS = 5, +- AS_LARGE_FOLIO_SUPPORT = 6, +- AS_RELEASE_ALWAYS, /* Call ->release_folio(), even if no private data */ +- AS_STABLE_WRITES, /* must wait for writeback before modifying ++ AS_RELEASE_ALWAYS = 6, /* Call ->release_folio(), even if no private data */ ++ AS_STABLE_WRITES = 7, /* must wait for writeback before modifying + folio contents */ ++ AS_INACCESSIBLE = 8, /* Do not attempt direct R/W access to the mapping */ ++ /* Bits 16-25 are used for FOLIO_ORDER */ ++ AS_FOLIO_ORDER_BITS = 5, ++ AS_FOLIO_ORDER_MIN = 16, ++ AS_FOLIO_ORDER_MAX = AS_FOLIO_ORDER_MIN + AS_FOLIO_ORDER_BITS, + }; + ++#define AS_FOLIO_ORDER_BITS_MASK ((1u << AS_FOLIO_ORDER_BITS) - 1) ++#define AS_FOLIO_ORDER_MIN_MASK (AS_FOLIO_ORDER_BITS_MASK << AS_FOLIO_ORDER_MIN) ++#define AS_FOLIO_ORDER_MAX_MASK (AS_FOLIO_ORDER_BITS_MASK << AS_FOLIO_ORDER_MAX) ++#define AS_FOLIO_ORDER_MASK (AS_FOLIO_ORDER_MIN_MASK | AS_FOLIO_ORDER_MAX_MASK) ++ + /** + * mapping_set_error - record a writeback error in the address_space + * @mapping: the mapping in which an error should be set +@@ -348,9 +357,51 @@ 
static inline void mapping_set_gfp_mask(struct address_space *m, gfp_t mask) + #define MAX_XAS_ORDER (XA_CHUNK_SHIFT * 2 - 1) + #define MAX_PAGECACHE_ORDER min(MAX_XAS_ORDER, PREFERRED_MAX_PAGECACHE_ORDER) + ++/* ++ * mapping_set_folio_order_range() - Set the orders supported by a file. ++ * @mapping: The address space of the file. ++ * @min: Minimum folio order (between 0-MAX_PAGECACHE_ORDER inclusive). ++ * @max: Maximum folio order (between @min-MAX_PAGECACHE_ORDER inclusive). ++ * ++ * The filesystem should call this function in its inode constructor to ++ * indicate which base size (min) and maximum size (max) of folio the VFS ++ * can use to cache the contents of the file. This should only be used ++ * if the filesystem needs special handling of folio sizes (ie there is ++ * something the core cannot know). ++ * Do not tune it based on, eg, i_size. ++ * ++ * Context: This should not be called while the inode is active as it ++ * is non-atomic. ++ */ ++static inline void mapping_set_folio_order_range(struct address_space *mapping, ++ unsigned int min, ++ unsigned int max) ++{ ++ if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) ++ return; ++ ++ if (min > MAX_PAGECACHE_ORDER) ++ min = MAX_PAGECACHE_ORDER; ++ ++ if (max > MAX_PAGECACHE_ORDER) ++ max = MAX_PAGECACHE_ORDER; ++ ++ if (max < min) ++ max = min; ++ ++ mapping->flags = (mapping->flags & ~AS_FOLIO_ORDER_MASK) | ++ (min << AS_FOLIO_ORDER_MIN) | (max << AS_FOLIO_ORDER_MAX); ++} ++ ++static inline void mapping_set_folio_min_order(struct address_space *mapping, ++ unsigned int min) ++{ ++ mapping_set_folio_order_range(mapping, min, MAX_PAGECACHE_ORDER); ++} ++ + /** + * mapping_set_large_folios() - Indicate the file supports large folios. +- * @mapping: The file. ++ * @mapping: The address space of the file. 
+ * + * The filesystem should call this function in its inode constructor to + * indicate that the VFS can use large folios to cache the contents of +@@ -361,7 +412,23 @@ static inline void mapping_set_gfp_mask(struct address_space *m, gfp_t mask) + */ + static inline void mapping_set_large_folios(struct address_space *mapping) + { +- __set_bit(AS_LARGE_FOLIO_SUPPORT, &mapping->flags); ++ mapping_set_folio_order_range(mapping, 0, MAX_PAGECACHE_ORDER); ++} ++ ++static inline unsigned int ++mapping_max_folio_order(const struct address_space *mapping) ++{ ++ if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) ++ return 0; ++ return (mapping->flags & AS_FOLIO_ORDER_MAX_MASK) >> AS_FOLIO_ORDER_MAX; ++} ++ ++static inline unsigned int ++mapping_min_folio_order(const struct address_space *mapping) ++{ ++ if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) ++ return 0; ++ return (mapping->flags & AS_FOLIO_ORDER_MIN_MASK) >> AS_FOLIO_ORDER_MIN; + } + + /** +@@ -375,7 +442,7 @@ static inline void mapping_set_large_folios(struct address_space *mapping) + static inline void mapping_clear_large_folios(struct address_space *mapping) + { + WARN_ON_ONCE(!rwsem_is_locked(&mapping->invalidate_lock)); +- __clear_bit(AS_LARGE_FOLIO_SUPPORT, &mapping->flags); ++ mapping_set_folio_order_range(mapping, 0, 0); + } + + /* +@@ -384,20 +451,17 @@ static inline void mapping_clear_large_folios(struct address_space *mapping) + */ + static inline bool mapping_large_folio_support(struct address_space *mapping) + { +- /* AS_LARGE_FOLIO_SUPPORT is only reasonable for pagecache folios */ ++ /* AS_FOLIO_ORDER is only reasonable for pagecache folios */ + VM_WARN_ONCE((unsigned long)mapping & PAGE_MAPPING_ANON, + "Anonymous mapping always supports large folio"); + +- return IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && +- test_bit(AS_LARGE_FOLIO_SUPPORT, &mapping->flags); ++ return mapping_max_folio_order(mapping) > 0; + } + + /* Return the maximum folio size for this pagecache mapping, in bytes. 
*/ +-static inline size_t mapping_max_folio_size(struct address_space *mapping) ++static inline size_t mapping_max_folio_size(const struct address_space *mapping) + { +- if (mapping_large_folio_support(mapping)) +- return PAGE_SIZE << MAX_PAGECACHE_ORDER; +- return PAGE_SIZE; ++ return PAGE_SIZE << mapping_max_folio_order(mapping); + } + + static inline int filemap_nr_thps(struct address_space *mapping) +diff --git a/mm/readahead.c b/mm/readahead.c +index 438f142a3e74..c13c130efcca 100644 +--- a/mm/readahead.c ++++ b/mm/readahead.c +@@ -513,10 +513,10 @@ void page_cache_ra_order(struct readahead_control *ractl, + + limit = min(limit, index + ra->size - 1); + +- if (new_order < MAX_PAGECACHE_ORDER) ++ if (new_order < mapping_max_folio_order(mapping)) + new_order += 2; + +- new_order = min_t(unsigned int, MAX_PAGECACHE_ORDER, new_order); ++ new_order = min(mapping_max_folio_order(mapping), new_order); + new_order = min_t(unsigned int, new_order, ilog2(ra->size)); + + /* See comment in page_cache_ra_unbounded() */ +-- +2.25.1 + diff --git a/0017-Revert-cgroup-fix-uaf-when-proc_cpuset_show.patch b/0017-Revert-cgroup-fix-uaf-when-proc_cpuset_show.patch new file mode 100644 index 0000000..ebe3ba0 --- /dev/null +++ b/0017-Revert-cgroup-fix-uaf-when-proc_cpuset_show.patch @@ -0,0 +1,68 @@ +From 8c8766f9500b9ffdb907d23269aa888d0632e68c Mon Sep 17 00:00:00 2001 +From: Chen Ridong +Date: Wed, 18 Dec 2024 08:10:59 +0000 +Subject: [PATCH 13/17] Revert "cgroup: fix uaf when proc_cpuset_show" + +hulk inclusion +category: bugfix +bugzilla: https://gitee.com/openeuler/kernel/issues/IA9YQ9 + +-------------------------------- + +To keep the same with the mainline and backport the lts patch. +This reverts commit 24c448de81d48ad08925dda9869bcf535a3258b8. + +Fixes: 24c448de81d4 ("cgroup: fix uaf when proc_cpuset_show") +Signed-off-by: Chen Ridong +--- + kernel/cgroup/cpuset.c | 24 ------------------------ + 1 file changed, 24 deletions(-) + +diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c +index 2c9e50f09fc1..140dfb5ad3fc 100644 +--- a/kernel/cgroup/cpuset.c ++++ b/kernel/cgroup/cpuset.c +@@ -5185,7 +5185,6 @@ int proc_cpuset_show(struct seq_file *m, struct pid_namespace *ns, + char *buf; + struct cgroup_subsys_state *css; + int retval; +- struct cgroup *root_cgroup = NULL; + + retval = -ENOMEM; + buf = kmalloc(PATH_MAX, GFP_KERNEL); +@@ -5193,32 +5192,9 @@ int proc_cpuset_show(struct seq_file *m, struct pid_namespace *ns, + goto out; + + css = task_get_css(tsk, cpuset_cgrp_id); +- rcu_read_lock(); +- /* +- * When the cpuset subsystem is mounted on the legacy hierarchy, +- * the top_cpuset.css->cgroup does not hold a reference count of +- * cgroup_root.cgroup. This makes accessing css->cgroup very +- * dangerous because when the cpuset subsystem is remounted to the +- * default hierarchy, the cgroup_root.cgroup that css->cgroup points +- * to will be released, leading to a UAF issue. To avoid this problem, +- * get the reference count of top_cpuset.css->cgroup first. +- * +- * This is ugly!! 
+- */ +- if (css == &top_cpuset.css) { +- root_cgroup = css->cgroup; +- if (!css_tryget_online(&root_cgroup->self)) { +- rcu_read_unlock(); +- retval = -EBUSY; +- goto out_free; +- } +- } +- rcu_read_unlock(); + retval = cgroup_path_ns(css->cgroup, buf, PATH_MAX, + current->nsproxy->cgroup_ns); + css_put(css); +- if (root_cgroup) +- css_put(&root_cgroup->self); + if (retval >= PATH_MAX) + retval = -ENAMETOOLONG; + if (retval < 0) +-- +2.25.1 + diff --git a/0018-cgroup-Make-operations-on-the-cgroup-root_list-RCU-s.patch b/0018-cgroup-Make-operations-on-the-cgroup-root_list-RCU-s.patch new file mode 100644 index 0000000..0c54088 --- /dev/null +++ b/0018-cgroup-Make-operations-on-the-cgroup-root_list-RCU-s.patch @@ -0,0 +1,145 @@ +From 7b6abe1742cbfedea405f03fcf7fc88cacb2a205 Mon Sep 17 00:00:00 2001 +From: Yafang Shao +Date: Wed, 18 Dec 2024 08:11:00 +0000 +Subject: [PATCH 14/17] cgroup: Make operations on the cgroup root_list RCU + safe +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +stable inclusion +from stable-v6.6.47 +commit dd9542ae7c7ca82ed2d7c185754ba9026361f6bc +category: bugfix +bugzilla: https://gitee.com/openeuler/kernel/issues/IAP55A + +Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=dd9542ae7c7ca82ed2d7c185754ba9026361f6bc + +-------------------------------- + +commit d23b5c577715892c87533b13923306acc6243f93 upstream. + +At present, when we perform operations on the cgroup root_list, we must +hold the cgroup_mutex, which is a relatively heavyweight lock. In reality, +we can make operations on this list RCU-safe, eliminating the need to hold +the cgroup_mutex during traversal. Modifications to the list only occur in +the cgroup root setup and destroy paths, which should be infrequent in a +production environment. In contrast, traversal may occur frequently. +Therefore, making it RCU-safe would be beneficial. 
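+
+The resulting usage pattern, sketched briefly with the for_each_root()
+helper as it looks after this change:
+
+        /* reader: no cgroup_mutex needed */
+        rcu_read_lock();
+        for_each_root(root) {
+                /* root stays valid until rcu_read_unlock(), because the
+                 * root is now freed with kfree_rcu()
+                 */
+        }
+        rcu_read_unlock();
+
+        /* writers keep holding cgroup_mutex and use the _rcu list helpers */
+        list_add_rcu(&root->root_list, &cgroup_roots);
+        ...
+        list_del_rcu(&root->root_list);
+        kfree_rcu(root, rcu);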
+ +Signed-off-by: Yafang Shao +Signed-off-by: Tejun Heo +To: Michal Koutný +Signed-off-by: Greg Kroah-Hartman +Signed-off-by: Chen Ridong +--- + include/linux/cgroup-defs.h | 1 + + kernel/cgroup/cgroup-internal.h | 3 ++- + kernel/cgroup/cgroup.c | 23 ++++++++++++++++------- + 3 files changed, 19 insertions(+), 8 deletions(-) + +diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h +index 6e3227a688de..05ece896af7d 100644 +--- a/include/linux/cgroup-defs.h ++++ b/include/linux/cgroup-defs.h +@@ -591,6 +591,7 @@ struct cgroup_root { + + /* A list running through the active hierarchies */ + struct list_head root_list; ++ struct rcu_head rcu; + + /* Hierarchy-specific flags */ + unsigned int flags; +diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h +index 96a9bd2c26f0..f5fb12890645 100644 +--- a/kernel/cgroup/cgroup-internal.h ++++ b/kernel/cgroup/cgroup-internal.h +@@ -170,7 +170,8 @@ extern struct list_head cgroup_roots; + + /* iterate across the hierarchies */ + #define for_each_root(root) \ +- list_for_each_entry((root), &cgroup_roots, root_list) ++ list_for_each_entry_rcu((root), &cgroup_roots, root_list, \ ++ lockdep_is_held(&cgroup_mutex)) + + /** + * for_each_subsys - iterate all enabled cgroup subsystems +diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c +index 52fe6ba2fefd..c26a9b3a3576 100644 +--- a/kernel/cgroup/cgroup.c ++++ b/kernel/cgroup/cgroup.c +@@ -1315,7 +1315,7 @@ static void cgroup_exit_root_id(struct cgroup_root *root) + + void cgroup_free_root(struct cgroup_root *root) + { +- kfree(root); ++ kfree_rcu(root, rcu); + } + + static void cgroup_destroy_root(struct cgroup_root *root) +@@ -1348,7 +1348,7 @@ static void cgroup_destroy_root(struct cgroup_root *root) + spin_unlock_irq(&css_set_lock); + + if (!list_empty(&root->root_list)) { +- list_del(&root->root_list); ++ list_del_rcu(&root->root_list); + cgroup_root_count--; + } + +@@ -1388,7 +1388,15 @@ static inline struct cgroup *__cset_cgroup_from_root(struct css_set *cset, + } + } + +- BUG_ON(!res_cgroup); ++ /* ++ * If cgroup_mutex is not held, the cgrp_cset_link will be freed ++ * before we remove the cgroup root from the root_list. Consequently, ++ * when accessing a cgroup root, the cset_link may have already been ++ * freed, resulting in a NULL res_cgroup. However, by holding the ++ * cgroup_mutex, we ensure that res_cgroup can't be NULL. ++ * If we don't hold cgroup_mutex in the caller, we must do the NULL ++ * check. ++ */ + return res_cgroup; + } + +@@ -1447,7 +1455,6 @@ static struct cgroup *current_cgns_cgroup_dfl(void) + static struct cgroup *cset_cgroup_from_root(struct css_set *cset, + struct cgroup_root *root) + { +- lockdep_assert_held(&cgroup_mutex); + lockdep_assert_held(&css_set_lock); + + return __cset_cgroup_from_root(cset, root); +@@ -1455,7 +1462,9 @@ static struct cgroup *cset_cgroup_from_root(struct css_set *cset, + + /* + * Return the cgroup for "task" from the given hierarchy. Must be +- * called with cgroup_mutex and css_set_lock held. ++ * called with css_set_lock held to prevent task's groups from being modified. ++ * Must be called with either cgroup_mutex or rcu read lock to prevent the ++ * cgroup root from being destroyed. 
+ */ + struct cgroup *task_cgroup_from_root(struct task_struct *task, + struct cgroup_root *root) +@@ -2030,7 +2039,7 @@ void init_cgroup_root(struct cgroup_fs_context *ctx) + struct cgroup_root *root = ctx->root; + struct cgroup *cgrp = &root->cgrp; + +- INIT_LIST_HEAD(&root->root_list); ++ INIT_LIST_HEAD_RCU(&root->root_list); + atomic_set(&root->nr_cgrps, 1); + cgrp->root = root; + init_cgroup_housekeeping(cgrp); +@@ -2114,7 +2123,7 @@ int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask) + * care of subsystems' refcounts, which are explicitly dropped in + * the failure exit path. + */ +- list_add(&root->root_list, &cgroup_roots); ++ list_add_rcu(&root->root_list, &cgroup_roots); + cgroup_root_count++; + + /* +-- +2.25.1 + diff --git a/0019-cgroup-Move-rcu_head-up-near-the-top-of-cgroup_root.patch b/0019-cgroup-Move-rcu_head-up-near-the-top-of-cgroup_root.patch new file mode 100644 index 0000000..45d7802 --- /dev/null +++ b/0019-cgroup-Move-rcu_head-up-near-the-top-of-cgroup_root.patch @@ -0,0 +1,84 @@ +From 4363688e9b49bde3cce7b2ea1882f3d44d1f5289 Mon Sep 17 00:00:00 2001 +From: Waiman Long +Date: Wed, 18 Dec 2024 08:11:01 +0000 +Subject: [PATCH 15/17] cgroup: Move rcu_head up near the top of cgroup_root +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +stable inclusion +from stable-v6.6.47 +commit f3c60ab676bb62e01d004d5b1cf2963a296c8e6a +category: bugfix +bugzilla: https://gitee.com/openeuler/kernel/issues/IAP55A + +Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=f3c60ab676bb62e01d004d5b1cf2963a296c8e6a + +-------------------------------- + +commit a7fb0423c201ba12815877a0b5a68a6a1710b23a upstream. + +Commit 331654dc5f40 ("cgroup: Make operations on the cgroup root_list RCU +safe") adds a new rcu_head to the cgroup_root structure and kvfree_rcu() +for freeing the cgroup_root. + +The current implementation of kvfree_rcu(), however, has the limitation +that the offset of the rcu_head structure within the larger data +structure must be less than 4096 or the compilation will fail. See the +macro definition of __is_kvfree_rcu_offset() in include/linux/rcupdate.h +for more information. + +By putting rcu_head below the large cgroup structure, any change to the +cgroup structure that makes it larger run the risk of causing build +failure under certain configurations. Commit 77070eeb8821 ("cgroup: +Avoid false cacheline sharing of read mostly rstat_cpu") happens to be +the last straw that breaks it. Fix this problem by moving the rcu_head +structure up before the cgroup structure. + +Fixes: 331654dc5f40 ("cgroup: Make operations on the cgroup root_list RCU safe") +Reported-by: Stephen Rothwell +Closes: https://lore.kernel.org/lkml/20231207143806.114e0a74@canb.auug.org.au/ +Signed-off-by: Waiman Long +Acked-by: Yafang Shao +Reviewed-by: Yosry Ahmed +Reviewed-by: Michal Koutný +Signed-off-by: Tejun Heo +Signed-off-by: Greg Kroah-Hartman + +Conflicts: + include/linux/cgroup-defs.h +[Context is mismatched for wait_queue_head_t wait was merged] +Signed-off-by: Chen Ridong +--- + include/linux/cgroup-defs.h | 8 ++++---- + 1 file changed, 4 insertions(+), 4 deletions(-) + +diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h +index 05ece896af7d..8eb518ce87a1 100644 +--- a/include/linux/cgroup-defs.h ++++ b/include/linux/cgroup-defs.h +@@ -573,6 +573,10 @@ struct cgroup_root { + /* Unique id for this hierarchy. 
*/ + int hierarchy_id; + ++ /* A list running through the active hierarchies */ ++ struct list_head root_list; ++ struct rcu_head rcu; /* Must be near the top */ ++ + /* + * The root cgroup. The containing cgroup_root will be destroyed on its + * release. cgrp->ancestors[0] will be used overflowing into the +@@ -589,10 +593,6 @@ struct cgroup_root { + /* Wait while cgroups are being destroyed */ + wait_queue_head_t wait; + +- /* A list running through the active hierarchies */ +- struct list_head root_list; +- struct rcu_head rcu; +- + /* Hierarchy-specific flags */ + unsigned int flags; + +-- +2.25.1 + diff --git a/0020-cgroup-cpuset-Prevent-UAF-in-proc_cpuset_show.patch b/0020-cgroup-cpuset-Prevent-UAF-in-proc_cpuset_show.patch new file mode 100644 index 0000000..c528ff3 --- /dev/null +++ b/0020-cgroup-cpuset-Prevent-UAF-in-proc_cpuset_show.patch @@ -0,0 +1,110 @@ +From 724b6581cd8b49962e3add6e8795423f2c1390f8 Mon Sep 17 00:00:00 2001 +From: Chen Ridong +Date: Wed, 18 Dec 2024 08:11:02 +0000 +Subject: [PATCH 16/17] cgroup/cpuset: Prevent UAF in proc_cpuset_show() + +stable inclusion +from stable-v6.6.44 +commit 96226fbed566f3f686f53a489a29846f2d538080 +category: bugfix +bugzilla: https://gitee.com/openeuler/kernel/issues/IAP55A + +Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=96226fbed566f3f686f53a489a29846f2d538080 + +-------------------------------- + +[ Upstream commit 1be59c97c83ccd67a519d8a49486b3a8a73ca28a ] + +An UAF can happen when /proc/cpuset is read as reported in [1]. + +This can be reproduced by the following methods: +1.add an mdelay(1000) before acquiring the cgroup_lock In the + cgroup_path_ns function. +2.$cat /proc//cpuset repeatly. +3.$mount -t cgroup -o cpuset cpuset /sys/fs/cgroup/cpuset/ +$umount /sys/fs/cgroup/cpuset/ repeatly. + +The race that cause this bug can be shown as below: + +(umount) | (cat /proc//cpuset) +css_release | proc_cpuset_show +css_release_work_fn | css = task_get_css(tsk, cpuset_cgrp_id); +css_free_rwork_fn | cgroup_path_ns(css->cgroup, ...); +cgroup_destroy_root | mutex_lock(&cgroup_mutex); +rebind_subsystems | +cgroup_free_root | + | // cgrp was freed, UAF + | cgroup_path_ns_locked(cgrp,..); + +When the cpuset is initialized, the root node top_cpuset.css.cgrp +will point to &cgrp_dfl_root.cgrp. In cgroup v1, the mount operation will +allocate cgroup_root, and top_cpuset.css.cgrp will point to the allocated +&cgroup_root.cgrp. When the umount operation is executed, +top_cpuset.css.cgrp will be rebound to &cgrp_dfl_root.cgrp. + +The problem is that when rebinding to cgrp_dfl_root, there are cases +where the cgroup_root allocated by setting up the root for cgroup v1 +is cached. This could lead to a Use-After-Free (UAF) if it is +subsequently freed. The descendant cgroups of cgroup v1 can only be +freed after the css is released. However, the css of the root will never +be released, yet the cgroup_root should be freed when it is unmounted. +This means that obtaining a reference to the css of the root does +not guarantee that css.cgrp->root will not be freed. + +Fix this problem by using rcu_read_lock in proc_cpuset_show(). +As cgroup_root is kfree_rcu after commit 331654dc5f40 +("cgroup: Make operations on the cgroup root_list RCU safe"), +css->cgroup won't be freed during the critical section. +To call cgroup_path_ns_locked, css_set_lock is needed, so it is safe to +replace task_get_css with task_css. 
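+
+Put side by side (a sketch of the two patterns, with the cgroup namespace
+argument abbreviated to "ns"):
+
+        /* before: pins the css, but not the cgroup_root its cgroup lives in */
+        css = task_get_css(tsk, cpuset_cgrp_id);
+        retval = cgroup_path_ns(css->cgroup, buf, PATH_MAX, ns);
+        css_put(css);
+
+        /* after: rcu + css_set_lock keep css->cgroup and its root alive
+         * for the duration of the lookup, no reference is taken at all
+         */
+        rcu_read_lock();
+        spin_lock_irq(&css_set_lock);
+        css = task_css(tsk, cpuset_cgrp_id);
+        retval = cgroup_path_ns_locked(css->cgroup, buf, PATH_MAX, ns);
+        spin_unlock_irq(&css_set_lock);
+        rcu_read_unlock();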
+ +[1] https://syzkaller.appspot.com/bug?extid=9b1ff7be974a403aa4cd + +Fixes: a79a908fd2b0 ("cgroup: introduce cgroup namespaces") +Signed-off-by: Chen Ridong +Signed-off-by: Tejun Heo +Signed-off-by: Sasha Levin + +Conflicts: + kernel/cgroup/cpuset.c +[commit 5715456af3e0 ("kernfs: Convert kernfs_path_from_node_locked() +from strlcpy() to strscpy()") was not merged] +Signed-off-by: Chen Ridong +--- + kernel/cgroup/cpuset.c | 13 +++++++++---- + 1 file changed, 9 insertions(+), 4 deletions(-) + +diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c +index 140dfb5ad3fc..f3cf9b1268e0 100644 +--- a/kernel/cgroup/cpuset.c ++++ b/kernel/cgroup/cpuset.c +@@ -21,6 +21,7 @@ + * License. See the file COPYING in the main directory of the Linux + * distribution for more details. + */ ++#include "cgroup-internal.h" + + #include + #include +@@ -5191,10 +5192,14 @@ int proc_cpuset_show(struct seq_file *m, struct pid_namespace *ns, + if (!buf) + goto out; + +- css = task_get_css(tsk, cpuset_cgrp_id); +- retval = cgroup_path_ns(css->cgroup, buf, PATH_MAX, +- current->nsproxy->cgroup_ns); +- css_put(css); ++ rcu_read_lock(); ++ spin_lock_irq(&css_set_lock); ++ css = task_css(tsk, cpuset_cgrp_id); ++ retval = cgroup_path_ns_locked(css->cgroup, buf, PATH_MAX, ++ current->nsproxy->cgroup_ns); ++ spin_unlock_irq(&css_set_lock); ++ rcu_read_unlock(); ++ + if (retval >= PATH_MAX) + retval = -ENAMETOOLONG; + if (retval < 0) +-- +2.25.1 + diff --git a/0021-cgroup-add-more-reserve-kabi.patch b/0021-cgroup-add-more-reserve-kabi.patch new file mode 100644 index 0000000..5c0ed08 --- /dev/null +++ b/0021-cgroup-add-more-reserve-kabi.patch @@ -0,0 +1,90 @@ +From d68991f87f738657074d93a1ae8ccf865f40b65a Mon Sep 17 00:00:00 2001 +From: Chen Ridong +Date: Wed, 18 Dec 2024 08:11:03 +0000 +Subject: [PATCH 17/17] cgroup: add more reserve kabi + +hulk inclusion +category: feature +bugzilla: https://gitee.com/openeuler/kernel/issues/I8SA3O + +-------------------------------- + +Reserve KABI for future feature development. 
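+
+Each KABI_RESERVE(n) is a placeholder slot; when a later feature needs a
+new field it can take a slot over in place with KABI_USE(), as already
+done for struct iopf_group earlier in this series, so the structure size
+and the offsets of existing members stay unchanged. A small sketch (the
+field name is hypothetical):
+
+        /* reserved today */
+        KABI_RESERVE(5)
+        KABI_RESERVE(6)
+
+        /* consumed later by a hypothetical feature, layout unchanged */
+        KABI_USE(5, struct new_feature_state *feat_state)
+        KABI_RESERVE(6)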
+ +Signed-off-by: Chen Ridong +--- + include/linux/cgroup-defs.h | 7 +++++++ + include/linux/memcontrol.h | 8 ++++++++ + kernel/cgroup/cpuset.c | 5 ----- + 3 files changed, 15 insertions(+), 5 deletions(-) + +diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h +index 8eb518ce87a1..f3fd0407d346 100644 +--- a/include/linux/cgroup-defs.h ++++ b/include/linux/cgroup-defs.h +@@ -325,6 +325,8 @@ struct cgroup_base_stat { + #ifdef CONFIG_SCHED_CORE + u64 forceidle_sum; + #endif ++ KABI_RESERVE(1) ++ KABI_RESERVE(2) + }; + + /* +@@ -555,6 +557,9 @@ struct cgroup { + KABI_RESERVE(3) + KABI_RESERVE(4) + KABI_RESERVE(5) ++ KABI_RESERVE(6) ++ KABI_RESERVE(7) ++ KABI_RESERVE(8) + /* All ancestors including self */ + struct cgroup *ancestors[]; + }; +@@ -606,6 +611,8 @@ struct cgroup_root { + KABI_RESERVE(2) + KABI_RESERVE(3) + KABI_RESERVE(4) ++ KABI_RESERVE(5) ++ KABI_RESERVE(6) + }; + + /* +diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h +index b2a80e089a0a..abe236201e68 100644 +--- a/include/linux/memcontrol.h ++++ b/include/linux/memcontrol.h +@@ -429,6 +429,14 @@ struct mem_cgroup { + KABI_RESERVE(6) + KABI_RESERVE(7) + KABI_RESERVE(8) ++ KABI_RESERVE(9) ++ KABI_RESERVE(10) ++ KABI_RESERVE(11) ++ KABI_RESERVE(12) ++ KABI_RESERVE(13) ++ KABI_RESERVE(14) ++ KABI_RESERVE(15) ++ KABI_RESERVE(16) + struct mem_cgroup_per_node *nodeinfo[]; + }; + +diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c +index f3cf9b1268e0..7ea0a6d00519 100644 +--- a/kernel/cgroup/cpuset.c ++++ b/kernel/cgroup/cpuset.c +@@ -211,11 +211,6 @@ struct cpuset { + + /* Remote partition silbling list anchored at remote_children */ + struct list_head remote_sibling; +- +- KABI_RESERVE(1) +- KABI_RESERVE(2) +- KABI_RESERVE(3) +- KABI_RESERVE(4) + }; + + /* +-- +2.25.1 + diff --git a/0022-14223.patch b/0022-14223.patch new file mode 100644 index 0000000..b103427 --- /dev/null +++ b/0022-14223.patch @@ -0,0 +1,80 @@ +From f8cb61566576a623971d5cc8dd3cd6229e787e30 Mon Sep 17 00:00:00 2001 +From: Zhang Changzhong +Date: Wed, 18 Dec 2024 17:50:29 +0800 +Subject: [PATCH] kabi: net: reserve space for xdp subsystem related structure + +hulk inclusion +category: other +bugzilla: https://gitee.com/openeuler/kernel/issues/I8OWRC + +---------------------------------------------------- + +Reserve some fields beforehand for xdp framework related structures +prone to change. 
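+
+If a size regression ever needs to be caught at build time, a guard can
+sit next to the structure. The sketch below is hypothetical (it is not
+part of this patch, and XDP_BUFF_KABI_SIZE is a placeholder that would
+have to come from the released ABI baseline):
+
+        #include <linux/build_bug.h>
+
+        /* fails the build if the reserved padding is ever dropped or resized */
+        static_assert(sizeof(struct xdp_buff) == XDP_BUFF_KABI_SIZE,
+                      "struct xdp_buff KABI size changed");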
+ +Signed-off-by: Zhang Changzhong +--- + include/net/xdp.h | 19 +++++++++++++++++++ + 1 file changed, 19 insertions(+) + +diff --git a/include/net/xdp.h b/include/net/xdp.h +index c283668458ca..9b9c7dc25eeb 100644 +--- a/include/net/xdp.h ++++ b/include/net/xdp.h +@@ -54,6 +54,9 @@ enum xdp_mem_type { + struct xdp_mem_info { + u32 type; /* enum xdp_mem_type, but known size type */ + u32 id; ++ ++ KABI_RESERVE(1) ++ KABI_RESERVE(2) + }; + + struct page_pool; +@@ -74,6 +77,9 @@ struct xdp_rxq_info { + + struct xdp_txq_info { + struct net_device *dev; ++ ++ KABI_RESERVE(1) ++ KABI_RESERVE(2) + }; + + enum xdp_buff_flags { +@@ -92,6 +98,11 @@ struct xdp_buff { + struct xdp_txq_info *txq; + u32 frame_sz; /* frame size to deduce data_hard_end/reserved tailroom*/ + u32 flags; /* supported values defined in xdp_buff_flags */ ++ ++ KABI_RESERVE(1) ++ KABI_RESERVE(2) ++ KABI_RESERVE(3) ++ KABI_RESERVE(4) + }; + + static __always_inline bool xdp_buff_has_frags(struct xdp_buff *xdp) +@@ -181,6 +192,11 @@ struct xdp_frame { + struct net_device *dev_rx; /* used by cpumap */ + u32 frame_sz; + u32 flags; /* supported values defined in xdp_buff_flags */ ++ ++ KABI_RESERVE(1) ++ KABI_RESERVE(2) ++ KABI_RESERVE(3) ++ KABI_RESERVE(4) + }; + + static __always_inline bool xdp_frame_has_frags(struct xdp_frame *frame) +@@ -198,6 +214,9 @@ struct xdp_frame_bulk { + int count; + void *xa; + void *q[XDP_BULK_QUEUE_SIZE]; ++ ++ KABI_RESERVE(1) ++ KABI_RESERVE(2) + }; + + static __always_inline void xdp_frame_bulk_init(struct xdp_frame_bulk *bq) +-- +Gitee + diff --git a/0023-14224.patch b/0023-14224.patch new file mode 100644 index 0000000..62ba017 --- /dev/null +++ b/0023-14224.patch @@ -0,0 +1,85 @@ +From a2bbb3a7e3d30f5efc443fa17fcfe20fdd5a98d5 Mon Sep 17 00:00:00 2001 +From: Dong Chenchen +Date: Wed, 18 Dec 2024 17:15:36 +0800 +Subject: [PATCH] net/kabi: Reserve space for net structures + +hulk inclusion +category: feature +bugzilla: https://gitee.com/openeuler/kernel/issues/IBC1RH + +-------------------------------- + +Reserve some fields beforehand for net subsystem related +structures prone to change. 
+ +Signed-off-by: Dong Chenchen +--- + include/net/flow.h | 2 ++ + include/net/netns/netfilter.h | 2 ++ + include/net/netns/xfrm.h | 2 ++ + include/net/xfrm.h | 4 ++++ + 4 files changed, 10 insertions(+) + +diff --git a/include/net/flow.h b/include/net/flow.h +index 0cc5f2ef1000..72d2ea2374ba 100644 +--- a/include/net/flow.h ++++ b/include/net/flow.h +@@ -46,6 +46,8 @@ struct flowi_common { + + KABI_RESERVE(1) + KABI_RESERVE(2) ++ KABI_RESERVE(3) ++ KABI_RESERVE(4) + }; + + union flowi_uli { +diff --git a/include/net/netns/netfilter.h b/include/net/netns/netfilter.h +index 4b77a9b031b6..963588269637 100644 +--- a/include/net/netns/netfilter.h ++++ b/include/net/netns/netfilter.h +@@ -34,5 +34,7 @@ struct netns_nf { + + KABI_RESERVE(1) + KABI_RESERVE(2) ++ KABI_RESERVE(3) ++ KABI_RESERVE(4) + }; + #endif +diff --git a/include/net/netns/xfrm.h b/include/net/netns/xfrm.h +index a0c1359cc7eb..af7f20ef4823 100644 +--- a/include/net/netns/xfrm.h ++++ b/include/net/netns/xfrm.h +@@ -87,6 +87,8 @@ struct netns_xfrm { + + KABI_RESERVE(1) + KABI_RESERVE(2) ++ KABI_RESERVE(3) ++ KABI_RESERVE(4) + }; + + #endif +diff --git a/include/net/xfrm.h b/include/net/xfrm.h +index c875faf98492..b9dec5f9c973 100644 +--- a/include/net/xfrm.h ++++ b/include/net/xfrm.h +@@ -294,6 +294,8 @@ struct xfrm_state { + + KABI_RESERVE(1) + KABI_RESERVE(2) ++ KABI_RESERVE(3) ++ KABI_RESERVE(4) + }; + + static inline struct net *xs_net(struct xfrm_state *x) +@@ -562,6 +564,8 @@ struct xfrm_policy { + + KABI_RESERVE(1) + KABI_RESERVE(2) ++ KABI_RESERVE(3) ++ KABI_RESERVE(4) + }; + + static inline struct net *xp_net(const struct xfrm_policy *xp) +-- +Gitee + diff --git a/0024-14225.patch b/0024-14225.patch new file mode 100644 index 0000000..32a1037 --- /dev/null +++ b/0024-14225.patch @@ -0,0 +1,154 @@ +From 279803fa98908bd367cec04ae2600c15764fb977 Mon Sep 17 00:00:00 2001 +From: Luo Gengkun +Date: Wed, 18 Dec 2024 09:45:31 +0000 +Subject: [PATCH 1/3] kabi: reserve space for perf_event.h + +hulk inclusion +category: feature +bugzilla: https://gitee.com/src-openeuler/kernel/issues/IBC1PM + +-------------------------------- + +reserve space for perf_event.h + +Signed-off-by: Luo Gengkun +--- + include/linux/perf_event.h | 16 ++++++++++++++++ + 1 file changed, 16 insertions(+) + +diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h +index 89f2a02db563..fe692e9bd0b2 100644 +--- a/include/linux/perf_event.h ++++ b/include/linux/perf_event.h +@@ -1010,6 +1010,14 @@ struct perf_cpu_pmu_context { + struct hrtimer hrtimer; + ktime_t hrtimer_interval; + unsigned int hrtimer_active; ++ KABI_RESERVE(1) ++ KABI_RESERVE(2) ++ KABI_RESERVE(3) ++ KABI_RESERVE(4) ++ KABI_RESERVE(5) ++ KABI_RESERVE(6) ++ KABI_RESERVE(7) ++ KABI_RESERVE(8) + }; + + /** +@@ -1031,6 +1039,14 @@ struct perf_cpu_context { + int heap_size; + struct perf_event **heap; + struct perf_event *heap_default[2]; ++ KABI_RESERVE(1) ++ KABI_RESERVE(2) ++ KABI_RESERVE(3) ++ KABI_RESERVE(4) ++ KABI_RESERVE(5) ++ KABI_RESERVE(6) ++ KABI_RESERVE(7) ++ KABI_RESERVE(8) + }; + + struct perf_output_handle { +-- +Gitee + + +From 078ad81846b81844eb98f90eee57c06954715c8d Mon Sep 17 00:00:00 2001 +From: Luo Gengkun +Date: Wed, 18 Dec 2024 09:45:32 +0000 +Subject: [PATCH 2/3] kabi: reserve space for internal.h + +hulk inclusion +category: feature +bugzilla: https://gitee.com/src-openeuler/kernel/issues/IBC1PM + +-------------------------------- + +reserve space for internal.h + +Signed-off-by: Luo Gengkun +--- + kernel/events/internal.h | 10 ++++++++++ + 1 file changed, 10 
insertions(+) + +diff --git a/kernel/events/internal.h b/kernel/events/internal.h +index d2e6e6144c54..d1ffa00b91b6 100644 +--- a/kernel/events/internal.h ++++ b/kernel/events/internal.h +@@ -5,6 +5,7 @@ + #include + #include + #include ++#include + + /* Buffer handling */ + +@@ -54,6 +55,15 @@ struct perf_buffer { + void **aux_pages; + void *aux_priv; + ++ KABI_RESERVE(1) ++ KABI_RESERVE(2) ++ KABI_RESERVE(3) ++ KABI_RESERVE(4) ++ KABI_RESERVE(5) ++ KABI_RESERVE(6) ++ KABI_RESERVE(7) ++ KABI_RESERVE(8) ++ + struct perf_event_mmap_page *user_page; + void *data_pages[]; + }; +-- +Gitee + + +From 59a2a3e8b1c35d9e0bde08cd2e6f01f1c12d384b Mon Sep 17 00:00:00 2001 +From: Luo Gengkun +Date: Wed, 18 Dec 2024 09:45:33 +0000 +Subject: [PATCH 3/3] kabi: reserve space for uprobes.h + +hulk inclusion +category: feature +bugzilla: https://gitee.com/src-openeuler/kernel/issues/IBC1PM + +-------------------------------- + +reserve space for uprobes.h + +Signed-off-by: Luo Gengkun +--- + include/linux/uprobes.h | 9 +++++++++ + 1 file changed, 9 insertions(+) + +diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h +index f46e0ca0169c..86d0868b584a 100644 +--- a/include/linux/uprobes.h ++++ b/include/linux/uprobes.h +@@ -47,6 +47,7 @@ struct uprobe_consumer { + + #ifdef CONFIG_UPROBES + #include ++#include + + enum uprobe_task_state { + UTASK_RUNNING, +@@ -78,6 +79,14 @@ struct uprobe_task { + + struct return_instance *return_instances; + unsigned int depth; ++ KABI_RESERVE(1) ++ KABI_RESERVE(2) ++ KABI_RESERVE(3) ++ KABI_RESERVE(4) ++ KABI_RESERVE(5) ++ KABI_RESERVE(6) ++ KABI_RESERVE(7) ++ KABI_RESERVE(8) + }; + + struct return_instance { +-- +Gitee + diff --git a/0026-14227.patch b/0026-14227.patch new file mode 100644 index 0000000..4caa739 --- /dev/null +++ b/0026-14227.patch @@ -0,0 +1,3464 @@ +From 3c8ff7deba8ed905fb4c3d05ccccdecb6000b7d4 Mon Sep 17 00:00:00 2001 +From: Chengming Zhou +Date: Wed, 18 Dec 2024 17:51:06 +0800 +Subject: [PATCH 01/14] mm/zswap: invalidate zswap entry when swap entry free + +mainline inclusion +from mainline-v6.9-rc1 +commit 0827a1fb143fae588cb6f5b9a97c405d6c2ddec9 +category: performance +bugzilla: https://gitee.com/openeuler/kernel/issues/IBC5I1 + +Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=0827a1fb143fae588cb6f5b9a97c405d6c2ddec9 + +-------------------------------- + +During testing I found there are some times the zswap_writeback_entry() +return -ENOMEM, which is not we expected: + +bpftrace -e 'kr:zswap_writeback_entry {@[(int32)retval]=count()}' +@[-12]: 1563 +@[0]: 277221 + +The reason is that __read_swap_cache_async() return NULL because +swapcache_prepare() failed. The reason is that we won't invalidate zswap +entry when swap entry freed to the per-cpu pool, these zswap entries are +still on the zswap tree and lru list. + +This patch moves the invalidation ahead to when swap entry freed to the +per-cpu pool, since there is no any benefit to leave trashy zswap entry on +the tree and lru list. + +With this patch: +bpftrace -e 'kr:zswap_writeback_entry {@[(int32)retval]=count()}' +@[0]: 259744 + +Note: large folio can't have zswap entry for now, so don't bother +to add zswap entry invalidation in the large folio swap free path. 
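+
+The interface change that goes with the move, in short (sketch taken from
+the hunks below):
+
+        /* old call site in swap_range_free(): the caller split the entry */
+        zswap_invalidate(si->type, offset);
+
+        /* new call site in free_swap_slot(): pass the swp_entry_t and let
+         * zswap derive type/offset itself via swp_type()/swp_offset()
+         */
+        zswap_invalidate(entry);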
+ +Link: https://lkml.kernel.org/r/20240201-b4-zswap-invalidate-entry-v2-2-99d4084260a0@bytedance.com +Signed-off-by: Chengming Zhou +Reviewed-by: Nhat Pham +Acked-by: Johannes Weiner +Acked-by: Yosry Ahmed +Signed-off-by: Andrew Morton +Conflicts: + include/linux/zswap.h + mm/zswap.c +[ Context conflict. ] +Signed-off-by: Liu Shixin +--- + include/linux/zswap.h | 4 ++-- + mm/swap_slots.c | 4 ++++ + mm/swapfile.c | 1 - + mm/zswap.c | 5 +++-- + 4 files changed, 9 insertions(+), 5 deletions(-) + +diff --git a/include/linux/zswap.h b/include/linux/zswap.h +index 2a60ce39cfde..a13d2d2d9131 100644 +--- a/include/linux/zswap.h ++++ b/include/linux/zswap.h +@@ -12,7 +12,7 @@ extern atomic_t zswap_stored_pages; + + bool zswap_store(struct folio *folio); + bool zswap_load(struct folio *folio); +-void zswap_invalidate(int type, pgoff_t offset); ++void zswap_invalidate(swp_entry_t swp); + void zswap_swapon(int type); + void zswap_swapoff(int type); + +@@ -28,7 +28,7 @@ static inline bool zswap_load(struct folio *folio) + return false; + } + +-static inline void zswap_invalidate(int type, pgoff_t offset) {} ++static inline void zswap_invalidate(swp_entry_t swp) {} + static inline void zswap_swapon(int type) {} + static inline void zswap_swapoff(int type) {} + +diff --git a/mm/swap_slots.c b/mm/swap_slots.c +index 7af3b93d4c8c..5579eed7065f 100644 +--- a/mm/swap_slots.c ++++ b/mm/swap_slots.c +@@ -34,6 +34,7 @@ + #include + #include + #include ++#include + + static DEFINE_PER_CPU(struct swap_slots_cache, swp_slots); + #ifdef CONFIG_MEMCG_SWAP_QOS +@@ -394,6 +395,9 @@ void free_swap_slot(swp_entry_t entry) + { + struct swap_slots_cache *cache; + ++ /* Large folio swap slot is not covered. */ ++ zswap_invalidate(entry); ++ + cache = raw_cpu_ptr(&swp_slots); + if (likely(use_swap_slot_cache && cache->slots_ret)) { + spin_lock_irq(&cache->free_lock); +diff --git a/mm/swapfile.c b/mm/swapfile.c +index 3af5b6ebb241..30832b85d6c2 100644 +--- a/mm/swapfile.c ++++ b/mm/swapfile.c +@@ -765,7 +765,6 @@ static void swap_range_free(struct swap_info_struct *si, unsigned long offset, + swap_slot_free_notify = NULL; + while (offset <= end) { + arch_swap_invalidate_page(si->type, offset); +- zswap_invalidate(si->type, offset); + if (swap_slot_free_notify) + swap_slot_free_notify(si->bdev, offset); + offset++; +diff --git a/mm/zswap.c b/mm/zswap.c +index 69681b9173fd..5acda5b906bc 100644 +--- a/mm/zswap.c ++++ b/mm/zswap.c +@@ -1482,9 +1482,10 @@ bool zswap_load(struct folio *folio) + return ret; + } + +-void zswap_invalidate(int type, pgoff_t offset) ++void zswap_invalidate(swp_entry_t swp) + { +- struct zswap_tree *tree = zswap_trees[type]; ++ pgoff_t offset = swp_offset(swp); ++ struct zswap_tree *tree = zswap_trees[swp_type(swp)]; + struct zswap_entry *entry; + + /* find */ +-- +Gitee + + +From e2f02eacab254e29bd451782950ac6a03de685bd Mon Sep 17 00:00:00 2001 +From: Chris Li +Date: Wed, 18 Dec 2024 17:51:07 +0800 +Subject: [PATCH 02/14] mm: swap: swap cluster switch to double link list +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +mainline inclusion +from mainline-v6.12-rc1 +commit 73ed0baae66df50359c876f65f41179d6ebd2716 +category: performance +bugzilla: https://gitee.com/openeuler/kernel/issues/IBC5I1 + +Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=73ed0baae66df50359c876f65f41179d6ebd2716 + +-------------------------------- + +Patch series "mm: swap: mTHP swap allocator base on swap cluster order", +v5. 
+ +This is the short term solutions "swap cluster order" listed in my "Swap +Abstraction" discussion slice 8 in the recent LSF/MM conference. + +When commit 845982eb264bc "mm: swap: allow storage of all mTHP orders" is +introduced, it only allocates the mTHP swap entries from the new empty +cluster list.  It has a fragmentation issue reported by Barry. + +https://lore.kernel.org/all/CAGsJ_4zAcJkuW016Cfi6wicRr8N9X+GJJhgMQdSMp+Ah+NSgNQ@mail.gmail.com/ + +The reason is that all the empty clusters have been exhausted while there +are plenty of free swap entries in the cluster that are not 100% free. + +Remember the swap allocation order in the cluster. Keep track of the per +order non full cluster list for later allocation. + +This series gives the swap SSD allocation a new separate code path from +the HDD allocation. The new allocator use cluster list only and do not +global scan swap_map[] without lock any more. + +This streamline the swap allocation for SSD. The code matches the +execution flow much better. + +User impact: For users that allocate and free mix order mTHP swapping, It +greatly improves the success rate of the mTHP swap allocation after the +initial phase. + +It also performs faster when the swapfile is close to full, because the +allocator can get the non full cluster from a list rather than scanning a +lot of swap_map entries.  + +With Barry's mthp test program V2: + +Without: +$ ./thp_swap_allocator_test -a +Iteration 1: swpout inc: 32, swpout fallback inc: 192, Fallback percentage: 85.71% +Iteration 2: swpout inc: 0, swpout fallback inc: 231, Fallback percentage: 100.00% +Iteration 3: swpout inc: 0, swpout fallback inc: 227, Fallback percentage: 100.00% +... +Iteration 98: swpout inc: 0, swpout fallback inc: 224, Fallback percentage: 100.00% +Iteration 99: swpout inc: 0, swpout fallback inc: 215, Fallback percentage: 100.00% +Iteration 100: swpout inc: 0, swpout fallback inc: 222, Fallback percentage: 100.00% + +$ ./thp_swap_allocator_test -a -s +Iteration 1: swpout inc: 0, swpout fallback inc: 224, Fallback percentage: 100.00% +Iteration 2: swpout inc: 0, swpout fallback inc: 218, Fallback percentage: 100.00% +Iteration 3: swpout inc: 0, swpout fallback inc: 222, Fallback percentage: 100.00% +.. +Iteration 98: swpout inc: 0, swpout fallback inc: 228, Fallback percentage: 100.00% +Iteration 99: swpout inc: 0, swpout fallback inc: 230, Fallback percentage: 100.00% +Iteration 100: swpout inc: 0, swpout fallback inc: 229, Fallback percentage: 100.00% + +$ ./thp_swap_allocator_test -s +Iteration 1: swpout inc: 0, swpout fallback inc: 224, Fallback percentage: 100.00% +Iteration 2: swpout inc: 0, swpout fallback inc: 218, Fallback percentage: 100.00% +Iteration 3: swpout inc: 0, swpout fallback inc: 222, Fallback percentage: 100.00% +.. +Iteration 98: swpout inc: 0, swpout fallback inc: 228, Fallback percentage: 100.00% +Iteration 99: swpout inc: 0, swpout fallback inc: 230, Fallback percentage: 100.00% +Iteration 100: swpout inc: 0, swpout fallback inc: 229, Fallback percentage: 100.00% + +$ ./thp_swap_allocator_test +Iteration 1: swpout inc: 0, swpout fallback inc: 224, Fallback percentage: 100.00% +Iteration 2: swpout inc: 0, swpout fallback inc: 218, Fallback percentage: 100.00% +Iteration 3: swpout inc: 0, swpout fallback inc: 222, Fallback percentage: 100.00% +.. 
+Iteration 98: swpout inc: 0, swpout fallback inc: 228, Fallback percentage: 100.00% +Iteration 99: swpout inc: 0, swpout fallback inc: 230, Fallback percentage: 100.00% +Iteration 100: swpout inc: 0, swpout fallback inc: 229, Fallback percentage: 100.00% + +With: # with all 0.00% filter out +$ ./thp_swap_allocator_test -a | grep -v "0.00%" +$ # all result are 0.00% + +$ ./thp_swap_allocator_test -a -s | grep -v "0.00%" +./thp_swap_allocator_test -a -s | grep -v "0.00%" +Iteration 14: swpout inc: 223, swpout fallback inc: 3, Fallback percentage: 1.33% +Iteration 19: swpout inc: 219, swpout fallback inc: 7, Fallback percentage: 3.10% +Iteration 28: swpout inc: 225, swpout fallback inc: 1, Fallback percentage: 0.44% +Iteration 29: swpout inc: 227, swpout fallback inc: 1, Fallback percentage: 0.44% +Iteration 34: swpout inc: 220, swpout fallback inc: 8, Fallback percentage: 3.51% +Iteration 35: swpout inc: 222, swpout fallback inc: 11, Fallback percentage: 4.72% +Iteration 38: swpout inc: 217, swpout fallback inc: 4, Fallback percentage: 1.81% +Iteration 40: swpout inc: 222, swpout fallback inc: 6, Fallback percentage: 2.63% +Iteration 42: swpout inc: 221, swpout fallback inc: 2, Fallback percentage: 0.90% +Iteration 43: swpout inc: 215, swpout fallback inc: 7, Fallback percentage: 3.15% +Iteration 47: swpout inc: 226, swpout fallback inc: 2, Fallback percentage: 0.88% +Iteration 49: swpout inc: 217, swpout fallback inc: 1, Fallback percentage: 0.46% +Iteration 52: swpout inc: 221, swpout fallback inc: 8, Fallback percentage: 3.49% +Iteration 56: swpout inc: 224, swpout fallback inc: 4, Fallback percentage: 1.75% +Iteration 58: swpout inc: 214, swpout fallback inc: 5, Fallback percentage: 2.28% +Iteration 62: swpout inc: 220, swpout fallback inc: 3, Fallback percentage: 1.35% +Iteration 64: swpout inc: 224, swpout fallback inc: 1, Fallback percentage: 0.44% +Iteration 67: swpout inc: 221, swpout fallback inc: 1, Fallback percentage: 0.45% +Iteration 75: swpout inc: 220, swpout fallback inc: 9, Fallback percentage: 3.93% +Iteration 82: swpout inc: 227, swpout fallback inc: 1, Fallback percentage: 0.44% +Iteration 86: swpout inc: 211, swpout fallback inc: 12, Fallback percentage: 5.38% +Iteration 89: swpout inc: 226, swpout fallback inc: 2, Fallback percentage: 0.88% +Iteration 93: swpout inc: 220, swpout fallback inc: 1, Fallback percentage: 0.45% +Iteration 94: swpout inc: 224, swpout fallback inc: 1, Fallback percentage: 0.44% +Iteration 96: swpout inc: 221, swpout fallback inc: 6, Fallback percentage: 2.64% +Iteration 98: swpout inc: 227, swpout fallback inc: 1, Fallback percentage: 0.44% +Iteration 99: swpout inc: 227, swpout fallback inc: 3, Fallback percentage: 1.30% + +$ ./thp_swap_allocator_test +./thp_swap_allocator_test +Iteration 1: swpout inc: 233, swpout fallback inc: 0, Fallback percentage: 0.00% +Iteration 2: swpout inc: 131, swpout fallback inc: 101, Fallback percentage: 43.53% +Iteration 3: swpout inc: 71, swpout fallback inc: 155, Fallback percentage: 68.58% +Iteration 4: swpout inc: 55, swpout fallback inc: 168, Fallback percentage: 75.34% +Iteration 5: swpout inc: 35, swpout fallback inc: 191, Fallback percentage: 84.51% +Iteration 6: swpout inc: 25, swpout fallback inc: 199, Fallback percentage: 88.84% +Iteration 7: swpout inc: 23, swpout fallback inc: 205, Fallback percentage: 89.91% +Iteration 8: swpout inc: 9, swpout fallback inc: 219, Fallback percentage: 96.05% +Iteration 9: swpout inc: 13, swpout fallback inc: 213, Fallback percentage: 94.25% +Iteration 10: swpout 
inc: 12, swpout fallback inc: 216, Fallback percentage: 94.74% +Iteration 11: swpout inc: 16, swpout fallback inc: 213, Fallback percentage: 93.01% +Iteration 12: swpout inc: 10, swpout fallback inc: 210, Fallback percentage: 95.45% +Iteration 13: swpout inc: 16, swpout fallback inc: 212, Fallback percentage: 92.98% +Iteration 14: swpout inc: 12, swpout fallback inc: 212, Fallback percentage: 94.64% +Iteration 15: swpout inc: 15, swpout fallback inc: 211, Fallback percentage: 93.36% +Iteration 16: swpout inc: 15, swpout fallback inc: 200, Fallback percentage: 93.02% +Iteration 17: swpout inc: 9, swpout fallback inc: 220, Fallback percentage: 96.07% + +$ ./thp_swap_allocator_test -s + ./thp_swap_allocator_test -s +Iteration 1: swpout inc: 233, swpout fallback inc: 0, Fallback percentage: 0.00% +Iteration 2: swpout inc: 97, swpout fallback inc: 135, Fallback percentage: 58.19% +Iteration 3: swpout inc: 42, swpout fallback inc: 192, Fallback percentage: 82.05% +Iteration 4: swpout inc: 19, swpout fallback inc: 214, Fallback percentage: 91.85% +Iteration 5: swpout inc: 12, swpout fallback inc: 213, Fallback percentage: 94.67% +Iteration 6: swpout inc: 11, swpout fallback inc: 217, Fallback percentage: 95.18% +Iteration 7: swpout inc: 9, swpout fallback inc: 214, Fallback percentage: 95.96% +Iteration 8: swpout inc: 8, swpout fallback inc: 213, Fallback percentage: 96.38% +Iteration 9: swpout inc: 2, swpout fallback inc: 223, Fallback percentage: 99.11% +Iteration 10: swpout inc: 2, swpout fallback inc: 228, Fallback percentage: 99.13% +Iteration 11: swpout inc: 4, swpout fallback inc: 214, Fallback percentage: 98.17% +Iteration 12: swpout inc: 5, swpout fallback inc: 226, Fallback percentage: 97.84% +Iteration 13: swpout inc: 3, swpout fallback inc: 212, Fallback percentage: 98.60% +Iteration 14: swpout inc: 0, swpout fallback inc: 222, Fallback percentage: 100.00% +Iteration 15: swpout inc: 3, swpout fallback inc: 222, Fallback percentage: 98.67% +Iteration 16: swpout inc: 4, swpout fallback inc: 223, Fallback percentage: 98.24% + +========= +Kernel compile under tmpfs with cgroup memory.max = 470M. +12 core 24 hyperthreading, 32 jobs. 10 Run each group + +SSD swap 10 runs average, 20G swap partition: +With: +user 2929.064 +system 1479.381 : 1376.89 1398.22 1444.64 1477.39 1479.04 1497.27 +1504.47 1531.4 1532.92 1551.57 +real 1441.324 + +Without: +user 2910.872 +system 1482.732 : 1440.01 1451.4 1462.01 1467.47 1467.51 1469.3 +1470.19 1496.32 1544.1 1559.01 +real 1580.822 + +Two zram swap: zram0 3.0G zram1 20G. + +The idea is forcing the zram0 almost full then overflow to zram1: + +With: +user 4320.301 +system 4272.403 : 4236.24 4262.81 4264.75 4269.13 4269.44 4273.06 +4279.85 4285.98 4289.64 4293.13 +real 431.759 + +Without +user 4301.393 +system 4387.672 : 4374.47 4378.3 4380.95 4382.84 4383.06 4388.05 +4389.76 4397.16 4398.23 4403.9 +real 433.979 + +------ more test result from Kaiui ---------- + +Test with build linux kernel using a 4G ZRAM, 1G memory.max limit on top of shmem: + +System info: 32 Core AMD Zen2, 64G total memory. 
+ +Test 3 times using only 4K pages: +================================= + +With: +----- +1838.74user 2411.21system 2:37.86elapsed 2692%CPU (0avgtext+0avgdata 847060maxresident)k +1839.86user 2465.77system 2:39.35elapsed 2701%CPU (0avgtext+0avgdata 847060maxresident)k +1840.26user 2454.68system 2:39.43elapsed 2693%CPU (0avgtext+0avgdata 847060maxresident)k + +Summary (~4.6% improment of system time): +User: 1839.62 +System: 2443.89: 2465.77 2454.68 2411.21 +Real: 158.88 + +Without: +-------- +1837.99user 2575.95system 2:43.09elapsed 2706%CPU (0avgtext+0avgdata 846520maxresident)k +1838.32user 2555.15system 2:42.52elapsed 2709%CPU (0avgtext+0avgdata 846520maxresident)k +1843.02user 2561.55system 2:43.35elapsed 2702%CPU (0avgtext+0avgdata 846520maxresident)k + +Summary: +User: 1839.78 +System: 2564.22: 2575.95 2555.15 2561.55 +Real: 162.99 + +Test 5 times using enabled all mTHP pages: +========================================== + +With: +----- +1796.44user 2937.33system 2:59.09elapsed 2643%CPU (0avgtext+0avgdata 846936maxresident)k +1802.55user 3002.32system 2:54.68elapsed 2750%CPU (0avgtext+0avgdata 847072maxresident)k +1806.59user 2986.53system 2:55.17elapsed 2736%CPU (0avgtext+0avgdata 847092maxresident)k +1803.27user 2982.40system 2:54.49elapsed 2742%CPU (0avgtext+0avgdata 846796maxresident)k +1807.43user 3036.08system 2:56.06elapsed 2751%CPU (0avgtext+0avgdata 846488maxresident)k + +Summary (~8.4% improvement of system time): +User: 1803.25 +System: 2988.93: 2937.33 3002.32 2986.53 2982.40 3036.08 +Real: 175.90 + +mTHP swapout status: +/sys/kernel/mm/transparent_hugepage/hugepages-32kB/stats/swpout:347721 +/sys/kernel/mm/transparent_hugepage/hugepages-32kB/stats/swpout_fallback:3110 +/sys/kernel/mm/transparent_hugepage/hugepages-512kB/stats/swpout:3365 +/sys/kernel/mm/transparent_hugepage/hugepages-512kB/stats/swpout_fallback:8269 +/sys/kernel/mm/transparent_hugepage/hugepages-2048kB/stats/swpout:24 +/sys/kernel/mm/transparent_hugepage/hugepages-2048kB/stats/swpout_fallback:3341 +/sys/kernel/mm/transparent_hugepage/hugepages-1024kB/stats/swpout:145 +/sys/kernel/mm/transparent_hugepage/hugepages-1024kB/stats/swpout_fallback:5038 +/sys/kernel/mm/transparent_hugepage/hugepages-64kB/stats/swpout:322737 +/sys/kernel/mm/transparent_hugepage/hugepages-64kB/stats/swpout_fallback:36808 +/sys/kernel/mm/transparent_hugepage/hugepages-16kB/stats/swpout:380455 +/sys/kernel/mm/transparent_hugepage/hugepages-16kB/stats/swpout_fallback:1010 +/sys/kernel/mm/transparent_hugepage/hugepages-256kB/stats/swpout:24973 +/sys/kernel/mm/transparent_hugepage/hugepages-256kB/stats/swpout_fallback:13223 +/sys/kernel/mm/transparent_hugepage/hugepages-128kB/stats/swpout:197348 +/sys/kernel/mm/transparent_hugepage/hugepages-128kB/stats/swpout_fallback:80541 + +Without: +-------- +1794.41user 3151.29system 3:05.97elapsed 2659%CPU (0avgtext+0avgdata 846704maxresident)k +1810.27user 3304.48system 3:05.38elapsed 2759%CPU (0avgtext+0avgdata 846636maxresident)k +1809.84user 3254.85system 3:03.83elapsed 2755%CPU (0avgtext+0avgdata 846952maxresident)k +1813.54user 3259.56system 3:04.28elapsed 2752%CPU (0avgtext+0avgdata 846848maxresident)k +1829.97user 3338.40system 3:07.32elapsed 2759%CPU (0avgtext+0avgdata 847024maxresident)k + +Summary: +User: 1811.61 +System: 3261.72 : 3151.29 3304.48 3254.85 3259.56 3338.40 +Real: 185.356 + +mTHP swapout status: +hugepages-32kB/stats/swpout:35630 +hugepages-32kB/stats/swpout_fallback:1809908 +hugepages-512kB/stats/swpout:523 +hugepages-512kB/stats/swpout_fallback:55235 
+hugepages-2048kB/stats/swpout:53 +hugepages-2048kB/stats/swpout_fallback:17264 +hugepages-1024kB/stats/swpout:85 +hugepages-1024kB/stats/swpout_fallback:24979 +hugepages-64kB/stats/swpout:30117 +hugepages-64kB/stats/swpout_fallback:1825399 +hugepages-16kB/stats/swpout:42775 +hugepages-16kB/stats/swpout_fallback:1951123 +hugepages-256kB/stats/swpout:2326 +hugepages-256kB/stats/swpout_fallback:170165 +hugepages-128kB/stats/swpout:17925 +hugepages-128kB/stats/swpout_fallback:1309757 + +This patch (of 9): + +Previously, the swap cluster used a cluster index as a pointer to +construct a custom single link list type "swap_cluster_list". The next +cluster pointer is shared with the cluster->count. It prevents puting the +non free cluster into a list. + +Change the cluster to use the standard double link list instead. This +allows tracing the nonfull cluster in the follow up patch. That way, it +is faster to get to the nonfull cluster of that order. + +Remove the cluster getter/setter for accessing the cluster struct member. + +The list operation is protected by the swap_info_struct->lock. + +Change cluster code to use "struct swap_cluster_info *" to reference the +cluster rather than by using index. That is more consistent with the list +manipulation. It avoids the repeat adding index to the cluser_info. The +code is easier to understand. + +Remove the cluster next pointer is NULL flag, the double link list can +handle the empty list pretty well. + +The "swap_cluster_info" struct is two pointer bigger, because 512 swap +entries share one swap_cluster_info struct, it has very little impact on +the average memory usage per swap entry. For 1TB swapfile, the swap +cluster data structure increases from 8MB to 24MB. + +Other than the list conversion, there is no real function change in this +patch. + +Link: https://lkml.kernel.org/r/20240730-swap-allocator-v5-0-cb9c148b9297@kernel.org +Link: https://lkml.kernel.org/r/20240730-swap-allocator-v5-1-cb9c148b9297@kernel.org +Signed-off-by: Chris Li +Reported-by: Barry Song <21cnbao@gmail.com> +Reviewed-by: "Huang, Ying" +Cc: Hugh Dickins +Cc: Kairui Song +Cc: Kalesh Singh +Cc: Ryan Roberts +Signed-off-by: Andrew Morton +Conflicts: + include/linux/swap.h +Signed-off-by: Liu Shixin +--- + include/linux/swap.h | 25 ++--- + mm/swapfile.c | 226 ++++++++++++------------------------------- + 2 files changed, 71 insertions(+), 180 deletions(-) + +diff --git a/include/linux/swap.h b/include/linux/swap.h +index bea0c0f1f640..94e1b6bb04ce 100644 +--- a/include/linux/swap.h ++++ b/include/linux/swap.h +@@ -255,22 +255,20 @@ enum { + * free clusters are organized into a list. We fetch an entry from the list to + * get a free cluster. + * +- * The data field stores next cluster if the cluster is free or cluster usage +- * counter otherwise. The flags field determines if a cluster is free. This is +- * protected by swap_info_struct.lock. ++ * The flags field determines if a cluster is free. This is ++ * protected by cluster lock. + */ + struct swap_cluster_info { + spinlock_t lock; /* + * Protect swap_cluster_info fields +- * and swap_info_struct->swap_map +- * elements correspond to the swap +- * cluster ++ * other than list, and swap_info_struct->swap_map ++ * elements corresponding to the swap cluster. 
+ */ +- unsigned int data:24; +- unsigned int flags:8; ++ u16 count; ++ u8 flags; ++ struct list_head list; + }; + #define CLUSTER_FLAG_FREE 1 /* This cluster is free */ +-#define CLUSTER_FLAG_NEXT_NULL 2 /* This cluster has no next cluster */ + + /* + * The first page in the swap file is the swap header, which is always marked +@@ -295,11 +293,6 @@ struct percpu_cluster { + unsigned int next[SWAP_NR_ORDERS]; /* Likely next allocation offset */ + }; + +-struct swap_cluster_list { +- struct swap_cluster_info head; +- struct swap_cluster_info tail; +-}; +- + /* + * The in-memory structure used to track swap areas. + */ +@@ -312,7 +305,7 @@ struct swap_info_struct { + unsigned int max; /* extent of the swap_map */ + unsigned char *swap_map; /* vmalloc'ed array of usage counts */ + struct swap_cluster_info *cluster_info; /* cluster info. Only for SSD */ +- struct swap_cluster_list free_clusters; /* free clusters list */ ++ struct list_head free_clusters; /* free clusters list */ + unsigned int lowest_bit; /* index of first free in swap_map */ + unsigned int highest_bit; /* index of last free in swap_map */ + unsigned int pages; /* total of usable pages of swap */ +@@ -345,7 +338,7 @@ struct swap_info_struct { + * list. + */ + struct work_struct discard_work; /* discard worker */ +- struct swap_cluster_list discard_clusters; /* discard clusters list */ ++ struct list_head discard_clusters; /* discard clusters list */ + KABI_RESERVE(1) + KABI_RESERVE(2) + KABI_RESERVE(3) +diff --git a/mm/swapfile.c b/mm/swapfile.c +index 30832b85d6c2..76b344438606 100644 +--- a/mm/swapfile.c ++++ b/mm/swapfile.c +@@ -289,62 +289,15 @@ static void discard_swap_cluster(struct swap_info_struct *si, + #endif + #define LATENCY_LIMIT 256 + +-static inline void cluster_set_flag(struct swap_cluster_info *info, +- unsigned int flag) +-{ +- info->flags = flag; +-} +- +-static inline unsigned int cluster_count(struct swap_cluster_info *info) +-{ +- return info->data; +-} +- +-static inline void cluster_set_count(struct swap_cluster_info *info, +- unsigned int c) +-{ +- info->data = c; +-} +- +-static inline void cluster_set_count_flag(struct swap_cluster_info *info, +- unsigned int c, unsigned int f) +-{ +- info->flags = f; +- info->data = c; +-} +- +-static inline unsigned int cluster_next(struct swap_cluster_info *info) +-{ +- return info->data; +-} +- +-static inline void cluster_set_next(struct swap_cluster_info *info, +- unsigned int n) +-{ +- info->data = n; +-} +- +-static inline void cluster_set_next_flag(struct swap_cluster_info *info, +- unsigned int n, unsigned int f) +-{ +- info->flags = f; +- info->data = n; +-} +- + static inline bool cluster_is_free(struct swap_cluster_info *info) + { + return info->flags & CLUSTER_FLAG_FREE; + } + +-static inline bool cluster_is_null(struct swap_cluster_info *info) +-{ +- return info->flags & CLUSTER_FLAG_NEXT_NULL; +-} +- +-static inline void cluster_set_null(struct swap_cluster_info *info) ++static inline unsigned int cluster_index(struct swap_info_struct *si, ++ struct swap_cluster_info *ci) + { +- info->flags = CLUSTER_FLAG_NEXT_NULL; +- info->data = 0; ++ return ci - si->cluster_info; + } + + static inline struct swap_cluster_info *lock_cluster(struct swap_info_struct *si, +@@ -393,65 +346,11 @@ static inline void unlock_cluster_or_swap_info(struct swap_info_struct *si, + spin_unlock(&si->lock); + } + +-static inline bool cluster_list_empty(struct swap_cluster_list *list) +-{ +- return cluster_is_null(&list->head); +-} +- +-static inline unsigned int 
cluster_list_first(struct swap_cluster_list *list) +-{ +- return cluster_next(&list->head); +-} +- +-static void cluster_list_init(struct swap_cluster_list *list) +-{ +- cluster_set_null(&list->head); +- cluster_set_null(&list->tail); +-} +- +-static void cluster_list_add_tail(struct swap_cluster_list *list, +- struct swap_cluster_info *ci, +- unsigned int idx) +-{ +- if (cluster_list_empty(list)) { +- cluster_set_next_flag(&list->head, idx, 0); +- cluster_set_next_flag(&list->tail, idx, 0); +- } else { +- struct swap_cluster_info *ci_tail; +- unsigned int tail = cluster_next(&list->tail); +- +- /* +- * Nested cluster lock, but both cluster locks are +- * only acquired when we held swap_info_struct->lock +- */ +- ci_tail = ci + tail; +- spin_lock_nested(&ci_tail->lock, SINGLE_DEPTH_NESTING); +- cluster_set_next(ci_tail, idx); +- spin_unlock(&ci_tail->lock); +- cluster_set_next_flag(&list->tail, idx, 0); +- } +-} +- +-static unsigned int cluster_list_del_first(struct swap_cluster_list *list, +- struct swap_cluster_info *ci) +-{ +- unsigned int idx; +- +- idx = cluster_next(&list->head); +- if (cluster_next(&list->tail) == idx) { +- cluster_set_null(&list->head); +- cluster_set_null(&list->tail); +- } else +- cluster_set_next_flag(&list->head, +- cluster_next(&ci[idx]), 0); +- +- return idx; +-} +- + /* Add a cluster to discard list and schedule it to do discard */ + static void swap_cluster_schedule_discard(struct swap_info_struct *si, +- unsigned int idx) ++ struct swap_cluster_info *ci) + { ++ unsigned int idx = cluster_index(si, ci); + /* + * If scan_swap_map_slots() can't find a free cluster, it will check + * si->swap_map directly. To make sure the discarding cluster isn't +@@ -461,17 +360,14 @@ static void swap_cluster_schedule_discard(struct swap_info_struct *si, + memset(si->swap_map + idx * SWAPFILE_CLUSTER, + SWAP_MAP_BAD, SWAPFILE_CLUSTER); + +- cluster_list_add_tail(&si->discard_clusters, si->cluster_info, idx); +- ++ list_add_tail(&ci->list, &si->discard_clusters); + schedule_work(&si->discard_work); + } + +-static void __free_cluster(struct swap_info_struct *si, unsigned long idx) ++static void __free_cluster(struct swap_info_struct *si, struct swap_cluster_info *ci) + { +- struct swap_cluster_info *ci = si->cluster_info; +- +- cluster_set_flag(ci + idx, CLUSTER_FLAG_FREE); +- cluster_list_add_tail(&si->free_clusters, ci, idx); ++ ci->flags = CLUSTER_FLAG_FREE; ++ list_add_tail(&ci->list, &si->free_clusters); + } + + /* +@@ -480,24 +376,25 @@ static void __free_cluster(struct swap_info_struct *si, unsigned long idx) + */ + static void swap_do_scheduled_discard(struct swap_info_struct *si) + { +- struct swap_cluster_info *info, *ci; ++ struct swap_cluster_info *ci; + unsigned int idx; + +- info = si->cluster_info; +- +- while (!cluster_list_empty(&si->discard_clusters)) { +- idx = cluster_list_del_first(&si->discard_clusters, info); ++ while (!list_empty(&si->discard_clusters)) { ++ ci = list_first_entry(&si->discard_clusters, struct swap_cluster_info, list); ++ list_del(&ci->list); ++ idx = cluster_index(si, ci); + spin_unlock(&si->lock); + + discard_swap_cluster(si, idx * SWAPFILE_CLUSTER, + SWAPFILE_CLUSTER); + + spin_lock(&si->lock); +- ci = lock_cluster(si, idx * SWAPFILE_CLUSTER); +- __free_cluster(si, idx); ++ ++ spin_lock(&ci->lock); ++ __free_cluster(si, ci); + memset(si->swap_map + idx * SWAPFILE_CLUSTER, + 0, SWAPFILE_CLUSTER); +- unlock_cluster(ci); ++ spin_unlock(&ci->lock); + } + } + +@@ -520,20 +417,21 @@ static void swap_users_ref_free(struct percpu_ref *ref) + 
complete(&si->comp); + } + +-static void alloc_cluster(struct swap_info_struct *si, unsigned long idx) ++static struct swap_cluster_info *alloc_cluster(struct swap_info_struct *si, unsigned long idx) + { +- struct swap_cluster_info *ci = si->cluster_info; ++ struct swap_cluster_info *ci = list_first_entry(&si->free_clusters, ++ struct swap_cluster_info, list); + +- VM_BUG_ON(cluster_list_first(&si->free_clusters) != idx); +- cluster_list_del_first(&si->free_clusters, ci); +- cluster_set_count_flag(ci + idx, 0, 0); ++ VM_BUG_ON(cluster_index(si, ci) != idx); ++ list_del(&ci->list); ++ ci->count = 0; ++ ci->flags = 0; ++ return ci; + } + +-static void free_cluster(struct swap_info_struct *si, unsigned long idx) ++static void free_cluster(struct swap_info_struct *si, struct swap_cluster_info *ci) + { +- struct swap_cluster_info *ci = si->cluster_info + idx; +- +- VM_BUG_ON(cluster_count(ci) != 0); ++ VM_BUG_ON(ci->count != 0); + /* + * If the swap is discardable, prepare discard the cluster + * instead of free it immediately. The cluster will be freed +@@ -541,11 +439,11 @@ static void free_cluster(struct swap_info_struct *si, unsigned long idx) + */ + if ((si->flags & (SWP_WRITEOK | SWP_PAGE_DISCARD)) == + (SWP_WRITEOK | SWP_PAGE_DISCARD)) { +- swap_cluster_schedule_discard(si, idx); ++ swap_cluster_schedule_discard(si, ci); + return; + } + +- __free_cluster(si, idx); ++ __free_cluster(si, ci); + } + + /* +@@ -558,15 +456,15 @@ static void add_cluster_info_page(struct swap_info_struct *p, + unsigned long count) + { + unsigned long idx = page_nr / SWAPFILE_CLUSTER; ++ struct swap_cluster_info *ci = cluster_info + idx; + + if (!cluster_info) + return; +- if (cluster_is_free(&cluster_info[idx])) ++ if (cluster_is_free(ci)) + alloc_cluster(p, idx); + +- VM_BUG_ON(cluster_count(&cluster_info[idx]) + count > SWAPFILE_CLUSTER); +- cluster_set_count(&cluster_info[idx], +- cluster_count(&cluster_info[idx]) + count); ++ VM_BUG_ON(ci->count + count > SWAPFILE_CLUSTER); ++ ci->count += count; + } + + /* +@@ -580,24 +478,20 @@ static void inc_cluster_info_page(struct swap_info_struct *p, + } + + /* +- * The cluster corresponding to page_nr decreases one usage. If the usage +- * counter becomes 0, which means no page in the cluster is in using, we can +- * optionally discard the cluster and add it to free cluster list. ++ * The cluster ci decreases one usage. If the usage counter becomes 0, ++ * which means no page in the cluster is in use, we can optionally discard ++ * the cluster and add it to free cluster list. 
+ */ +-static void dec_cluster_info_page(struct swap_info_struct *p, +- struct swap_cluster_info *cluster_info, unsigned long page_nr) ++static void dec_cluster_info_page(struct swap_info_struct *p, struct swap_cluster_info *ci) + { +- unsigned long idx = page_nr / SWAPFILE_CLUSTER; +- +- if (!cluster_info) ++ if (!p->cluster_info) + return; + +- VM_BUG_ON(cluster_count(&cluster_info[idx]) == 0); +- cluster_set_count(&cluster_info[idx], +- cluster_count(&cluster_info[idx]) - 1); ++ VM_BUG_ON(ci->count == 0); ++ ci->count--; + +- if (cluster_count(&cluster_info[idx]) == 0) +- free_cluster(p, idx); ++ if (!ci->count) ++ free_cluster(p, ci); + } + + /* +@@ -610,10 +504,12 @@ scan_swap_map_ssd_cluster_conflict(struct swap_info_struct *si, + { + struct percpu_cluster *percpu_cluster; + bool conflict; ++ struct swap_cluster_info *first = list_first_entry(&si->free_clusters, ++ struct swap_cluster_info, list); + + offset /= SWAPFILE_CLUSTER; +- conflict = !cluster_list_empty(&si->free_clusters) && +- offset != cluster_list_first(&si->free_clusters) && ++ conflict = !list_empty(&si->free_clusters) && ++ offset != cluster_index(si, first) && + cluster_is_free(&si->cluster_info[offset]); + + if (!conflict) +@@ -654,10 +550,10 @@ static bool scan_swap_map_try_ssd_cluster(struct swap_info_struct *si, + cluster = this_cpu_ptr(si->percpu_cluster); + tmp = cluster->next[order]; + if (tmp == SWAP_NEXT_INVALID) { +- if (!cluster_list_empty(&si->free_clusters)) { +- tmp = cluster_next(&si->free_clusters.head) * +- SWAPFILE_CLUSTER; +- } else if (!cluster_list_empty(&si->discard_clusters)) { ++ if (!list_empty(&si->free_clusters)) { ++ ci = list_first_entry(&si->free_clusters, struct swap_cluster_info, list); ++ tmp = cluster_index(si, ci) * SWAPFILE_CLUSTER; ++ } else if (!list_empty(&si->discard_clusters)) { + /* + * we don't have free cluster but have some clusters in + * discarding, do discard now and reclaim them, then +@@ -1055,8 +951,9 @@ static void swap_free_cluster(struct swap_info_struct *si, unsigned long idx) + + ci = lock_cluster(si, offset); + memset(si->swap_map + offset, 0, SWAPFILE_CLUSTER); +- cluster_set_count_flag(ci, 0, 0); +- free_cluster(si, idx); ++ ci->count = 0; ++ ci->flags = 0; ++ free_cluster(si, ci); + unlock_cluster(ci); + swap_range_free(si, offset, SWAPFILE_CLUSTER); + } +@@ -1418,7 +1315,7 @@ static void swap_entry_free(struct swap_info_struct *p, swp_entry_t entry) + count = p->swap_map[offset]; + VM_BUG_ON(count != SWAP_HAS_CACHE); + p->swap_map[offset] = 0; +- dec_cluster_info_page(p, p->cluster_info, offset); ++ dec_cluster_info_page(p, ci); + unlock_cluster(ci); + + mem_cgroup_uncharge_swap(entry, 1); +@@ -3113,8 +3010,8 @@ static int setup_swap_map_and_extents(struct swap_info_struct *p, + + nr_good_pages = maxpages - 1; /* omit header page */ + +- cluster_list_init(&p->free_clusters); +- cluster_list_init(&p->discard_clusters); ++ INIT_LIST_HEAD(&p->free_clusters); ++ INIT_LIST_HEAD(&p->discard_clusters); + + for (i = 0; i < swap_header->info.nr_badpages; i++) { + unsigned int page_nr = swap_header->info.badpages[i]; +@@ -3165,14 +3062,15 @@ static int setup_swap_map_and_extents(struct swap_info_struct *p, + for (k = 0; k < SWAP_CLUSTER_COLS; k++) { + j = (k + col) % SWAP_CLUSTER_COLS; + for (i = 0; i < DIV_ROUND_UP(nr_clusters, SWAP_CLUSTER_COLS); i++) { ++ struct swap_cluster_info *ci; + idx = i * SWAP_CLUSTER_COLS + j; ++ ci = cluster_info + idx; + if (idx >= nr_clusters) + continue; +- if (cluster_count(&cluster_info[idx])) ++ if (ci->count) + continue; +- 
cluster_set_flag(&cluster_info[idx], CLUSTER_FLAG_FREE); +- cluster_list_add_tail(&p->free_clusters, cluster_info, +- idx); ++ ci->flags = CLUSTER_FLAG_FREE; ++ list_add_tail(&ci->list, &p->free_clusters); + } + } + return nr_extents; +-- +Gitee + + +From 3bc5a5e67c63e14fe1342ed16ecb304cf60d94b3 Mon Sep 17 00:00:00 2001 +From: Chris Li +Date: Wed, 18 Dec 2024 17:51:08 +0800 +Subject: [PATCH 03/14] mm: swap: mTHP allocate swap entries from nonfull list +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +mainline inclusion +from mainline-v6.12-rc1 +commit d07a46a4ac18786e7f4c98fb08525ed80dd1f642 +category: performance +bugzilla: https://gitee.com/openeuler/kernel/issues/IBC5I1 + +Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=d07a46a4ac18786e7f4c98fb08525ed80dd1f642 + +-------------------------------- + +Track the nonfull cluster as well as the empty cluster on lists. Each +order has one nonfull cluster list. + +The cluster will remember which order it was used during new cluster +allocation. + +When the cluster has free entry, add to the nonfull[order] list.  When +the free cluster list is empty, also allocate from the nonempty list of +that order. + +This improves the mTHP swap allocation success rate. + +There are limitations if the distribution of numbers of different orders +of mTHP changes a lot. e.g. there are a lot of nonfull cluster assign to +order A while later time there are a lot of order B allocation while very +little allocation in order A. Currently the cluster used by order A will +not reused by order B unless the cluster is 100% empty. + +Link: https://lkml.kernel.org/r/20240730-swap-allocator-v5-2-cb9c148b9297@kernel.org +Signed-off-by: Chris Li +Reported-by: Barry Song <21cnbao@gmail.com> +Cc: "Huang, Ying" +Cc: Hugh Dickins +Cc: Kairui Song +Cc: Kalesh Singh +Cc: Ryan Roberts +Signed-off-by: Andrew Morton +Signed-off-by: Liu Shixin +--- + include/linux/swap.h | 4 ++++ + mm/swapfile.c | 38 +++++++++++++++++++++++++++++++++++--- + 2 files changed, 39 insertions(+), 3 deletions(-) + +diff --git a/include/linux/swap.h b/include/linux/swap.h +index 94e1b6bb04ce..29a1daa46421 100644 +--- a/include/linux/swap.h ++++ b/include/linux/swap.h +@@ -266,9 +266,11 @@ struct swap_cluster_info { + */ + u16 count; + u8 flags; ++ u8 order; + struct list_head list; + }; + #define CLUSTER_FLAG_FREE 1 /* This cluster is free */ ++#define CLUSTER_FLAG_NONFULL 2 /* This cluster is on nonfull list */ + + /* + * The first page in the swap file is the swap header, which is always marked +@@ -306,6 +308,8 @@ struct swap_info_struct { + unsigned char *swap_map; /* vmalloc'ed array of usage counts */ + struct swap_cluster_info *cluster_info; /* cluster info. 
Only for SSD */ + struct list_head free_clusters; /* free clusters list */ ++ struct list_head nonfull_clusters[SWAP_NR_ORDERS]; ++ /* list of cluster that contains at least one free slot */ + unsigned int lowest_bit; /* index of first free in swap_map */ + unsigned int highest_bit; /* index of last free in swap_map */ + unsigned int pages; /* total of usable pages of swap */ +diff --git a/mm/swapfile.c b/mm/swapfile.c +index 76b344438606..adde6877c0fe 100644 +--- a/mm/swapfile.c ++++ b/mm/swapfile.c +@@ -360,14 +360,22 @@ static void swap_cluster_schedule_discard(struct swap_info_struct *si, + memset(si->swap_map + idx * SWAPFILE_CLUSTER, + SWAP_MAP_BAD, SWAPFILE_CLUSTER); + +- list_add_tail(&ci->list, &si->discard_clusters); ++ VM_BUG_ON(ci->flags & CLUSTER_FLAG_FREE); ++ if (ci->flags & CLUSTER_FLAG_NONFULL) ++ list_move_tail(&ci->list, &si->discard_clusters); ++ else ++ list_add_tail(&ci->list, &si->discard_clusters); ++ ci->flags = 0; + schedule_work(&si->discard_work); + } + + static void __free_cluster(struct swap_info_struct *si, struct swap_cluster_info *ci) + { ++ if (ci->flags & CLUSTER_FLAG_NONFULL) ++ list_move_tail(&ci->list, &si->free_clusters); ++ else ++ list_add_tail(&ci->list, &si->free_clusters); + ci->flags = CLUSTER_FLAG_FREE; +- list_add_tail(&ci->list, &si->free_clusters); + } + + /* +@@ -490,8 +498,15 @@ static void dec_cluster_info_page(struct swap_info_struct *p, struct swap_cluste + VM_BUG_ON(ci->count == 0); + ci->count--; + +- if (!ci->count) ++ if (!ci->count) { + free_cluster(p, ci); ++ return; ++ } ++ ++ if (!(ci->flags & CLUSTER_FLAG_NONFULL)) { ++ list_add_tail(&ci->list, &p->nonfull_clusters[ci->order]); ++ ci->flags |= CLUSTER_FLAG_NONFULL; ++ } + } + + /* +@@ -552,6 +567,19 @@ static bool scan_swap_map_try_ssd_cluster(struct swap_info_struct *si, + if (tmp == SWAP_NEXT_INVALID) { + if (!list_empty(&si->free_clusters)) { + ci = list_first_entry(&si->free_clusters, struct swap_cluster_info, list); ++ list_del(&ci->list); ++ spin_lock(&ci->lock); ++ ci->order = order; ++ ci->flags = 0; ++ spin_unlock(&ci->lock); ++ tmp = cluster_index(si, ci) * SWAPFILE_CLUSTER; ++ } else if (!list_empty(&si->nonfull_clusters[order])) { ++ ci = list_first_entry(&si->nonfull_clusters[order], ++ struct swap_cluster_info, list); ++ list_del(&ci->list); ++ spin_lock(&ci->lock); ++ ci->flags = 0; ++ spin_unlock(&ci->lock); + tmp = cluster_index(si, ci) * SWAPFILE_CLUSTER; + } else if (!list_empty(&si->discard_clusters)) { + /* +@@ -952,6 +980,7 @@ static void swap_free_cluster(struct swap_info_struct *si, unsigned long idx) + ci = lock_cluster(si, offset); + memset(si->swap_map + offset, 0, SWAPFILE_CLUSTER); + ci->count = 0; ++ ci->order = 0; + ci->flags = 0; + free_cluster(si, ci); + unlock_cluster(ci); +@@ -3013,6 +3042,9 @@ static int setup_swap_map_and_extents(struct swap_info_struct *p, + INIT_LIST_HEAD(&p->free_clusters); + INIT_LIST_HEAD(&p->discard_clusters); + ++ for (i = 0; i < SWAP_NR_ORDERS; i++) ++ INIT_LIST_HEAD(&p->nonfull_clusters[i]); ++ + for (i = 0; i < swap_header->info.nr_badpages; i++) { + unsigned int page_nr = swap_header->info.badpages[i]; + if (page_nr == 0 || page_nr > swap_header->info.last_page) +-- +Gitee + + +From 71c1b6bdf4681e292a269a16337b6fbf64c388d6 Mon Sep 17 00:00:00 2001 +From: Chris Li +Date: Wed, 18 Dec 2024 17:51:09 +0800 +Subject: [PATCH 04/14] mm: swap: separate SSD allocation from + scan_swap_map_slots() + +mainline inclusion +from mainline-v6.12-rc1 +commit 5f843a9a3a1e865fbf349419bde39977c2e7d3d1 +category: performance +bugzilla: 
https://gitee.com/openeuler/kernel/issues/IBC5I1 + +Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=5f843a9a3a1e865fbf349419bde39977c2e7d3d1 + +-------------------------------- + +Previously the SSD and HDD share the same swap_map scan loop in +scan_swap_map_slots(). This function is complex and hard to flow the +execution flow. + +scan_swap_map_try_ssd_cluster() can already do most of the heavy lifting +to locate the candidate swap range in the cluster. However it needs to go +back to scan_swap_map_slots() to check conflict and then perform the +allocation. + +When scan_swap_map_try_ssd_cluster() failed, it still depended on the +scan_swap_map_slots() to do brute force scanning of the swap_map. When +the swapfile is large and almost full, it will take some CPU time to go +through the swap_map array. + +Get rid of the cluster allocation dependency on the swap_map scan loop in +scan_swap_map_slots(). Streamline the cluster allocation code path. No +more conflict checks. + +For order 0 swap entry, when run out of free and nonfull list. It will +allocate from the higher order nonfull cluster list. + +Users should see less CPU time spent on searching the free swap slot when +swapfile is almost full. + +[ryncsn@gmail.com: fix array-bounds error with CONFIG_THP_SWAP=n] + Link: https://lkml.kernel.org/r/CAMgjq7Bz0DY+rY0XgCoH7-Q=uHLdo3omi8kUr4ePDweNyofsbQ@mail.gmail.com +Link: https://lkml.kernel.org/r/20240730-swap-allocator-v5-3-cb9c148b9297@kernel.org +Signed-off-by: Chris Li +Signed-off-by: Kairui Song +Reported-by: Barry Song <21cnbao@gmail.com> +Cc: "Huang, Ying" +Cc: Hugh Dickins +Cc: Kalesh Singh +Cc: Ryan Roberts +Signed-off-by: Andrew Morton +Signed-off-by: Liu Shixin +--- + mm/swapfile.c | 300 ++++++++++++++++++++++++++++---------------------- + 1 file changed, 168 insertions(+), 132 deletions(-) + +diff --git a/mm/swapfile.c b/mm/swapfile.c +index adde6877c0fe..a3e721510311 100644 +--- a/mm/swapfile.c ++++ b/mm/swapfile.c +@@ -52,6 +52,8 @@ + static bool swap_count_continued(struct swap_info_struct *, pgoff_t, + unsigned char); + static void free_swap_count_continuations(struct swap_info_struct *); ++static void swap_range_alloc(struct swap_info_struct *si, unsigned long offset, ++ unsigned int nr_entries); + + static DEFINE_SPINLOCK(swap_lock); + static unsigned int nr_swapfiles; +@@ -300,6 +302,12 @@ static inline unsigned int cluster_index(struct swap_info_struct *si, + return ci - si->cluster_info; + } + ++static inline unsigned int cluster_offset(struct swap_info_struct *si, ++ struct swap_cluster_info *ci) ++{ ++ return cluster_index(si, ci) * SWAPFILE_CLUSTER; ++} ++ + static inline struct swap_cluster_info *lock_cluster(struct swap_info_struct *si, + unsigned long offset) + { +@@ -371,11 +379,15 @@ static void swap_cluster_schedule_discard(struct swap_info_struct *si, + + static void __free_cluster(struct swap_info_struct *si, struct swap_cluster_info *ci) + { ++ lockdep_assert_held(&si->lock); ++ lockdep_assert_held(&ci->lock); ++ + if (ci->flags & CLUSTER_FLAG_NONFULL) + list_move_tail(&ci->list, &si->free_clusters); + else + list_add_tail(&ci->list, &si->free_clusters); + ci->flags = CLUSTER_FLAG_FREE; ++ ci->order = 0; + } + + /* +@@ -430,9 +442,11 @@ static struct swap_cluster_info *alloc_cluster(struct swap_info_struct *si, unsi + struct swap_cluster_info *ci = list_first_entry(&si->free_clusters, + struct swap_cluster_info, list); + ++ lockdep_assert_held(&si->lock); ++ lockdep_assert_held(&ci->lock); + VM_BUG_ON(cluster_index(si, 
ci) != idx); ++ VM_BUG_ON(ci->count); + list_del(&ci->list); +- ci->count = 0; + ci->flags = 0; + return ci; + } +@@ -440,6 +454,8 @@ static struct swap_cluster_info *alloc_cluster(struct swap_info_struct *si, unsi + static void free_cluster(struct swap_info_struct *si, struct swap_cluster_info *ci) + { + VM_BUG_ON(ci->count != 0); ++ lockdep_assert_held(&si->lock); ++ lockdep_assert_held(&ci->lock); + /* + * If the swap is discardable, prepare discard the cluster + * instead of free it immediately. The cluster will be freed +@@ -496,6 +512,9 @@ static void dec_cluster_info_page(struct swap_info_struct *p, struct swap_cluste + return; + + VM_BUG_ON(ci->count == 0); ++ VM_BUG_ON(cluster_is_free(ci)); ++ lockdep_assert_held(&p->lock); ++ lockdep_assert_held(&ci->lock); + ci->count--; + + if (!ci->count) { +@@ -504,48 +523,88 @@ static void dec_cluster_info_page(struct swap_info_struct *p, struct swap_cluste + } + + if (!(ci->flags & CLUSTER_FLAG_NONFULL)) { ++ VM_BUG_ON(ci->flags & CLUSTER_FLAG_FREE); + list_add_tail(&ci->list, &p->nonfull_clusters[ci->order]); +- ci->flags |= CLUSTER_FLAG_NONFULL; ++ ci->flags = CLUSTER_FLAG_NONFULL; + } + } + +-/* +- * It's possible scan_swap_map_slots() uses a free cluster in the middle of free +- * cluster list. Avoiding such abuse to avoid list corruption. +- */ +-static bool +-scan_swap_map_ssd_cluster_conflict(struct swap_info_struct *si, +- unsigned long offset, int order) ++static inline bool cluster_scan_range(struct swap_info_struct *si, unsigned int start, ++ unsigned int nr_pages) + { +- struct percpu_cluster *percpu_cluster; +- bool conflict; +- struct swap_cluster_info *first = list_first_entry(&si->free_clusters, +- struct swap_cluster_info, list); +- +- offset /= SWAPFILE_CLUSTER; +- conflict = !list_empty(&si->free_clusters) && +- offset != cluster_index(si, first) && +- cluster_is_free(&si->cluster_info[offset]); ++ unsigned char *p = si->swap_map + start; ++ unsigned char *end = p + nr_pages; + +- if (!conflict) +- return false; ++ while (p < end) ++ if (*p++) ++ return false; + +- percpu_cluster = this_cpu_ptr(si->percpu_cluster); +- percpu_cluster->next[order] = SWAP_NEXT_INVALID; + return true; + } + +-static inline bool swap_range_empty(char *swap_map, unsigned int start, +- unsigned int nr_pages) ++ ++static inline void cluster_alloc_range(struct swap_info_struct *si, struct swap_cluster_info *ci, ++ unsigned int start, unsigned char usage, ++ unsigned int order) + { +- unsigned int i; ++ unsigned int nr_pages = 1 << order; + +- for (i = 0; i < nr_pages; i++) { +- if (swap_map[start + i]) +- return false; ++ if (cluster_is_free(ci)) { ++ if (nr_pages < SWAPFILE_CLUSTER) { ++ list_move_tail(&ci->list, &si->nonfull_clusters[order]); ++ ci->flags = CLUSTER_FLAG_NONFULL; ++ } ++ ci->order = order; + } + +- return true; ++ memset(si->swap_map + start, usage, nr_pages); ++ swap_range_alloc(si, start, nr_pages); ++ ci->count += nr_pages; ++ ++ if (ci->count == SWAPFILE_CLUSTER) { ++ VM_BUG_ON(!(ci->flags & (CLUSTER_FLAG_FREE | CLUSTER_FLAG_NONFULL))); ++ list_del(&ci->list); ++ ci->flags = 0; ++ } ++} ++ ++static unsigned int alloc_swap_scan_cluster(struct swap_info_struct *si, unsigned long offset, ++ unsigned int *foundp, unsigned int order, ++ unsigned char usage) ++{ ++ unsigned long start = offset & ~(SWAPFILE_CLUSTER - 1); ++ unsigned long end = min(start + SWAPFILE_CLUSTER, si->max); ++ unsigned int nr_pages = 1 << order; ++ struct swap_cluster_info *ci; ++ ++ if (end < nr_pages) ++ return SWAP_NEXT_INVALID; ++ end -= nr_pages; ++ ++ 
ci = lock_cluster(si, offset); ++ if (ci->count + nr_pages > SWAPFILE_CLUSTER) { ++ offset = SWAP_NEXT_INVALID; ++ goto done; ++ } ++ ++ while (offset <= end) { ++ if (cluster_scan_range(si, offset, nr_pages)) { ++ cluster_alloc_range(si, ci, offset, usage, order); ++ *foundp = offset; ++ if (ci->count == SWAPFILE_CLUSTER) { ++ offset = SWAP_NEXT_INVALID; ++ goto done; ++ } ++ offset += nr_pages; ++ break; ++ } ++ offset += nr_pages; ++ } ++ if (offset > end) ++ offset = SWAP_NEXT_INVALID; ++done: ++ unlock_cluster(ci); ++ return offset; + } + + /* +@@ -553,72 +612,66 @@ static inline bool swap_range_empty(char *swap_map, unsigned int start, + * pool (a cluster). This might involve allocating a new cluster for current CPU + * too. + */ +-static bool scan_swap_map_try_ssd_cluster(struct swap_info_struct *si, +- unsigned long *offset, unsigned long *scan_base, int order) ++static unsigned long cluster_alloc_swap_entry(struct swap_info_struct *si, int order, ++ unsigned char usage) + { +- unsigned int nr_pages = 1 << order; + struct percpu_cluster *cluster; +- struct swap_cluster_info *ci; +- unsigned int tmp, max; ++ struct swap_cluster_info *ci, *n; ++ unsigned int offset, found = 0; + + new_cluster: ++ lockdep_assert_held(&si->lock); + cluster = this_cpu_ptr(si->percpu_cluster); +- tmp = cluster->next[order]; +- if (tmp == SWAP_NEXT_INVALID) { +- if (!list_empty(&si->free_clusters)) { +- ci = list_first_entry(&si->free_clusters, struct swap_cluster_info, list); +- list_del(&ci->list); +- spin_lock(&ci->lock); +- ci->order = order; +- ci->flags = 0; +- spin_unlock(&ci->lock); +- tmp = cluster_index(si, ci) * SWAPFILE_CLUSTER; +- } else if (!list_empty(&si->nonfull_clusters[order])) { +- ci = list_first_entry(&si->nonfull_clusters[order], +- struct swap_cluster_info, list); +- list_del(&ci->list); +- spin_lock(&ci->lock); +- ci->flags = 0; +- spin_unlock(&ci->lock); +- tmp = cluster_index(si, ci) * SWAPFILE_CLUSTER; +- } else if (!list_empty(&si->discard_clusters)) { +- /* +- * we don't have free cluster but have some clusters in +- * discarding, do discard now and reclaim them, then +- * reread cluster_next_cpu since we dropped si->lock +- */ +- swap_do_scheduled_discard(si); +- *scan_base = this_cpu_read(*si->cluster_next_cpu); +- *offset = *scan_base; +- goto new_cluster; +- } else +- return false; ++ offset = cluster->next[order]; ++ if (offset) { ++ offset = alloc_swap_scan_cluster(si, offset, &found, order, usage); ++ if (found) ++ goto done; + } + +- /* +- * Other CPUs can use our cluster if they can't find a free cluster, +- * check if there is still free entry in the cluster, maintaining +- * natural alignment. 
+- */ +- max = min_t(unsigned long, si->max, ALIGN(tmp + 1, SWAPFILE_CLUSTER)); +- if (tmp < max) { +- ci = lock_cluster(si, tmp); +- while (tmp < max) { +- if (swap_range_empty(si->swap_map, tmp, nr_pages)) +- break; +- tmp += nr_pages; ++ if (!list_empty(&si->free_clusters)) { ++ ci = list_first_entry(&si->free_clusters, struct swap_cluster_info, list); ++ offset = alloc_swap_scan_cluster(si, cluster_offset(si, ci), &found, order, usage); ++ VM_BUG_ON(!found); ++ goto done; ++ } ++ ++ if (order < PMD_ORDER) { ++ list_for_each_entry_safe(ci, n, &si->nonfull_clusters[order], list) { ++ offset = alloc_swap_scan_cluster(si, cluster_offset(si, ci), ++ &found, order, usage); ++ if (found) ++ goto done; + } +- unlock_cluster(ci); + } +- if (tmp >= max) { +- cluster->next[order] = SWAP_NEXT_INVALID; ++ ++ if (!list_empty(&si->discard_clusters)) { ++ /* ++ * we don't have free cluster but have some clusters in ++ * discarding, do discard now and reclaim them, then ++ * reread cluster_next_cpu since we dropped si->lock ++ */ ++ swap_do_scheduled_discard(si); + goto new_cluster; + } +- *offset = tmp; +- *scan_base = tmp; +- tmp += nr_pages; +- cluster->next[order] = tmp < max ? tmp : SWAP_NEXT_INVALID; +- return true; ++ ++ if (order) ++ goto done; ++ ++ for (int o = 1; o < SWAP_NR_ORDERS; o++) { ++ if (!list_empty(&si->nonfull_clusters[o])) { ++ ci = list_first_entry(&si->nonfull_clusters[o], struct swap_cluster_info, ++ list); ++ offset = alloc_swap_scan_cluster(si, cluster_offset(si, ci), ++ &found, 0, usage); ++ VM_BUG_ON(!found); ++ goto done; ++ } ++ } ++ ++done: ++ cluster->next[order] = offset; ++ return found; + } + + static void __del_from_avail_list(struct swap_info_struct *p) +@@ -739,11 +792,29 @@ static bool swap_offset_available_and_locked(struct swap_info_struct *si, + return false; + } + ++static int cluster_alloc_swap(struct swap_info_struct *si, ++ unsigned char usage, int nr, ++ swp_entry_t slots[], int order) ++{ ++ int n_ret = 0; ++ ++ VM_BUG_ON(!si->cluster_info); ++ ++ while (n_ret < nr) { ++ unsigned long offset = cluster_alloc_swap_entry(si, order, usage); ++ ++ if (!offset) ++ break; ++ slots[n_ret++] = swp_entry(si->type, offset); ++ } ++ ++ return n_ret; ++} ++ + static int scan_swap_map_slots(struct swap_info_struct *si, + unsigned char usage, int nr, + swp_entry_t slots[], int order) + { +- struct swap_cluster_info *ci; + unsigned long offset; + unsigned long scan_base; + unsigned long last_in_cluster = 0; +@@ -782,26 +853,16 @@ static int scan_swap_map_slots(struct swap_info_struct *si, + return 0; + } + ++ if (si->cluster_info) ++ return cluster_alloc_swap(si, usage, nr, slots, order); ++ + si->flags += SWP_SCANNING; +- /* +- * Use percpu scan base for SSD to reduce lock contention on +- * cluster and swap cache. For HDD, sequential access is more +- * important. +- */ +- if (si->flags & SWP_SOLIDSTATE) +- scan_base = this_cpu_read(*si->cluster_next_cpu); +- else +- scan_base = si->cluster_next; ++ ++ /* For HDD, sequential access is more important. 
*/ ++ scan_base = si->cluster_next; + offset = scan_base; + +- /* SSD algorithm */ +- if (si->cluster_info) { +- if (!scan_swap_map_try_ssd_cluster(si, &offset, &scan_base, order)) { +- if (order > 0) +- goto no_page; +- goto scan; +- } +- } else if (unlikely(!si->cluster_nr--)) { ++ if (unlikely(!si->cluster_nr--)) { + if (si->pages - si->inuse_pages < SWAPFILE_CLUSTER) { + si->cluster_nr = SWAPFILE_CLUSTER - 1; + goto checks; +@@ -812,8 +873,6 @@ static int scan_swap_map_slots(struct swap_info_struct *si, + /* + * If seek is expensive, start searching for new cluster from + * start of partition, to minimize the span of allocated swap. +- * If seek is cheap, that is the SWP_SOLIDSTATE si->cluster_info +- * case, just handled by scan_swap_map_try_ssd_cluster() above. + */ + scan_base = offset = si->lowest_bit; + last_in_cluster = offset + SWAPFILE_CLUSTER - 1; +@@ -841,19 +900,6 @@ static int scan_swap_map_slots(struct swap_info_struct *si, + } + + checks: +- if (si->cluster_info) { +- while (scan_swap_map_ssd_cluster_conflict(si, offset, order)) { +- /* take a break if we already got some slots */ +- if (n_ret) +- goto done; +- if (!scan_swap_map_try_ssd_cluster(si, &offset, +- &scan_base, order)) { +- if (order > 0) +- goto no_page; +- goto scan; +- } +- } +- } + if (!(si->flags & SWP_WRITEOK)) + goto no_page; + if (!si->highest_bit) +@@ -861,11 +907,9 @@ static int scan_swap_map_slots(struct swap_info_struct *si, + if (offset > si->highest_bit) + scan_base = offset = si->lowest_bit; + +- ci = lock_cluster(si, offset); + /* reuse swap entry of cache-only swap if not busy. */ + if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) { + int swap_was_freed; +- unlock_cluster(ci); + spin_unlock(&si->lock); + swap_was_freed = __try_to_reclaim_swap(si, offset, TTRS_ANYWAY); + spin_lock(&si->lock); +@@ -876,15 +920,12 @@ static int scan_swap_map_slots(struct swap_info_struct *si, + } + + if (si->swap_map[offset]) { +- unlock_cluster(ci); + if (!n_ret) + goto scan; + else + goto done; + } + memset(si->swap_map + offset, usage, nr_pages); +- add_cluster_info_page(si, si->cluster_info, offset, nr_pages); +- unlock_cluster(ci); + + swap_range_alloc(si, offset, nr_pages); + slots[n_ret++] = swp_entry(si->type, offset); +@@ -905,13 +946,7 @@ static int scan_swap_map_slots(struct swap_info_struct *si, + latency_ration = LATENCY_LIMIT; + } + +- /* try to get more slots in cluster */ +- if (si->cluster_info) { +- if (scan_swap_map_try_ssd_cluster(si, &offset, &scan_base, order)) +- goto checks; +- if (order > 0) +- goto done; +- } else if (si->cluster_nr && !si->swap_map[++offset]) { ++ if (si->cluster_nr && !si->swap_map[++offset]) { + /* non-ssd case, still more slots in cluster? 
*/ + --si->cluster_nr; + goto checks; +@@ -980,8 +1015,6 @@ static void swap_free_cluster(struct swap_info_struct *si, unsigned long idx) + ci = lock_cluster(si, offset); + memset(si->swap_map + offset, 0, SWAPFILE_CLUSTER); + ci->count = 0; +- ci->order = 0; +- ci->flags = 0; + free_cluster(si, ci); + unlock_cluster(ci); + swap_range_free(si, offset, SWAPFILE_CLUSTER); +@@ -3099,8 +3132,11 @@ static int setup_swap_map_and_extents(struct swap_info_struct *p, + ci = cluster_info + idx; + if (idx >= nr_clusters) + continue; +- if (ci->count) ++ if (ci->count) { ++ ci->flags = CLUSTER_FLAG_NONFULL; ++ list_add_tail(&ci->list, &p->nonfull_clusters[0]); + continue; ++ } + ci->flags = CLUSTER_FLAG_FREE; + list_add_tail(&ci->list, &p->free_clusters); + } +-- +Gitee + + +From 4db67dafd426f7dd2fbde13583c1875a2b242b95 Mon Sep 17 00:00:00 2001 +From: Kairui Song +Date: Wed, 18 Dec 2024 17:51:10 +0800 +Subject: [PATCH 05/14] mm: swap: clean up initialization helper + +mainline inclusion +from mainline-v6.12-rc1 +commit 3b2561b5daeb3531c011491e9a6d2b934cc8f49f +category: performance +bugzilla: https://gitee.com/openeuler/kernel/issues/IBC5I1 + +Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=3b2561b5daeb3531c011491e9a6d2b934cc8f49f + +-------------------------------- + +At this point, alloc_cluster is never called already, and +inc_cluster_info_page is called by initialization only, a lot of dead code +can be dropped. + +Link: https://lkml.kernel.org/r/20240730-swap-allocator-v5-4-cb9c148b9297@kernel.org +Signed-off-by: Kairui Song +Reported-by: Barry Song <21cnbao@gmail.com> +Cc: Chris Li +Cc: "Huang, Ying" +Cc: Hugh Dickins +Cc: Kalesh Singh +Cc: Ryan Roberts +Signed-off-by: Andrew Morton +Signed-off-by: Liu Shixin +--- + mm/swapfile.c | 44 ++++++++++---------------------------------- + 1 file changed, 10 insertions(+), 34 deletions(-) + +diff --git a/mm/swapfile.c b/mm/swapfile.c +index a3e721510311..4be5fbbdc1c8 100644 +--- a/mm/swapfile.c ++++ b/mm/swapfile.c +@@ -437,20 +437,6 @@ static void swap_users_ref_free(struct percpu_ref *ref) + complete(&si->comp); + } + +-static struct swap_cluster_info *alloc_cluster(struct swap_info_struct *si, unsigned long idx) +-{ +- struct swap_cluster_info *ci = list_first_entry(&si->free_clusters, +- struct swap_cluster_info, list); +- +- lockdep_assert_held(&si->lock); +- lockdep_assert_held(&ci->lock); +- VM_BUG_ON(cluster_index(si, ci) != idx); +- VM_BUG_ON(ci->count); +- list_del(&ci->list); +- ci->flags = 0; +- return ci; +-} +- + static void free_cluster(struct swap_info_struct *si, struct swap_cluster_info *ci) + { + VM_BUG_ON(ci->count != 0); +@@ -471,34 +457,24 @@ static void free_cluster(struct swap_info_struct *si, struct swap_cluster_info * + } + + /* +- * The cluster corresponding to page_nr will be used. The cluster will be +- * removed from free cluster list and its usage counter will be increased by +- * count. ++ * The cluster corresponding to page_nr will be used. The cluster will not be ++ * added to free cluster list and its usage counter will be increased by 1. ++ * Only used for initialization. 
+ */ +-static void add_cluster_info_page(struct swap_info_struct *p, +- struct swap_cluster_info *cluster_info, unsigned long page_nr, +- unsigned long count) ++static void inc_cluster_info_page(struct swap_info_struct *p, ++ struct swap_cluster_info *cluster_info, unsigned long page_nr) + { + unsigned long idx = page_nr / SWAPFILE_CLUSTER; +- struct swap_cluster_info *ci = cluster_info + idx; ++ struct swap_cluster_info *ci; + + if (!cluster_info) + return; +- if (cluster_is_free(ci)) +- alloc_cluster(p, idx); + +- VM_BUG_ON(ci->count + count > SWAPFILE_CLUSTER); +- ci->count += count; +-} ++ ci = cluster_info + idx; ++ ci->count++; + +-/* +- * The cluster corresponding to page_nr will be used. The cluster will be +- * removed from free cluster list and its usage counter will be increased by 1. +- */ +-static void inc_cluster_info_page(struct swap_info_struct *p, +- struct swap_cluster_info *cluster_info, unsigned long page_nr) +-{ +- add_cluster_info_page(p, cluster_info, page_nr, 1); ++ VM_BUG_ON(ci->count > SWAPFILE_CLUSTER); ++ VM_BUG_ON(ci->flags); + } + + /* +-- +Gitee + + +From 18f732c19747e766e0632419f32dfb02768ada67 Mon Sep 17 00:00:00 2001 +From: Kairui Song +Date: Wed, 18 Dec 2024 17:51:11 +0800 +Subject: [PATCH 06/14] mm: swap: skip slot cache on freeing for mTHP + +mainline inclusion +from mainline-v6.12-rc1 +commit 650975d2b181e30c9017c42cb3f6535287555b1e +category: performance +bugzilla: https://gitee.com/openeuler/kernel/issues/IBC5I1 + +Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=650975d2b181e30c9017c42cb3f6535287555b1e + +-------------------------------- + +Currently when we are freeing mTHP folios from swap cache, we free then +one by one and put each entry into swap slot cache. Slot cache is +designed to reduce the overhead by batching the freeing, but mTHP swap +entries are already continuous so they can be batch freed without it +already, it saves litle overhead, or even increase overhead for larger +mTHP. + +What's more, mTHP entries could stay in swap cache for a while. +Contiguous swap entry is an rather rare resource so releasing them +directly can help improve mTHP allocation success rate when under +pressure. + +Link: https://lkml.kernel.org/r/20240730-swap-allocator-v5-5-cb9c148b9297@kernel.org +Signed-off-by: Kairui Song +Reported-by: Barry Song <21cnbao@gmail.com> +Acked-by: Barry Song +Cc: Chris Li +Cc: "Huang, Ying" +Cc: Hugh Dickins +Cc: Kalesh Singh +Cc: Ryan Roberts +Signed-off-by: Andrew Morton +Conflicts: + mm/swapfile.c +Signed-off-by: Liu Shixin +--- + mm/swapfile.c | 59 +++++++++++++++++++++++---------------------------- + 1 file changed, 26 insertions(+), 33 deletions(-) + +diff --git a/mm/swapfile.c b/mm/swapfile.c +index 4be5fbbdc1c8..44726e0b8f8f 100644 +--- a/mm/swapfile.c ++++ b/mm/swapfile.c +@@ -478,20 +478,21 @@ static void inc_cluster_info_page(struct swap_info_struct *p, + } + + /* +- * The cluster ci decreases one usage. If the usage counter becomes 0, ++ * The cluster ci decreases @nr_pages usage. If the usage counter becomes 0, + * which means no page in the cluster is in use, we can optionally discard + * the cluster and add it to free cluster list. 
+ */ +-static void dec_cluster_info_page(struct swap_info_struct *p, struct swap_cluster_info *ci) ++static void dec_cluster_info_page(struct swap_info_struct *p, ++ struct swap_cluster_info *ci, int nr_pages) + { + if (!p->cluster_info) + return; + +- VM_BUG_ON(ci->count == 0); ++ VM_BUG_ON(ci->count < nr_pages); + VM_BUG_ON(cluster_is_free(ci)); + lockdep_assert_held(&p->lock); + lockdep_assert_held(&ci->lock); +- ci->count--; ++ ci->count -= nr_pages; + + if (!ci->count) { + free_cluster(p, ci); +@@ -983,19 +984,6 @@ static int scan_swap_map_slots(struct swap_info_struct *si, + return n_ret; + } + +-static void swap_free_cluster(struct swap_info_struct *si, unsigned long idx) +-{ +- unsigned long offset = idx * SWAPFILE_CLUSTER; +- struct swap_cluster_info *ci; +- +- ci = lock_cluster(si, offset); +- memset(si->swap_map + offset, 0, SWAPFILE_CLUSTER); +- ci->count = 0; +- free_cluster(si, ci); +- unlock_cluster(ci); +- swap_range_free(si, offset, SWAPFILE_CLUSTER); +-} +- + #ifdef CONFIG_MEMCG_SWAP_QOS + int write_swapfile_for_memcg(struct address_space *mapping, int *swap_type) + { +@@ -1343,21 +1331,28 @@ static unsigned char __swap_entry_free(struct swap_info_struct *p, + return usage; + } + +-static void swap_entry_free(struct swap_info_struct *p, swp_entry_t entry) ++/* ++ * Drop the last HAS_CACHE flag of swap entries, caller have to ++ * ensure all entries belong to the same cgroup. ++ */ ++static void swap_entry_range_free(struct swap_info_struct *p, swp_entry_t entry, ++ unsigned int nr_pages) + { +- struct swap_cluster_info *ci; + unsigned long offset = swp_offset(entry); +- unsigned char count; ++ unsigned char *map = p->swap_map + offset; ++ unsigned char *map_end = map + nr_pages; ++ struct swap_cluster_info *ci; + + ci = lock_cluster(p, offset); +- count = p->swap_map[offset]; +- VM_BUG_ON(count != SWAP_HAS_CACHE); +- p->swap_map[offset] = 0; +- dec_cluster_info_page(p, ci); ++ do { ++ VM_BUG_ON(*map != SWAP_HAS_CACHE); ++ *map = 0; ++ } while (++map < map_end); ++ dec_cluster_info_page(p, ci, nr_pages); + unlock_cluster(ci); + +- mem_cgroup_uncharge_swap(entry, 1); +- swap_range_free(p, offset, 1); ++ mem_cgroup_uncharge_swap(entry, nr_pages); ++ swap_range_free(p, offset, nr_pages); + } + + static void cluster_swap_free_nr(struct swap_info_struct *sis, +@@ -1418,7 +1413,6 @@ void swap_free_nr(swp_entry_t entry, int nr_pages) + void put_swap_folio(struct folio *folio, swp_entry_t entry) + { + unsigned long offset = swp_offset(entry); +- unsigned long idx = offset / SWAPFILE_CLUSTER; + struct swap_cluster_info *ci; + struct swap_info_struct *si; + unsigned char *map; +@@ -1431,19 +1425,18 @@ void put_swap_folio(struct folio *folio, swp_entry_t entry) + return; + + ci = lock_cluster_or_swap_info(si, offset); +- if (size == SWAPFILE_CLUSTER) { ++ if (size > 1) { + map = si->swap_map + offset; +- for (i = 0; i < SWAPFILE_CLUSTER; i++) { ++ for (i = 0; i < size; i++) { + val = map[i]; + VM_BUG_ON(!(val & SWAP_HAS_CACHE)); + if (val == SWAP_HAS_CACHE) + free_entries++; + } +- if (free_entries == SWAPFILE_CLUSTER) { ++ if (free_entries == size) { + unlock_cluster_or_swap_info(si, ci); + spin_lock(&si->lock); +- mem_cgroup_uncharge_swap(entry, SWAPFILE_CLUSTER); +- swap_free_cluster(si, idx); ++ swap_entry_range_free(si, entry, size); + spin_unlock(&si->lock); + return; + } +@@ -1488,7 +1481,7 @@ void swapcache_free_entries(swp_entry_t *entries, int n) + for (i = 0; i < n; ++i) { + p = swap_info_get_cont(entries[i], prev); + if (p) +- swap_entry_free(p, entries[i]); ++ 
swap_entry_range_free(p, entries[i], 1); + prev = p; + } + if (p) +-- +Gitee + + +From 53a99352d0946625a0d45deeb8d0729855d4b080 Mon Sep 17 00:00:00 2001 +From: Kairui Song +Date: Wed, 18 Dec 2024 17:51:12 +0800 +Subject: [PATCH 07/14] mm: swap: allow cache reclaim to skip slot cache + +mainline inclusion +from mainline-v6.12-rc1 +commit 862590ac3708e1cbbfb02a8ed78587b86ecba4ba +category: performance +bugzilla: https://gitee.com/openeuler/kernel/issues/IBC5I1 + +Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=862590ac3708e1cbbfb02a8ed78587b86ecba4ba + +-------------------------------- + +Currently we free the reclaimed slots through slot cache even if the slot +is required to be empty immediately. As a result the reclaim caller will +see the slot still occupied even after a successful reclaim, and need to +keep reclaiming until slot cache get flushed. This caused ineffective or +over reclaim when SWAP is under stress. + +So introduce a new flag allowing the slot to be emptied bypassing the slot +cache. + +[21cnbao@gmail.com: small folios should have nr_pages == 1 but not nr_page == 0] + Link: https://lkml.kernel.org/r/20240805015324.45134-1-21cnbao@gmail.com +Link: https://lkml.kernel.org/r/20240730-swap-allocator-v5-6-cb9c148b9297@kernel.org +Signed-off-by: Kairui Song +Reported-by: Barry Song <21cnbao@gmail.com> +Cc: Chris Li +Cc: "Huang, Ying" +Cc: Hugh Dickins +Cc: Kalesh Singh +Cc: Ryan Roberts +Signed-off-by: Andrew Morton +Conflicts: + mm/swapfile.c +Signed-off-by: Liu Shixin +--- + mm/swapfile.c | 152 ++++++++++++++++++++++++++++++++++++-------------- + 1 file changed, 109 insertions(+), 43 deletions(-) + +diff --git a/mm/swapfile.c b/mm/swapfile.c +index 44726e0b8f8f..e58457b801fb 100644 +--- a/mm/swapfile.c ++++ b/mm/swapfile.c +@@ -52,8 +52,15 @@ + static bool swap_count_continued(struct swap_info_struct *, pgoff_t, + unsigned char); + static void free_swap_count_continuations(struct swap_info_struct *); ++static void swap_entry_range_free(struct swap_info_struct *si, swp_entry_t entry, ++ unsigned int nr_pages); + static void swap_range_alloc(struct swap_info_struct *si, unsigned long offset, + unsigned int nr_entries); ++static bool folio_swapcache_freeable(struct folio *folio); ++static struct swap_cluster_info *lock_cluster_or_swap_info( ++ struct swap_info_struct *si, unsigned long offset); ++static void unlock_cluster_or_swap_info(struct swap_info_struct *si, ++ struct swap_cluster_info *ci); + + static DEFINE_SPINLOCK(swap_lock); + static unsigned int nr_swapfiles; +@@ -128,8 +135,25 @@ static inline unsigned char swap_count(unsigned char ent) + * corresponding page + */ + #define TTRS_UNMAPPED 0x2 +-/* Reclaim the swap entry if swap is getting full*/ ++/* Reclaim the swap entry if swap is getting full */ + #define TTRS_FULL 0x4 ++/* Reclaim directly, bypass the slot cache and don't touch device lock */ ++#define TTRS_DIRECT 0x8 ++ ++static bool swap_is_has_cache(struct swap_info_struct *si, ++ unsigned long offset, int nr_pages) ++{ ++ unsigned char *map = si->swap_map + offset; ++ unsigned char *map_end = map + nr_pages; ++ ++ do { ++ VM_BUG_ON(!(*map & SWAP_HAS_CACHE)); ++ if (*map != SWAP_HAS_CACHE) ++ return false; ++ } while (++map < map_end); ++ ++ return true; ++} + + /* + * returns number of pages in the folio that backs the swap entry. 
If positive, +@@ -140,12 +164,22 @@ static int __try_to_reclaim_swap(struct swap_info_struct *si, + unsigned long offset, unsigned long flags) + { + swp_entry_t entry = swp_entry(si->type, offset); ++ struct address_space *address_space = swap_address_space(entry); ++ struct swap_cluster_info *ci; + struct folio *folio; +- int ret = 0; ++ int ret, nr_pages; ++ bool need_reclaim; + +- folio = filemap_get_folio(swap_address_space(entry), offset); ++ folio = filemap_get_folio(address_space, offset); + if (IS_ERR(folio)) + return 0; ++ ++ /* offset could point to the middle of a large folio */ ++ entry = folio->swap; ++ offset = swp_offset(entry); ++ nr_pages = folio_nr_pages(folio); ++ ret = -nr_pages; ++ + /* + * When this function is called from scan_swap_map_slots() and it's + * called by vmscan.c at reclaiming folios. So we hold a folio lock +@@ -153,14 +187,50 @@ static int __try_to_reclaim_swap(struct swap_info_struct *si, + * case and you should use folio_free_swap() with explicit folio_lock() + * in usual operations. + */ +- if (folio_trylock(folio)) { +- if ((flags & TTRS_ANYWAY) || +- ((flags & TTRS_UNMAPPED) && !folio_mapped(folio)) || +- ((flags & TTRS_FULL) && mem_cgroup_swap_full(folio))) +- ret = folio_free_swap(folio); +- folio_unlock(folio); ++ if (!folio_trylock(folio)) ++ goto out; ++ ++ need_reclaim = ((flags & TTRS_ANYWAY) || ++ ((flags & TTRS_UNMAPPED) && !folio_mapped(folio)) || ++ ((flags & TTRS_FULL) && mem_cgroup_swap_full(folio))); ++ if (!need_reclaim || !folio_swapcache_freeable(folio)) ++ goto out_unlock; ++ ++ /* ++ * It's safe to delete the folio from swap cache only if the folio's ++ * swap_map is HAS_CACHE only, which means the slots have no page table ++ * reference or pending writeback, and can't be allocated to others. ++ */ ++ ci = lock_cluster_or_swap_info(si, offset); ++ need_reclaim = swap_is_has_cache(si, offset, nr_pages); ++ unlock_cluster_or_swap_info(si, ci); ++ if (!need_reclaim) ++ goto out_unlock; ++ ++ if (!(flags & TTRS_DIRECT)) { ++ /* Free through slot cache */ ++ delete_from_swap_cache(folio); ++ folio_set_dirty(folio); ++ ret = nr_pages; ++ goto out_unlock; + } +- ret = ret ? 
folio_nr_pages(folio) : -folio_nr_pages(folio); ++ ++ xa_lock_irq(&address_space->i_pages); ++ __delete_from_swap_cache(folio, entry, NULL); ++ xa_unlock_irq(&address_space->i_pages); ++ folio_ref_sub(folio, nr_pages); ++ folio_set_dirty(folio); ++ ++ spin_lock(&si->lock); ++ /* Only sinple page folio can be backed by zswap */ ++ if (nr_pages == 1) ++ zswap_invalidate(entry); ++ swap_entry_range_free(si, entry, nr_pages); ++ spin_unlock(&si->lock); ++ ret = nr_pages; ++out_unlock: ++ folio_unlock(folio); ++out: + folio_put(folio); + return ret; + } +@@ -888,7 +958,7 @@ static int scan_swap_map_slots(struct swap_info_struct *si, + if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) { + int swap_was_freed; + spin_unlock(&si->lock); +- swap_was_freed = __try_to_reclaim_swap(si, offset, TTRS_ANYWAY); ++ swap_was_freed = __try_to_reclaim_swap(si, offset, TTRS_ANYWAY | TTRS_DIRECT); + spin_lock(&si->lock); + /* entry was freed successfully, try to use this again */ + if (swap_was_freed > 0) +@@ -1415,9 +1485,6 @@ void put_swap_folio(struct folio *folio, swp_entry_t entry) + unsigned long offset = swp_offset(entry); + struct swap_cluster_info *ci; + struct swap_info_struct *si; +- unsigned char *map; +- unsigned int i, free_entries = 0; +- unsigned char val; + int size = 1 << swap_entry_order(folio_order(folio)); + + si = _swap_info_get(entry); +@@ -1425,23 +1492,14 @@ void put_swap_folio(struct folio *folio, swp_entry_t entry) + return; + + ci = lock_cluster_or_swap_info(si, offset); +- if (size > 1) { +- map = si->swap_map + offset; +- for (i = 0; i < size; i++) { +- val = map[i]; +- VM_BUG_ON(!(val & SWAP_HAS_CACHE)); +- if (val == SWAP_HAS_CACHE) +- free_entries++; +- } +- if (free_entries == size) { +- unlock_cluster_or_swap_info(si, ci); +- spin_lock(&si->lock); +- swap_entry_range_free(si, entry, size); +- spin_unlock(&si->lock); +- return; +- } ++ if (size > 1 && swap_is_has_cache(si, offset, size)) { ++ unlock_cluster_or_swap_info(si, ci); ++ spin_lock(&si->lock); ++ swap_entry_range_free(si, entry, size); ++ spin_unlock(&si->lock); ++ return; + } +- for (i = 0; i < size; i++, entry.val++) { ++ for (int i = 0; i < size; i++, entry.val++) { + if (!__swap_entry_free_locked(si, offset + i, SWAP_HAS_CACHE)) { + unlock_cluster_or_swap_info(si, ci); + free_swap_slot(entry); +@@ -1601,16 +1659,7 @@ static bool folio_swapped(struct folio *folio) + return swap_page_trans_huge_swapped(si, entry, folio_order(folio)); + } + +-/** +- * folio_free_swap() - Free the swap space used for this folio. +- * @folio: The folio to remove. +- * +- * If swap is getting full, or if there are no more mappings of this folio, +- * then call folio_free_swap to free its swap space. +- * +- * Return: true if we were able to release the swap space. +- */ +-bool folio_free_swap(struct folio *folio) ++static bool folio_swapcache_freeable(struct folio *folio) + { + VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); + +@@ -1618,8 +1667,6 @@ bool folio_free_swap(struct folio *folio) + return false; + if (folio_test_writeback(folio)) + return false; +- if (folio_swapped(folio)) +- return false; + + /* + * Once hibernation has begun to create its image of memory, +@@ -1639,6 +1686,25 @@ bool folio_free_swap(struct folio *folio) + if (pm_suspended_storage()) + return false; + ++ return true; ++} ++ ++/** ++ * folio_free_swap() - Free the swap space used for this folio. ++ * @folio: The folio to remove. 
++ * ++ * If swap is getting full, or if there are no more mappings of this folio, ++ * then call folio_free_swap to free its swap space. ++ * ++ * Return: true if we were able to release the swap space. ++ */ ++bool folio_free_swap(struct folio *folio) ++{ ++ if (!folio_swapcache_freeable(folio)) ++ return false; ++ if (folio_swapped(folio)) ++ return false; ++ + delete_from_swap_cache(folio); + folio_set_dirty(folio); + return true; +@@ -1715,7 +1781,7 @@ void free_swap_and_cache_nr(swp_entry_t entry, int nr) + * to the next boundary. + */ + nr = __try_to_reclaim_swap(si, offset, +- TTRS_UNMAPPED | TTRS_FULL); ++ TTRS_UNMAPPED | TTRS_FULL); + if (nr == 0) + nr = 1; + else if (nr < 0) +-- +Gitee + + +From a1f6274ecbb551837ea7a66e740c660f405a2443 Mon Sep 17 00:00:00 2001 +From: Kairui Song +Date: Wed, 18 Dec 2024 17:51:13 +0800 +Subject: [PATCH 08/14] mm: swap: add a fragment cluster list + +mainline inclusion +from mainline-v6.12-rc1 +commit 477cb7ba28892eda112c79d8f75d10edabfc3050 +category: performance +bugzilla: https://gitee.com/openeuler/kernel/issues/IBC5I1 + +Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=477cb7ba28892eda112c79d8f75d10edabfc3050 + +-------------------------------- + +Now swap cluster allocator arranges the clusters in LRU style, so the +"cold" cluster stay at the head of nonfull lists are the ones that were +used for allocation long time ago and still partially occupied. So if +allocator can't find enough contiguous slots to satisfy an high order +allocation, it's unlikely there will be slot being free on them to satisfy +the allocation, at least in a short period. + +As a result, nonfull cluster scanning will waste time repeatly scanning +the unusable head of the list. + +Also, multiple CPUs could content on the same head cluster of nonfull +list. Unlike free clusters which are removed from the list when any CPU +starts using it, nonfull cluster stays on the head. + +So introduce a new list frag list, all scanned nonfull clusters will be +moved to this list. Both for avoiding repeated scanning and contention. + +Frag list is still used as fallback for allocations, so if one CPU failed +to allocate one order of slots, it can still steal other CPU's clusters. +And order 0 will favor the fragmented clusters to better protect nonfull +clusters + +If any slots on a fragment list are being freed, move the fragment list +back to nonfull list indicating it worth another scan on the cluster. +Compared to scan upon freeing a slot, this keep the scanning lazy and save +some CPU if there are still other clusters to use. + +It may seems unneccessay to keep the fragmented cluster on list at all if +they can't be used for specific order allocation. But this will start to +make sense once reclaim dring scanning is ready. 
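+
+As a rough aid for readers of this backport (not part of the upstream
+commit message): the scheme amounts to two lists per order, where every
+nonfull cluster that gets scanned is demoted to the frag list, and the
+frag list is only consulted as a fallback. The userspace sketch below
+shows that movement with toy structures; all toy_* names are invented for
+the illustration, and the real code manipulates struct swap_cluster_info
+under si->lock.
+
+#include <stdbool.h>
+#include <stdio.h>
+
+/* Toy cluster: only records whether a scan of it could succeed. */
+struct toy_cluster {
+    bool has_contig_slots;          /* stand-in for "scan found free slots" */
+    struct toy_cluster *next;       /* singly linked, for brevity */
+};
+
+/* Pop the head of a singly linked list, or return NULL if it is empty. */
+static struct toy_cluster *toy_pop(struct toy_cluster **head)
+{
+    struct toy_cluster *c = *head;
+
+    if (c)
+        *head = c->next;
+    return c;
+}
+
+/* Append a cluster at the tail of a list. */
+static void toy_push_tail(struct toy_cluster **head, struct toy_cluster *c)
+{
+    struct toy_cluster **p = head;
+
+    while (*p)
+        p = &(*p)->next;
+    c->next = NULL;
+    *p = c;
+}
+
+/*
+ * Mirror of the allocation order described above: scan nonfull clusters,
+ * demoting each scanned cluster to the frag list so later callers do not
+ * rescan it, then fall back to the frag list. Freeing a slot would move
+ * the cluster back to the nonfull list (not shown here).
+ */
+static struct toy_cluster *toy_alloc(struct toy_cluster **nonfull,
+                                     struct toy_cluster **frag)
+{
+    struct toy_cluster *c;
+
+    while ((c = toy_pop(nonfull))) {
+        toy_push_tail(frag, c);     /* scanned once: now "fragmented" */
+        if (c->has_contig_slots)
+            return c;
+    }
+    if ((c = toy_pop(frag))) {
+        toy_push_tail(frag, c);     /* keep the frag list rotating */
+        if (c->has_contig_slots)
+            return c;
+    }
+    return NULL;
+}
+
+int main(void)
+{
+    struct toy_cluster a = { false, NULL }, b = { true, NULL };
+    struct toy_cluster *nonfull = NULL, *frag = NULL;
+
+    toy_push_tail(&nonfull, &a);
+    toy_push_tail(&nonfull, &b);
+    printf("found usable cluster: %d\n", toy_alloc(&nonfull, &frag) == &b);
+    return 0;
+}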
+ +Link: https://lkml.kernel.org/r/20240730-swap-allocator-v5-7-cb9c148b9297@kernel.org +Signed-off-by: Kairui Song +Reported-by: Barry Song <21cnbao@gmail.com> +Cc: Chris Li +Cc: "Huang, Ying" +Cc: Hugh Dickins +Cc: Kalesh Singh +Cc: Ryan Roberts +Signed-off-by: Andrew Morton +Signed-off-by: Liu Shixin +--- + include/linux/swap.h | 3 +++ + mm/swapfile.c | 41 +++++++++++++++++++++++++++++++++++++---- + 2 files changed, 40 insertions(+), 4 deletions(-) + +diff --git a/include/linux/swap.h b/include/linux/swap.h +index 29a1daa46421..81188caed2d2 100644 +--- a/include/linux/swap.h ++++ b/include/linux/swap.h +@@ -271,6 +271,7 @@ struct swap_cluster_info { + }; + #define CLUSTER_FLAG_FREE 1 /* This cluster is free */ + #define CLUSTER_FLAG_NONFULL 2 /* This cluster is on nonfull list */ ++#define CLUSTER_FLAG_FRAG 4 /* This cluster is on nonfull list */ + + /* + * The first page in the swap file is the swap header, which is always marked +@@ -310,6 +311,8 @@ struct swap_info_struct { + struct list_head free_clusters; /* free clusters list */ + struct list_head nonfull_clusters[SWAP_NR_ORDERS]; + /* list of cluster that contains at least one free slot */ ++ struct list_head frag_clusters[SWAP_NR_ORDERS]; ++ /* list of cluster that are fragmented or contented */ + unsigned int lowest_bit; /* index of first free in swap_map */ + unsigned int highest_bit; /* index of last free in swap_map */ + unsigned int pages; /* total of usable pages of swap */ +diff --git a/mm/swapfile.c b/mm/swapfile.c +index e58457b801fb..7c71e7df9cf3 100644 +--- a/mm/swapfile.c ++++ b/mm/swapfile.c +@@ -571,7 +571,10 @@ static void dec_cluster_info_page(struct swap_info_struct *p, + + if (!(ci->flags & CLUSTER_FLAG_NONFULL)) { + VM_BUG_ON(ci->flags & CLUSTER_FLAG_FREE); +- list_add_tail(&ci->list, &p->nonfull_clusters[ci->order]); ++ if (ci->flags & CLUSTER_FLAG_FRAG) ++ list_move_tail(&ci->list, &p->nonfull_clusters[ci->order]); ++ else ++ list_add_tail(&ci->list, &p->nonfull_clusters[ci->order]); + ci->flags = CLUSTER_FLAG_NONFULL; + } + } +@@ -609,7 +612,8 @@ static inline void cluster_alloc_range(struct swap_info_struct *si, struct swap_ + ci->count += nr_pages; + + if (ci->count == SWAPFILE_CLUSTER) { +- VM_BUG_ON(!(ci->flags & (CLUSTER_FLAG_FREE | CLUSTER_FLAG_NONFULL))); ++ VM_BUG_ON(!(ci->flags & ++ (CLUSTER_FLAG_FREE | CLUSTER_FLAG_NONFULL | CLUSTER_FLAG_FRAG))); + list_del(&ci->list); + ci->flags = 0; + } +@@ -665,6 +669,7 @@ static unsigned long cluster_alloc_swap_entry(struct swap_info_struct *si, int o + struct percpu_cluster *cluster; + struct swap_cluster_info *ci, *n; + unsigned int offset, found = 0; ++ LIST_HEAD(fraged); + + new_cluster: + lockdep_assert_held(&si->lock); +@@ -685,13 +690,29 @@ static unsigned long cluster_alloc_swap_entry(struct swap_info_struct *si, int o + + if (order < PMD_ORDER) { + list_for_each_entry_safe(ci, n, &si->nonfull_clusters[order], list) { ++ list_move_tail(&ci->list, &fraged); ++ ci->flags = CLUSTER_FLAG_FRAG; + offset = alloc_swap_scan_cluster(si, cluster_offset(si, ci), + &found, order, usage); + if (found) +- goto done; ++ break; + } ++ ++ if (!found) { ++ list_for_each_entry_safe(ci, n, &si->frag_clusters[order], list) { ++ offset = alloc_swap_scan_cluster(si, cluster_offset(si, ci), ++ &found, order, usage); ++ if (found) ++ break; ++ } ++ } ++ ++ list_splice_tail(&fraged, &si->frag_clusters[order]); + } + ++ if (found) ++ goto done; ++ + if (!list_empty(&si->discard_clusters)) { + /* + * we don't have free cluster but have some clusters in +@@ -705,7 +726,17 @@ 
static unsigned long cluster_alloc_swap_entry(struct swap_info_struct *si, int o + if (order) + goto done; + ++ /* Order 0 stealing from higher order */ + for (int o = 1; o < SWAP_NR_ORDERS; o++) { ++ if (!list_empty(&si->frag_clusters[o])) { ++ ci = list_first_entry(&si->frag_clusters[o], ++ struct swap_cluster_info, list); ++ offset = alloc_swap_scan_cluster(si, cluster_offset(si, ci), &found, ++ 0, usage); ++ VM_BUG_ON(!found); ++ goto done; ++ } ++ + if (!list_empty(&si->nonfull_clusters[o])) { + ci = list_first_entry(&si->nonfull_clusters[o], struct swap_cluster_info, + list); +@@ -3110,8 +3141,10 @@ static int setup_swap_map_and_extents(struct swap_info_struct *p, + INIT_LIST_HEAD(&p->free_clusters); + INIT_LIST_HEAD(&p->discard_clusters); + +- for (i = 0; i < SWAP_NR_ORDERS; i++) ++ for (i = 0; i < SWAP_NR_ORDERS; i++) { + INIT_LIST_HEAD(&p->nonfull_clusters[i]); ++ INIT_LIST_HEAD(&p->frag_clusters[i]); ++ } + + for (i = 0; i < swap_header->info.nr_badpages; i++) { + unsigned int page_nr = swap_header->info.badpages[i]; +-- +Gitee + + +From 7c0f2c55f9a21373319df1952070b162b3c6be8a Mon Sep 17 00:00:00 2001 +From: Kairui Song +Date: Wed, 18 Dec 2024 17:51:14 +0800 +Subject: [PATCH 09/14] mm: swap: relaim the cached parts that got scanned + +mainline inclusion +from mainline-v6.12-rc1 +commit 661383c6111a38c88df61af6bfbcfacd2ff20a67 +category: performance +bugzilla: https://gitee.com/openeuler/kernel/issues/IBC5I1 + +Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=661383c6111a38c88df61af6bfbcfacd2ff20a67 + +-------------------------------- + +This commit implements reclaim during scan for cluster allocator. + +Cluster scanning were unable to reuse SWAP_HAS_CACHE slots, which could +result in low allocation success rate or early OOM. + +So to ensure maximum allocation success rate, integrate reclaiming with +scanning. If found a range of suitable swap slots but fragmented due to +HAS_CACHE, just try to reclaim the slots. 
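+
+A minimal userspace sketch of that decision, added for readers of this
+backport and not part of the upstream commit: the array below stands in
+for si->swap_map, TOY_HAS_CACHE for SWAP_HAS_CACHE, and toy_reclaim() for
+__try_to_reclaim_swap(); all toy_* names are invented. The real code only
+attempts reclaim when vm_swap_full() and has to recheck the range after
+dropping si->lock and the cluster lock.
+
+#include <stdbool.h>
+#include <stddef.h>
+
+#define TOY_HAS_CACHE 0x40          /* slot holds swap cache only */
+
+/* Pretend to drop the swap cache for one slot; true on success. */
+static bool toy_reclaim(unsigned char *map, size_t off)
+{
+    map[off] = 0;
+    return true;
+}
+
+/*
+ * Return true if [start, start + nr) can back a new allocation: free
+ * slots pass, cache-only slots are reclaimed on the spot, and any slot
+ * with a real swap count aborts the scan.
+ */
+static bool toy_scan_range(unsigned char *map, size_t start, size_t nr)
+{
+    for (size_t off = start; off < start + nr; off++) {
+        if (map[off] == 0)
+            continue;
+        if (map[off] == TOY_HAS_CACHE && toy_reclaim(map, off))
+            continue;
+        return false;
+    }
+    return true;
+}
+
+int main(void)
+{
+    unsigned char map[8] = { 0, TOY_HAS_CACHE, 0, 0, 1, 0, 0, 0 };
+
+    /* First half becomes usable after reclaim, second half does not. */
+    return toy_scan_range(map, 0, 4) && !toy_scan_range(map, 4, 4) ? 0 : 1;
+}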
+ +Link: https://lkml.kernel.org/r/20240730-swap-allocator-v5-8-cb9c148b9297@kernel.org +Signed-off-by: Kairui Song +Reported-by: Barry Song <21cnbao@gmail.com> +Cc: Chris Li +Cc: "Huang, Ying" +Cc: Hugh Dickins +Cc: Kalesh Singh +Cc: Ryan Roberts +Signed-off-by: Andrew Morton +Signed-off-by: Liu Shixin +--- + include/linux/swap.h | 1 + + mm/swapfile.c | 140 +++++++++++++++++++++++++++++++++---------- + 2 files changed, 110 insertions(+), 31 deletions(-) + +diff --git a/include/linux/swap.h b/include/linux/swap.h +index 81188caed2d2..83b1bcbaf2ec 100644 +--- a/include/linux/swap.h ++++ b/include/linux/swap.h +@@ -313,6 +313,7 @@ struct swap_info_struct { + /* list of cluster that contains at least one free slot */ + struct list_head frag_clusters[SWAP_NR_ORDERS]; + /* list of cluster that are fragmented or contented */ ++ unsigned int frag_cluster_nr[SWAP_NR_ORDERS]; + unsigned int lowest_bit; /* index of first free in swap_map */ + unsigned int highest_bit; /* index of last free in swap_map */ + unsigned int pages; /* total of usable pages of swap */ +diff --git a/mm/swapfile.c b/mm/swapfile.c +index 7c71e7df9cf3..45f73b73a92f 100644 +--- a/mm/swapfile.c ++++ b/mm/swapfile.c +@@ -512,6 +512,10 @@ static void free_cluster(struct swap_info_struct *si, struct swap_cluster_info * + VM_BUG_ON(ci->count != 0); + lockdep_assert_held(&si->lock); + lockdep_assert_held(&ci->lock); ++ ++ if (ci->flags & CLUSTER_FLAG_FRAG) ++ si->frag_cluster_nr[ci->order]--; ++ + /* + * If the swap is discardable, prepare discard the cluster + * instead of free it immediately. The cluster will be freed +@@ -571,31 +575,84 @@ static void dec_cluster_info_page(struct swap_info_struct *p, + + if (!(ci->flags & CLUSTER_FLAG_NONFULL)) { + VM_BUG_ON(ci->flags & CLUSTER_FLAG_FREE); +- if (ci->flags & CLUSTER_FLAG_FRAG) ++ if (ci->flags & CLUSTER_FLAG_FRAG) { ++ p->frag_cluster_nr[ci->order]--; + list_move_tail(&ci->list, &p->nonfull_clusters[ci->order]); +- else ++ } else { + list_add_tail(&ci->list, &p->nonfull_clusters[ci->order]); ++ } + ci->flags = CLUSTER_FLAG_NONFULL; + } + } + +-static inline bool cluster_scan_range(struct swap_info_struct *si, unsigned int start, +- unsigned int nr_pages) ++static bool cluster_reclaim_range(struct swap_info_struct *si, ++ struct swap_cluster_info *ci, ++ unsigned long start, unsigned long end) + { +- unsigned char *p = si->swap_map + start; +- unsigned char *end = p + nr_pages; ++ unsigned char *map = si->swap_map; ++ unsigned long offset; ++ ++ spin_unlock(&ci->lock); ++ spin_unlock(&si->lock); ++ ++ for (offset = start; offset < end; offset++) { ++ switch (READ_ONCE(map[offset])) { ++ case 0: ++ continue; ++ case SWAP_HAS_CACHE: ++ if (__try_to_reclaim_swap(si, offset, TTRS_ANYWAY | TTRS_DIRECT) > 0) ++ continue; ++ goto out; ++ default: ++ goto out; ++ } ++ } ++out: ++ spin_lock(&si->lock); ++ spin_lock(&ci->lock); + +- while (p < end) +- if (*p++) ++ /* ++ * Recheck the range no matter reclaim succeeded or not, the slot ++ * could have been be freed while we are not holding the lock. 
++ */ ++ for (offset = start; offset < end; offset++) ++ if (READ_ONCE(map[offset])) + return false; + + return true; + } + ++static bool cluster_scan_range(struct swap_info_struct *si, ++ struct swap_cluster_info *ci, ++ unsigned long start, unsigned int nr_pages) ++{ ++ unsigned long offset, end = start + nr_pages; ++ unsigned char *map = si->swap_map; ++ bool need_reclaim = false; + +-static inline void cluster_alloc_range(struct swap_info_struct *si, struct swap_cluster_info *ci, +- unsigned int start, unsigned char usage, +- unsigned int order) ++ for (offset = start; offset < end; offset++) { ++ switch (READ_ONCE(map[offset])) { ++ case 0: ++ continue; ++ case SWAP_HAS_CACHE: ++ if (!vm_swap_full()) ++ return false; ++ need_reclaim = true; ++ continue; ++ default: ++ return false; ++ } ++ } ++ ++ if (need_reclaim) ++ return cluster_reclaim_range(si, ci, start, end); ++ ++ return true; ++} ++ ++static void cluster_alloc_range(struct swap_info_struct *si, struct swap_cluster_info *ci, ++ unsigned int start, unsigned char usage, ++ unsigned int order) + { + unsigned int nr_pages = 1 << order; + +@@ -614,6 +671,8 @@ static inline void cluster_alloc_range(struct swap_info_struct *si, struct swap_ + if (ci->count == SWAPFILE_CLUSTER) { + VM_BUG_ON(!(ci->flags & + (CLUSTER_FLAG_FREE | CLUSTER_FLAG_NONFULL | CLUSTER_FLAG_FRAG))); ++ if (ci->flags & CLUSTER_FLAG_FRAG) ++ si->frag_cluster_nr[ci->order]--; + list_del(&ci->list); + ci->flags = 0; + } +@@ -639,7 +698,7 @@ static unsigned int alloc_swap_scan_cluster(struct swap_info_struct *si, unsigne + } + + while (offset <= end) { +- if (cluster_scan_range(si, offset, nr_pages)) { ++ if (cluster_scan_range(si, ci, offset, nr_pages)) { + cluster_alloc_range(si, ci, offset, usage, order); + *foundp = offset; + if (ci->count == SWAPFILE_CLUSTER) { +@@ -667,9 +726,8 @@ static unsigned long cluster_alloc_swap_entry(struct swap_info_struct *si, int o + unsigned char usage) + { + struct percpu_cluster *cluster; +- struct swap_cluster_info *ci, *n; ++ struct swap_cluster_info *ci; + unsigned int offset, found = 0; +- LIST_HEAD(fraged); + + new_cluster: + lockdep_assert_held(&si->lock); +@@ -689,25 +747,42 @@ static unsigned long cluster_alloc_swap_entry(struct swap_info_struct *si, int o + } + + if (order < PMD_ORDER) { +- list_for_each_entry_safe(ci, n, &si->nonfull_clusters[order], list) { +- list_move_tail(&ci->list, &fraged); ++ unsigned int frags = 0; ++ ++ while (!list_empty(&si->nonfull_clusters[order])) { ++ ci = list_first_entry(&si->nonfull_clusters[order], ++ struct swap_cluster_info, list); ++ list_move_tail(&ci->list, &si->frag_clusters[order]); + ci->flags = CLUSTER_FLAG_FRAG; ++ si->frag_cluster_nr[order]++; + offset = alloc_swap_scan_cluster(si, cluster_offset(si, ci), + &found, order, usage); ++ frags++; + if (found) + break; + } + + if (!found) { +- list_for_each_entry_safe(ci, n, &si->frag_clusters[order], list) { ++ /* ++ * Nonfull clusters are moved to frag tail if we reached ++ * here, count them too, don't over scan the frag list. ++ */ ++ while (frags < si->frag_cluster_nr[order]) { ++ ci = list_first_entry(&si->frag_clusters[order], ++ struct swap_cluster_info, list); ++ /* ++ * Rotate the frag list to iterate, they were all failing ++ * high order allocation or moved here due to per-CPU usage, ++ * this help keeping usable cluster ahead. 
++ */ ++ list_move_tail(&ci->list, &si->frag_clusters[order]); + offset = alloc_swap_scan_cluster(si, cluster_offset(si, ci), + &found, order, usage); ++ frags++; + if (found) + break; + } + } +- +- list_splice_tail(&fraged, &si->frag_clusters[order]); + } + + if (found) +@@ -728,25 +803,28 @@ static unsigned long cluster_alloc_swap_entry(struct swap_info_struct *si, int o + + /* Order 0 stealing from higher order */ + for (int o = 1; o < SWAP_NR_ORDERS; o++) { +- if (!list_empty(&si->frag_clusters[o])) { ++ /* ++ * Clusters here have at least one usable slots and can't fail order 0 ++ * allocation, but reclaim may drop si->lock and race with another user. ++ */ ++ while (!list_empty(&si->frag_clusters[o])) { + ci = list_first_entry(&si->frag_clusters[o], + struct swap_cluster_info, list); +- offset = alloc_swap_scan_cluster(si, cluster_offset(si, ci), &found, +- 0, usage); +- VM_BUG_ON(!found); +- goto done; ++ offset = alloc_swap_scan_cluster(si, cluster_offset(si, ci), ++ &found, 0, usage); ++ if (found) ++ goto done; + } + +- if (!list_empty(&si->nonfull_clusters[o])) { +- ci = list_first_entry(&si->nonfull_clusters[o], struct swap_cluster_info, +- list); ++ while (!list_empty(&si->nonfull_clusters[o])) { ++ ci = list_first_entry(&si->nonfull_clusters[o], ++ struct swap_cluster_info, list); + offset = alloc_swap_scan_cluster(si, cluster_offset(si, ci), + &found, 0, usage); +- VM_BUG_ON(!found); +- goto done; ++ if (found) ++ goto done; + } + } +- + done: + cluster->next[order] = offset; + return found; +@@ -3144,6 +3222,7 @@ static int setup_swap_map_and_extents(struct swap_info_struct *p, + for (i = 0; i < SWAP_NR_ORDERS; i++) { + INIT_LIST_HEAD(&p->nonfull_clusters[i]); + INIT_LIST_HEAD(&p->frag_clusters[i]); ++ p->frag_cluster_nr[i] = 0; + } + + for (i = 0; i < swap_header->info.nr_badpages; i++) { +@@ -3187,7 +3266,6 @@ static int setup_swap_map_and_extents(struct swap_info_struct *p, + if (!cluster_info) + return nr_extents; + +- + /* + * Reduce false cache line sharing between cluster_info and + * sharing same address space. +-- +Gitee + + +From da3342ba73e419beb8f4b793ff077b763c27b1df Mon Sep 17 00:00:00 2001 +From: Kairui Song +Date: Wed, 18 Dec 2024 17:51:15 +0800 +Subject: [PATCH 10/14] mm: swap: add a adaptive full cluster cache reclaim + +mainline inclusion +from mainline-v6.12-rc1 +commit 2cacbdfdee65b18f9952620e762eab043d71b564 +category: performance +bugzilla: https://gitee.com/openeuler/kernel/issues/IBC5I1 + +Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=2cacbdfdee65b18f9952620e762eab043d71b564 + +-------------------------------- + +Link all full cluster with one full list, and reclaim from it when the +allocation have ran out of all usable clusters. + +There are many reason a folio can end up being in the swap cache while +having no swap count reference. So the best way to search for such slots +is still by iterating the swap clusters. + +With the list as an LRU, iterating from the oldest cluster and keep them +rotating is a very doable and clean way to free up potentially not inuse +clusters. + +When any allocation failure, try reclaim and rotate only one cluster. +This is adaptive for high order allocations they can tolerate fallback. +So this avoids latency, and give the full cluster list an fair chance to +get reclaimed. It release the usage stress for the fallback order 0 +allocation or following up high order allocation. + +If the swap device is getting very full, reclaim more aggresively to +ensure no OOM will happen. 
This ensures order 0 heavy workload won't go +OOM as order 0 won't fail if any cluster still have any space. + +[ryncsn@gmail.com: fix discard of full cluster] + Link: https://lkml.kernel.org/r/CAMgjq7CWwK75_2Zi5P40K08pk9iqOcuWKL6khu=x4Yg_nXaQag@mail.gmail.com +Link: https://lkml.kernel.org/r/20240730-swap-allocator-v5-9-cb9c148b9297@kernel.org +Signed-off-by: Kairui Song +Reported-by: Barry Song <21cnbao@gmail.com> +Cc: Chris Li +Cc: "Huang, Ying" +Cc: Hugh Dickins +Cc: Kalesh Singh +Cc: Ryan Roberts +Cc: David Hildenbrand +Cc: Kairui Song +Signed-off-by: Andrew Morton +Signed-off-by: Liu Shixin +--- + include/linux/swap.h | 2 ++ + mm/swapfile.c | 68 +++++++++++++++++++++++++++++++++++--------- + 2 files changed, 57 insertions(+), 13 deletions(-) + +diff --git a/include/linux/swap.h b/include/linux/swap.h +index 83b1bcbaf2ec..1664655aa7c8 100644 +--- a/include/linux/swap.h ++++ b/include/linux/swap.h +@@ -272,6 +272,7 @@ struct swap_cluster_info { + #define CLUSTER_FLAG_FREE 1 /* This cluster is free */ + #define CLUSTER_FLAG_NONFULL 2 /* This cluster is on nonfull list */ + #define CLUSTER_FLAG_FRAG 4 /* This cluster is on nonfull list */ ++#define CLUSTER_FLAG_FULL 8 /* This cluster is on full list */ + + /* + * The first page in the swap file is the swap header, which is always marked +@@ -309,6 +310,7 @@ struct swap_info_struct { + unsigned char *swap_map; /* vmalloc'ed array of usage counts */ + struct swap_cluster_info *cluster_info; /* cluster info. Only for SSD */ + struct list_head free_clusters; /* free clusters list */ ++ struct list_head full_clusters; /* full clusters list */ + struct list_head nonfull_clusters[SWAP_NR_ORDERS]; + /* list of cluster that contains at least one free slot */ + struct list_head frag_clusters[SWAP_NR_ORDERS]; +diff --git a/mm/swapfile.c b/mm/swapfile.c +index 45f73b73a92f..389e14f0fc3c 100644 +--- a/mm/swapfile.c ++++ b/mm/swapfile.c +@@ -439,10 +439,7 @@ static void swap_cluster_schedule_discard(struct swap_info_struct *si, + SWAP_MAP_BAD, SWAPFILE_CLUSTER); + + VM_BUG_ON(ci->flags & CLUSTER_FLAG_FREE); +- if (ci->flags & CLUSTER_FLAG_NONFULL) +- list_move_tail(&ci->list, &si->discard_clusters); +- else +- list_add_tail(&ci->list, &si->discard_clusters); ++ list_move_tail(&ci->list, &si->discard_clusters); + ci->flags = 0; + schedule_work(&si->discard_work); + } +@@ -452,7 +449,7 @@ static void __free_cluster(struct swap_info_struct *si, struct swap_cluster_info + lockdep_assert_held(&si->lock); + lockdep_assert_held(&ci->lock); + +- if (ci->flags & CLUSTER_FLAG_NONFULL) ++ if (ci->flags) + list_move_tail(&ci->list, &si->free_clusters); + else + list_add_tail(&ci->list, &si->free_clusters); +@@ -479,7 +476,6 @@ static void swap_do_scheduled_discard(struct swap_info_struct *si) + SWAPFILE_CLUSTER); + + spin_lock(&si->lock); +- + spin_lock(&ci->lock); + __free_cluster(si, ci); + memset(si->swap_map + idx * SWAPFILE_CLUSTER, +@@ -575,12 +571,9 @@ static void dec_cluster_info_page(struct swap_info_struct *p, + + if (!(ci->flags & CLUSTER_FLAG_NONFULL)) { + VM_BUG_ON(ci->flags & CLUSTER_FLAG_FREE); +- if (ci->flags & CLUSTER_FLAG_FRAG) { ++ if (ci->flags & CLUSTER_FLAG_FRAG) + p->frag_cluster_nr[ci->order]--; +- list_move_tail(&ci->list, &p->nonfull_clusters[ci->order]); +- } else { +- list_add_tail(&ci->list, &p->nonfull_clusters[ci->order]); +- } ++ list_move_tail(&ci->list, &p->nonfull_clusters[ci->order]); + ci->flags = CLUSTER_FLAG_NONFULL; + } + } +@@ -673,8 +666,8 @@ static void cluster_alloc_range(struct swap_info_struct *si, struct 
swap_cluster + (CLUSTER_FLAG_FREE | CLUSTER_FLAG_NONFULL | CLUSTER_FLAG_FRAG))); + if (ci->flags & CLUSTER_FLAG_FRAG) + si->frag_cluster_nr[ci->order]--; +- list_del(&ci->list); +- ci->flags = 0; ++ list_move_tail(&ci->list, &si->full_clusters); ++ ci->flags = CLUSTER_FLAG_FULL; + } + } + +@@ -717,6 +710,46 @@ static unsigned int alloc_swap_scan_cluster(struct swap_info_struct *si, unsigne + return offset; + } + ++static void swap_reclaim_full_clusters(struct swap_info_struct *si) ++{ ++ long to_scan = 1; ++ unsigned long offset, end; ++ struct swap_cluster_info *ci; ++ unsigned char *map = si->swap_map; ++ int nr_reclaim, total_reclaimed = 0; ++ ++ if (atomic_long_read(&nr_swap_pages) <= SWAPFILE_CLUSTER) ++ to_scan = si->inuse_pages / SWAPFILE_CLUSTER; ++ ++ while (!list_empty(&si->full_clusters)) { ++ ci = list_first_entry(&si->full_clusters, struct swap_cluster_info, list); ++ list_move_tail(&ci->list, &si->full_clusters); ++ offset = cluster_offset(si, ci); ++ end = min(si->max, offset + SWAPFILE_CLUSTER); ++ to_scan--; ++ ++ while (offset < end) { ++ if (READ_ONCE(map[offset]) == SWAP_HAS_CACHE) { ++ spin_unlock(&si->lock); ++ nr_reclaim = __try_to_reclaim_swap(si, offset, ++ TTRS_ANYWAY | TTRS_DIRECT); ++ spin_lock(&si->lock); ++ if (nr_reclaim > 0) { ++ offset += nr_reclaim; ++ total_reclaimed += nr_reclaim; ++ continue; ++ } else if (nr_reclaim < 0) { ++ offset += -nr_reclaim; ++ continue; ++ } ++ } ++ offset++; ++ } ++ if (to_scan <= 0 || total_reclaimed) ++ break; ++ } ++} ++ + /* + * Try to get swap entries with specified order from current cpu's swap entry + * pool (a cluster). This might involve allocating a new cluster for current CPU +@@ -825,7 +858,15 @@ static unsigned long cluster_alloc_swap_entry(struct swap_info_struct *si, int o + goto done; + } + } ++ + done: ++ /* Try reclaim from full clusters if device is nearfull */ ++ if (vm_swap_full() && (!found || (si->pages - si->inuse_pages) < SWAPFILE_CLUSTER)) { ++ swap_reclaim_full_clusters(si); ++ if (!found && !order && si->pages != si->inuse_pages) ++ goto new_cluster; ++ } ++ + cluster->next[order] = offset; + return found; + } +@@ -3217,6 +3258,7 @@ static int setup_swap_map_and_extents(struct swap_info_struct *p, + nr_good_pages = maxpages - 1; /* omit header page */ + + INIT_LIST_HEAD(&p->free_clusters); ++ INIT_LIST_HEAD(&p->full_clusters); + INIT_LIST_HEAD(&p->discard_clusters); + + for (i = 0; i < SWAP_NR_ORDERS; i++) { +-- +Gitee + + +From c58f0af4fa7418fdeb2d6b4d1d8751b751649df9 Mon Sep 17 00:00:00 2001 +From: Kairui Song +Date: Wed, 18 Dec 2024 17:51:16 +0800 +Subject: [PATCH 11/14] mm, swap: fix allocation and scanning race with swapoff + +mainline inclusion +from mainline-v6.12 +commit 0ec8bc9e880eb576dc4492e8e0c7153ed0a71031 +category: bugfix +bugzilla: https://gitee.com/openeuler/kernel/issues/IBC5I1 + +Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=0ec8bc9e880eb576dc4492e8e0c7153ed0a71031 + +-------------------------------- + +There are two flags used to synchronize allocation and scanning with +swapoff: SWP_WRITEOK and SWP_SCANNING. + +SWP_WRITEOK: Swapoff will first unset this flag, at this point any further +swap allocation or scanning on this device should just abort so no more +new entries will be referencing this device. Swapoff will then unuse all +existing swap entries. + +SWP_SCANNING: This flag is set when device is being scanned. Swapoff will +wait for all scanner to stop before the final release of the swap device +structures to avoid UAF. 
Note this flag is the highest used bit of +si->flags so it could be added up arithmetically, if there are multiple +scanner. + +commit 5f843a9a3a1e ("mm: swap: separate SSD allocation from +scan_swap_map_slots()") ignored SWP_SCANNING and SWP_WRITEOK flags while +separating cluster allocation path from the old allocation path. Add the +flags back to fix swapoff race. The race is hard to trigger as si->lock +prevents most parallel operations, but si->lock could be dropped for +reclaim or discard. This issue is found during code review. + +This commit fixes this problem. For SWP_SCANNING, Just like before, set +the flag before scan and remove it afterwards. + +For SWP_WRITEOK, there are several places where si->lock could be dropped, +it will be error-prone and make the code hard to follow if we try to cover +these places one by one. So just do one check before the real allocation, +which is also very similar like before. With new cluster allocator it may +waste a bit of time iterating the clusters but won't take long, and +swapoff is not performance sensitive. + +Link: https://lkml.kernel.org/r/20241112083414.78174-1-ryncsn@gmail.com +Fixes: 5f843a9a3a1e ("mm: swap: separate SSD allocation from scan_swap_map_slots()") +Reported-by: "Huang, Ying" +Closes: https://lore.kernel.org/linux-mm/87a5es3f1f.fsf@yhuang6-desk2.ccr.corp.intel.com/ +Signed-off-by: Kairui Song +Cc: Barry Song +Cc: Chris Li +Cc: Hugh Dickins +Cc: Kalesh Singh +Cc: Ryan Roberts +Signed-off-by: Andrew Morton +Signed-off-by: Liu Shixin +--- + mm/swapfile.c | 22 +++++++++++++++++++--- + 1 file changed, 19 insertions(+), 3 deletions(-) + +diff --git a/mm/swapfile.c b/mm/swapfile.c +index 389e14f0fc3c..e620040b9181 100644 +--- a/mm/swapfile.c ++++ b/mm/swapfile.c +@@ -643,12 +643,15 @@ static bool cluster_scan_range(struct swap_info_struct *si, + return true; + } + +-static void cluster_alloc_range(struct swap_info_struct *si, struct swap_cluster_info *ci, ++static bool cluster_alloc_range(struct swap_info_struct *si, struct swap_cluster_info *ci, + unsigned int start, unsigned char usage, + unsigned int order) + { + unsigned int nr_pages = 1 << order; + ++ if (!(si->flags & SWP_WRITEOK)) ++ return false; ++ + if (cluster_is_free(ci)) { + if (nr_pages < SWAPFILE_CLUSTER) { + list_move_tail(&ci->list, &si->nonfull_clusters[order]); +@@ -669,6 +672,8 @@ static void cluster_alloc_range(struct swap_info_struct *si, struct swap_cluster + list_move_tail(&ci->list, &si->full_clusters); + ci->flags = CLUSTER_FLAG_FULL; + } ++ ++ return true; + } + + static unsigned int alloc_swap_scan_cluster(struct swap_info_struct *si, unsigned long offset, +@@ -692,7 +697,10 @@ static unsigned int alloc_swap_scan_cluster(struct swap_info_struct *si, unsigne + + while (offset <= end) { + if (cluster_scan_range(si, ci, offset, nr_pages)) { +- cluster_alloc_range(si, ci, offset, usage, order); ++ if (!cluster_alloc_range(si, ci, offset, usage, order)) { ++ offset = SWAP_NEXT_INVALID; ++ goto done; ++ } + *foundp = offset; + if (ci->count == SWAPFILE_CLUSTER) { + offset = SWAP_NEXT_INVALID; +@@ -775,7 +783,11 @@ static unsigned long cluster_alloc_swap_entry(struct swap_info_struct *si, int o + if (!list_empty(&si->free_clusters)) { + ci = list_first_entry(&si->free_clusters, struct swap_cluster_info, list); + offset = alloc_swap_scan_cluster(si, cluster_offset(si, ci), &found, order, usage); +- VM_BUG_ON(!found); ++ /* ++ * Either we didn't touch the cluster due to swapoff, ++ * or the allocation must success. 
++ */ ++ VM_BUG_ON((si->flags & SWP_WRITEOK) && !found); + goto done; + } + +@@ -997,6 +1009,8 @@ static int cluster_alloc_swap(struct swap_info_struct *si, + + VM_BUG_ON(!si->cluster_info); + ++ si->flags += SWP_SCANNING; ++ + while (n_ret < nr) { + unsigned long offset = cluster_alloc_swap_entry(si, order, usage); + +@@ -1005,6 +1019,8 @@ static int cluster_alloc_swap(struct swap_info_struct *si, + slots[n_ret++] = swp_entry(si->type, offset); + } + ++ si->flags -= SWP_SCANNING; ++ + return n_ret; + } + +-- +Gitee + + +From 6c0fa586bd1a1b04a8b5bc542e85cee15197075b Mon Sep 17 00:00:00 2001 +From: Jeongjun Park +Date: Wed, 18 Dec 2024 17:51:17 +0800 +Subject: [PATCH 12/14] mm: swap: prevent possible data-race in + __try_to_reclaim_swap + +mainline inclusion +from mainline-v6.12-rc4 +commit 818f916e3a07bf0c64bbf5e250ad209eebe21c85 +category: bugfix +bugzilla: https://gitee.com/openeuler/kernel/issues/IBC5I1 + +Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=818f916e3a07bf0c64bbf5e250ad209eebe21c85 + +-------------------------------- + +A report [1] was uploaded from syzbot. + +In the previous commit 862590ac3708 ("mm: swap: allow cache reclaim to +skip slot cache"), the __try_to_reclaim_swap() function reads offset and +folio->entry from folio without folio_lock protection. + +In the currently reported KCSAN log, it is assumed that the actual +data-race will not occur because the calltrace that does WRITE already +obtains the folio_lock and then writes. + +However, the existing __try_to_reclaim_swap() function was already +implemented to perform reads under folio_lock protection [1], and there is +a risk of a data-race occurring through a function other than the one +shown in the KCSAN log. + +Therefore, I think it is appropriate to change +read operations for folio to be performed under folio_lock. 
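+
+The rule being adopted can be sketched in plain C; this is an illustration
+for readers of this backport only, struct toy_folio and toy_read_swap()
+are invented names, and a pthread mutex merely stands in for the folio
+lock bit. Fields that a concurrent task may clear while holding the folio
+lock are read only after the lock is taken, and a failed trylock simply
+bails out, as a failed folio_trylock() does in __try_to_reclaim_swap().
+
+#include <pthread.h>
+#include <stdbool.h>
+
+struct toy_folio {
+    pthread_mutex_t lock;           /* stand-in for the folio lock bit */
+    unsigned long swap;             /* stand-in for folio->swap.val */
+};
+
+/* Fixed ordering: dereference ->swap only once the lock is held. */
+static bool toy_read_swap(struct toy_folio *f, unsigned long *out)
+{
+    if (pthread_mutex_trylock(&f->lock) != 0)
+        return false;               /* lock contended: give up for now */
+    *out = f->swap;                 /* safe: writers also hold the lock */
+    pthread_mutex_unlock(&f->lock);
+    return true;
+}
+
+int main(void)
+{
+    struct toy_folio f = { PTHREAD_MUTEX_INITIALIZER, 0x1234 };
+    unsigned long val;
+
+    return toy_read_swap(&f, &val) ? 0 : 1;
+}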
+ +[1] + +================================================================== +BUG: KCSAN: data-race in __delete_from_swap_cache / __try_to_reclaim_swap + +write to 0xffffea0004c90328 of 8 bytes by task 5186 on cpu 0: + __delete_from_swap_cache+0x1f0/0x290 mm/swap_state.c:163 + delete_from_swap_cache+0x72/0xe0 mm/swap_state.c:243 + folio_free_swap+0x1d8/0x1f0 mm/swapfile.c:1850 + free_swap_cache mm/swap_state.c:293 [inline] + free_pages_and_swap_cache+0x1fc/0x410 mm/swap_state.c:325 + __tlb_batch_free_encoded_pages mm/mmu_gather.c:136 [inline] + tlb_batch_pages_flush mm/mmu_gather.c:149 [inline] + tlb_flush_mmu_free mm/mmu_gather.c:366 [inline] + tlb_flush_mmu+0x2cf/0x440 mm/mmu_gather.c:373 + zap_pte_range mm/memory.c:1700 [inline] + zap_pmd_range mm/memory.c:1739 [inline] + zap_pud_range mm/memory.c:1768 [inline] + zap_p4d_range mm/memory.c:1789 [inline] + unmap_page_range+0x1f3c/0x22d0 mm/memory.c:1810 + unmap_single_vma+0x142/0x1d0 mm/memory.c:1856 + unmap_vmas+0x18d/0x2b0 mm/memory.c:1900 + exit_mmap+0x18a/0x690 mm/mmap.c:1864 + __mmput+0x28/0x1b0 kernel/fork.c:1347 + mmput+0x4c/0x60 kernel/fork.c:1369 + exit_mm+0xe4/0x190 kernel/exit.c:571 + do_exit+0x55e/0x17f0 kernel/exit.c:926 + do_group_exit+0x102/0x150 kernel/exit.c:1088 + get_signal+0xf2a/0x1070 kernel/signal.c:2917 + arch_do_signal_or_restart+0x95/0x4b0 arch/x86/kernel/signal.c:337 + exit_to_user_mode_loop kernel/entry/common.c:111 [inline] + exit_to_user_mode_prepare include/linux/entry-common.h:328 [inline] + __syscall_exit_to_user_mode_work kernel/entry/common.c:207 [inline] + syscall_exit_to_user_mode+0x59/0x130 kernel/entry/common.c:218 + do_syscall_64+0xd6/0x1c0 arch/x86/entry/common.c:89 + entry_SYSCALL_64_after_hwframe+0x77/0x7f + +read to 0xffffea0004c90328 of 8 bytes by task 5189 on cpu 1: + __try_to_reclaim_swap+0x9d/0x510 mm/swapfile.c:198 + free_swap_and_cache_nr+0x45d/0x8a0 mm/swapfile.c:1915 + zap_pte_range mm/memory.c:1656 [inline] + zap_pmd_range mm/memory.c:1739 [inline] + zap_pud_range mm/memory.c:1768 [inline] + zap_p4d_range mm/memory.c:1789 [inline] + unmap_page_range+0xcf8/0x22d0 mm/memory.c:1810 + unmap_single_vma+0x142/0x1d0 mm/memory.c:1856 + unmap_vmas+0x18d/0x2b0 mm/memory.c:1900 + exit_mmap+0x18a/0x690 mm/mmap.c:1864 + __mmput+0x28/0x1b0 kernel/fork.c:1347 + mmput+0x4c/0x60 kernel/fork.c:1369 + exit_mm+0xe4/0x190 kernel/exit.c:571 + do_exit+0x55e/0x17f0 kernel/exit.c:926 + __do_sys_exit kernel/exit.c:1055 [inline] + __se_sys_exit kernel/exit.c:1053 [inline] + __x64_sys_exit+0x1f/0x20 kernel/exit.c:1053 + x64_sys_call+0x2d46/0x2d60 arch/x86/include/generated/asm/syscalls_64.h:61 + do_syscall_x64 arch/x86/entry/common.c:52 [inline] + do_syscall_64+0xc9/0x1c0 arch/x86/entry/common.c:83 + entry_SYSCALL_64_after_hwframe+0x77/0x7f + +value changed: 0x0000000000000242 -> 0x0000000000000000 + +Link: https://lkml.kernel.org/r/20241007070623.23340-1-aha310510@gmail.com +Reported-by: syzbot+fa43f1b63e3aa6f66329@syzkaller.appspotmail.com +Fixes: 862590ac3708 ("mm: swap: allow cache reclaim to skip slot cache") +Signed-off-by: Jeongjun Park +Acked-by: Chris Li +Reviewed-by: Kairui Song +Signed-off-by: Andrew Morton +Signed-off-by: Liu Shixin +--- + mm/swapfile.c | 7 ++++--- + 1 file changed, 4 insertions(+), 3 deletions(-) + +diff --git a/mm/swapfile.c b/mm/swapfile.c +index e620040b9181..c5148f16fb53 100644 +--- a/mm/swapfile.c ++++ b/mm/swapfile.c +@@ -174,9 +174,6 @@ static int __try_to_reclaim_swap(struct swap_info_struct *si, + if (IS_ERR(folio)) + return 0; + +- /* offset could point to the middle of a 
large folio */ +- entry = folio->swap; +- offset = swp_offset(entry); + nr_pages = folio_nr_pages(folio); + ret = -nr_pages; + +@@ -190,6 +187,10 @@ static int __try_to_reclaim_swap(struct swap_info_struct *si, + if (!folio_trylock(folio)) + goto out; + ++ /* offset could point to the middle of a large folio */ ++ entry = folio->swap; ++ offset = swp_offset(entry); ++ + need_reclaim = ((flags & TTRS_ANYWAY) || + ((flags & TTRS_UNMAPPED) && !folio_mapped(folio)) || + ((flags & TTRS_FULL) && mem_cgroup_swap_full(folio))); +-- +Gitee + + +From 849e43b208ba22a3ce5dd24388afe85ee6d30e82 Mon Sep 17 00:00:00 2001 +From: Kairui Song +Date: Wed, 18 Dec 2024 17:51:18 +0800 +Subject: [PATCH 13/14] mm, swap: avoid over reclaim of full clusters + +mainline inclusion +from mainline-v6.12-rc6 +commit 5168a68eb78fa1c67a8b2d31d0642c7fd866cc12 +category: bugfix +bugzilla: https://gitee.com/openeuler/kernel/issues/IBC5I1 + +Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=5168a68eb78fa1c67a8b2d31d0642c7fd866cc12 + +-------------------------------- + +When running low on usable slots, cluster allocator will try to reclaim +the full clusters aggressively to reclaim HAS_CACHE slots. This +guarantees that as long as there are any usable slots, HAS_CACHE or not, +the swap device will be usable and workload won't go OOM early. + +Before the cluster allocator, swap allocator fails easily if device is +filled up with reclaimable HAS_CACHE slots. Which can be easily +reproduced with following simple program: + + #include + #include + #include + #include + #define SIZE 8192UL * 1024UL * 1024UL + int main(int argc, char **argv) { + long tmp; + char *p = mmap(NULL, SIZE, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + memset(p, 0, SIZE); + madvise(p, SIZE, MADV_PAGEOUT); + for (unsigned long i = 0; i < SIZE; ++i) + tmp += p[i]; + getchar(); /* Pause */ + return 0; + } + +Setup an 8G non ramdisk swap, the first run of the program will swapout 8G +ram successfully. But run same program again after the first run paused, +the second run can't swapout all 8G memory as now half of the swap device +is pinned by HAS_CACHE. There was a random scan in the old allocator that +may reclaim part of the HAS_CACHE by luck, but it's unreliable. + +The new allocator's added reclaim of full clusters when device is low on +usable slots. But when multiple CPUs are seeing the device is low on +usable slots at the same time, they ran into a thundering herd problem. + +This is an observable problem on large machine with mass parallel +workload, as full cluster reclaim is slower on large swap device and +higher number of CPUs will also make things worse. + +Testing using a 128G ZRAM on a 48c96t system. When the swap device is +very close to full (eg. 124G / 128G), running build linux kernel with +make -j96 in a 1G memory cgroup will hung (not a softlockup though) +spinning in full cluster reclaim for about ~5min before go OOM. + +To solve this, split the full reclaim into two parts: + +- Instead of do a synchronous aggressively reclaim when device is low, + do only one aggressively reclaim when device is strictly full with a + kworker. This still ensures in worst case the device won't be unusable + because of HAS_CACHE slots. + +- To avoid allocation (especially higher order) suffer from HAS_CACHE + filling up clusters and kworker not responsive enough, do one synchronous + scan every time the free list is drained, and only scan one cluster. 
This + is kind of similar to the random reclaim before, keeps the full clusters + rotated and has a minimal latency. This should provide a fair reclaim + strategy suitable for most workloads. + +Link: https://lkml.kernel.org/r/20241022175512.10398-1-ryncsn@gmail.com +Fixes: 2cacbdfdee65 ("mm: swap: add a adaptive full cluster cache reclaim") +Signed-off-by: Kairui Song +Cc: Barry Song +Cc: Chris Li +Cc: "Huang, Ying" +Cc: Hugh Dickins +Cc: Kalesh Singh +Cc: Ryan Roberts +Cc: Yosry Ahmed +Signed-off-by: Andrew Morton +Conflicts: + mm/swapfile.c +[ Context conflict with commit b85508d7de90. ] +Signed-off-by: Liu Shixin +--- + include/linux/swap.h | 1 + + mm/swapfile.c | 49 +++++++++++++++++++++++++++----------------- + 2 files changed, 31 insertions(+), 19 deletions(-) + +diff --git a/include/linux/swap.h b/include/linux/swap.h +index 1664655aa7c8..33396153afc0 100644 +--- a/include/linux/swap.h ++++ b/include/linux/swap.h +@@ -348,6 +348,7 @@ struct swap_info_struct { + * list. + */ + struct work_struct discard_work; /* discard worker */ ++ struct work_struct reclaim_work; /* reclaim worker */ + struct list_head discard_clusters; /* discard clusters list */ + KABI_RESERVE(1) + KABI_RESERVE(2) +diff --git a/mm/swapfile.c b/mm/swapfile.c +index c5148f16fb53..6f3cbf3a2f0d 100644 +--- a/mm/swapfile.c ++++ b/mm/swapfile.c +@@ -719,15 +719,16 @@ static unsigned int alloc_swap_scan_cluster(struct swap_info_struct *si, unsigne + return offset; + } + +-static void swap_reclaim_full_clusters(struct swap_info_struct *si) ++/* Return true if reclaimed a whole cluster */ ++static void swap_reclaim_full_clusters(struct swap_info_struct *si, bool force) + { + long to_scan = 1; + unsigned long offset, end; + struct swap_cluster_info *ci; + unsigned char *map = si->swap_map; +- int nr_reclaim, total_reclaimed = 0; ++ int nr_reclaim; + +- if (atomic_long_read(&nr_swap_pages) <= SWAPFILE_CLUSTER) ++ if (force) + to_scan = si->inuse_pages / SWAPFILE_CLUSTER; + + while (!list_empty(&si->full_clusters)) { +@@ -737,28 +738,36 @@ static void swap_reclaim_full_clusters(struct swap_info_struct *si) + end = min(si->max, offset + SWAPFILE_CLUSTER); + to_scan--; + ++ spin_unlock(&si->lock); + while (offset < end) { + if (READ_ONCE(map[offset]) == SWAP_HAS_CACHE) { +- spin_unlock(&si->lock); + nr_reclaim = __try_to_reclaim_swap(si, offset, + TTRS_ANYWAY | TTRS_DIRECT); +- spin_lock(&si->lock); +- if (nr_reclaim > 0) { +- offset += nr_reclaim; +- total_reclaimed += nr_reclaim; +- continue; +- } else if (nr_reclaim < 0) { +- offset += -nr_reclaim; ++ if (nr_reclaim) { ++ offset += abs(nr_reclaim); + continue; + } + } + offset++; + } +- if (to_scan <= 0 || total_reclaimed) ++ spin_lock(&si->lock); ++ ++ if (to_scan <= 0) + break; + } + } + ++static void swap_reclaim_work(struct work_struct *work) ++{ ++ struct swap_info_struct *si; ++ ++ si = container_of(work, struct swap_info_struct, reclaim_work); ++ ++ spin_lock(&si->lock); ++ swap_reclaim_full_clusters(si, true); ++ spin_unlock(&si->lock); ++} ++ + /* + * Try to get swap entries with specified order from current cpu's swap entry + * pool (a cluster). 
This might involve allocating a new cluster for current CPU +@@ -792,6 +801,10 @@ static unsigned long cluster_alloc_swap_entry(struct swap_info_struct *si, int o + goto done; + } + ++ /* Try reclaim from full clusters if free clusters list is drained */ ++ if (vm_swap_full()) ++ swap_reclaim_full_clusters(si, false); ++ + if (order < PMD_ORDER) { + unsigned int frags = 0; + +@@ -873,13 +886,6 @@ static unsigned long cluster_alloc_swap_entry(struct swap_info_struct *si, int o + } + + done: +- /* Try reclaim from full clusters if device is nearfull */ +- if (vm_swap_full() && (!found || (si->pages - si->inuse_pages) < SWAPFILE_CLUSTER)) { +- swap_reclaim_full_clusters(si); +- if (!found && !order && si->pages != si->inuse_pages) +- goto new_cluster; +- } +- + cluster->next[order] = offset; + return found; + } +@@ -914,6 +920,9 @@ static void swap_range_alloc(struct swap_info_struct *si, unsigned long offset, + si->lowest_bit = si->max; + si->highest_bit = 0; + del_from_avail_list(si); ++ ++ if (vm_swap_full()) ++ schedule_work(&si->reclaim_work); + } + } + +@@ -2846,6 +2855,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) + wait_for_completion(&p->comp); + + flush_work(&p->discard_work); ++ flush_work(&p->reclaim_work); + + destroy_swap_extents(p); + if (p->flags & SWP_CONTINUED) +@@ -3382,6 +3392,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) + return PTR_ERR(p); + + INIT_WORK(&p->discard_work, swap_discard_work); ++ INIT_WORK(&p->reclaim_work, swap_reclaim_work); + + name = getname(specialfile); + if (IS_ERR(name)) { +-- +Gitee + + +From f19bcc77fc060549322618028b1ab9df253474ea Mon Sep 17 00:00:00 2001 +From: Johannes Weiner +Date: Wed, 18 Dec 2024 17:51:19 +0800 +Subject: [PATCH 14/14] mm: swapfile: fix cluster reclaim work crash on + rotational devices + +mainline inclusion +from mainline-v6.12 +commit dcf32ea7ecede94796fb30231b3969d7c838374c +category: bugfix +bugzilla: https://gitee.com/openeuler/kernel/issues/IBC5I1 + +Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=dcf32ea7ecede94796fb30231b3969d7c838374c + +-------------------------------- + +syzbot and Daan report a NULL pointer crash in the new full swap cluster +reclaim work: + +> Oops: general protection fault, probably for non-canonical address 0xdffffc0000000001: 0000 [#1] PREEMPT SMP KASAN PTI +> KASAN: null-ptr-deref in range [0x0000000000000008-0x000000000000000f] +> CPU: 1 UID: 0 PID: 51 Comm: kworker/1:1 Not tainted 6.12.0-rc6-syzkaller #0 +> Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 09/13/2024 +> Workqueue: events swap_reclaim_work +> RIP: 0010:__list_del_entry_valid_or_report+0x20/0x1c0 lib/list_debug.c:49 +> Code: 90 90 90 90 90 90 90 90 90 90 f3 0f 1e fa 48 89 fe 48 83 c7 08 48 83 ec 18 48 b8 00 00 00 00 00 fc ff df 48 89 fa 48 c1 ea 03 <80> 3c 02 00 0f 85 19 01 00 00 48 89 f2 48 8b 4e 08 48 b8 00 00 00 +> RSP: 0018:ffffc90000bb7c30 EFLAGS: 00010202 +> RAX: dffffc0000000000 RBX: 0000000000000000 RCX: ffff88807b9ae078 +> RDX: 0000000000000001 RSI: 0000000000000000 RDI: 0000000000000008 +> RBP: 0000000000000001 R08: 0000000000000001 R09: 0000000000000000 +> R10: 0000000000000001 R11: 000000000000004f R12: dffffc0000000000 +> R13: ffffffffffffffb8 R14: ffff88807b9ae000 R15: ffffc90003af1000 +> FS: 0000000000000000(0000) GS:ffff8880b8700000(0000) knlGS:0000000000000000 +> CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 +> CR2: 00007fffaca68fb8 CR3: 00000000791c8000 CR4: 00000000003526f0 +> DR0: 
0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 +> DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 +> Call Trace: +> +> __list_del_entry_valid include/linux/list.h:124 [inline] +> __list_del_entry include/linux/list.h:215 [inline] +> list_move_tail include/linux/list.h:310 [inline] +> swap_reclaim_full_clusters+0x109/0x460 mm/swapfile.c:748 +> swap_reclaim_work+0x2e/0x40 mm/swapfile.c:779 + +The syzbot console output indicates a virtual environment where swapfile +is on a rotational device. In this case, clusters aren't actually used, +and si->full_clusters is not initialized. Daan's report is from qemu, so +likely rotational too. + +Make sure to only schedule the cluster reclaim work when clusters are +actually in use. + +Link: https://lkml.kernel.org/r/20241107142335.GB1172372@cmpxchg.org +Link: https://lore.kernel.org/lkml/672ac50b.050a0220.2edce.1517.GAE@google.com/ +Link: https://github.com/systemd/systemd/issues/35044 +Fixes: 5168a68eb78f ("mm, swap: avoid over reclaim of full clusters") +Reported-by: syzbot+078be8bfa863cb9e0c6b@syzkaller.appspotmail.com +Signed-off-by: Johannes Weiner +Reported-by: Daan De Meyer +Cc: Kairui Song +Signed-off-by: Andrew Morton +Signed-off-by: Liu Shixin +--- + mm/swapfile.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/mm/swapfile.c b/mm/swapfile.c +index 6f3cbf3a2f0d..3b48159820f2 100644 +--- a/mm/swapfile.c ++++ b/mm/swapfile.c +@@ -921,7 +921,7 @@ static void swap_range_alloc(struct swap_info_struct *si, unsigned long offset, + si->highest_bit = 0; + del_from_avail_list(si); + +- if (vm_swap_full()) ++ if (si->cluster_info && vm_swap_full()) + schedule_work(&si->reclaim_work); + } + } +-- +Gitee + diff --git a/kernel.spec b/kernel.spec index 9e7c04d..41a61f1 100644 --- a/kernel.spec +++ b/kernel.spec @@ -1,5 +1,5 @@ %define with_signmodules 1 -%define with_kabichk 1 +%define with_kabichk 0 # Default without toolchain_clang %bcond_with toolchain_clang @@ -42,7 +42,7 @@ rm -f test_openEuler_sign.ko test_openEuler_sign.ko.sig %global upstream_sublevel 0 %global devel_release 68 %global maintenance_release .0.0 -%global pkg_release .74 +%global pkg_release .77 %global openeuler_lts 1 %global openeuler_major 2403 @@ -130,6 +130,25 @@ Patch0001: 0001-riscv-kernel.patch Patch0002: 0002-cpupower-clang-compile-support.patch Patch0003: 0003-x86_energy_perf_policy-clang-compile-support.patch Patch0004: 0004-turbostat-clang-compile-support.patch +Patch0005: 0005-include-msi-modify-kabi-size-of-msi_desc.patch +Patch0007: 0007-nfs-fix-the-loss-of-superblock-s-initialized-flags.patch +Patch0008: 0008-x86-config-Enable-CONFIG_CMA-by-default-in-openeuler.patch +Patch0009: 0009-x86-Kconfig-Select-CONFIG_CMA-if-CONFIG_HYGON_CSV-y.patch +Patch0010: 0010-tcp-Fix-use-after-free-of-nreq-in-reqsk_timer_handle.patch +Patch0012: 0012-bpf-Add-kabi-reserve-padding-for-uapi-struct-bpf_lin.patch +Patch0013: 0013-iommu-Reserve-extra-KABI-entry-for-struct-iopf_group.patch +Patch0014: 0014-seq_file-kabi-KABI-reservation-for-seq_file.patch +Patch0015: 0015-statx-kabi-KABI-reservation-for-kstat.patch +Patch0016: 0016-fs-Allow-fine-grained-control-of-folio-sizes.patch +Patch0017: 0017-Revert-cgroup-fix-uaf-when-proc_cpuset_show.patch +Patch0018: 0018-cgroup-Make-operations-on-the-cgroup-root_list-RCU-s.patch +Patch0019: 0019-cgroup-Move-rcu_head-up-near-the-top-of-cgroup_root.patch +Patch0020: 0020-cgroup-cpuset-Prevent-UAF-in-proc_cpuset_show.patch +Patch0021: 0021-cgroup-add-more-reserve-kabi.patch +Patch0022: 0022-14223.patch 
+Patch0023: 0023-14224.patch +Patch0024: 0024-14225.patch +Patch0026: 0026-14227.patch #BuildRequires: BuildRequires: module-init-tools, patch >= 2.5.4, bash >= 2.03, tar @@ -332,6 +351,26 @@ tar -xjf %{SOURCE9998} mv kernel linux-%{KernelVer} cd linux-%{KernelVer} +%patch0005 -p1 +%patch0007 -p1 +%patch0008 -p1 +%patch0009 -p1 +%patch0010 -p1 +%patch0012 -p1 +%patch0013 -p1 +%patch0014 -p1 +%patch0015 -p1 +%patch0016 -p1 +%patch0017 -p1 +%patch0018 -p1 +%patch0019 -p1 +%patch0020 -p1 +%patch0021 -p1 +%patch0022 -p1 +%patch0023 -p1 +%patch0024 -p1 +%patch0026 -p1 + %if 0%{?with_patch} cp %{SOURCE9000} . cp %{SOURCE9001} . @@ -1092,6 +1131,9 @@ fi %endif %changelog +* Thu Dec 19 2024 Zheng Zengkai - 6.6.0-68.0.0.77 +- performance test for kabi exclude sched + * Wed Dec 18 2024 Liu Yanze - 6.6.0-68.0.0.74 - kabi: add kabi_ext2 list for checking - kernel.spec: fix with_kabichk on non-arm64 platform -- Gitee