From b6a62a5558490d727b2c4aa4fc964dcde1510ae8 Mon Sep 17 00:00:00 2001 From: Ma Wupeng Date: Fri, 11 Nov 2022 09:32:20 +0800 Subject: [PATCH 01/27] mm: reliable: Return corrent errno in reliable_check hulk inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4SK3S CVE: NA -------------------------------- LTP's proc01 test was failing be because this ret code (1): proc01 1 TFAIL : proc01.c:400: read failed: /proc/self/task/1406366/reliable: errno=EPERM(1): Operation not permitted To slove this problem, return corrent errno in reliable_check(). Signed-off-by: Ma Wupeng Reviewed-by: Kefeng Wang --- fs/proc/base.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/proc/base.c b/fs/proc/base.c index 9b4666e757f0..cc1fdff2e136 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -1264,14 +1264,14 @@ static const struct file_operations proc_oom_score_adj_operations = { static inline int reliable_check(struct task_struct *task, struct pid *pid) { if (!mem_reliable_is_enabled()) - return -EPERM; + return -EACCES; if (is_global_init(task)) - return -EPERM; + return -EINVAL; if (!task->mm || (task->flags & PF_KTHREAD) || (task->flags & PF_EXITING)) - return -EPERM; + return -EINVAL; return 0; } -- Gitee From 55ac3d0652e16a7d9c0e0c172f4542f67c0f34d1 Mon Sep 17 00:00:00 2001 From: Ma Wupeng Date: Fri, 11 Nov 2022 09:32:21 +0800 Subject: [PATCH 02/27] mm: Drop shmem reliable related log during startup hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4SK3S CVE: NA -------------------------------- Message "shmem reliable disabled." will be printed if memory reliable is disabled. This is not necessary so drop it. 
Signed-off-by: Ma Wupeng Reviewed-by: Kefeng Wang --- mm/mem_reliable.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/mm/mem_reliable.c b/mm/mem_reliable.c index b1bc749532a4..15a84c0f714e 100644 --- a/mm/mem_reliable.c +++ b/mm/mem_reliable.c @@ -101,13 +101,8 @@ void mem_reliable_init(bool has_unmirrored_mem, unsigned long *zone_movable_pfn) void shmem_reliable_init(void) { - if (!shmem_reliable_is_enabled()) - return; - - if (!mem_reliable_is_enabled()) { + if (!mem_reliable_is_enabled() || !shmem_reliable_is_enabled()) shmem_reliable = false; - pr_info("shmem reliable disabled.\n"); - } } void reliable_report_meminfo(struct seq_file *m) -- Gitee From b35711297503445fa24c78d195f070ccb2f6d260 Mon Sep 17 00:00:00 2001 From: Ma Wupeng Date: Fri, 11 Nov 2022 09:32:22 +0800 Subject: [PATCH 03/27] mm: Export static key mem_reliable hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4SK3S CVE: NA -------------------------------- Static key mem_reliable is used to check memory reliable's status in kernel's inline functions. These inline functions rely on this, but drivers can not use it because this symbol is not exported. To solve this problem, export this symbol in preparation for drivers to use memory reliable's inline functions. 
Signed-off-by: Ma Wupeng Reviewed-by: Kefeng Wang --- include/linux/mem_reliable.h | 2 +- mm/mem_reliable.c | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/include/linux/mem_reliable.h b/include/linux/mem_reliable.h index 38891cb2fa83..c3a9a4f21470 100644 --- a/include/linux/mem_reliable.h +++ b/include/linux/mem_reliable.h @@ -10,7 +10,7 @@ #ifdef CONFIG_MEMORY_RELIABLE -extern struct static_key_false mem_reliable; +DECLARE_STATIC_KEY_FALSE(mem_reliable); extern bool reliable_enabled; extern bool shmem_reliable; diff --git a/mm/mem_reliable.c b/mm/mem_reliable.c index 15a84c0f714e..083997234e01 100644 --- a/mm/mem_reliable.c +++ b/mm/mem_reliable.c @@ -9,6 +9,7 @@ #include DEFINE_STATIC_KEY_FALSE(mem_reliable); +EXPORT_SYMBOL_GPL(mem_reliable); bool reliable_enabled; -- Gitee From 64f874c82b31b453af50f8803d0cca349cc5f843 Mon Sep 17 00:00:00 2001 From: Ma Wupeng Date: Fri, 11 Nov 2022 09:32:23 +0800 Subject: [PATCH 04/27] mm: Export mem_reliable_status() for checking memory reliable status hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4SK3S CVE: NA -------------------------------- Export the mem_reliable_status(), so it can be used by others to check memory reliable's status. 
Signed-off-by: Ma Wupeng Reviewed-by: Kefeng Wang --- include/linux/mem_reliable.h | 2 ++ mm/mem_reliable.c | 6 ++++++ 2 files changed, 8 insertions(+) diff --git a/include/linux/mem_reliable.h b/include/linux/mem_reliable.h index c3a9a4f21470..b75feac5e33c 100644 --- a/include/linux/mem_reliable.h +++ b/include/linux/mem_reliable.h @@ -21,6 +21,7 @@ extern void mem_reliable_init(bool has_unmirrored_mem, extern void shmem_reliable_init(void); extern void reliable_report_meminfo(struct seq_file *m); extern void page_cache_prepare_alloc(gfp_t *gfp); +extern bool mem_reliable_status(void); static inline bool mem_reliable_is_enabled(void) { @@ -70,6 +71,7 @@ static inline bool skip_none_movable_zone(gfp_t gfp, struct zoneref *z) static inline void reliable_report_meminfo(struct seq_file *m) {} static inline bool shmem_reliable_is_enabled(void) { return false; } static inline void page_cache_prepare_alloc(gfp_t *gfp) {} +static inline bool mem_reliable_status(void) { return false; } #endif #endif diff --git a/mm/mem_reliable.c b/mm/mem_reliable.c index 083997234e01..fe6f47a0b4cd 100644 --- a/mm/mem_reliable.c +++ b/mm/mem_reliable.c @@ -16,6 +16,12 @@ bool reliable_enabled; static atomic_long_t total_reliable_mem; bool shmem_reliable __read_mostly = true; +bool mem_reliable_status(void) +{ + return mem_reliable_is_enabled(); +} +EXPORT_SYMBOL_GPL(mem_reliable_status); + void page_cache_prepare_alloc(gfp_t *gfp) { if (mem_reliable_is_enabled()) -- Gitee From 5baa7afdb688fcff8de12df0d67e084482aad86d Mon Sep 17 00:00:00 2001 From: Ma Wupeng Date: Fri, 11 Nov 2022 09:32:24 +0800 Subject: [PATCH 05/27] mm: Refactor code in reliable_report_meminfo() hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4SK3S CVE: NA -------------------------------- Use show_val_kb() to format meminfo. 
Signed-off-by: Ma Wupeng Reviewed-by: Kefeng Wang --- mm/mem_reliable.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/mm/mem_reliable.c b/mm/mem_reliable.c index fe6f47a0b4cd..d46fe86563bd 100644 --- a/mm/mem_reliable.c +++ b/mm/mem_reliable.c @@ -112,13 +112,19 @@ void shmem_reliable_init(void) shmem_reliable = false; } +static void show_val_kb(struct seq_file *m, const char *s, unsigned long num) +{ + seq_put_decimal_ull_width(m, s, num << (PAGE_SHIFT - 10), 8); + seq_write(m, " kB\n", 4); +} + void reliable_report_meminfo(struct seq_file *m) { if (!mem_reliable_is_enabled()) return; - seq_printf(m, "ReliableTotal: %8lu kB\n", - total_reliable_mem_sz() >> 10); - seq_printf(m, "ReliableUsed: %8lu kB\n", - used_reliable_mem_sz() >> 10); + show_val_kb(m, "ReliableTotal: ", + total_reliable_mem_sz() >> PAGE_SHIFT); + show_val_kb(m, "ReliableUsed: ", + used_reliable_mem_sz() >> PAGE_SHIFT); } -- Gitee From 4623cc86dab254a236933f3df7af191eca9de3f7 Mon Sep 17 00:00:00 2001 From: Ma Wupeng Date: Fri, 11 Nov 2022 09:32:25 +0800 Subject: [PATCH 06/27] mm: Count reliable memory info based on zone info hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4SK3S CVE: NA -------------------------------- Count reliable memory info based on zone info. Any zone below ZONE_MOVABLE is seen as a reliable zone and the pages there are summed. Signed-off-by: Ma Wupeng Reviewed-by: Kefeng Wang --- Documentation/filesystems/proc.rst | 3 ++ include/linux/mem_reliable.h | 14 ++---- mm/mem_reliable.c | 73 ++++++++++-------------------- mm/page_alloc.c | 6 ++- 4 files changed, 34 insertions(+), 62 deletions(-) diff --git a/Documentation/filesystems/proc.rst b/Documentation/filesystems/proc.rst index f6783bb99e3f..6ae531ee4de9 100644 --- a/Documentation/filesystems/proc.rst +++ b/Documentation/filesystems/proc.rst @@ -971,6 +971,7 @@ varies by architecture and compile options. 
The following is from a ShmemPmdMapped: 0 kB ReliableTotal: 7340032 kB ReliableUsed: 418824 kB + ReliableBuddyMem: 418824 kB MemTotal Total usable RAM (i.e. physical RAM minus a few reserved @@ -1104,6 +1105,8 @@ ReliableTotal Total reliable memory size ReliableUsed The used amount of reliable memory +ReliableBuddyMem + Size of unused mirrored memory in buddy system vmallocinfo ~~~~~~~~~~~ diff --git a/include/linux/mem_reliable.h b/include/linux/mem_reliable.h index b75feac5e33c..7b22229068f1 100644 --- a/include/linux/mem_reliable.h +++ b/include/linux/mem_reliable.h @@ -15,9 +15,9 @@ DECLARE_STATIC_KEY_FALSE(mem_reliable); extern bool reliable_enabled; extern bool shmem_reliable; -extern void add_reliable_mem_size(long sz); extern void mem_reliable_init(bool has_unmirrored_mem, - unsigned long *zone_movable_pfn); + unsigned long *zone_movable_pfn, + unsigned long mirrored_sz); extern void shmem_reliable_init(void); extern void reliable_report_meminfo(struct seq_file *m); extern void page_cache_prepare_alloc(gfp_t *gfp); @@ -28,11 +28,6 @@ static inline bool mem_reliable_is_enabled(void) return static_branch_likely(&mem_reliable); } -static inline bool zone_reliable(struct zone *zone) -{ - return mem_reliable_is_enabled() && zone_idx(zone) < ZONE_MOVABLE; -} - static inline bool skip_none_movable_zone(gfp_t gfp, struct zoneref *z) { if (!mem_reliable_is_enabled()) @@ -59,11 +54,10 @@ static inline bool shmem_reliable_is_enabled(void) #define reliable_enabled 0 static inline bool mem_reliable_is_enabled(void) { return false; } -static inline void add_reliable_mem_size(long sz) {} static inline void mem_reliable_init(bool has_unmirrored_mem, - unsigned long *zone_movable_pfn) {} + unsigned long *zone_movable_pfn, + unsigned long mirrored_sz) {} static inline void shmem_reliable_init(void) {} -static inline bool zone_reliable(struct zone *zone) { return false; } static inline bool skip_none_movable_zone(gfp_t gfp, struct zoneref *z) { return false; diff --git 
a/mm/mem_reliable.c b/mm/mem_reliable.c index d46fe86563bd..876335fc4060 100644 --- a/mm/mem_reliable.c +++ b/mm/mem_reliable.c @@ -8,12 +8,12 @@ #include #include +#define PAGES_TO_B(n_pages) ((n_pages) << PAGE_SHIFT) + DEFINE_STATIC_KEY_FALSE(mem_reliable); EXPORT_SYMBOL_GPL(mem_reliable); bool reliable_enabled; - -static atomic_long_t total_reliable_mem; bool shmem_reliable __read_mostly = true; bool mem_reliable_status(void) @@ -28,62 +28,42 @@ void page_cache_prepare_alloc(gfp_t *gfp) *gfp |= GFP_RELIABLE; } -void add_reliable_mem_size(long sz) -{ - atomic_long_add(sz, &total_reliable_mem); -} - -static unsigned long total_reliable_mem_sz(void) -{ - return atomic_long_read(&total_reliable_mem); -} - -static unsigned long used_reliable_mem_sz(void) +static unsigned long total_reliable_pages(void) { - unsigned long nr_page = 0; + unsigned long total_reliable_pages = 0; struct zone *z; for_each_populated_zone(z) if (zone_idx(z) < ZONE_MOVABLE) - nr_page += zone_page_state(z, NR_FREE_PAGES); + total_reliable_pages += zone_managed_pages(z); - return total_reliable_mem_sz() - nr_page * PAGE_SIZE; + return total_reliable_pages; } -static int reliable_mem_notifier(struct notifier_block *nb, - unsigned long action, void *arg) +static unsigned long free_reliable_pages(void) { - struct memory_notify *m_arg = arg; struct zone *zone; + unsigned long cnt = 0; - switch (action) { - case MEM_ONLINE: - zone = page_zone(pfn_to_page(m_arg->start_pfn)); - if (zone_reliable(zone)) - add_reliable_mem_size(m_arg->nr_pages * PAGE_SIZE); - break; - case MEM_OFFLINE: - zone = page_zone(pfn_to_page(m_arg->start_pfn)); - if (zone_reliable(zone)) - add_reliable_mem_size(-m_arg->nr_pages * PAGE_SIZE); - break; - default: - break; - } + for_each_populated_zone(zone) + if (zone_idx(zone) < ZONE_MOVABLE) + cnt += zone_page_state(zone, NR_FREE_PAGES); - return NOTIFY_OK; + return cnt; } -static struct notifier_block reliable_notifier_block = { - .notifier_call = reliable_mem_notifier, -}; 
+static unsigned long used_reliable_pages(void) +{ + return total_reliable_pages() - free_reliable_pages(); +} -void mem_reliable_init(bool has_unmirrored_mem, unsigned long *zone_movable_pfn) +void mem_reliable_init(bool has_unmirrored_mem, unsigned long *zone_movable_pfn, + unsigned long mirrored_sz) { if (!reliable_enabled) return; - if (atomic_long_read(&total_reliable_mem) == 0) { + if (!mirrored_sz) { memset(zone_movable_pfn, 0, sizeof(unsigned long) * MAX_NUMNODES); pr_err("init failed, mirrored memory size is zero.\n"); @@ -95,15 +75,9 @@ void mem_reliable_init(bool has_unmirrored_mem, unsigned long *zone_movable_pfn) return; } - if (register_hotmemory_notifier(&reliable_notifier_block)) { - pr_err("init failed, register memory notifier failed.\n"); - return; - } - static_branch_enable(&mem_reliable); - pr_info("init succeed, mirrored memory size(%lu)\n", - total_reliable_mem_sz()); + pr_info("init succeed, mirrored memory size(%lu)\n", mirrored_sz); } void shmem_reliable_init(void) @@ -123,8 +97,7 @@ void reliable_report_meminfo(struct seq_file *m) if (!mem_reliable_is_enabled()) return; - show_val_kb(m, "ReliableTotal: ", - total_reliable_mem_sz() >> PAGE_SHIFT); - show_val_kb(m, "ReliableUsed: ", - used_reliable_mem_sz() >> PAGE_SHIFT); + show_val_kb(m, "ReliableTotal: ", total_reliable_pages()); + show_val_kb(m, "ReliableUsed: ", used_reliable_pages()); + show_val_kb(m, "ReliableBuddyMem: ", free_reliable_pages()); } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index efa0d4479e6e..d9a5402b1552 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -7529,10 +7529,11 @@ static void __init find_zone_movable_pfns_for_nodes(void) if (mirrored_kernelcore) { bool mem_below_4gb_not_mirrored = false; bool has_unmirrored_mem = false; + unsigned long mirrored_sz = 0; for_each_mem_region(r) { if (memblock_is_mirror(r)) { - add_reliable_mem_size(r->size); + mirrored_sz += r->size; continue; } @@ -7554,7 +7555,8 @@ static void __init 
find_zone_movable_pfns_for_nodes(void) if (mem_below_4gb_not_mirrored) pr_warn("This configuration results in unmirrored kernel memory.\n"); - mem_reliable_init(has_unmirrored_mem, zone_movable_pfn); + mem_reliable_init(has_unmirrored_mem, zone_movable_pfn, + mirrored_sz); goto out2; } -- Gitee From 7023dd3caa97bf42b72f934ee79af43a0917a7b2 Mon Sep 17 00:00:00 2001 From: Ma Wupeng Date: Fri, 11 Nov 2022 09:32:26 +0800 Subject: [PATCH 07/27] mm: Disable memory reliable when kdump is in progress hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4SK3S CVE: NA -------------------------------- Kdump only has limited memory, which will lead to buggy memory reliable features if memory reliable is enabled. So disable memory reliable if kdump is in progress. Signed-off-by: Ma Wupeng Reviewed-by: Kefeng Wang --- mm/mem_reliable.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/mm/mem_reliable.c b/mm/mem_reliable.c index 876335fc4060..06d11cee52b6 100644 --- a/mm/mem_reliable.c +++ b/mm/mem_reliable.c @@ -7,6 +7,7 @@ #include #include #include +#include #define PAGES_TO_B(n_pages) ((n_pages) << PAGE_SHIFT) @@ -63,6 +64,11 @@ void mem_reliable_init(bool has_unmirrored_mem, unsigned long *zone_movable_pfn, if (!reliable_enabled) return; + if (is_kdump_kernel()) { + pr_info("ignoring memory reliable due to in crashkernel\n"); + return; + } + if (!mirrored_sz) { memset(zone_movable_pfn, 0, sizeof(unsigned long) * MAX_NUMNODES); -- Gitee From a1bdc2e22aa6a9010cf8c02b1ec2d2c4231f9cc3 Mon Sep 17 00:00:00 2001 From: Ma Wupeng Date: Fri, 11 Nov 2022 09:32:27 +0800 Subject: [PATCH 08/27] mm: Clear GFP_RELIABLE if the conditions are not met hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4SK3S CVE: NA -------------------------------- Memory reliable only handles memory allocation from the movable zone. GFP_RELIABLE will be removed if the conditions are not met. 
Signed-off-by: Ma Wupeng Reviewed-by: Kefeng Wang --- mm/page_alloc.c | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index d9a5402b1552..1313c112a30c 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5148,11 +5148,26 @@ EXPORT_SYMBOL_GPL(__alloc_pages_bulk); static inline void prepare_before_alloc(gfp_t *gfp_mask) { + bool zone_movable; + if (!mem_reliable_is_enabled()) - return; + goto clear_flag; + + /* + * memory reliable only handle memory allocation from movable zone + * (force alloc from non-movable zone or force alloc from movable + * zone) to get total isolation. + */ + zone_movable = gfp_zone(*gfp_mask & ~GFP_RELIABLE) == ZONE_MOVABLE; + if (!zone_movable) + goto clear_flag; if ((current->flags & PF_RELIABLE) || is_global_init(current)) *gfp_mask |= GFP_RELIABLE; + + return; +clear_flag: + *gfp_mask &= ~GFP_RELIABLE; } /* -- Gitee From be9e2144b003aa1ac25a4d2c34fcc830e67c4570 Mon Sep 17 00:00:00 2001 From: Peng Wu Date: Fri, 11 Nov 2022 09:32:28 +0800 Subject: [PATCH 09/27] mm: Add kernel param for memory reliable hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4SK3S CVE: NA -------------------------------- Add kernel param reliable_debug in preparation for controlling memory reliable features. Signed-off-by: Peng Wu Reviewed-by: Kefeng Wang --- .../admin-guide/kernel-parameters.txt | 5 ++++ mm/mem_reliable.c | 24 +++++++++++++++++++ 2 files changed, 29 insertions(+) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index fd3241d15568..4da9cb4dd234 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -4785,6 +4785,11 @@ [KNL, SMP] Set scheduler's default relax_domain_level. See Documentation/admin-guide/cgroup-v1/cpusets.rst. 
+ reliable_debug= [ARM64] + Format: [] + Only works with CONFIG_MEMORY_RELIABLE and + "kernelcore=reliable" is configured. + reserve= [KNL,BUGS] Force kernel to ignore I/O ports or memory Format: ,[,,,...] Reserve I/O ports or memory so the kernel won't use diff --git a/mm/mem_reliable.c b/mm/mem_reliable.c index 06d11cee52b6..e1b6a1002933 100644 --- a/mm/mem_reliable.c +++ b/mm/mem_reliable.c @@ -107,3 +107,27 @@ void reliable_report_meminfo(struct seq_file *m) show_val_kb(m, "ReliableUsed: ", used_reliable_pages()); show_val_kb(m, "ReliableBuddyMem: ", free_reliable_pages()); } + +static int __init setup_reliable_debug(char *str) +{ + if (*str++ != '=' || !*str) + /* + * No options specified. + */ + goto out; + + /* + * Determine which debug features should be switched on + */ + for (; *str && *str != ','; str++) { + switch (*str) { + default: + pr_err("reliable_debug option '%c' unknown. skipped\n", + *str); + } + } + +out: + return 1; +} +__setup("reliable_debug", setup_reliable_debug); -- Gitee From 33c4a18f939b279275b2d137e884553f78ff99db Mon Sep 17 00:00:00 2001 From: Chen Wandun Date: Fri, 11 Nov 2022 09:32:29 +0800 Subject: [PATCH 10/27] mm: Add cmdline for the reliable memory usage of page cache hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4SK3S CVE: NA -------------------------------- Add cmdline for the reliable memory usage of page cache. Page cache will not use reliable memory when passing option "P" to reliable_debug in cmdline. 
Signed-off-by: Chen Wandun Reviewed-by: Kefeng Wang --- .../admin-guide/kernel-parameters.txt | 3 ++- include/linux/mem_reliable.h | 8 ++++++++ mm/mem_reliable.c | 18 ++++++++++++++++-- 3 files changed, 26 insertions(+), 3 deletions(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 4da9cb4dd234..15c7bceb268c 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -4786,9 +4786,10 @@ See Documentation/admin-guide/cgroup-v1/cpusets.rst. reliable_debug= [ARM64] - Format: [] + Format: [P] Only works with CONFIG_MEMORY_RELIABLE and "kernelcore=reliable" is configured. + P: Page cache does not use the reliable memory. reserve= [KNL,BUGS] Force kernel to ignore I/O ports or memory Format: ,[,,,...] diff --git a/include/linux/mem_reliable.h b/include/linux/mem_reliable.h index 7b22229068f1..857881682ea3 100644 --- a/include/linux/mem_reliable.h +++ b/include/linux/mem_reliable.h @@ -14,6 +14,7 @@ DECLARE_STATIC_KEY_FALSE(mem_reliable); extern bool reliable_enabled; extern bool shmem_reliable; +extern bool pagecache_use_reliable_mem; extern void mem_reliable_init(bool has_unmirrored_mem, unsigned long *zone_movable_pfn, @@ -28,6 +29,11 @@ static inline bool mem_reliable_is_enabled(void) return static_branch_likely(&mem_reliable); } +static inline bool pagecache_reliable_is_enabled(void) +{ + return pagecache_use_reliable_mem; +} + static inline bool skip_none_movable_zone(gfp_t gfp, struct zoneref *z) { if (!mem_reliable_is_enabled()) @@ -52,8 +58,10 @@ static inline bool shmem_reliable_is_enabled(void) } #else #define reliable_enabled 0 +#define pagecache_use_reliable_mem 0 static inline bool mem_reliable_is_enabled(void) { return false; } +static inline bool pagecache_reliable_is_enabled(void) { return false; } static inline void mem_reliable_init(bool has_unmirrored_mem, unsigned long *zone_movable_pfn, unsigned long mirrored_sz) {} diff 
--git a/mm/mem_reliable.c b/mm/mem_reliable.c index e1b6a1002933..5d75ab6482db 100644 --- a/mm/mem_reliable.c +++ b/mm/mem_reliable.c @@ -16,6 +16,7 @@ EXPORT_SYMBOL_GPL(mem_reliable); bool reliable_enabled; bool shmem_reliable __read_mostly = true; +bool pagecache_use_reliable_mem __read_mostly = true; bool mem_reliable_status(void) { @@ -25,8 +26,17 @@ EXPORT_SYMBOL_GPL(mem_reliable_status); void page_cache_prepare_alloc(gfp_t *gfp) { - if (mem_reliable_is_enabled()) - *gfp |= GFP_RELIABLE; + if (!mem_reliable_is_enabled()) + return; + + if (!pagecache_reliable_is_enabled()) + goto no_reliable; + + *gfp |= GFP_RELIABLE; + return; + +no_reliable: + *gfp &= ~GFP_RELIABLE; } static unsigned long total_reliable_pages(void) @@ -121,6 +131,10 @@ static int __init setup_reliable_debug(char *str) */ for (; *str && *str != ','; str++) { switch (*str) { + case 'P': + pagecache_use_reliable_mem = false; + pr_info("disable page cache use reliable memory\n"); + break; default: pr_err("reliable_debug option '%c' unknown. skipped\n", *str); -- Gitee From ccad5e7a0a77cd30b08dc5d232674d5a68966889 Mon Sep 17 00:00:00 2001 From: Chen Wandun Date: Fri, 11 Nov 2022 09:32:30 +0800 Subject: [PATCH 11/27] proc/meminfo: Add "FileCache" item in /proc/meminfo hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4SK3S CVE: NA -------------------------------- Item "FileCache" in /proc/meminfo show the number of page cache in LRU(active + inactive). 
Signed-off-by: Chen Wandun Reviewed-by: Kefeng Wang --- mm/mem_reliable.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/mm/mem_reliable.c b/mm/mem_reliable.c index 5d75ab6482db..6c0b931b9071 100644 --- a/mm/mem_reliable.c +++ b/mm/mem_reliable.c @@ -116,6 +116,14 @@ void reliable_report_meminfo(struct seq_file *m) show_val_kb(m, "ReliableTotal: ", total_reliable_pages()); show_val_kb(m, "ReliableUsed: ", used_reliable_pages()); show_val_kb(m, "ReliableBuddyMem: ", free_reliable_pages()); + + if (pagecache_reliable_is_enabled()) { + unsigned long num = 0; + + num += global_node_page_state(NR_LRU_BASE + LRU_ACTIVE_FILE); + num += global_node_page_state(NR_LRU_BASE + LRU_INACTIVE_FILE); + show_val_kb(m, "FileCache: ", num); + } } static int __init setup_reliable_debug(char *str) -- Gitee From 6e6cf0d7d674254ed369515aafce94c238b9897d Mon Sep 17 00:00:00 2001 From: Chen Wandun Date: Fri, 11 Nov 2022 09:32:31 +0800 Subject: [PATCH 12/27] mm: add "ReliableFileCache" item in /proc/meminfo hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4SK3S CVE: NA -------------------------------- Add statistics for usage of reliable page cache, Item "ReliableFileCache" in /proc/meminfo show the usage of reliable page cache. 
Signed-off-by: Chen Wandun Reviewed-by: Kefeng Wang --- include/linux/mem_reliable.h | 24 +++++++++++++++ include/linux/mm.h | 6 ++-- include/linux/mm_inline.h | 4 +++ include/linux/mmzone.h | 5 ++++ mm/mem_reliable.c | 57 ++++++++++++++++++++++++++++++++++++ mm/vmscan.c | 2 ++ 6 files changed, 95 insertions(+), 3 deletions(-) diff --git a/include/linux/mem_reliable.h b/include/linux/mem_reliable.h index 857881682ea3..59108e955f48 100644 --- a/include/linux/mem_reliable.h +++ b/include/linux/mem_reliable.h @@ -15,6 +15,8 @@ DECLARE_STATIC_KEY_FALSE(mem_reliable); extern bool reliable_enabled; extern bool shmem_reliable; extern bool pagecache_use_reliable_mem; +extern struct percpu_counter pagecache_reliable_pages; +extern struct percpu_counter anon_reliable_pages; extern void mem_reliable_init(bool has_unmirrored_mem, unsigned long *zone_movable_pfn, @@ -23,6 +25,11 @@ extern void shmem_reliable_init(void); extern void reliable_report_meminfo(struct seq_file *m); extern void page_cache_prepare_alloc(gfp_t *gfp); extern bool mem_reliable_status(void); +extern void reliable_lru_add(enum lru_list lru, struct page *page, + int val); +extern void reliable_lru_add_batch(int zid, enum lru_list lru, + int val); +extern bool mem_reliable_counter_initialized(void); static inline bool mem_reliable_is_enabled(void) { @@ -56,6 +63,17 @@ static inline bool shmem_reliable_is_enabled(void) { return shmem_reliable; } + +static inline bool page_reliable(struct page *page) +{ + if (!mem_reliable_is_enabled()) + return false; + + if (!page) + return false; + + return page_zonenum(page) < ZONE_MOVABLE; +} #else #define reliable_enabled 0 #define pagecache_use_reliable_mem 0 @@ -74,6 +92,12 @@ static inline void reliable_report_meminfo(struct seq_file *m) {} static inline bool shmem_reliable_is_enabled(void) { return false; } static inline void page_cache_prepare_alloc(gfp_t *gfp) {} static inline bool mem_reliable_status(void) { return false; } +static inline bool page_reliable(struct 
page *page) { return false; } +static inline void reliable_lru_add(enum lru_list lru, struct page *page, + int val) {} +static inline void reliable_lru_add_batch(int zid, enum lru_list lru, + int val) {} +static inline bool mem_reliable_counter_initialized(void) { return false; } #endif #endif diff --git a/include/linux/mm.h b/include/linux/mm.h index 839106c9f708..ed66c81cf747 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -34,9 +34,6 @@ #include #include -/* added to mm.h to avoid every caller adding new header file */ -#include - struct mempolicy; struct anon_vma; struct anon_vma_chain; @@ -3313,5 +3310,8 @@ static inline int seal_check_future_write(int seals, struct vm_area_struct *vma) return 0; } +/* added to mm.h to avoid every caller adding new header file */ +#include + #endif /* __KERNEL__ */ #endif /* _LINUX_MM_H */ diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h index 8fc71e9d7bb0..36f2e8f7db9d 100644 --- a/include/linux/mm_inline.h +++ b/include/linux/mm_inline.h @@ -4,6 +4,7 @@ #include #include +#include /** * page_is_file_lru - should the page be on a file LRU or anon LRU? 
@@ -50,6 +51,7 @@ static __always_inline void add_page_to_lru_list(struct page *page, { update_lru_size(lruvec, lru, page_zonenum(page), thp_nr_pages(page)); list_add(&page->lru, &lruvec->lists[lru]); + reliable_lru_add(lru, page, thp_nr_pages(page)); } static __always_inline void add_page_to_lru_list_tail(struct page *page, @@ -57,6 +59,7 @@ static __always_inline void add_page_to_lru_list_tail(struct page *page, { update_lru_size(lruvec, lru, page_zonenum(page), thp_nr_pages(page)); list_add_tail(&page->lru, &lruvec->lists[lru]); + reliable_lru_add(lru, page, thp_nr_pages(page)); } static __always_inline void del_page_from_lru_list(struct page *page, @@ -64,6 +67,7 @@ static __always_inline void del_page_from_lru_list(struct page *page, { list_del(&page->lru); update_lru_size(lruvec, lru, page_zonenum(page), -thp_nr_pages(page)); + reliable_lru_add(lru, page, -thp_nr_pages(page)); } /** diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 8719d891848f..7f25539d2fe4 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -261,6 +261,11 @@ static inline bool is_file_lru(enum lru_list lru) return (lru == LRU_INACTIVE_FILE || lru == LRU_ACTIVE_FILE); } +static inline int is_anon_lru(enum lru_list lru) +{ + return (lru == LRU_INACTIVE_ANON || lru == LRU_ACTIVE_ANON); +} + static inline bool is_active_lru(enum lru_list lru) { return (lru == LRU_ACTIVE_ANON || lru == LRU_ACTIVE_FILE); diff --git a/mm/mem_reliable.c b/mm/mem_reliable.c index 6c0b931b9071..4d49da71809b 100644 --- a/mm/mem_reliable.c +++ b/mm/mem_reliable.c @@ -17,6 +17,14 @@ EXPORT_SYMBOL_GPL(mem_reliable); bool reliable_enabled; bool shmem_reliable __read_mostly = true; bool pagecache_use_reliable_mem __read_mostly = true; +struct percpu_counter pagecache_reliable_pages; +struct percpu_counter anon_reliable_pages; + +bool mem_reliable_counter_initialized(void) +{ + return likely(percpu_counter_initialized(&pagecache_reliable_pages)) && + 
likely((percpu_counter_initialized(&anon_reliable_pages))); +} bool mem_reliable_status(void) { @@ -24,6 +32,37 @@ bool mem_reliable_status(void) } EXPORT_SYMBOL_GPL(mem_reliable_status); +void reliable_lru_add_batch(int zid, enum lru_list lru, + int val) +{ + if (!mem_reliable_is_enabled()) + return; + + if (zid < ZONE_MOVABLE) { + if (is_file_lru(lru)) + percpu_counter_add(&pagecache_reliable_pages, val); + else if (is_anon_lru(lru)) + percpu_counter_add(&anon_reliable_pages, val); + } +} + +void reliable_lru_add(enum lru_list lru, struct page *page, int val) +{ + if (!page_reliable(page)) + return; + + if (is_file_lru(lru)) + percpu_counter_add(&pagecache_reliable_pages, val); + else if (is_anon_lru(lru)) + percpu_counter_add(&anon_reliable_pages, val); + else if (lru == LRU_UNEVICTABLE) { + if (PageAnon(page)) + percpu_counter_add(&anon_reliable_pages, val); + else + percpu_counter_add(&pagecache_reliable_pages, val); + } +} + void page_cache_prepare_alloc(gfp_t *gfp) { if (!mem_reliable_is_enabled()) @@ -118,14 +157,32 @@ void reliable_report_meminfo(struct seq_file *m) show_val_kb(m, "ReliableBuddyMem: ", free_reliable_pages()); if (pagecache_reliable_is_enabled()) { + s64 nr_pagecache_pages = 0; unsigned long num = 0; num += global_node_page_state(NR_LRU_BASE + LRU_ACTIVE_FILE); num += global_node_page_state(NR_LRU_BASE + LRU_INACTIVE_FILE); show_val_kb(m, "FileCache: ", num); + + nr_pagecache_pages = + percpu_counter_sum_positive(&pagecache_reliable_pages); + seq_printf(m, "ReliableFileCache: %8llu kB\n", + nr_pagecache_pages << (PAGE_SHIFT - 10)); } } +static int __init reliable_sysctl_init(void) +{ + if (!mem_reliable_is_enabled()) + return 0; + + percpu_counter_init(&pagecache_reliable_pages, 0, GFP_KERNEL); + percpu_counter_init(&anon_reliable_pages, 0, GFP_KERNEL); + + return 0; +} +arch_initcall(reliable_sysctl_init); + static int __init setup_reliable_debug(char *str) { if (*str++ != '=' || !*str) diff --git a/mm/vmscan.c b/mm/vmscan.c index 
9e76887a84d3..a98566925f9d 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1813,6 +1813,7 @@ static __always_inline void update_lru_sizes(struct lruvec *lruvec, continue; update_lru_size(lruvec, lru, zid, -nr_zone_taken[zid]); + reliable_lru_add_batch(zid, lru, -nr_zone_taken[zid]); } } @@ -2082,6 +2083,7 @@ static unsigned noinline_for_stack move_pages_to_lru(struct lruvec *lruvec, update_lru_size(lruvec, lru, page_zonenum(page), nr_pages); list_add(&page->lru, &lruvec->lists[lru]); + reliable_lru_add(lru, page, nr_pages); nr_moved += nr_pages; if (PageActive(page)) workingset_age_nonresident(lruvec, nr_pages); -- Gitee From 4021a0d53e3e0debd2399e4dbb1b1ae1620c0a3d Mon Sep 17 00:00:00 2001 From: Chen Wandun Date: Fri, 11 Nov 2022 09:32:32 +0800 Subject: [PATCH 13/27] mm: Add support for limiting the usage of reliable memory in pagecache hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4SK3S CVE: NA -------------------------------- Add the interface /proc/sys/vm/reliable_pagecache_max_bytes to set the maximum size of reliable page cache; this maximum cannot exceed the total amount of reliable RAM. The whole memory reliable feature depends on kernelcore=mirror, which in turn depends on NUMA, so remove the redundant code for UMA.
Signed-off-by: Chen Wandun Reviewed-by: Kefeng Wang --- mm/mem_reliable.c | 52 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 52 insertions(+) diff --git a/mm/mem_reliable.c b/mm/mem_reliable.c index 4d49da71809b..8bb53713450c 100644 --- a/mm/mem_reliable.c +++ b/mm/mem_reliable.c @@ -19,6 +19,7 @@ bool shmem_reliable __read_mostly = true; bool pagecache_use_reliable_mem __read_mostly = true; struct percpu_counter pagecache_reliable_pages; struct percpu_counter anon_reliable_pages; +static unsigned long reliable_pagecache_max_bytes = ULONG_MAX; bool mem_reliable_counter_initialized(void) { @@ -65,12 +66,18 @@ void reliable_lru_add(enum lru_list lru, struct page *page, int val) void page_cache_prepare_alloc(gfp_t *gfp) { + s64 nr_reliable = 0; + if (!mem_reliable_is_enabled()) return; if (!pagecache_reliable_is_enabled()) goto no_reliable; + nr_reliable = percpu_counter_read_positive(&pagecache_reliable_pages); + if (nr_reliable > reliable_pagecache_max_bytes >> PAGE_SHIFT) + goto no_reliable; + *gfp |= GFP_RELIABLE; return; @@ -171,11 +178,56 @@ void reliable_report_meminfo(struct seq_file *m) } } +static int reliable_pagecache_max_bytes_write(struct ctl_table *table, + int write, void __user *buffer, + size_t *length, loff_t *ppos) +{ + unsigned long old_value = reliable_pagecache_max_bytes; + int ret; + + ret = proc_doulongvec_minmax(table, write, buffer, length, ppos); + if (!ret && write) { + if (reliable_pagecache_max_bytes > + PAGES_TO_B(total_reliable_pages())) { + reliable_pagecache_max_bytes = old_value; + return -EINVAL; + } + } + + return ret; +} + +static struct ctl_table reliable_ctl_table[] = { + { + .procname = "reliable_pagecache_max_bytes", + .data = &reliable_pagecache_max_bytes, + .maxlen = sizeof(reliable_pagecache_max_bytes), + .mode = 0644, + .proc_handler = reliable_pagecache_max_bytes_write, + }, + {} +}; + +static struct ctl_table reliable_dir_table[] = { + { + .procname = "vm", + .maxlen = 0, + .mode = 0555, + .child = 
reliable_ctl_table, + }, + {} +}; + static int __init reliable_sysctl_init(void) { if (!mem_reliable_is_enabled()) return 0; + if (!register_sysctl_table(reliable_dir_table)) { + pr_err("register sysctl failed."); + return -ENOMEM; + } + percpu_counter_init(&pagecache_reliable_pages, 0, GFP_KERNEL); percpu_counter_init(&anon_reliable_pages, 0, GFP_KERNEL); -- Gitee From 6263994761a3d3ad6e5fa3beff9ca8cbd38d3bf3 Mon Sep 17 00:00:00 2001 From: Ma Wupeng Date: Fri, 11 Nov 2022 09:32:33 +0800 Subject: [PATCH 14/27] mm: thp: Add memory reliable support for hugepaged collapse hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4SK3S CVE: NA -------------------------------- Pages collapsed into a huge page by khugepaged will end up in the same memory region. When khugepaged collapses pages into a huge page, it checks whether there are any reliable pages in the area to be collapsed. If this area contains any reliable pages, khugepaged will allocate memory from the mirrored region. Otherwise it will allocate memory from the non-mirrored region. Signed-off-by: Ma Wupeng Reviewed-by: Kefeng Wang --- mm/khugepaged.c | 24 ++++++++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 44c048d7b783..254211e56153 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -1057,7 +1057,8 @@ static bool __collapse_huge_page_swapin(struct mm_struct *mm, static void collapse_huge_page(struct mm_struct *mm, unsigned long address, struct page **hpage, - int node, int referenced, int unmapped) + int node, int referenced, int unmapped, + bool reliable) { LIST_HEAD(compound_pagelist); pmd_t *pmd, _pmd; @@ -1075,6 +1076,9 @@ static void collapse_huge_page(struct mm_struct *mm, /* Only allocate from the target node */ gfp = alloc_hugepage_khugepaged_gfpmask() | __GFP_THISNODE; + if (reliable) + gfp |= GFP_RELIABLE; + /* * Before allocating the hugepage, release the mmap_lock read lock.
* The allocation can take potentially a long time if it involves @@ -1234,6 +1238,7 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, spinlock_t *ptl; int node = NUMA_NO_NODE, unmapped = 0; bool writable = false; + bool reliable = false; VM_BUG_ON(address & ~HPAGE_PMD_MASK); @@ -1358,6 +1363,9 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, page_is_young(page) || PageReferenced(page) || mmu_notifier_test_young(vma->vm_mm, address)) referenced++; + + if (page_reliable(page)) + reliable = true; } if (!writable) { result = SCAN_PAGE_RO; @@ -1373,7 +1381,7 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, node = khugepaged_find_target_node(); /* collapse_huge_page will return with the mmap_lock released */ collapse_huge_page(mm, address, hpage, node, - referenced, unmapped); + referenced, unmapped, reliable); } out: trace_mm_khugepaged_scan_pmd(mm, page, writable, referenced, @@ -1633,7 +1641,8 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff) */ static void collapse_file(struct mm_struct *mm, struct file *file, pgoff_t start, - struct page **hpage, int node) + struct page **hpage, int node, + bool reliable) { struct address_space *mapping = file->f_mapping; gfp_t gfp; @@ -1650,6 +1659,9 @@ static void collapse_file(struct mm_struct *mm, /* Only allocate from the target node */ gfp = alloc_hugepage_khugepaged_gfpmask() | __GFP_THISNODE; + if (reliable) + gfp |= GFP_RELIABLE; + new_page = khugepaged_alloc_page(hpage, gfp, node); if (!new_page) { result = SCAN_ALLOC_HUGE_PAGE_FAIL; @@ -1977,6 +1989,7 @@ static void khugepaged_scan_file(struct mm_struct *mm, int present, swap; int node = NUMA_NO_NODE; int result = SCAN_SUCCEED; + bool reliable = false; present = 0; swap = 0; @@ -2029,6 +2042,9 @@ static void khugepaged_scan_file(struct mm_struct *mm, xas_pause(&xas); cond_resched_rcu(); } + + if (page_reliable(page)) + reliable = true; } rcu_read_unlock(); @@ -2037,7 +2053,7 @@ static void khugepaged_scan_file(struct 
mm_struct *mm, result = SCAN_EXCEED_NONE_PTE; } else { node = khugepaged_find_target_node(); - collapse_file(mm, file, start, hpage, node); + collapse_file(mm, file, start, hpage, node, reliable); } } -- Gitee From 8968270eb5fbedb9c62e265e49228ef59a4f2321 Mon Sep 17 00:00:00 2001 From: Ma Wupeng Date: Fri, 11 Nov 2022 09:32:34 +0800 Subject: [PATCH 15/27] mm: Add reliable memory use limit for user tasks hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4SK3S CVE: NA -------------------------------- There is an upper limit for all memory allocations if the following conditions are met: - gfp_zone(gfp & ~GFP_RELIABLE) == ZONE_MOVABLE - gfp & GFP_RELIABLE is true Init tasks will allocate memory from the non-mirrored region if their allocations hit the limit. The limit can be set or accessed via /proc/sys/vm/task_reliable_limit This limit's default value is ULONG_MAX. Users can update this value to anywhere between the amount of reliable memory currently used by user tasks and the total reliable memory size.
Signed-off-by: Ma Wupeng Reviewed-by: Kefeng Wang --- include/linux/mem_reliable.h | 40 +++++++++++++++++ mm/mem_reliable.c | 46 +++++++++++++++++++ mm/page_alloc.c | 87 ++++++++++++++++++++++++++++++++++++ 3 files changed, 173 insertions(+) diff --git a/include/linux/mem_reliable.h b/include/linux/mem_reliable.h index 59108e955f48..9b94154b383e 100644 --- a/include/linux/mem_reliable.h +++ b/include/linux/mem_reliable.h @@ -5,8 +5,10 @@ #include #include #include +#include #include #include +#include #ifdef CONFIG_MEMORY_RELIABLE @@ -17,6 +19,7 @@ extern bool shmem_reliable; extern bool pagecache_use_reliable_mem; extern struct percpu_counter pagecache_reliable_pages; extern struct percpu_counter anon_reliable_pages; +extern unsigned long task_reliable_limit __read_mostly; extern void mem_reliable_init(bool has_unmirrored_mem, unsigned long *zone_movable_pfn, @@ -30,6 +33,8 @@ extern void reliable_lru_add(enum lru_list lru, struct page *page, extern void reliable_lru_add_batch(int zid, enum lru_list lru, int val); extern bool mem_reliable_counter_initialized(void); +extern void mem_reliable_out_of_memory(gfp_t gfp_mask, unsigned int order, + int preferred_nid, nodemask_t *nodemask); static inline bool mem_reliable_is_enabled(void) { @@ -74,6 +79,31 @@ static inline bool page_reliable(struct page *page) return page_zonenum(page) < ZONE_MOVABLE; } + +static inline u64 task_reliable_used_pages(void) +{ + s64 nr_pages; + + nr_pages = percpu_counter_read_positive(&pagecache_reliable_pages); + nr_pages += percpu_counter_read_positive(&anon_reliable_pages); + + return nr_pages; +} + +static inline bool reliable_mem_limit_check(unsigned long nr_page) +{ + return (task_reliable_used_pages() + nr_page) <= + (task_reliable_limit >> PAGE_SHIFT); +} + +static inline bool mem_reliable_should_reclaim(void) +{ + if (percpu_counter_sum_positive(&pagecache_reliable_pages) >= + MAX_ORDER_NR_PAGES) + return true; + + return false; +} #else #define reliable_enabled 0 #define 
pagecache_use_reliable_mem 0 @@ -98,6 +128,16 @@ static inline void reliable_lru_add(enum lru_list lru, struct page *page, static inline void reliable_lru_add_batch(int zid, enum lru_list lru, int val) {} static inline bool mem_reliable_counter_initialized(void) { return false; } +static inline u64 task_reliable_used_pages(void) { return 0; } +static inline bool reliable_mem_limit_check(unsigned long nr_page) +{ + return false; +} +static inline bool mem_reliable_should_reclaim(void) { return false; } +static inline void mem_reliable_out_of_memory(gfp_t gfp_mask, + unsigned int order, + int preferred_nid, + nodemask_t *nodemask) {} #endif #endif diff --git a/mm/mem_reliable.c b/mm/mem_reliable.c index 8bb53713450c..636e51a261ee 100644 --- a/mm/mem_reliable.c +++ b/mm/mem_reliable.c @@ -20,6 +20,8 @@ bool pagecache_use_reliable_mem __read_mostly = true; struct percpu_counter pagecache_reliable_pages; struct percpu_counter anon_reliable_pages; static unsigned long reliable_pagecache_max_bytes = ULONG_MAX; +/* reliable user limit for user tasks with reliable flag */ +unsigned long task_reliable_limit = ULONG_MAX; bool mem_reliable_counter_initialized(void) { @@ -178,6 +180,26 @@ void reliable_report_meminfo(struct seq_file *m) } } +static int reliable_limit_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *length, + loff_t *ppos) +{ + unsigned long old = task_reliable_limit; + int ret; + + ret = proc_doulongvec_minmax(table, write, buffer, length, ppos); + if (ret == 0 && write) { + if (task_reliable_limit > PAGES_TO_B(total_reliable_pages()) || + task_reliable_limit < + (task_reliable_used_pages() << PAGE_SHIFT)) { + task_reliable_limit = old; + return -EINVAL; + } + } + + return ret; +} + static int reliable_pagecache_max_bytes_write(struct ctl_table *table, int write, void __user *buffer, size_t *length, loff_t *ppos) @@ -205,6 +227,13 @@ static struct ctl_table reliable_ctl_table[] = { .mode = 0644, .proc_handler = 
reliable_pagecache_max_bytes_write, }, + { + .procname = "task_reliable_limit", + .data = &task_reliable_limit, + .maxlen = sizeof(task_reliable_limit), + .mode = 0644, + .proc_handler = reliable_limit_handler, + }, {} }; @@ -235,6 +264,23 @@ static int __init reliable_sysctl_init(void) } arch_initcall(reliable_sysctl_init); +void mem_reliable_out_of_memory(gfp_t gfp, unsigned int order, + int preferred_nid, nodemask_t *nodemask) +{ + struct oom_control oc = { + .zonelist = node_zonelist(preferred_nid, gfp), + .nodemask = nodemask, + .memcg = NULL, + .gfp_mask = gfp, + .order = order, + }; + + if (!mutex_trylock(&oom_lock)) + return; + out_of_memory(&oc); + mutex_unlock(&oom_lock); +} + static int __init setup_reliable_debug(char *str) { if (*str++ != '=' || !*str) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 1313c112a30c..45ade2b73e1a 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5170,6 +5170,89 @@ static inline void prepare_before_alloc(gfp_t *gfp_mask) *gfp_mask &= ~GFP_RELIABLE; } +static inline long mem_reliable_direct_reclaim(int nr_pages, struct alloc_context *ac) +{ + long nr_reclaimed = 0; + + while (nr_reclaimed < nr_pages) { + /* try to free cache from reliable region */ + long progress = __perform_reclaim(GFP_KERNEL, 0, ac); + + nr_reclaimed += progress; + if (progress < SWAP_CLUSTER_MAX) + break; + } + + return nr_reclaimed; +} + +/* + * return true means memory allocation need retry and flag ___GFP_RELIABILITY + * must be cleared. 
+ */ +static inline bool check_after_alloc(gfp_t *gfp, unsigned int order, + int preferred_nid, + struct alloc_context *ac, + struct page **_page) +{ + int retry_times = MAX_RECLAIM_RETRIES; + int nr_pages; + + if (!mem_reliable_is_enabled()) + return false; + + if (!(*gfp & GFP_RELIABLE)) + return false; + + if (!*_page) + goto out_retry; + + if (*gfp & __GFP_NOFAIL || current->flags & PF_MEMALLOC) + goto out; + + /* percpu counter is not initialized, ignore limit check */ + if (!mem_reliable_counter_initialized()) + goto out; + +limit_check: + /* user task is limited by task_reliable_limit */ + if (!reliable_mem_limit_check(1 << order)) + goto out_free_page; + + goto out; + +out_free_page: + if (mem_reliable_should_reclaim() && retry_times--) { + nr_pages = mem_reliable_direct_reclaim(1 << order, ac); + if (nr_pages) + goto limit_check; + } + + __free_pages(*_page, order); + *_page = NULL; + +out_retry: + if (is_global_init(current)) { + *gfp &= ~GFP_RELIABLE; + return true; + } + + if (*gfp & (__GFP_NORETRY | __GFP_RETRY_MAYFAIL | __GFP_THISNODE)) + goto out; + + /* Coredumps can quickly deplete all memory reserves */ + if (current->flags & PF_DUMPCORE) + goto out; + /* The OOM killer will not help higher order allocs */ + if (order > PAGE_ALLOC_COSTLY_ORDER) + goto out; + + /* oom here */ + mem_reliable_out_of_memory(*gfp, order, preferred_nid, ac->nodemask); +out: + return false; +} + /* * This is the 'heart' of the zoned buddy allocator. 
*/ @@ -5194,6 +5277,7 @@ struct page *__alloc_pages(gfp_t gfp, unsigned int order, int preferred_nid, prepare_before_alloc(&gfp); +retry: alloc_gfp = gfp; if (!prepare_alloc_pages(gfp, order, preferred_nid, nodemask, &ac, &alloc_gfp, &alloc_flags)) @@ -5239,6 +5323,9 @@ struct page *__alloc_pages(gfp_t gfp, unsigned int order, int preferred_nid, page = NULL; } + if (check_after_alloc(&gfp, order, preferred_nid, &ac, &page)) + goto retry; + trace_mm_page_alloc(page, order, alloc_gfp, ac.migratetype); return page; -- Gitee From 5f0b48de238408677abf4f88d7f804314d6ebe7d Mon Sep 17 00:00:00 2001 From: Ma Wupeng Date: Fri, 11 Nov 2022 09:32:35 +0800 Subject: [PATCH 16/27] mm: Introduce fallback mechanism for memory reliable hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4SK3S CVE: NA -------------------------------- Introduce a fallback mechanism for memory reliable. Memory allocation will fall back to the non-mirrored region if the zone's low watermark is reached, and kswapd will be awakened at this time. This mechanism is enabled by default and can be disabled by adding "reliable_debug=F" to the kernel parameters. This mechanism relies on CONFIG_MEMORY_RELIABLE and needs "kernelcore=reliable" in the kernel parameters. Signed-off-by: Ma Wupeng Reviewed-by: Kefeng Wang --- .../admin-guide/kernel-parameters.txt | 4 ++- include/linux/mem_reliable.h | 7 +++++ mm/mem_reliable.c | 5 ++++ mm/page_alloc.c | 26 ++++++++++++++++++- 4 files changed, 40 insertions(+), 2 deletions(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 15c7bceb268c..fe9f3fc856ea 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -4786,9 +4786,11 @@ See Documentation/admin-guide/cgroup-v1/cpusets.rst. reliable_debug= [ARM64] - Format: [P] + Format: [F][,P] Only works with CONFIG_MEMORY_RELIABLE and "kernelcore=reliable" is configured.
+ F: User memory allocation(special user task, tmpfs) will + not allocate memory from non-mirrored region if failed. P: Page cache does not use the reliable memory. reserve= [KNL,BUGS] Force kernel to ignore I/O ports or memory diff --git a/include/linux/mem_reliable.h b/include/linux/mem_reliable.h index 9b94154b383e..1cc4a9460bcf 100644 --- a/include/linux/mem_reliable.h +++ b/include/linux/mem_reliable.h @@ -16,6 +16,7 @@ DECLARE_STATIC_KEY_FALSE(mem_reliable); extern bool reliable_enabled; extern bool shmem_reliable; +extern bool reliable_allow_fallback; extern bool pagecache_use_reliable_mem; extern struct percpu_counter pagecache_reliable_pages; extern struct percpu_counter anon_reliable_pages; @@ -104,6 +105,11 @@ static inline bool mem_reliable_should_reclaim(void) return false; } + +static inline bool reliable_allow_fb_enabled(void) +{ + return reliable_allow_fallback; +} #else #define reliable_enabled 0 #define pagecache_use_reliable_mem 0 @@ -138,6 +144,7 @@ static inline void mem_reliable_out_of_memory(gfp_t gfp_mask, unsigned int order, int preferred_nid, nodemask_t *nodemask) {} +static inline bool reliable_allow_fb_enabled(void) { return false; } #endif #endif diff --git a/mm/mem_reliable.c b/mm/mem_reliable.c index 636e51a261ee..5fdda01c708f 100644 --- a/mm/mem_reliable.c +++ b/mm/mem_reliable.c @@ -16,6 +16,7 @@ EXPORT_SYMBOL_GPL(mem_reliable); bool reliable_enabled; bool shmem_reliable __read_mostly = true; +bool reliable_allow_fallback __read_mostly = true; bool pagecache_use_reliable_mem __read_mostly = true; struct percpu_counter pagecache_reliable_pages; struct percpu_counter anon_reliable_pages; @@ -294,6 +295,10 @@ static int __init setup_reliable_debug(char *str) */ for (; *str && *str != ','; str++) { switch (*str) { + case 'F': + reliable_allow_fallback = false; + pr_info("disable memory reliable fallback\n"); + break; case 'P': pagecache_use_reliable_mem = false; pr_info("disable page cache use reliable memory\n"); diff --git 
a/mm/page_alloc.c b/mm/page_alloc.c index 45ade2b73e1a..a8abfe6458a5 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4668,6 +4668,28 @@ check_retry_cpuset(int cpuset_mems_cookie, struct alloc_context *ac) return false; } +#ifdef CONFIG_MEMORY_RELIABLE +static inline void mem_reliable_fallback_slowpath(gfp_t gfp_mask, + struct alloc_context *ac) +{ + if (!reliable_allow_fb_enabled()) + return; + + if (gfp_mask & __GFP_NOFAIL) + return; + + if ((ac->highest_zoneidx == ZONE_NORMAL) && (gfp_mask & GFP_RELIABLE)) { + ac->highest_zoneidx = gfp_zone(gfp_mask & ~GFP_RELIABLE); + ac->preferred_zoneref = first_zones_zonelist( + ac->zonelist, ac->highest_zoneidx, ac->nodemask); + return; + } +} +#else +static inline void mem_reliable_fallback_slowpath(gfp_t gfp_mask, + struct alloc_context *ac) {} +#endif + static inline struct page * __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, struct alloc_context *ac) @@ -4719,6 +4741,8 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, if (alloc_flags & ALLOC_KSWAPD) wake_all_kswapds(order, gfp_mask, ac); + mem_reliable_fallback_slowpath(gfp_mask, ac); + /* * The adjusted alloc_flags might result in immediate success, so try * that first @@ -5232,7 +5256,7 @@ static inline bool check_after_alloc(gfp_t *gfp, unsigned int order, *_page = NULL; out_retry: - if (is_global_init(current)) { + if (reliable_allow_fb_enabled() || is_global_init(current)) { *gfp &= ~GFP_RELIABLE; return true; } -- Gitee From 32be46d7e9d4a83468b5de1d40b3e6540fc628cd Mon Sep 17 00:00:00 2001 From: Zhou Guanghui Date: Fri, 11 Nov 2022 09:32:36 +0800 Subject: [PATCH 17/27] shmem: Count and show reliable shmem info hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4SK3S CVE: NA -------------------------------- Count reliable shmem usage based on NR_SHMEM. Add ReliableShmem in /proc/meminfo to show reliable memory info used by shmem. 
- ReliableShmem: reliable memory used by shmem Signed-off-by: Zhou Guanghui Reviewed-by: Kefeng Wang --- Documentation/filesystems/proc.rst | 3 +++ include/linux/mem_reliable.h | 9 +++++++++ mm/filemap.c | 9 +++++++-- mm/khugepaged.c | 6 +++++- mm/mem_reliable.c | 13 ++++++++++++- mm/migrate.c | 5 +++++ mm/shmem.c | 2 ++ 7 files changed, 43 insertions(+), 4 deletions(-) diff --git a/Documentation/filesystems/proc.rst b/Documentation/filesystems/proc.rst index 6ae531ee4de9..a57d96cf4644 100644 --- a/Documentation/filesystems/proc.rst +++ b/Documentation/filesystems/proc.rst @@ -972,6 +972,7 @@ varies by architecture and compile options. The following is from a ReliableTotal: 7340032 kB ReliableUsed: 418824 kB ReliableBuddyMem: 418824 kB + ReliableShmem: 96 kB MemTotal Total usable RAM (i.e. physical RAM minus a few reserved @@ -1107,6 +1108,8 @@ ReliableUsed The used amount of reliable memory ReliableBuddyMem Size of unused mirrored memory in buddy system +ReliableShmem + Total reliable memory used by share memory vmallocinfo ~~~~~~~~~~~ diff --git a/include/linux/mem_reliable.h b/include/linux/mem_reliable.h index 1cc4a9460bcf..453b3237e305 100644 --- a/include/linux/mem_reliable.h +++ b/include/linux/mem_reliable.h @@ -16,6 +16,7 @@ DECLARE_STATIC_KEY_FALSE(mem_reliable); extern bool reliable_enabled; extern bool shmem_reliable; +extern struct percpu_counter reliable_shmem_used_nr_page; extern bool reliable_allow_fallback; extern bool pagecache_use_reliable_mem; extern struct percpu_counter pagecache_reliable_pages; @@ -81,6 +82,12 @@ static inline bool page_reliable(struct page *page) return page_zonenum(page) < ZONE_MOVABLE; } +static inline void shmem_reliable_page_counter(struct page *page, int nr_page) +{ + if (shmem_reliable_is_enabled() && page_reliable(page)) + percpu_counter_add(&reliable_shmem_used_nr_page, nr_page); +} + static inline u64 task_reliable_used_pages(void) { s64 nr_pages; @@ -126,6 +133,8 @@ static inline bool skip_none_movable_zone(gfp_t 
gfp, struct zoneref *z) } static inline void reliable_report_meminfo(struct seq_file *m) {} static inline bool shmem_reliable_is_enabled(void) { return false; } +static inline void shmem_reliable_page_counter(struct page *page, + int nr_page) {} static inline void page_cache_prepare_alloc(gfp_t *gfp) {} static inline bool mem_reliable_status(void) { return false; } static inline bool page_reliable(struct page *page) { return false; } diff --git a/mm/filemap.c b/mm/filemap.c index 4f9cd18f9197..6480600cf0ea 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -192,6 +192,7 @@ static void unaccount_page_cache_page(struct address_space *mapping, __mod_lruvec_page_state(page, NR_FILE_PAGES, -nr); if (PageSwapBacked(page)) { __mod_lruvec_page_state(page, NR_SHMEM, -nr); + shmem_reliable_page_counter(page, -nr); if (PageTransHuge(page)) __dec_node_page_state(page, NR_SHMEM_THPS); } else if (PageTransHuge(page)) { @@ -800,10 +801,14 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask) __dec_lruvec_page_state(old, NR_FILE_PAGES); if (!PageHuge(new)) __inc_lruvec_page_state(new, NR_FILE_PAGES); - if (PageSwapBacked(old)) + if (PageSwapBacked(old)) { __dec_lruvec_page_state(old, NR_SHMEM); - if (PageSwapBacked(new)) + shmem_reliable_page_counter(old, -1); + } + if (PageSwapBacked(new)) { __inc_lruvec_page_state(new, NR_SHMEM); + shmem_reliable_page_counter(new, 1); + } xas_unlock_irqrestore(&xas, flags); if (freepage) freepage(old); diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 254211e56153..c1346c933586 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -1910,6 +1910,8 @@ static void collapse_file(struct mm_struct *mm, ClearPageActive(page); ClearPageUnevictable(page); unlock_page(page); + if (is_shmem) + shmem_reliable_page_counter(page, -1); put_page(page); index++; } @@ -1920,8 +1922,10 @@ static void collapse_file(struct mm_struct *mm, SetPageUptodate(new_page); page_ref_add(new_page, HPAGE_PMD_NR - 1); - if (is_shmem) + if (is_shmem) 
{ set_page_dirty(new_page); + shmem_reliable_page_counter(new_page, 1 << HPAGE_PMD_ORDER); + } lru_cache_add(new_page); /* diff --git a/mm/mem_reliable.c b/mm/mem_reliable.c index 5fdda01c708f..f9458a805952 100644 --- a/mm/mem_reliable.c +++ b/mm/mem_reliable.c @@ -16,6 +16,7 @@ EXPORT_SYMBOL_GPL(mem_reliable); bool reliable_enabled; bool shmem_reliable __read_mostly = true; +struct percpu_counter reliable_shmem_used_nr_page; bool reliable_allow_fallback __read_mostly = true; bool pagecache_use_reliable_mem __read_mostly = true; struct percpu_counter pagecache_reliable_pages; @@ -147,8 +148,12 @@ void mem_reliable_init(bool has_unmirrored_mem, unsigned long *zone_movable_pfn, void shmem_reliable_init(void) { - if (!mem_reliable_is_enabled() || !shmem_reliable_is_enabled()) + if (!mem_reliable_is_enabled() || !shmem_reliable_is_enabled()) { shmem_reliable = false; + return; + } + + percpu_counter_init(&reliable_shmem_used_nr_page, 0, GFP_KERNEL); } static void show_val_kb(struct seq_file *m, const char *s, unsigned long num) @@ -166,6 +171,12 @@ void reliable_report_meminfo(struct seq_file *m) show_val_kb(m, "ReliableUsed: ", used_reliable_pages()); show_val_kb(m, "ReliableBuddyMem: ", free_reliable_pages()); + if (shmem_reliable_is_enabled()) { + unsigned long shmem_pages = (unsigned long)percpu_counter_sum( + &reliable_shmem_used_nr_page); + show_val_kb(m, "ReliableShmem: ", shmem_pages); + } + if (pagecache_reliable_is_enabled()) { s64 nr_pagecache_pages = 0; unsigned long num = 0; diff --git a/mm/migrate.c b/mm/migrate.c index ebbc34d7c509..94210ddd3f2a 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -481,6 +481,11 @@ int migrate_page_move_mapping(struct address_space *mapping, xas_unlock(&xas); /* Leave irq disabled to prevent preemption while updating stats */ + if (PageSwapBacked(page) && !PageSwapCache(page)) { + shmem_reliable_page_counter(page, -nr); + shmem_reliable_page_counter(newpage, nr); + } + /* * If moved to a different zone then also account * the 
page for that zone. Other VM counters will be diff --git a/mm/shmem.c b/mm/shmem.c index ad2d68150ed2..626f5510b319 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -752,6 +752,7 @@ static int shmem_add_to_page_cache(struct page *page, mapping->nrpages += nr; __mod_lruvec_page_state(page, NR_FILE_PAGES, nr); __mod_lruvec_page_state(page, NR_SHMEM, nr); + shmem_reliable_page_counter(page, nr); unlock: xas_unlock_irq(&xas); } while (xas_nomem(&xas, gfp)); @@ -784,6 +785,7 @@ static void shmem_delete_from_page_cache(struct page *page, void *radswap) mapping->nrpages--; __dec_lruvec_page_state(page, NR_FILE_PAGES); __dec_lruvec_page_state(page, NR_SHMEM); + shmem_reliable_page_counter(page, -1); xa_unlock_irq(&mapping->i_pages); put_page(page); BUG_ON(error); -- Gitee From f30c7817f982dac4fa6ea2f56601a174d1ed5e9b Mon Sep 17 00:00:00 2001 From: Ma Wupeng Date: Fri, 11 Nov 2022 09:32:37 +0800 Subject: [PATCH 18/27] mm: Introduce shmem mirrored memory limit for memory reliable hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4SK3S CVE: NA -------------------------------- This limit is used to restrict the amount of mirrored memory by shmem. This memory allocation will return no memory if reliable fallback is off or fallback to non-mirrored region if reliable fallback on. This limit can be set or access via /proc/sys/vm/shmem_reliable_bytes_limit. The default value of this limit is LONG_MAX. This limit can be set from 0 to the total size of mirrored memory. 
Signed-off-by: Ma Wupeng Signed-off-by: Zhou Guanghui Reviewed-by: Kefeng Wang --- include/linux/mem_reliable.h | 8 ++++++++ mm/mem_reliable.c | 35 +++++++++++++++++++++++++++++++++++ mm/shmem.c | 18 ++++++++++++++---- 3 files changed, 57 insertions(+), 4 deletions(-) diff --git a/include/linux/mem_reliable.h b/include/linux/mem_reliable.h index 453b3237e305..a041098c2158 100644 --- a/include/linux/mem_reliable.h +++ b/include/linux/mem_reliable.h @@ -17,6 +17,7 @@ DECLARE_STATIC_KEY_FALSE(mem_reliable); extern bool reliable_enabled; extern bool shmem_reliable; extern struct percpu_counter reliable_shmem_used_nr_page; +extern long shmem_reliable_nr_page __read_mostly; extern bool reliable_allow_fallback; extern bool pagecache_use_reliable_mem; extern struct percpu_counter pagecache_reliable_pages; @@ -88,6 +89,12 @@ static inline void shmem_reliable_page_counter(struct page *page, int nr_page) percpu_counter_add(&reliable_shmem_used_nr_page, nr_page); } +static inline bool mem_reliable_shmem_limit_check(void) +{ + return percpu_counter_read_positive(&reliable_shmem_used_nr_page) < + shmem_reliable_nr_page; +} + static inline u64 task_reliable_used_pages(void) { s64 nr_pages; @@ -135,6 +142,7 @@ static inline void reliable_report_meminfo(struct seq_file *m) {} static inline bool shmem_reliable_is_enabled(void) { return false; } static inline void shmem_reliable_page_counter(struct page *page, int nr_page) {} +static inline bool mem_reliable_shmem_limit_check(void) { return true; } static inline void page_cache_prepare_alloc(gfp_t *gfp) {} static inline bool mem_reliable_status(void) { return false; } static inline bool page_reliable(struct page *page) { return false; } diff --git a/mm/mem_reliable.c b/mm/mem_reliable.c index f9458a805952..dc9484f43838 100644 --- a/mm/mem_reliable.c +++ b/mm/mem_reliable.c @@ -24,6 +24,7 @@ struct percpu_counter anon_reliable_pages; static unsigned long reliable_pagecache_max_bytes = ULONG_MAX; /* reliable user limit for user tasks 
with reliable flag */ unsigned long task_reliable_limit = ULONG_MAX; +long shmem_reliable_nr_page = ULONG_MAX >> PAGE_SHIFT; bool mem_reliable_counter_initialized(void) { @@ -231,6 +232,31 @@ static int reliable_pagecache_max_bytes_write(struct ctl_table *table, return ret; } +#ifdef CONFIG_SHMEM +static unsigned long sysctl_shmem_reliable_bytes_limit = ULONG_MAX; + +static int reliable_shmem_bytes_limit_handler(struct ctl_table *table, + int write, void __user *buffer, + size_t *length, loff_t *ppos) +{ + unsigned long *data_ptr = (unsigned long *)(table->data); + unsigned long old = *data_ptr; + int ret; + + ret = proc_doulongvec_minmax(table, write, buffer, length, ppos); + if (!ret && write) { + if (*data_ptr > PAGES_TO_B(total_reliable_pages())) { + *data_ptr = old; + return -EINVAL; + } + + shmem_reliable_nr_page = *data_ptr >> PAGE_SHIFT; + } + + return ret; +} +#endif + static struct ctl_table reliable_ctl_table[] = { { .procname = "reliable_pagecache_max_bytes", @@ -246,6 +272,15 @@ static struct ctl_table reliable_ctl_table[] = { .mode = 0644, .proc_handler = reliable_limit_handler, }, +#ifdef CONFIG_SHMEM + { + .procname = "shmem_reliable_bytes_limit", + .data = &sysctl_shmem_reliable_bytes_limit, + .maxlen = sizeof(sysctl_shmem_reliable_bytes_limit), + .mode = 0644, + .proc_handler = reliable_shmem_bytes_limit_handler, + }, +#endif {} }; diff --git a/mm/shmem.c b/mm/shmem.c index 626f5510b319..fbddc7dfb72e 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1561,12 +1561,20 @@ static struct page *shmem_alloc_page(gfp_t gfp, return page; } -static inline void shmem_prepare_alloc(gfp_t *gfp_mask) +static inline bool shmem_prepare_alloc(gfp_t *gfp_mask) { if (!shmem_reliable_is_enabled()) - return; + return true; + + if (mem_reliable_shmem_limit_check()) { + *gfp_mask |= GFP_RELIABLE; + return true; + } + + if (reliable_allow_fb_enabled()) + return true; - *gfp_mask |= GFP_RELIABLE; + return false; } static struct page *shmem_alloc_and_acct_page(gfp_t gfp, @@ 
-1585,7 +1593,8 @@ static struct page *shmem_alloc_and_acct_page(gfp_t gfp, if (!shmem_inode_acct_block(inode, nr)) goto failed; - shmem_prepare_alloc(&gfp); + if (!shmem_prepare_alloc(&gfp)) + goto no_mem; if (huge) page = shmem_alloc_hugepage(gfp, info, index, node_id); @@ -1597,6 +1606,7 @@ static struct page *shmem_alloc_and_acct_page(gfp_t gfp, return page; } +no_mem: err = -ENOMEM; shmem_inode_unacct_blocks(inode, nr); failed: -- Gitee From 3f5ebb1c887e44ca6a5574fefa8522b84790913c Mon Sep 17 00:00:00 2001 From: Ma Wupeng Date: Fri, 11 Nov 2022 09:32:38 +0800 Subject: [PATCH 19/27] mm: Introduce reliable_debug=S to control shmem use mirrored memory hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4SK3S CVE: NA -------------------------------- Introduce reliable_debug=S to control shmem use mirrored memory. Signed-off-by: Ma Wupeng Reviewed-by: Kefeng Wang --- Documentation/admin-guide/kernel-parameters.txt | 3 ++- mm/mem_reliable.c | 4 ++++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index fe9f3fc856ea..71d45c34858f 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -4786,11 +4786,12 @@ See Documentation/admin-guide/cgroup-v1/cpusets.rst. reliable_debug= [ARM64] - Format: [F][,P] + Format: [F][,S][,P] Only works with CONFIG_MEMORY_RELIABLE and "kernelcore=reliable" is configured. F: User memory allocation(special user task, tmpfs) will not allocate memory from non-mirrored region if failed. + S: The shmem does not use the reliable memory. P: Page cache does not use the reliable memory. 
reserve= [KNL,BUGS] Force kernel to ignore I/O ports or memory diff --git a/mm/mem_reliable.c b/mm/mem_reliable.c index dc9484f43838..cc4e2886b8cf 100644 --- a/mm/mem_reliable.c +++ b/mm/mem_reliable.c @@ -345,6 +345,10 @@ static int __init setup_reliable_debug(char *str) reliable_allow_fallback = false; pr_info("disable memory reliable fallback\n"); break; + case 'S': + shmem_reliable = false; + pr_info("disable shmem use reliable memory\n"); + break; case 'P': pagecache_use_reliable_mem = false; pr_info("disable page cache use reliable memory\n"); -- Gitee From 2525d04c6308006b789564ed955081602727f38f Mon Sep 17 00:00:00 2001 From: Ma Wupeng Date: Fri, 11 Nov 2022 09:32:39 +0800 Subject: [PATCH 20/27] mm: Introduce proc interface to disable memory reliable features hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4SK3S CVE: NA -------------------------------- reliable_debug is used to disable memory reliable features. Four bits are used to represent the following features - bit 0: memory reliable feature - bit 1: reliable fallback feature - bit 2: tmpfs use reliable memory feature - bit 3: pagecache use reliable memory feature Bits 1~3 are valid if and only if bit 0 is 1. If the first bit is 0, all other features will be disabled regardless of the other bits' status. 
For example, you can disable all features by $ echo 0 > /proc/sys/vm/reliable_debug Signed-off-by: Ma Wupeng Reviewed-by: Kefeng Wang --- mm/mem_reliable.c | 129 +++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 123 insertions(+), 6 deletions(-) diff --git a/mm/mem_reliable.c b/mm/mem_reliable.c index cc4e2886b8cf..f3738f37d802 100644 --- a/mm/mem_reliable.c +++ b/mm/mem_reliable.c @@ -11,6 +11,14 @@ #define PAGES_TO_B(n_pages) ((n_pages) << PAGE_SHIFT) +enum mem_reliable_types { + MEM_RELIABLE_ALL, + MEM_RELIABLE_FALLBACK, + MEM_RELIABLE_SHMEM, + MEM_RELIABLE_PAGECACHE, + MEM_RELIABLE_MAX +}; + DEFINE_STATIC_KEY_FALSE(mem_reliable); EXPORT_SYMBOL_GPL(mem_reliable); @@ -193,6 +201,7 @@ void reliable_report_meminfo(struct seq_file *m) } } +#ifdef CONFIG_SYSCTL static int reliable_limit_handler(struct ctl_table *table, int write, void __user *buffer, size_t *length, loff_t *ppos) @@ -232,6 +241,81 @@ static int reliable_pagecache_max_bytes_write(struct ctl_table *table, return ret; } +static void mem_reliable_feature_disable(int idx); + +#define CTRL_BITS_SHIFT MEM_RELIABLE_MAX +#define CTRL_BITS_MASK ((1 << CTRL_BITS_SHIFT) - 1) + +static unsigned long mem_reliable_ctrl_bits = CTRL_BITS_MASK; + +static void mem_reliable_ctrl_bit_disable(int idx) +{ + clear_bit(idx, &mem_reliable_ctrl_bits); +} + +static bool mem_reliable_ctrl_bit_is_enabled(int idx) +{ + return !!test_bit(idx, &mem_reliable_ctrl_bits); +} + +static void mem_reliable_parse_ctrl_bits(unsigned long ctrl_bits) +{ + bool status; + int i; + + for (i = MEM_RELIABLE_FALLBACK; i < MEM_RELIABLE_MAX; i++) { + status = !!test_bit(i, &ctrl_bits); + + if (mem_reliable_ctrl_bit_is_enabled(i) && !status) + mem_reliable_feature_disable(i); + } +} + +static void mem_reliable_disable_all(void) +{ + mem_reliable_ctrl_bits = 0; + + reliable_allow_fallback = false; + shmem_reliable = false; + pagecache_use_reliable_mem = false; + static_branch_disable(&mem_reliable); + + pr_info("memory reliable feature 
disabled.\n"); +} + +static int reliable_debug_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *length, + loff_t *ppos) +{ + unsigned long old_ctrl_bits, new_ctrl_bits; + static DEFINE_MUTEX(reliable_debug_mutex); + int ret; + + mutex_lock(&reliable_debug_mutex); + old_ctrl_bits = mem_reliable_ctrl_bits; + ret = proc_doulongvec_minmax(table, write, buffer, length, ppos); + if (ret == 0 && write) { + if (!mem_reliable_is_enabled() || + (mem_reliable_ctrl_bits > (1 << CTRL_BITS_SHIFT) - 1)) { + mem_reliable_ctrl_bits = old_ctrl_bits; + mutex_unlock(&reliable_debug_mutex); + + return -EINVAL; + } + + new_ctrl_bits = mem_reliable_ctrl_bits; + mem_reliable_ctrl_bits = old_ctrl_bits; + if (!!test_bit(MEM_RELIABLE_ALL, &new_ctrl_bits)) + mem_reliable_parse_ctrl_bits(new_ctrl_bits); + else + mem_reliable_disable_all(); + } + + mutex_unlock(&reliable_debug_mutex); + + return ret; +} + #ifdef CONFIG_SHMEM static unsigned long sysctl_shmem_reliable_bytes_limit = ULONG_MAX; @@ -281,6 +365,13 @@ static struct ctl_table reliable_ctl_table[] = { .proc_handler = reliable_shmem_bytes_limit_handler, }, #endif + { + .procname = "reliable_debug", + .data = &mem_reliable_ctrl_bits, + .maxlen = sizeof(mem_reliable_ctrl_bits), + .mode = 0600, + .proc_handler = reliable_debug_handler, + }, {} }; @@ -310,6 +401,35 @@ static int __init reliable_sysctl_init(void) return 0; } arch_initcall(reliable_sysctl_init); +#else +static void mem_reliable_ctrl_bit_disabled(int idx) {} +#endif + +static void mem_reliable_feature_disable(int idx) +{ + char *str = NULL; + + switch (idx) { + case MEM_RELIABLE_FALLBACK: + reliable_allow_fallback = false; + str = "fallback"; + break; + case MEM_RELIABLE_SHMEM: + shmem_reliable = false; + str = "shmem"; + break; + case MEM_RELIABLE_PAGECACHE: + pagecache_use_reliable_mem = false; + str = "pagecache"; + break; + default: + pr_err("unknown index: %d", idx); + return; + } + + mem_reliable_ctrl_bit_disable(idx); + pr_info("%s is 
disabled\n", str); +} void mem_reliable_out_of_memory(gfp_t gfp, unsigned int order, int preferred_nid, nodemask_t *nodemask) @@ -342,16 +462,13 @@ static int __init setup_reliable_debug(char *str) for (; *str && *str != ','; str++) { switch (*str) { case 'F': - reliable_allow_fallback = false; - pr_info("disable memory reliable fallback\n"); + mem_reliable_feature_disable(MEM_RELIABLE_FALLBACK); break; case 'S': - shmem_reliable = false; - pr_info("disable shmem use reliable memory\n"); + mem_reliable_feature_disable(MEM_RELIABLE_SHMEM); break; case 'P': - pagecache_use_reliable_mem = false; - pr_info("disable page cache use reliable memory\n"); + mem_reliable_feature_disable(MEM_RELIABLE_PAGECACHE); break; default: pr_err("reliable_debug option '%c' unknown. skipped\n", -- Gitee From bfdc680cdca351baa803766c2a93ac74c9e76f64 Mon Sep 17 00:00:00 2001 From: Ma Wupeng Date: Fri, 11 Nov 2022 09:32:40 +0800 Subject: [PATCH 21/27] mm: Show debug info about memory reliable if oom occurs hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4SK3S CVE: NA -------------------------------- Show debug info about memory reliable if oom occurs. 
Signed-off-by: Ma Wupeng Reviewed-by: Kefeng Wang --- include/linux/mem_reliable.h | 2 ++ lib/show_mem.c | 1 + mm/mem_reliable.c | 38 ++++++++++++++++++++++++++++++++++++ 3 files changed, 41 insertions(+) diff --git a/include/linux/mem_reliable.h b/include/linux/mem_reliable.h index a041098c2158..5e14980d5793 100644 --- a/include/linux/mem_reliable.h +++ b/include/linux/mem_reliable.h @@ -38,6 +38,7 @@ extern void reliable_lru_add_batch(int zid, enum lru_list lru, extern bool mem_reliable_counter_initialized(void); extern void mem_reliable_out_of_memory(gfp_t gfp_mask, unsigned int order, int preferred_nid, nodemask_t *nodemask); +extern void reliable_show_mem_info(void); static inline bool mem_reliable_is_enabled(void) { @@ -162,6 +163,7 @@ static inline void mem_reliable_out_of_memory(gfp_t gfp_mask, int preferred_nid, nodemask_t *nodemask) {} static inline bool reliable_allow_fb_enabled(void) { return false; } +static inline void reliable_show_mem_info(void) {} #endif #endif diff --git a/lib/show_mem.c b/lib/show_mem.c index 1c26c14ffbb9..11751aebc98f 100644 --- a/lib/show_mem.c +++ b/lib/show_mem.c @@ -41,4 +41,5 @@ void show_mem(unsigned int filter, nodemask_t *nodemask) #ifdef CONFIG_MEMORY_FAILURE printk("%lu pages hwpoisoned\n", atomic_long_read(&num_poisoned_pages)); #endif + reliable_show_mem_info(); } diff --git a/mm/mem_reliable.c b/mm/mem_reliable.c index f3738f37d802..125f6ef5587b 100644 --- a/mm/mem_reliable.c +++ b/mm/mem_reliable.c @@ -431,6 +431,44 @@ static void mem_reliable_feature_disable(int idx) pr_info("%s is disabled\n", str); } +void reliable_show_mem_info(void) +{ + if (!mem_reliable_is_enabled()) + return; + + pr_info("ReliableTotal: %lu kB\n", total_reliable_pages() + << (PAGE_SHIFT - 10)); + pr_info("ReliableUsed: %lu kB\n", used_reliable_pages() + << (PAGE_SHIFT - 10)); + pr_info("ReliableTaskLimit: %lu kB\n", task_reliable_limit >> 10); + pr_info("ReliableTaskUsed: %lld kB\n", task_reliable_used_pages() + << (PAGE_SHIFT - 10)); + + 
if (shmem_reliable_is_enabled()) { + pr_info("ReliableShmemPagesLimit: %ld\n", + shmem_reliable_nr_page); + pr_info("ReliableShmem: %llu kB\n", + percpu_counter_sum(&reliable_shmem_used_nr_page) + << (PAGE_SHIFT - 10)); + } + + if (pagecache_reliable_is_enabled()) { + s64 nr_pagecache_pages = 0; + unsigned long num = 0; + + num += global_node_page_state(NR_LRU_BASE + LRU_ACTIVE_FILE); + num += global_node_page_state(NR_LRU_BASE + LRU_INACTIVE_FILE); + pr_info("ReliableFileCacheLimit: %lu kB\n", + reliable_pagecache_max_bytes >> 10); + pr_info("FileCache: %lu kB\n", num << (PAGE_SHIFT - 10)); + + nr_pagecache_pages = + percpu_counter_sum_positive(&pagecache_reliable_pages); + pr_info("ReliableFileCache: %llu kB\n", + nr_pagecache_pages << (PAGE_SHIFT - 10)); + } +} + void mem_reliable_out_of_memory(gfp_t gfp, unsigned int order, int preferred_nid, nodemask_t *nodemask) { -- Gitee From cb562ce3248357a04d67b02358b9ed73783baa6e Mon Sep 17 00:00:00 2001 From: Peng Wu Date: Fri, 11 Nov 2022 09:32:41 +0800 Subject: [PATCH 22/27] mm: Add reliable_nr_page for accounting reliable memory hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4SK3S CVE: NA -------------------------------- Add a variable in mm_struct for accounting the amount of reliable memory allocated by the reliable user tasks. 
Signed-off-by: Peng Wu Reviewed-by: Kefeng Wang --- include/linux/mm_types.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 1c22e294f083..15ff1e20f5ca 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -587,7 +587,8 @@ struct mm_struct { #endif #ifdef CONFIG_MEMORY_RELIABLE - atomic_long_t reserve_0; + /* total used reliable pages */ + KABI_RENAME(atomic_long_t reserve_0, atomic_long_t reliable_nr_page); #endif } __randomize_layout; -- Gitee From d81e9624de2104c92417bfe2e72a374f6d864ba0 Mon Sep 17 00:00:00 2001 From: Peng Wu Date: Fri, 11 Nov 2022 09:32:42 +0800 Subject: [PATCH 23/27] proc: Count reliable memory usage of reliable tasks hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4SK3S CVE: NA -------------------------------- Counting reliable memory allocated by the reliable user tasks. The policy of counting reliable memory usage is based on RSS statistics. Anywhere with counter of mm need count reliable pages too. Reliable page which is checked by page_reliable() need to update the reliable page counter by calling reliable_page_counter(). 
Updating the reliable pages should be considered if the following logic is added: - add_mm_counter - dec_mm_counter - inc_mm_counter_fast - dec_mm_counter_fast - rss[mm_counter(page)] Signed-off-by: Peng Wu Reviewed-by: Kefeng Wang --- Documentation/filesystems/proc.rst | 2 ++ fs/proc/task_mmu.c | 1 + include/linux/mem_reliable.h | 14 ++++++++++++++ kernel/events/uprobes.c | 2 ++ mm/huge_memory.c | 8 ++++++++ mm/khugepaged.c | 4 ++++ mm/ksm.c | 2 ++ mm/mem_reliable.c | 13 +++++++++++++ mm/memory.c | 12 ++++++++++++ mm/migrate.c | 5 +++++ mm/mmap.c | 1 + mm/rmap.c | 5 +++++ mm/shmem.c | 1 + mm/swapfile.c | 2 ++ mm/userfaultfd.c | 1 + 15 files changed, 73 insertions(+) diff --git a/Documentation/filesystems/proc.rst b/Documentation/filesystems/proc.rst index a57d96cf4644..2fa2f7cd1287 100644 --- a/Documentation/filesystems/proc.rst +++ b/Documentation/filesystems/proc.rst @@ -195,6 +195,7 @@ read the file /proc/PID/status:: VmPTE: 20 kb VmSwap: 0 kB HugetlbPages: 0 kB + Reliable: 1608 kB CoreDumping: 0 THP_enabled: 1 Threads: 1 @@ -275,6 +276,7 @@ It's slow but very precise. 
VmSwap amount of swap used by anonymous private data (shmem swap usage is not included) HugetlbPages size of hugetlb memory portions + Reliable size of reliable memory used CoreDumping process's memory is currently being dumped (killing the process may lead to a corrupted core) THP_enabled process is allowed to use THP (returns 0 when diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index c61a3fbbfd71..f96d081b881a 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -77,6 +77,7 @@ void task_mem(struct seq_file *m, struct mm_struct *mm) SEQ_PUT_DEC(" kB\nVmSwap:\t", swap); seq_puts(m, " kB\n"); hugetlb_report_usage(m, mm); + reliable_report_usage(m, mm); } #undef SEQ_PUT_DEC diff --git a/include/linux/mem_reliable.h b/include/linux/mem_reliable.h index 5e14980d5793..ddadf2803742 100644 --- a/include/linux/mem_reliable.h +++ b/include/linux/mem_reliable.h @@ -23,6 +23,7 @@ extern bool pagecache_use_reliable_mem; extern struct percpu_counter pagecache_reliable_pages; extern struct percpu_counter anon_reliable_pages; extern unsigned long task_reliable_limit __read_mostly; +extern atomic_long_t reliable_user_used_nr_page; extern void mem_reliable_init(bool has_unmirrored_mem, unsigned long *zone_movable_pfn, @@ -39,6 +40,8 @@ extern bool mem_reliable_counter_initialized(void); extern void mem_reliable_out_of_memory(gfp_t gfp_mask, unsigned int order, int preferred_nid, nodemask_t *nodemask); extern void reliable_show_mem_info(void); +extern void reliable_report_usage(struct seq_file *m, + struct mm_struct *mm); static inline bool mem_reliable_is_enabled(void) { @@ -125,6 +128,13 @@ static inline bool reliable_allow_fb_enabled(void) { return reliable_allow_fallback; } + +static inline void reliable_page_counter(struct page *page, + struct mm_struct *mm, int val) +{ + if (page_reliable(page)) + atomic_long_add(val, &mm->reliable_nr_page); +} #else #define reliable_enabled 0 #define pagecache_use_reliable_mem 0 @@ -164,6 +174,10 @@ static inline void 
mem_reliable_out_of_memory(gfp_t gfp_mask, nodemask_t *nodemask) {} static inline bool reliable_allow_fb_enabled(void) { return false; } static inline void reliable_show_mem_info(void) {} +static inline void reliable_page_counter(struct page *page, + struct mm_struct *mm, int val) {} +static inline void reliable_report_usage(struct seq_file *m, + struct mm_struct *mm) {} #endif #endif diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index e1bbb3b92921..ad6664fcc3b2 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -183,6 +183,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, if (new_page) { get_page(new_page); + reliable_page_counter(new_page, mm, 1); page_add_new_anon_rmap(new_page, vma, addr, false); lru_cache_add_inactive_or_unevictable(new_page, vma); } else @@ -194,6 +195,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, inc_mm_counter(mm, MM_ANONPAGES); } + reliable_page_counter(old_page, mm, -1); flush_cache_page(vma, addr, pte_pfn(*pvmw.pte)); ptep_clear_flush_notify(vma, addr, pvmw.pte); if (new_page) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 012ecf107ae0..e6f1903f6f63 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -652,6 +652,7 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf, pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable); set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry); add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR); + reliable_page_counter(page, vma->vm_mm, HPAGE_PMD_NR); mm_inc_nr_ptes(vma->vm_mm); spin_unlock(vmf->ptl); count_vm_event(THP_FAULT_ALLOC); @@ -1115,6 +1116,7 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, get_page(src_page); page_dup_rmap(src_page, true); add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR); + reliable_page_counter(src_page, dst_mm, HPAGE_PMD_NR); out_zero_page: mm_inc_nr_ptes(dst_mm); pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable); @@ 
-1696,6 +1698,7 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, if (pmd_present(orig_pmd)) { page = pmd_page(orig_pmd); + reliable_page_counter(page, tlb->mm, -HPAGE_PMD_NR); page_remove_rmap(page, true); VM_BUG_ON_PAGE(page_mapcount(page) < 0, page); VM_BUG_ON_PAGE(!PageHead(page), page); @@ -2077,6 +2080,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, set_page_dirty(page); if (!PageReferenced(page) && pmd_young(old_pmd)) SetPageReferenced(page); + reliable_page_counter(page, mm, -HPAGE_PMD_NR); page_remove_rmap(page, true); put_page(page); } @@ -2212,6 +2216,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, if (freeze) { for (i = 0; i < HPAGE_PMD_NR; i++) { + reliable_page_counter(page + i, mm, -1); page_remove_rmap(page + i, false); put_page(page + i); } @@ -3004,6 +3009,7 @@ void set_pmd_migration_entry(struct page_vma_mapped_walk *pvmw, if (pmd_soft_dirty(pmdval)) pmdswp = pmd_swp_mksoft_dirty(pmdswp); set_pmd_at(mm, address, pvmw->pmd, pmdswp); + reliable_page_counter(page, mm, -HPAGE_PMD_NR); page_remove_rmap(page, true); put_page(page); } @@ -3031,6 +3037,7 @@ void remove_migration_pmd(struct page_vma_mapped_walk *pvmw, struct page *new) pmde = pmd_wrprotect(pmd_mkuffd_wp(pmde)); flush_cache_range(vma, mmun_start, mmun_start + HPAGE_PMD_SIZE); + reliable_page_counter(new, mm, HPAGE_PMD_NR); if (PageAnon(new)) page_add_anon_rmap(new, vma, mmun_start, true); else @@ -3087,6 +3094,7 @@ vm_fault_t do_anon_huge_page_remap(struct vm_area_struct *vma, unsigned long add pgtable_trans_huge_deposit(vma->vm_mm, pmd, pgtable); set_pmd_at(vma->vm_mm, address, pmd, entry); add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR); + reliable_page_counter(page, vma->vm_mm, HPAGE_PMD_NR); mm_inc_nr_ptes(vma->vm_mm); spin_unlock(ptl); count_vm_event(THP_FAULT_ALLOC); diff --git a/mm/khugepaged.c b/mm/khugepaged.c index c1346c933586..aaef16aa8945 100644 --- a/mm/khugepaged.c +++ 
b/mm/khugepaged.c @@ -748,6 +748,7 @@ static void __collapse_huge_page_copy(pte_t *pte, struct page *page, if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) { clear_user_highpage(page, address); add_mm_counter(vma->vm_mm, MM_ANONPAGES, 1); + reliable_page_counter(page, vma->vm_mm, 1); if (is_zero_pfn(pte_pfn(pteval))) { /* * ptl mostly unnecessary. @@ -776,6 +777,7 @@ static void __collapse_huge_page_copy(pte_t *pte, struct page *page, * superfluous. */ pte_clear(vma->vm_mm, address, _pte); + reliable_page_counter(src_page, vma->vm_mm, -1); page_remove_rmap(src_page, false); spin_unlock(ptl); free_page_and_swap_cache(src_page); @@ -1202,6 +1204,7 @@ static void collapse_huge_page(struct mm_struct *mm, spin_lock(pmd_ptl); BUG_ON(!pmd_none(*pmd)); + reliable_page_counter(new_page, vma->vm_mm, HPAGE_PMD_NR); page_add_new_anon_rmap(new_page, vma, address, true); lru_cache_add_inactive_or_unevictable(new_page, vma); pgtable_trans_huge_deposit(mm, pmd, pgtable); @@ -1509,6 +1512,7 @@ void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr) if (pte_none(*pte)) continue; page = vm_normal_page(vma, addr, *pte); + reliable_page_counter(page, mm, -1); page_remove_rmap(page, false); } diff --git a/mm/ksm.c b/mm/ksm.c index 582c02058baf..169c0da1a9db 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -1155,6 +1155,7 @@ static int replace_page(struct vm_area_struct *vma, struct page *page, */ if (!is_zero_pfn(page_to_pfn(kpage))) { get_page(kpage); + reliable_page_counter(kpage, mm, 1); page_add_anon_rmap(kpage, vma, addr, false); newpte = mk_pte(kpage, vma->vm_page_prot); } else { @@ -1179,6 +1180,7 @@ static int replace_page(struct vm_area_struct *vma, struct page *page, ptep_clear_flush(vma, addr, ptep); set_pte_at_notify(mm, addr, ptep, newpte); + reliable_page_counter(page, mm, -1); page_remove_rmap(page, false); if (!page_mapped(page)) try_to_free_swap(page); diff --git a/mm/mem_reliable.c b/mm/mem_reliable.c index 125f6ef5587b..28baba72bb5e 100644 --- 
a/mm/mem_reliable.c +++ b/mm/mem_reliable.c @@ -33,6 +33,7 @@ static unsigned long reliable_pagecache_max_bytes = ULONG_MAX; /* reliable user limit for user tasks with reliable flag */ unsigned long task_reliable_limit = ULONG_MAX; long shmem_reliable_nr_page = ULONG_MAX >> PAGE_SHIFT; +atomic_long_t reliable_user_used_nr_page; bool mem_reliable_counter_initialized(void) { @@ -178,6 +179,7 @@ void reliable_report_meminfo(struct seq_file *m) show_val_kb(m, "ReliableTotal: ", total_reliable_pages()); show_val_kb(m, "ReliableUsed: ", used_reliable_pages()); + show_val_kb(m, "ReliableTaskUsed: ", task_reliable_used_pages()); show_val_kb(m, "ReliableBuddyMem: ", free_reliable_pages()); if (shmem_reliable_is_enabled()) { @@ -518,3 +520,14 @@ static int __init setup_reliable_debug(char *str) return 1; } __setup("reliable_debug", setup_reliable_debug); + +#define SEQ_PUT_DEC(str, val) \ + seq_put_decimal_ull_width(m, str, (val) << (PAGE_SHIFT-10), 8) +void reliable_report_usage(struct seq_file *m, struct mm_struct *mm) +{ + if (!mem_reliable_is_enabled()) + return; + + SEQ_PUT_DEC("Reliable:\t", atomic_long_read(&mm->reliable_nr_page)); + seq_puts(m, "kB\n"); +} diff --git a/mm/memory.c b/mm/memory.c index dbb0fb9bcf81..767888f0ef04 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -834,6 +834,7 @@ copy_present_page(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma *prealloc = NULL; copy_user_highpage(new_page, page, addr, src_vma); __SetPageUptodate(new_page); + reliable_page_counter(new_page, dst_vma->vm_mm, 1); page_add_new_anon_rmap(new_page, dst_vma, addr, false); lru_cache_add_inactive_or_unevictable(new_page, dst_vma); rss[mm_counter(new_page)]++; @@ -1273,6 +1274,7 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, mark_page_accessed(page); } rss[mm_counter(page)]--; + reliable_page_counter(page, mm, -1); page_remove_rmap(page, false); if (unlikely(page_mapcount(page) < 0)) print_bad_pte(vma, addr, ptent, page); @@ -1300,6 +1302,7 @@ static 
unsigned long zap_pte_range(struct mmu_gather *tlb, } pte_clear_not_present_full(mm, addr, pte, tlb->fullmm); + reliable_page_counter(page, mm, -1); rss[mm_counter(page)]--; page_remove_rmap(page, false); put_page(page); @@ -1664,6 +1667,7 @@ static int insert_page_into_pte_locked(struct mm_struct *mm, pte_t *pte, /* Ok, finally just insert the thing.. */ get_page(page); inc_mm_counter_fast(mm, mm_counter_file(page)); + reliable_page_counter(page, mm, 1); page_add_file_rmap(page, false); set_pte_at(mm, addr, pte, mk_pte(page, prot)); return 0; @@ -2942,9 +2946,12 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf) mm_counter_file(old_page)); inc_mm_counter_fast(mm, MM_ANONPAGES); } + reliable_page_counter(old_page, mm, -1); } else { inc_mm_counter_fast(mm, MM_ANONPAGES); } + + reliable_page_counter(new_page, mm, 1); flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte)); entry = mk_pte(new_page, vma->vm_page_prot); entry = pte_sw_mkyoung(entry); @@ -3528,6 +3535,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) */ inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES); + reliable_page_counter(page, vma->vm_mm, 1); dec_mm_counter_fast(vma->vm_mm, MM_SWAPENTS); pte = mk_pte(page, vma->vm_page_prot); if ((vmf->flags & FAULT_FLAG_WRITE) && reuse_swap_page(page, NULL)) { @@ -3696,6 +3704,7 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf) } inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES); + reliable_page_counter(page, vma->vm_mm, 1); page_add_new_anon_rmap(page, vma, vmf->address, false); lru_cache_add_inactive_or_unevictable(page, vma); setpte: @@ -3890,6 +3899,7 @@ static vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page) entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); add_mm_counter(vma->vm_mm, mm_counter_file(page), HPAGE_PMD_NR); + reliable_page_counter(page, vma->vm_mm, HPAGE_PMD_NR); page_add_file_rmap(page, true); /* * deposit and withdraw with pmd lock held @@ -3962,6 +3972,7 @@ vm_fault_t alloc_set_pte(struct vm_fault *vmf, struct 
page *page) if (write) entry = maybe_mkwrite(pte_mkdirty(entry), vma); /* copy-on-write page */ + reliable_page_counter(page, vma->vm_mm, 1); if (write && !(vma->vm_flags & VM_SHARED)) { inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES); page_add_new_anon_rmap(page, vma, vmf->address, false); @@ -5443,6 +5454,7 @@ vm_fault_t do_anon_page_remap(struct vm_area_struct *vma, unsigned long address, if (ret) goto release; inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES); + reliable_page_counter(page, vma->vm_mm, 1); page_add_new_anon_rmap(page, vma, address, false); lru_cache_add_inactive_or_unevictable(page, vma); diff --git a/mm/migrate.c b/mm/migrate.c index 94210ddd3f2a..3f6c76b97989 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -269,6 +269,7 @@ static bool remove_migration_pte(struct page *page, struct vm_area_struct *vma, { set_pte_at(vma->vm_mm, pvmw.address, pvmw.pte, pte); + reliable_page_counter(new, vma->vm_mm, 1); if (PageAnon(new)) page_add_anon_rmap(new, vma, pvmw.address, false); else @@ -2212,6 +2213,7 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm, * new page and page_add_new_anon_rmap guarantee the copy is * visible before the pagetable update. */ + reliable_page_counter(new_page, vma->vm_mm, HPAGE_PMD_NR); page_add_anon_rmap(new_page, vma, start, true); /* * At this point the pmd is numa/protnone (i.e. non present) and the TLB @@ -2229,6 +2231,7 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm, page_ref_unfreeze(page, 2); mlock_migrate_page(new_page, page); + reliable_page_counter(page, vma->vm_mm, -HPAGE_PMD_NR); page_remove_rmap(page, true); set_page_owner_migrate_reason(new_page, MR_NUMA_MISPLACED); @@ -2473,6 +2476,7 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp, * drop page refcount. Page won't be freed, as we took * a reference just above. 
*/ + reliable_page_counter(page, mm, -1); page_remove_rmap(page, false); put_page(page); @@ -2967,6 +2971,7 @@ static void migrate_vma_insert_page(struct migrate_vma *migrate, goto unlock_abort; inc_mm_counter(mm, MM_ANONPAGES); + reliable_page_counter(page, mm, 1); page_add_new_anon_rmap(page, vma, addr, false); if (!is_zone_device_page(page)) lru_cache_add_inactive_or_unevictable(page, vma); diff --git a/mm/mmap.c b/mm/mmap.c index 7fba5d89ecde..624586bef419 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1747,6 +1747,7 @@ do_user_swap(struct mm_struct *mm, unsigned long addr_start, unsigned long len, set_pte(pte, swp_entry_to_pte(swp_entry(SWP_USERSWAP_ENTRY, page_to_pfn(page)))); dec_mm_counter(mm, MM_ANONPAGES); + reliable_page_counter(page, mm, -1); page_remove_rmap(page, false); put_page(page); diff --git a/mm/rmap.c b/mm/rmap.c index d0ef90af0567..3e12d26d8c55 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1594,6 +1594,7 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma, vma_mmu_pagesize(vma)); } else { dec_mm_counter(mm, mm_counter(page)); + reliable_page_counter(page, mm, -1); set_pte_at(mm, address, pvmw.pte, pteval); } @@ -1609,6 +1610,7 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma, * copied pages. 
*/ dec_mm_counter(mm, mm_counter(page)); + reliable_page_counter(page, mm, -1); /* We have to invalidate as we cleared the pte */ mmu_notifier_invalidate_range(mm, address, address + PAGE_SIZE); @@ -1688,6 +1690,7 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma, mmu_notifier_invalidate_range(mm, address, address + PAGE_SIZE); dec_mm_counter(mm, MM_ANONPAGES); + reliable_page_counter(page, mm, -1); goto discard; } @@ -1721,6 +1724,7 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma, spin_unlock(&mmlist_lock); } dec_mm_counter(mm, MM_ANONPAGES); + reliable_page_counter(page, mm, -1); inc_mm_counter(mm, MM_SWAPENTS); swp_pte = swp_entry_to_pte(entry); if (pte_soft_dirty(pteval)) @@ -1743,6 +1747,7 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma, * See Documentation/vm/mmu_notifier.rst */ dec_mm_counter(mm, mm_counter_file(page)); + reliable_page_counter(page, mm, -1); } discard: /* diff --git a/mm/shmem.c b/mm/shmem.c index fbddc7dfb72e..e85ac8c2150f 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2467,6 +2467,7 @@ static int shmem_mfill_atomic_pte(struct mm_struct *dst_mm, spin_unlock_irq(&info->lock); inc_mm_counter(dst_mm, mm_counter_file(page)); + reliable_page_counter(page, dst_mm, 1); page_add_file_rmap(page, false); set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte); diff --git a/mm/swapfile.c b/mm/swapfile.c index eaf483c7c83e..7faa30f460e4 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1935,6 +1935,8 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, get_page(page); set_pte_at(vma->vm_mm, addr, pte, pte_mkold(mk_pte(page, vma->vm_page_prot))); + + reliable_page_counter(page, vma->vm_mm, 1); if (page == swapcache) { page_add_anon_rmap(page, vma, addr, false); } else { /* ksm created a completely new copy */ diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index 3849b28c0952..15c46208a2ac 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -150,6 +150,7 @@ 
static int mcopy_atomic_pte(struct mm_struct *dst_mm, #endif inc_mm_counter(dst_mm, MM_ANONPAGES); + reliable_page_counter(page, dst_mm, 1); page_add_new_anon_rmap(page, dst_vma, dst_addr, false); lru_cache_add_inactive_or_unevictable(page, dst_vma); -- Gitee From b9b3aaade9ce3bb1abe795e3a82ad8a0a8359e45 Mon Sep 17 00:00:00 2001 From: Ma Wupeng Date: Fri, 11 Nov 2022 09:32:43 +0800 Subject: [PATCH 24/27] mm: Update reliable flag in memory allocation for reliable task only in task context hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4SK3S CVE: NA -------------------------------- Since an interrupt may occupy a reliable task's context, its current->flags will have PF_RELIABLE set, and this will lead to redirecting its memory allocation to the mirrored region. In order to solve this problem, updating a reliable task's gfp flag can only happen in normal task context, checked via in_task(). Signed-off-by: Ma Wupeng Reviewed-by: Kefeng Wang --- mm/page_alloc.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index a8abfe6458a5..2feb99a0b98f 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5186,6 +5186,9 @@ static inline void prepare_before_alloc(gfp_t *gfp_mask) if (!zone_movable) goto clear_flag; + if (!in_task()) + return; + if ((current->flags & PF_RELIABLE) || is_global_init(current)) *gfp_mask |= GFP_RELIABLE; -- Gitee From 8525dfb2d97677baa0781233a029a52c67ccb723 Mon Sep 17 00:00:00 2001 From: Ma Wupeng Date: Fri, 11 Nov 2022 09:32:44 +0800 Subject: [PATCH 25/27] mm/memblock: Introduce ability to alloc memory from a specified memory region hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4SK3S CVE: NA -------------------------------- With the mirrored feature enabled, memblock will prefer to alloc memory from mirrored memory in any case.
Since mirrored regions and non-mirrored regions may have different capacity or bandwidth, memblock users may want to choose which region to alloc memory from, rather than getting the mirrored one by default. To solve this problem, the flag MEMBLOCK_NOMIRROR is introduced to alloc memory from non-mirrored regions. Function memblock_alloc_range_nid_flags() is introduced to alloc memory with a specified flag, without fallback. Signed-off-by: Ma Wupeng Reviewed-by: Kefeng Wang --- include/linux/memblock.h | 5 ++ mm/memblock.c | 105 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 110 insertions(+) diff --git a/include/linux/memblock.h b/include/linux/memblock.h index 5b9b0239e34a..d6b6a93aa73e 100644 --- a/include/linux/memblock.h +++ b/include/linux/memblock.h @@ -38,6 +38,7 @@ enum memblock_flags { MEMBLOCK_MIRROR = 0x2, /* mirrored region */ MEMBLOCK_NOMAP = 0x4, /* don't add to kernel direct mapping */ MEMBLOCK_MEMMAP = 0x8, /* memmap reserved region */ + MEMBLOCK_NOMIRROR = 0x10, /* alloc from non-mirrored region */ }; /** @@ -410,6 +411,10 @@ void *memblock_alloc_try_nid_raw(phys_addr_t size, phys_addr_t align, void *memblock_alloc_try_nid(phys_addr_t size, phys_addr_t align, phys_addr_t min_addr, phys_addr_t max_addr, int nid); +void *memblock_alloc_try_nid_raw_flags(phys_addr_t size, phys_addr_t align, + phys_addr_t min_addr, + phys_addr_t max_addr, int nid, + enum memblock_flags flags); static inline void * __init memblock_alloc(phys_addr_t size, phys_addr_t align) { diff --git a/mm/memblock.c b/mm/memblock.c index 53e92fc7ef6f..e1fd07166a35 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -968,6 +968,10 @@ static bool should_skip_region(struct memblock_type *type, if ((flags & MEMBLOCK_MIRROR) && !memblock_is_mirror(m)) return true; + /* skip mirror memory regions with MEMBLOCK_NOMIRROR */ + if ((flags & MEMBLOCK_NOMIRROR) && memblock_is_mirror(m)) + return true; + /* skip nomap memory unless we were asked for it explicitly */ if (!(flags & MEMBLOCK_NOMAP) &&
memblock_is_nomap(m)) return true; @@ -1386,6 +1390,74 @@ phys_addr_t __init memblock_alloc_range_nid(phys_addr_t size, return found; } +/** + * memblock_alloc_range_nid_flags - allocate boot memory block with specified flags + * @size: size of memory block to be allocated in bytes + * @align: alignment of the region and block's size + * @start: the lower bound of the memory region to allocate (phys address) + * @end: the upper bound of the memory region to allocate (phys address) + * @nid: nid of the free area to find, %NUMA_NO_NODE for any node + * @exact_nid: control the allocation fall back to other nodes + * @flags: only allocate from memory regions with the specified flags + * + * The allocation is performed from memory region limited by + * memblock.current_limit if @end == %MEMBLOCK_ALLOC_ACCESSIBLE. + * + * If the specified node can not hold the requested memory and @exact_nid + * is false, the allocation falls back to any node in the system. + * + * In addition, function sets the min_count to 0 using kmemleak_alloc_phys for + * allocated boot memory block, so that it is never reported as leaks. + * + * Return: + * Physical address of allocated memory block on success, %0 on failure. + */ +static phys_addr_t __init memblock_alloc_range_nid_flags( + phys_addr_t size, phys_addr_t align, phys_addr_t start, phys_addr_t end, + int nid, bool exact_nid, enum memblock_flags flags) +{ + phys_addr_t found; + + if (WARN_ONCE( + nid == MAX_NUMNODES, + "Usage of MAX_NUMNODES is deprecated.
Use NUMA_NO_NODE instead\n")) + nid = NUMA_NO_NODE; + + if (!align) { + /* Can't use WARNs this early in boot on powerpc */ + dump_stack(); + align = SMP_CACHE_BYTES; + } + + found = memblock_find_in_range_node(size, align, start, end, nid, + flags); + if (found && !memblock_reserve(found, size)) + goto done; + + if (nid != NUMA_NO_NODE && !exact_nid) { + found = memblock_find_in_range_node(size, align, start, + end, NUMA_NO_NODE, + flags); + if (found && !memblock_reserve(found, size)) + goto done; + } + + return 0; + +done: + /* Skip kmemleak for kasan_init() due to high volume. */ + if (end != MEMBLOCK_ALLOC_KASAN) + /* + * The min_count is set to 0 so that memblock allocated + * blocks are never reported as leaks. This is because many + * of these blocks are only referred via the physical + * address which is not looked up by kmemleak. + */ + kmemleak_alloc_phys(found, size, 0, 0); + + return found; +} + /** * memblock_phys_alloc_range - allocate a memory block inside specified range * @size: size of memory block to be allocated in bytes @@ -1541,6 +1613,39 @@ void * __init memblock_alloc_try_nid_raw( false); } +void * __init memblock_alloc_try_nid_raw_flags( + phys_addr_t size, phys_addr_t align, + phys_addr_t min_addr, phys_addr_t max_addr, + int nid, enum memblock_flags flags) +{ + phys_addr_t alloc; + void *ptr; + + memblock_dbg("%s: %llu bytes align=0x%llx nid=%d from=%pa max_addr=%pa %pS\n", + __func__, (u64)size, (u64)align, nid, &min_addr, + &max_addr, (void *)_RET_IP_); + + if (max_addr > memblock.current_limit) + max_addr = memblock.current_limit; + + alloc = memblock_alloc_range_nid_flags(size, align, min_addr, max_addr, + nid, false, flags); + + /* retry allocation without lower limit */ + if (!alloc && min_addr) + alloc = memblock_alloc_range_nid_flags(size, align, 0, max_addr, + nid, false, flags); + + if (!alloc) + return NULL; + + ptr = phys_to_virt(alloc); + if (ptr && size > 0) + page_init_poison(ptr, size); + + return ptr; +} + /** * 
memblock_alloc_try_nid - allocate boot memory block * @size: size of memory block to be allocated in bytes -- Gitee From 74bfdf157f1fd40b5d7966d1d94fcd690ad93a3e Mon Sep 17 00:00:00 2001 From: Ma Wupeng Date: Fri, 11 Nov 2022 09:32:45 +0800 Subject: [PATCH 26/27] mm/hugetlb: Hugetlb use non-mirrored memory if memory reliable is enabled hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4SK3S CVE: NA -------------------------------- Previous memory allocation in memblock for hugetlb may use mirrored or non-mirrored memory depends on the system's memory status. However this is not suitable if hugetlb user want to alloc memory from non-mirrored memory if memory reliable is enabled. In order to solve this problem, hugetlb use MEMBLOCK_NOMIRROR flag to alloc memory from non-mirrored region without fallback to mirrored region. Signed-off-by: Ma Wupeng Reviewed-by: Kefeng Wang --- mm/hugetlb.c | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index f5f8227b090b..d687f271901e 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2697,6 +2697,20 @@ struct page *alloc_huge_page(struct vm_area_struct *vma, return ERR_PTR(-ENOSPC); } +static void *__init __alloc_bootmem_huge_page_inner(phys_addr_t size, + phys_addr_t align, + phys_addr_t min_addr, + phys_addr_t max_addr, + int nid) +{ + if (!mem_reliable_is_enabled()) + return memblock_alloc_try_nid_raw(size, align, max_addr, + max_addr, nid); + + return memblock_alloc_try_nid_raw_flags(size, align, max_addr, max_addr, + nid, MEMBLOCK_NOMIRROR); +} + int alloc_bootmem_huge_page(struct hstate *h, int nid) __attribute__ ((weak, alias("__alloc_bootmem_huge_page"))); int __alloc_bootmem_huge_page(struct hstate *h, int nid) @@ -2712,7 +2726,7 @@ int __alloc_bootmem_huge_page(struct hstate *h, int nid) /* do node specific alloc */ if (nid != NUMA_NO_NODE) { - m = memblock_alloc_try_nid_raw(huge_page_size(h), huge_page_size(h), + m = 
__alloc_bootmem_huge_page_inner(huge_page_size(h), huge_page_size(h), 0, MEMBLOCK_ALLOC_ACCESSIBLE, nid); if (!m) return 0; @@ -2720,7 +2734,7 @@ int __alloc_bootmem_huge_page(struct hstate *h, int nid) } /* allocate from next node when distributing huge pages */ for_each_node_mask_to_alloc(h, nr_nodes, node, &node_states[N_MEMORY]) { - m = memblock_alloc_try_nid_raw( + m = __alloc_bootmem_huge_page_inner( huge_page_size(h), huge_page_size(h), 0, MEMBLOCK_ALLOC_ACCESSIBLE, node); /* -- Gitee From 426b9efea56f9b065151d165caab2ebd32906505 Mon Sep 17 00:00:00 2001 From: Yu Liao Date: Fri, 11 Nov 2022 09:32:46 +0800 Subject: [PATCH 27/27] mm: Add sysctl to clear free list pages hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4SK3S CVE: NA -------------------------------- This patch add sysctl to clear pages in free lists of each NUMA node. For each NUMA node, clear each page in the free list, these work is scheduled on a random CPU of the NUMA node. When kasan is enabled and the pages are free, the shadow memory will be filled with 0xFF, writing these free pages will cause UAF, so just disable KASAN for clear freelist. In the case of large memory, the clear freelist will hold zone lock for a long time. As a result, the process may be blocked unless clear freelist thread exit, and causing the system to be reset by the watchdog. Provide a mechanism to stop clear freelist threads when elapsed time exceeds cfp_timeout, which can be set by module_param(). 
Signed-off-by: Yu Liao Reviewed-by: Kefeng Wang --- .../admin-guide/kernel-parameters.txt | 4 + Documentation/admin-guide/sysctl/vm.rst | 13 ++ mm/Kconfig | 13 ++ mm/Makefile | 2 + mm/clear_freelist_page.c | 187 ++++++++++++++++++ 5 files changed, 219 insertions(+) create mode 100644 mm/clear_freelist_page.c diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 71d45c34858f..0ec4c66752af 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -539,6 +539,10 @@ cio_ignore= [S390] See Documentation/s390/common_io.rst for details. + + clear_freelist + Enable the clear_freelist feature. + clk_ignore_unused [CLK] Prevents the clock framework from automatically gating diff --git a/Documentation/admin-guide/sysctl/vm.rst b/Documentation/admin-guide/sysctl/vm.rst index eb227015a895..a84bef7aa864 100644 --- a/Documentation/admin-guide/sysctl/vm.rst +++ b/Documentation/admin-guide/sysctl/vm.rst @@ -25,6 +25,7 @@ files can be found in mm/swap.c. Currently, these files are in /proc/sys/vm: - admin_reserve_kbytes +- clear_freelist_pages - compact_memory - compaction_proactiveness - compact_unevictable_allowed @@ -109,6 +110,18 @@ On x86_64 this is about 128MB. Changing this takes effect whenever an application requests memory. +clear_freelist_pages +==================== + +Available only when CONFIG_CLEAR_FREELIST_PAGE is set. When 1 is written to the +file, all pages in free lists will be written with 0. + +The zone lock is held during clear_freelist_pages; if the execution time is too +long, RCU CPU stall warnings will be printed. For each NUMA node, +clear_freelist_pages is performed on a "random" CPU of the NUMA node. +The time consumed depends on the hardware.
+ + compact_memory ============== diff --git a/mm/Kconfig b/mm/Kconfig index 5e1175da720e..be7fd4ed2c4f 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -980,6 +980,19 @@ config MEMORY_RELIABLE To enable this function, mirrored memory is needed and "kernelcore=reliable" need to be added in kernel parameters. +config CLEAR_FREELIST_PAGE + bool "Support for clear free list pages" + depends on MMU && SYSCTL + default n + help + Say y here to enable the clear free list pages feature. When + writing to clear_freelist, trigger to clean up the free memory + of the buddy system. + + To enable this feature, kernel parameter "clear_freelist" also + needs to be added. + + source "mm/damon/Kconfig" endmenu diff --git a/mm/Makefile b/mm/Makefile index 9798d8735cc7..696ee59c2ac7 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -7,6 +7,7 @@ KASAN_SANITIZE_slab_common.o := n KASAN_SANITIZE_slab.o := n KASAN_SANITIZE_slub.o := n KCSAN_SANITIZE_kmemleak.o := n +KASAN_SANITIZE_clear_freelist_page.o := n # These produce frequent data race reports: most of them are due to races on # the same word but accesses to different bits of that word. Re-enable KCSAN @@ -130,3 +131,4 @@ obj-$(CONFIG_ASCEND_SHARE_POOL) += share_pool.o obj-$(CONFIG_MEMORY_RELIABLE) += mem_reliable.o obj-$(CONFIG_MEMCG_MEMFS_INFO) += memcg_memfs_info.o obj-$(CONFIG_PAGE_CACHE_LIMIT) += page_cache_limit.o +obj-$(CONFIG_CLEAR_FREELIST_PAGE) += clear_freelist_page.o diff --git a/mm/clear_freelist_page.c b/mm/clear_freelist_page.c new file mode 100644 index 000000000000..50b7ec918bfb --- /dev/null +++ b/mm/clear_freelist_page.c @@ -0,0 +1,187 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Support for clear free list pages. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define CFP_DEFAULT_TIMEOUT 2000 +#define for_each_populated_zone_pgdat(pgdat, zone) \ + for (zone = pgdat->node_zones; \ + zone; \ + zone = next_pgdat_zone(zone)) \ + if (!populated_zone(zone)) \ + ; /* do nothing */ \ + else + +struct pgdat_entry { + struct pglist_data *pgdat; + struct work_struct work; +}; + +static DECLARE_WAIT_QUEUE_HEAD(clear_freelist_wait); +static DEFINE_MUTEX(clear_freelist_lock); +static atomic_t clear_freelist_workers; +static atomic_t clear_pages_num; +static ulong cfp_timeout_ms = CFP_DEFAULT_TIMEOUT; + +/* + * next_pgdat_zone - helper magic for for_each_populated_zone_pgdat() + */ +static struct zone *next_pgdat_zone(struct zone *zone) +{ + pg_data_t *pgdat = zone->zone_pgdat; + + if (zone < pgdat->node_zones + MAX_NR_ZONES - 1) + zone++; + else + zone = NULL; + return zone; +} + +static void clear_pgdat_freelist_pages(struct work_struct *work) +{ + struct pgdat_entry *entry = container_of(work, struct pgdat_entry, work); + u64 cfp_timeout_ns = cfp_timeout_ms * NSEC_PER_MSEC; + struct pglist_data *pgdat = entry->pgdat; + unsigned long flags, order, t; + struct page *page; + struct zone *zone; + u64 start, now; + + start = sched_clock(); + + for_each_populated_zone_pgdat(pgdat, zone) { + spin_lock_irqsave(&zone->lock, flags); + for_each_migratetype_order(order, t) { + list_for_each_entry(page, &zone->free_area[order].free_list[t], lru) { + now = sched_clock(); + if (unlikely(now - start > cfp_timeout_ns)) { + spin_unlock_irqrestore(&zone->lock, flags); + goto out; + } + +#ifdef CONFIG_KMAP_LOCAL + int i; + + /* Clear highmem by clear_highpage() */ + for (i = 0; i < (1 << order); i++) + clear_highpage(page + i); +#else + memset(page_address(page), 0, (1 << order) * PAGE_SIZE); +#endif + touch_nmi_watchdog(); + atomic_add(1 << order, &clear_pages_num); + } + } + spin_unlock_irqrestore(&zone->lock, flags); + + 
cond_resched(); + } + +out: + kfree(entry); + + if (atomic_dec_and_test(&clear_freelist_workers)) + wake_up(&clear_freelist_wait); +} + +static void init_clear_freelist_work(struct pglist_data *pgdat) +{ + struct pgdat_entry *entry; + + entry = kzalloc(sizeof(struct pgdat_entry), GFP_KERNEL); + if (!entry) + return; + + entry->pgdat = pgdat; + INIT_WORK(&entry->work, clear_pgdat_freelist_pages); + queue_work_node(pgdat->node_id, system_unbound_wq, &entry->work); +} + +static void clear_freelist_pages(void) +{ + struct pglist_data *pgdat; + + mutex_lock(&clear_freelist_lock); + drain_all_pages(NULL); + + for_each_online_pgdat(pgdat) { + atomic_inc(&clear_freelist_workers); + init_clear_freelist_work(pgdat); + } + + wait_event(clear_freelist_wait, atomic_read(&clear_freelist_workers) == 0); + + pr_debug("Cleared pages %d\nFree pages %lu\n", atomic_read(&clear_pages_num), + global_zone_page_state(NR_FREE_PAGES)); + atomic_set(&clear_pages_num, 0); + + mutex_unlock(&clear_freelist_lock); +} + +static int sysctl_clear_freelist_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + int ret; + int val; + + table->data = &val; + ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); + + if (!ret && write) + clear_freelist_pages(); + + return ret; +} + +static struct ctl_table clear_freelist_table[] = { + { + .procname = "clear_freelist_pages", + .data = NULL, + .maxlen = sizeof(int), + .mode = 0200, + .proc_handler = &sysctl_clear_freelist_handler, + .extra1 = SYSCTL_ONE, + .extra2 = SYSCTL_ONE, + }, + { } +}; + +static struct ctl_table sys_ctl_table[] = { + { + .procname = "vm", + .mode = 0555, + .child = clear_freelist_table, + }, + { } +}; + +static bool clear_freelist_enabled; +static int __init setup_clear_freelist(char *str) +{ + clear_freelist_enabled = true; + return 1; +} +__setup("clear_freelist", setup_clear_freelist); + +static int __init clear_freelist_init(void) +{ + if (clear_freelist_enabled) + 
register_sysctl_table(sys_ctl_table); + + return 0; +} +module_init(clear_freelist_init); +module_param(cfp_timeout_ms, ulong, 0644); -- Gitee