From d879715946a7c9c640f5824f9875bbf8025cec28 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=99=88=E6=9D=B0?= Date: Sun, 23 Feb 2025 16:08:35 +0800 Subject: [PATCH 1/2] dfx for memory MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 陈杰 --- drivers/of/of_reserved_mem.c | 90 +++++++++++++++++++++++++++ fs/proc/base.c | 4 ++ fs/proc/internal.h | 6 ++ include/linux/mm.h | 12 ++++ include/linux/mm_types.h | 3 + kernel/fork.c | 3 + mm/Kconfig | 25 ++++++++ mm/Makefile | 3 + mm/lmkd_dbg_trigger.c | 83 +++++++++++++++++++++++++ mm/lowmem_dbg.c | 110 ++++++++++++++++++++++++++++++++ mm/rss_threshold.c | 117 +++++++++++++++++++++++++++++++++++ 11 files changed, 456 insertions(+) create mode 100644 mm/lmkd_dbg_trigger.c create mode 100644 mm/lowmem_dbg.c create mode 100644 mm/rss_threshold.c diff --git a/drivers/of/of_reserved_mem.c b/drivers/of/of_reserved_mem.c index 959f1808c240..47592a458af5 100644 --- a/drivers/of/of_reserved_mem.c +++ b/drivers/of/of_reserved_mem.c @@ -23,12 +23,19 @@ #include #include #include +#include +#include #include "of_private.h" #define MAX_RESERVED_REGIONS 64 static struct reserved_mem reserved_mem[MAX_RESERVED_REGIONS]; static int reserved_mem_count; +#define DT_RESERVED_MEM "dt_reserved_mem" +static int dynamic_reserved_mem_count; +static const char *dynamic_reserved_mem_array[MAX_RESERVED_REGIONS]; +static int cma_reserved_mem_count; +static const char *cma_reserved_mem_array[MAX_RESERVED_REGIONS]; static int __init early_init_dt_alloc_reserved_memory_arch(phys_addr_t size, phys_addr_t align, phys_addr_t start, phys_addr_t end, bool nomap, @@ -73,6 +80,11 @@ void __init fdt_reserved_mem_save_node(unsigned long node, const char *uname, rmem->name = uname; rmem->base = base; rmem->size = size; + if ((of_get_flat_dt_prop(node, "reusable", NULL)) + && (cma_reserved_mem_count < MAX_RESERVED_REGIONS)) { + cma_reserved_mem_array[cma_reserved_mem_count] = uname; + cma_reserved_mem_count++; + } 
reserved_mem_count++;
 	return;
@@ -214,6 +226,11 @@ static int __init __reserved_mem_alloc_size(unsigned long node,
 		return -ENOMEM;
 	}
 
+	if (dynamic_reserved_mem_count < MAX_RESERVED_REGIONS) {
+		dynamic_reserved_mem_array[dynamic_reserved_mem_count] = uname;
+		dynamic_reserved_mem_count++;
+	}
+
 	*res_base = base;
 	*res_size = size;
 
@@ -514,3 +531,82 @@ struct reserved_mem *of_reserved_mem_lookup(struct device_node *np)
 	return NULL;
 }
 EXPORT_SYMBOL_GPL(of_reserved_mem_lookup);
+
+/*
+ * Print one row per saved reserved-memory region: address range, size in kB,
+ * "d" if the name was recorded by __reserved_mem_alloc_size() (else "s"),
+ * and "y" if the node carried the "reusable" property (else "n").
+ */
+static int dt_reserved_memory_debug_show(struct seq_file *m, void *private)
+{
+	struct reserved_mem *dt_reserved_mem = m->private;
+	struct reserved_mem *rmem = NULL;
+	int i = 0;
+	int j = 0;
+	int cma = 0;
+	int dynamic = 0;
+
+	seq_puts(m, " num [start .... end] [size]"
+		    " [d/s] [cma] [name]\n");
+
+	for (i = 0; i < reserved_mem_count; i++) {
+		cma = 0;
+		dynamic = 0;
+		rmem = &(dt_reserved_mem[i]);
+
+		/* find out dynamic reserved memory node */
+		for (j = 0; j < dynamic_reserved_mem_count; j++) {
+			if (!strcmp(rmem->name, dynamic_reserved_mem_array[j])) {
+				dynamic = 1;
+				break;
+			}
+		}
+
+		/* find out cma reserved memory node */
+		for (j = 0; j < cma_reserved_mem_count; j++) {
+			if (!strcmp(rmem->name, cma_reserved_mem_array[j])) {
+				cma = 1;
+				break;
+			}
+		}
+
+		seq_printf(m, "%4d: [0x%016llx..0x%016llx] %8llukB %8s %8s %8s\n",
+			   i,
+			   (unsigned long long)rmem->base,
+			   (unsigned long long)(rmem->base + rmem->size - 1),
+			   (unsigned long long)rmem->size / SZ_1K,
+			   (dynamic == 1) ? "d" : "s",
+			   (cma == 1) ? "y" : "n",
+			   rmem->name);
+	}
+
+	return 0;
+}
+
+static int dt_reserved_memory_debug_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, dt_reserved_memory_debug_show, inode->i_private);
+}
+
+static const struct file_operations dt_reserved_memory_debug_fops = {
+	.open = dt_reserved_memory_debug_open,
+	.read = seq_read,
+	.llseek = seq_lseek,
+	.release = single_release,
+};
+
+/* Create <debugfs>/dt_reserved_mem/dt_reserved_memory at boot. */
+static int __init dt_reserved_memory_init_debugfs(void)
+{
+	struct dentry *root = debugfs_create_dir(DT_RESERVED_MEM, NULL);
+
+	/* debugfs reports failure with an ERR_PTR(), so "!root" never fired */
+	if (IS_ERR(root))
+		return PTR_ERR(root);
+
+	debugfs_create_file("dt_reserved_memory", 0444, root, reserved_mem,
+			    &dt_reserved_memory_debug_fops);
+
+	return 0;
+}
+__initcall(dt_reserved_memory_init_debugfs);
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 4f45750de1db..fc985a556821 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -3509,6 +3509,10 @@ static const struct pid_entry tgid_base_stuff[] = {
 #endif
 #ifdef CONFIG_TIME_NS
 	REG("timens_offsets", S_IRUGO|S_IWUSR, proc_timens_offsets_operations),
+#endif
+#ifdef CONFIG_RSS_THRESHOLD
+	ONE("rss", S_IRUGO, proc_pid_rss),
+	REG("rss_threshold", S_IRUGO|S_IWUSR, proc_pid_rss_threshold_operations),
 #endif
 	REG("comm", S_IRUGO|S_IWUSR, proc_pid_set_comm_operations),
 #ifdef CONFIG_HAVE_ARCH_TRACEHOOK
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index 9a8f32f21ff5..5599e5ef0393 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -173,6 +173,12 @@ extern int pid_delete_dentry(const struct dentry *);
 extern int proc_pid_readdir(struct file *, struct dir_context *);
 struct dentry *proc_pid_lookup(struct dentry *, unsigned int);
 extern loff_t mem_lseek(struct file *, loff_t, int);
+#ifdef CONFIG_RSS_THRESHOLD
+extern int proc_pid_rss(struct seq_file *, struct pid_namespace *,
+			struct pid *, struct task_struct *);
+extern void listen_rss_threshold(struct mm_struct *mm);
+extern const struct file_operations proc_pid_rss_threshold_operations;
+#endif
 
 /* Lookups */
 typedef struct
dentry *instantiate_t(struct dentry *, diff --git a/include/linux/mm.h b/include/linux/mm.h index 6427828111d4..04588cad792d 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2579,10 +2579,18 @@ static inline unsigned long get_mm_counter(struct mm_struct *mm, int member) void mm_trace_rss_stat(struct mm_struct *mm, int member); +#ifdef CONFIG_RSS_THRESHOLD +void listen_rss_threshold(struct mm_struct *mm); +#endif + static inline void add_mm_counter(struct mm_struct *mm, int member, long value) { percpu_counter_add(&mm->rss_stat[member], value); +#ifdef CONFIG_RSS_THRESHOLD + listen_rss_threshold(mm); +#endif + mm_trace_rss_stat(mm, member); } @@ -2590,6 +2598,10 @@ static inline void inc_mm_counter(struct mm_struct *mm, int member) { percpu_counter_inc(&mm->rss_stat[member]); +#ifdef CONFIG_RSS_THRESHOLD + listen_rss_threshold(mm); +#endif + mm_trace_rss_stat(mm, member); } diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 2b2616008a49..98850f8ca34d 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -853,6 +853,9 @@ struct mm_struct { unsigned long hiwater_rss; /* High-watermark of RSS usage */ unsigned long hiwater_vm; /* High-water virtual memory usage */ +#ifdef CONFIG_RSS_THRESHOLD + unsigned long rss_threshold; /* A threshold monitor RSS */ +#endif unsigned long total_vm; /* Total pages mapped */ unsigned long locked_vm; /* Pages that have PG_mlocked set */ diff --git a/kernel/fork.c b/kernel/fork.c index f51acd75f595..0a4ae22ca7b3 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1274,6 +1274,9 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, { mt_init_flags(&mm->mm_mt, MM_MT_FLAGS); mt_set_external_lock(&mm->mm_mt, &mm->mmap_lock); +#ifdef CONFIG_RSS_THRESHOLD + mm->rss_threshold = 0; +#endif atomic_set(&mm->mm_users, 1); atomic_set(&mm->mm_count, 1); seqcount_init(&mm->write_protect_seq); diff --git a/mm/Kconfig b/mm/Kconfig index 38bc0ee5ea6d..812f9a39bb4c 100644 --- 
a/mm/Kconfig +++ b/mm/Kconfig @@ -1258,6 +1258,21 @@ config ANON_VMA_NAME area from being merged with adjacent virtual memory areas due to the difference in their name. +# +# For lmkd to trigger in-kernel lowmem info +# +config LOWMEM + bool "Low Memory Killer" + default y + help + Enables lowmem killer parameter tuning + +config LMKD_DBG + bool "Low Memory Killer Debug" + default y + help + print processes info when lmk happen per several seconds + config USERFAULTFD bool "Enable userfaultfd() system call" depends on MMU @@ -1352,4 +1367,14 @@ config PURGEABLE_ASHMEM source "mm/damon/Kconfig" +# +# Use rss_threshold to monitoring RSS +# +config RSS_THRESHOLD + bool "Enable /proc//rss and /proc//rss_threshold to monitoring RSS" + default y + depends on PROC_FS && MEMCG + help + Set a threshold to monitoring RSS in per pid + endmenu diff --git a/mm/Makefile b/mm/Makefile index f84d4b0f521d..08b9a0a37a21 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -71,6 +71,9 @@ ifdef CONFIG_MMU obj-$(CONFIG_ADVISE_SYSCALLS) += madvise.o endif +obj-$(CONFIG_LMKD_DBG) += lmkd_dbg_trigger.o +obj-$(CONFIG_LOWMEM) += lowmem_dbg.o +obj-$(CONFIG_RSS_THRESHOLD) += rss_threshold.o obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o swap_slots.o obj-$(CONFIG_ZSWAP) += zswap.o obj-$(CONFIG_HAS_DMA) += dmapool.o diff --git a/mm/lmkd_dbg_trigger.c b/mm/lmkd_dbg_trigger.c new file mode 100644 index 000000000000..bd97331e0e5a --- /dev/null +++ b/mm/lmkd_dbg_trigger.c @@ -0,0 +1,83 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * mm/lmkd_dbg_trigger.c + * + * Copyright (c) 2020-2022 Huawei Technologies Co., Ltd. 
+ */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#define PROC_NUMBUF 8 + +static int lmkd_oom_score_adj; +static atomic64_t lmkd_no_cma_cnt = ATOMIC64_INIT(0); + +static int lmkd_dbg_trigger_proc_show(struct seq_file *m, void *v) +{ + seq_printf(m, "lmkd_oom_score_adj: %d\n", lmkd_oom_score_adj); + seq_printf(m, "lmkd_no_cma_cnt: %lld\n", + atomic64_read(&lmkd_no_cma_cnt)); + return 0; +} + +static int lmkd_dbg_trigger_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, lmkd_dbg_trigger_proc_show, NULL); +} + +static ssize_t lmkd_dbg_trigger_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + char buffer[PROC_NUMBUF]; + int oom_score_adj; + int err; + + memset(buffer, 0, sizeof(buffer)); + if (count > sizeof(buffer) - 1) + count = sizeof(buffer) - 1; + if (copy_from_user(buffer, buf, count)) { + err = -EFAULT; + goto out; + } + + err = kstrtoint(strstrip(buffer), 0, &oom_score_adj); + if (err) + goto out; + + if (oom_score_adj < OOM_SCORE_ADJ_MIN || + oom_score_adj > OOM_SCORE_ADJ_MAX) { + err = -EINVAL; + goto out; + } + + lmkd_oom_score_adj = oom_score_adj; + lowmem_dbg(oom_score_adj); + +out: + return err < 0 ? err : count; +} + +static const struct proc_ops lmkd_dbg_trigger_proc_fops = { + .proc_open = lmkd_dbg_trigger_proc_open, + .proc_read = seq_read, + .proc_lseek = seq_lseek, + .proc_release = single_release, + .proc_write = lmkd_dbg_trigger_write, +}; + +static int __init proc_lmkd_dbg_trigger_init(void) +{ + proc_create("lmkd_dbg_trigger", 0660, NULL, + &lmkd_dbg_trigger_proc_fops); + return 0; +} + +fs_initcall(proc_lmkd_dbg_trigger_init); + diff --git a/mm/lowmem_dbg.c b/mm/lowmem_dbg.c new file mode 100644 index 000000000000..c5e4477a8d26 --- /dev/null +++ b/mm/lowmem_dbg.c @@ -0,0 +1,110 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * mm/lowmem_dbg.c + * + * Copyright (c) 2020-2022 Huawei Technologies Co., Ltd. 
+ */
+#define pr_fmt(fmt) "lowmem:" fmt
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include "internal.h"
+
+#define LMK_PRT_TSK_RSS 0
+#define LMK_INTERVAL 15
+
+/* SERVICE_ADJ(5) * OOM_SCORE_ADJ_MAX / -OOM_DISABLE */
+#define LMK_SERVICE_ADJ 1000
+
+static unsigned long long last_jiffs;
+static void lowmem_dump(struct work_struct *work);
+
+static DEFINE_MUTEX(lowmem_dump_mutex);
+/* Both work items run lowmem_dump(); it checks which one was queued. */
+static DECLARE_WORK(lowmem_dbg_wk, lowmem_dump);
+static DECLARE_WORK(lowmem_dbg_verbose_wk, lowmem_dump);
+
+/*
+ * Log one line per process that still has an mm. When !verbose, a task with
+ * non-zero adj <= LMK_SERVICE_ADJ and rss < LMK_PRT_TSK_RSS is skipped.
+ */
+static void tasks_dump(bool verbose)
+{
+	struct task_struct *p = NULL;
+	struct task_struct *task = NULL;
+	short tsk_oom_adj = 0;
+	unsigned long tsk_nr_ptes = 0;
+	char frozen_mark = ' ';
+
+	pr_info("[ pid ] uid tgid total_vm rss nptes swap adj s name\n");
+
+	rcu_read_lock();
+	for_each_process(p) {
+		task = find_lock_task_mm(p);
+		if (!task) {
+			/*
+			 * This is a kthread or all of p's threads have already
+			 * detached their mm's. There's no need to report
+			 * them; they can't be oom killed anyway.
+			 */
+			continue;
+		}
+
+		tsk_oom_adj = task->signal->oom_score_adj;
+		if (!verbose && tsk_oom_adj &&
+		    (tsk_oom_adj <= LMK_SERVICE_ADJ) &&
+		    (get_mm_rss(task->mm) < LMK_PRT_TSK_RSS)) {
+			task_unlock(task);
+			continue;
+		}
+
+		tsk_nr_ptes = mm_pgtables_bytes(task->mm);
+
+		frozen_mark = frozen(task) ? '*' : ' ';
+
+		pr_info("[%5d] %5u %5d %8lu %6lu %5lu %5lu %5hd %c %s%c\n",
+			task->pid, from_kuid(&init_user_ns, task_uid(task)),
+			task->tgid, task->mm->total_vm, get_mm_rss(task->mm),
+			tsk_nr_ptes,
+			get_mm_counter(task->mm, MM_SWAPENTS),
+			tsk_oom_adj,
+			task_state_to_char(task),
+			task->comm,
+			frozen_mark); /*lint !e1058*/
+		task_unlock(task);
+	}
+	rcu_read_unlock();
+}
+
+/* Work handler: show_mem() plus task dump; verbose unless lowmem_dbg_wk ran. */
+static void lowmem_dump(struct work_struct *work)
+{
+	bool verbose = (work == &lowmem_dbg_verbose_wk) ? true : false;
+
+	mutex_lock(&lowmem_dump_mutex);
+	show_mem();
+	tasks_dump(verbose);
+	mutex_unlock(&lowmem_dump_mutex);
+}
+
+/*
+ * adj 0: always queue the verbose dump; else at most one per LMK_INTERVAL s.
+ */
+void lowmem_dbg(short oom_score_adj)
+{
+	unsigned long long jiffs = get_jiffies_64();
+
+	if (oom_score_adj == 0) {
+		schedule_work(&lowmem_dbg_verbose_wk);
+	} else if (time_after64(jiffs, (last_jiffs + LMK_INTERVAL * HZ))) {
+		last_jiffs = get_jiffies_64();
+		schedule_work(&lowmem_dbg_wk);
+	}
+}
+
diff --git a/mm/rss_threshold.c b/mm/rss_threshold.c
new file mode 100644
index 000000000000..c9d915998dbd
--- /dev/null
+++ b/mm/rss_threshold.c
@@ -0,0 +1,117 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * mm/rss_threshold.c
+ *
+ * Copyright (c) 2022 Huawei Technologies Co., Ltd.
+ */
+#include
+#include
+#include
+#include "../fs/proc/internal.h"
+
+/* /proc/<pid>/rss: print the task's current RSS in KB (nothing without an mm) */
+int proc_pid_rss(struct seq_file *m, struct pid_namespace *ns,
+		 struct pid *pid, struct task_struct *task)
+{
+	struct mm_struct *mm = get_task_mm(task);
+
+	if (mm) {
+		seq_printf(m, "VMRss:%lu KB\n", get_mm_rss(mm) << (PAGE_SHIFT - 10));
+		mmput(mm);
+	}
+	return 0;
+}
+
+/*
+ * Called from add_mm_counter()/inc_mm_counter() and after a threshold write:
+ * warn (rate limited) when the mm's RSS in KB exceeds a non-zero rss_threshold.
+ */
+void listen_rss_threshold(struct mm_struct *mm)
+{
+	unsigned long threshold = READ_ONCE(mm->rss_threshold);
+	struct task_struct *owner;
+	unsigned long total_rss;
+
+	/* 0 means "no threshold"; skip summing the counters in that case */
+	if (likely(!threshold))
+		return;
+
+	total_rss = get_mm_rss(mm) << (PAGE_SHIFT - 10);
+	if (likely(total_rss <= threshold))
+		return;
+
+	rcu_read_lock();
+	owner = rcu_dereference(mm->owner);
+	if (owner)
+		pr_err_ratelimited("rss_threshold monitor:Pid:%d [%s] rss size:%lu KB is out of range:%lu KB\n",
+				   owner->pid, owner->comm, total_rss, threshold);
+	rcu_read_unlock();
+}
+
+/*
+ * /proc/<pid>/rss_threshold write: parse a threshold in KB (0 disables the
+ * check, values above total RAM are rejected) and store it in the task's mm.
+ */
+static ssize_t rss_threshold_write(struct file *file, const char __user *buf,
+				   size_t count, loff_t *ppos)
+{
+	struct inode *inode = file_inode(file);
+	struct task_struct *p;
+	struct mm_struct *mm = NULL;
+	unsigned long rss_threshold;
+	int err;
+
+	err = kstrtoul_from_user(buf, count, 0, &rss_threshold);
+	if (err < 0)
+		return err;
+
+	if (rss_threshold > (totalram_pages() << (PAGE_SHIFT - 10)))
+		return -EINVAL;
+
+	p = get_proc_task(inode);
+	if (!p)
+		return -ESRCH;
+
+	mm = get_task_mm(p);
+	if (mm) {
+		WRITE_ONCE(mm->rss_threshold, rss_threshold);
+		listen_rss_threshold(mm);
+		mmput(mm);
+	}
+
+	put_task_struct(p);
+
+	return count;
+}
+
+static int rss_threshold_show(struct seq_file *m, void *v)
+{
+	struct inode *inode = m->private;
+	struct task_struct *p;
+	struct mm_struct *mm = NULL;
+
+	p = get_proc_task(inode);
+	if (!p)
+		return -ESRCH;
+
+	mm = get_task_mm(p);
+	if (mm) {
+		seq_printf(m,
"Threshold:%lu KB\n", mm->rss_threshold); + mmput(mm); + } + put_task_struct(p); + + return 0; +} + +static int rss_threshold_open(struct inode *inode, struct file *filp) +{ + return single_open(filp, rss_threshold_show, inode); +} + +const struct file_operations proc_pid_rss_threshold_operations = { + .open = rss_threshold_open, + .read = seq_read, + .write = rss_threshold_write, + .llseek = seq_lseek, + .release = single_release, +}; -- Gitee From 5e0a9bfab96e7a772aa7e6b811edf621e55f629b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=99=88=E6=9D=B0?= Date: Sun, 23 Feb 2025 18:00:03 +0800 Subject: [PATCH 2/2] dfx for acct MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 陈杰 --- include/linux/reclaim_acct.h | 49 ++++++++ init/main.c | 6 + kernel/fork.c | 6 + mm/Kconfig | 7 ++ mm/Makefile | 1 + mm/internal.h | 25 ++++ mm/page_alloc.c | 15 +++ mm/reclaim_acct.c | 183 +++++++++++++++++++++++++++ mm/reclaimacct_show.c | 235 +++++++++++++++++++++++++++++++++++ mm/vmscan.c | 32 ++++- mm/zswapd.c | 5 +- 11 files changed, 561 insertions(+), 3 deletions(-) create mode 100644 include/linux/reclaim_acct.h create mode 100644 mm/reclaim_acct.c create mode 100644 mm/reclaimacct_show.c diff --git a/include/linux/reclaim_acct.h b/include/linux/reclaim_acct.h new file mode 100644 index 000000000000..e0da3cc48928 --- /dev/null +++ b/include/linux/reclaim_acct.h @@ -0,0 +1,49 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * include/linux/reclaim_acct.h + * + * Copyright (c) 2022 Huawei Technologies Co., Ltd. 
+ */
+
+#ifndef _RECLAIM_ACCT_H
+#define _RECLAIM_ACCT_H
+
+#include
+#include
+
+/* RA is the abbreviation of reclaim accounting */
+enum reclaimacct_stubs {
+	RA_RECLAIM = 0,		/* the whole reclaim pass */
+	RA_DRAINALLPAGES,
+	RA_SHRINKFILE,
+	RA_SHRINKANON,
+	RA_SHRINKSLAB,
+	NR_RA_STUBS
+};
+
+enum reclaim_type {
+	DIRECT_RECLAIMS = 0,
+	KSWAPD_RECLAIM,
+	ZSWAPD_RECLAIM,
+	RECLAIM_TYPES
+};
+
+#ifdef CONFIG_RECLAIM_ACCT
+static inline bool is_system_reclaim(enum reclaim_type type)
+{
+	return (type == KSWAPD_RECLAIM || type == ZSWAPD_RECLAIM);
+}
+
+void reclaimacct_tsk_init(struct task_struct *tsk);
+void reclaimacct_init(void);
+
+void reclaimacct_start(enum reclaim_type type, struct reclaim_acct *ra);
+void reclaimacct_end(enum reclaim_type type);
+
+void reclaimacct_substage_start(enum reclaimacct_stubs stub);
+void reclaimacct_substage_end(enum reclaimacct_stubs stub, unsigned long freed,
+	const struct shrinker *shrinker);
+#endif
+
+#endif /* _RECLAIM_ACCT_H */
+ 
\ No newline at end of file
diff --git a/init/main.c b/init/main.c
index c787e94cc898..75a435a88fb1 100644
--- a/init/main.c
+++ b/init/main.c
@@ -98,6 +98,9 @@
 #include
 #include
 #include
+#ifdef CONFIG_RECLAIM_ACCT
+#include
+#endif
 #include
 #include
 #include
@@ -1065,6 +1068,9 @@ void start_kernel(void)
 	cgroup_init();
 	taskstats_init_early();
 	delayacct_init();
+#ifdef CONFIG_RECLAIM_ACCT
+	reclaimacct_init();
+#endif
 
 	acpi_subsystem_init();
 	arch_post_acpi_subsys_init();
diff --git a/kernel/fork.c b/kernel/fork.c
index 0a4ae22ca7b3..1bdcbd4bf344 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -95,6 +95,9 @@
 #include
 #include
 #include
+#ifdef CONFIG_RECLAIM_ACCT
+#include
+#endif
 #include
 #include
 #include
@@ -2396,6 +2399,9 @@ __latent_entropy struct task_struct *copy_process(
 		goto bad_fork_cleanup_count;
 
 	delayacct_tsk_init(p); /* Must remain after dup_task_struct() */
+#ifdef CONFIG_RECLAIM_ACCT
+	reclaimacct_tsk_init(p);
+#endif
 	p->flags &= ~(PF_SUPERPRIV | PF_WQ_WORKER | PF_IDLE | PF_NO_SETAFFINITY);
 	p->flags |= PF_FORKNOEXEC;
 	INIT_LIST_HEAD(&p->children);
diff --git a/mm/Kconfig b/mm/Kconfig
index 812f9a39bb4c..cbf3f55f78d1 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -1377,4 +1377,11 @@ config RSS_THRESHOLD
 	help
 	  Set a threshold to monitoring RSS in per pid
 
+config RECLAIM_ACCT
+	bool "Memory reclaim delay accounting"
+	default y
+	help
+	  Memory reclaim delay accounting. Never use it as a kernel module.
+
+
 endmenu
diff --git a/mm/Makefile b/mm/Makefile
index 08b9a0a37a21..262006a4e9b9 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -147,3 +147,4 @@ obj-$(CONFIG_HYPERHOLD_ZSWAPD) += zswapd.o zswapd_control.o
 obj-$(CONFIG_MEM_PURGEABLE) += purgeable.o
 obj-$(CONFIG_PURGEABLE_ASHMEM) += purgeable_ashmem_trigger.o
 obj-$(CONFIG_MEMORY_MONITOR) += memory_monitor.o
+obj-$(CONFIG_RECLAIM_ACCT) += reclaim_acct.o reclaimacct_show.o
diff --git a/mm/internal.h b/mm/internal.h
index 848d33206a72..1d524d4d5227 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -1371,3 +1371,28 @@ struct vma_prepare {
 	struct vm_area_struct *remove2;
 };
+
+#ifdef CONFIG_RECLAIM_ACCT /* must stay inside the __MM_INTERNAL_H guard */
+#define DELAY_LV0 5000000 /* 5ms */
+#define DELAY_LV1 10000000 /* 10ms */
+#define DELAY_LV2 50000000 /* 50ms */
+#define DELAY_LV3 100000000 /* 100ms */
+#define DELAY_LV4 2000000000 /* 2000ms */
+#define DELAY_LV5 50000000000 /* 50000ms */
+#define NR_DELAY_LV 6
+
+struct reclaim_acct {
+	u64 start[NR_RA_STUBS];		/* ktime_get_ns() at stage start */
+	u64 delay[NR_RA_STUBS];		/* accumulated stage time, ns */
+	u64 count[NR_RA_STUBS];		/* number of accounted stage runs */
+	u64 freed[NR_RA_STUBS];		/* accumulated "freed" per stage */
+	unsigned int reclaim_type;	/* enum reclaim_type of this pass */
+};
+
+bool reclaimacct_initialize_show_data(void);
+void reclaimacct_reinitialize_show_data(void);
+void reclaimacct_destroy_show_data(void);
+
+void reclaimacct_collect_data(void);
+void reclaimacct_collect_reclaim_efficiency(void);
+#endif /* CONFIG_RECLAIM_ACCT */
 #endif /* __MM_INTERNAL_H */
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 256ef56f8c92..f6e4e51b8a16 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -3714,7 +3714,13 @@ __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
 	 */
 	if (!page &&
!drained) { unreserve_highatomic_pageblock(ac, false); +#ifdef CONFIG_RECLAIM_ACCT + reclaimacct_substage_start(RA_DRAINALLPAGES); +#endif drain_all_pages(NULL); +#ifdef CONFIG_RECLAIM_ACCT + reclaimacct_substage_end(RA_DRAINALLPAGES, 0, NULL); +#endif drained = true; goto retry; } @@ -3966,6 +3972,9 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, unsigned int cpuset_mems_cookie; unsigned int zonelist_iter_cookie; int reserve_flags; +#ifdef CONFIG_RECLAIM_ACCT + struct reclaim_acct ra = {0}; +#endif restart: compaction_retries = 0; @@ -4106,8 +4115,14 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, goto nopage; /* Try direct reclaim and then allocating */ +#ifdef CONFIG_RECLAIM_ACCT + reclaimacct_start(DIRECT_RECLAIMS, &ra); +#endif page = __alloc_pages_direct_reclaim(gfp_mask, order, alloc_flags, ac, &did_some_progress); +#ifdef CONFIG_RECLAIM_ACCT + reclaimacct_end(DIRECT_RECLAIMS); +#endif if (page) goto got_pg; diff --git a/mm/reclaim_acct.c b/mm/reclaim_acct.c new file mode 100644 index 000000000000..fe998ca647e5 --- /dev/null +++ b/mm/reclaim_acct.c @@ -0,0 +1,183 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * mm/reclaim_acct.c + * + * Copyright (c) 2022 Huawei Technologies Co., Ltd. 
+ */
+
+#include
+#include
+
+#include "internal.h"
+
+const char *stub_name[NR_RA_STUBS] = {
+	"direct_reclaim",
+	"drain_all_pages",
+	"shrink_file_list",
+	"shrink_anon_list",
+	"shrink_slab",
+};
+
+/* Once initialized, the variable should never be changed */
+static bool reclaimacct_is_off = true;
+static int reclaimacct_disable = 1;
+
+static void reclaimacct_free(struct reclaim_acct *ra, enum reclaim_type type)
+{
+	memset(ra, 0, sizeof(struct reclaim_acct));
+}
+
+static void __reclaimacct_end(struct reclaim_acct *ra, u64 freed,
+	enum reclaimacct_stubs stub, const struct shrinker *shrinker)
+{
+	u64 now, delay, start;
+
+	start = ra->start[stub];
+	now = ktime_get_ns();
+	if (now < start)	/* start lies in the future: drop the sample */
+		return;
+
+	delay = now - start;
+	if (delay < DELAY_LV5 || is_system_reclaim(ra->reclaim_type)) {
+		ra->delay[stub] += delay;
+		ra->count[stub]++;
+		ra->freed[stub] += freed;
+	}
+
+	if (delay > DELAY_LV4 && delay < DELAY_LV5) {
+		pr_warn_ratelimited("%s timeout:%llu\n", stub_name[stub], delay);
+		if (shrinker)	/* %ps prints the scan callback's symbol name */
+			pr_warn_ratelimited("shrinker = %ps\n", shrinker->scan_objects);
+	}
+}
+
+/* Reinitialize in case parent's non-null pointer was duped */
+void reclaimacct_tsk_init(struct task_struct *tsk)
+{
+	if (tsk)
+		tsk->reclaim_acct = NULL;
+}
+
+/* Boot-time setup: clear init_task's reclaim accounting pointer */
+void reclaimacct_init(void)
+{
+	reclaimacct_tsk_init(&init_task);
+}
+
+void reclaimacct_substage_start(enum reclaimacct_stubs stub)
+{
+	if (!current->reclaim_acct)
+		return;
+
+	current->reclaim_acct->start[stub] = ktime_get_ns();
+}
+
+void reclaimacct_substage_end(enum reclaimacct_stubs stub, unsigned long freed,
+	const struct shrinker *shrinker)
+{
+	if (!current->reclaim_acct)
+		return;
+
+	__reclaimacct_end(current->reclaim_acct, freed, stub, shrinker);
+}
+
+static void reclaimacct_directreclaim_end(struct reclaim_acct *ra)
+{
+	int i;
+
+	if (ra->delay[RA_RECLAIM] > DELAY_LV4) {
+		pr_warn_ratelimited("Summary");
+		for (i = 0; i < NR_RA_STUBS; i++)
+			pr_warn_ratelimited(" %s=%llu %llu", stub_name[i],
+ ra->delay[i], ra->count[i]); + pr_warn_ratelimited("\n"); + } + + reclaimacct_collect_data(); + reclaimacct_free(ra, ra->reclaim_type); + } + + static void reclaimacct_system_reclaim_end(struct reclaim_acct *ra) + { + reclaimacct_free(ra, ra->reclaim_type); + } + + void reclaimacct_start(enum reclaim_type type, struct reclaim_acct *ra) + { + if (reclaimacct_disable || reclaimacct_is_off) + return; + + if (!current->reclaim_acct) + current->reclaim_acct = ra; + + current->reclaim_acct->reclaim_type = type; + current->reclaim_acct->start[RA_RECLAIM] = ktime_get_ns(); + } + + void reclaimacct_end(enum reclaim_type type) + { + if (!current->reclaim_acct) + return; + + __reclaimacct_end(current->reclaim_acct, 0, RA_RECLAIM, NULL); + + reclaimacct_collect_reclaim_efficiency(); + + if (is_system_reclaim(type)) + reclaimacct_system_reclaim_end(current->reclaim_acct); + else + reclaimacct_directreclaim_end(current->reclaim_acct); + + current->reclaim_acct = NULL; + } + + /* Reclaim accounting module initialize */ + static int reclaimacct_init_handle(void *p) + { + if (!reclaimacct_initialize_show_data()) + goto alloc_show_failed; + + reclaimacct_is_off = false; + pr_info("enabled\n"); + return 0; + + alloc_show_failed: + reclaimacct_is_off = true; + pr_err("disabled\n"); + return 0; + } + + static int __init reclaimacct_module_init(void) + { + struct task_struct *task = NULL; + + task = kthread_run(reclaimacct_init_handle, NULL, "reclaimacct_init"); + if (IS_ERR(task)) + pr_err("run reclaimacct_init failed\n"); + else + pr_info("run reclaimacct_init successfully\n"); + return 0; + } + + late_initcall(reclaimacct_module_init); + + static int reclaimacct_disable_set(const char *val, const struct kernel_param *kp) + { + int ret; + + ret = param_set_int(val, kp); + if (ret) + return ret; + + if (!reclaimacct_disable) + reclaimacct_reinitialize_show_data(); + return 0; + } + + static const struct kernel_param_ops reclaimacct_disable_ops = { + .set = reclaimacct_disable_set, + 
.get = param_get_int,
+};
+
+module_param_cb(disable, &reclaimacct_disable_ops, &reclaimacct_disable, 0644);
+ 
\ No newline at end of file
diff --git a/mm/reclaimacct_show.c b/mm/reclaimacct_show.c
new file mode 100644
index 000000000000..df777928daa1
--- /dev/null
+++ b/mm/reclaimacct_show.c
@@ -0,0 +1,249 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * mm/reclaimacct_show.c
+ *
+ * Copyright (c) 2022 Huawei Technologies Co., Ltd.
+ */
+
+#include
+#include
+#include
+#include
+#include
+
+#include "internal.h"
+
+/* Store reclaim accounting data */
+static struct reclaimacct_show {
+	u64 delay[NR_DELAY_LV][NR_RA_STUBS];
+	u64 count[NR_DELAY_LV][NR_RA_STUBS];
+	u64 max_delay;
+	u64 max_delay_time;
+} *ra_show;
+static DEFINE_SPINLOCK(ra_show_lock);
+
+static struct reclaim_efficiency {
+	u64 time[NR_RA_STUBS];
+	u64 freed[NR_RA_STUBS];
+} *ra_eff;
+static DEFINE_SPINLOCK(ra_eff_lock);
+
+/* Allocate both zeroed tables; returns false (nothing allocated) on failure. */
+bool reclaimacct_initialize_show_data(void)
+{
+	ra_show = kzalloc(sizeof(*ra_show), GFP_KERNEL);
+	if (!ra_show)
+		goto fail_show;
+
+	/* kcalloc() checks the count * size multiplication for overflow */
+	ra_eff = kcalloc(RECLAIM_TYPES, sizeof(*ra_eff), GFP_KERNEL);
+	if (!ra_eff)
+		goto fail_eff;
+	return true;
+
+fail_eff:
+	kfree(ra_show);
+	ra_show = NULL;
+
+fail_show:
+	return false;
+}
+
+/* Reset the collected statistics without freeing the tables. */
+void reclaimacct_reinitialize_show_data(void)
+{
+	if (ra_show)
+		memset(ra_show, 0, sizeof(struct reclaimacct_show));
+
+	if (ra_eff)
+		memset(ra_eff, 0, sizeof(struct reclaim_efficiency) * RECLAIM_TYPES);
+}
+
+/* Free both tables and clear the pointers. */
+void reclaimacct_destroy_show_data(void)
+{
+	kfree(ra_show);
+	ra_show = NULL;
+
+	kfree(ra_eff);
+	ra_eff = NULL;
+}
+
+/* Add @ra's per-stage delay/count to bucket @level and track the max delay. */
+static void __reclaimacct_collect_data(int level, struct reclaim_acct *ra)
+{
+	int i;
+
+	spin_lock(&ra_show_lock);
+	for (i = 0; i < NR_RA_STUBS; i++) {
+		ra_show->delay[level][i] += ra->delay[i];
+		ra_show->count[level][i] += ra->count[i];
+	}
+
+	if (ra->delay[RA_RECLAIM] > ra_show->max_delay) {
+		ra_show->max_delay = ra->delay[RA_RECLAIM];
+		ra_show->max_delay_time = sched_clock();
+	}
+	spin_unlock(&ra_show_lock);
+}
+
+/*
+ * File current's accounting data under the first DELAY_LVx bucket whose bound
+ * exceeds the total reclaim delay; delays >= DELAY_LV5 are not recorded.
+ */
+void reclaimacct_collect_data(void)
+{
+	int i;
+	const u64 delay[NR_DELAY_LV] = {
+		DELAY_LV0, DELAY_LV1, DELAY_LV2, DELAY_LV3, DELAY_LV4, DELAY_LV5
+	};
+
+	if (!ra_show || !current->reclaim_acct)
+		return;
+
+	for (i = 0; i < NR_DELAY_LV; i++) {
+		if (current->reclaim_acct->delay[RA_RECLAIM] < delay[i]) {
+			__reclaimacct_collect_data(i, current->reclaim_acct);
+			break;
+		}
+	}
+}
+
+/* /proc/reclaimacct: per-stage delay (ms) and count for each delay bucket. */
+static int reclaimacct_proc_show(struct seq_file *m, void *v)
+{
+	int i, j;
+	struct reclaimacct_show show;
+	const char *stub_name[NR_RA_STUBS] = {
+		"direct_reclaim",
+		"drain_all_pages",
+		"shrink_file_list",
+		"shrink_anon_list",
+		"shrink_slab",
+	};
+
+	if (!ra_show)
+		return 0;
+
+	spin_lock(&ra_show_lock);
+	memcpy(&show, ra_show, sizeof(struct reclaimacct_show));
+	spin_unlock(&ra_show_lock);
+
+	seq_puts(m, "watch_point(unit:ms/-)\t\t0-5ms\t\t5-10ms\t\t");
+	seq_puts(m, "10-50ms\t\t50-100ms\t100-2000ms\t2000-50000ms\n");
+	for (i = 0; i < NR_RA_STUBS; i++) {
+		seq_printf(m, "%s_delay\t\t", stub_name[i]);
+		for (j = 0; j < NR_DELAY_LV; j++)
+			seq_printf(m, "%-15llu ", div_u64(show.delay[j][i], NSEC_PER_MSEC));
+		seq_puts(m, "\n");
+
+		seq_printf(m, "%s_count\t\t", stub_name[i]);
+		for (j = 0; j < NR_DELAY_LV; j++)
+			seq_printf(m, "%-15llu ", show.count[j][i]);
+		seq_puts(m, "\n");
+	}
+	seq_printf(m, "Max delay: %llu\tHappened: %llu\n", show.max_delay, show.max_delay_time);
+
+	return 0;
+}
+
+static int reclaimacct_proc_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, reclaimacct_proc_show, NULL);
+}
+
+static const struct proc_ops reclaimacct_proc_fops = {
+	.proc_open = reclaimacct_proc_open,
+	.proc_read = seq_read,
+	.proc_lseek = seq_lseek,
+	.proc_release = single_release,
+};
+
+/* Fold @ra's per-stage time and freed counts into the totals for @type. */
+static void __reclaimacct_collect_reclaim_efficiency(
+	struct reclaim_acct *ra, enum reclaim_type type)
+{
+	int i;
+
+	ra->freed[RA_RECLAIM] = ra->freed[RA_SHRINKFILE] + ra->freed[RA_SHRINKANON];
+
+	/*
+	 * Always lock: kswapd is per node and the reader copies all slots.
+	 */
+	spin_lock(&ra_eff_lock);
+
+	for (i = 0; i < NR_RA_STUBS; i++) {
+		ra_eff[type].time[i] += ra->delay[i];
+		ra_eff[type].freed[i] += ra->freed[i];
+	}
+
+	spin_unlock(&ra_eff_lock);
+}
+
+/* Account the efficiency data of current's reclaim pass, if any. */
+void reclaimacct_collect_reclaim_efficiency(void)
+{
+	if (!ra_eff || !current->reclaim_acct)
+		return;
+
+	__reclaimacct_collect_reclaim_efficiency(current->reclaim_acct,
+		current->reclaim_acct->reclaim_type);
+}
+
+/* /proc/reclaim_efficiency: time (ms) and freed count per stage and type. */
+static int reclaim_efficiency_proc_show(struct seq_file *m, void *v)
+{
+	int i, j;
+	struct reclaim_efficiency eff[RECLAIM_TYPES];
+	const char *stage[NR_RA_STUBS] = {
+		"total_process",
+		"drain_pages ",
+		"shrink_file ",
+		"shrink_anon ",
+		"shrink_slab "
+	};
+	const char *type[RECLAIM_TYPES] = {
+		"direct reclaim",
+		"kswapd ",
+		"zswapd "
+	};
+
+	if (!ra_eff)
+		return 0;
+
+	spin_lock(&ra_eff_lock);
+	memcpy(&eff, ra_eff, sizeof(eff));
+	spin_unlock(&ra_eff_lock);
+
+	for (i = 0; i < RECLAIM_TYPES; i++) {
+		seq_printf(m, "%s time(ms) freed(page/obj)\n", type[i]);
+		for (j = 0; j < NR_RA_STUBS; j++)
+			seq_printf(m, "%s %-15llu %-15llu\n", stage[j],
+				div_u64(eff[i].time[j], NSEC_PER_MSEC),
+				eff[i].freed[j]);
+	}
+
+	return 0;
+}
+
+static int reclaim_efficiency_proc_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, reclaim_efficiency_proc_show, NULL);
+}
+
+static const struct proc_ops reclaim_effi_proc_fops = {
+	.proc_open = reclaim_efficiency_proc_open,
+	.proc_read = seq_read,
+	.proc_lseek = seq_lseek,
+	.proc_release = single_release,
+};
+
+/* Register /proc/reclaimacct and /proc/reclaim_efficiency. */
+static int __init proc_reclaimacct_init(void)
+{
+	proc_create("reclaimacct", 0440, NULL, &reclaimacct_proc_fops);
+	proc_create("reclaim_efficiency", 0440, NULL, &reclaim_effi_proc_fops);
+	return 0;
+}
+fs_initcall(proc_reclaimacct_init);
+ 
\ No newline at end of file
diff --git a/mm/vmscan.c b/mm/vmscan.c
index
94189499081f..6567a5ba5eb0 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -992,10 +992,16 @@ unsigned long shrink_slab(gfp_t gfp_mask, int nid, .memcg = memcg, }; +#ifdef CONFIG_RECLAIM_ACCT + reclaimacct_substage_start(RA_SHRINKSLAB); +#endif ret = do_shrink_slab(&sc, shrinker, priority); if (ret == SHRINK_EMPTY) ret = 0; freed += ret; +#ifdef CONFIG_RECLAIM_ACCT + reclaimacct_substage_end(RA_SHRINKSLAB, ret, shrinker); +#endif /* * Bail out if someone want to register a new shrinker to * prevent the registration from being stalled for long periods @@ -2797,15 +2803,31 @@ unsigned long reclaim_pages(struct list_head *folio_list) unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan, struct lruvec *lruvec, struct scan_control *sc) { +#ifdef CONFIG_RECLAIM_ACCT + unsigned long nr_reclaimed; + unsigned int stub; + + stub = is_file_lru(lru) ? RA_SHRINKFILE : RA_SHRINKANON; + reclaimacct_substage_start(stub); +#endif if (is_active_lru(lru)) { if (sc->may_deactivate & (1 << is_file_lru(lru))) shrink_active_list(nr_to_scan, lruvec, sc, lru); else sc->skipped_deactivate = 1; +#ifdef CONFIG_RECLAIM_ACCT + reclaimacct_substage_end(stub, 0, NULL); +#endif return 0; } +#ifdef CONFIG_RECLAIM_ACCT + nr_reclaimed = shrink_inactive_list(nr_to_scan, lruvec, sc, lru); + reclaimacct_substage_end(stub, nr_reclaimed, NULL); + return nr_reclaimed; +#else return shrink_inactive_list(nr_to_scan, lruvec, sc, lru); +#endif } /* @@ -7735,7 +7757,9 @@ static int kswapd(void *p) pg_data_t *pgdat = (pg_data_t *)p; struct task_struct *tsk = current; const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id); - +#ifdef CONFIG_RECLAIM_ACCT + struct reclaim_acct ra = {0}; +#endif if (!cpumask_empty(cpumask)) set_cpus_allowed_ptr(tsk, cpumask); @@ -7798,9 +7822,15 @@ static int kswapd(void *p) alloc_order); #ifdef CONFIG_MEMORY_MONITOR kswapd_monitor_wake_up_queue(); +#endif +#ifdef CONFIG_RECLAIM_ACCT + reclaimacct_start(KSWAPD_RECLAIM, &ra); #endif reclaim_order = 
balance_pgdat(pgdat, alloc_order, highest_zoneidx); +#ifdef CONFIG_RECLAIM_ACCT + reclaimacct_end(KSWAPD_RECLAIM); +#endif if (reclaim_order < alloc_order) goto kswapd_try_sleep; } diff --git a/mm/zswapd.c b/mm/zswapd.c index d80a00d9f1fd..52d79743632a 100644 --- a/mm/zswapd.c +++ b/mm/zswapd.c @@ -535,7 +535,6 @@ static unsigned long zswapd_shrink_list(enum lru_list lru, { #ifdef CONFIG_RECLAIM_ACCT unsigned long nr_reclaimed; - reclaimacct_substage_start(RA_SHRINKANON); #endif if (is_active_lru(lru)) { @@ -566,7 +565,9 @@ static void zswapd_shrink_anon_memcg(struct pglist_data *pgdat, unsigned long nr_to_scan; struct blk_plug plug; enum lru_list lru; - +#ifdef CONFIG_RECLAIM_ACCT + struct reclaim_acct ra = {0}; +#endif blk_start_plug(&plug); while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_ANON]) { -- Gitee