diff --git a/include/linux/sched.h b/include/linux/sched.h index 19a464e0abedef25081b40f2752c2e8ca689bd8e..b6a1a2e6ce75601a0eafc5b87015d7ec9424bf42 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -58,6 +58,9 @@ struct perf_event_context; struct pid_namespace; struct pipe_inode_info; struct rcu_node; +#ifdef CONFIG_RECLAIM_ACCT +struct reclaim_acct; +#endif struct reclaim_state; struct robust_list_head; struct root_domain; @@ -1458,6 +1461,10 @@ struct task_struct { struct task_delay_info *delays; #endif +#ifdef CONFIG_RECLAIM_ACCT + struct reclaim_acct *reclaim_acct; +#endif + #ifdef CONFIG_FAULT_INJECTION int make_it_fail; unsigned int fail_nth; diff --git a/init/main.c b/init/main.c index c787e94cc8982b6209bd542744663944e88cc021..20fe61afe1959018c25f2b30369b2e7294745661 100644 --- a/init/main.c +++ b/init/main.c @@ -101,6 +101,9 @@ #include #include #include +#ifdef CONFIG_RECLAIM_ACCT +#include +#endif #include #include @@ -1065,6 +1068,9 @@ void start_kernel(void) cgroup_init(); taskstats_init_early(); delayacct_init(); +#ifdef CONFIG_RECLAIM_ACCT + reclaimacct_init(); +#endif acpi_subsystem_init(); arch_post_acpi_subsys_init(); diff --git a/kernel/fork.c b/kernel/fork.c index f51acd75f59571ca0ff35adc2337189fb09f52a3..0a0cfffe05f8b75938887dc738b5cf4ea3ac8d7b 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -103,6 +103,10 @@ #ifdef CONFIG_MEM_PURGEABLE #include #endif +#ifdef CONFIG_RECLAIM_ACCT +#include +#endif + #include #include #include @@ -2393,6 +2397,9 @@ __latent_entropy struct task_struct *copy_process( goto bad_fork_cleanup_count; delayacct_tsk_init(p); /* Must remain after dup_task_struct() */ +#ifdef CONFIG_RECLAIM_ACCT + reclaimacct_tsk_init(p); +#endif p->flags &= ~(PF_SUPERPRIV | PF_WQ_WORKER | PF_IDLE | PF_NO_SETAFFINITY); p->flags |= PF_FORKNOEXEC; INIT_LIST_HEAD(&p->children); diff --git a/mm/Kconfig b/mm/Kconfig index 38bc0ee5ea6d0404a04df9235c2d25f6a453c736..11f2f83389e57cc2ed350c11130eabfb15bd0c40 100644 --- a/mm/Kconfig 
+++ b/mm/Kconfig
@@ -516,6 +516,16 @@ config HYPERHOLD_ZSWAPD
 	  and the refault of anonymous pages is high, the content of zram will
 	  exchanged to eswap by a certain percentage.
 
+config RECLAIM_ACCT
+	bool "Memory reclaim delay accounting"
+	default n
+	help
+	  Account the time spent in each stage of memory reclaim (direct
+	  reclaim, drain_all_pages, file/anon list shrinking and slab
+	  shrinking) and report it through /proc/reclaimacct and
+	  /proc/reclaim_efficiency. This option is built-in only; accounting
+	  starts once the "disable" parameter of reclaim_acct is set to 0.
+
 config SPARSEMEM
 	def_bool y
 	depends on (!SELECT_MEMORY_MODEL && ARCH_SPARSEMEM_ENABLE) || SPARSEMEM_MANUAL
diff --git a/mm/Makefile b/mm/Makefile
index f84d4b0f521de445aa5a56324ee640e6448971ce..94ea4fb431637fdb8c465637ddc4211279e1102d 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -141,6 +141,7 @@ obj-$(CONFIG_SHRINKER_DEBUG) += shrinker_debug.o
 obj-$(CONFIG_HYPERHOLD_FILE_LRU) += memcg_reclaim.o
 obj-$(CONFIG_HYPERHOLD_MEMCG) += memcg_control.o
 obj-$(CONFIG_HYPERHOLD_ZSWAPD) += zswapd.o zswapd_control.o
+obj-$(CONFIG_RECLAIM_ACCT) += reclaim_acct.o reclaimacct_show.o
 obj-$(CONFIG_MEM_PURGEABLE) += purgeable.o
 obj-$(CONFIG_PURGEABLE_ASHMEM) += purgeable_ashmem_trigger.o
 obj-$(CONFIG_MEMORY_MONITOR) += memory_monitor.o
diff --git a/mm/internal.h b/mm/internal.h
index 848d33206a723b8647d874f201b60c44e898790c..b72d4137f06ac2eb6126ffd7cadb2056edfe7bc7 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -1106,6 +1106,53 @@ struct migration_target_control {
 	gfp_t gfp_mask;
 };
 
+/*
+ * Reclaim delay buckets in nanoseconds. The ULL suffix makes every bound an
+ * explicit 64-bit constant: DELAY_LV5 does not fit in 32 bits and the values
+ * are compared against u64 delays.
+ */
+#define DELAY_LV0	5000000ULL	/* 5ms */
+#define DELAY_LV1	10000000ULL	/* 10ms */
+#define DELAY_LV2	50000000ULL	/* 50ms */
+#define DELAY_LV3	100000000ULL	/* 100ms */
+#define DELAY_LV4	2000000000ULL	/* 2000ms */
+#define DELAY_LV5	50000000000ULL	/* 50000ms */
+#define NR_DELAY_LV	6
+
+/*
+ * Accounting for one reclaim pass, indexed by stub:
+ * start - ktime_get_ns() timestamp taken when the stub was entered
+ * delay - accumulated nanoseconds spent in the stub
+ * count - number of completed measurements of the stub
+ * freed - amount reported as freed by the stub
+ */
+struct reclaim_acct {
+	u64 start[NR_RA_STUBS];
+	u64 delay[NR_RA_STUBS];
+	u64 count[NR_RA_STUBS];
+	u64 freed[NR_RA_STUBS];
+	unsigned int reclaim_type;
+};
+
+/*
+ * Printable stub names, indexed by stub. The table is fully const so that a
+ * file which includes this header without using it does not get an
+ * unused-variable warning for its private copy.
+ */
+static const char * const stub_name[NR_RA_STUBS] = {
+	"direct_reclaim",
+	"drain_all_pages",
+	"shrink_file_list",
+	"shrink_anon_list",
+	"shrink_slab",
+};
+
+bool reclaimacct_initialize_show_data(void);
+void reclaimacct_destroy_show_data(void);
+
+void reclaimacct_collect_data(void);
+void reclaimacct_collect_reclaim_efficiency(void);
+
 /*
  * mm/filemap.c
  */
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 256ef56f8c927ea9e03869fee7093da7194e2a04..09d01f6157bce0722d3c4b4990ed856518a93854 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -3714,7 +3714,13 @@ __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
 	 */
 	if (!page && !drained) {
 		unreserve_highatomic_pageblock(ac, false);
+#ifdef CONFIG_RECLAIM_ACCT
+		reclaimacct_substage_start(RA_DRAINALLPAGES);
+#endif
 		drain_all_pages(NULL);
+#ifdef CONFIG_RECLAIM_ACCT
+		reclaimacct_substage_end(RA_DRAINALLPAGES, 0, NULL);
+#endif
 		drained = true;
 		goto retry;
 	}
@@ -3966,6 +3972,9 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
 	unsigned int cpuset_mems_cookie;
 	unsigned int zonelist_iter_cookie;
 	int reserve_flags;
+#ifdef CONFIG_RECLAIM_ACCT
+	struct reclaim_acct ra = {0};
+#endif
 
 restart:
 	compaction_retries = 0;
@@ -4106,8 +4115,14 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
 		goto nopage;
 
 	/* Try direct reclaim and then allocating */
+#ifdef CONFIG_RECLAIM_ACCT
+	reclaimacct_start(DIRECT_RECLAIMS, &ra);
+#endif
 	page = __alloc_pages_direct_reclaim(gfp_mask, order, alloc_flags, ac,
 							&did_some_progress);
+#ifdef CONFIG_RECLAIM_ACCT
+	reclaimacct_end(DIRECT_RECLAIMS);
+#endif
 	if (page)
 		goto got_pg;
 
diff --git a/mm/reclaim_acct.c b/mm/reclaim_acct.c
new file mode 100644
index 0000000000000000000000000000000000000000..5f1a06fc83378402f5021e2e3d4671e3120cc2cb
--- /dev/null
+++ b/mm/reclaim_acct.c
@@ -0,0 +1,156 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * mm/reclaim_acct.c
+ *
+ * Copyright (c) 2022 Huawei Technologies Co., Ltd.
+ */
+
+#include
+#include
+
+#include "internal.h"
+
+/*
+ * reclaimacct_is_off stays true until reclaimacct_init_handle() has set up
+ * the show buffers; once initialized, the variable should never be changed.
+ * reclaimacct_disable is the runtime switch behind the "disable" parameter:
+ * reclaimacct_start() does nothing while it is non-zero.
+ */
+static bool reclaimacct_is_off = true;
+static int reclaimacct_disable = 1;
+
+/* Zero @ra so that the caller's buffer can be reused; @type is unused. */
+static void reclaimacct_free(struct reclaim_acct *ra, enum reclaim_type type)
+{
+	memset(ra, 0, sizeof(struct reclaim_acct));
+}
+
+/*
+ * Finish the measurement opened for @stub on @ra.
+ *
+ * The elapsed time, one pass and @freed are added to @ra unless the sample
+ * is DELAY_LV5 or longer and is_system_reclaim() rejects the reclaim type.
+ * Samples between DELAY_LV4 and DELAY_LV5 are reported with a rate-limited
+ * warning. @shrinker may be NULL.
+ */
+static void __reclaimacct_end(struct reclaim_acct *ra, u64 freed,
+	enum reclaimacct_stubs stub, const struct shrinker *shrinker)
+{
+	u64 now, delay, start;
+
+	start = ra->start[stub];
+	now = ktime_get_ns();
+	if (now < start)
+		return;
+
+	delay = now - start;
+	if (delay < DELAY_LV5 || is_system_reclaim(ra->reclaim_type)) {
+		ra->delay[stub] += delay;
+		ra->count[stub]++;
+		ra->freed[stub] += freed;
+	}
+
+	if (delay > DELAY_LV4 && delay < DELAY_LV5) {
+		pr_warn_ratelimited("%s timeout:%llu\n", stub_name[stub], delay);
+		/*
+		 * A symbol specifier needs a code address, so print the scan
+		 * callback rather than the shrinker object itself.
+		 */
+		if (shrinker)
+			pr_warn_ratelimited("shrinker = %pS\n", shrinker->scan_objects);
+	}
+}
+
+/* Reinitialize in case parent's non-null pointer was duped */
+void reclaimacct_tsk_init(struct task_struct *tsk)
+{
+	if (tsk)
+		tsk->reclaim_acct = NULL;
+}
+
+/* Boot-time hook: make sure init_task starts without an accounting buffer. */
+void reclaimacct_init(void)
+{
+	reclaimacct_tsk_init(&init_task);
+}
+
+/* Record the entry time of @stub if the current task is being accounted. */
+void reclaimacct_substage_start(enum reclaimacct_stubs stub)
+{
+	if (!current->reclaim_acct)
+		return;
+
+	current->reclaim_acct->start[stub] = ktime_get_ns();
+}
+
+/*
+ * Close the measurement of @stub for the current task, crediting @freed to
+ * it. @shrinker is only used for the slow-path warning and may be NULL.
+ */
+void reclaimacct_substage_end(enum reclaimacct_stubs stub, unsigned long freed,
+				const struct shrinker *shrinker)
+{
+	if (!current->reclaim_acct)
+		return;
+
+	__reclaimacct_end(current->reclaim_acct, freed, stub, shrinker);
+}
+
+/*
+ * End of a direct reclaim pass: report slow passes, fold the result into the
+ * show data, then reset @ra and detach it from the current task.
+ */
+static void reclaimacct_directreclaim_end(struct reclaim_acct *ra)
+{
+	/*
+	 * One rate limit for the whole summary; the pieces are emitted with
+	 * pr_cont() so that they form a single log line.
+	 */
+	static DEFINE_RATELIMIT_STATE(summary_rs, DEFAULT_RATELIMIT_INTERVAL,
+				      DEFAULT_RATELIMIT_BURST);
+	int i;
+
+	if (ra->delay[RA_RECLAIM] > DELAY_LV4 && __ratelimit(&summary_rs)) {
+		pr_warn("Summary");
+		for (i = 0; i < NR_RA_STUBS; i++)
+			pr_cont(" %s=%llu %llu", stub_name[i],
+				ra->delay[i], ra->count[i]);
+		pr_cont("\n");
+	}
+
+	reclaimacct_collect_data();
+	reclaimacct_free(ra, ra->reclaim_type);
+	current->reclaim_acct = NULL;
+}
+
+/*
+ * End of a system reclaim pass: reset @ra and detach it from the current
+ * task. Leaving the pointer attached would let a later reclaimacct_end()
+ * run without a matching reclaimacct_start() (e.g. after accounting has been
+ * disabled) and measure from a zeroed start time.
+ */
+static void reclaimacct_system_reclaim_end(struct reclaim_acct *ra)
+{
+	reclaimacct_free(ra, ra->reclaim_type);
+	current->reclaim_acct = NULL;
+}
+
+/*
+ * Begin accounting a reclaim pass of @type for the current task.
+ *
+ * @ra is attached to the task unless a buffer is already attached; it must
+ * stay valid until the matching reclaimacct_end(). Does nothing while
+ * accounting is disabled or not yet initialized.
+ */
+void reclaimacct_start(enum reclaim_type type, struct reclaim_acct *ra)
+{
+	if (reclaimacct_disable || reclaimacct_is_off)
+		return;
+
+	if (!current->reclaim_acct)
+		current->reclaim_acct = ra;
+
+	current->reclaim_acct->reclaim_type = type;
+	current->reclaim_acct->start[RA_RECLAIM] = ktime_get_ns();
+}
+
+/*
+ * Finish the reclaim pass started by reclaimacct_start(): close the
+ * RA_RECLAIM measurement, update the efficiency totals and run the
+ * type-specific teardown selected by @type.
+ */
+void reclaimacct_end(enum reclaim_type type)
+{
+	if (!current->reclaim_acct)
+		return;
+
+	__reclaimacct_end(current->reclaim_acct, 0, RA_RECLAIM, NULL);
+
+	reclaimacct_collect_reclaim_efficiency();
+
+	if (is_system_reclaim(type))
+		reclaimacct_system_reclaim_end(current->reclaim_acct);
+	else
+		reclaimacct_directreclaim_end(current->reclaim_acct);
+}
+
+/*
+ * Reclaim accounting module initialize: allocate the show data and, on
+ * success, allow reclaimacct_start() to begin accounting. Always returns 0.
+ */
+static int reclaimacct_init_handle(void *p)
+{
+	if (!reclaimacct_initialize_show_data())
+		goto alloc_show_failed;
+
+	reclaimacct_is_off = false;
+	pr_info("enabled\n");
+	return 0;
+
+alloc_show_failed:
+	reclaimacct_is_off = true;
+	pr_err("disabled\n");
+	return 0;
+}
+
+/* Run reclaimacct_init_handle() in a kernel thread at late_initcall time. */
+static int __init reclaimacct_module_init(void)
+{
+	struct task_struct *task = NULL;
+
+	task = kthread_run(reclaimacct_init_handle, NULL, "reclaimacct_init");
+	if (IS_ERR(task))
+		pr_err("run reclaimacct_init failed\n");
+	else
+		pr_info("run reclaimacct_init successfully\n");
+	return 0;
+}
+
+late_initcall(reclaimacct_module_init);
+
+module_param_named(disable, reclaimacct_disable, int, 0644);
diff --git a/mm/reclaimacct_show.c b/mm/reclaimacct_show.c
new file mode 100644
index 0000000000000000000000000000000000000000..5b5a621c277173e12746f3e5a2b72707ce9608dc
--- /dev/null
+++ b/mm/reclaimacct_show.c
@@ -0,0 +1,219 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * mm/reclaimacct_show.c
+ *
+ * Copyright (c) 2022 Huawei Technologies Co., Ltd.
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include "internal.h"
+
+/*
+ * Store reclaim accounting data: per delay bucket and stub, the summed delay
+ * and pass count, plus the longest RA_RECLAIM delay seen and the
+ * sched_clock() value at which it was recorded. Protected by ra_show_lock.
+ */
+static struct reclaimacct_show {
+	u64 delay[NR_DELAY_LV][NR_RA_STUBS];
+	u64 count[NR_DELAY_LV][NR_RA_STUBS];
+	u64 max_delay;
+	u64 max_delay_time;
+} *ra_show;
+static DEFINE_SPINLOCK(ra_show_lock);
+
+/*
+ * Per reclaim type totals of time spent and amount freed for each stub;
+ * ra_eff points to RECLAIM_TYPES entries. Protected by ra_eff_lock.
+ */
+static struct reclaim_efficiency {
+	u64 time[NR_RA_STUBS];
+	u64 freed[NR_RA_STUBS];
+} *ra_eff;
+static DEFINE_SPINLOCK(ra_eff_lock);
+
+/*
+ * Allocate the zeroed show buffers.
+ *
+ * Returns true when both buffers exist; on failure nothing stays allocated
+ * and false is returned.
+ */
+bool reclaimacct_initialize_show_data(void)
+{
+	ra_show = kzalloc(sizeof(*ra_show), GFP_KERNEL);
+	if (!ra_show)
+		goto fail_show;
+
+	/* kcalloc() checks the count * size multiplication for overflow */
+	ra_eff = kcalloc(RECLAIM_TYPES, sizeof(*ra_eff), GFP_KERNEL);
+	if (!ra_eff)
+		goto fail_eff;
+	return true;
+
+fail_eff:
+	kfree(ra_show);
+	ra_show = NULL;
+
+fail_show:
+	return false;
+}
+
+/* Free both show buffers and clear the pointers. */
+void reclaimacct_destroy_show_data(void)
+{
+	kfree(ra_show);
+	ra_show = NULL;
+
+	kfree(ra_eff);
+	ra_eff = NULL;
+}
+
+/*
+ * Add the delays and counts of @ra to delay bucket @level and update the
+ * recorded maximum RA_RECLAIM delay.
+ */
+static void __reclaimacct_collect_data(int level, struct reclaim_acct *ra)
+{
+	int i;
+
+	spin_lock(&ra_show_lock);
+	for (i = 0; i < NR_RA_STUBS; i++) {
+		ra_show->delay[level][i] += ra->delay[i];
+		ra_show->count[level][i] += ra->count[i];
+	}
+
+	if (ra->delay[RA_RECLAIM] > ra_show->max_delay) {
+		ra_show->max_delay = ra->delay[RA_RECLAIM];
+		ra_show->max_delay_time = sched_clock();
+	}
+	spin_unlock(&ra_show_lock);
+}
+
+/*
+ * Sort the current task's finished pass into the first bucket whose upper
+ * bound exceeds its RA_RECLAIM delay. A pass of DELAY_LV5 or longer matches
+ * no bucket and is not recorded.
+ */
+void reclaimacct_collect_data(void)
+{
+	int i;
+	static const u64 delay[NR_DELAY_LV] = {
+		DELAY_LV0, DELAY_LV1, DELAY_LV2, DELAY_LV3, DELAY_LV4, DELAY_LV5
+	};
+
+	if (!ra_show || !current->reclaim_acct)
+		return;
+
+	for (i = 0; i < NR_DELAY_LV; i++) {
+		if (current->reclaim_acct->delay[RA_RECLAIM] < delay[i]) {
+			__reclaimacct_collect_data(i, current->reclaim_acct);
+			break;
+		}
+	}
+}
+
+/*
+ * seq_file show callback for /proc/reclaimacct: print, for every stub, the
+ * summed delay (in ms) and the pass count of each delay bucket. The data is
+ * copied under ra_show_lock and formatted from the copy.
+ */
+static int reclaimacct_proc_show(struct seq_file *m, void *v)
+{
+	int i, j;
+	struct reclaimacct_show show;
+
+	if (!ra_show)
+		return 0;
+
+	spin_lock(&ra_show_lock);
+	memcpy(&show, ra_show, sizeof(struct reclaimacct_show));
+	spin_unlock(&ra_show_lock);
+
+	seq_puts(m, "watch_point(unit:ms/-)\t\t0-5ms\t\t5-10ms\t\t");
+	seq_puts(m, "10-50ms\t\t50-100ms\t100-2000ms\t2000-50000ms\n");
+	for (i = 0; i < NR_RA_STUBS; i++) {
+		seq_printf(m, "%s_delay\t\t", stub_name[i]);
+		for (j = 0; j < NR_DELAY_LV; j++)
+			seq_printf(m, "%-15llu ", show.delay[j][i] / NSEC_PER_MSEC);
+		seq_puts(m, "\n");
+
+		seq_printf(m, "%s_count\t\t", stub_name[i]);
+		for (j = 0; j < NR_DELAY_LV; j++)
+			seq_printf(m, "%-15llu ", show.count[j][i]);
+		seq_puts(m, "\n");
+	}
+	/* Both values are printed as stored, without conversion to ms */
+	seq_printf(m, "Max delay: %llu\tHappened: %llu\n", show.max_delay, show.max_delay_time);
+
+	return 0;
+}
+
+static int reclaimacct_proc_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, reclaimacct_proc_show, NULL);
+}
+
+static const struct proc_ops reclaimacct_proc_fops = {
+	.proc_open = reclaimacct_proc_open,
+	.proc_read = seq_read,
+	.proc_lseek = seq_lseek,
+	.proc_release = single_release,
+};
+
+/*
+ * Add the per-stub delay and freed totals of @ra to the efficiency entry of
+ * @type. The RA_RECLAIM freed total is first set to the sum of the file and
+ * anon list results.
+ */
+static void __reclaimacct_collect_reclaim_efficiency(
+	struct reclaim_acct *ra, enum reclaim_type type)
+{
+	int i;
+
+	ra->freed[RA_RECLAIM] = ra->freed[RA_SHRINKFILE] + ra->freed[RA_SHRINKANON];
+
+	/*
+	 * Take the lock for every reclaim type: the proc reader snapshots all
+	 * entries under ra_eff_lock, and kswapd() runs once per pgdat, so
+	 * system reclaim updates are not guaranteed to be single-threaded.
+	 */
+	spin_lock(&ra_eff_lock);
+	for (i = 0; i < NR_RA_STUBS; i++) {
+		ra_eff[type].time[i] += ra->delay[i];
+		ra_eff[type].freed[i] += ra->freed[i];
+	}
+	spin_unlock(&ra_eff_lock);
+}
+
+/* Fold the current task's accounting buffer into the efficiency totals. */
+void reclaimacct_collect_reclaim_efficiency(void)
+{
+	if (!ra_eff || !current->reclaim_acct)
+		return;
+
+	__reclaimacct_collect_reclaim_efficiency(current->reclaim_acct,
+		current->reclaim_acct->reclaim_type);
+}
+
+/*
+ * seq_file show callback for /proc/reclaim_efficiency: print, per reclaim
+ * type and stub, the total time (in ms) and the total amount freed. The data
+ * is copied under ra_eff_lock and formatted from the copy.
+ */
+static int reclaim_efficiency_proc_show(struct seq_file *m, void *v)
+{
+	int i, j;
+	struct reclaim_efficiency eff[RECLAIM_TYPES];
+	const char *stage[NR_RA_STUBS] = {
+		"total_process",
+		"drain_pages  ",
+		"shrink_file  ",
+		"shrink_anon  ",
+		"shrink_slab  "
+	};
+	const char *type[RECLAIM_TYPES] = {
+		"direct reclaim",
+		"kswapd        ",
+		"zswapd        "
+	};
+
+	if (!ra_eff)
+		return 0;
+
+	spin_lock(&ra_eff_lock);
+	memcpy(&eff, ra_eff, sizeof(eff));
+	spin_unlock(&ra_eff_lock);
+
+	for (i = 0; i < RECLAIM_TYPES; i++) {
+		seq_printf(m, "%s  time(ms)        freed(page/obj)\n", type[i]);
+		for (j = 0; j < NR_RA_STUBS; j++)
+			seq_printf(m, "%s  %-15llu %-15llu\n", stage[j],
+				eff[i].time[j] / NSEC_PER_MSEC,
+				eff[i].freed[j]);
+	}
+
+	return 0;
+}
+
+static int reclaim_efficiency_proc_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, reclaim_efficiency_proc_show, NULL);
+}
+
+static const struct proc_ops reclaim_effi_proc_fops = {
+	.proc_open = reclaim_efficiency_proc_open,
+	.proc_read = seq_read,
+	.proc_lseek = seq_lseek,
+	.proc_release = single_release,
+};
+
+/*
+ * Create the two proc entries. A failure is logged but does not fail the
+ * initcall: accounting itself works without the files.
+ */
+static int __init proc_reclaimacct_init(void)
+{
+	if (!proc_create("reclaimacct", 0440, NULL, &reclaimacct_proc_fops))
+		pr_warn("reclaimacct: failed to create /proc/reclaimacct\n");
+	if (!proc_create("reclaim_efficiency", 0440, NULL, &reclaim_effi_proc_fops))
+		pr_warn("reclaimacct: failed to create /proc/reclaim_efficiency\n");
+	return 0;
+}
+fs_initcall(proc_reclaimacct_init);
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 94189499081f7f6bade9e266fb74e55976cf2667..fac9b9e9177c74a45eff6dda3fef567eb8a9e659 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -992,10 +992,16 @@ unsigned long shrink_slab(gfp_t gfp_mask, int nid,
 			.memcg = memcg,
 		};
 
+#ifdef CONFIG_RECLAIM_ACCT
+		reclaimacct_substage_start(RA_SHRINKSLAB);
+#endif
 		ret = do_shrink_slab(&sc, shrinker, priority);
 		if (ret == SHRINK_EMPTY)
 			ret = 0;
 		freed += ret;
+#ifdef CONFIG_RECLAIM_ACCT
+		reclaimacct_substage_end(RA_SHRINKSLAB, ret, shrinker);
+#endif
 		/*
 		 * Bail out if someone want to register a new shrinker to
 		 * prevent the registration from being stalled for long periods
@@ -2797,15 +2803,31 @@ unsigned long reclaim_pages(struct list_head *folio_list)
 unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan,
 				 struct lruvec *lruvec, struct scan_control *sc)
 {
+#ifdef CONFIG_RECLAIM_ACCT
+	unsigned long nr_reclaimed;
+	unsigned int stub;
+
+	stub = is_file_lru(lru) ? RA_SHRINKFILE : RA_SHRINKANON;
+	reclaimacct_substage_start(stub);
+#endif
 	if (is_active_lru(lru)) {
 		if (sc->may_deactivate & (1 << is_file_lru(lru)))
 			shrink_active_list(nr_to_scan, lruvec, sc, lru);
 		else
 			sc->skipped_deactivate = 1;
+#ifdef CONFIG_RECLAIM_ACCT
+		reclaimacct_substage_end(stub, 0, NULL);
+#endif
 		return 0;
 	}
 
+#ifdef CONFIG_RECLAIM_ACCT
+	nr_reclaimed = shrink_inactive_list(nr_to_scan, lruvec, sc, lru);
+	reclaimacct_substage_end(stub, nr_reclaimed, NULL);
+	return nr_reclaimed;
+#else
 	return shrink_inactive_list(nr_to_scan, lruvec, sc, lru);
+#endif
 }
 
 /*
@@ -7735,6 +7757,9 @@ static int kswapd(void *p)
 	pg_data_t *pgdat = (pg_data_t *)p;
 	struct task_struct *tsk = current;
 	const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);
+#ifdef CONFIG_RECLAIM_ACCT
+	struct reclaim_acct ra = {0};
+#endif
 
 	if (!cpumask_empty(cpumask))
 		set_cpus_allowed_ptr(tsk, cpumask);
@@ -7798,9 +7823,15 @@ static int kswapd(void *p)
 						alloc_order);
 #ifdef CONFIG_MEMORY_MONITOR
 		kswapd_monitor_wake_up_queue();
+#endif
+#ifdef CONFIG_RECLAIM_ACCT
+		reclaimacct_start(KSWAPD_RECLAIM, &ra);
 #endif
 		reclaim_order = balance_pgdat(pgdat, alloc_order,
 						highest_zoneidx);
+#ifdef CONFIG_RECLAIM_ACCT
+		reclaimacct_end(KSWAPD_RECLAIM);
+#endif
 		if (reclaim_order < alloc_order)
 			goto kswapd_try_sleep;
 	}