From c4069ab39e78c5964f8163c80eacfe8b380bbcaa Mon Sep 17 00:00:00 2001 From: Haifeng Xu Date: Thu, 4 Jan 2024 07:33:09 +0000 Subject: [PATCH 01/10] sched/psi: Bail out early from irq time accounting mainline inclusion from mainline-v6.7-rc1 commit 0c2924079f5a83ed715630680e338b3685a0bf7d category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I8QUNW Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=0c2924079f5a83ed715630680e338b3685a0bf7d -------------------------------- We could bail out early when psi was disabled. Signed-off-by: Haifeng Xu Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Chengming Zhou Link: https://lore.kernel.org/r/20230926115722.467833-1-haifeng.xu@shopee.com Signed-off-by: Lu Jialin --- kernel/sched/psi.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index 1d0f634725a6..e5dd5656775e 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -1009,6 +1009,9 @@ void psi_account_irqtime(struct task_struct *task, u32 delta) struct psi_group_cpu *groupc; u64 now; + if (static_branch_likely(&psi_disabled)) + return; + if (!task->pid) return; -- Gitee From 2dfc29e742a07863c2426aea03d5c2c97b558b9d Mon Sep 17 00:00:00 2001 From: Lu Jialin Date: Thu, 4 Jan 2024 07:33:10 +0000 Subject: [PATCH 02/10] sched/psi: Export cgroup psi from cgroupv2 to cgroupv1 hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I8QUNW ---------------------------------------------- Export cgroup psi(cpu/memory/io/irq) from cgroup v2 to cgroupv1. We attach cgroupv1 psi with cpuacct subsystem, referring to (0c58735850e5: "ck: psi: Support PSI under cgroup v1")[1]. To make cgroupv1 psi more meaningful and accurate, processes should have the same management in cpu/io/memory/cpuacct subsystem, which is similar with the process management in cgroup v2. 
[1] https://github.com/alibaba/cloud-kernel/commit/0c58735850e5ae4d98680a815eec8ef0ff0d5bae Signed-off-by: Lu Jialin --- include/linux/psi.h | 3 +++ init/Kconfig | 10 ++++++++++ kernel/cgroup/cgroup.c | 39 +++++++++++++++++++++++++++++++++++++++ kernel/sched/cpuacct.c | 25 +++++++++++++++++++++++++ kernel/sched/psi.c | 27 ++++++++++++++++++++++++++- 5 files changed, 103 insertions(+), 1 deletion(-) diff --git a/include/linux/psi.h b/include/linux/psi.h index e0745873e3f2..9e5d49cf62d5 100644 --- a/include/linux/psi.h +++ b/include/linux/psi.h @@ -15,6 +15,9 @@ struct css_set; #ifdef CONFIG_PSI extern struct static_key_false psi_disabled; +#ifdef CONFIG_PSI_CGROUP_V1 +extern struct static_key_true psi_v1_disabled; +#endif extern struct psi_group psi_system; void psi_init(void); diff --git a/init/Kconfig b/init/Kconfig index 869eea4108d0..45aea89c163e 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -667,6 +667,16 @@ config PSI_DEFAULT_DISABLED Say N if unsure. +config PSI_CGROUP_V1 + bool "Support PSI under cgroup v1" + default n + depends on PSI + help + If set, pressure stall information tracking will be used + for cgroup v1 other than v2. + + Say N if unsure. 
+ endmenu # "CPU/Task time and stats accounting" config CPU_ISOLATION diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index cfae217e6e7f..24a657d130c1 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -3919,6 +3919,45 @@ bool cgroup_psi_enabled(void) return (cgroup_feature_disable_mask & (1 << OPT_FEATURE_PRESSURE)) == 0; } +#ifdef CONFIG_PSI_CGROUP_V1 +struct cftype cgroup_v1_psi_files[] = { + { + .name = "io.pressure", + .flags = CFTYPE_NO_PREFIX, + .seq_show = cgroup_io_pressure_show, + .write = cgroup_io_pressure_write, + .poll = cgroup_pressure_poll, + .release = cgroup_pressure_release, + }, + { + .name = "memory.pressure", + .flags = CFTYPE_NO_PREFIX, + .seq_show = cgroup_memory_pressure_show, + .write = cgroup_memory_pressure_write, + .poll = cgroup_pressure_poll, + .release = cgroup_pressure_release, + }, + { + .name = "cpu.pressure", + .flags = CFTYPE_NO_PREFIX, + .seq_show = cgroup_cpu_pressure_show, + .write = cgroup_cpu_pressure_write, + .poll = cgroup_pressure_poll, + .release = cgroup_pressure_release, + }, +#ifdef CONFIG_IRQ_TIME_ACCOUNTING + { + .name = "irq.pressure", + .flags = CFTYPE_NO_PREFIX, + .seq_show = cgroup_irq_pressure_show, + .write = cgroup_irq_pressure_write, + .poll = cgroup_pressure_poll, + .release = cgroup_pressure_release, + }, +#endif + { } /* terminate */ +}; +#endif #else /* CONFIG_PSI */ bool cgroup_psi_enabled(void) { diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c index 0de9dda09949..2a1e44822bc1 100644 --- a/kernel/sched/cpuacct.c +++ b/kernel/sched/cpuacct.c @@ -361,3 +361,28 @@ struct cgroup_subsys cpuacct_cgrp_subsys = { .legacy_cftypes = files, .early_init = true, }; + +#ifdef CONFIG_PSI_CGROUP_V1 +extern struct cftype cgroup_v1_psi_files[]; + +static int __init setup_psi_v1(char *str) +{ + if (!strcmp(str, "1")) + static_branch_disable(&psi_v1_disabled); + + return 1; +} +__setup("psi_v1=", setup_psi_v1); + +static int __init cgroup_v1_psi_init(void) +{ + if 
(static_branch_likely(&psi_v1_disabled)) + return 0; + + cgroup_add_legacy_cftypes(&cpuacct_cgrp_subsys, cgroup_v1_psi_files); + + return 0; +} + +late_initcall_sync(cgroup_v1_psi_init); +#endif diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index e5dd5656775e..eb673c846a61 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -140,6 +140,7 @@ static int psi_bug __read_mostly; DEFINE_STATIC_KEY_FALSE(psi_disabled); +DEFINE_STATIC_KEY_TRUE(psi_v1_disabled); static DEFINE_STATIC_KEY_TRUE(psi_cgroups_enabled); #ifdef CONFIG_PSI_DEFAULT_DISABLED @@ -881,11 +882,35 @@ static void psi_group_change(struct psi_group *group, int cpu, schedule_delayed_work(&group->avgs_work, PSI_FREQ); } +#if defined(CONFIG_CGROUP_CPUACCT) && defined(CONFIG_PSI_CGROUP_V1) +static bool task_is_in_psi_v1(void) +{ + if (static_branch_likely(&psi_v1_disabled)) + return false; + + return !cgroup_subsys_on_dfl(cpuacct_cgrp_subsys); +} +#else +static bool task_is_in_psi_v1(void) +{ + return false; +} +#endif + static inline struct psi_group *task_psi_group(struct task_struct *task) { #ifdef CONFIG_CGROUPS - if (static_branch_likely(&psi_cgroups_enabled)) + if (static_branch_likely(&psi_cgroups_enabled)) { + if (task_is_in_psi_v1()) { + struct cgroup *cgroup; + + rcu_read_lock(); + cgroup = task_cgroup(task, cpuacct_cgrp_id); + rcu_read_unlock(); + return cgroup_psi(cgroup); + } return cgroup_psi(task_dfl_cgroup(task)); + } #endif return &psi_system; } -- Gitee From 5516338c2c0bd5a651e626ad226a52652af7a58c Mon Sep 17 00:00:00 2001 From: Lu Jialin Date: Tue, 2 Jan 2024 01:56:19 +0000 Subject: [PATCH 03/10] sched/psi: update psi irqtime when the irq delta is nonzero hulk inclusion category: performance bugzilla: https://gitee.com/openeuler/kernel/issues/I8QUNW ------------------------------- If psi irqtime is updated whether or not the irq delta is zero, performance will be degraded when update_rq_clock_task runs frequently.
Therefore, just update psi irqtime when the irq delta is nonzero. Performance test of times: 1) without psi_account_irqtime in update_rq_clock_task [root@arm64_perf bin]# ./times -E -C 200 -L -S -W -N "times" -I 200 Running: times# ./../bin-arm64/times -E -C 200 -L -S -W -N times -I 200 prc thr usecs/call samples errors cnt/samp times 1 1 0.45210 188 0 500 2) psi_account_irqtime in update_rq_clock_task [root@arm64_perf bin]# ./times -E -C 200 -L -S -W -N "times" -I 200 Running: times# ./../bin-arm64/times -E -C 200 -L -S -W -N times -I 200 prc thr usecs/call samples errors cnt/samp times 1 1 0.49408 196 0 500 3) psi_account_irqtime in update_rq_clock_task when irq delta is nonzero [root@arm64_perf bin]# ./times -E -C 200 -L -S -W -N "times" -I 200 Running: times# ./../bin-arm64/times -E -C 200 -L -S -W -N times -I 200 prc thr usecs/call samples errors cnt/samp times 1 1 0.45158 195 0 500 Signed-off-by: Lu Jialin --- kernel/sched/core.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 7a0997e7e136..b5b5dd1ebd6d 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -722,7 +722,8 @@ static void update_rq_clock_task(struct rq *rq, s64 delta) rq->prev_irq_time += irq_delta; delta -= irq_delta; - psi_account_irqtime(rq->curr, irq_delta); + if (irq_delta) + psi_account_irqtime(rq->curr, irq_delta); delayacct_irq(rq->curr, irq_delta); #endif #ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING -- Gitee From 3dfdd2f9d0dbe73872effabbdd5671a2b9f2b74c Mon Sep 17 00:00:00 2001 From: Lu Jialin Date: Tue, 2 Jan 2024 01:56:21 +0000 Subject: [PATCH 04/10] sched/psi: introduce tracepoints for psi_memstall_{enter, leave} hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I8QUNW -------------------------------- Two tracepoints are added so we can easily use other tools such as ebpf, ftrace, perf to monitor the memstall data and do some analysis.
The output of these tracepoints is, kcompactd0-58 [001] .... 902.709565: psi_memstall_enter: kcompactd kswapd0-132 [003] .... 902.709569: psi_memstall_leave: balance_pgdat kcompactd0-58 [001] .... 902.775230: psi_memstall_leave: kcompactd kswapd0-132 [003] .... 1337.754598: psi_memstall_enter: balance_pgdat kswapd0-132 [003] .... 1337.756076: psi_memstall_leave: balance_pgdat kcompactd0-58 [003] .... 1337.756213: psi_memstall_enter: kcompactd kcompactd0-58 [003] .... 1337.893188: psi_memstall_leave: kcompactd Signed-off-by: Lu Jialin --- include/trace/events/sched.h | 27 +++++++++++++++++++++++++++ kernel/sched/psi.c | 6 ++++++ 2 files changed, 33 insertions(+) diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h index 4bafb70dfafc..ee7a37e72257 100644 --- a/include/trace/events/sched.h +++ b/include/trace/events/sched.h @@ -790,6 +790,33 @@ DECLARE_TRACE(sched_update_nr_running_tp, TP_PROTO(struct rq *rq, int change), TP_ARGS(rq, change)); +DECLARE_EVENT_CLASS(psi_memstall_template, + + TP_PROTO(unsigned long function), + + TP_ARGS(function), + + TP_STRUCT__entry( + __field(unsigned long, function) + ), + + TP_fast_assign( + __entry->function = function; + ), + + TP_printk("%ps", (void *)__entry->function) +); + +DEFINE_EVENT(psi_memstall_template, psi_memstall_enter, + TP_PROTO(unsigned long function), + TP_ARGS(function) +); + +DEFINE_EVENT(psi_memstall_template, psi_memstall_leave, + TP_PROTO(unsigned long function), + TP_ARGS(function) +); + #endif /* _TRACE_SCHED_H */ /* This part must be outside protection */ diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index eb673c846a61..005813dbf45d 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -137,6 +137,8 @@ * sampling of the aggregate task states would be. 
*/ +#include <trace/events/sched.h> + static int psi_bug __read_mostly; DEFINE_STATIC_KEY_FALSE(psi_disabled); @@ -1080,6 +1082,8 @@ void psi_memstall_enter(unsigned long *flags) *flags = current->in_memstall; if (*flags) return; + + trace_psi_memstall_enter(_RET_IP_); /* * in_memstall setting & accounting needs to be atomic wrt * changes to the task's scheduling state, otherwise we can @@ -1110,6 +1114,8 @@ void psi_memstall_leave(unsigned long *flags) if (*flags) return; + + trace_psi_memstall_leave(_RET_IP_); /* * in_memstall clearing & accounting needs to be atomic wrt * changes to the task's scheduling state, otherwise we could -- Gitee From a65983d90bfb5e031444fea492b32f931c83ffcf Mon Sep 17 00:00:00 2001 From: Lu Jialin Date: Thu, 4 Jan 2024 07:33:11 +0000 Subject: [PATCH 05/10] sched/psi: Introduce fine grained stall time collect for cgroup reclaim hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I8QUNW ------------------------------- PSI will track pressure stall for memory, cpu, io and irq. But, there are different pressure types which will cause memory pressure, memory.pressure could not show the type of pressure effectively. The same situation for cpu.pressure. Introduce pressure.stat in psi, which will monitor specific reasons for the memory.pressure and cpu.pressure, such as global/cgroup memory reclaim, memory compact, cpu cfs bandwidth and so on. Therefore, userland could make the right solution to reduce the pressure depending on the specific pressure reasons. This patch will introduce memory fine grained stall time collect for cgroup reclaim.
Signed-off-by: Lu Jialin --- block/blk-cgroup.c | 2 +- fs/btrfs/compression.c | 2 +- fs/erofs/zdata.c | 2 +- include/linux/psi_types.h | 38 ++++++++++++ include/linux/sched.h | 3 + init/Kconfig | 10 ++++ kernel/sched/psi.c | 121 +++++++++++++++++++++++++++++++++++++- mm/compaction.c | 2 +- mm/filemap.c | 6 +- mm/memcontrol.c | 4 ++ mm/page_alloc.c | 2 + mm/page_io.c | 1 + mm/readahead.c | 6 +- mm/vmscan.c | 3 +- 14 files changed, 190 insertions(+), 12 deletions(-) diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index a1460948f663..a1f2f316e88d 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -1834,7 +1834,7 @@ static void blkcg_scale_delay(struct blkcg_gq *blkg, u64 now) */ static void blkcg_maybe_throttle_blkg(struct blkcg_gq *blkg, bool use_memdelay) { - unsigned long pflags; + unsigned long pflags = 0; bool clamp; u64 now = ktime_to_ns(ktime_get()); u64 exp; diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 8818ed5c390f..9b5cea00238c 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -475,7 +475,7 @@ void btrfs_submit_compressed_read(struct btrfs_bio *bbio) u64 em_len; u64 em_start; struct extent_map *em; - unsigned long pflags; + unsigned long pflags = 0; int memstall = 0; blk_status_t ret; int ret2; diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index a7e6847f6f8f..7b0cbe37e462 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -1636,7 +1636,7 @@ static void z_erofs_submit_queue(struct z_erofs_decompress_frontend *f, struct block_device *last_bdev; unsigned int nr_bios = 0; struct bio *bio = NULL; - unsigned long pflags; + unsigned long pflags = 0; int memstall = 0; /* diff --git a/include/linux/psi_types.h b/include/linux/psi_types.h index f1fd3a8044e0..35b73f66bf7f 100644 --- a/include/linux/psi_types.h +++ b/include/linux/psi_types.h @@ -81,6 +81,20 @@ enum psi_aggregators { NR_PSI_AGGREGATORS, }; +#ifdef CONFIG_PSI_FINE_GRAINED +enum psi_stat_states { + PSI_MEMCG_RECLAIM_SOME, + 
PSI_MEMCG_RECLAIM_FULL, + NR_PSI_STAT_STATES, +}; + +enum psi_stat_task_count { + NR_MEMCG_RECLAIM, + NR_MEMCG_RECLAIM_RUNNING, + NR_PSI_STAT_TASK_COUNTS, +}; +#endif /* CONFIG_PSI_FINE_GRAINED */ + struct psi_group_cpu { /* 1st cacheline updated by the scheduler */ @@ -104,6 +118,13 @@ struct psi_group_cpu { /* Delta detection against the sampling buckets */ u32 times_prev[NR_PSI_AGGREGATORS][NR_PSI_STATES] ____cacheline_aligned_in_smp; + +#ifdef CONFIG_PSI_FINE_GRAINED + CACHELINE_PADDING(_pad1_); + u32 fine_grained_state_mask; + u32 fine_grained_times[NR_PSI_STAT_STATES]; + unsigned int fine_grained_tasks[NR_PSI_STAT_TASK_COUNTS]; +#endif }; /* PSI growth tracking window */ @@ -215,4 +236,21 @@ struct psi_group { }; #endif /* CONFIG_PSI */ +#ifdef CONFIG_PSI_FINE_GRAINED +/* + * one type should have two task stats: regular running and memstall + * threads. The reason is the same as NR_MEMSTALL_RUNNING. + * Because of the psi_memstall_type is start with 1, the correspondence + * between psi_memstall_type and psi_stat_task_count should be as below: + * + * memstall : psi_memstall_type * 2 - 2; + * running : psi_memstall_type * 2 - 1; + */ +enum psi_memstall_type { + PSI_MEMCG_RECLAIM = 1, +}; +#else +#define PSI_MEMCG_RECLAIM 0 +#endif /* CONFIG_PSI_FINE_GRAINED */ + #endif /* _LINUX_PSI_TYPES_H */ diff --git a/include/linux/sched.h b/include/linux/sched.h index 9fdd08aa9626..5a20ae312121 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1552,6 +1552,9 @@ struct task_struct { const cpumask_t *select_cpus; #endif +#ifdef CONFIG_PSI_FINE_GRAINED + int memstall_type; +#endif /* * New fields for task_struct should be added above here, so that * they are included in the randomized portion of task_struct. diff --git a/init/Kconfig b/init/Kconfig index 45aea89c163e..e34bd3c436e9 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -677,6 +677,16 @@ config PSI_CGROUP_V1 Say N if unsure. 
+config PSI_FINE_GRAINED + bool "Support fine grained psi under cgroup v1 and system" + default n + depends on PSI + help + If set, fine grained pressure stall information tracking will + be used for cgroup v1 and system, such as memory reclaim, + memory compact and so on. + Say N if unsure. + endmenu # "CPU/Task time and stats accounting" config CPU_ISOLATION diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index 005813dbf45d..be896eb94f74 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -336,6 +336,90 @@ static void calc_avgs(unsigned long avg[3], int missed_periods, avg[2] = calc_load(avg[2], EXP_300s, pct); } +#ifdef CONFIG_PSI_FINE_GRAINED + +static void record_stat_times(struct psi_group_cpu *groupc, u32 delta) +{ + if (groupc->fine_grained_state_mask & (1 << PSI_MEMCG_RECLAIM_SOME)) { + groupc->fine_grained_times[PSI_MEMCG_RECLAIM_SOME] += delta; + if (groupc->fine_grained_state_mask & (1 << PSI_MEMCG_RECLAIM_FULL)) + groupc->fine_grained_times[PSI_MEMCG_RECLAIM_FULL] += delta; + } +} + +static bool test_fine_grained_stat(unsigned int *stat_tasks, + unsigned int nr_running, + enum psi_stat_states state) +{ + switch (state) { + case PSI_MEMCG_RECLAIM_SOME: + return unlikely(stat_tasks[NR_MEMCG_RECLAIM]); + case PSI_MEMCG_RECLAIM_FULL: + return unlikely(stat_tasks[NR_MEMCG_RECLAIM] && + nr_running == stat_tasks[NR_MEMCG_RECLAIM_RUNNING]); + default: + return false; + } +} + +static void psi_group_stat_change(struct psi_group *group, int cpu, + int clear, int set) +{ + int t; + u32 state_mask = 0; + enum psi_stat_states s; + struct psi_group_cpu *groupc = per_cpu_ptr(group->pcpu, cpu); + + write_seqcount_begin(&groupc->seq); + + for (t = 0; clear; clear &= ~(1 << t), t++) + if (clear & (1 << t)) + groupc->fine_grained_tasks[t]--; + for (t = 0; set; set &= ~(1 << t), t++) + if (set & (1 << t)) + groupc->fine_grained_tasks[t]++; + for (s = 0; s < NR_PSI_STAT_STATES; s++) + if (test_fine_grained_stat(groupc->fine_grained_tasks, + 
groupc->tasks[NR_RUNNING], s)) + state_mask |= (1 << s); + if (unlikely(groupc->state_mask & PSI_ONCPU) && + cpu_curr(cpu)->memstall_type) + state_mask |= (1 << (cpu_curr(cpu)->memstall_type * 2 - 1)); + + groupc->fine_grained_state_mask = state_mask; + write_seqcount_end(&groupc->seq); +} + +static void psi_stat_flags_change(struct task_struct *task, int *stat_set, + int *stat_clear, int set, int clear) +{ + if (!task->memstall_type) + return; + + if (clear) { + if (clear & TSK_MEMSTALL) + *stat_clear |= 1 << (2 * task->memstall_type - 2); + if (clear & TSK_MEMSTALL_RUNNING) + *stat_clear |= 1 << (2 * task->memstall_type - 1); + } + if (set) { + if (set & TSK_MEMSTALL) + *stat_set |= 1 << (2 * task->memstall_type - 2); + if (set & TSK_MEMSTALL_RUNNING) + *stat_set |= 1 << (2 * task->memstall_type - 1); + } + if (!task->in_memstall) + task->memstall_type = 0; +} + +#else +static inline void psi_group_stat_change(struct psi_group *group, int cpu, + int clear, int set) {} +static inline void psi_stat_flags_change(struct task_struct *task, + int *stat_set, int *stat_clear, + int set, int clear) {} +#endif + static void collect_percpu_times(struct psi_group *group, enum psi_aggregators aggregator, u32 *pchanged_states) @@ -776,6 +860,10 @@ static void record_times(struct psi_group_cpu *groupc, u64 now) if (groupc->state_mask & (1 << PSI_NONIDLE)) groupc->times[PSI_NONIDLE] += delta; + +#ifdef CONFIG_PSI_FINE_GRAINED + record_stat_times(groupc, delta); +#endif } static void psi_group_change(struct psi_group *group, int cpu, @@ -937,17 +1025,21 @@ void psi_task_change(struct task_struct *task, int clear, int set) int cpu = task_cpu(task); struct psi_group *group; u64 now; + int stat_set = 0; + int stat_clear = 0; if (!task->pid) return; psi_flags_change(task, clear, set); + psi_stat_flags_change(task, &stat_set, &stat_clear, set, clear); now = cpu_clock(cpu); group = task_psi_group(task); do { psi_group_change(group, cpu, clear, set, now, true); + 
psi_group_stat_change(group, cpu, stat_clear, stat_set); } while ((group = group->parent)); } @@ -974,12 +1066,16 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next, } psi_group_change(group, cpu, 0, TSK_ONCPU, now, true); + psi_group_stat_change(group, cpu, 0, 0); } while ((group = group->parent)); } if (prev->pid) { int clear = TSK_ONCPU, set = 0; bool wake_clock = true; + int stat_set = 0; + int stat_clear = 0; + bool memstall_type_change = false; /* * When we're going to sleep, psi_dequeue() lets us @@ -1006,24 +1102,34 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next, } psi_flags_change(prev, clear, set); + psi_stat_flags_change(prev, &stat_set, &stat_clear, set, clear); group = task_psi_group(prev); do { if (group == common) break; psi_group_change(group, cpu, clear, set, now, wake_clock); + psi_group_stat_change(group, cpu, stat_clear, stat_set); } while ((group = group->parent)); +#ifdef CONFIG_PSI_FINE_GRAINED + if (next->memstall_type != prev->memstall_type) + memstall_type_change = true; +#endif + /* * TSK_ONCPU is handled up to the common ancestor. If there are * any other differences between the two tasks (e.g. prev goes * to sleep, or only one task is memstall), finish propagating * those differences all the way up to the root. */ - if ((prev->psi_flags ^ next->psi_flags) & ~TSK_ONCPU) { + if ((prev->psi_flags ^ next->psi_flags) & ~TSK_ONCPU || + memstall_type_change) { clear &= ~TSK_ONCPU; - for (; group; group = group->parent) + for (; group; group = group->parent) { psi_group_change(group, cpu, clear, set, now, wake_clock); + psi_group_stat_change(group, cpu, stat_clear, stat_set); + } } } } @@ -1066,7 +1172,9 @@ void psi_account_irqtime(struct task_struct *task, u32 delta) /** * psi_memstall_enter - mark the beginning of a memory stall section - * @flags: flags to handle nested sections + * @flags: flags to handle nested sections. 
When the memory pressure + * is stalled in pressure_stat, the flags will be the pressure source, + * Otherwise, the init flags should be zero. * * Marks the calling task as being stalled due to a lack of memory, * such as waiting for a refault or performing reclaim. @@ -1075,6 +1183,9 @@ void psi_memstall_enter(unsigned long *flags) { struct rq_flags rf; struct rq *rq; +#ifdef CONFIG_PSI_FINE_GRAINED + unsigned long stat_flags = *flags; +#endif if (static_branch_likely(&psi_disabled)) return; @@ -1092,6 +1203,10 @@ void psi_memstall_enter(unsigned long *flags) rq = this_rq_lock_irq(&rf); current->in_memstall = 1; +#ifdef CONFIG_PSI_FINE_GRAINED + if (stat_flags) + current->memstall_type = stat_flags; +#endif psi_task_change(current, 0, TSK_MEMSTALL | TSK_MEMSTALL_RUNNING); rq_unlock_irq(rq, &rf); diff --git a/mm/compaction.c b/mm/compaction.c index 38c8d216c6a3..771e9629b95c 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -3061,7 +3061,7 @@ static int kcompactd(void *p) pgdat->kcompactd_highest_zoneidx = pgdat->nr_zones - 1; while (!kthread_should_stop()) { - unsigned long pflags; + unsigned long pflags = 0; /* * Avoid the unnecessary wakeup for proactive compaction diff --git a/mm/filemap.c b/mm/filemap.c index 1c398edcfcaf..d0a2beabc68a 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1227,7 +1227,7 @@ static inline int folio_wait_bit_common(struct folio *folio, int bit_nr, struct wait_page_queue wait_page; wait_queue_entry_t *wait = &wait_page.wait; bool thrashing = false; - unsigned long pflags; + unsigned long pflags = 0; bool in_thrashing; if (bit_nr == PG_locked && @@ -1378,7 +1378,7 @@ void migration_entry_wait_on_locked(swp_entry_t entry, spinlock_t *ptl) struct wait_page_queue wait_page; wait_queue_entry_t *wait = &wait_page.wait; bool thrashing = false; - unsigned long pflags; + unsigned long pflags = 0; bool in_thrashing; wait_queue_head_t *q; struct folio *folio = page_folio(pfn_swap_entry_to_page(entry)); @@ -2366,7 +2366,7 @@ static int 
filemap_read_folio(struct file *file, filler_t filler, struct folio *folio) { bool workingset = folio_test_workingset(folio); - unsigned long pflags; + unsigned long pflags = 0; int error; /* diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 2e80504a49c0..ec33cf28f27e 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2421,6 +2421,7 @@ static unsigned long reclaim_high(struct mem_cgroup *memcg, memcg_memory_event(memcg, MEMCG_HIGH); + pflags = PSI_MEMCG_RECLAIM; psi_memstall_enter(&pflags); nr_reclaimed += try_to_free_mem_cgroup_pages(memcg, nr_pages, gfp_mask, @@ -2458,6 +2459,7 @@ static void async_reclaim_high(struct mem_cgroup *memcg) return; } + pflags = 0; psi_memstall_enter(&pflags); nr_pages = memcg_usage > safe_pages ? memcg_usage - safe_pages : MEMCG_CHARGE_BATCH; @@ -2692,6 +2694,7 @@ void mem_cgroup_handle_over_high(gfp_t gfp_mask) * schedule_timeout_killable sets TASK_KILLABLE). This means we don't * need to account for any ill-begotten jiffies to pay them off later. 
*/ + pflags = PSI_MEMCG_RECLAIM; psi_memstall_enter(&pflags); schedule_timeout_killable(penalty_jiffies); psi_memstall_leave(&pflags); @@ -2753,6 +2756,7 @@ static int try_charge_memcg(struct mem_cgroup *memcg, gfp_t gfp_mask, memcg_memory_event(mem_over_limit, MEMCG_MAX); raised_max_event = true; + pflags = PSI_MEMCG_RECLAIM; psi_memstall_enter(&pflags); nr_reclaimed = try_to_free_mem_cgroup_pages(mem_over_limit, nr_pages, gfp_mask, reclaim_options); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index f5b61c1060d1..83ccf40e8bda 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3518,6 +3518,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, if (!order) return NULL; + pflags = 0; psi_memstall_enter(&pflags); delayacct_compact_start(); noreclaim_flag = memalloc_noreclaim_save(); @@ -3787,6 +3788,7 @@ __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order, unsigned long pflags; bool drained = false; + pflags = 0; psi_memstall_enter(&pflags); *did_some_progress = __perform_reclaim(gfp_mask, order, ac); if (unlikely(!(*did_some_progress))) diff --git a/mm/page_io.c b/mm/page_io.c index fe4c21af23f2..9230f4fac3ce 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -509,6 +509,7 @@ void swap_readpage(struct page *page, bool synchronous, struct swap_iocb **plug) */ if (workingset) { delayacct_thrashing_start(&in_thrashing); + pflags = 0; psi_memstall_enter(&pflags); } delayacct_swapin_start(); diff --git a/mm/readahead.c b/mm/readahead.c index 6925e6959fd3..c8d468e6253c 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -152,8 +152,10 @@ static void read_pages(struct readahead_control *rac) if (!readahead_count(rac)) return; - if (unlikely(rac->_workingset)) + if (unlikely(rac->_workingset)) { + rac->_pflags = 0; psi_memstall_enter(&rac->_pflags); + } blk_start_plug(&plug); if (aops->readahead) { @@ -803,6 +805,7 @@ void readahead_expand(struct readahead_control *ractl, if (unlikely(folio_test_workingset(folio)) && !ractl->_workingset) { 
ractl->_workingset = true; + ractl->_pflags = 0; psi_memstall_enter(&ractl->_pflags); } ractl->_nr_pages++; @@ -830,6 +833,7 @@ void readahead_expand(struct readahead_control *ractl, if (unlikely(folio_test_workingset(folio)) && !ractl->_workingset) { ractl->_workingset = true; + ractl->_pflags = 0; psi_memstall_enter(&ractl->_pflags); } ractl->_nr_pages++; diff --git a/mm/vmscan.c b/mm/vmscan.c index 7a676296af30..1f2ccc5d7e99 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -7393,7 +7393,7 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int highest_zoneidx) int i; unsigned long nr_soft_reclaimed; unsigned long nr_soft_scanned; - unsigned long pflags; + unsigned long pflags = 0; unsigned long nr_boost_reclaim; unsigned long zone_boosts[MAX_NR_ZONES] = { 0, }; bool boosted; @@ -8064,6 +8064,7 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in sc.gfp_mask); cond_resched(); + pflags = 0; psi_memstall_enter(&pflags); fs_reclaim_acquire(sc.gfp_mask); /* -- Gitee From 0df60cce43301df24dd5fa8bedaea5feb10c87d1 Mon Sep 17 00:00:00 2001 From: Lu Jialin Date: Thu, 4 Jan 2024 07:33:12 +0000 Subject: [PATCH 06/10] sched/psi: Introduce avgs and total calculation for cgroup reclaim hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I8QUNW ------------------------------- Introduce avgs and total calculation depend on the fine grained time collect in psi_avgs_works() for cgroup_reclaim. The results will be shown in pressure.stat, which will be done in the next patch. 
Signed-off-by: Lu Jialin --- include/linux/psi_types.h | 10 ++++++++- kernel/sched/psi.c | 45 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 54 insertions(+), 1 deletion(-) diff --git a/include/linux/psi_types.h b/include/linux/psi_types.h index 35b73f66bf7f..b9a25925ccb5 100644 --- a/include/linux/psi_types.h +++ b/include/linux/psi_types.h @@ -124,6 +124,8 @@ struct psi_group_cpu { u32 fine_grained_state_mask; u32 fine_grained_times[NR_PSI_STAT_STATES]; unsigned int fine_grained_tasks[NR_PSI_STAT_TASK_COUNTS]; + u32 fine_grained_times_delta; + u32 fine_grained_times_prev[NR_PSI_AGGREGATORS][NR_PSI_STAT_STATES]; #endif }; @@ -226,8 +228,14 @@ struct psi_group { u64 rtpoll_total[NR_PSI_STATES - 1]; u64 rtpoll_next_update; u64 rtpoll_until; +#ifdef CONFIG_PSI_FINE_GRAINED + /* Running fine grained pressure averages */ + u64 fine_grained_avg_total[NR_PSI_STAT_STATES]; + /* Total fine grained stall times and sampled pressure averages */ + u64 fine_grained_total[NR_PSI_AGGREGATORS][NR_PSI_STAT_STATES]; + unsigned long fine_grained_avg[NR_PSI_STAT_STATES][3]; +#endif }; - #else /* CONFIG_PSI */ #define NR_PSI_RESOURCES 0 diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index be896eb94f74..545f892fea5a 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -293,6 +293,10 @@ static void get_recent_times(struct psi_group *group, int cpu, *pchanged_states |= (1 << s); } +#ifdef CONFIG_PSI_FINE_GRAINED + groupc->fine_grained_times_delta = now - state_start; +#endif + /* * When collect_percpu_times() from the avgs_work, we don't want to * re-arm avgs_work when all CPUs are IDLE. 
But the current CPU running @@ -412,6 +416,23 @@ static void psi_stat_flags_change(struct task_struct *task, int *stat_set, task->memstall_type = 0; } +static void get_recent_stat_times(struct psi_group *group, int cpu, + enum psi_aggregators aggregator, u64 *stat_delta, u64 nonidle) +{ + struct psi_group_cpu *groupc = per_cpu_ptr(group->pcpu, cpu); + u32 times[NR_PSI_STAT_STATES] = {0}; + enum psi_stat_states s; + u32 delta; + + memcpy(times, groupc->fine_grained_times, sizeof(groupc->fine_grained_times)); + for (s = 0; s < NR_PSI_STAT_STATES; s++) { + if (groupc->fine_grained_state_mask & (1 << s)) + times[s] += groupc->fine_grained_times_delta; + delta = times[s] - groupc->fine_grained_times_prev[aggregator][s]; + groupc->fine_grained_times_prev[aggregator][s] = times[s]; + stat_delta[s] += (u64)delta * nonidle; + } +} #else static inline void psi_group_stat_change(struct psi_group *group, int cpu, int clear, int set) {} @@ -424,6 +445,9 @@ static void collect_percpu_times(struct psi_group *group, enum psi_aggregators aggregator, u32 *pchanged_states) { +#ifdef CONFIG_PSI_FINE_GRAINED + u64 stat_delta[NR_PSI_STAT_STATES] = { 0 }; +#endif u64 deltas[NR_PSI_STATES - 1] = { 0, }; unsigned long nonidle_total = 0; u32 changed_states = 0; @@ -452,6 +476,9 @@ static void collect_percpu_times(struct psi_group *group, for (s = 0; s < PSI_NONIDLE; s++) deltas[s] += (u64)times[s] * nonidle; +#ifdef CONFIG_PSI_FINE_GRAINED + get_recent_stat_times(group, cpu, aggregator, stat_delta, nonidle); +#endif } /* @@ -471,6 +498,12 @@ static void collect_percpu_times(struct psi_group *group, group->total[aggregator][s] += div_u64(deltas[s], max(nonidle_total, 1UL)); +#ifdef CONFIG_PSI_FINE_GRAINED + for (s = 0; s < NR_PSI_STAT_STATES; s++) + group->fine_grained_total[aggregator][s] += + div_u64(stat_delta[s], max(nonidle_total, 1UL)); +#endif + if (pchanged_states) *pchanged_states = changed_states; } @@ -643,6 +676,18 @@ static u64 update_averages(struct psi_group *group, u64 now) 
group->avg_total[s] += sample; calc_avgs(group->avg[s], missed_periods, sample, period); } +#ifdef CONFIG_PSI_FINE_GRAINED + for (s = 0; s < NR_PSI_STAT_STATES; s++) { + u32 stat_sample; + + stat_sample = group->fine_grained_total[PSI_AVGS][s] - + group->fine_grained_avg_total[s]; + if (stat_sample > period) + stat_sample = period; + group->fine_grained_avg_total[s] += stat_sample; + calc_avgs(group->fine_grained_avg[s], missed_periods, stat_sample, period); + } +#endif return avg_next_update; } -- Gitee From 4a69b56540f40550585b9f93c35e415ac90bdecc Mon Sep 17 00:00:00 2001 From: Lu Jialin Date: Thu, 4 Jan 2024 07:33:13 +0000 Subject: [PATCH 07/10] sched/psi: Introduce pressure.stat in psi hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I8QUNW ------------------------------- Introduce pressure.stat in psi for cgroupv1 and system, which will show the fine grained time tracking for cgroup memory reclaim. for example: /test # cat /tmp/cpuacct/test/pressure.stat cgroup_memory_reclaim some avg10=45.78 avg60=10.40 avg300=2.26 total=13491160 full avg10=0.00 avg60=0.00 avg300=0.00 total=0 Signed-off-by: Lu Jialin --- include/linux/psi.h | 4 +++ kernel/cgroup/cgroup.c | 12 +++++++++ kernel/sched/psi.c | 61 ++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 77 insertions(+) diff --git a/include/linux/psi.h b/include/linux/psi.h index 9e5d49cf62d5..7236b888b040 100644 --- a/include/linux/psi.h +++ b/include/linux/psi.h @@ -34,6 +34,10 @@ void psi_trigger_destroy(struct psi_trigger *t); __poll_t psi_trigger_poll(void **trigger_ptr, struct file *file, poll_table *wait); +#ifdef CONFIG_PSI_FINE_GRAINED +int psi_stat_show(struct seq_file *s, struct psi_group *group); +#endif + #ifdef CONFIG_CGROUPS static inline struct psi_group *cgroup_psi(struct cgroup *cgrp) { diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 24a657d130c1..7ee2260c1b9c 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -3920,6 
+3920,13 @@ bool cgroup_psi_enabled(void) } #ifdef CONFIG_PSI_CGROUP_V1 +#ifdef CONFIG_PSI_FINE_GRAINED +static int cgroup_psi_stat_show(struct seq_file *seq, void *v) +{ + return psi_stat_show(seq, cgroup_psi(seq_css(seq)->cgroup)); +} +#endif + struct cftype cgroup_v1_psi_files[] = { { .name = "io.pressure", @@ -3955,6 +3962,11 @@ struct cftype cgroup_v1_psi_files[] = { .release = cgroup_pressure_release, }, #endif + { + .name = "pressure.stat", + .flags = CFTYPE_NO_PREFIX, + .seq_show = cgroup_psi_stat_show, + }, { } /* terminate */ }; #endif diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index 545f892fea5a..cec022b09858 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -1814,6 +1814,64 @@ static const struct proc_ops psi_cpu_proc_ops = { .proc_release = psi_fop_release, }; +#ifdef CONFIG_PSI_FINE_GRAINED +static const char *const psi_stat_names[] = { + "cgroup_memory_reclaim", +}; + +int psi_stat_show(struct seq_file *m, struct psi_group *group) +{ + int i; + u64 now; + + if (static_branch_likely(&psi_disabled)) + return -EOPNOTSUPP; + + mutex_lock(&group->avgs_lock); + now = sched_clock(); + collect_percpu_times(group, PSI_AVGS, NULL); + if (now >= group->avg_next_update) + group->avg_next_update = update_averages(group, now); + mutex_unlock(&group->avgs_lock); + + for (i = 0; i < NR_PSI_STAT_STATES; i++) { + unsigned long avg[3] = {0, }; + int w; + u64 total; + bool is_full = i % 2; + + for (w = 0; w < 3; w++) + avg[w] = group->fine_grained_avg[i][w]; + total = div_u64(group->fine_grained_total[PSI_AVGS][i], NSEC_PER_USEC); + if (!is_full) + seq_printf(m, "%s\n", psi_stat_names[i / 2]); + seq_printf(m, "%s avg10=%lu.%02lu avg60=%lu.%02lu avg300=%lu.%02lu total=%llu\n", + is_full ? 
"full" : "some", + LOAD_INT(avg[0]), LOAD_FRAC(avg[0]), + LOAD_INT(avg[1]), LOAD_FRAC(avg[1]), + LOAD_INT(avg[2]), LOAD_FRAC(avg[2]), + total); + } + return 0; +} +static int system_psi_stat_show(struct seq_file *m, void *v) +{ + return psi_stat_show(m, &psi_system); +} + +static int psi_stat_open(struct inode *inode, struct file *file) +{ + return single_open(file, system_psi_stat_show, NULL); +} + +static const struct proc_ops psi_stat_proc_ops = { + .proc_open = psi_stat_open, + .proc_read = seq_read, + .proc_lseek = seq_lseek, + .proc_release = psi_fop_release, +}; +#endif + #ifdef CONFIG_IRQ_TIME_ACCOUNTING static int psi_irq_show(struct seq_file *m, void *v) { @@ -1850,6 +1908,9 @@ static int __init psi_proc_init(void) proc_create("pressure/cpu", 0666, NULL, &psi_cpu_proc_ops); #ifdef CONFIG_IRQ_TIME_ACCOUNTING proc_create("pressure/irq", 0666, NULL, &psi_irq_proc_ops); +#endif +#ifdef CONFIG_PSI_FINE_GRAINED + proc_create("pressure/stat", 0666, NULL, &psi_stat_proc_ops); #endif } return 0; -- Gitee From 25d00f6853c3c6c5a29f3be8b05be1c03363c0ab Mon Sep 17 00:00:00 2001 From: Lu Jialin Date: Thu, 4 Jan 2024 07:33:14 +0000 Subject: [PATCH 08/10] sched/psi: add more memory fine grained stall tracking in pressure.stat hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I8QUNW ------------------------------- Introcude more memory fine grianed stall tracking in pressure.stat, such as global memory relcaim, memory compact, memory async cgroup reclaim and swap. 
Signed-off-by: Lu Jialin --- include/linux/psi_types.h | 24 +++++++++++++++++++++ kernel/sched/psi.c | 44 +++++++++++++++++++++++++++++++++++++++ mm/memcontrol.c | 2 +- mm/page_alloc.c | 4 ++-- mm/page_io.c | 2 +- mm/vmscan.c | 2 +- 6 files changed, 73 insertions(+), 5 deletions(-) diff --git a/include/linux/psi_types.h b/include/linux/psi_types.h index b9a25925ccb5..7fa21afaa844 100644 --- a/include/linux/psi_types.h +++ b/include/linux/psi_types.h @@ -85,12 +85,28 @@ enum psi_aggregators { enum psi_stat_states { PSI_MEMCG_RECLAIM_SOME, PSI_MEMCG_RECLAIM_FULL, + PSI_GLOBAL_RECLAIM_SOME, + PSI_GLOBAL_RECLAIM_FULL, + PSI_COMPACT_SOME, + PSI_COMPACT_FULL, + PSI_ASYNC_MEMCG_RECLAIM_SOME, + PSI_ASYNC_MEMCG_RECLAIM_FULL, + PSI_SWAP_SOME, + PSI_SWAP_FULL, NR_PSI_STAT_STATES, }; enum psi_stat_task_count { NR_MEMCG_RECLAIM, NR_MEMCG_RECLAIM_RUNNING, + NR_GLOBAL_RECLAIM, + NR_GLOBAL_RECLAIM_RUNNING, + NR_COMPACT, + NR_COMPACT_RUNNING, + NR_ASYNC_MEMCG_RECLAIM, + NR_ASYNC_MEMCG_RECLAIM_RUNNING, + NR_SWAP, + NR_SWAP_RUNNING, NR_PSI_STAT_TASK_COUNTS, }; #endif /* CONFIG_PSI_FINE_GRAINED */ @@ -256,9 +272,17 @@ struct psi_group { }; */ enum psi_memstall_type { PSI_MEMCG_RECLAIM = 1, + PSI_GLOBAL_RECLAIM, + PSI_COMPACT, + PSI_ASYNC_MEMCG_RECLAIM, + PSI_SWAP, }; #else #define PSI_MEMCG_RECLAIM 0 +#define PSI_GLOBAL_RECLAIM 0 +#define PSI_COMPACT 0 +#define PSI_ASYNC_MEMCG_RECLAIM 0 +#define PSI_SWAP 0 #endif /* CONFIG_PSI_FINE_GRAINED */ #endif /* _LINUX_PSI_TYPES_H */ diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index cec022b09858..7f8bb7bd4820 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -349,6 +349,26 @@ static void record_stat_times(struct psi_group_cpu *groupc, u32 delta) if (groupc->fine_grained_state_mask & (1 << PSI_MEMCG_RECLAIM_FULL)) groupc->fine_grained_times[PSI_MEMCG_RECLAIM_FULL] += delta; } + if (groupc->fine_grained_state_mask & (1 << PSI_GLOBAL_RECLAIM_SOME)) { + groupc->fine_grained_times[PSI_GLOBAL_RECLAIM_SOME] += delta; + if 
(groupc->fine_grained_state_mask & (1 << PSI_GLOBAL_RECLAIM_FULL)) + groupc->fine_grained_times[PSI_GLOBAL_RECLAIM_FULL] += delta; + } + if (groupc->fine_grained_state_mask & (1 << PSI_COMPACT_SOME)) { + groupc->fine_grained_times[PSI_COMPACT_SOME] += delta; + if (groupc->fine_grained_state_mask & (1 << PSI_COMPACT_FULL)) + groupc->fine_grained_times[PSI_COMPACT_FULL] += delta; + } + if (groupc->fine_grained_state_mask & (1 << PSI_ASYNC_MEMCG_RECLAIM_SOME)) { + groupc->fine_grained_times[PSI_ASYNC_MEMCG_RECLAIM_SOME] += delta; + if (groupc->fine_grained_state_mask & (1 << PSI_ASYNC_MEMCG_RECLAIM_FULL)) + groupc->fine_grained_times[PSI_ASYNC_MEMCG_RECLAIM_FULL] += delta; + } + if (groupc->fine_grained_state_mask & (1 << PSI_SWAP_SOME)) { + groupc->fine_grained_times[PSI_SWAP_SOME] += delta; + if (groupc->fine_grained_state_mask & (1 << PSI_SWAP_FULL)) + groupc->fine_grained_times[PSI_SWAP_FULL] += delta; + } } static bool test_fine_grained_stat(unsigned int *stat_tasks, @@ -361,6 +381,26 @@ static bool test_fine_grained_stat(unsigned int *stat_tasks, case PSI_MEMCG_RECLAIM_FULL: return unlikely(stat_tasks[NR_MEMCG_RECLAIM] && nr_running == stat_tasks[NR_MEMCG_RECLAIM_RUNNING]); + case PSI_GLOBAL_RECLAIM_SOME: + return unlikely(stat_tasks[NR_GLOBAL_RECLAIM]); + case PSI_GLOBAL_RECLAIM_FULL: + return unlikely(stat_tasks[NR_GLOBAL_RECLAIM] && + nr_running == stat_tasks[NR_GLOBAL_RECLAIM_RUNNING]); + case PSI_COMPACT_SOME: + return unlikely(stat_tasks[NR_COMPACT]); + case PSI_COMPACT_FULL: + return unlikely(stat_tasks[NR_COMPACT] && + nr_running == stat_tasks[NR_COMPACT_RUNNING]); + case PSI_ASYNC_MEMCG_RECLAIM_SOME: + return unlikely(stat_tasks[NR_ASYNC_MEMCG_RECLAIM]); + case PSI_ASYNC_MEMCG_RECLAIM_FULL: + return unlikely(stat_tasks[NR_ASYNC_MEMCG_RECLAIM] && + nr_running == stat_tasks[NR_ASYNC_MEMCG_RECLAIM_RUNNING]); + case PSI_SWAP_SOME: + return unlikely(stat_tasks[NR_SWAP]); + case PSI_SWAP_FULL: + return unlikely(stat_tasks[NR_SWAP] && + nr_running == 
stat_tasks[NR_SWAP_RUNNING]); default: return false; } @@ -1817,6 +1857,10 @@ static const struct proc_ops psi_cpu_proc_ops = { #ifdef CONFIG_PSI_FINE_GRAINED static const char *const psi_stat_names[] = { "cgroup_memory_reclaim", + "global_memory_reclaim", + "compact", + "cgroup_async_memory_reclaim", + "swap", }; int psi_stat_show(struct seq_file *m, struct psi_group *group) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index ec33cf28f27e..858a4df4d659 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2459,7 +2459,7 @@ static void async_reclaim_high(struct mem_cgroup *memcg) return; } - pflags = 0; + pflags = PSI_ASYNC_MEMCG_RECLAIM; psi_memstall_enter(&pflags); nr_pages = memcg_usage > safe_pages ? memcg_usage - safe_pages : MEMCG_CHARGE_BATCH; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 83ccf40e8bda..65fd05d6fa57 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3518,7 +3518,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, if (!order) return NULL; - pflags = 0; + pflags = PSI_COMPACT; psi_memstall_enter(&pflags); delayacct_compact_start(); noreclaim_flag = memalloc_noreclaim_save(); @@ -3788,7 +3788,7 @@ __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order, unsigned long pflags; bool drained = false; - pflags = 0; + pflags = PSI_GLOBAL_RECLAIM; psi_memstall_enter(&pflags); *did_some_progress = __perform_reclaim(gfp_mask, order, ac); if (unlikely(!(*did_some_progress))) diff --git a/mm/page_io.c b/mm/page_io.c index 9230f4fac3ce..8fa62c6ba517 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -509,7 +509,7 @@ void swap_readpage(struct page *page, bool synchronous, struct swap_iocb **plug) */ if (workingset) { delayacct_thrashing_start(&in_thrashing); - pflags = 0; + pflags = PSI_SWAP; psi_memstall_enter(&pflags); } delayacct_swapin_start(); diff --git a/mm/vmscan.c b/mm/vmscan.c index 1f2ccc5d7e99..31fa473e4611 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -8064,7 +8064,7 @@ static int __node_reclaim(struct 
pglist_data *pgdat, gfp_t gfp_mask, unsigned in sc.gfp_mask); cond_resched(); - pflags = 0; + pflags = PSI_GLOBAL_RECLAIM; psi_memstall_enter(&pflags); fs_reclaim_acquire(sc.gfp_mask); /* -- Gitee From 654944510822988390470cbc5b6f914c19dd9b88 Mon Sep 17 00:00:00 2001 From: Lu Jialin Date: Thu, 4 Jan 2024 07:33:15 +0000 Subject: [PATCH 09/10] sched/psi: add cpu fine grained stall tracking in pressure.stat hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I8QUNW ------------------------------- Introduce cpu fine grained stall tracking(cpu cfs bandwidth or cpu qos) in pressure.stat. For cpu fine grained stall tracking, only "full" information in pressure.stat. for example: /test # cat /tmp/cpuacct/test/pressure.stat cgroup_memory_reclaim some avg10=0.00 avg60=0.00 avg300=0.00 total=0 full avg10=0.00 avg60=0.00 avg300=0.00 total=0 global_memory_reclaim some avg10=0.00 avg60=0.00 avg300=0.00 total=0 full avg10=0.00 avg60=0.00 avg300=0.00 total=0 compact some avg10=0.00 avg60=0.00 avg300=0.00 total=0 full avg10=0.00 avg60=0.00 avg300=0.00 total=0 cgroup_async_memory_reclaim some avg10=0.00 avg60=0.00 avg300=0.00 total=0 full avg10=0.00 avg60=0.00 avg300=0.00 total=0 swap some avg10=0.00 avg60=0.00 avg300=0.00 total=0 full avg10=0.00 avg60=0.00 avg300=0.00 total=0 cpu_cfs_bandwidth full avg10=21.76 avg60=4.58 avg300=0.98 total=3893827 cpu_qos full avg10=0.00 avg60=0.00 avg300=0.00 total=0 Signed-off-by: Lu Jialin --- include/linux/psi_types.h | 8 +++++ kernel/sched/fair.c | 6 ---- kernel/sched/psi.c | 62 +++++++++++++++++++++++++++++++++++---- kernel/sched/stats.h | 8 +++++ 4 files changed, 73 insertions(+), 11 deletions(-) diff --git a/include/linux/psi_types.h b/include/linux/psi_types.h index 7fa21afaa844..3a72e6def0f7 100644 --- a/include/linux/psi_types.h +++ b/include/linux/psi_types.h @@ -82,6 +82,8 @@ enum psi_aggregators { }; #ifdef CONFIG_PSI_FINE_GRAINED +#define CPU_CFS_BANDWIDTH 1 + enum psi_stat_states { 
PSI_MEMCG_RECLAIM_SOME, PSI_MEMCG_RECLAIM_FULL, @@ -93,6 +95,10 @@ enum psi_stat_states { PSI_ASYNC_MEMCG_RECLAIM_FULL, PSI_SWAP_SOME, PSI_SWAP_FULL, + PSI_CPU_CFS_BANDWIDTH_FULL, +#ifdef CONFIG_QOS_SCHED + PSI_CPU_QOS_FULL, +#endif NR_PSI_STAT_STATES, }; @@ -142,6 +148,8 @@ struct psi_group_cpu { unsigned int fine_grained_tasks[NR_PSI_STAT_TASK_COUNTS]; u32 fine_grained_times_delta; u32 fine_grained_times_prev[NR_PSI_AGGREGATORS][NR_PSI_STAT_STATES]; + int prev_throttle; + int cur_throttle; #endif }; diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 318258ea011e..df2032c75a72 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -139,12 +139,6 @@ int __weak arch_asym_cpu_priority(int cpu) #ifdef CONFIG_QOS_SCHED -/* - * To distinguish cfs bw, use QOS_THROTTLED mark cfs_rq->throttled - * when qos throttled(and cfs bw throttle mark cfs_rq->throttled as 1). - */ -#define QOS_THROTTLED 2 - static DEFINE_PER_CPU_SHARED_ALIGNED(struct list_head, qos_throttled_cfs_rq); static DEFINE_PER_CPU_SHARED_ALIGNED(struct hrtimer, qos_overload_timer); static DEFINE_PER_CPU(int, qos_cpu_overload); diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index 7f8bb7bd4820..15192a436dd0 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -369,6 +369,16 @@ static void record_stat_times(struct psi_group_cpu *groupc, u32 delta) if (groupc->fine_grained_state_mask & (1 << PSI_SWAP_FULL)) groupc->fine_grained_times[PSI_SWAP_FULL] += delta; } +#ifdef CONFIG_CFS_BANDWIDTH + if (groupc->state_mask & (1 << PSI_CPU_FULL)) { + if (groupc->prev_throttle == CPU_CFS_BANDWIDTH) + groupc->fine_grained_times[PSI_CPU_CFS_BANDWIDTH_FULL] += delta; +#ifdef CONFIG_QOS_SCHED + else if (groupc->prev_throttle == QOS_THROTTLED) + groupc->fine_grained_times[PSI_CPU_QOS_FULL] += delta; + } +#endif +#endif } static bool test_fine_grained_stat(unsigned int *stat_tasks, @@ -422,7 +432,7 @@ static void psi_group_stat_change(struct psi_group *group, int cpu, for (t = 0; set; set &= ~(1 << 
t), t++) if (set & (1 << t)) groupc->fine_grained_tasks[t]++; - for (s = 0; s < NR_PSI_STAT_STATES; s++) + for (s = 0; s < PSI_CPU_CFS_BANDWIDTH_FULL; s++) if (test_fine_grained_stat(groupc->fine_grained_tasks, groupc->tasks[NR_RUNNING], s)) state_mask |= (1 << s); @@ -481,6 +491,32 @@ static inline void psi_stat_flags_change(struct task_struct *task, int set, int clear) {} #endif +#if defined(CONFIG_CFS_BANDWIDTH) && defined(CONFIG_CGROUP_CPUACCT) && \ + defined(CONFIG_PSI_FINE_GRAINED) +static void update_throttle_type(struct task_struct *task, int cpu, bool next) +{ + if (!cgroup_subsys_on_dfl(cpuacct_cgrp_subsys)) { + struct cgroup *cpuacct_cgrp; + struct psi_group_cpu *groupc; + struct task_group *tsk_grp; + + rcu_read_lock(); + cpuacct_cgrp = task_cgroup(task, cpuacct_cgrp_id); + if (cgroup_parent(cpuacct_cgrp)) { + groupc = per_cpu_ptr(cgroup_psi(cpuacct_cgrp)->pcpu, cpu); + tsk_grp = task_group(task); + if (next) + groupc->prev_throttle = groupc->cur_throttle; + groupc->cur_throttle = tsk_grp->cfs_rq[cpu]->throttled; + } + rcu_read_unlock(); + } +} +#else +static inline void update_throttle_type(struct task_struct *task, int cpu, + bool next) {} +#endif + static void collect_percpu_times(struct psi_group *group, enum psi_aggregators aggregator, u32 *pchanged_states) @@ -1019,8 +1055,9 @@ static void psi_group_change(struct psi_group *group, int cpu, * may have already incorporated the live state into times_prev; * avoid a delta sample underflow when PSI is later re-enabled. */ - if (unlikely(groupc->state_mask & (1 << PSI_NONIDLE))) + if (unlikely(groupc->state_mask & (1 << PSI_NONIDLE))) { record_times(groupc, now); + } groupc->state_mask = state_mask; @@ -1136,6 +1173,7 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next, u64 now = cpu_clock(cpu); if (next->pid) { + update_throttle_type(next, cpu, true); psi_flags_change(next, 0, TSK_ONCPU); /* * Set TSK_ONCPU on @next's cgroups. 
If @next shares any @@ -1162,6 +1200,7 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next, int stat_clear = 0; bool memstall_type_change = false; + update_throttle_type(prev, cpu, false); /* * When we're going to sleep, psi_dequeue() lets us * handle TSK_RUNNING, TSK_MEMSTALL_RUNNING and @@ -1861,8 +1900,22 @@ static const char *const psi_stat_names[] = { "compact", "cgroup_async_memory_reclaim", "swap", + "cpu_cfs_bandwidth", + "cpu_qos", }; +static void get_stat_names(struct seq_file *m, int i, bool is_full) +{ + if (i <= PSI_SWAP_FULL && !is_full) + return seq_printf(m, "%s\n", psi_stat_names[i / 2]); + else if (i == PSI_CPU_CFS_BANDWIDTH_FULL) + return seq_printf(m, "%s\n", "cpu_cfs_bandwidth"); +#ifdef CONFIG_QOS_SCHED + else if (i == PSI_CPU_QOS_FULL) + return seq_printf(m, "%s\n", "cpu_qos"); +#endif +} + int psi_stat_show(struct seq_file *m, struct psi_group *group) { int i; @@ -1882,13 +1935,12 @@ int psi_stat_show(struct seq_file *m, struct psi_group *group) unsigned long avg[3] = {0, }; int w; u64 total; - bool is_full = i % 2; + bool is_full = i % 2 || i > PSI_SWAP_FULL; for (w = 0; w < 3; w++) avg[w] = group->fine_grained_avg[i][w]; total = div_u64(group->fine_grained_total[PSI_AVGS][i], NSEC_PER_USEC); - if (!is_full) - seq_printf(m, "%s\n", psi_stat_names[i / 2]); + get_stat_names(m, i, is_full); seq_printf(m, "%s avg10=%lu.%02lu avg60=%lu.%02lu avg300=%lu.%02lu total=%llu\n", is_full ? "full" : "some", LOAD_INT(avg[0]), LOAD_FRAC(avg[0]), diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h index 4ccc1f120d67..a6d7206d969d 100644 --- a/kernel/sched/stats.h +++ b/kernel/sched/stats.h @@ -126,6 +126,14 @@ __schedstats_from_se(struct sched_entity *se) return &task_of(se)->stats; } +#ifdef CONFIG_QOS_SCHED +/* + * To distinguish cfs bw, use QOS_THROTTLED mark cfs_rq->throttled + * when qos throttled(and cfs bw throttle mark cfs_rq->throttled as 1). 
+ */ +#define QOS_THROTTLED 2 +#endif + #ifdef CONFIG_PSI void psi_task_change(struct task_struct *task, int clear, int set); void psi_task_switch(struct task_struct *prev, struct task_struct *next, -- Gitee From c36bfe97a2529c96d8169b8cd10ec88de1cd138b Mon Sep 17 00:00:00 2001 From: Lu Jialin Date: Thu, 4 Jan 2024 07:33:16 +0000 Subject: [PATCH 10/10] sched/psi: enable PSI_CGROUP_V1 and PSI_FINE_GRAINED in openeuler_defconfig hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I8QUNW ------------------------------- enable CONFIG_PSI_CGROUP_V1 and CONFIG_PSI_FINE_GRAINED in openeuler_defconfig Signed-off-by: Lu Jialin --- arch/arm64/configs/openeuler_defconfig | 2 ++ arch/x86/configs/openeuler_defconfig | 2 ++ 2 files changed, 4 insertions(+) diff --git a/arch/arm64/configs/openeuler_defconfig b/arch/arm64/configs/openeuler_defconfig index 8e29cb161800..de3b2a5069ec 100644 --- a/arch/arm64/configs/openeuler_defconfig +++ b/arch/arm64/configs/openeuler_defconfig @@ -112,6 +112,8 @@ CONFIG_TASK_XACCT=y CONFIG_TASK_IO_ACCOUNTING=y CONFIG_PSI=y CONFIG_PSI_DEFAULT_DISABLED=y +CONFIG_PSI_CGROUP_V1=y +CONFIG_PSI_FINE_GRAINED=y # end of CPU/Task time and stats accounting CONFIG_CPU_ISOLATION=y diff --git a/arch/x86/configs/openeuler_defconfig b/arch/x86/configs/openeuler_defconfig index b3e32559ab62..847bdf4c4d3e 100644 --- a/arch/x86/configs/openeuler_defconfig +++ b/arch/x86/configs/openeuler_defconfig @@ -132,6 +132,8 @@ CONFIG_TASK_XACCT=y CONFIG_TASK_IO_ACCOUNTING=y CONFIG_PSI=y CONFIG_PSI_DEFAULT_DISABLED=y +CONFIG_PSI_CGROUP_V1=y +CONFIG_PSI_FINE_GRAINED=y # end of CPU/Task time and stats accounting CONFIG_CPU_ISOLATION=y -- Gitee