diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 41644336e358727b5b9b184761e1d11b1332fcca..3f68499b4ac40fbcc830aed9bfb9419a9cc67678 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -4652,6 +4652,10 @@
 			tracking.
 			Format: <bool>
 
+	psi_v1=		[KNL] Enable or disable pressure stall information
+			tracking on cgroup v1.
+			Format: <bool>
+
 	psmouse.proto=	[HW,MOUSE] Highest PS2 mouse protocol extension to
 			probe for; one of (bare|imps|exps|lifebook|any).
 	psmouse.rate=	[HW,MOUSE] Set desired mouse report rate, in reports
diff --git a/arch/arm64/configs/openeuler_defconfig b/arch/arm64/configs/openeuler_defconfig
index 33ba397118848b915123f43d26e03439cda4af23..6379a42fe312fc0495114009890bfac05ea7423b 100644
--- a/arch/arm64/configs/openeuler_defconfig
+++ b/arch/arm64/configs/openeuler_defconfig
@@ -112,6 +112,8 @@ CONFIG_TASK_XACCT=y
 CONFIG_TASK_IO_ACCOUNTING=y
 CONFIG_PSI=y
 CONFIG_PSI_DEFAULT_DISABLED=y
+CONFIG_PSI_CGROUP_V1=y
+CONFIG_PSI_FINE_GRAINED=y
 # end of CPU/Task time and stats accounting
 
 CONFIG_CPU_ISOLATION=y
diff --git a/arch/x86/configs/openeuler_defconfig b/arch/x86/configs/openeuler_defconfig
index 44040b835333c8a85fa7472f393ecc3275fcaea0..aa31554766ec3d6a41b8f4a6d1ed0785baa8aea7 100644
--- a/arch/x86/configs/openeuler_defconfig
+++ b/arch/x86/configs/openeuler_defconfig
@@ -132,6 +132,8 @@ CONFIG_TASK_XACCT=y
 CONFIG_TASK_IO_ACCOUNTING=y
 CONFIG_PSI=y
 CONFIG_PSI_DEFAULT_DISABLED=y
+CONFIG_PSI_CGROUP_V1=y
+CONFIG_PSI_FINE_GRAINED=y
 # end of CPU/Task time and stats accounting
 
 CONFIG_CPU_ISOLATION=y
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 4a42ea2972ad85693480c5e9c0e9599923c73ccf..eb5bc214a9669e2fd3d2f5a47c1fc566b4960924 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -1831,7 +1831,7 @@ static void blkcg_scale_delay(struct blkcg_gq *blkg, u64 now)
  */
 static void blkcg_maybe_throttle_blkg(struct blkcg_gq *blkg, bool use_memdelay)
 {
-	unsigned long pflags;
+	unsigned long pflags = 0;
 	bool clamp;
 	u64 now = ktime_to_ns(ktime_get());
 	u64 exp;
diff --git a/include/linux/psi.h b/include/linux/psi.h
index e0745873e3f26cf4c471aac72c0e1ed373fbb662..a01e5b857ba5648bf5a05002ed392f456b23ff53 100644
--- a/include/linux/psi.h
+++ b/include/linux/psi.h
@@ -15,6 +15,9 @@ struct css_set;
 #ifdef CONFIG_PSI
 
 extern struct static_key_false psi_disabled;
+#ifdef CONFIG_PSI_CGROUP_V1
+extern struct static_key_true psi_v1_disabled;
+#endif
 extern struct psi_group psi_system;
 
 void psi_init(void);
@@ -31,6 +34,10 @@ void psi_trigger_destroy(struct psi_trigger *t);
 __poll_t psi_trigger_poll(void **trigger_ptr, struct file *file,
 			poll_table *wait);
 
+#ifdef CONFIG_PSI_FINE_GRAINED
+int psi_stat_show(struct seq_file *s, struct psi_group *group);
+#endif
+
 #ifdef CONFIG_CGROUPS
 static inline struct psi_group *cgroup_psi(struct cgroup *cgrp)
 {
diff --git a/include/linux/psi_types.h b/include/linux/psi_types.h
index f1fd3a8044e0eca4fcea4487243bd6b0e438aaf4..bd2a28224910c78684fcd13de97b22e2abd5fc97 100644
--- a/include/linux/psi_types.h
+++ b/include/linux/psi_types.h
@@ -207,6 +207,66 @@ struct psi_group {
 	u64 rtpoll_until;
 };
 
+#ifdef CONFIG_PSI_FINE_GRAINED
+
+enum psi_stat_states {
+	PSI_MEMCG_RECLAIM_SOME,
+	PSI_MEMCG_RECLAIM_FULL,
+	PSI_GLOBAL_RECLAIM_SOME,
+	PSI_GLOBAL_RECLAIM_FULL,
+	PSI_COMPACT_SOME,
+	PSI_COMPACT_FULL,
+	PSI_ASYNC_MEMCG_RECLAIM_SOME,
+	PSI_ASYNC_MEMCG_RECLAIM_FULL,
+	PSI_SWAP_SOME,
+	PSI_SWAP_FULL,
+	PSI_CPU_CFS_BANDWIDTH_FULL,
+#ifdef CONFIG_QOS_SCHED
+	PSI_CPU_QOS_FULL,
+#endif
+	NR_PSI_STAT_STATES,
+};
+
+enum psi_stat_task_count {
+	NR_MEMCG_RECLAIM,
+	NR_MEMCG_RECLAIM_RUNNING,
+	NR_GLOBAL_RECLAIM,
+	NR_GLOBAL_RECLAIM_RUNNING,
+	NR_COMPACT,
+	NR_COMPACT_RUNNING,
+	NR_ASYNC_MEMCG_RECLAIM,
+	NR_ASYNC_MEMCG_RECLAIM_RUNNING,
+	NR_SWAP,
+	NR_SWAP_RUNNING,
+	NR_PSI_STAT_TASK_COUNTS,
+};
+
+#define CPU_CFS_BANDWIDTH	1
+
+struct psi_group_stat_cpu {
+	u32 state_mask;
+	u32 times[NR_PSI_STAT_STATES];
+	u32 psi_delta;
+	unsigned int tasks[NR_PSI_STAT_TASK_COUNTS];
+	u32 times_delta;
+	u32 times_prev[NR_PSI_AGGREGATORS][NR_PSI_STAT_STATES];
+	int prev_throttle;
+	int cur_throttle;
+};
+
+struct psi_group_ext {
+	struct psi_group psi;
+	struct psi_group_stat_cpu __percpu *pcpu;
+	/* Running fine grained pressure averages */
+	u64 avg_total[NR_PSI_STAT_STATES];
+	/* Total fine grained stall times and sampled pressure averages */
+	u64 total[NR_PSI_AGGREGATORS][NR_PSI_STAT_STATES];
+	unsigned long avg[NR_PSI_STAT_STATES][3];
+};
+#else
+struct psi_group_ext { };
+#endif /* CONFIG_PSI_FINE_GRAINED */
+
 #else /* CONFIG_PSI */
 
 #define NR_PSI_RESOURCES	0
@@ -215,4 +275,21 @@ struct psi_group { };
 
 #endif /* CONFIG_PSI */
 
+/*
+ * One memstall type needs two task counters: one for stalled tasks and
+ * one for stalled tasks that are still running, for the same reason as
+ * NR_MEMSTALL_RUNNING. Because psi_memstall_type starts at 1, it maps
+ * onto psi_stat_task_count as follows:
+ *
+ *	memstall : psi_memstall_type * 2 - 2;
+ *	running  : psi_memstall_type * 2 - 1;
+ */
+enum psi_memstall_type {
+	PSI_MEMCG_RECLAIM = 1,
+	PSI_GLOBAL_RECLAIM,
+	PSI_COMPACT,
+	PSI_ASYNC_MEMCG_RECLAIM,
+	PSI_SWAP,
+};
+
 #endif /* _LINUX_PSI_TYPES_H */
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 3520e3fbaa916670190eea018a4a6a01f78d5010..b7014cd0122fc5880bd4077d33085c441069ac5b 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1537,6 +1537,7 @@ struct task_struct {
 	struct user_event_mm		*user_event_mm;
 #endif
 
+	int				memstall_type;
 	/*
 	 * New fields for task_struct should be added above here, so that
 	 * they are included in the randomized portion of task_struct.
diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h
index fbb99a61f714cbebb91ba9280ce44f812ece32de..0e8e7bd5cb9f449982605c836017a397b24ed0b0 100644
--- a/include/trace/events/sched.h
+++ b/include/trace/events/sched.h
@@ -735,6 +735,33 @@ DECLARE_TRACE(sched_update_nr_running_tp,
 	TP_PROTO(struct rq *rq, int change),
 	TP_ARGS(rq, change));
 
+DECLARE_EVENT_CLASS(psi_memstall_template,
+
+	TP_PROTO(unsigned long function),
+
+	TP_ARGS(function),
+
+	TP_STRUCT__entry(
+		__field(unsigned long, function)
+	),
+
+	TP_fast_assign(
+		__entry->function = function;
+	),
+
+	TP_printk("%ps", (void *)__entry->function)
+);
+
+DEFINE_EVENT(psi_memstall_template, psi_memstall_enter,
+	TP_PROTO(unsigned long function),
+	TP_ARGS(function)
+);
+
+DEFINE_EVENT(psi_memstall_template, psi_memstall_leave,
+	TP_PROTO(unsigned long function),
+	TP_ARGS(function)
+);
+
 #endif /* _TRACE_SCHED_H */
 
 /* This part must be outside protection */
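The index arithmetic in the psi_memstall_type comment above can be sanity-checked in isolation. A minimal userspace sketch; the enum values are copied from this patch, while the helper names are illustrative only and not part of the kernel API:

#include <assert.h>
#include <stdio.h>

/* Mirrors enum psi_memstall_type from include/linux/psi_types.h above. */
enum psi_memstall_type {
	PSI_MEMCG_RECLAIM = 1,
	PSI_GLOBAL_RECLAIM,
	PSI_COMPACT,
	PSI_ASYNC_MEMCG_RECLAIM,
	PSI_SWAP,
};

/* Index of the "stalled" counter in psi_stat_task_count. */
static int memstall_idx(int type) { return type * 2 - 2; }
/* Index of the "stalled but still running" counter. */
static int running_idx(int type) { return type * 2 - 1; }

int main(void)
{
	/*
	 * PSI_GLOBAL_RECLAIM (2) maps to NR_GLOBAL_RECLAIM (2) and
	 * NR_GLOBAL_RECLAIM_RUNNING (3) in enum psi_stat_task_count.
	 */
	assert(memstall_idx(PSI_GLOBAL_RECLAIM) == 2);
	assert(running_idx(PSI_GLOBAL_RECLAIM) == 3);
	/* PSI_SWAP (5) maps to NR_SWAP (8) / NR_SWAP_RUNNING (9). */
	assert(memstall_idx(PSI_SWAP) == 8);
	assert(running_idx(PSI_SWAP) == 9);
	printf("mapping consistent\n");
	return 0;
}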
diff --git a/init/Kconfig b/init/Kconfig
index 2ee1384c4f81e09a3f66d3898aa685d91f94f711..91b3a2c2cea5c82f4183e199fbd6a6b27a25d570 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -667,6 +667,27 @@ config PSI_DEFAULT_DISABLED
 
 	  Say N if unsure.
 
+config PSI_CGROUP_V1
+	bool "Support PSI under cgroup v1"
+	default n
+	depends on PSI
+	help
+	  If set, pressure stall information tracking will be used
+	  for cgroup v1 rather than v2.
+
+	  Say N if unsure.
+
+config PSI_FINE_GRAINED
+	bool "Support fine grained psi under cgroup v1 and system"
+	default n
+	depends on PSI
+	help
+	  If set, fine grained pressure stall information tracking will
+	  be used for cgroup v1 and for the system, covering stall sources
+	  such as memory reclaim and memory compaction.
+
+	  Say N if unsure.
+
 endmenu # "CPU/Task time and stats accounting"
 
 config CPU_ISOLATION
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 41b16ce99f54d63c7b7c77edf3cd8e98ad49fba5..701e9c2b1d5df5584707d7da7896062e979a37cf 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -3902,6 +3902,58 @@ bool cgroup_psi_enabled(void)
 	return (cgroup_feature_disable_mask & (1 << OPT_FEATURE_PRESSURE)) == 0;
 }
 
+#ifdef CONFIG_PSI_CGROUP_V1
+#ifdef CONFIG_PSI_FINE_GRAINED
+static int cgroup_psi_stat_show(struct seq_file *seq, void *v)
+{
+	return psi_stat_show(seq, cgroup_psi(seq_css(seq)->cgroup));
+}
+#endif
+
+struct cftype cgroup_v1_psi_files[] = {
+	{
+		.name = "io.pressure",
+		.file_offset = offsetof(struct cgroup, psi_files[PSI_IO]),
+		.seq_show = cgroup_io_pressure_show,
+		.write = cgroup_io_pressure_write,
+		.poll = cgroup_pressure_poll,
+		.release = cgroup_pressure_release,
+	},
+	{
+		.name = "memory.pressure",
+		.file_offset = offsetof(struct cgroup, psi_files[PSI_MEM]),
+		.seq_show = cgroup_memory_pressure_show,
+		.write = cgroup_memory_pressure_write,
+		.poll = cgroup_pressure_poll,
+		.release = cgroup_pressure_release,
+	},
+	{
+		.name = "cpu.pressure",
+		.file_offset = offsetof(struct cgroup, psi_files[PSI_CPU]),
+		.seq_show = cgroup_cpu_pressure_show,
+		.write = cgroup_cpu_pressure_write,
+		.poll = cgroup_pressure_poll,
+		.release = cgroup_pressure_release,
+	},
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+	{
+		.name = "irq.pressure",
+		.file_offset = offsetof(struct cgroup, psi_files[PSI_IRQ]),
+		.seq_show = cgroup_irq_pressure_show,
+		.write = cgroup_irq_pressure_write,
+		.poll = cgroup_pressure_poll,
+		.release = cgroup_pressure_release,
+	},
+#endif
+#ifdef CONFIG_PSI_FINE_GRAINED
+	{
+		.name = "pressure.stat",
+		.seq_show = cgroup_psi_stat_show,
+	},
+#endif
+	{ }	/* terminate */
+};
+#endif
 #else /* CONFIG_PSI */
 bool cgroup_psi_enabled(void)
 {
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index a1c73dea1f778c4038fef05cab1875335bc3dd17..c377851e45fe014423693bf2190ee58eab15b87b 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -722,7 +722,8 @@ static void update_rq_clock_task(struct rq *rq, s64 delta)
 	rq->prev_irq_time += irq_delta;
 	delta -= irq_delta;
-	psi_account_irqtime(rq->curr, irq_delta);
+	if (irq_delta)
+		psi_account_irqtime(rq->curr, irq_delta);
 	delayacct_irq(rq->curr, irq_delta);
 #endif
 #ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c
index 0de9dda099496fddb804f6ebb5a7fa0f275a72e5..0fd1b207f133b9d92516e8c1760e933163807a49 100644
--- a/kernel/sched/cpuacct.c
+++ b/kernel/sched/cpuacct.c
@@ -361,3 +361,31 @@ struct cgroup_subsys cpuacct_cgrp_subsys = {
 	.legacy_cftypes	= files,
 	.early_init	= true,
 };
+
+#ifdef CONFIG_PSI_CGROUP_V1
+static bool psi_v1_enable;
+extern struct cftype cgroup_v1_psi_files[];
+
+static int __init setup_psi_v1(char *str)
+{
+	int ret;
+
+	ret = kstrtobool(str, &psi_v1_enable);
+	if (psi_v1_enable)
+		static_branch_disable(&psi_v1_disabled);
+
+	return ret == 0;
+}
+__setup("psi_v1=", setup_psi_v1);
+
+static int __init cgroup_v1_psi_init(void)
+{
+	if (!psi_v1_enable)
+		return 0;
+
+	return cgroup_add_legacy_cftypes(&cpuacct_cgrp_subsys,
+					 cgroup_v1_psi_files);
+}
+
+late_initcall_sync(cgroup_v1_psi_init);
+#endif
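Once the legacy cftypes above are registered, booting with CONFIG_PSI_CGROUP_V1=y and psi_v1=1 should expose the PSI files inside the cpuacct hierarchy. A minimal userspace reader sketch; the mount point and group name are assumptions that depend on how cgroup v1 is mounted locally:

#include <stdio.h>

/* Assumed path: group "mygroup" in a cpuacct v1 hierarchy mounted at
 * /sys/fs/cgroup/cpuacct; adjust both to the local setup. */
#define PRESSURE_FILE "/sys/fs/cgroup/cpuacct/mygroup/memory.pressure"

int main(void)
{
	char line[256];
	FILE *f = fopen(PRESSURE_FILE, "r");

	if (!f) {
		perror("fopen");
		return 1;
	}
	/* Expect the standard "some ..." / "full ..." PSI lines. */
	while (fgets(line, sizeof(line), f))
		fputs(line, stdout);
	fclose(f);
	return 0;
}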
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 0de55884f9dadb7fc976824b16c1ddfd054bf668..9245d35be87d174aa03884acb318b47743173fdb 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -131,12 +131,6 @@ int __weak arch_asym_cpu_priority(int cpu)
 
 #ifdef CONFIG_QOS_SCHED
 
-/*
- * To distinguish cfs bw, use QOS_THROTTLED mark cfs_rq->throttled
- * when qos throttled(and cfs bw throttle mark cfs_rq->throttled as 1).
- */
-#define QOS_THROTTLED	2
-
 static DEFINE_PER_CPU_SHARED_ALIGNED(struct list_head, qos_throttled_cfs_rq);
 static DEFINE_PER_CPU_SHARED_ALIGNED(struct hrtimer, qos_overload_timer);
 static DEFINE_PER_CPU(int, qos_cpu_overload);
diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
index 1d0f634725a6e337e6ef29a65435707aed1b7aa0..6b232145742cb0d3e6c68d30452cf770c28d28f5 100644
--- a/kernel/sched/psi.c
+++ b/kernel/sched/psi.c
@@ -137,9 +137,12 @@
  * sampling of the aggregate task states would be.
  */
 
+#include <trace/events/sched.h>
+
 static int psi_bug __read_mostly;
 
 DEFINE_STATIC_KEY_FALSE(psi_disabled);
+DEFINE_STATIC_KEY_TRUE(psi_v1_disabled);
 static DEFINE_STATIC_KEY_TRUE(psi_cgroups_enabled);
 
 #ifdef CONFIG_PSI_DEFAULT_DISABLED
@@ -172,6 +175,27 @@ struct psi_group psi_system = {
 	.pcpu = &system_group_pcpu,
 };
 
+#ifdef CONFIG_PSI_FINE_GRAINED
+/* System-level fine grained pressure and stall tracking */
+static DEFINE_PER_CPU(struct psi_group_stat_cpu, system_stat_group_pcpu);
+struct psi_group_ext psi_stat_system = {
+	.pcpu = &system_stat_group_pcpu,
+};
+
+struct psi_group_ext *to_psi_group_ext(struct psi_group *psi)
+{
+	if (psi == &psi_system)
+		return &psi_stat_system;
+	else
+		return container_of(psi, struct psi_group_ext, psi);
+}
+#else
+static inline struct psi_group_ext *to_psi_group_ext(struct psi_group *psi)
+{
+	return NULL;
+}
+#endif
+
 static void psi_avgs_work(struct work_struct *work);
 
 static void poll_timer_fn(struct timer_list *t);
@@ -246,6 +270,10 @@ static void get_recent_times(struct psi_group *group, int cpu,
 			     enum psi_aggregators aggregator, u32 *times,
 			     u32 *pchanged_states)
 {
+#ifdef CONFIG_PSI_FINE_GRAINED
+	struct psi_group_ext *psi_ext = to_psi_group_ext(group);
+	struct psi_group_stat_cpu *ext_groupc = per_cpu_ptr(psi_ext->pcpu, cpu);
+#endif
 	struct psi_group_cpu *groupc = per_cpu_ptr(group->pcpu, cpu);
 	int current_cpu = raw_smp_processor_id();
 	unsigned int tasks[NR_PSI_TASK_COUNTS];
@@ -290,6 +318,10 @@ static void get_recent_times(struct psi_group *group, int cpu,
 			*pchanged_states |= (1 << s);
 	}
 
+#ifdef CONFIG_PSI_FINE_GRAINED
+	ext_groupc->times_delta = now - state_start;
+#endif
+
 	/*
 	 * When collect_percpu_times() from the avgs_work, we don't want to
 	 * re-arm avgs_work when all CPUs are IDLE.
But the current CPU running @@ -333,10 +365,240 @@ static void calc_avgs(unsigned long avg[3], int missed_periods, avg[2] = calc_load(avg[2], EXP_300s, pct); } +#ifdef CONFIG_PSI_FINE_GRAINED + +static void record_stat_times(struct psi_group_ext *psi_ext, int cpu) +{ + struct psi_group_stat_cpu *ext_grpc = per_cpu_ptr(psi_ext->pcpu, cpu); + + u32 delta = ext_grpc->psi_delta; + + if (ext_grpc->state_mask & (1 << PSI_MEMCG_RECLAIM_SOME)) { + ext_grpc->times[PSI_MEMCG_RECLAIM_SOME] += delta; + if (ext_grpc->state_mask & (1 << PSI_MEMCG_RECLAIM_FULL)) + ext_grpc->times[PSI_MEMCG_RECLAIM_FULL] += delta; + } + if (ext_grpc->state_mask & (1 << PSI_GLOBAL_RECLAIM_SOME)) { + ext_grpc->times[PSI_GLOBAL_RECLAIM_SOME] += delta; + if (ext_grpc->state_mask & (1 << PSI_GLOBAL_RECLAIM_FULL)) + ext_grpc->times[PSI_GLOBAL_RECLAIM_FULL] += delta; + } + if (ext_grpc->state_mask & (1 << PSI_COMPACT_SOME)) { + ext_grpc->times[PSI_COMPACT_SOME] += delta; + if (ext_grpc->state_mask & (1 << PSI_COMPACT_FULL)) + ext_grpc->times[PSI_COMPACT_FULL] += delta; + } + if (ext_grpc->state_mask & (1 << PSI_ASYNC_MEMCG_RECLAIM_SOME)) { + ext_grpc->times[PSI_ASYNC_MEMCG_RECLAIM_SOME] += delta; + if (ext_grpc->state_mask & (1 << PSI_ASYNC_MEMCG_RECLAIM_FULL)) + ext_grpc->times[PSI_ASYNC_MEMCG_RECLAIM_FULL] += delta; + } + if (ext_grpc->state_mask & (1 << PSI_SWAP_SOME)) { + ext_grpc->times[PSI_SWAP_SOME] += delta; + if (ext_grpc->state_mask & (1 << PSI_SWAP_FULL)) + ext_grpc->times[PSI_SWAP_FULL] += delta; + } +} + +static bool test_fine_grained_stat(unsigned int *stat_tasks, + unsigned int nr_running, + enum psi_stat_states state) +{ + switch (state) { + case PSI_MEMCG_RECLAIM_SOME: + return unlikely(stat_tasks[NR_MEMCG_RECLAIM]); + case PSI_MEMCG_RECLAIM_FULL: + return unlikely(stat_tasks[NR_MEMCG_RECLAIM] && + nr_running == stat_tasks[NR_MEMCG_RECLAIM_RUNNING]); + case PSI_GLOBAL_RECLAIM_SOME: + return unlikely(stat_tasks[NR_GLOBAL_RECLAIM]); + case PSI_GLOBAL_RECLAIM_FULL: + return unlikely(stat_tasks[NR_GLOBAL_RECLAIM] && + nr_running == stat_tasks[NR_GLOBAL_RECLAIM_RUNNING]); + case PSI_COMPACT_SOME: + return unlikely(stat_tasks[NR_COMPACT]); + case PSI_COMPACT_FULL: + return unlikely(stat_tasks[NR_COMPACT] && + nr_running == stat_tasks[NR_COMPACT_RUNNING]); + case PSI_ASYNC_MEMCG_RECLAIM_SOME: + return unlikely(stat_tasks[NR_ASYNC_MEMCG_RECLAIM]); + case PSI_ASYNC_MEMCG_RECLAIM_FULL: + return unlikely(stat_tasks[NR_ASYNC_MEMCG_RECLAIM] && + nr_running == stat_tasks[NR_ASYNC_MEMCG_RECLAIM_RUNNING]); + case PSI_SWAP_SOME: + return unlikely(stat_tasks[NR_SWAP]); + case PSI_SWAP_FULL: + return unlikely(stat_tasks[NR_SWAP] && + nr_running == stat_tasks[NR_SWAP_RUNNING]); + default: + return false; + } +} + +static void psi_group_stat_change(struct psi_group *group, int cpu, + int clear, int set) +{ + int t; + u32 state_mask = 0; + enum psi_stat_states s; + struct psi_group_ext *psi_ext = to_psi_group_ext(group); + struct psi_group_cpu *groupc = per_cpu_ptr(group->pcpu, cpu); + struct psi_group_stat_cpu *ext_groupc = per_cpu_ptr(psi_ext->pcpu, cpu); + + write_seqcount_begin(&groupc->seq); + record_stat_times(psi_ext, cpu); + + for (t = 0; clear; clear &= ~(1 << t), t++) + if (clear & (1 << t)) + ext_groupc->tasks[t]--; + for (t = 0; set; set &= ~(1 << t), t++) + if (set & (1 << t)) + ext_groupc->tasks[t]++; + for (s = 0; s < PSI_CPU_CFS_BANDWIDTH_FULL; s++) + if (test_fine_grained_stat(ext_groupc->tasks, + groupc->tasks[NR_RUNNING], s)) + state_mask |= (1 << s); + if (unlikely(groupc->state_mask & PSI_ONCPU) && + 
cpu_curr(cpu)->memstall_type) + state_mask |= (1 << (cpu_curr(cpu)->memstall_type * 2 - 1)); + + ext_groupc->state_mask = state_mask; + write_seqcount_end(&groupc->seq); +} + +static void update_psi_stat_delta(struct psi_group *group, int cpu, u64 now) +{ + struct psi_group_ext *psi_ext = to_psi_group_ext(group); + struct psi_group_stat_cpu *ext_groupc = per_cpu_ptr(psi_ext->pcpu, cpu); + struct psi_group_cpu *groupc = per_cpu_ptr(group->pcpu, cpu); + + ext_groupc->psi_delta = now - groupc->state_start; +} + +static void psi_stat_flags_change(struct task_struct *task, int *stat_set, + int *stat_clear, int set, int clear) +{ + if (!task->memstall_type) + return; + + if (clear) { + if (clear & TSK_MEMSTALL) + *stat_clear |= 1 << (2 * task->memstall_type - 2); + if (clear & TSK_MEMSTALL_RUNNING) + *stat_clear |= 1 << (2 * task->memstall_type - 1); + } + if (set) { + if (set & TSK_MEMSTALL) + *stat_set |= 1 << (2 * task->memstall_type - 2); + if (set & TSK_MEMSTALL_RUNNING) + *stat_set |= 1 << (2 * task->memstall_type - 1); + } + if (!task->in_memstall) + task->memstall_type = 0; +} + +static void get_recent_stat_times(struct psi_group *group, int cpu, + enum psi_aggregators aggregator, u32 *times) +{ + struct psi_group_ext *psi_ext = to_psi_group_ext(group); + struct psi_group_stat_cpu *ext_groupc = per_cpu_ptr(psi_ext->pcpu, cpu); + enum psi_stat_states s; + u32 delta; + + memcpy(times, ext_groupc->times, sizeof(ext_groupc->times)); + for (s = 0; s < NR_PSI_STAT_STATES; s++) { + if (ext_groupc->state_mask & (1 << s)) + times[s] += ext_groupc->times_delta; + delta = times[s] - ext_groupc->times_prev[aggregator][s]; + ext_groupc->times_prev[aggregator][s] = times[s]; + times[s] = delta; + } +} + +static void update_stat_averages(struct psi_group_ext *psi_ext, + unsigned long missed_periods, u64 period) +{ + int s; + + for (s = 0; s < NR_PSI_STAT_STATES; s++) { + u32 sample; + + sample = psi_ext->total[PSI_AVGS][s] - psi_ext->avg_total[s]; + if (sample > period) + sample = period; + psi_ext->avg_total[s] += sample; + calc_avgs(psi_ext->avg[s], missed_periods, sample, period); + } +} +#else +static inline void psi_group_stat_change(struct psi_group *group, int cpu, + int clear, int set) {} +static inline void update_psi_stat_delta(struct psi_group *group, int cpu, + u64 now) {} +static inline void psi_stat_flags_change(struct task_struct *task, + int *stat_set, int *stat_clear, + int set, int clear) {} +static inline void record_stat_times(struct psi_group_ext *psi_ext, int cpu) {} +static inline void update_stat_averages(struct psi_group_ext *psi_ext, + unsigned long missed_periods, + u64 period) {} +#endif + +#if defined(CONFIG_CFS_BANDWIDTH) && defined(CONFIG_CGROUP_CPUACCT) && \ + defined(CONFIG_PSI_FINE_GRAINED) +static void record_cpu_stat_times(struct psi_group *group, int cpu) +{ + struct psi_group_ext *psi_ext = to_psi_group_ext(group); + struct psi_group_cpu *groupc = per_cpu_ptr(group->pcpu, cpu); + struct psi_group_stat_cpu *ext_groupc = per_cpu_ptr(psi_ext->pcpu, cpu); + u32 delta = ext_groupc->psi_delta; + + if (groupc->state_mask & (1 << PSI_CPU_FULL)) { + if (ext_groupc->prev_throttle == CPU_CFS_BANDWIDTH) + ext_groupc->times[PSI_CPU_CFS_BANDWIDTH_FULL] += delta; +#ifdef CONFIG_QOS_SCHED + else if (ext_groupc->prev_throttle == QOS_THROTTLED) + ext_groupc->times[PSI_CPU_QOS_FULL] += delta; +#endif + } +} + +static void update_throttle_type(struct task_struct *task, int cpu, bool next) +{ + struct cgroup *cpuacct_cgrp; + struct psi_group_ext *psi_ext; + struct psi_group_stat_cpu 
*groupc; + struct task_group *tsk_grp; + + if (!cgroup_subsys_on_dfl(cpuacct_cgrp_subsys)) { + rcu_read_lock(); + cpuacct_cgrp = task_cgroup(task, cpuacct_cgrp_id); + if (cgroup_parent(cpuacct_cgrp)) { + psi_ext = to_psi_group_ext(cgroup_psi(cpuacct_cgrp)); + groupc = per_cpu_ptr(psi_ext->pcpu, cpu); + tsk_grp = task_group(task); + if (next) + groupc->prev_throttle = groupc->cur_throttle; + groupc->cur_throttle = tsk_grp->cfs_rq[cpu]->throttled; + } + rcu_read_unlock(); + } +} +#else +static inline void record_cpu_stat_times(struct psi_group *group, int cpu) {} +static inline void update_throttle_type(struct task_struct *task, int cpu, + bool next) {} +#endif + static void collect_percpu_times(struct psi_group *group, enum psi_aggregators aggregator, u32 *pchanged_states) { +#ifdef CONFIG_PSI_FINE_GRAINED + u64 stat_delta[NR_PSI_STAT_STATES] = { 0 }; + u32 stat_times[NR_PSI_STAT_STATES] = { 0 }; + struct psi_group_ext *psi_ext = to_psi_group_ext(group); +#endif u64 deltas[NR_PSI_STATES - 1] = { 0, }; unsigned long nonidle_total = 0; u32 changed_states = 0; @@ -365,6 +627,11 @@ static void collect_percpu_times(struct psi_group *group, for (s = 0; s < PSI_NONIDLE; s++) deltas[s] += (u64)times[s] * nonidle; +#ifdef CONFIG_PSI_FINE_GRAINED + get_recent_stat_times(group, cpu, aggregator, stat_times); + for (s = 0; s < NR_PSI_STAT_STATES; s++) + stat_delta[s] += (u64)stat_times[s] * nonidle; +#endif } /* @@ -384,6 +651,12 @@ static void collect_percpu_times(struct psi_group *group, group->total[aggregator][s] += div_u64(deltas[s], max(nonidle_total, 1UL)); +#ifdef CONFIG_PSI_FINE_GRAINED + for (s = 0; s < NR_PSI_STAT_STATES; s++) + psi_ext->total[aggregator][s] += + div_u64(stat_delta[s], max(nonidle_total, 1UL)); +#endif + if (pchanged_states) *pchanged_states = changed_states; } @@ -509,6 +782,7 @@ static u64 update_triggers(struct psi_group *group, u64 now, bool *update_total, static u64 update_averages(struct psi_group *group, u64 now) { + struct psi_group_ext *psi_ext = to_psi_group_ext(group); unsigned long missed_periods = 0; u64 expires, period; u64 avg_next_update; @@ -557,6 +831,7 @@ static u64 update_averages(struct psi_group *group, u64 now) calc_avgs(group->avg[s], missed_periods, sample, period); } + update_stat_averages(psi_ext, missed_periods, period); return avg_next_update; } @@ -843,8 +1118,10 @@ static void psi_group_change(struct psi_group *group, int cpu, * may have already incorporated the live state into times_prev; * avoid a delta sample underflow when PSI is later re-enabled. 
 	 */
-	if (unlikely(groupc->state_mask & (1 << PSI_NONIDLE)))
+	if (unlikely(groupc->state_mask & (1 << PSI_NONIDLE))) {
 		record_times(groupc, now);
+		record_cpu_stat_times(group, cpu);
+	}
 
 	groupc->state_mask = state_mask;
 
@@ -869,6 +1146,7 @@ static void psi_group_change(struct psi_group *group, int cpu,
 		state_mask |= (1 << PSI_MEM_FULL);
 
 	record_times(groupc, now);
+	record_cpu_stat_times(group, cpu);
 
 	groupc->state_mask = state_mask;
 
@@ -884,8 +1162,25 @@ static void psi_group_change(struct psi_group *group, int cpu,
 static inline struct psi_group *task_psi_group(struct task_struct *task)
 {
 #ifdef CONFIG_CGROUPS
-	if (static_branch_likely(&psi_cgroups_enabled))
+	if (static_branch_likely(&psi_cgroups_enabled)) {
+#ifndef CONFIG_PSI_CGROUP_V1
 		return cgroup_psi(task_dfl_cgroup(task));
+#endif
+#ifdef CONFIG_CGROUP_CPUACCT
+		if (!cgroup_subsys_on_dfl(cpuacct_cgrp_subsys)) {
+			if (static_branch_likely(&psi_v1_disabled))
+				return &psi_system;
+			else {
+				struct cgroup *cgroup;
+
+				rcu_read_lock();
+				cgroup = task_cgroup(task, cpuacct_cgrp_id);
+				rcu_read_unlock();
+				return cgroup_psi(cgroup);
+			}
+		} else
+			return cgroup_psi(task_dfl_cgroup(task));
+#endif
+	}
 #endif
 	return &psi_system;
 }
@@ -910,17 +1205,22 @@ void psi_task_change(struct task_struct *task, int clear, int set)
 	int cpu = task_cpu(task);
 	struct psi_group *group;
 	u64 now;
+	int stat_set = 0;
+	int stat_clear = 0;
 
 	if (!task->pid)
 		return;
 
 	psi_flags_change(task, clear, set);
+	psi_stat_flags_change(task, &stat_set, &stat_clear, set, clear);
 
 	now = cpu_clock(cpu);
 
 	group = task_psi_group(task);
 	do {
+		update_psi_stat_delta(group, cpu, now);
 		psi_group_change(group, cpu, clear, set, now, true);
+		psi_group_stat_change(group, cpu, stat_clear, stat_set);
 	} while ((group = group->parent));
 }
 
@@ -932,6 +1232,7 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next,
 	u64 now = cpu_clock(cpu);
 
 	if (next->pid) {
+		update_throttle_type(next, cpu, true);
 		psi_flags_change(next, 0, TSK_ONCPU);
 		/*
 		 * Set TSK_ONCPU on @next's cgroups. If @next shares any
@@ -946,14 +1247,20 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next,
 				break;
 			}
 
+			update_psi_stat_delta(group, cpu, now);
 			psi_group_change(group, cpu, 0, TSK_ONCPU, now, true);
+			psi_group_stat_change(group, cpu, 0, 0);
 		} while ((group = group->parent));
 	}
 
 	if (prev->pid) {
 		int clear = TSK_ONCPU, set = 0;
 		bool wake_clock = true;
+		int stat_set = 0;
+		int stat_clear = 0;
+		bool memstall_type_change = false;
 
+		update_throttle_type(prev, cpu, false);
 		/*
 		 * When we're going to sleep, psi_dequeue() lets us
 		 * handle TSK_RUNNING, TSK_MEMSTALL_RUNNING and
@@ -979,24 +1286,36 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next,
 		}
 
 		psi_flags_change(prev, clear, set);
+		psi_stat_flags_change(prev, &stat_set, &stat_clear, set, clear);
 
 		group = task_psi_group(prev);
 		do {
 			if (group == common)
 				break;
 
+			update_psi_stat_delta(group, cpu, now);
 			psi_group_change(group, cpu, clear, set, now, wake_clock);
+			psi_group_stat_change(group, cpu, stat_clear, stat_set);
 		} while ((group = group->parent));
 
+#ifdef CONFIG_PSI_FINE_GRAINED
+		if (next->memstall_type != prev->memstall_type)
+			memstall_type_change = true;
+#endif
+
 		/*
 		 * TSK_ONCPU is handled up to the common ancestor. If there are
 		 * any other differences between the two tasks (e.g. prev goes
 		 * to sleep, or only one task is memstall), finish propagating
 		 * those differences all the way up to the root.
 		 */
-		if ((prev->psi_flags ^ next->psi_flags) & ~TSK_ONCPU) {
+		if ((prev->psi_flags ^ next->psi_flags) & ~TSK_ONCPU ||
+		    memstall_type_change) {
 			clear &= ~TSK_ONCPU;
-			for (; group; group = group->parent)
+			for (; group; group = group->parent) {
+				update_psi_stat_delta(group, cpu, now);
 				psi_group_change(group, cpu, clear, set, now, wake_clock);
+				psi_group_stat_change(group, cpu, stat_clear, stat_set);
+			}
 		}
 	}
 }
@@ -1009,6 +1328,9 @@ void psi_account_irqtime(struct task_struct *task, u32 delta)
 	struct psi_group_cpu *groupc;
 	u64 now;
 
+	if (static_branch_likely(&psi_disabled))
+		return;
+
 	if (!task->pid)
 		return;
 
@@ -1023,7 +1345,10 @@ void psi_account_irqtime(struct task_struct *task, u32 delta)
 
 	write_seqcount_begin(&groupc->seq);
 
+	update_psi_stat_delta(group, cpu, now);
+	record_stat_times(to_psi_group_ext(group), cpu);
 	record_times(groupc, now);
+	record_cpu_stat_times(group, cpu);
 	groupc->times[PSI_IRQ_FULL] += delta;
 
 	write_seqcount_end(&groupc->seq);
@@ -1045,6 +1370,9 @@ void psi_memstall_enter(unsigned long *flags)
 {
 	struct rq_flags rf;
 	struct rq *rq;
+#ifdef CONFIG_PSI_FINE_GRAINED
+	unsigned long stat_flags = *flags;
+#endif
 
 	if (static_branch_likely(&psi_disabled))
 		return;
@@ -1052,6 +1380,8 @@ void psi_memstall_enter(unsigned long *flags)
 	*flags = current->in_memstall;
 	if (*flags)
 		return;
+
+	trace_psi_memstall_enter(_RET_IP_);
 	/*
 	 * in_memstall setting & accounting needs to be atomic wrt
 	 * changes to the task's scheduling state, otherwise we can
@@ -1060,6 +1390,10 @@ void psi_memstall_enter(unsigned long *flags)
 	rq = this_rq_lock_irq(&rf);
 
 	current->in_memstall = 1;
+#ifdef CONFIG_PSI_FINE_GRAINED
+	if (stat_flags)
+		current->memstall_type = stat_flags;
+#endif
 	psi_task_change(current, 0, TSK_MEMSTALL | TSK_MEMSTALL_RUNNING);
 
 	rq_unlock_irq(rq, &rf);
@@ -1082,6 +1416,9 @@ void psi_memstall_leave(unsigned long *flags)
 
 	if (*flags)
 		return;
+
+	trace_psi_memstall_leave(_RET_IP_);
+
 	/*
 	 * in_memstall clearing & accounting needs to be atomic wrt
 	 * changes to the task's scheduling state, otherwise we could
@@ -1099,16 +1436,36 @@ EXPORT_SYMBOL_GPL(psi_memstall_leave);
 #ifdef CONFIG_CGROUPS
 int psi_cgroup_alloc(struct cgroup *cgroup)
 {
+#ifdef CONFIG_PSI_FINE_GRAINED
+	struct psi_group_ext *psi_ext;
+#endif
+
 	if (!static_branch_likely(&psi_cgroups_enabled))
 		return 0;
 
+#ifdef CONFIG_PSI_FINE_GRAINED
+	psi_ext = kzalloc(sizeof(struct psi_group_ext), GFP_KERNEL);
+	if (!psi_ext)
+		return -ENOMEM;
+	psi_ext->pcpu = alloc_percpu(struct psi_group_stat_cpu);
+	if (!psi_ext->pcpu) {
+		kfree(psi_ext);
+		return -ENOMEM;
+	}
+	cgroup->psi = &psi_ext->psi;
+#else
 	cgroup->psi = kzalloc(sizeof(struct psi_group), GFP_KERNEL);
 	if (!cgroup->psi)
 		return -ENOMEM;
-
+#endif
 	cgroup->psi->pcpu = alloc_percpu(struct psi_group_cpu);
 	if (!cgroup->psi->pcpu) {
+#ifdef CONFIG_PSI_FINE_GRAINED
+		free_percpu(psi_ext->pcpu);
+		kfree(psi_ext);
+#else
 		kfree(cgroup->psi);
+#endif
 		return -ENOMEM;
 	}
 	group_init(cgroup->psi);
@@ -1125,7 +1482,12 @@ void psi_cgroup_free(struct cgroup *cgroup)
 	free_percpu(cgroup->psi->pcpu);
 	/* All triggers must be removed by now */
 	WARN_ONCE(cgroup->psi->rtpoll_states, "psi: trigger leak\n");
+#ifdef CONFIG_PSI_FINE_GRAINED
+	free_percpu(to_psi_group_ext(cgroup->psi)->pcpu);
+	kfree(to_psi_group_ext(cgroup->psi));
+#else
 	kfree(cgroup->psi);
+#endif
 }
 
 /**
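The /proc/pressure/stat interface added in the next hunk emits, for each fine-grained state, a name line followed by "some"/"full" lines in the usual PSI avg10/avg60/avg300/total format (the two CPU throttling states print only a "full" line). A small reader sketch; the format is inferred from psi_stat_show() below and assumes CONFIG_PSI_FINE_GRAINED=y:

#include <stdio.h>
#include <string.h>

int main(void)
{
	char name[128] = "?", line[256];
	FILE *f = fopen("/proc/pressure/stat", "r");

	if (!f) {
		perror("fopen");
		return 1;
	}
	while (fgets(line, sizeof(line), f)) {
		/* Lines not starting with some/full carry the state name. */
		if (strncmp(line, "some", 4) != 0 &&
		    strncmp(line, "full", 4) != 0) {
			line[strcspn(line, "\n")] = '\0';
			snprintf(name, sizeof(name), "%s", line);
			continue;
		}
		if (strncmp(line, "full", 4) == 0)
			printf("%s: %s", name, line);
	}
	fclose(f);
	return 0;
}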
"compact", + "cgroup_async_memory_reclaim", + "swap", + "cpu_cfs_bandwidth", + "cpu_qos", +}; + +static void get_stat_names(struct seq_file *m, int i, bool is_full) +{ + if (i <= PSI_SWAP_FULL && !is_full) + return seq_printf(m, "%s\n", psi_stat_names[i / 2]); + else if (i == PSI_CPU_CFS_BANDWIDTH_FULL) + return seq_printf(m, "%s\n", "cpu_cfs_bandwidth"); +#ifdef CONFIG_QOS_SCHED + else if (i == PSI_CPU_QOS_FULL) + return seq_printf(m, "%s\n", "cpu_qos"); +#endif +} + +int psi_stat_show(struct seq_file *m, struct psi_group *group) +{ + struct psi_group_ext *psi_ext; + unsigned long avg[3] = {0, }; + int i, w; + bool is_full; + u64 now, total; + + if (static_branch_likely(&psi_disabled)) + return -EOPNOTSUPP; + + psi_ext = to_psi_group_ext(group); + mutex_lock(&group->avgs_lock); + now = sched_clock(); + collect_percpu_times(group, PSI_AVGS, NULL); + if (now >= group->avg_next_update) + group->avg_next_update = update_averages(group, now); + mutex_unlock(&group->avgs_lock); + for (i = 0; i < NR_PSI_STAT_STATES; i++) { + is_full = i % 2 || i > PSI_SWAP_FULL; + for (w = 0; w < 3; w++) + avg[w] = psi_ext->avg[i][w]; + total = div_u64(psi_ext->total[PSI_AVGS][i], NSEC_PER_USEC); + get_stat_names(m, i, is_full); + seq_printf(m, "%s avg10=%lu.%02lu avg60=%lu.%02lu avg300=%lu.%02lu total=%llu\n", + is_full ? "full" : "some", + LOAD_INT(avg[0]), LOAD_FRAC(avg[0]), + LOAD_INT(avg[1]), LOAD_FRAC(avg[1]), + LOAD_INT(avg[2]), LOAD_FRAC(avg[2]), + total); + } + return 0; +} +static int system_psi_stat_show(struct seq_file *m, void *v) +{ + return psi_stat_show(m, &psi_system); +} + +static int psi_stat_open(struct inode *inode, struct file *file) +{ + return single_open(file, system_psi_stat_show, NULL); +} + +static const struct proc_ops psi_stat_proc_ops = { + .proc_open = psi_stat_open, + .proc_read = seq_read, + .proc_lseek = seq_lseek, + .proc_release = psi_fop_release, +}; +#endif + #ifdef CONFIG_IRQ_TIME_ACCOUNTING static int psi_irq_show(struct seq_file *m, void *v) { @@ -1656,6 +2092,9 @@ static int __init psi_proc_init(void) proc_create("pressure/cpu", 0666, NULL, &psi_cpu_proc_ops); #ifdef CONFIG_IRQ_TIME_ACCOUNTING proc_create("pressure/irq", 0666, NULL, &psi_irq_proc_ops); +#endif +#ifdef CONFIG_PSI_FINE_GRAINED + proc_create("pressure/stat", 0666, NULL, &psi_stat_proc_ops); #endif } return 0; diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h index 38f3698f5e5b31d35fe18c3c6c103a685475e64e..9546cbf02d55afb9e236d9111ac469224b369f08 100644 --- a/kernel/sched/stats.h +++ b/kernel/sched/stats.h @@ -106,6 +106,14 @@ __schedstats_from_se(struct sched_entity *se) return &task_of(se)->stats; } +#ifdef CONFIG_QOS_SCHED +/* + * To distinguish cfs bw, use QOS_THROTTLED mark cfs_rq->throttled + * when qos throttled(and cfs bw throttle mark cfs_rq->throttled as 1). 
+ */ +#define QOS_THROTTLED 2 +#endif + #ifdef CONFIG_PSI void psi_task_change(struct task_struct *task, int clear, int set); void psi_task_switch(struct task_struct *prev, struct task_struct *next, diff --git a/mm/compaction.c b/mm/compaction.c index 38c8d216c6a3bffd9d75fd430981558c66614750..771e9629b95c77e1e9868b40af873356f418e234 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -3061,7 +3061,7 @@ static int kcompactd(void *p) pgdat->kcompactd_highest_zoneidx = pgdat->nr_zones - 1; while (!kthread_should_stop()) { - unsigned long pflags; + unsigned long pflags = 0; /* * Avoid the unnecessary wakeup for proactive compaction diff --git a/mm/filemap.c b/mm/filemap.c index 1c398edcfcaf3823fb4d2df562d0b246e8b7f8bd..d0a2beabc68a0408a42bde01e581fff78efdd1eb 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1227,7 +1227,7 @@ static inline int folio_wait_bit_common(struct folio *folio, int bit_nr, struct wait_page_queue wait_page; wait_queue_entry_t *wait = &wait_page.wait; bool thrashing = false; - unsigned long pflags; + unsigned long pflags = 0; bool in_thrashing; if (bit_nr == PG_locked && @@ -1378,7 +1378,7 @@ void migration_entry_wait_on_locked(swp_entry_t entry, spinlock_t *ptl) struct wait_page_queue wait_page; wait_queue_entry_t *wait = &wait_page.wait; bool thrashing = false; - unsigned long pflags; + unsigned long pflags = 0; bool in_thrashing; wait_queue_head_t *q; struct folio *folio = page_folio(pfn_swap_entry_to_page(entry)); @@ -2366,7 +2366,7 @@ static int filemap_read_folio(struct file *file, filler_t filler, struct folio *folio) { bool workingset = folio_test_workingset(folio); - unsigned long pflags; + unsigned long pflags = 0; int error; /* diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 2d9a873e552244d8f210d189c4e1ae53254d8b4f..74e1fad14a2c27757cc207f0bc95f4f12d97d083 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2410,6 +2410,9 @@ static unsigned long reclaim_high(struct mem_cgroup *memcg, memcg_memory_event(memcg, MEMCG_HIGH); +#ifdef CONFIG_PSI_FINE_GRAINED + pflags = PSI_MEMCG_RECLAIM; +#endif psi_memstall_enter(&pflags); nr_reclaimed += try_to_free_mem_cgroup_pages(memcg, nr_pages, gfp_mask, @@ -2447,6 +2450,9 @@ static void async_reclaim_high(struct mem_cgroup *memcg) return; } +#ifdef CONFIG_PSI_FINE_GRAINED + pflags = PSI_ASYNC_MEMCG_RECLAIM; +#endif psi_memstall_enter(&pflags); nr_pages = memcg_usage > safe_pages ? memcg_usage - safe_pages : MEMCG_CHARGE_BATCH; @@ -2681,6 +2687,9 @@ void mem_cgroup_handle_over_high(gfp_t gfp_mask) * schedule_timeout_killable sets TASK_KILLABLE). This means we don't * need to account for any ill-begotten jiffies to pay them off later. 
*/ +#ifdef CONFIG_PSI_FINE_GRAINED + pflags = PSI_MEMCG_RECLAIM; +#endif psi_memstall_enter(&pflags); schedule_timeout_killable(penalty_jiffies); psi_memstall_leave(&pflags); @@ -2742,6 +2751,9 @@ static int try_charge_memcg(struct mem_cgroup *memcg, gfp_t gfp_mask, memcg_memory_event(mem_over_limit, MEMCG_MAX); raised_max_event = true; +#ifdef CONFIG_PSI_FINE_GRAINED + pflags = PSI_MEMCG_RECLAIM; +#endif psi_memstall_enter(&pflags); nr_reclaimed = try_to_free_mem_cgroup_pages(mem_over_limit, nr_pages, gfp_mask, reclaim_options); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index f5b61c1060d19b8b2c046e0ac619644b0f435a17..798a9ec645c04d78d5533100bde7fc0659d5fe26 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3518,6 +3518,9 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, if (!order) return NULL; +#ifdef CONFIG_PSI_FINE_GRAINED + pflags = PSI_COMPACT; +#endif psi_memstall_enter(&pflags); delayacct_compact_start(); noreclaim_flag = memalloc_noreclaim_save(); @@ -3787,6 +3790,9 @@ __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order, unsigned long pflags; bool drained = false; +#ifdef CONFIG_PSI_FINE_GRAINED + pflags = PSI_GLOBAL_RECLAIM; +#endif psi_memstall_enter(&pflags); *did_some_progress = __perform_reclaim(gfp_mask, order, ac); if (unlikely(!(*did_some_progress))) diff --git a/mm/page_io.c b/mm/page_io.c index fe4c21af23f269a6bdc913e967f855007f8ccada..95c3616b5db34bde272a487e4d4f2b8d8d1d873e 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -509,6 +509,9 @@ void swap_readpage(struct page *page, bool synchronous, struct swap_iocb **plug) */ if (workingset) { delayacct_thrashing_start(&in_thrashing); +#ifdef CONFIG_PSI_FINE_GRAINED + pflags = PSI_SWAP; +#endif psi_memstall_enter(&pflags); } delayacct_swapin_start(); diff --git a/mm/readahead.c b/mm/readahead.c index 6925e6959fd3fff76cdca6b01c348f3daade9c02..e09919547c3bf096a2aeb7ad70c7f506e541de22 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -152,8 +152,12 @@ static void read_pages(struct readahead_control *rac) if (!readahead_count(rac)) return; - if (unlikely(rac->_workingset)) + if (unlikely(rac->_workingset)) { +#ifdef CONFIG_PSI_FINE_GRAINED + rac->_pflags = 0; +#endif psi_memstall_enter(&rac->_pflags); + } blk_start_plug(&plug); if (aops->readahead) { @@ -803,6 +807,9 @@ void readahead_expand(struct readahead_control *ractl, if (unlikely(folio_test_workingset(folio)) && !ractl->_workingset) { ractl->_workingset = true; +#ifdef CONFIG_PSI_FINE_GRAINED + ractl->_pflags = 0; +#endif psi_memstall_enter(&ractl->_pflags); } ractl->_nr_pages++; @@ -830,6 +837,9 @@ void readahead_expand(struct readahead_control *ractl, if (unlikely(folio_test_workingset(folio)) && !ractl->_workingset) { ractl->_workingset = true; +#ifdef CONFIG_PSI_FINE_GRAINED + ractl->_pflags = 0; +#endif psi_memstall_enter(&ractl->_pflags); } ractl->_nr_pages++; diff --git a/mm/vmscan.c b/mm/vmscan.c index 6f13394b112eaea798ca50ff97fe5efa52747a3e..4d753ce139017158d729e2d22b24d9ed299764fa 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -7393,7 +7393,7 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int highest_zoneidx) int i; unsigned long nr_soft_reclaimed; unsigned long nr_soft_scanned; - unsigned long pflags; + unsigned long pflags = 0; unsigned long nr_boost_reclaim; unsigned long zone_boosts[MAX_NR_ZONES] = { 0, }; bool boosted; @@ -8025,6 +8025,9 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in sc.gfp_mask); cond_resched(); +#ifdef CONFIG_PSI_FINE_GRAINED + pflags = 
PSI_GLOBAL_RECLAIM; +#endif psi_memstall_enter(&pflags); fs_reclaim_acquire(sc.gfp_mask); /*
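For reference, every annotated call site in the mm/ hunks above follows the same pattern: stash the psi_memstall_type value in pflags under CONFIG_PSI_FINE_GRAINED before psi_memstall_enter(), which records it in current->memstall_type. A sketch of a hypothetical new call site, assuming kernel context and <linux/psi.h>:

/* Hypothetical call site; mirrors the pattern used in the mm/ hunks. */
static void example_stall_section(void)
{
	unsigned long pflags = 0;

#ifdef CONFIG_PSI_FINE_GRAINED
	/* Which fine-grained counter this stall should be charged to. */
	pflags = PSI_GLOBAL_RECLAIM;
#endif
	psi_memstall_enter(&pflags);	/* pflags is rewritten to the old in_memstall */

	/* ... work that may stall on memory ... */

	psi_memstall_leave(&pflags);
}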