diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst
index 394fb18c9e3dc1cb9ebdaddf77dd8a6ad9395068..20c35c289253b3d1348e02b009a185cffbe8f8a7 100644
--- a/Documentation/admin-guide/cgroup-v2.rst
+++ b/Documentation/admin-guide/cgroup-v2.rst
@@ -951,6 +951,12 @@ All cgroup core files are prefixed with "cgroup."
	it's possible to delete a frozen (and empty) cgroup, as well as
	create new sub-cgroups.
 
+  irq.pressure
+	A read-write nested-keyed file.
+
+	Shows pressure stall information for IRQ/SOFTIRQ. See
+	:ref:`Documentation/accounting/psi.rst <psi>` for details.
+
 Controllers
 ===========
 
diff --git a/arch/arm64/configs/openeuler_defconfig b/arch/arm64/configs/openeuler_defconfig
index 34e593350ee775389bc345e84aacf7f363dd2460..7f3daf89bb8edc220cc82a4c19150342d0c01f23 100644
--- a/arch/arm64/configs/openeuler_defconfig
+++ b/arch/arm64/configs/openeuler_defconfig
@@ -104,6 +104,8 @@ CONFIG_TASK_XACCT=y
 CONFIG_TASK_IO_ACCOUNTING=y
 CONFIG_PSI=y
 CONFIG_PSI_DEFAULT_DISABLED=y
+CONFIG_PSI_CGROUP_V1=y
+CONFIG_PSI_FINE_GRAINED=y
 # end of CPU/Task time and stats accounting
 
 CONFIG_CPU_ISOLATION=y
diff --git a/arch/x86/configs/openeuler_defconfig b/arch/x86/configs/openeuler_defconfig
index 82e806651b996b4c117eb2b0e226b1ac928b3f2b..28920c2ed40b45529b0f3f37746420c24af841ec 100644
--- a/arch/x86/configs/openeuler_defconfig
+++ b/arch/x86/configs/openeuler_defconfig
@@ -108,6 +108,8 @@ CONFIG_TASK_XACCT=y
 CONFIG_TASK_IO_ACCOUNTING=y
 CONFIG_PSI=y
 CONFIG_PSI_DEFAULT_DISABLED=y
+CONFIG_PSI_CGROUP_V1=y
+CONFIG_PSI_FINE_GRAINED=y
 # end of CPU/Task time and stats accounting
 
 CONFIG_CPU_ISOLATION=y
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 92ce202bd8e5fbdc31d4cd108d43793df49cc5c8..1f2c93e9daa11ce08f1b78726b0567d8da3e63f1 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -1689,7 +1689,7 @@ static void blkcg_scale_delay(struct blkcg_gq *blkg, u64 now)
  */
 static void blkcg_maybe_throttle_blkg(struct blkcg_gq *blkg, bool use_memdelay)
 {
-	unsigned long pflags;
+	unsigned long pflags = 0;
	bool clamp;
	u64 now = ktime_to_ns(ktime_get());
	u64 exp;
diff --git a/block/blk-core.c b/block/blk-core.c
index 01f0782668ce76a3886421d1a251b923d745548b..71d60ec24a8a8fb40fd130ef49f3db0e0c6cdefa 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1116,7 +1116,7 @@ blk_qc_t submit_bio(struct bio *bio)
	 */
	if (unlikely(bio_op(bio) == REQ_OP_READ &&
		     bio_flagged(bio, BIO_WORKINGSET))) {
-		unsigned long pflags;
+		unsigned long pflags = 0;
		blk_qc_t ret;
 
		psi_memstall_enter(&pflags);
diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
index 47263cecb12f4c099829c162ef6d28cda488d943..09f2d58d119b168555d1d83082b1f56c7b9dfad2 100644
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -492,9 +492,9 @@ struct cgroup {
	/*
	 * It is accessed only the cgroup core code and so changes made to
	 * the cgroup structure should not affect third-party kernel modules.
+	 * The psi member below is now unused; it is kept only to preserve
+	 * the KABI layout.
	 */
-	struct psi_group psi;
-
+	KABI_DEPRECATE(struct psi_group, psi)
	/* used to store eBPF programs */
	struct cgroup_bpf bpf;
@@ -504,7 +504,7 @@ struct cgroup {
	/* Used to store internal freezer state */
	struct cgroup_freezer_state freezer;
 
-	KABI_RESERVE(1)
+	KABI_USE(1, struct psi_group *psi)
	KABI_RESERVE(2)
	KABI_RESERVE(3)
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index e706ff15ec883e0ff4f7a4346e08bdd9f3720eb6..5b8089c6b3207e56e7802b7918f9c1fb25ff56de 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -675,7 +675,7 @@ static inline void pr_cont_cgroup_path(struct cgroup *cgrp)
 
 static inline struct psi_group *cgroup_psi(struct cgroup *cgrp)
 {
-	return &cgrp->psi;
+	return cgrp->psi;
 }
 
 static inline void cgroup_init_kthreadd(void)
diff --git a/include/linux/psi.h b/include/linux/psi.h
index 86635a5630bab6265cf38bbd0eb1fe5798acfb1c..55bb63a4fd6530526fd08c784b84052dcdbc5d38 100644
--- a/include/linux/psi.h
+++ b/include/linux/psi.h
@@ -14,13 +14,16 @@ struct css_set;
 
 extern struct static_key_false psi_disabled;
 extern struct psi_group psi_system;
-extern struct static_key_false psi_v1_disabled;
+#ifdef CONFIG_PSI_CGROUP_V1
+extern struct static_key_true psi_v1_disabled;
+#endif
 
 void psi_init(void);
 
 void psi_task_change(struct task_struct *task, int clear, int set);
 void psi_task_switch(struct task_struct *prev, struct task_struct *next,
		     bool sleep);
+void psi_account_irqtime(struct task_struct *task, u32 delta);
 
 void psi_memstall_enter(unsigned long *flags);
 void psi_memstall_leave(unsigned long *flags);
@@ -34,6 +37,10 @@ void psi_trigger_destroy(struct psi_trigger *t);
 __poll_t psi_trigger_poll(void **trigger_ptr, struct file *file,
			  poll_table *wait);
 
+#ifdef CONFIG_PSI_FINE_GRAINED
+int psi_stat_show(struct seq_file *s, struct psi_group *group);
+#endif
+
 #ifdef CONFIG_CGROUPS
 int psi_cgroup_alloc(struct cgroup *cgrp);
 void psi_cgroup_free(struct cgroup *cgrp);
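NOTE (review annotation, not part of the patch): the psi.h hunk above exports
psi_account_irqtime() and, under CONFIG_PSI_FINE_GRAINED, psi_stat_show().
For readers who want to poke at the result, a minimal userspace sketch that
dumps the new IRQ pressure file follows. The file path and the single "full"
output line match psi_proc_init() and psi_show() later in this patch; the
buffer size and error handling are illustrative only.

/* Sketch: dump /proc/pressure/irq; IRQ pressure reports only "full". */
#include <stdio.h>

int main(void)
{
	char line[256];
	FILE *f = fopen("/proc/pressure/irq", "r");

	if (!f) {
		perror("fopen");	/* e.g. CONFIG_IRQ_TIME_ACCOUNTING=n */
		return 1;
	}
	while (fgets(line, sizeof(line), f))
		fputs(line, stdout);	/* "full avg10=... avg60=... total=..." */
	fclose(f);
	return 0;
}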
diff --git a/include/linux/psi_types.h b/include/linux/psi_types.h
index 0b6e17e7f84f050a6f3bdb5a51ad35bd9a1c3a02..bdefb0b1cd80beb9ba5cec168905bf962f947d27 100644
--- a/include/linux/psi_types.h
+++ b/include/linux/psi_types.h
@@ -36,13 +36,6 @@ enum psi_task_count {
	NR_IOWAIT,
	NR_MEMSTALL,
	NR_RUNNING,
-	/*
-	 * This can't have values other than 0 or 1 and could be
-	 * implemented as a bit flag. But for now we still have room
-	 * in the first cacheline of psi_group_cpu, and this way we
-	 * don't have to special case any state tracking for it.
-	 */
-	NR_ONCPU,
	/*
	 * For IO and CPU stalls the presence of running/oncpu tasks
	 * in the domain means a partial rather than a full stall.
@@ -53,7 +46,7 @@ enum psi_task_count {
	 * threads and memstall ones.
	 */
	NR_MEMSTALL_RUNNING,
-	NR_PSI_TASK_COUNTS = 5,
+	NR_PSI_TASK_COUNTS = 4,
 };
 #endif
 
@@ -61,15 +54,19 @@ enum psi_task_count {
 #define TSK_IOWAIT	(1 << NR_IOWAIT)
 #define TSK_MEMSTALL	(1 << NR_MEMSTALL)
 #define TSK_RUNNING	(1 << NR_RUNNING)
-#define TSK_ONCPU	(1 << NR_ONCPU)
 #define TSK_MEMSTALL_RUNNING	(1 << NR_MEMSTALL_RUNNING)
+/* Only one task can be on the CPU at a time; it needs no task count */
+#define TSK_ONCPU	(1 << NR_PSI_TASK_COUNTS)
 
 /* Resources that workloads could be stalled on */
 enum psi_res {
	PSI_IO,
	PSI_MEM,
	PSI_CPU,
-	NR_PSI_RESOURCES = 3,
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+	PSI_IRQ,
+#endif
+	NR_PSI_RESOURCES,
 };
 
 /*
@@ -104,12 +101,17 @@ enum psi_states {
	PSI_MEM_FULL,
	PSI_CPU_SOME,
	PSI_CPU_FULL,
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+	PSI_IRQ_FULL,
+#endif
	/* Only per-CPU, to weigh the CPU in the global average: */
	PSI_NONIDLE,
-	NR_PSI_STATES = 7,
+	NR_PSI_STATES,
 };
 #endif
 
+/* Use one bit in the state mask to track TSK_ONCPU */
+#define PSI_ONCPU	(1 << NR_PSI_STATES)
+
 enum psi_aggregators {
	PSI_AVGS = 0,
@@ -229,10 +231,85 @@ struct psi_group {
	u64 polling_until;
 };
 
+#ifdef CONFIG_PSI_FINE_GRAINED
+
+enum psi_stat_states {
+	PSI_MEMCG_RECLAIM_SOME,
+	PSI_MEMCG_RECLAIM_FULL,
+	PSI_GLOBAL_RECLAIM_SOME,
+	PSI_GLOBAL_RECLAIM_FULL,
+	PSI_COMPACT_SOME,
+	PSI_COMPACT_FULL,
+	PSI_ASYNC_MEMCG_RECLAIM_SOME,
+	PSI_ASYNC_MEMCG_RECLAIM_FULL,
+	PSI_SWAP_SOME,
+	PSI_SWAP_FULL,
+	PSI_CPU_CFS_BANDWIDTH_FULL,
+#ifdef CONFIG_QOS_SCHED
+	PSI_CPU_QOS_FULL,
+#endif
+	NR_PSI_STAT_STATES,
+};
+
+enum psi_stat_task_count {
+	NR_MEMCG_RECLAIM,
+	NR_MEMCG_RECLAIM_RUNNING,
+	NR_GLOBAL_RECLAIM,
+	NR_GLOBAL_RECLAIM_RUNNING,
+	NR_COMPACT,
+	NR_COMPACT_RUNNING,
+	NR_ASYNC_MEMCG_RECLAIM,
+	NR_ASYNC_MEMCG_RECLAIM_RUNNING,
+	NR_SWAP,
+	NR_SWAP_RUNNING,
+	NR_PSI_STAT_TASK_COUNTS,
+};
+
+#define CPU_CFS_BANDWIDTH	1
+
+struct psi_group_stat_cpu {
+	u32 state_mask;
+	u32 times[NR_PSI_STAT_STATES];
+	u32 psi_delta;
+	unsigned int tasks[NR_PSI_STAT_TASK_COUNTS];
+	u32 times_delta;
+	u32 times_prev[NR_PSI_AGGREGATORS][NR_PSI_STAT_STATES];
+	int prev_throttle;
+	int cur_throttle;
+};
+
+struct psi_group_ext {
+	struct psi_group psi;
+	struct psi_group_stat_cpu __percpu *pcpu;
+	/* Running fine grained pressure averages */
+	u64 avg_total[NR_PSI_STAT_STATES];
+	/* Total fine grained stall times and sampled pressure averages */
+	u64 total[NR_PSI_AGGREGATORS][NR_PSI_STAT_STATES];
+	unsigned long avg[NR_PSI_STAT_STATES][3];
+};
+#endif /* CONFIG_PSI_FINE_GRAINED */
+
 #else /* CONFIG_PSI */
 
 struct psi_group { };
 
 #endif /* CONFIG_PSI */
 
+/*
+ * Each memstall type has two task counts: regular running threads and
+ * memstall threads, for the same reason as NR_MEMSTALL_RUNNING.
+ * Because psi_memstall_type starts at 1, the correspondence between
+ * psi_memstall_type and psi_stat_task_count is:
+ *
+ * memstall : psi_memstall_type * 2 - 2;
+ * running  : psi_memstall_type * 2 - 1;
+ */
+enum psi_memstall_type {
+	PSI_MEMCG_RECLAIM = 1,
+	PSI_GLOBAL_RECLAIM,
+	PSI_COMPACT,
+	PSI_ASYNC_MEMCG_RECLAIM,
+	PSI_SWAP,
+};
+
 #endif /* _LINUX_PSI_TYPES_H */
diff --git a/include/linux/sched.h b/include/linux/sched.h
index d39427f8044d3a5a4b57f4ee9e7aefbe82b3c4a5..0a4c6a6214c4778e83e4d5f8031bbb4a1e810c09 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1449,7 +1449,11 @@ struct task_struct {
	KABI_RESERVE(10)
	KABI_RESERVE(11)
 #endif
+#ifdef CONFIG_PSI_FINE_GRAINED
+	KABI_USE(12, int memstall_type)
+#else
	KABI_RESERVE(12)
+#endif
	KABI_RESERVE(13)
	KABI_RESERVE(14)
	KABI_RESERVE(15)
diff --git a/init/Kconfig b/init/Kconfig
index 83714edd7bf9f2db51b76c36dad71ede11ba6006..f5e32e1ba26f65b1315e492c70a31a01cb07b7d7 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -653,6 +653,27 @@ config PSI_DEFAULT_DISABLED
 
	  Say N if unsure.
 
+config PSI_CGROUP_V1
+	bool "Support PSI under cgroup v1"
+	default n
+	depends on PSI
+	help
+	  If set, pressure stall information tracking will be used
+	  for cgroup v1 instead of v2.
+
+	  Say N if unsure.
+
+config PSI_FINE_GRAINED
+	bool "Support fine grained PSI under cgroup v1 and system"
+	default n
+	depends on PSI
+	help
+	  If set, fine grained pressure stall information tracking will
+	  be available for cgroup v1 and the system, covering stall
+	  sources such as memory reclaim and memory compaction.
+
+	  Say N if unsure.
+
 endmenu # "CPU/Task time and stats accounting"
 
 config CPU_ISOLATION
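NOTE (review annotation, not part of the patch): the psi_memstall_type
mapping documented in the psi_types.h hunk above is easy to get wrong by
one. A standalone illustration follows; the enum values are copied from
that hunk, and the two helpers are hypothetical, not part of the patch.

/* Standalone illustration of the index arithmetic from psi_types.h. */
enum psi_memstall_type {
	PSI_MEMCG_RECLAIM = 1,
	PSI_GLOBAL_RECLAIM,
	PSI_COMPACT,
	PSI_ASYNC_MEMCG_RECLAIM,
	PSI_SWAP,
};

static inline int stat_memstall_idx(enum psi_memstall_type type)
{
	return 2 * type - 2;	/* PSI_MEMCG_RECLAIM (1) -> NR_MEMCG_RECLAIM (0) */
}

static inline int stat_running_idx(enum psi_memstall_type type)
{
	return 2 * type - 1;	/* PSI_SWAP (5) -> NR_SWAP_RUNNING (9) */
}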
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 3d778636f2e8195fb60827f60ccf56bcb85fd508..c68b81a0c57360115b2e327bb28d65a44d029f3e 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -3677,21 +3677,21 @@ static int cpu_stat_show(struct seq_file *seq, void *v)
 static int cgroup_io_pressure_show(struct seq_file *seq, void *v)
 {
	struct cgroup *cgrp = seq_css(seq)->cgroup;
-	struct psi_group *psi = cgroup_ino(cgrp) == 1 ? &psi_system : &cgrp->psi;
+	struct psi_group *psi = cgroup_ino(cgrp) == 1 ? &psi_system : cgrp->psi;
 
	return psi_show(seq, psi, PSI_IO);
 }
 static int cgroup_memory_pressure_show(struct seq_file *seq, void *v)
 {
	struct cgroup *cgrp = seq_css(seq)->cgroup;
-	struct psi_group *psi = cgroup_ino(cgrp) == 1 ? &psi_system : &cgrp->psi;
+	struct psi_group *psi = cgroup_ino(cgrp) == 1 ? &psi_system : cgrp->psi;
 
	return psi_show(seq, psi, PSI_MEM);
 }
 static int cgroup_cpu_pressure_show(struct seq_file *seq, void *v)
 {
	struct cgroup *cgrp = seq_css(seq)->cgroup;
-	struct psi_group *psi = cgroup_ino(cgrp) == 1 ? &psi_system : &cgrp->psi;
+	struct psi_group *psi = cgroup_ino(cgrp) == 1 ? &psi_system : cgrp->psi;
 
	return psi_show(seq, psi, PSI_CPU);
 }
@@ -3717,7 +3717,7 @@ static ssize_t cgroup_pressure_write(struct kernfs_open_file *of, char *buf,
		return -EBUSY;
	}
 
-	psi = cgroup_ino(cgrp) == 1 ? &psi_system : &cgrp->psi;
+	psi = cgroup_ino(cgrp) == 1 ? &psi_system : cgrp->psi;
	new = psi_trigger_create(psi, buf, nbytes, res, of);
	if (IS_ERR(new)) {
		cgroup_put(cgrp);
@@ -3751,6 +3751,23 @@ static ssize_t cgroup_cpu_pressure_write(struct kernfs_open_file *of,
	return cgroup_pressure_write(of, buf, nbytes, PSI_CPU);
 }
 
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+static int cgroup_irq_pressure_show(struct seq_file *seq, void *v)
+{
+	struct cgroup *cgrp = seq_css(seq)->cgroup;
+	struct psi_group *psi = cgroup_ino(cgrp) == 1 ?
+				&psi_system : cgrp->psi;
+
+	return psi_show(seq, psi, PSI_IRQ);
+}
+
+static ssize_t cgroup_irq_pressure_write(struct kernfs_open_file *of,
+					 char *buf, size_t nbytes,
+					 loff_t off)
+{
+	return cgroup_pressure_write(of, buf, nbytes, PSI_IRQ);
+}
+#endif
+
 static __poll_t cgroup_pressure_poll(struct kernfs_open_file *of,
				     poll_table *pt)
 {
@@ -3766,6 +3783,17 @@ static void cgroup_pressure_release(struct kernfs_open_file *of)
	psi_trigger_destroy(ctx->psi.trigger);
 }
 
+#ifdef CONFIG_PSI_FINE_GRAINED
+static int cgroup_psi_stat_show(struct seq_file *seq, void *v)
+{
+	struct cgroup *cgrp = seq_css(seq)->cgroup;
+	struct psi_group *psi = cgroup_ino(cgrp) == 1 ? &psi_system : cgrp->psi;
+
+	return psi_stat_show(seq, psi);
+}
+#endif
+
+#ifdef CONFIG_PSI_CGROUP_V1
 struct cftype cgroup_v1_psi_files[] = {
	{
		.name = "io.pressure",
@@ -3791,8 +3819,27 @@ struct cftype cgroup_v1_psi_files[] = {
		.poll = cgroup_pressure_poll,
		.release = cgroup_pressure_release,
	},
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+	{
+		.name = "irq.pressure",
+		.flags = CFTYPE_NO_PREFIX,
+		.seq_show = cgroup_irq_pressure_show,
+		.write = cgroup_irq_pressure_write,
+		.poll = cgroup_pressure_poll,
+		.release = cgroup_pressure_release,
+	},
+#endif
+#ifdef CONFIG_PSI_FINE_GRAINED
+	{
+		.name = "pressure.stat",
+		.flags = CFTYPE_NO_PREFIX,
+		.seq_show = cgroup_psi_stat_show,
+	},
+#endif
	{ }	/* terminate */
 };
+#endif
+
 #endif /* CONFIG_PSI */
 
 static int cgroup_freeze_show(struct seq_file *seq, void *v)
@@ -5155,6 +5202,15 @@ static struct cftype cgroup_base_files[] = {
		.poll = cgroup_pressure_poll,
		.release = cgroup_pressure_release,
	},
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+	{
+		.name = "irq.pressure",
+		.seq_show = cgroup_irq_pressure_show,
+		.write = cgroup_irq_pressure_write,
+		.poll = cgroup_pressure_poll,
+		.release = cgroup_pressure_release,
+	},
+#endif
 #endif /* CONFIG_PSI */
	{ }	/* terminate */
 };
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index ebd8c3a6a964f71eb483e62b5006daa5c9eb2b55..92ba14c0bcaa04a0c832e9c730d1834b1d305eea 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -629,6 +629,8 @@ static void update_rq_clock_task(struct rq *rq, s64 delta)
 
	rq->prev_irq_time += irq_delta;
	delta -= irq_delta;
+	if (irq_delta)
+		psi_account_irqtime(rq->curr, irq_delta);
 #endif
 #ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
	if (static_key_false((&paravirt_steal_rq_enabled))) {
diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c
index 28ed182b6801cda325d7c93ebf7cb938168ff714..7a7a0dec8c4e053ff00a73077b3c3d6ac10747df 100644
--- a/kernel/sched/cpuacct.c
+++ b/kernel/sched/cpuacct.c
@@ -375,7 +375,7 @@ struct cgroup_subsys cpuacct_cgrp_subsys = {
	.early_init	= true,
 };
 
-#ifdef CONFIG_PSI
+#ifdef CONFIG_PSI_CGROUP_V1
 static bool psi_v1_enable;
 static int __init setup_psi_v1(char *str)
@@ -383,8 +383,8 @@ static int __init setup_psi_v1(char *str)
	int ret;
 
	ret = kstrtobool(str, &psi_v1_enable);
-	if (!psi_v1_enable)
-		static_branch_enable(&psi_v1_disabled);
+	if (psi_v1_enable)
+		static_branch_disable(&psi_v1_disabled);
 
	return ret == 0;
 }
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 06c6318b3ba3cd0fe25880f0a6773969a0be6284..61b077d630d63ba52d29ac78ffa4d72c57dcf708 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -125,12 +125,6 @@ int __weak arch_asym_cpu_priority(int cpu)
 
 #ifdef CONFIG_QOS_SCHED
 
-/*
- * To distinguish cfs bw, use QOS_THROTTLED mark cfs_rq->throttled
- * when qos throttled(and cfs bw throttle mark cfs_rq->throttled as 1).
- */
-#define QOS_THROTTLED	2
-
 static DEFINE_PER_CPU_SHARED_ALIGNED(struct list_head, qos_throttled_cfs_rq);
 static DEFINE_PER_CPU_SHARED_ALIGNED(struct hrtimer, qos_overload_timer);
 static DEFINE_PER_CPU(int, qos_cpu_overload);
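NOTE (review annotation, not part of the patch): with the irq.pressure
files wired up above, the existing PSI trigger API applies to the new
resource. A minimal userspace sketch follows, assuming the trigger format
documented for PSI ("full <stall us> <window us>"; IRQ accepts only "full"
per the psi_trigger_create() change later in this patch). The threshold
values are invented.

/* Sketch: arm a trigger for 150ms of IRQ stall per 1s window, then wait. */
#include <fcntl.h>
#include <poll.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	const char trig[] = "full 150000 1000000";
	struct pollfd fds;
	int fd = open("/proc/pressure/irq", O_RDWR | O_NONBLOCK);

	if (fd < 0 || write(fd, trig, strlen(trig) + 1) < 0) {
		perror("trigger setup");
		return 1;
	}
	fds.fd = fd;
	fds.events = POLLPRI;
	if (poll(&fds, 1, -1) > 0 && (fds.revents & POLLPRI))
		puts("irq pressure threshold crossed");
	close(fd);
	return 0;
}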
diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
index 11a43dccb7fcbaad9279eb294376b8d1fa6d65a2..5789b07e59dfb65f192c601db6192defa57e7749 100644
--- a/kernel/sched/psi.c
+++ b/kernel/sched/psi.c
@@ -156,7 +156,10 @@ static int psi_bug __read_mostly;
 
 DEFINE_STATIC_KEY_FALSE(psi_disabled);
-DEFINE_STATIC_KEY_FALSE(psi_v1_disabled);
+
+#ifdef CONFIG_PSI_CGROUP_V1
+DEFINE_STATIC_KEY_TRUE(psi_v1_disabled);
+#endif
 
 #ifdef CONFIG_PSI_DEFAULT_DISABLED
 static bool psi_enable;
@@ -189,6 +192,27 @@ struct psi_group psi_system = {
	.pcpu = &system_group_pcpu,
 };
 
+#ifdef CONFIG_PSI_FINE_GRAINED
+/* System-level fine grained pressure and stall tracking */
+static DEFINE_PER_CPU(struct psi_group_stat_cpu, system_stat_group_pcpu);
+struct psi_group_ext psi_stat_system = {
+	.pcpu = &system_stat_group_pcpu,
+};
+
+struct psi_group_ext *to_psi_group_ext(struct psi_group *psi)
+{
+	if (psi == &psi_system)
+		return &psi_stat_system;
+	else
+		return container_of(psi, struct psi_group_ext, psi);
+}
+#else
+static inline struct psi_group_ext *to_psi_group_ext(struct psi_group *psi)
+{
+	return NULL;
+}
+#endif
+
 static void psi_avgs_work(struct work_struct *work);
 
 static void poll_timer_fn(struct timer_list *t);
@@ -206,12 +230,8 @@ static void group_init(struct psi_group *group)
	/* Init trigger-related members */
	mutex_init(&group->trigger_lock);
	INIT_LIST_HEAD(&group->triggers);
-	memset(group->nr_triggers, 0, sizeof(group->nr_triggers));
-	group->poll_states = 0;
	group->poll_min_period = U32_MAX;
-	memset(group->polling_total, 0, sizeof(group->polling_total));
	group->polling_next_update = ULLONG_MAX;
-	group->polling_until = 0;
	init_waitqueue_head(&group->poll_wait);
	timer_setup(&group->poll_timer, poll_timer_fn, 0);
	rcu_assign_pointer(group->poll_task, NULL);
@@ -228,7 +248,7 @@ void __init psi_init(void)
	group_init(&psi_system);
 }
 
-static bool test_state(unsigned int *tasks, enum psi_states state)
+static bool test_state(unsigned int *tasks, enum psi_states state, bool oncpu)
 {
	switch (state) {
	case PSI_IO_SOME:
@@ -241,9 +261,9 @@ static bool test_state(unsigned int *tasks, enum psi_states state)
		return unlikely(tasks[NR_MEMSTALL] &&
			tasks[NR_RUNNING] == tasks[NR_MEMSTALL_RUNNING]);
	case PSI_CPU_SOME:
-		return unlikely(tasks[NR_RUNNING] > tasks[NR_ONCPU]);
+		return unlikely(tasks[NR_RUNNING] > oncpu);
	case PSI_CPU_FULL:
-		return unlikely(tasks[NR_RUNNING] && !tasks[NR_ONCPU]);
+		return unlikely(tasks[NR_RUNNING] && !oncpu);
	case PSI_NONIDLE:
		return tasks[NR_IOWAIT] || tasks[NR_MEMSTALL] ||
			tasks[NR_RUNNING];
@@ -256,6 +276,10 @@ static void get_recent_times(struct psi_group *group, int cpu,
			     enum psi_aggregators aggregator, u32 *times,
			     u32 *pchanged_states)
 {
+#ifdef CONFIG_PSI_FINE_GRAINED
+	struct psi_group_ext *psi_ext = to_psi_group_ext(group);
+	struct psi_group_stat_cpu *ext_groupc = per_cpu_ptr(psi_ext->pcpu, cpu);
+#endif
	struct psi_group_cpu *groupc = per_cpu_ptr(group->pcpu, cpu);
	u64 now, state_start;
	enum psi_states s;
@@ -295,6 +319,9 @@ static void get_recent_times(struct psi_group *group, int cpu,
		if (delta)
			*pchanged_states |= (1 << s);
	}
+#ifdef CONFIG_PSI_FINE_GRAINED
+	ext_groupc->times_delta = now - state_start;
+#endif
 }
 
 static void calc_avgs(unsigned long avg[3], int missed_periods,
@@ -317,10 +344,240 @@ static void calc_avgs(unsigned long avg[3], int missed_periods,
	avg[2] = calc_load(avg[2], EXP_300s, pct);
 }
 
+#ifdef CONFIG_PSI_FINE_GRAINED
+
+static void record_stat_times(struct psi_group_ext *psi_ext, int cpu)
+{
+	struct psi_group_stat_cpu *ext_grpc = per_cpu_ptr(psi_ext->pcpu, cpu);
+	u32 delta = ext_grpc->psi_delta;
+
+	if (ext_grpc->state_mask & (1 << PSI_MEMCG_RECLAIM_SOME)) {
+		ext_grpc->times[PSI_MEMCG_RECLAIM_SOME] += delta;
+		if (ext_grpc->state_mask & (1 << PSI_MEMCG_RECLAIM_FULL))
+			ext_grpc->times[PSI_MEMCG_RECLAIM_FULL] += delta;
+	}
+	if (ext_grpc->state_mask & (1 << PSI_GLOBAL_RECLAIM_SOME)) {
+		ext_grpc->times[PSI_GLOBAL_RECLAIM_SOME] += delta;
+		if (ext_grpc->state_mask & (1 << PSI_GLOBAL_RECLAIM_FULL))
+			ext_grpc->times[PSI_GLOBAL_RECLAIM_FULL] += delta;
+	}
+	if (ext_grpc->state_mask & (1 << PSI_COMPACT_SOME)) {
+		ext_grpc->times[PSI_COMPACT_SOME] += delta;
+		if (ext_grpc->state_mask & (1 << PSI_COMPACT_FULL))
+			ext_grpc->times[PSI_COMPACT_FULL] += delta;
+	}
+	if (ext_grpc->state_mask & (1 << PSI_ASYNC_MEMCG_RECLAIM_SOME)) {
+		ext_grpc->times[PSI_ASYNC_MEMCG_RECLAIM_SOME] += delta;
+		if (ext_grpc->state_mask & (1 << PSI_ASYNC_MEMCG_RECLAIM_FULL))
+			ext_grpc->times[PSI_ASYNC_MEMCG_RECLAIM_FULL] += delta;
+	}
+	if (ext_grpc->state_mask & (1 << PSI_SWAP_SOME)) {
+		ext_grpc->times[PSI_SWAP_SOME] += delta;
+		if (ext_grpc->state_mask & (1 << PSI_SWAP_FULL))
+			ext_grpc->times[PSI_SWAP_FULL] += delta;
+	}
+}
+
+static bool test_fine_grained_stat(unsigned int *stat_tasks,
+				   unsigned int nr_running,
+				   enum psi_stat_states state)
+{
+	switch (state) {
+	case PSI_MEMCG_RECLAIM_SOME:
+		return unlikely(stat_tasks[NR_MEMCG_RECLAIM]);
+	case PSI_MEMCG_RECLAIM_FULL:
+		return unlikely(stat_tasks[NR_MEMCG_RECLAIM] &&
+				nr_running == stat_tasks[NR_MEMCG_RECLAIM_RUNNING]);
+	case PSI_GLOBAL_RECLAIM_SOME:
+		return unlikely(stat_tasks[NR_GLOBAL_RECLAIM]);
+	case PSI_GLOBAL_RECLAIM_FULL:
+		return unlikely(stat_tasks[NR_GLOBAL_RECLAIM] &&
+				nr_running == stat_tasks[NR_GLOBAL_RECLAIM_RUNNING]);
+	case PSI_COMPACT_SOME:
+		return unlikely(stat_tasks[NR_COMPACT]);
+	case PSI_COMPACT_FULL:
+		return unlikely(stat_tasks[NR_COMPACT] &&
+				nr_running == stat_tasks[NR_COMPACT_RUNNING]);
+	case PSI_ASYNC_MEMCG_RECLAIM_SOME:
+		return unlikely(stat_tasks[NR_ASYNC_MEMCG_RECLAIM]);
+	case PSI_ASYNC_MEMCG_RECLAIM_FULL:
+		return unlikely(stat_tasks[NR_ASYNC_MEMCG_RECLAIM] &&
+				nr_running == stat_tasks[NR_ASYNC_MEMCG_RECLAIM_RUNNING]);
+	case PSI_SWAP_SOME:
+		return unlikely(stat_tasks[NR_SWAP]);
+	case PSI_SWAP_FULL:
+		return unlikely(stat_tasks[NR_SWAP] &&
+				nr_running == stat_tasks[NR_SWAP_RUNNING]);
+	default:
+		return false;
+	}
+}
+
+static void psi_group_stat_change(struct psi_group *group, int cpu,
+				  int clear, int set)
+{
+	int t;
+	u32 state_mask = 0;
+	enum psi_stat_states s;
+	struct psi_group_ext *psi_ext = to_psi_group_ext(group);
+	struct psi_group_cpu *groupc = per_cpu_ptr(group->pcpu, cpu);
+	struct psi_group_stat_cpu *ext_groupc = per_cpu_ptr(psi_ext->pcpu, cpu);
+
+	write_seqcount_begin(&groupc->seq);
+	record_stat_times(psi_ext, cpu);
+
+	for (t = 0; clear; clear &= ~(1 << t), t++)
+		if (clear & (1 << t))
+			ext_groupc->tasks[t]--;
+	for (t = 0; set; set &= ~(1 << t), t++)
+		if (set & (1 << t))
+			ext_groupc->tasks[t]++;
+	for (s = 0; s < PSI_CPU_CFS_BANDWIDTH_FULL; s++)
+		if (test_fine_grained_stat(ext_groupc->tasks,
+					   groupc->tasks[NR_RUNNING], s))
+			state_mask |= (1 << s);
+	if (unlikely(groupc->state_mask & PSI_ONCPU) &&
+	    cpu_curr(cpu)->memstall_type)
+		state_mask |= (1 << (cpu_curr(cpu)->memstall_type * 2 - 1));
+
+	ext_groupc->state_mask = state_mask;
+	write_seqcount_end(&groupc->seq);
+}
+
+static void update_psi_stat_delta(struct psi_group *group, int cpu, u64 now)
+{
+	struct psi_group_ext *psi_ext = to_psi_group_ext(group);
+	struct psi_group_stat_cpu *ext_groupc = per_cpu_ptr(psi_ext->pcpu, cpu);
+	struct psi_group_cpu *groupc = per_cpu_ptr(group->pcpu, cpu);
+
+	ext_groupc->psi_delta = now - groupc->state_start;
+}
+
+static void psi_stat_flags_change(struct task_struct *task, int *stat_set,
+				  int *stat_clear, int set, int clear)
+{
+	if (!task->memstall_type)
+		return;
+
+	if (clear) {
+		if (clear & TSK_MEMSTALL)
+			*stat_clear |= 1 << (2 * task->memstall_type - 2);
+		if (clear & TSK_MEMSTALL_RUNNING)
+			*stat_clear |= 1 << (2 * task->memstall_type - 1);
+	}
+	if (set) {
+		if (set & TSK_MEMSTALL)
+			*stat_set |= 1 << (2 * task->memstall_type - 2);
+		if (set & TSK_MEMSTALL_RUNNING)
+			*stat_set |= 1 << (2 * task->memstall_type - 1);
+	}
+	if (!task->in_memstall)
+		task->memstall_type = 0;
+}
+
+static void get_recent_stat_times(struct psi_group *group, int cpu,
+				  enum psi_aggregators aggregator, u32 *times)
+{
+	struct psi_group_ext *psi_ext = to_psi_group_ext(group);
+	struct psi_group_stat_cpu *ext_groupc = per_cpu_ptr(psi_ext->pcpu, cpu);
+	enum psi_stat_states s;
+	u32 delta;
+
+	memcpy(times, ext_groupc->times, sizeof(ext_groupc->times));
+	for (s = 0; s < NR_PSI_STAT_STATES; s++) {
+		if (ext_groupc->state_mask & (1 << s))
+			times[s] += ext_groupc->times_delta;
+		delta = times[s] - ext_groupc->times_prev[aggregator][s];
+		ext_groupc->times_prev[aggregator][s] = times[s];
+		times[s] = delta;
+	}
+}
+
+static void update_stat_averages(struct psi_group_ext *psi_ext,
+				 unsigned long missed_periods, u64 period)
+{
+	int s;
+
+	for (s = 0; s < NR_PSI_STAT_STATES; s++) {
+		u32 sample;
+
+		sample = psi_ext->total[PSI_AVGS][s] - psi_ext->avg_total[s];
+		if (sample > period)
+			sample = period;
+		psi_ext->avg_total[s] += sample;
+		calc_avgs(psi_ext->avg[s], missed_periods, sample, period);
+	}
+}
+#else
+static inline void psi_group_stat_change(struct psi_group *group, int cpu,
+					 int clear, int set) {}
+static inline void update_psi_stat_delta(struct psi_group *group, int cpu,
+					 u64 now) {}
+static inline void psi_stat_flags_change(struct task_struct *task,
+					 int *stat_set, int *stat_clear,
+					 int set, int clear) {}
+static inline void record_stat_times(struct psi_group_ext *psi_ext, int cpu) {}
+static inline void update_stat_averages(struct psi_group_ext *psi_ext,
+					unsigned long missed_periods,
+					u64 period) {}
+#endif
+
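NOTE (review annotation, not part of the patch): test_fine_grained_stat()
above mirrors the classic PSI SOME/FULL split: SOME means at least one task
is stalled on the source, FULL means every runnable task is stalled on it.
A condensed sketch of that predicate pair follows; the generic index names
(base = the NR_* count, base + 1 = its *_RUNNING companion) are hypothetical,
the patch deliberately spells each source out in a flat switch instead.

/* Hypothetical condensed form of the per-source SOME/FULL test. */
static bool stat_state_some(const unsigned int *tasks, int base)
{
	return tasks[base] != 0;	/* someone is stalled on this source */
}

static bool stat_state_full(const unsigned int *tasks,
			    unsigned int nr_running, int base)
{
	/* every runnable task is a stalled-and-running task of this source */
	return tasks[base] && nr_running == tasks[base + 1];
}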
+#if defined(CONFIG_CFS_BANDWIDTH) && defined(CONFIG_CGROUP_CPUACCT) && \
+	defined(CONFIG_PSI_FINE_GRAINED)
+static void record_cpu_stat_times(struct psi_group *group, int cpu)
+{
+	struct psi_group_ext *psi_ext = to_psi_group_ext(group);
+	struct psi_group_cpu *groupc = per_cpu_ptr(group->pcpu, cpu);
+	struct psi_group_stat_cpu *ext_groupc = per_cpu_ptr(psi_ext->pcpu, cpu);
+	u32 delta = ext_groupc->psi_delta;
+
+	if (groupc->state_mask & (1 << PSI_CPU_FULL)) {
+		if (ext_groupc->prev_throttle == CPU_CFS_BANDWIDTH)
+			ext_groupc->times[PSI_CPU_CFS_BANDWIDTH_FULL] += delta;
+#ifdef CONFIG_QOS_SCHED
+		else if (ext_groupc->prev_throttle == QOS_THROTTLED)
+			ext_groupc->times[PSI_CPU_QOS_FULL] += delta;
+#endif
+	}
+}
+
+static void update_throttle_type(struct task_struct *task, int cpu, bool next)
+{
+	struct cgroup *cpuacct_cgrp;
+	struct psi_group_ext *psi_ext;
+	struct psi_group_stat_cpu *groupc;
+	struct task_group *tsk_grp;
+
+	if (!cgroup_subsys_on_dfl(cpuacct_cgrp_subsys)) {
+		rcu_read_lock();
+		cpuacct_cgrp = task_cgroup(task, cpuacct_cgrp_id);
+		if (cgroup_parent(cpuacct_cgrp)) {
+			psi_ext = to_psi_group_ext(cgroup_psi(cpuacct_cgrp));
+			groupc = per_cpu_ptr(psi_ext->pcpu, cpu);
+			tsk_grp = task_group(task);
+			if (next)
+				groupc->prev_throttle = groupc->cur_throttle;
+			groupc->cur_throttle = tsk_grp->cfs_rq[cpu]->throttled;
+		}
+		rcu_read_unlock();
+	}
+}
+#else
+static inline void record_cpu_stat_times(struct psi_group *group, int cpu) {}
+static inline void update_throttle_type(struct task_struct *task, int cpu,
+					bool next) {}
+#endif
+
 static void collect_percpu_times(struct psi_group *group,
				 enum psi_aggregators aggregator,
				 u32 *pchanged_states)
 {
+#ifdef CONFIG_PSI_FINE_GRAINED
+	u64 stat_delta[NR_PSI_STAT_STATES] = { 0 };
+	u32 stat_times[NR_PSI_STAT_STATES] = { 0 };
+	struct psi_group_ext *psi_ext = to_psi_group_ext(group);
+#endif
	u64 deltas[NR_PSI_STATES - 1] = { 0, };
	unsigned long nonidle_total = 0;
	u32 changed_states = 0;
@@ -349,6 +606,11 @@ static void collect_percpu_times(struct psi_group *group,
 
		for (s = 0; s < PSI_NONIDLE; s++)
			deltas[s] += (u64)times[s] * nonidle;
+#ifdef CONFIG_PSI_FINE_GRAINED
+		get_recent_stat_times(group, cpu, aggregator, stat_times);
+		for (s = 0; s < NR_PSI_STAT_STATES; s++)
+			stat_delta[s] += (u64)stat_times[s] * nonidle;
+#endif
	}
 
	/*
@@ -368,12 +630,19 @@ static void collect_percpu_times(struct psi_group *group,
		group->total[aggregator][s] +=
				div_u64(deltas[s], max(nonidle_total, 1UL));
 
+#ifdef CONFIG_PSI_FINE_GRAINED
+	for (s = 0; s < NR_PSI_STAT_STATES; s++)
+		psi_ext->total[aggregator][s] +=
+			div_u64(stat_delta[s], max(nonidle_total, 1UL));
+#endif
+
	if (pchanged_states)
		*pchanged_states = changed_states;
 }
 
 static u64 update_averages(struct psi_group *group, u64 now)
 {
+	struct psi_group_ext *psi_ext = to_psi_group_ext(group);
	unsigned long missed_periods = 0;
	u64 expires, period;
	u64 avg_next_update;
@@ -422,6 +691,7 @@ static u64 update_averages(struct psi_group *group, u64 now)
		calc_avgs(group->avg[s], missed_periods, sample, period);
	}
 
+	update_stat_averages(psi_ext, missed_periods, period);
	return avg_next_update;
 }
 
@@ -696,9 +966,9 @@ static void psi_group_change(struct psi_group *group, int cpu,
			     bool wake_clock)
 {
	struct psi_group_cpu *groupc;
-	u32 state_mask = 0;
	unsigned int t, m;
	enum psi_states s;
+	u32 state_mask;
 
	groupc = per_cpu_ptr(group->pcpu, cpu);
 
@@ -713,18 +983,38 @@ static void psi_group_change(struct psi_group *group, int cpu,
	write_seqcount_begin(&groupc->seq);
 
	record_times(groupc, now);
+	record_cpu_stat_times(group, cpu);
+
+	/*
+	 * Start with TSK_ONCPU, which doesn't have a corresponding
+	 * task count - it's just a boolean flag directly encoded in
+	 * the state mask. Clear, set, or carry the current state if
+	 * no changes are requested.
+	 */
+	if (unlikely(clear & TSK_ONCPU)) {
+		state_mask = 0;
+		clear &= ~TSK_ONCPU;
+	} else if (unlikely(set & TSK_ONCPU)) {
+		state_mask = PSI_ONCPU;
+		set &= ~TSK_ONCPU;
+	} else {
+		state_mask = groupc->state_mask & PSI_ONCPU;
+	}
 
+	/*
+	 * The rest of the state mask is calculated based on the task
+	 * counts. Update those first, then construct the mask.
+	 */
	for (t = 0, m = clear; m; m &= ~(1 << t), t++) {
		if (!(m & (1 << t)))
			continue;
		if (groupc->tasks[t]) {
			groupc->tasks[t]--;
		} else if (!psi_bug) {
-			printk_deferred(KERN_ERR "psi: task underflow! cpu=%d t=%d tasks=[%u %u %u %u %u] clear=%x set=%x\n",
+			printk_deferred(KERN_ERR "psi: task underflow! cpu=%d t=%d tasks=[%u %u %u %u] clear=%x set=%x\n",
					cpu, t, groupc->tasks[0],
					groupc->tasks[1], groupc->tasks[2],
-					groupc->tasks[3], groupc->tasks[4],
-					clear, set);
+					groupc->tasks[3], clear, set);
			psi_bug = 1;
		}
	}
@@ -733,9 +1023,8 @@ static void psi_group_change(struct psi_group *group, int cpu,
		if (set & (1 << t))
			groupc->tasks[t]++;
 
-	/* Calculate state mask representing active states */
	for (s = 0; s < NR_PSI_STATES; s++) {
-		if (test_state(groupc->tasks, s))
+		if (test_state(groupc->tasks, s, state_mask & PSI_ONCPU))
			state_mask |= (1 << s);
	}
 
@@ -747,7 +1036,7 @@ static void psi_group_change(struct psi_group *group, int cpu,
	 * task in a cgroup is in_memstall, the corresponding groupc
	 * on that cpu is in PSI_MEM_FULL state.
	 */
-	if (unlikely(groupc->tasks[NR_ONCPU] && cpu_curr(cpu)->in_memstall))
+	if (unlikely((state_mask & PSI_ONCPU) && cpu_curr(cpu)->in_memstall))
		state_mask |= (1 << PSI_MEM_FULL);
 
	groupc->state_mask = state_mask;
@@ -767,21 +1056,23 @@ static struct psi_group *iterate_groups(struct task_struct *task, void **iter)
	struct cgroup *cgroup = NULL;
 
	if (!*iter) {
-		if (static_branch_likely(&psi_v1_disabled))
-			cgroup = task->cgroups->dfl_cgrp;
-		else {
+#ifndef CONFIG_PSI_CGROUP_V1
+		cgroup = task->cgroups->dfl_cgrp;
+#else
 #ifdef CONFIG_CGROUP_CPUACCT
-			if (!cgroup_subsys_on_dfl(cpuacct_cgrp_subsys)) {
+		if (!cgroup_subsys_on_dfl(cpuacct_cgrp_subsys)) {
+			if (!static_branch_likely(&psi_v1_disabled)) {
				rcu_read_lock();
				cgroup = task_cgroup(task, cpuacct_cgrp_id);
				rcu_read_unlock();
-			} else {
-				cgroup = task->cgroups->dfl_cgrp;
			}
+		} else {
+			cgroup = task->cgroups->dfl_cgrp;
+		}
 #else
-			cgroup = NULL;
+		cgroup = NULL;
+#endif
 #endif
-		}
	} else if (*iter == &psi_system)
		return NULL;
	else
@@ -818,29 +1109,24 @@ void psi_task_change(struct task_struct *task, int clear, int set)
 {
	int cpu = task_cpu(task);
	struct psi_group *group;
-	bool wake_clock = true;
	void *iter = NULL;
	u64 now;
+	int stat_set = 0;
+	int stat_clear = 0;
 
	if (!task->pid)
		return;
 
	psi_flags_change(task, clear, set);
+	psi_stat_flags_change(task, &stat_set, &stat_clear, set, clear);
 
	now = cpu_clock(cpu);
-	/*
-	 * Periodic aggregation shuts off if there is a period of no
-	 * task changes, so we wake it back up if necessary. However,
-	 * don't do this if the task change is the aggregation worker
-	 * itself going to sleep, or we'll ping-pong forever.
-	 */
-	if (unlikely((clear & TSK_RUNNING) &&
-		     (task->flags & PF_WQ_WORKER) &&
-		     wq_worker_last_func(task) == psi_avgs_work))
-		wake_clock = false;
 
-	while ((group = iterate_groups(task, &iter)))
-		psi_group_change(group, cpu, clear, set, now, wake_clock);
+	while ((group = iterate_groups(task, &iter))) {
+		update_psi_stat_delta(group, cpu, now);
+		psi_group_change(group, cpu, clear, set, now, true);
+		psi_group_stat_change(group, cpu, stat_clear, stat_set);
+	}
 }
 
 void psi_task_switch(struct task_struct *prev, struct task_struct *next,
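NOTE (review annotation, not part of the patch): this section backports the
upstream rework that folds TSK_ONCPU into the per-CPU state mask (PSI_ONCPU)
instead of keeping a task count, since at most one task can be on the CPU.
A standalone illustration of the resulting bit layout follows; the values
are taken from the psi_types.h enums in this patch, assuming
CONFIG_IRQ_TIME_ACCOUNTING=y so that NR_PSI_STATES is 8.

/* Illustrative only: low bits are PSI states, one extra bit is on-CPU. */
#include <stdio.h>

enum { NR_PSI_STATES = 8 };			/* PSI_IO_SOME..PSI_NONIDLE */
#define PSI_ONCPU	(1 << NR_PSI_STATES)	/* bit 8, as in psi_types.h */

int main(void)
{
	unsigned int state_mask = PSI_ONCPU | (1 << 0);	/* on-CPU + IO_SOME */

	printf("oncpu=%d io_some=%d\n",
	       !!(state_mask & PSI_ONCPU), !!(state_mask & (1 << 0)));
	return 0;
}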
@@ -852,32 +1138,35 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next,
	u64 now = cpu_clock(cpu);
 
	if (next->pid) {
-		bool identical_state;
-
+		update_throttle_type(next, cpu, true);
		psi_flags_change(next, 0, TSK_ONCPU);
		/*
-		 * When switching between tasks that have an identical
-		 * runtime state, the cgroup that contains both tasks
-		 * we reach the first common ancestor. Iterate @next's
-		 * ancestors only until we encounter @prev's ONCPU.
+		 * Set TSK_ONCPU on @next's cgroups. If @next shares any
+		 * ancestors with @prev, those will already have @prev's
+		 * TSK_ONCPU bit set, and we can stop the iteration there.
		 */
-		identical_state = prev->psi_flags == next->psi_flags;
		iter = NULL;
		while ((group = iterate_groups(next, &iter))) {
-			if (identical_state &&
-			    per_cpu_ptr(group->pcpu, cpu)->tasks[NR_ONCPU]) {
+			if (per_cpu_ptr(group->pcpu, cpu)->state_mask &
+			    PSI_ONCPU) {
				common = group;
				break;
			}
 
+			update_psi_stat_delta(group, cpu, now);
			psi_group_change(group, cpu, 0, TSK_ONCPU, now, true);
+			psi_group_stat_change(group, cpu, 0, 0);
		}
	}
 
	if (prev->pid) {
		int clear = TSK_ONCPU, set = 0;
+		bool wake_clock = true;
+		int stat_set = 0;
+		int stat_clear = 0;
+		bool memstall_type_change = false;
 
+		update_throttle_type(prev, cpu, false);
		/*
		 * When we're going to sleep, psi_dequeue() lets us
		 * handle TSK_RUNNING, TSK_MEMSTALL_RUNNING and
@@ -890,26 +1179,83 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next,
				clear |= TSK_MEMSTALL_RUNNING;
			if (prev->in_iowait)
				set |= TSK_IOWAIT;
+
+			/*
+			 * Periodic aggregation shuts off if there is a period of no
+			 * task changes, so we wake it back up if necessary. However,
+			 * don't do this if the task change is the aggregation worker
+			 * itself going to sleep, or we'll ping-pong forever.
+			 */
+			if (unlikely((prev->flags & PF_WQ_WORKER) &&
+				     wq_worker_last_func(prev) == psi_avgs_work))
+				wake_clock = false;
		}
 
		psi_flags_change(prev, clear, set);
+		psi_stat_flags_change(prev, &stat_set, &stat_clear, set, clear);
 
		iter = NULL;
-		while ((group = iterate_groups(prev, &iter)) && group != common)
-			psi_group_change(group, cpu, clear, set, now, true);
-
+		while ((group = iterate_groups(prev, &iter)) && group != common) {
+			update_psi_stat_delta(group, cpu, now);
+			psi_group_change(group, cpu, clear, set, now, wake_clock);
+			psi_group_stat_change(group, cpu, stat_clear, stat_set);
+		}
+
+#ifdef CONFIG_PSI_FINE_GRAINED
+		if (next->memstall_type != prev->memstall_type)
+			memstall_type_change = true;
+#endif
		/*
-		 * TSK_ONCPU is handled up to the common ancestor. If we're tasked
-		 * with dequeuing too, finish that for the rest of the hierarchy.
+		 * TSK_ONCPU is handled up to the common ancestor. If there are
+		 * any other differences between the two tasks (e.g. prev goes
+		 * to sleep, or only one task is memstall), finish propagating
+		 * those differences all the way up to the root.
		 */
-		if (sleep) {
+		if ((prev->psi_flags ^ next->psi_flags) & ~TSK_ONCPU ||
+		    memstall_type_change) {
			clear &= ~TSK_ONCPU;
-			for (; group; group = iterate_groups(prev, &iter))
-				psi_group_change(group, cpu, clear, set, now, true);
+			for (; group; group = iterate_groups(prev, &iter)) {
+				update_psi_stat_delta(group, cpu, now);
+				psi_group_change(group, cpu, clear, set, now, wake_clock);
+				psi_group_stat_change(group, cpu, stat_clear,
						      stat_set);
+			}
		}
	}
 }
 
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+void psi_account_irqtime(struct task_struct *task, u32 delta)
+{
+	int cpu = task_cpu(task);
+	void *iter = NULL;
+	struct psi_group *group;
+	struct psi_group_cpu *groupc;
+	u64 now;
+
+	if (!task->pid)
+		return;
+
+	now = cpu_clock(cpu);
+
+	while ((group = iterate_groups(task, &iter))) {
+		groupc = per_cpu_ptr(group->pcpu, cpu);
+
+		write_seqcount_begin(&groupc->seq);
+
+		update_psi_stat_delta(group, cpu, now);
+		record_stat_times(to_psi_group_ext(group), cpu);
+		record_times(groupc, now);
+		record_cpu_stat_times(group, cpu);
+		groupc->times[PSI_IRQ_FULL] += delta;
+
+		write_seqcount_end(&groupc->seq);
+
+		if (group->poll_states & (1 << PSI_IRQ_FULL))
+			psi_schedule_poll_work(group, 1);
+	}
+}
+#endif
+
 /**
  * psi_memstall_enter - mark the beginning of a memory stall section
  * @flags: flags to handle nested sections
@@ -921,6 +1267,9 @@ void psi_memstall_enter(unsigned long *flags)
 {
	struct rq_flags rf;
	struct rq *rq;
+#ifdef CONFIG_PSI_FINE_GRAINED
+	unsigned long stat_flags = *flags;
+#endif
 
	if (static_branch_likely(&psi_disabled))
		return;
@@ -938,6 +1287,10 @@ void psi_memstall_enter(unsigned long *flags)
	rq = this_rq_lock_irq(&rf);
 
	current->in_memstall = 1;
+#ifdef CONFIG_PSI_FINE_GRAINED
+	if (stat_flags)
+		current->memstall_type = stat_flags;
+#endif
	psi_task_change(current, 0, TSK_MEMSTALL | TSK_MEMSTALL_RUNNING);
 
	rq_unlock_irq(rq, &rf);
@@ -961,6 +1314,7 @@ void psi_memstall_leave(unsigned long *flags)
		return;
 
	trace_psi_memstall_leave(_RET_IP_);
+
	/*
	 * in_memstall clearing & accounting needs to be atomic wrt
	 * changes to the task's scheduling state, otherwise we could
@@ -977,13 +1331,40 @@ void psi_memstall_leave(unsigned long *flags)
 #ifdef CONFIG_CGROUPS
 int psi_cgroup_alloc(struct cgroup *cgroup)
 {
+#ifdef CONFIG_PSI_FINE_GRAINED
+	struct psi_group_ext *psi_ext;
+#endif
+
	if (static_branch_likely(&psi_disabled))
		return 0;
 
-	cgroup->psi.pcpu = alloc_percpu(struct psi_group_cpu);
-	if (!cgroup->psi.pcpu)
+#ifdef CONFIG_PSI_FINE_GRAINED
+	psi_ext = kzalloc(sizeof(struct psi_group_ext), GFP_KERNEL);
+	if (!psi_ext)
+		return -ENOMEM;
+	psi_ext->pcpu = alloc_percpu(struct psi_group_stat_cpu);
+	if (!psi_ext->pcpu) {
+		kfree(psi_ext);
+		return -ENOMEM;
+	}
+	cgroup->psi = &psi_ext->psi;
+#else
+	cgroup->psi = kzalloc(sizeof(struct psi_group), GFP_KERNEL);
+	if (!cgroup->psi)
+		return -ENOMEM;
+#endif
+	cgroup->psi->pcpu = alloc_percpu(struct psi_group_cpu);
+	if (!cgroup->psi->pcpu) {
+#ifdef CONFIG_PSI_FINE_GRAINED
+		free_percpu(psi_ext->pcpu);
+		kfree(psi_ext);
+#else
+		kfree(cgroup->psi);
+#endif
		return -ENOMEM;
-	group_init(&cgroup->psi);
+	}
+	group_init(cgroup->psi);
	return 0;
 }
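NOTE (review annotation, not part of the patch): psi_cgroup_alloc() above
relies on struct psi_group being the first member of struct psi_group_ext,
so to_psi_group_ext() can container_of() its way back from the pointer that
cgroup->psi stores. A standalone userspace approximation of that round-trip
follows; the struct members besides the embedded group are invented.

/* Illustrative only: the embed/container_of round-trip behind to_psi_group_ext(). */
#include <stddef.h>
#include <stdio.h>

struct psi_group { int dummy; };
struct psi_group_ext {
	struct psi_group psi;	/* must stay the first member */
	int extra_stats;
};

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

int main(void)
{
	struct psi_group_ext ext = { .extra_stats = 42 };
	struct psi_group *grp = &ext.psi;	/* what cgroup->psi points at */
	struct psi_group_ext *back =
		container_of(grp, struct psi_group_ext, psi);

	printf("%d\n", back->extra_stats);	/* 42 */
	return 0;
}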
@@ -992,10 +1373,16 @@ void psi_cgroup_free(struct cgroup *cgroup)
	if (static_branch_likely(&psi_disabled))
		return;
 
-	cancel_delayed_work_sync(&cgroup->psi.avgs_work);
-	free_percpu(cgroup->psi.pcpu);
+	cancel_delayed_work_sync(&cgroup->psi->avgs_work);
+	free_percpu(cgroup->psi->pcpu);
	/* All triggers must be removed by now */
-	WARN_ONCE(cgroup->psi.poll_states, "psi: trigger leak\n");
+	WARN_ONCE(cgroup->psi->poll_states, "psi: trigger leak\n");
+#ifdef CONFIG_PSI_FINE_GRAINED
+	free_percpu(to_psi_group_ext(cgroup->psi)->pcpu);
+	kfree(to_psi_group_ext(cgroup->psi));
+#else
+	kfree(cgroup->psi);
+#endif
 }
 
 /**
@@ -1068,6 +1455,7 @@ void cgroup_move_task(struct task_struct *task, struct css_set *to)
 
 int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res)
 {
+	bool only_full = false;
	int full;
	u64 now;
 
@@ -1082,7 +1470,11 @@ int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res)
		group->avg_next_update = update_averages(group, now);
	mutex_unlock(&group->avgs_lock);
 
-	for (full = 0; full < 2; full++) {
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+	only_full = res == PSI_IRQ;
+#endif
+
+	for (full = 0; full < 2 - only_full; full++) {
		unsigned long avg[3] = { 0, };
		u64 total = 0;
		int w;
@@ -1096,7 +1488,7 @@ int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res)
		}
 
		seq_printf(m, "%s avg10=%lu.%02lu avg60=%lu.%02lu avg300=%lu.%02lu total=%llu\n",
-			   full ? "full" : "some",
+			   full || only_full ? "full" : "some",
			   LOAD_INT(avg[0]), LOAD_FRAC(avg[0]),
			   LOAD_INT(avg[1]), LOAD_FRAC(avg[1]),
			   LOAD_INT(avg[2]), LOAD_FRAC(avg[2]),
@@ -1106,36 +1498,6 @@ int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res)
	return 0;
 }
 
-static int psi_io_show(struct seq_file *m, void *v)
-{
-	return psi_show(m, &psi_system, PSI_IO);
-}
-
-static int psi_memory_show(struct seq_file *m, void *v)
-{
-	return psi_show(m, &psi_system, PSI_MEM);
-}
-
-static int psi_cpu_show(struct seq_file *m, void *v)
-{
-	return psi_show(m, &psi_system, PSI_CPU);
-}
-
-static int psi_io_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, psi_io_show, NULL);
-}
-
-static int psi_memory_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, psi_memory_show, NULL);
-}
-
-static int psi_cpu_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, psi_cpu_show, NULL);
-}
-
 struct psi_trigger *psi_trigger_create(struct psi_group *group, char *buf,
				       size_t nbytes, enum psi_res res,
				       struct kernfs_open_file *of)
@@ -1155,6 +1517,11 @@ struct psi_trigger *psi_trigger_create(struct psi_group *group, char *buf,
	else
		return ERR_PTR(-EINVAL);
 
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+	if (res == PSI_IRQ && --state != PSI_IRQ_FULL)
+		return ERR_PTR(-EINVAL);
+#endif
+
	if (state >= PSI_NONIDLE)
		return ERR_PTR(-EINVAL);
 
@@ -1304,6 +1671,37 @@ __poll_t psi_trigger_poll(void **trigger_ptr,
	return ret;
 }
 
+#ifdef CONFIG_PROC_FS
+static int psi_io_show(struct seq_file *m, void *v)
+{
+	return psi_show(m, &psi_system, PSI_IO);
+}
+
+static int psi_memory_show(struct seq_file *m, void *v)
+{
+	return psi_show(m, &psi_system, PSI_MEM);
+}
+
+static int psi_cpu_show(struct seq_file *m, void *v)
+{
+	return psi_show(m, &psi_system, PSI_CPU);
+}
+
+static int psi_io_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, psi_io_show, NULL);
+}
+
+static int psi_memory_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, psi_memory_show, NULL);
+}
+
+static int psi_cpu_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, psi_cpu_show, NULL);
+}
+
 static ssize_t psi_write(struct file *file, const char __user *user_buf,
			 size_t nbytes, enum psi_res res)
 {
@@ -1407,6 +1805,107 @@ static const struct proc_ops psi_cpu_proc_ops = {
	.proc_release	= psi_fop_release,
 };
 
+#ifdef CONFIG_PSI_FINE_GRAINED
+static const char *const psi_stat_names[] = {
+	"cgroup_memory_reclaim",
+	"global_memory_reclaim",
+	"compact",
+	"cgroup_async_memory_reclaim",
+	"swap",
+	"cpu_cfs_bandwidth",
+	"cpu_qos",
+};
+
+static void get_stat_names(struct seq_file *m, int i, bool is_full)
+{
+	if (i <= PSI_SWAP_FULL && !is_full)
+		seq_printf(m, "%s\n", psi_stat_names[i / 2]);
+	else if (i == PSI_CPU_CFS_BANDWIDTH_FULL)
+		seq_printf(m, "%s\n", "cpu_cfs_bandwidth");
+#ifdef CONFIG_QOS_SCHED
+	else if (i == PSI_CPU_QOS_FULL)
+		seq_printf(m, "%s\n", "cpu_qos");
+#endif
+}
+
+int psi_stat_show(struct seq_file *m, struct psi_group *group)
+{
+	struct psi_group_ext *psi_ext;
+	unsigned long avg[3] = {0, };
+	int i, w;
+	bool is_full;
+	u64 now, total;
+
+	if (static_branch_likely(&psi_disabled))
+		return -EOPNOTSUPP;
+
+	psi_ext = to_psi_group_ext(group);
+	mutex_lock(&group->avgs_lock);
+	now = sched_clock();
+	collect_percpu_times(group, PSI_AVGS, NULL);
+	if (now >= group->avg_next_update)
+		group->avg_next_update = update_averages(group, now);
+	mutex_unlock(&group->avgs_lock);
+	for (i = 0; i < NR_PSI_STAT_STATES; i++) {
+		is_full = i % 2 || i > PSI_SWAP_FULL;
+		for (w = 0; w < 3; w++)
+			avg[w] = psi_ext->avg[i][w];
+		total = div_u64(psi_ext->total[PSI_AVGS][i], NSEC_PER_USEC);
+		get_stat_names(m, i, is_full);
+		seq_printf(m, "%s avg10=%lu.%02lu avg60=%lu.%02lu avg300=%lu.%02lu total=%llu\n",
+			   is_full ? "full" : "some",
+			   LOAD_INT(avg[0]), LOAD_FRAC(avg[0]),
+			   LOAD_INT(avg[1]), LOAD_FRAC(avg[1]),
+			   LOAD_INT(avg[2]), LOAD_FRAC(avg[2]),
+			   total);
+	}
+	return 0;
+}
+
+static int system_psi_stat_show(struct seq_file *m, void *v)
+{
+	return psi_stat_show(m, &psi_system);
+}
+
+static int psi_stat_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, system_psi_stat_show, NULL);
+}
+
+static const struct proc_ops psi_stat_proc_ops = {
+	.proc_open	= psi_stat_open,
+	.proc_read	= seq_read,
+	.proc_lseek	= seq_lseek,
+	.proc_release	= psi_fop_release,
+};
+#endif
+
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+static int psi_irq_show(struct seq_file *m, void *v)
+{
+	return psi_show(m, &psi_system, PSI_IRQ);
+}
+
+static int psi_irq_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, psi_irq_show, NULL);
+}
+
+static ssize_t psi_irq_write(struct file *file, const char __user *user_buf,
+			     size_t nbytes, loff_t *ppos)
+{
+	return psi_write(file, user_buf, nbytes, PSI_IRQ);
+}
+
+static const struct proc_ops psi_irq_proc_ops = {
+	.proc_open	= psi_irq_open,
+	.proc_read	= seq_read,
+	.proc_lseek	= seq_lseek,
+	.proc_write	= psi_irq_write,
+	.proc_poll	= psi_fop_poll,
+	.proc_release	= psi_fop_release,
+};
+#endif
+
 static int __init psi_proc_init(void)
 {
	if (psi_enable) {
@@ -1414,7 +1913,15 @@ static int __init psi_proc_init(void)
		proc_create("pressure/io", 0, NULL, &psi_io_proc_ops);
		proc_create("pressure/memory", 0, NULL, &psi_memory_proc_ops);
		proc_create("pressure/cpu", 0, NULL, &psi_cpu_proc_ops);
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+		proc_create("pressure/irq", 0, NULL, &psi_irq_proc_ops);
+#endif
+#ifdef CONFIG_PSI_FINE_GRAINED
+		proc_create("pressure/stat", 0, NULL, &psi_stat_proc_ops);
+#endif
	}
 
	return 0;
 }
 module_init(psi_proc_init);
+
+#endif /* CONFIG_PROC_FS */
diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h
index b8b4e5b2694e94017e6c4e41451ca8bbb9c0a616..4fc84b0e29450d97cf12062eb956c60aff1210cd 100644
--- a/kernel/sched/stats.h
+++ b/kernel/sched/stats.h
@@ -75,6 +75,14 @@ static inline void rq_sched_info_depart(struct rq *rq, unsigned long long delta)
 # define   schedstat_end_time(rq, t)	do { } while (0)
 #endif /* CONFIG_SCHEDSTATS */
 
+#ifdef CONFIG_QOS_SCHED
+/*
+ * Use QOS_THROTTLED to mark cfs_rq->throttled when a cfs_rq is throttled
+ * by QoS (plain cfs bandwidth throttling sets cfs_rq->throttled to 1).
+ */
+#define QOS_THROTTLED	2
+#endif
+
 #ifdef CONFIG_PSI
 /*
  * PSI tracks state that persists across sleeps, such as iowaits and
@@ -170,6 +178,7 @@ static inline void psi_ttwu_dequeue(struct task_struct *p) {}
 static inline void psi_sched_switch(struct task_struct *prev,
				    struct task_struct *next,
				    bool sleep) {}
+static inline void psi_account_irqtime(struct task_struct *task, u32 delta) {}
 #endif /* CONFIG_PSI */
 
 #ifdef CONFIG_SCHED_INFO
diff --git a/mm/compaction.c b/mm/compaction.c
index a193af836ee6994705d892d271d6eb2544ac062a..bdcde6ea7f97eb572bf87b1bf12ba0d79868dfc4 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -2852,7 +2852,7 @@ static int kcompactd(void *p)
	pgdat->kcompactd_highest_zoneidx = pgdat->nr_zones - 1;
 
	while (!kthread_should_stop()) {
-		unsigned long pflags;
+		unsigned long pflags = 0;
 
		trace_mm_compaction_kcompactd_sleep(pgdat->node_id);
		if (wait_event_freezable_timeout(pgdat->kcompactd_wait,
diff --git a/mm/filemap.c b/mm/filemap.c
index fd4aae06ff150cbee001838854eb95e065695d94..04e4aad7ed67e42541bfd4bbe7ed7fdbb56c1053 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1194,7 +1194,7 @@ static inline int wait_on_page_bit_common(wait_queue_head_t *q,
	wait_queue_entry_t *wait = &wait_page.wait;
	bool thrashing = false;
	bool delayacct = false;
-	unsigned long pflags;
+	unsigned long pflags = 0;
 
	if (bit_nr == PG_locked &&
	    !PageUptodate(page) && PageWorkingset(page)) {
@@ -1351,7 +1351,7 @@ void migration_entry_wait_on_locked(swp_entry_t entry, pte_t *ptep,
	wait_queue_entry_t *wait = &wait_page.wait;
	bool thrashing = false;
	bool delayacct = false;
-	unsigned long pflags;
+	unsigned long pflags = 0;
	wait_queue_head_t *q;
	struct page *page = compound_head(migration_entry_to_page(entry));
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 2da152a09ea3a90956fd9383f18550c19e0f5452..b4607e8e557b8c9491bb675b2e67c2c650e8bd8a 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -110,18 +110,14 @@ static bool do_memsw_account(void)
 #define SOFTLIMIT_EVENTS_TARGET 1024
 
 /*
- * when memcg->high_async_ratio is HIGH_ASYNC_RATIO_DEFAULT, memcg async
+ * memcg warning watermark = memory.high * memcg->high_async_ratio /
+ *   HIGH_ASYNC_RATIO_BASE.
+ * When memcg usage is larger than the warning watermark but smaller than
+ * memory.high, memcg async reclaim starts;
+ * when memcg->high_async_ratio is HIGH_ASYNC_RATIO_BASE, memcg async
  * relcaim is disabled;
- * when mem_usage is larger than memory.high * memcg->high_async_ratio/
- *   HIGH_ASYNC_RATIO_BASE, start async reclaim;
- * if mem_usage is larger than memory.high * (memcg->high_async_ratio -
- * HIGH_ASYNC_RATIO_GAP) / HIGH_ASYNC_RATIO_BASE, the aim reclaim page is
- * the diff of mem_usage and memory.high * (memcg->high_async_ratio -
- * HIGH_ASYNC_RATIO_GAP) / HIGH_ASYNC_RATIO_BASE else the aim reclaim
- * page is MEMCG_CHARGE_BATCH;
- */
+ */
 
-#define HIGH_ASYNC_RATIO_DEFAULT 0
 #define HIGH_ASYNC_RATIO_BASE 100
 #define HIGH_ASYNC_RATIO_GAP 10
 
@@ -2370,15 +2366,13 @@ static int memcg_hotplug_cpu_dead(unsigned int cpu)
 static bool is_high_async_reclaim(struct mem_cgroup *memcg)
 {
	int ratio = READ_ONCE(memcg->high_async_ratio);
+	unsigned long memcg_high = READ_ONCE(memcg->memory.high);
 
-	if (ratio == HIGH_ASYNC_RATIO_DEFAULT)
-		return false;
-
-	if (READ_ONCE(memcg->memory.high) == PAGE_COUNTER_MAX)
+	if (ratio == HIGH_ASYNC_RATIO_BASE || memcg_high == PAGE_COUNTER_MAX)
		return false;
 
	return page_counter_read(&memcg->memory) >
-		(READ_ONCE(memcg->memory.high) * ratio / HIGH_ASYNC_RATIO_BASE);
+		memcg_high * ratio / HIGH_ASYNC_RATIO_BASE;
 }
 
 static unsigned long reclaim_high(struct mem_cgroup *memcg,
@@ -2386,25 +2380,19 @@ static unsigned long reclaim_high(struct mem_cgroup *memcg,
				  gfp_t gfp_mask)
 {
	unsigned long nr_reclaimed = 0;
-	bool high_async_reclaim = READ_ONCE(memcg->high_async_reclaim);
-
-	if (high_async_reclaim)
-		WRITE_ONCE(memcg->high_async_reclaim, false);
 
	do {
		unsigned long pflags;
 
-		if (high_async_reclaim) {
-			if (!is_high_async_reclaim(memcg))
-				continue;
-		} else {
-			if (page_counter_read(&memcg->memory) <=
-			    READ_ONCE(memcg->memory.high))
-				continue;
-		}
+		if (page_counter_read(&memcg->memory) <=
+		    READ_ONCE(memcg->memory.high))
+			continue;
 
		memcg_memory_event(memcg, MEMCG_HIGH);
 
+#ifdef CONFIG_PSI_FINE_GRAINED
+		pflags = PSI_MEMCG_RECLAIM;
+#endif
		psi_memstall_enter(&pflags);
		nr_reclaimed += try_to_free_mem_cgroup_pages(memcg, nr_pages,
							     gfp_mask,
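NOTE (review annotation, not part of the patch): the watermark scheme above
is easiest to see with numbers. A standalone sketch follows, using the
formulas from is_high_async_reclaim() and, below, async_reclaim_high(); all
values are invented.

/* Illustrative arithmetic only; mirrors the two watermarks in this patch. */
#include <stdio.h>

#define HIGH_ASYNC_RATIO_BASE	100
#define HIGH_ASYNC_RATIO_GAP	10

int main(void)
{
	unsigned long high = 100000;	/* memory.high, in pages (invented) */
	int ratio = 80;			/* memcg->high_async_ratio */
	unsigned long warning = high * ratio / HIGH_ASYNC_RATIO_BASE;
	unsigned long safe = high * (ratio - HIGH_ASYNC_RATIO_GAP) /
			     HIGH_ASYNC_RATIO_BASE;
	unsigned long usage = 85000;	/* current usage, above the warning mark */

	/* async reclaim starts above `warning` and aims back down to `safe` */
	printf("warning=%lu safe=%lu reclaim=%lu\n",
	       warning, safe, usage > safe ? usage - safe : 0);
	return 0;	/* prints: warning=80000 safe=70000 reclaim=15000 */
}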
@@ -2416,27 +2404,37 @@ static unsigned long reclaim_high(struct mem_cgroup *memcg,
	return nr_reclaimed;
 }
 
-static unsigned long get_reclaim_pages(struct mem_cgroup *memcg)
+static void async_reclaim_high(struct mem_cgroup *memcg)
 {
-	unsigned long nr_pages = page_counter_read(&memcg->memory);
-	int ratio = READ_ONCE(memcg->high_async_ratio);
-	unsigned long safe_pages;
+	unsigned long nr_pages, pflags;
+	unsigned long memcg_high = READ_ONCE(memcg->memory.high);
+	unsigned long memcg_usage = page_counter_read(&memcg->memory);
+	int ratio = READ_ONCE(memcg->high_async_ratio) - HIGH_ASYNC_RATIO_GAP;
+	unsigned long safe_pages = memcg_high * ratio / HIGH_ASYNC_RATIO_BASE;
 
-	ratio = ratio < HIGH_ASYNC_RATIO_GAP ? 0 : ratio - HIGH_ASYNC_RATIO_GAP;
-	safe_pages = READ_ONCE(memcg->memory.high) * ratio /
-		     HIGH_ASYNC_RATIO_BASE;
+	if (!is_high_async_reclaim(memcg)) {
+		WRITE_ONCE(memcg->high_async_reclaim, false);
+		return;
+	}
 
-	return (nr_pages > safe_pages) ? (nr_pages - safe_pages) :
-	       MEMCG_CHARGE_BATCH;
+#ifdef CONFIG_PSI_FINE_GRAINED
+	pflags = PSI_ASYNC_MEMCG_RECLAIM;
+#endif
+	psi_memstall_enter(&pflags);
+	nr_pages = memcg_usage > safe_pages ? memcg_usage - safe_pages :
+		   MEMCG_CHARGE_BATCH;
+	try_to_free_mem_cgroup_pages(memcg, nr_pages, GFP_KERNEL, true);
+	psi_memstall_leave(&pflags);
+	WRITE_ONCE(memcg->high_async_reclaim, false);
 }
 
 static void high_work_func(struct work_struct *work)
 {
-	struct mem_cgroup *memcg;
+	struct mem_cgroup *memcg = container_of(work, struct mem_cgroup,
+						high_work);
 
-	memcg = container_of(work, struct mem_cgroup, high_work);
-	if (memcg->high_async_reclaim)
-		reclaim_high(memcg, get_reclaim_pages(memcg), GFP_KERNEL);
+	if (READ_ONCE(memcg->high_async_reclaim))
+		async_reclaim_high(memcg);
	else
		reclaim_high(memcg, MEMCG_CHARGE_BATCH, GFP_KERNEL);
 }
@@ -2653,6 +2651,9 @@ void mem_cgroup_handle_over_high(void)
	 * schedule_timeout_killable sets TASK_KILLABLE). This means we don't
	 * need to account for any ill-begotten jiffies to pay them off later.
	 */
+#ifdef CONFIG_PSI_FINE_GRAINED
+	pflags = PSI_MEMCG_RECLAIM;
+#endif
	psi_memstall_enter(&pflags);
	schedule_timeout_killable(penalty_jiffies);
	psi_memstall_leave(&pflags);
@@ -2723,7 +2724,9 @@ static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
		goto nomem;
 
	memcg_memory_event(mem_over_limit, MEMCG_MAX);
-
+#ifdef CONFIG_PSI_FINE_GRAINED
+	pflags = PSI_MEMCG_RECLAIM;
+#endif
	psi_memstall_enter(&pflags);
	nr_reclaimed = try_to_free_mem_cgroup_pages(mem_over_limit, nr_pages,
						    gfp_mask, reclaim_options);
@@ -2825,9 +2828,10 @@ static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
			continue;
		}
 
-		if (is_high_async_reclaim(memcg)) {
+		if (is_high_async_reclaim(memcg) && !mem_high) {
			WRITE_ONCE(memcg->high_async_reclaim, true);
			schedule_work(&memcg->high_work);
+			break;
		}
 
		if (mem_high || swap_high) {
@@ -5737,7 +5741,7 @@ static ssize_t memcg_high_async_ratio_write(struct kernfs_open_file *of,
		return ret;
 
	if (high_async_ratio >= HIGH_ASYNC_RATIO_BASE ||
-	    high_async_ratio < HIGH_ASYNC_RATIO_DEFAULT)
+	    high_async_ratio < HIGH_ASYNC_RATIO_GAP)
		return -EINVAL;
 
	WRITE_ONCE(memcg->high_async_ratio, high_async_ratio);
@@ -6359,7 +6363,7 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
 
	page_counter_set_high(&memcg->memory, PAGE_COUNTER_MAX);
	memcg->soft_limit = PAGE_COUNTER_MAX;
-	memcg->high_async_ratio = HIGH_ASYNC_RATIO_DEFAULT;
+	memcg->high_async_ratio = HIGH_ASYNC_RATIO_BASE;
	page_counter_set_high(&memcg->swap, PAGE_COUNTER_MAX);
	if (parent) {
		memcg->swappiness = mem_cgroup_swappiness(parent);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index f21365c92a98309024ae4809418f52a15d327098..d2a8ec19315124c18c1214a22f94a0fc1b3f63a1 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -4178,6 +4178,9 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
	if (!order)
		return NULL;
 
+#ifdef CONFIG_PSI_FINE_GRAINED
+	pflags = PSI_COMPACT;
+#endif
	psi_memstall_enter(&pflags);
	noreclaim_flag = memalloc_noreclaim_save();
 
@@ -4447,6 +4450,9 @@ __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
	unsigned long pflags;
	bool drained = false;
 
+#ifdef CONFIG_PSI_FINE_GRAINED
+	pflags = PSI_GLOBAL_RECLAIM;
+#endif
	psi_memstall_enter(&pflags);
	*did_some_progress = __perform_reclaim(gfp_mask, order, ac);
	if (unlikely(!(*did_some_progress)))
diff --git a/mm/page_io.c b/mm/page_io.c
index ee28c39e566e48d5e1e72e723209582367790827..78de95b9ef5aa7e61eb5b4cc3d58b332fb6100ec 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -341,6 +341,9 @@ int swap_readpage(struct page *page, bool synchronous)
	 * or the submitting cgroup IO-throttled, submission can be a
	 * significant part of overall IO time.
	 */
+#ifdef CONFIG_PSI_FINE_GRAINED
+	pflags = PSI_SWAP;
+#endif
	psi_memstall_enter(&pflags);
 
	if (frontswap_load(page) == 0) {
diff --git a/mm/vmscan.c b/mm/vmscan.c
index dbd0757dd5a13d021d188c72498e8c5229559499..3d383c7126e3f464554d9a2e2351ad7e6e160450 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -3802,7 +3802,7 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int highest_zoneidx)
	int i;
	unsigned long nr_soft_reclaimed;
	unsigned long nr_soft_scanned;
-	unsigned long pflags;
+	unsigned long pflags = 0;
	unsigned long nr_boost_reclaim;
	unsigned long zone_boosts[MAX_NR_ZONES] = { 0, };
	bool boosted;
@@ -4448,6 +4448,9 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order)
					   sc.gfp_mask);
 
	cond_resched();
+#ifdef CONFIG_PSI_FINE_GRAINED
+	pflags = PSI_GLOBAL_RECLAIM;
+#endif
	psi_memstall_enter(&pflags);
	fs_reclaim_acquire(sc.gfp_mask);
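NOTE (review annotation, not part of the patch): the mm/ call sites in this
series all follow the same convention: the caller preloads pflags with a
psi_memstall_type before psi_memstall_enter(), which (per the
psi_memstall_enter() hunk earlier in this patch) stores it in
current->memstall_type under CONFIG_PSI_FINE_GRAINED. A condensed sketch of
that calling convention follows; my_reclaim_path() is hypothetical, the PSI
calls and the flag value are from the patch.

/* Hypothetical caller showing the annotation pattern used across mm/. */
static void my_reclaim_path(void)
{
	unsigned long pflags = 0;	/* 0 = untyped memstall */

#ifdef CONFIG_PSI_FINE_GRAINED
	pflags = PSI_GLOBAL_RECLAIM;	/* tag the stall source for pressure.stat */
#endif
	psi_memstall_enter(&pflags);
	/* ... do the actual reclaim work ... */
	psi_memstall_leave(&pflags);
}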