From e5bf95d5b9168670dd4fef0b360b3e66361529e6 Mon Sep 17 00:00:00 2001 From: Tianchen Ding Date: Wed, 9 Mar 2022 14:58:23 +0800 Subject: [PATCH 1/6] anolis: sched: introduce ACPU accounting ANBZ: #6729 When SMT is on, tasks will be disturbed by the tasks on their SMT siblings, which makes them run fast at some times and slowly at others. So far, there isn't any way to assess how much disturbance a task has received. To assess the SMT disturbance, we introduce ACPU (assess CPU), which accounts how long the task runs with its SMT sibling idle. The statistical data is shown in /proc/<pid>/sched, row se.statistics.core_sibidle_sum. The data is counted and shown only when kernel.sched_schedstats is on. Co-developed-by: Cruz Zhao Signed-off-by: Tianchen Ding Signed-off-by: Cruz Zhao --- include/linux/kernel_stat.h | 6 +++ include/linux/sched.h | 10 +++++ kernel/sched/core.c | 90 +++++++++++++++++++++++++++++++++++++ kernel/sched/cputime.c | 8 ++++ kernel/sched/debug.c | 3 ++ kernel/sched/sched.h | 6 +++ kernel/smpboot.c | 1 + lib/Kconfig.debug | 7 +++ 8 files changed, 131 insertions(+) diff --git a/include/linux/kernel_stat.h b/include/linux/kernel_stat.h index ca7ac6734c41..90d4b8c4c94d 100644 --- a/include/linux/kernel_stat.h +++ b/include/linux/kernel_stat.h @@ -30,6 +30,9 @@ enum cpu_usage_stat { CPUTIME_GUEST_NICE, #ifdef CONFIG_SCHED_CORE CPUTIME_FORCEIDLE, +#endif +#ifdef CONFIG_SCHED_ACPU + CPUTIME_SIBIDLE, #endif NR_STATS, }; @@ -122,5 +125,8 @@ extern void account_idle_ticks(unsigned long ticks); #ifdef CONFIG_SCHED_CORE extern void __account_forceidle_time(struct task_struct *tsk, u64 delta); #endif +#ifdef CONFIG_SCHED_ACPU +extern void __account_sibidle_time(struct task_struct *tsk, u64 delta); +#endif #endif /* _LINUX_KERNEL_STAT_H */ diff --git a/include/linux/sched.h b/include/linux/sched.h index 5b04709f47b2..260568830e3d 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -500,6 +500,10 @@ struct sched_statistics { u64 core_forceidle_sum; #endif +#ifdef CONFIG_SCHED_ACPU + u64 core_sibidle_sum; +#endif + CK_KABI_RESERVE(1) CK_KABI_RESERVE(2) CK_KABI_RESERVE(3) @@ -2347,4 +2351,10 @@ static inline void sched_core_fork(struct task_struct *p) { } static inline int sched_core_idle_cpu(int cpu) { return idle_cpu(cpu); } #endif +#ifdef CONFIG_SCHED_ACPU +extern void acpu_enable(void); +#else +static inline void acpu_enable(void) { } +#endif + #endif diff --git a/kernel/sched/core.c b/kernel/sched/core.c index a55f28d876d0..ae32d0780f26 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -77,6 +77,10 @@ unsigned int sysctl_sched_cfs_bw_burst_onset_percent; unsigned int sysctl_sched_cfs_bw_burst_enabled = 1; #endif +#ifdef CONFIG_SCHED_ACPU +DEFINE_STATIC_KEY_FALSE(acpu_enabled); +#endif + /* * period over which we measure -rt task CPU usage in us. * default: 1s */ @@ -3913,6 +3917,84 @@ fire_sched_out_preempt_notifiers(struct task_struct *curr, #endif /* CONFIG_PREEMPT_NOTIFIERS */ +#ifdef CONFIG_SCHED_ACPU +void acpu_enable(void) +{ + int i; + + for_each_possible_cpu(i) { + struct rq *rq = cpu_rq(i); + + /* It may not be that accurate, but useful enough. 
*/ + rq->last_acpu_update_time = rq->clock; + } + static_branch_enable(&acpu_enabled); +} + +static void update_acpu(struct rq *rq, struct task_struct *prev, struct task_struct *next) +{ + const int cpu = cpu_of(rq); + const struct cpumask *smt_mask = cpu_smt_mask(cpu); + u64 now = rq_clock(rq); + u64 sibidle_sum, last_update_time; + s64 delta, last; + int i; + + if (!static_branch_likely(&acpu_enabled) || !schedstat_enabled()) + return; + + /* Update idle sum and busy sum for current rq. */ + delta = now - rq->last_acpu_update_time; + if (prev == rq->idle) + rq->acpu_idle_sum += delta; + + /* + * Be careful, smt_mask may be NULL. + * We only consider the case of two SMT siblings at this stage. + */ + if (unlikely(!smt_mask) || unlikely(cpumask_weight(smt_mask) != 2)) + goto out; + + for_each_cpu(i, smt_mask) { + if (i != cpu) { + struct rq *rq_i = cpu_rq(i); + struct task_struct *curr_i = rq_i->curr; + + last = (s64)(rq->last_acpu_update_time - + rq_i->last_acpu_update_time); + last_update_time = last >= 0 ? rq->last_acpu_update_time : + rq_i->last_acpu_update_time; + /* + * Sibling may update acpu at the same time, and its + * timestamp may be newer than this rq's. + */ + delta = now - last_update_time; + delta = delta > 0 ? delta : 0; + + /* Add the delta to improve accuracy. */ + sibidle_sum = last >= 0 ? rq->sibidle_sum : rq_i->acpu_idle_sum; + if (curr_i == rq_i->idle) + sibidle_sum += delta; + } + } + + if (prev != rq->idle) { + delta = sibidle_sum - rq->sibidle_sum; + delta = delta > 0 ? delta : 0; + __account_sibidle_time(prev, delta); + } + + if (next != rq->idle) + rq->sibidle_sum = sibidle_sum; +out: + rq->last_acpu_update_time = now; +} +#else +static inline void update_acpu(struct rq *rq, struct task_struct *prev, struct task_struct *next) +{ +} +#endif /* CONFIG_SCHED_ACPU */ + static inline void prepare_task(struct task_struct *next) { #ifdef CONFIG_SMP @@ -4004,6 +4086,7 @@ prepare_task_switch(struct rq *rq, struct task_struct *prev, { kcov_prepare_switch(prev); sched_info_switch(rq, prev, next); + update_acpu(rq, prev, next); perf_event_task_sched_out(prev, next); rseq_preempt(prev); fire_sched_out_preempt_notifiers(prev, next); @@ -4456,6 +4539,7 @@ void scheduler_tick(void) thermal_pressure = arch_scale_thermal_pressure(cpu_of(rq)); update_thermal_load_avg(rq_clock_thermal(rq), rq, thermal_pressure); curr->sched_class->task_tick(rq, curr, 0); + update_acpu(rq, curr, curr); calc_global_load_tick(rq); psi_task_tick(rq); sched_core_tick(rq); @@ -8171,6 +8255,12 @@ void __init sched_init(void) rq_csd_init(rq, &rq->nohz_csd, nohz_csd_func); #endif #endif /* CONFIG_SMP */ + +#ifdef CONFIG_SCHED_ACPU + rq->acpu_idle_sum = 0; + rq->sibidle_sum = 0; + rq->last_acpu_update_time = rq->clock; +#endif hrtick_rq_init(rq); atomic_set(&rq->nr_iowait, 0); #if defined(CONFIG_GROUP_IDENTITY) && defined(CONFIG_SCHED_SMT) diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 48a0a4fa02ef..56b446412467 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -238,6 +238,14 @@ void __account_forceidle_time(struct task_struct *p, u64 delta) task_group_account_field(p, CPUTIME_FORCEIDLE, delta); } #endif +#ifdef CONFIG_SCHED_ACPU +void __account_sibidle_time(struct task_struct *p, u64 delta) +{ + __schedstat_add(p->se.statistics.core_sibidle_sum, delta); + + task_group_account_field(p, CPUTIME_SIBIDLE, delta); +} +#endif /* * When a guest is interrupted for a longer amount of time, missed clock diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 
9d30ea76022e..3e1fd969ff9b 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -1025,6 +1025,9 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) #ifdef CONFIG_SCHED_CORE PN_SCHEDSTAT(se.statistics.core_forceidle_sum); +#endif +#ifdef CONFIG_SCHED_ACPU + PN_SCHEDSTAT(se.statistics.core_sibidle_sum); #endif } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 448c3fd61680..c27595620fb6 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1319,6 +1319,12 @@ struct rq { struct task_struct *force_idled_core_pick; #endif +#ifdef CONFIG_SCHED_ACPU + u64 acpu_idle_sum; + u64 sibidle_sum; + u64 last_acpu_update_time; +#endif + CK_KABI_RESERVE(1) CK_KABI_RESERVE(2) CK_KABI_RESERVE(3) diff --git a/kernel/smpboot.c b/kernel/smpboot.c index e4163042c4d6..cd94ea587d3e 100644 --- a/kernel/smpboot.c +++ b/kernel/smpboot.c @@ -73,6 +73,7 @@ void __init idle_threads_init(void) if (cpu != boot_cpu) idle_init(cpu); } + acpu_enable(); } #endif diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 7c63c6787adf..6684c5a2e71f 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -1132,6 +1132,13 @@ config SCHEDSTATS_HOST this config. If Y here, the default value of it is 1, and if N, the value is 0. +config SCHED_ACPU + bool "ACPU info: account idle time of smt to task" + depends on DEBUG_KERNEL && PROC_FS && SMP && SCHED_SMT + default y + help + Add ACPU info in /proc/<pid>/sched. + endmenu config DEBUG_TIMEKEEPING -- Gitee From bf3d3eb3483fe1594772dc47bb84199d412b3492 Mon Sep 17 00:00:00 2001 From: Cruz Zhao Date: Tue, 19 Sep 2023 17:22:19 +0800 Subject: [PATCH 2/6] anolis: sched: introduce sysctl_sched_acpu_enabled ANBZ: #6729 To allow ACPU accounting to be turned on and off dynamically, we introduce sysctl_sched_acpu_enabled instead of enabling it by default. 
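For example, the accounting can be toggled at run time from user space roughly as follows (an illustrative sketch, not part of this patch; it only assumes the procnames above, i.e. /proc/sys/kernel/sched_acpu and the existing /proc/sys/kernel/sched_schedstats):

	#include <fcntl.h>
	#include <stdio.h>
	#include <unistd.h>

	/* Write a single-character value such as "1" to a sysctl file. */
	static int write_sysctl(const char *path, const char *val)
	{
		int fd = open(path, O_WRONLY);

		if (fd < 0)
			return -1;
		if (write(fd, val, 1) != 1) {
			close(fd);
			return -1;
		}
		return close(fd);
	}

	int main(void)
	{
		/* ACPU data is only collected while schedstats is also on. */
		if (write_sysctl("/proc/sys/kernel/sched_schedstats", "1"))
			perror("sched_schedstats");
		if (write_sysctl("/proc/sys/kernel/sched_acpu", "1"))
			perror("sched_acpu");
		return 0;
	}
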
Signed-off-by: Cruz Zhao --- include/linux/sched.h | 6 ------ include/linux/sched/sysctl.h | 6 ++++++ kernel/sched/core.c | 32 +++++++++++++++++++++++++++++++- kernel/smpboot.c | 1 - kernel/sysctl.c | 11 +++++++++++ 5 files changed, 48 insertions(+), 8 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index 260568830e3d..16eebd11f3ae 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2351,10 +2351,4 @@ static inline void sched_core_fork(struct task_struct *p) { } static inline int sched_core_idle_cpu(int cpu) { return idle_cpu(cpu); } #endif -#ifdef CONFIG_SCHED_ACPU -extern void acpu_enable(void); -#else -static inline void acpu_enable(void) { } -#endif - #endif diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h index e6dbcbd03b8d..9fcea8ed4068 100644 --- a/include/linux/sched/sysctl.h +++ b/include/linux/sched/sysctl.h @@ -119,4 +119,10 @@ int sched_energy_aware_handler(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos); #endif +#ifdef CONFIG_SCHED_ACPU +extern unsigned int sysctl_sched_acpu_enabled; +extern int sched_acpu_enable_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos); +#endif #endif /* _LINUX_SCHED_SYSCTL_H */ diff --git a/kernel/sched/core.c b/kernel/sched/core.c index ae32d0780f26..419e0ae7d0d6 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -79,6 +79,7 @@ unsigned int sysctl_sched_cfs_bw_burst_enabled = 1; #ifdef CONFIG_SCHED_ACPU DEFINE_STATIC_KEY_FALSE(acpu_enabled); +unsigned int sysctl_sched_acpu_enabled; #endif /* @@ -3918,7 +3919,7 @@ fire_sched_out_preempt_notifiers(struct task_struct *curr, #endif /* CONFIG_PREEMPT_NOTIFIERS */ #ifdef CONFIG_SCHED_ACPU -void acpu_enable(void) +static void acpu_enable(void) { int i; @@ -3931,6 +3932,35 @@ void acpu_enable(void) static_branch_enable(&acpu_enabled); } +static void acpu_disable(void) +{ + static_branch_disable(&acpu_enabled); +} + +int sched_acpu_enable_handler(struct ctl_table *table, int write, void __user *buffer, + size_t *lenp, loff_t *ppos) +{ + int ret; + unsigned int old, new; + + if (!write) { + ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); + return ret; + } + + old = sysctl_sched_acpu_enabled; + ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); + new = sysctl_sched_acpu_enabled; + if (!ret && write && (old != new)) { + if (new) + acpu_enable(); + else + acpu_disable(); + } + + return ret; +} + static void update_acpu(struct rq *rq, struct task_struct *prev, struct task_struct *next) { const int cpu = cpu_of(rq); diff --git a/kernel/smpboot.c b/kernel/smpboot.c index cd94ea587d3e..e4163042c4d6 100644 --- a/kernel/smpboot.c +++ b/kernel/smpboot.c @@ -73,7 +73,6 @@ void __init idle_threads_init(void) if (cpu != boot_cpu) idle_init(cpu); } - acpu_enable(); } #endif diff --git a/kernel/sysctl.c b/kernel/sysctl.c index ec38014b9ac1..1fbce664ea31 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -2038,6 +2038,17 @@ static struct ctl_table kern_table[] = { .extra2 = SYSCTL_ONE, }, #endif +#ifdef CONFIG_SCHED_ACPU + { + .procname = "sched_acpu", + .data = &sysctl_sched_acpu_enabled, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = sched_acpu_enable_handler, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, +#endif /* CONFIG_SCHED_ACPU*/ #ifdef CONFIG_PROVE_LOCKING { .procname = "prove_locking", -- Gitee From 74c97b025e5af384395ed4d90711f9f971077d74 Mon Sep 17 00:00:00 2001 From: Cruz Zhao Date: Sat, 30 Sep 2023 
14:52:52 +0800 Subject: [PATCH 3/6] anolis: sched: account sibidle for core scheduling ANBZ: #6729 Account sibling idle time for core scheduling, which is the time a cookie'd task runs while its SMT sibling is idle, including sibling forced idle time and sibling real idle time, collectively called sibidle. A few details: - For SMT > 2, we scale the amount of idle charged based on the number of idle siblings in function __sched_core_account_sibidle(). Additionally, we split the time up and evenly charge it to all running tasks, as each is equally responsible for the idle. - When core sched is enabled and the sibidle count is not zero, we account sibidle in function __sched_core_account_sibidle(), otherwise in function update_acpu(). Signed-off-by: Cruz Zhao --- include/linux/kernel_stat.h | 9 +++---- include/linux/sched.h | 2 +- kernel/sched/core.c | 50 ++++++++++++++++++++++++------------- kernel/sched/core_sched.c | 38 +++++++++++++++++----------- kernel/sched/cputime.c | 19 ++++++-------- kernel/sched/sched.h | 18 ++++++------- 6 files changed, 74 insertions(+), 62 deletions(-) diff --git a/include/linux/kernel_stat.h b/include/linux/kernel_stat.h index 90d4b8c4c94d..3b3a4467e006 100644 --- a/include/linux/kernel_stat.h +++ b/include/linux/kernel_stat.h @@ -31,7 +31,7 @@ enum cpu_usage_stat { #ifdef CONFIG_SCHED_CORE CPUTIME_FORCEIDLE, #endif -#ifdef CONFIG_SCHED_ACPU +#if defined(CONFIG_SCHED_ACPU) || defined(CONFIG_SCHED_CORE) CPUTIME_SIBIDLE, #endif NR_STATS, @@ -122,11 +122,8 @@ extern void account_process_tick(struct task_struct *, int user); extern void account_idle_ticks(unsigned long ticks); -#ifdef CONFIG_SCHED_CORE -extern void __account_forceidle_time(struct task_struct *tsk, u64 delta); -#endif -#ifdef CONFIG_SCHED_ACPU -extern void __account_sibidle_time(struct task_struct *tsk, u64 delta); +#if defined(CONFIG_SCHED_ACPU) || defined(CONFIG_SCHED_CORE) +extern void __account_sibidle_time(struct task_struct *tsk, u64 delta, bool fi); #endif #endif /* _LINUX_KERNEL_STAT_H */ diff --git a/include/linux/sched.h b/include/linux/sched.h index 16eebd11f3ae..9dcfe12be015 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -500,7 +500,7 @@ struct sched_statistics { u64 core_forceidle_sum; #endif -#ifdef CONFIG_SCHED_ACPU +#if defined(CONFIG_SCHED_ACPU) || defined(CONFIG_SCHED_CORE) u64 core_sibidle_sum; #endif diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 419e0ae7d0d6..f803bef95640 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -305,7 +305,7 @@ static void __sched_core_flip(bool enabled) for_each_cpu(t, smt_mask) cpu_rq(t)->core_enabled = enabled; - cpu_rq(cpu)->core->core_forceidle_start = 0; + cpu_rq(cpu)->core->core_sibidle_start = 0; sched_core_unlock(cpu, &flags); @@ -3973,6 +3973,15 @@ static void update_acpu(struct rq *rq, struct task_struct *prev, struct task_str if (!static_branch_likely(&acpu_enabled) || !schedstat_enabled()) return; + /* + * If core sched is enabled and core_sibidle_count is not zero, we update sibidle + * time in function __sched_core_account_sibidle(). + */ +#ifdef CONFIG_SCHED_CORE + if (rq->core->core_sibidle_count) + goto out; +#endif + /* Update idle sum and busy sum for current rq. */ delta = now - rq->last_acpu_update_time; if (prev == rq->idle) @@ -4011,7 +4020,7 @@ static void update_acpu(struct rq *rq, struct task_struct *prev, struct task_str if (prev != rq->idle) { delta = sibidle_sum - rq->sibidle_sum; delta = delta > 0 ? 
delta : 0; - __account_sibidle_time(prev, delta); + __account_sibidle_time(prev, delta, false); } if (next != rq->idle) @@ -5036,18 +5045,21 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) /* reset state */ rq->core->core_cookie = 0UL; - if (rq->core->core_forceidle_count) { + if (rq->core->core_sibidle_count) { if (!core_clock_updated) { update_rq_clock(rq->core); core_clock_updated = true; } - sched_core_account_forceidle(rq); + sched_core_account_sibidle(rq); /* reset after accounting force idle */ - rq->core->core_forceidle_start = 0; - rq->core->core_forceidle_count = 0; - rq->core->core_forceidle_occupation = 0; - need_sync = true; - fi_before = true; + rq->core->core_sibidle_start = 0; + rq->core->core_sibidle_count = 0; + rq->core->core_sibidle_occupation = 0; + if (rq->core->core_forceidle_count) { + rq->core->core_forceidle_count = 0; + need_sync = true; + fi_before = true; + } } /* @@ -5123,6 +5135,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) rq_i->core_pick = p; if (p == rq_i->idle) { + rq->core->core_sibidle_count++; if (rq_i->nr_running) { rq->core->core_forceidle_count++; if (!fi_before) @@ -5133,9 +5146,9 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) } } - if (schedstat_enabled() && rq->core->core_forceidle_count) { - rq->core->core_forceidle_start = rq_clock(rq->core); - rq->core->core_forceidle_occupation = occ; + if (schedstat_enabled() && rq->core->core_sibidle_count) { + rq->core->core_sibidle_start = rq_clock(rq->core); + rq->core->core_sibidle_occupation = occ; } rq->core->core_pick_seq = rq->core->core_task_seq; @@ -5177,7 +5190,8 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) if (!(fi_before && rq->core->core_forceidle_count)) task_vruntime_update(rq_i, rq_i->core_pick, !!rq->core->core_forceidle_count); - rq_i->core_pick->core_occupation = occ; + if (rq->core->core_forceidle_count) + rq_i->core_pick->core_occupation = occ; if (i == cpu) { rq_i->core_pick = NULL; @@ -5397,14 +5411,15 @@ static void sched_core_cpu_deactivate(unsigned int cpu) core_rq->core_cookie = rq->core_cookie; core_rq->core_forceidle_count = rq->core_forceidle_count; core_rq->core_forceidle_seq = rq->core_forceidle_seq; - core_rq->core_forceidle_occupation = rq->core_forceidle_occupation; + core_rq->core_sibidle_occupation = rq->core_sibidle_occupation; + core_rq->core_sibidle_count = rq->core_sibidle_count; /* * Accounting edge for forced idle is handled in pick_next_task(). * Don't need another one here, since the hotplug thread shouldn't * have a cookie. */ - core_rq->core_forceidle_start = 0; + core_rq->core_sibidle_start = 0; /* install new leader */ for_each_cpu(t, smt_mask) { @@ -8305,8 +8320,9 @@ void __init sched_init(void) rq->core_enabled = 0; rq->core_tree = RB_ROOT; rq->core_forceidle_count = 0; - rq->core_forceidle_occupation = 0; - rq->core_forceidle_start = 0; + rq->core_sibidle_count = 0; + rq->core_sibidle_occupation = 0; + rq->core_sibidle_start = 0; rq->core_cookie = 0UL; #endif diff --git a/kernel/sched/core_sched.c b/kernel/sched/core_sched.c index 18c5b298b481..19654cc5c113 100644 --- a/kernel/sched/core_sched.c +++ b/kernel/sched/core_sched.c @@ -304,7 +304,7 @@ int sysctl_sched_core_handler(struct ctl_table *table, int write, #ifdef CONFIG_SCHEDSTATS /* REQUIRES: rq->core's clock recently updated. 
*/ -void __sched_core_account_forceidle(struct rq *rq) +void __sched_core_account_sibidle(struct rq *rq) { const struct cpumask *smt_mask = cpu_smt_mask(cpu_of(rq)); u64 delta, now = rq_clock(rq->core); @@ -314,28 +314,28 @@ void __sched_core_account_forceidle(struct rq *rq) lockdep_assert_rq_held(rq); - WARN_ON_ONCE(!rq->core->core_forceidle_count); + WARN_ON_ONCE(!rq->core->core_sibidle_count); - if (rq->core->core_forceidle_start == 0) - return; + if (rq->core->core_sibidle_start == 0) + goto out; - delta = now - rq->core->core_forceidle_start; + delta = now - rq->core->core_sibidle_start; if (unlikely((s64)delta <= 0)) - return; + goto out; - rq->core->core_forceidle_start = now; + rq->core->core_sibidle_start = now; - if (WARN_ON_ONCE(!rq->core->core_forceidle_occupation)) { + if (WARN_ON_ONCE(!rq->core->core_sibidle_occupation)) { /* can't be forced idle without a running task */ - } else if (rq->core->core_forceidle_count > 1 || - rq->core->core_forceidle_occupation > 1) { + } else if (rq->core->core_sibidle_count > 1 || + rq->core->core_sibidle_occupation > 1) { /* * For larger SMT configurations, we need to scale the charged * forced idle amount since there can be more than one forced * idle sibling and more than one running cookied task. */ - delta *= rq->core->core_forceidle_count; - delta = div_u64(delta, rq->core->core_forceidle_occupation); + delta *= rq->core->core_sibidle_count; + delta = div_u64(delta, rq->core->core_sibidle_occupation); } for_each_cpu(i, smt_mask) { @@ -349,19 +349,27 @@ void __sched_core_account_forceidle(struct rq *rq) * Note: this will account forceidle to the current cpu, even * if it comes from our SMT sibling. */ - __account_forceidle_time(p, delta); + __account_sibidle_time(p, delta, !!rq->core->core_forceidle_count); + } + +out: +#ifdef CONFIG_SCHED_ACPU + for_each_cpu(i, smt_mask) { + rq_i = cpu_rq(i); + rq_i->last_acpu_update_time = now; } +#endif } void __sched_core_tick(struct rq *rq) { - if (!rq->core->core_forceidle_count) + if (!rq->core->core_sibidle_count) return; if (rq != rq->core) update_rq_clock(rq->core); - __sched_core_account_forceidle(rq); + __sched_core_account_sibidle(rq); } #endif /* CONFIG_SCHEDSTATS */ diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 56b446412467..8c7221be42db 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -225,25 +225,20 @@ void account_idle_time(u64 cputime) } -#ifdef CONFIG_SCHED_CORE +#if defined(CONFIG_SCHED_ACPU) || defined(CONFIG_SCHED_CORE) /* - * Account for forceidle time due to core scheduling. + * Account for sibidle, and for forceidle time due to core scheduling. * * REQUIRES: schedstat is enabled. 
*/ -void __account_forceidle_time(struct task_struct *p, u64 delta) -{ - __schedstat_add(p->se.statistics.core_forceidle_sum, delta); - - task_group_account_field(p, CPUTIME_FORCEIDLE, delta); -} -#endif -#ifdef CONFIG_SCHED_ACPU -void __account_sibidle_time(struct task_struct *p, u64 delta) +void __account_sibidle_time(struct task_struct *p, u64 delta, bool fi) { __schedstat_add(p->se.statistics.core_sibidle_sum, delta); - task_group_account_field(p, CPUTIME_SIBIDLE, delta); + if (fi) { + __schedstat_add(p->se.statistics.core_forceidle_sum, delta); + task_group_account_field(p, CPUTIME_FORCEIDLE, delta); + } } #endif diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index c27595620fb6..a8a13adbf7af 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1307,15 +1307,11 @@ struct rq { unsigned long core_cookie; unsigned int core_forceidle_count; unsigned int core_forceidle_seq; - unsigned int core_forceidle_occupation; - u64 core_forceidle_start; + unsigned int core_sibidle_occupation; + u64 core_sibidle_start; unsigned int core_id; - unsigned int core_realidle_count; - unsigned int core_realidle_occupation; - u64 core_realidle_start; - u64 rq_realidle_time; + unsigned int core_sibidle_count; bool in_forceidle; - bool in_realidle; struct task_struct *force_idled_core_pick; #endif @@ -2102,12 +2098,12 @@ static inline void flush_smp_call_function_from_idle(void) { } #if defined(CONFIG_SCHED_CORE) && defined(CONFIG_SCHEDSTATS) -extern void __sched_core_account_forceidle(struct rq *rq); +extern void __sched_core_account_sibidle(struct rq *rq); -static inline void sched_core_account_forceidle(struct rq *rq) +static inline void sched_core_account_sibidle(struct rq *rq) { if (schedstat_enabled()) - __sched_core_account_forceidle(rq); + __sched_core_account_sibidle(rq); } extern void __sched_core_tick(struct rq *rq); @@ -2120,7 +2116,7 @@ static inline void sched_core_tick(struct rq *rq) #else -static inline void sched_core_account_forceidle(struct rq *rq) {} +static inline void sched_core_account_sibidle(struct rq *rq) {} static inline void sched_core_tick(struct rq *rq) {} -- Gitee From e615948eabcd632b5f4028547790afc1a5fc5663 Mon Sep 17 00:00:00 2001 From: Cruz Zhao Date: Mon, 18 Sep 2023 10:24:20 +0800 Subject: [PATCH 4/6] anolis: sched: introduce per cgroup sibidle accounting ANBZ: #6729 This patch extends per-task sibidle accounting into cgroups. rstat is used for cgroup accounting, except for the root, which uses kcpustat in order to bypass the need for doing an rstat flush when reading root stats. Data is displayed via /sys/fs/cgroup/cpu/<cgroup>/cpu.stat, row sibidle_usec. Similar to the task accounting, the cgroup accounting requires that schedstats is enabled. 
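For example, the new row can be read back with a small user-space sketch like the following (illustrative only, not part of this patch; the cgroup name "mygroup" is an assumption, and the row name matches the sibidle_usec line printed by cgroup_base_stat_cputime_show() below):

	#include <stdio.h>

	/* Print the sibidle_usec row of a cgroup's cpu.stat. */
	int main(void)
	{
		char line[256];
		FILE *fp = fopen("/sys/fs/cgroup/cpu/mygroup/cpu.stat", "r");

		if (!fp) {
			perror("cpu.stat");
			return 1;
		}
		while (fgets(line, sizeof(line), fp)) {
			unsigned long long usec;

			if (sscanf(line, "sibidle_usec %llu", &usec) == 1)
				printf("sibidle: %llu usec\n", usec);
		}
		fclose(fp);
		return 0;
	}
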
Signed-off-by: Cruz Zhao --- include/linux/cgroup-defs.h | 3 +++ kernel/cgroup/rstat.c | 29 +++++++++++++++++++++++++++++ 2 files changed, 32 insertions(+) diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index 24884a4626c9..d702ed7d2268 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -323,6 +323,9 @@ struct cgroup_base_stat { #ifdef CONFIG_SCHED_CORE u64 forceidle_sum; #endif +#if defined(CONFIG_SCHED_ACPU) || defined(CONFIG_SCHED_CORE) + u64 sibidle_sum; +#endif }; /* diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c index 80bf45524200..cba6fdc3545d 100644 --- a/kernel/cgroup/rstat.c +++ b/kernel/cgroup/rstat.c @@ -302,6 +302,9 @@ static void cgroup_base_stat_add(struct cgroup_base_stat *dst_bstat, #ifdef CONFIG_SCHED_CORE dst_bstat->forceidle_sum += src_bstat->forceidle_sum; #endif +#if defined(CONFIG_SCHED_ACPU) || defined(CONFIG_SCHED_CORE) + dst_bstat->sibidle_sum += src_bstat->sibidle_sum; +#endif } static void cgroup_base_stat_sub(struct cgroup_base_stat *dst_bstat, @@ -313,6 +316,9 @@ static void cgroup_base_stat_sub(struct cgroup_base_stat *dst_bstat, #ifdef CONFIG_SCHED_CORE dst_bstat->forceidle_sum -= src_bstat->forceidle_sum; #endif +#if defined(CONFIG_SCHED_ACPU) || defined(CONFIG_SCHED_CORE) + dst_bstat->sibidle_sum -= src_bstat->sibidle_sum; +#endif } static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu) @@ -391,6 +397,11 @@ void __cgroup_account_cputime_field(struct cgroup *cgrp, case CPUTIME_FORCEIDLE: rstatc->bstat.forceidle_sum += delta_exec; break; +#endif +#if defined(CONFIG_SCHED_ACPU) || defined(CONFIG_SCHED_CORE) + case CPUTIME_SIBIDLE: + rstatc->bstat.sibidle_sum += delta_exec; + break; #endif default: break; @@ -434,6 +445,9 @@ static void root_cgroup_cputime(struct cgroup_base_stat *bstat) #ifdef CONFIG_SCHED_CORE bstat->forceidle_sum += cpustat[CPUTIME_FORCEIDLE]; +#endif +#if defined(CONFIG_SCHED_ACPU) || defined(CONFIG_SCHED_CORE) + bstat->sibidle_sum += cpustat[CPUTIME_SIBIDLE]; #endif } } @@ -446,6 +460,9 @@ void cgroup_base_stat_cputime_show(struct seq_file *seq) #ifdef CONFIG_SCHED_CORE u64 forceidle_time; #endif +#if defined(CONFIG_SCHED_ACPU) || defined(CONFIG_SCHED_CORE) + u64 sibidle_time; +#endif if (cgroup_parent(cgrp)) { cgroup_rstat_flush_hold(cgrp); @@ -454,6 +471,9 @@ void cgroup_base_stat_cputime_show(struct seq_file *seq) &utime, &stime); #ifdef CONFIG_SCHED_CORE forceidle_time = cgrp->bstat.forceidle_sum; +#endif +#if defined(CONFIG_SCHED_ACPU) || defined(CONFIG_SCHED_CORE) + sibidle_time = cgrp->bstat.sibidle_sum; #endif cgroup_rstat_flush_release(); } else { @@ -463,6 +483,9 @@ void cgroup_base_stat_cputime_show(struct seq_file *seq) stime = bstat.cputime.stime; #ifdef CONFIG_SCHED_CORE forceidle_time = bstat.forceidle_sum; +#endif +#if defined(CONFIG_SCHED_ACPU) || defined(CONFIG_SCHED_CORE) + sibidle_time = bstat.sibidle_sum; #endif } @@ -472,6 +495,9 @@ void cgroup_base_stat_cputime_show(struct seq_file *seq) #ifdef CONFIG_SCHED_CORE do_div(forceidle_time, NSEC_PER_USEC); #endif +#if defined(CONFIG_SCHED_ACPU) || defined(CONFIG_SCHED_CORE) + do_div(sibidle_time, NSEC_PER_USEC); +#endif seq_printf(seq, "usage_usec %llu\n" "user_usec %llu\n" @@ -481,4 +507,7 @@ void cgroup_base_stat_cputime_show(struct seq_file *seq) #ifdef CONFIG_SCHED_CORE seq_printf(seq, "core_sched.force_idle_usec %llu\n", forceidle_time); #endif +#if defined(CONFIG_SCHED_ACPU) || defined(CONFIG_SCHED_CORE) + seq_printf(seq, "sibidle_usec %llu\n", sibidle_time); +#endif } -- Gitee From 
e815f6e37203b469c3129818443f3e226bfe34ac Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 29 Apr 2021 22:56:20 -0700 Subject: [PATCH 5/6] cgroup: rstat: support cgroup1 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ANBZ: #6729 commit a7df69b81aac5bdeb5c5aef9addd680ce22feebf upstream. Rstat currently only supports the default hierarchy in cgroup2. In order to replace memcg's private stats infrastructure - used in both cgroup1 and cgroup2 - with rstat, the latter needs to support cgroup1. The initialization and destruction callbacks for regular cgroups are already in place. Remove the cgroup_on_dfl() guards to handle cgroup1. The initialization of the root cgroup is currently hardcoded to only handle cgrp_dfl_root.cgrp. Move those callbacks to cgroup_setup_root() and cgroup_destroy_root() to handle the default root as well as the various cgroup1 roots we may set up during mounting. The linking of css to cgroups happens in code shared between cgroup1 and cgroup2 as well. Simply remove the cgroup_on_dfl() guard. Linkage of the root css to the root cgroup is a bit trickier: per default, the root css of a subsystem controller belongs to the default hierarchy (i.e. the cgroup2 root). When a controller is mounted in its cgroup1 version, the root css is stolen and moved to the cgroup1 root; on unmount, the css moves back to the default hierarchy. Annotate rebind_subsystems() to move the root css linkage along between roots. Link: https://lkml.kernel.org/r/20210209163304.77088-5-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Reviewed-by: Roman Gushchin Reviewed-by: Shakeel Butt Acked-by: Tejun Heo Reviewed-by: Michal Koutný Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Cruz Zhao --- kernel/cgroup/cgroup.c | 34 +++++++++++++++++++++------------- kernel/cgroup/rstat.c | 2 -- 2 files changed, 21 insertions(+), 15 deletions(-) diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 5b1fdfd2fd4c..7fb89841ab57 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -1351,6 +1351,7 @@ static void cgroup_destroy_root(struct cgroup_root *root) mutex_unlock(&cgroup_mutex); + cgroup_rstat_exit(cgrp); kernfs_destroy_root(root->kf_root); cgroup_free_root(root); } @@ -1786,6 +1787,12 @@ int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask) &dcgrp->e_csets[ss->id]); spin_unlock_irq(&css_set_lock); + if (ss->css_rstat_flush) { + list_del_rcu(&css->rstat_css_node); + list_add_rcu(&css->rstat_css_node, + &dcgrp->rstat_css_list); + } + /* default hierarchy doesn't enable controllers by default */ dst_root->subsys_mask |= 1 << ssid; if (dst_root == &cgrp_dfl_root) { @@ -2010,10 +2017,14 @@ int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask) if (ret) goto destroy_root; - ret = rebind_subsystems(root, ss_mask); + ret = cgroup_rstat_init(root_cgrp); if (ret) goto destroy_root; + ret = rebind_subsystems(root, ss_mask); + if (ret) + goto exit_stats; + ret = cgroup_bpf_inherit(root_cgrp); WARN_ON_ONCE(ret); @@ -2045,6 +2056,8 @@ int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask) ret = 0; goto out; +exit_stats: + cgroup_rstat_exit(root_cgrp); destroy_root: kernfs_destroy_root(root->kf_root); root->kf_root = NULL; @@ -5119,8 +5132,7 @@ static void css_free_rwork_fn(struct work_struct *work) cgroup_put(cgroup_parent(cgrp)); kernfs_put(cgrp->kn); psi_cgroup_free(cgrp); - if (cgroup_on_dfl(cgrp)) - cgroup_rstat_exit(cgrp); + cgroup_rstat_exit(cgrp); kfree(cgrp); } else { /* @@ 
-5161,8 +5173,7 @@ static void css_release_work_fn(struct work_struct *work) /* cgroup release path */ TRACE_CGROUP_PATH(release, cgrp); - if (cgroup_on_dfl(cgrp)) - cgroup_rstat_flush(cgrp); + cgroup_rstat_flush(cgrp); spin_lock_irq(&css_set_lock); for (tcgrp = cgroup_parent(cgrp); tcgrp; @@ -5219,7 +5230,7 @@ static void init_and_link_css(struct cgroup_subsys_state *css, css_get(css->parent); } - if (cgroup_on_dfl(cgrp) && ss->css_rstat_flush) + if (ss->css_rstat_flush) list_add_rcu(&css->rstat_css_node, &cgrp->rstat_css_list); BUG_ON(cgroup_css(cgrp, ss)); @@ -5353,11 +5364,9 @@ static struct cgroup *cgroup_create(struct cgroup *parent, const char *name, if (ret) goto out_free_cgrp; - if (cgroup_on_dfl(parent)) { - ret = cgroup_rstat_init(cgrp); - if (ret) - goto out_cancel_ref; - } + ret = cgroup_rstat_init(cgrp); + if (ret) + goto out_cancel_ref; /* create the directory */ kn = kernfs_create_dir(parent->kn, name, mode, cgrp); @@ -5444,8 +5453,7 @@ static struct cgroup *cgroup_create(struct cgroup *parent, const char *name, out_kernfs_remove: kernfs_remove(cgrp->kn); out_stat_exit: - if (cgroup_on_dfl(parent)) - cgroup_rstat_exit(cgrp); + cgroup_rstat_exit(cgrp); out_cancel_ref: percpu_ref_exit(&cgrp->self.refcnt); out_free_cgrp: diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c index cba6fdc3545d..ab83d654b7df 100644 --- a/kernel/cgroup/rstat.c +++ b/kernel/cgroup/rstat.c @@ -285,8 +285,6 @@ void __init cgroup_rstat_boot(void) for_each_possible_cpu(cpu) raw_spin_lock_init(per_cpu_ptr(&cgroup_rstat_cpu_lock, cpu)); - - BUG_ON(cgroup_rstat_init(&cgrp_dfl_root.cgrp)); } /* -- Gitee From 76730757c42b4b1d6a52667787eed373f172c8fe Mon Sep 17 00:00:00 2001 From: Cruz Zhao Date: Tue, 28 Feb 2023 09:42:36 +0000 Subject: [PATCH 6/6] anolis: sched/core: introduce ht-aware-quota ANBZ: #6729 With ACPU accounting, we are able to assess how long a task runs with its sibling idle and how long with its sibling busy. To make the computing power of tasks stable, we need the tasks to execute a similar number of instructions in each scheduling cycle. To achieve this goal, we introduce ht-aware-quota. When a task is running with its sibling idle, we consider the task to have executed more instructions, by a certain ratio, and sibling idle time * ratio will be accounted to its cfs_rq runtime, not just the sibling idle time. The ratio can be configured via /sys/fs/cgroup/<cgroup>/cpu.ht_ratio, unit: percentage, range: [100, 200], default: 100. For now, ht-aware-quota is only valid for cookie'd tasks, as when the sibling is busy, we know which task is running. The sched_feat SCHED_CORE_HT_AWARE_QUOTA is also required to be enabled. 
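As a concrete illustration of the math (an illustrative sketch, not part of the patch; the numbers and the user-space helper below are examples mirroring the delta * (ht_ratio - 100) / 100 charge added to fair.c in this patch): with cpu.ht_ratio set to 150, a task that runs 10 ms with its sibling idle is charged an extra 10 ms * (150 - 100) / 100 = 5 ms of quota, i.e. 15 ms in total for that span.

	#include <stdio.h>

	/* Extra quota charged for a span run with the sibling idle; plain
	 * unsigned long long stands in for the kernel's u64 in this sketch. */
	static unsigned long long ht_extra_charge(unsigned long long sibidle_delta_ns,
						  unsigned int ht_ratio)
	{
		if (ht_ratio < 100 || ht_ratio > 200)
			return 0;
		return sibidle_delta_ns * (ht_ratio - 100) / 100;
	}

	int main(void)
	{
		/* 10 ms of sibling-idle runtime at ht_ratio 150 costs 5 ms extra. */
		printf("extra charge: %llu ns\n", ht_extra_charge(10000000ULL, 150));
		return 0;
	}
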
Signed-off-by: Cruz Zhao --- include/linux/sched.h | 2 +- kernel/sched/core.c | 49 +++++++++++++++++++++++++++++++++++++++ kernel/sched/core_sched.c | 1 + kernel/sched/fair.c | 22 ++++++++++++++++++ kernel/sched/features.h | 4 ++++ kernel/sched/sched.h | 4 ++++ 6 files changed, 81 insertions(+), 1 deletion(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index 9dcfe12be015..3dad0518a36b 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -579,7 +579,7 @@ struct sched_entity { #ifdef CONFIG_SCHED_CORE u64 core_vruntime; - unsigned int ht_aware_quota_coefficient; + unsigned int ht_ratio; #endif CK_KABI_RESERVE(1) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index f803bef95640..f0d6fa895d03 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -8604,6 +8604,9 @@ struct task_group *sched_create_group(struct task_group *parent) alloc_uclamp_sched_group(tg, parent); +#ifdef CONFIG_SCHED_CORE + tg->ht_ratio = 100; +#endif return tg; err: @@ -9534,6 +9537,38 @@ static s64 cpu_identity_read_s64(struct cgroup_subsys_state *css, } #endif +#ifdef CONFIG_SCHED_CORE +static int cpu_ht_ratio_write(struct cgroup_subsys_state *css, + struct cftype *cftype, u64 ht_ratio) +{ + struct task_group *tg = css_tg(css); + int cpu; + + if (ht_ratio < 100 || ht_ratio > 200) + return -1; + + if (tg == &root_task_group) + return -1; + + tg->ht_ratio = ht_ratio; + for_each_online_cpu(cpu) { + struct sched_entity *se = tg->se[cpu]; + + se->ht_ratio = ht_ratio; + } + + return 0; +} + +static u64 cpu_ht_ratio_read(struct cgroup_subsys_state *css, + struct cftype *cft) +{ + struct task_group *tg = css_tg(css); + + return tg->ht_ratio; +} +#endif + static struct cftype cpu_legacy_files[] = { #ifdef CONFIG_FAIR_GROUP_SCHED { @@ -9641,6 +9676,13 @@ static struct cftype cpu_legacy_files[] = { .read_u64 = cpu_ht_stable_read_u64, .write_u64 = cpu_ht_stable_write_u64, }, +#endif +#ifdef CONFIG_SCHED_CORE + { + .name = "ht_ratio", + .read_u64 = cpu_ht_ratio_read, + .write_u64 = cpu_ht_ratio_write, + }, #endif { } /* Terminate */ }; @@ -10204,6 +10246,13 @@ static struct cftype cpu_files[] = { .write_u64 = sched_lat_stat_write, .seq_show = sched_lat_stat_show }, +#endif +#ifdef CONFIG_SCHED_CORE + { + .name = "ht_ratio", + .read_u64 = cpu_ht_ratio_read, + .write_u64 = cpu_ht_ratio_write, + }, #endif { } /* terminate */ }; diff --git a/kernel/sched/core_sched.c b/kernel/sched/core_sched.c index 19654cc5c113..bc1f23bc14c5 100644 --- a/kernel/sched/core_sched.c +++ b/kernel/sched/core_sched.c @@ -350,6 +350,7 @@ void __sched_core_account_sibidle(struct rq *rq) * if it comes from our SMT sibling. */ __account_sibidle_time(p, delta, !!rq->core->core_forceidle_count); + account_ht_aware_quota(p, delta); } out: diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 719c6e071d25..61d0c89a1f2a 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -13015,6 +13015,25 @@ static int task_is_throttled_fair(struct task_struct *p, int cpu) #endif return throttled_hierarchy(cfs_rq); } + +void account_ht_aware_quota(struct task_struct *p, u64 delta) +{ + struct sched_entity *se; + unsigned int ht_ratio; + struct cfs_rq *cfs_rq; + + /* We only account ht_aware_quota for cookied task. 
*/ + if (sched_feat(SCHED_CORE_HT_AWARE_QUOTA) && p->core_cookie) { + se = &p->se; + cfs_rq = task_cfs_rq(p); + + if (se->parent) { + ht_ratio = se->parent->ht_ratio; + if (ht_ratio >= 100 && ht_ratio <= 200) + account_cfs_rq_runtime(cfs_rq, delta * (ht_ratio - 100) / 100); + } + } +} #else static inline void task_tick_core(struct rq *rq, struct task_struct *curr) {} #endif @@ -13448,6 +13467,9 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) init_cfs_rq(cfs_rq); init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]); init_entity_runnable_average(se); +#ifdef CONFIG_SCHED_CORE + se->ht_ratio = 100; +#endif } return 1; diff --git a/kernel/sched/features.h b/kernel/sched/features.h index 79f43319ac79..1163f14f24a0 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h @@ -106,3 +106,7 @@ SCHED_FEAT(ID_LAST_HIGHCLASS_STAY, false) SCHED_FEAT(ID_LOOSE_EXPEL, false) SCHED_FEAT(ID_EXPELLER_SHARE_CORE, true) #endif + +#ifdef CONFIG_SCHED_CORE +SCHED_FEAT(SCHED_CORE_HT_AWARE_QUOTA, false) +#endif diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index a8a13adbf7af..bd65a2707cbc 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -545,6 +545,9 @@ struct task_group { struct sched_cgroup_lat_stat_cpu __percpu *lat_stat_cpu; #endif +#ifdef CONFIG_SCHED_CORE + unsigned int ht_ratio; +#endif CK_KABI_RESERVE(1) CK_KABI_RESERVE(2) CK_KABI_RESERVE(3) @@ -1485,6 +1488,7 @@ extern void sched_core_dequeue(struct rq *rq, struct task_struct *p, int flags); extern void sched_core_get(void); extern void sched_core_put(void); +extern void account_ht_aware_quota(struct task_struct *p, u64 delta); #else /* !CONFIG_SCHED_CORE */ static inline bool sched_core_enabled(struct rq *rq) -- Gitee
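Closing usage note for the series (an illustrative sketch, not part of the patches): once kernel.sched_acpu and kernel.sched_schedstats are enabled, the per-task statistic can be read back from /proc/<pid>/sched. The row name below follows the PN_SCHEDSTAT(se.statistics.core_sibidle_sum) call added in patch 1 and may differ if the debug output format changes.

	#include <stdio.h>
	#include <string.h>

	/* Dump the sibidle row of /proc/<pid>/sched for the pid given as argv[1]. */
	int main(int argc, char **argv)
	{
		char path[64], line[256];
		FILE *fp;

		if (argc != 2) {
			fprintf(stderr, "usage: %s <pid>\n", argv[0]);
			return 1;
		}
		snprintf(path, sizeof(path), "/proc/%s/sched", argv[1]);
		fp = fopen(path, "r");
		if (!fp) {
			perror(path);
			return 1;
		}
		while (fgets(line, sizeof(line), fp)) {
			if (strstr(line, "core_sibidle_sum"))
				fputs(line, stdout);
		}
		fclose(fp);
		return 0;
	}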