diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
index 24884a4626c943d1f4786fff225b40f7bc142731..d702ed7d2268f725c5d111da5c500760cd847eba 100644
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -323,6 +323,9 @@ struct cgroup_base_stat {
 #ifdef CONFIG_SCHED_CORE
 	u64 forceidle_sum;
 #endif
+#if defined(CONFIG_SCHED_ACPU) || defined(CONFIG_SCHED_CORE)
+	u64 sibidle_sum;
+#endif
 };
 
 /*
diff --git a/include/linux/kernel_stat.h b/include/linux/kernel_stat.h
index ca7ac6734c414a99e3d4a9439581960596890f35..3b3a4467e006a6af9869d2fcbfd20b284b14fb53 100644
--- a/include/linux/kernel_stat.h
+++ b/include/linux/kernel_stat.h
@@ -30,6 +30,9 @@ enum cpu_usage_stat {
 	CPUTIME_GUEST_NICE,
 #ifdef CONFIG_SCHED_CORE
 	CPUTIME_FORCEIDLE,
+#endif
+#if defined(CONFIG_SCHED_ACPU) || defined(CONFIG_SCHED_CORE)
+	CPUTIME_SIBIDLE,
 #endif
 	NR_STATS,
 };
@@ -119,8 +122,8 @@ extern void account_process_tick(struct task_struct *, int user);
 
 extern void account_idle_ticks(unsigned long ticks);
 
-#ifdef CONFIG_SCHED_CORE
-extern void __account_forceidle_time(struct task_struct *tsk, u64 delta);
+#if defined(CONFIG_SCHED_ACPU) || defined(CONFIG_SCHED_CORE)
+extern void __account_sibidle_time(struct task_struct *tsk, u64 delta, bool fi);
 #endif
 
 #endif /* _LINUX_KERNEL_STAT_H */
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 5b04709f47b2b73e4380f7b357d276329e141012..3dad0518a36b4fdd389fc39aabcf3d96537cd141 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -500,6 +500,10 @@ struct sched_statistics {
 	u64 core_forceidle_sum;
 #endif
 
+#if defined(CONFIG_SCHED_ACPU) || defined(CONFIG_SCHED_CORE)
+	u64 core_sibidle_sum;
+#endif
+
 	CK_KABI_RESERVE(1)
 	CK_KABI_RESERVE(2)
 	CK_KABI_RESERVE(3)
@@ -575,7 +579,7 @@ struct sched_entity {
 
 #ifdef CONFIG_SCHED_CORE
 	u64 core_vruntime;
-	unsigned int ht_aware_quota_coefficient;
+	unsigned int ht_ratio;
 #endif
 
 	CK_KABI_RESERVE(1)
diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h
index e6dbcbd03b8de7db1745ea7025fd5e534af604a7..9fcea8ed40686ff752f0e908bce97e2d814bf8b3 100644
--- a/include/linux/sched/sysctl.h
+++ b/include/linux/sched/sysctl.h
@@ -119,4 +119,10 @@ int sched_energy_aware_handler(struct ctl_table *table, int write,
 		void *buffer, size_t *lenp, loff_t *ppos);
 #endif
 
+#ifdef CONFIG_SCHED_ACPU
+extern unsigned int sysctl_sched_acpu_enabled;
+extern int sched_acpu_enable_handler(struct ctl_table *table, int write,
+				     void __user *buffer, size_t *lenp,
+				     loff_t *ppos);
+#endif
 #endif /* _LINUX_SCHED_SYSCTL_H */
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 5b1fdfd2fd4cf9ae20618d211f21c0e4662a6ba1..7fb89841ab57532fc1017ec3d6b407815b857bef 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -1351,6 +1351,7 @@ static void cgroup_destroy_root(struct cgroup_root *root)
 
 	mutex_unlock(&cgroup_mutex);
 
+	cgroup_rstat_exit(cgrp);
 	kernfs_destroy_root(root->kf_root);
 	cgroup_free_root(root);
 }
@@ -1786,6 +1787,12 @@ int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask)
 				       &dcgrp->e_csets[ss->id]);
 		spin_unlock_irq(&css_set_lock);
 
+		if (ss->css_rstat_flush) {
+			list_del_rcu(&css->rstat_css_node);
+			list_add_rcu(&css->rstat_css_node,
+				     &dcgrp->rstat_css_list);
+		}
+
 		/* default hierarchy doesn't enable controllers by default */
 		dst_root->subsys_mask |= 1 << ssid;
 		if (dst_root == &cgrp_dfl_root) {
@@ -2010,10 +2017,14 @@ int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask)
 	if (ret)
 		goto destroy_root;
 
-	ret = rebind_subsystems(root, ss_mask);
+	ret = cgroup_rstat_init(root_cgrp);
 	if (ret)
 		goto destroy_root;
 
+	ret = rebind_subsystems(root, ss_mask);
+	if (ret)
+		goto exit_stats;
+
 	ret = cgroup_bpf_inherit(root_cgrp);
 	WARN_ON_ONCE(ret);
@@ -2045,6 +2056,8 @@ int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask)
 	ret = 0;
 	goto out;
 
+exit_stats:
+	cgroup_rstat_exit(root_cgrp);
 destroy_root:
 	kernfs_destroy_root(root->kf_root);
 	root->kf_root = NULL;
@@ -5119,8 +5132,7 @@ static void css_free_rwork_fn(struct work_struct *work)
 		cgroup_put(cgroup_parent(cgrp));
 		kernfs_put(cgrp->kn);
 		psi_cgroup_free(cgrp);
-		if (cgroup_on_dfl(cgrp))
-			cgroup_rstat_exit(cgrp);
+		cgroup_rstat_exit(cgrp);
 		kfree(cgrp);
 	} else {
 		/*
@@ -5161,8 +5173,7 @@ static void css_release_work_fn(struct work_struct *work)
 		/* cgroup release path */
 		TRACE_CGROUP_PATH(release, cgrp);
 
-		if (cgroup_on_dfl(cgrp))
-			cgroup_rstat_flush(cgrp);
+		cgroup_rstat_flush(cgrp);
 
 		spin_lock_irq(&css_set_lock);
 		for (tcgrp = cgroup_parent(cgrp); tcgrp;
@@ -5219,7 +5230,7 @@ static void init_and_link_css(struct cgroup_subsys_state *css,
 		css_get(css->parent);
 	}
 
-	if (cgroup_on_dfl(cgrp) && ss->css_rstat_flush)
+	if (ss->css_rstat_flush)
 		list_add_rcu(&css->rstat_css_node, &cgrp->rstat_css_list);
 
 	BUG_ON(cgroup_css(cgrp, ss));
@@ -5353,11 +5364,9 @@ static struct cgroup *cgroup_create(struct cgroup *parent, const char *name,
 	if (ret)
 		goto out_free_cgrp;
 
-	if (cgroup_on_dfl(parent)) {
-		ret = cgroup_rstat_init(cgrp);
-		if (ret)
-			goto out_cancel_ref;
-	}
+	ret = cgroup_rstat_init(cgrp);
+	if (ret)
+		goto out_cancel_ref;
 
 	/* create the directory */
 	kn = kernfs_create_dir(parent->kn, name, mode, cgrp);
@@ -5444,8 +5453,7 @@ static struct cgroup *cgroup_create(struct cgroup *parent, const char *name,
 out_kernfs_remove:
 	kernfs_remove(cgrp->kn);
 out_stat_exit:
-	if (cgroup_on_dfl(parent))
-		cgroup_rstat_exit(cgrp);
+	cgroup_rstat_exit(cgrp);
 out_cancel_ref:
 	percpu_ref_exit(&cgrp->self.refcnt);
 out_free_cgrp:
diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c
index 80bf455242001dfedd48794c54ee7dc69e9aea6f..ab83d654b7df92b96865edc0de77fbd44af6af11 100644
--- a/kernel/cgroup/rstat.c
+++ b/kernel/cgroup/rstat.c
@@ -285,8 +285,6 @@ void __init cgroup_rstat_boot(void)
 
 	for_each_possible_cpu(cpu)
 		raw_spin_lock_init(per_cpu_ptr(&cgroup_rstat_cpu_lock, cpu));
-
-	BUG_ON(cgroup_rstat_init(&cgrp_dfl_root.cgrp));
 }
 
 /*
@@ -302,6 +300,9 @@ static void cgroup_base_stat_add(struct cgroup_base_stat *dst_bstat,
 #ifdef CONFIG_SCHED_CORE
 	dst_bstat->forceidle_sum += src_bstat->forceidle_sum;
 #endif
+#if defined(CONFIG_SCHED_ACPU) || defined(CONFIG_SCHED_CORE)
+	dst_bstat->sibidle_sum += src_bstat->sibidle_sum;
+#endif
 }
 
@@ -313,6 +314,9 @@ static void cgroup_base_stat_sub(struct cgroup_base_stat *dst_bstat,
 	dst_bstat->cputime.sum_exec_runtime -= src_bstat->cputime.sum_exec_runtime;
 #ifdef CONFIG_SCHED_CORE
 	dst_bstat->forceidle_sum -= src_bstat->forceidle_sum;
 #endif
+#if defined(CONFIG_SCHED_ACPU) || defined(CONFIG_SCHED_CORE)
+	dst_bstat->sibidle_sum -= src_bstat->sibidle_sum;
+#endif
 }
 
 static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu)
@@ -391,6 +395,11 @@ void __cgroup_account_cputime_field(struct cgroup *cgrp,
 	case CPUTIME_FORCEIDLE:
 		rstatc->bstat.forceidle_sum += delta_exec;
 		break;
+#endif
+#if defined(CONFIG_SCHED_ACPU) || defined(CONFIG_SCHED_CORE)
+	case CPUTIME_SIBIDLE:
+		rstatc->bstat.sibidle_sum += delta_exec;
+		break;
 #endif
 	default:
 		break;
@@ -434,6 +443,9 @@ static void root_cgroup_cputime(struct cgroup_base_stat *bstat)
 #ifdef CONFIG_SCHED_CORE
 		bstat->forceidle_sum += cpustat[CPUTIME_FORCEIDLE];
+#endif
+#if defined(CONFIG_SCHED_ACPU) || defined(CONFIG_SCHED_CORE)
+		bstat->sibidle_sum += cpustat[CPUTIME_SIBIDLE];
 #endif
 	}
 }
@@ -446,6 +458,9 @@ void cgroup_base_stat_cputime_show(struct seq_file *seq)
 #ifdef CONFIG_SCHED_CORE
 	u64 forceidle_time;
 #endif
+#if defined(CONFIG_SCHED_ACPU) || defined(CONFIG_SCHED_CORE)
+	u64 sibidle_time;
+#endif
 
 	if (cgroup_parent(cgrp)) {
 		cgroup_rstat_flush_hold(cgrp);
@@ -454,6 +469,9 @@ void cgroup_base_stat_cputime_show(struct seq_file *seq)
 			       &utime, &stime);
 #ifdef CONFIG_SCHED_CORE
 		forceidle_time = cgrp->bstat.forceidle_sum;
+#endif
+#if defined(CONFIG_SCHED_ACPU) || defined(CONFIG_SCHED_CORE)
+		sibidle_time = cgrp->bstat.sibidle_sum;
 #endif
 		cgroup_rstat_flush_release();
 	} else {
@@ -463,6 +481,9 @@ void cgroup_base_stat_cputime_show(struct seq_file *seq)
 		stime = bstat.cputime.stime;
 #ifdef CONFIG_SCHED_CORE
 		forceidle_time = bstat.forceidle_sum;
+#endif
+#if defined(CONFIG_SCHED_ACPU) || defined(CONFIG_SCHED_CORE)
+		sibidle_time = bstat.sibidle_sum;
 #endif
 	}
 
@@ -472,6 +493,9 @@ void cgroup_base_stat_cputime_show(struct seq_file *seq)
 #ifdef CONFIG_SCHED_CORE
 	do_div(forceidle_time, NSEC_PER_USEC);
 #endif
+#if defined(CONFIG_SCHED_ACPU) || defined(CONFIG_SCHED_CORE)
+	do_div(sibidle_time, NSEC_PER_USEC);
+#endif
 
 	seq_printf(seq, "usage_usec %llu\n"
 		   "user_usec %llu\n"
@@ -481,4 +505,7 @@ void cgroup_base_stat_cputime_show(struct seq_file *seq)
 #ifdef CONFIG_SCHED_CORE
 	seq_printf(seq, "core_sched.force_idle_usec %llu\n", forceidle_time);
 #endif
+#if defined(CONFIG_SCHED_ACPU) || defined(CONFIG_SCHED_CORE)
+	seq_printf(seq, "sibidle_usec %llu\n", sibidle_time);
+#endif
 }
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index a55f28d876d00bfad3c17f83556a658256f3e4e9..f0d6fa895d033c27c77b386bb280341a456118aa 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -77,6 +77,11 @@ unsigned int sysctl_sched_cfs_bw_burst_onset_percent;
 unsigned int sysctl_sched_cfs_bw_burst_enabled = 1;
 #endif
 
+#ifdef CONFIG_SCHED_ACPU
+DEFINE_STATIC_KEY_FALSE(acpu_enabled);
+unsigned int sysctl_sched_acpu_enabled;
+#endif
+
 /*
  * period over which we measure -rt task CPU usage in us.
  * default: 1s
@@ -300,7 +305,7 @@ static void __sched_core_flip(bool enabled)
 		for_each_cpu(t, smt_mask)
 			cpu_rq(t)->core_enabled = enabled;
 
-		cpu_rq(cpu)->core->core_forceidle_start = 0;
+		cpu_rq(cpu)->core->core_sibidle_start = 0;
 
 		sched_core_unlock(cpu, &flags);
@@ -3913,6 +3918,122 @@ fire_sched_out_preempt_notifiers(struct task_struct *curr,
 
 #endif /* CONFIG_PREEMPT_NOTIFIERS */
 
+#ifdef CONFIG_SCHED_ACPU
+static void acpu_enable(void)
+{
+	int i;
+
+	for_each_possible_cpu(i) {
+		struct rq *rq = cpu_rq(i);
+
+		/* It may not be that accurate, but it is useful enough. */
+		rq->last_acpu_update_time = rq->clock;
+	}
+	static_branch_enable(&acpu_enabled);
+}
+
+static void acpu_disable(void)
+{
+	static_branch_disable(&acpu_enabled);
+}
+
+int sched_acpu_enable_handler(struct ctl_table *table, int write, void __user *buffer,
+			      size_t *lenp, loff_t *ppos)
+{
+	int ret;
+	unsigned int old, new;
+
+	if (!write) {
+		ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
+		return ret;
+	}
+
+	old = sysctl_sched_acpu_enabled;
+	ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
+	new = sysctl_sched_acpu_enabled;
+	if (!ret && write && (old != new)) {
+		if (new)
+			acpu_enable();
+		else
+			acpu_disable();
+	}
+
+	return ret;
+}
+
+static void update_acpu(struct rq *rq, struct task_struct *prev, struct task_struct *next)
+{
+	const int cpu = cpu_of(rq);
+	const struct cpumask *smt_mask = cpu_smt_mask(cpu);
+	u64 now = rq_clock(rq);
+	u64 sibidle_sum, last_update_time;
+	s64 delta, last;
+	int i;
+
+	if (!static_branch_likely(&acpu_enabled) || !schedstat_enabled())
+		return;
+
+	/*
+	 * If core scheduling is enabled and core_sibidle_count is not zero,
+	 * the sibidle time is updated in __sched_core_account_sibidle() instead.
+	 */
+#ifdef CONFIG_SCHED_CORE
+	if (rq->core->core_sibidle_count)
+		goto out;
+#endif
+
+	/* Update the idle sum and busy sum for the current rq. */
+	delta = now - rq->last_acpu_update_time;
+	if (prev == rq->idle)
+		rq->acpu_idle_sum += delta;
+
+	/*
+	 * Be careful, smt_mask may be NULL.
+	 * We only consider the case of two SMT siblings at this stage.
+	 */
+	if (unlikely(!smt_mask) || unlikely(cpumask_weight(smt_mask) != 2))
+		goto out;
+
+	for_each_cpu(i, smt_mask) {
+		if (i != cpu) {
+			struct rq *rq_i = cpu_rq(i);
+			struct task_struct *curr_i = rq_i->curr;
+
+			last = (s64)(rq->last_acpu_update_time -
+				     rq_i->last_acpu_update_time);
+			last_update_time = last >= 0 ? rq->last_acpu_update_time :
+					   rq_i->last_acpu_update_time;
+			/*
+			 * The sibling may update acpu at the same time, and its
+			 * timestamp may be newer than this rq's.
+			 */
+			delta = now - last_update_time;
+			delta = delta > 0 ? delta : 0;
+
+			/* Add the delta to improve accuracy. */
+			sibidle_sum = last >= 0 ? rq->sibidle_sum : rq_i->acpu_idle_sum;
+			if (curr_i == rq_i->idle)
+				sibidle_sum += delta;
+		}
+	}
+
+	if (prev != rq->idle) {
+		delta = sibidle_sum - rq->sibidle_sum;
+		delta = delta > 0 ? delta : 0;
+		__account_sibidle_time(prev, delta, false);
+	}
+
+	if (next != rq->idle)
+		rq->sibidle_sum = sibidle_sum;
+out:
+	rq->last_acpu_update_time = now;
+}
+#else
+static inline void update_acpu(struct rq *rq, struct task_struct *prev, struct task_struct *next)
+{
+}
+#endif /* CONFIG_SCHED_ACPU */
+
 static inline void prepare_task(struct task_struct *next)
 {
 #ifdef CONFIG_SMP
@@ -4004,6 +4125,7 @@ prepare_task_switch(struct rq *rq, struct task_struct *prev,
 {
 	kcov_prepare_switch(prev);
 	sched_info_switch(rq, prev, next);
+	update_acpu(rq, prev, next);
 	perf_event_task_sched_out(prev, next);
 	rseq_preempt(prev);
 	fire_sched_out_preempt_notifiers(prev, next);
@@ -4456,6 +4578,7 @@ void scheduler_tick(void)
 	thermal_pressure = arch_scale_thermal_pressure(cpu_of(rq));
 	update_thermal_load_avg(rq_clock_thermal(rq), rq, thermal_pressure);
 	curr->sched_class->task_tick(rq, curr, 0);
+	update_acpu(rq, curr, curr);
 	calc_global_load_tick(rq);
 	psi_task_tick(rq);
 	sched_core_tick(rq);
@@ -4922,18 +5045,21 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
 		/* reset state */
 		rq->core->core_cookie = 0UL;
-		if (rq->core->core_forceidle_count) {
+		if (rq->core->core_sibidle_count) {
 			if (!core_clock_updated) {
 				update_rq_clock(rq->core);
 				core_clock_updated = true;
 			}
-			sched_core_account_forceidle(rq);
+			sched_core_account_sibidle(rq);
 			/* reset after accounting force idle */
-			rq->core->core_forceidle_start = 0;
-			rq->core->core_forceidle_count = 0;
-			rq->core->core_forceidle_occupation = 0;
-			need_sync = true;
-			fi_before = true;
+			rq->core->core_sibidle_start = 0;
+			rq->core->core_sibidle_count = 0;
+			rq->core->core_sibidle_occupation = 0;
+			if (rq->core->core_forceidle_count) {
+				rq->core->core_forceidle_count = 0;
+				need_sync = true;
+				fi_before = true;
+			}
 		}
 
 		/*
@@ -5009,6 +5135,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
 		rq_i->core_pick = p;
 
 		if (p == rq_i->idle) {
+			rq->core->core_sibidle_count++;
 			if (rq_i->nr_running) {
 				rq->core->core_forceidle_count++;
 				if (!fi_before)
@@ -5019,9 +5146,9 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
 		}
 	}
 
-	if (schedstat_enabled() && rq->core->core_forceidle_count) {
-		rq->core->core_forceidle_start = rq_clock(rq->core);
-		rq->core->core_forceidle_occupation = occ;
+	if (schedstat_enabled() && rq->core->core_sibidle_count) {
+		rq->core->core_sibidle_start = rq_clock(rq->core);
+		rq->core->core_sibidle_occupation = occ;
 	}
 
 	rq->core->core_pick_seq = rq->core->core_task_seq;
@@ -5063,7 +5190,8 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
 		if (!(fi_before && rq->core->core_forceidle_count))
 			task_vruntime_update(rq_i, rq_i->core_pick, !!rq->core->core_forceidle_count);
 
-		rq_i->core_pick->core_occupation = occ;
+		if (rq->core->core_forceidle_count)
+			rq_i->core_pick->core_occupation = occ;
 
 		if (i == cpu) {
 			rq_i->core_pick = NULL;
@@ -5283,14 +5411,15 @@ static void sched_core_cpu_deactivate(unsigned int cpu)
 	core_rq->core_cookie = rq->core_cookie;
 	core_rq->core_forceidle_count = rq->core_forceidle_count;
 	core_rq->core_forceidle_seq = rq->core_forceidle_seq;
-	core_rq->core_forceidle_occupation = rq->core_forceidle_occupation;
+	core_rq->core_sibidle_occupation = rq->core_sibidle_occupation;
+	core_rq->core_sibidle_count = rq->core_sibidle_count;
 
 	/*
 	 * Accounting edge for forced idle is handled in pick_next_task().
 	 * Don't need another one here, since the hotplug thread shouldn't
 	 * have a cookie.
 	 */
-	core_rq->core_forceidle_start = 0;
+	core_rq->core_sibidle_start = 0;
 
 	/* install new leader */
 	for_each_cpu(t, smt_mask) {
@@ -8171,6 +8300,12 @@ void __init sched_init(void)
 		rq_csd_init(rq, &rq->nohz_csd, nohz_csd_func);
 #endif
 #endif /* CONFIG_SMP */
+
+#ifdef CONFIG_SCHED_ACPU
+		rq->acpu_idle_sum = 0;
+		rq->sibidle_sum = 0;
+		rq->last_acpu_update_time = rq->clock;
+#endif
 		hrtick_rq_init(rq);
 		atomic_set(&rq->nr_iowait, 0);
 #if defined(CONFIG_GROUP_IDENTITY) && defined(CONFIG_SCHED_SMT)
@@ -8185,8 +8320,9 @@ void __init sched_init(void)
 		rq->core_enabled = 0;
 		rq->core_tree = RB_ROOT;
 		rq->core_forceidle_count = 0;
-		rq->core_forceidle_occupation = 0;
-		rq->core_forceidle_start = 0;
+		rq->core_sibidle_count = 0;
+		rq->core_sibidle_occupation = 0;
+		rq->core_sibidle_start = 0;
 
 		rq->core_cookie = 0UL;
 #endif
@@ -8468,6 +8604,9 @@ struct task_group *sched_create_group(struct task_group *parent)
 
 	alloc_uclamp_sched_group(tg, parent);
 
+#ifdef CONFIG_SCHED_CORE
+	tg->ht_ratio = 100;
+#endif
 	return tg;
 
 err:
@@ -9398,6 +9537,38 @@ static s64 cpu_identity_read_s64(struct cgroup_subsys_state *css,
 }
 #endif
 
+#ifdef CONFIG_SCHED_CORE
+static int cpu_ht_ratio_write(struct cgroup_subsys_state *css,
+			      struct cftype *cftype, u64 ht_ratio)
+{
+	struct task_group *tg = css_tg(css);
+	int cpu;
+
+	if (ht_ratio < 100 || ht_ratio > 200)
+		return -EINVAL;
+
+	if (tg == &root_task_group)
+		return -EINVAL;
+
+	tg->ht_ratio = ht_ratio;
+	for_each_online_cpu(cpu) {
+		struct sched_entity *se = tg->se[cpu];
+
+		se->ht_ratio = ht_ratio;
+	}
+
+	return 0;
+}
+
+static u64 cpu_ht_ratio_read(struct cgroup_subsys_state *css,
+			     struct cftype *cft)
+{
+	struct task_group *tg = css_tg(css);
+
+	return tg->ht_ratio;
+}
+#endif
+
 static struct cftype cpu_legacy_files[] = {
 #ifdef CONFIG_FAIR_GROUP_SCHED
 	{
@@ -9505,6 +9676,13 @@ static struct cftype cpu_legacy_files[] = {
 		.read_u64 = cpu_ht_stable_read_u64,
 		.write_u64 = cpu_ht_stable_write_u64,
 	},
+#endif
+#ifdef CONFIG_SCHED_CORE
+	{
+		.name = "ht_ratio",
+		.read_u64 = cpu_ht_ratio_read,
+		.write_u64 = cpu_ht_ratio_write,
+	},
 #endif
 	{ }	/* Terminate */
 };
@@ -10068,6 +10246,13 @@ static struct cftype cpu_files[] = {
 		.write_u64 = sched_lat_stat_write,
 		.seq_show = sched_lat_stat_show
 	},
+#endif
+#ifdef CONFIG_SCHED_CORE
+	{
+		.name = "ht_ratio",
+		.read_u64 = cpu_ht_ratio_read,
+		.write_u64 = cpu_ht_ratio_write,
+	},
 #endif
 	{ }	/* terminate */
 };
diff --git a/kernel/sched/core_sched.c b/kernel/sched/core_sched.c
index 18c5b298b4811a23505b997f9591ba175b683f7c..bc1f23bc14c564434aedb11e460556646987432e 100644
--- a/kernel/sched/core_sched.c
+++ b/kernel/sched/core_sched.c
@@ -304,7 +304,7 @@ int sysctl_sched_core_handler(struct ctl_table *table, int write,
 
 #ifdef CONFIG_SCHEDSTATS
 
 /* REQUIRES: rq->core's clock recently updated. */
-void __sched_core_account_forceidle(struct rq *rq)
+void __sched_core_account_sibidle(struct rq *rq)
 {
 	const struct cpumask *smt_mask = cpu_smt_mask(cpu_of(rq));
 	u64 delta, now = rq_clock(rq->core);
@@ -314,28 +314,28 @@ void __sched_core_account_forceidle(struct rq *rq)
 
 	lockdep_assert_rq_held(rq);
 
-	WARN_ON_ONCE(!rq->core->core_forceidle_count);
+	WARN_ON_ONCE(!rq->core->core_sibidle_count);
 
-	if (rq->core->core_forceidle_start == 0)
-		return;
+	if (rq->core->core_sibidle_start == 0)
+		goto out;
 
-	delta = now - rq->core->core_forceidle_start;
+	delta = now - rq->core->core_sibidle_start;
 	if (unlikely((s64)delta <= 0))
-		return;
+		goto out;
 
-	rq->core->core_forceidle_start = now;
+	rq->core->core_sibidle_start = now;
 
-	if (WARN_ON_ONCE(!rq->core->core_forceidle_occupation)) {
+	if (WARN_ON_ONCE(!rq->core->core_sibidle_occupation)) {
 		/* can't be forced idle without a running task */
-	} else if (rq->core->core_forceidle_count > 1 ||
-		   rq->core->core_forceidle_occupation > 1) {
+	} else if (rq->core->core_sibidle_count > 1 ||
+		   rq->core->core_sibidle_occupation > 1) {
 		/*
 		 * For larger SMT configurations, we need to scale the charged
 		 * forced idle amount since there can be more than one forced
 		 * idle sibling and more than one running cookied task.
 		 */
-		delta *= rq->core->core_forceidle_count;
-		delta = div_u64(delta, rq->core->core_forceidle_occupation);
+		delta *= rq->core->core_sibidle_count;
+		delta = div_u64(delta, rq->core->core_sibidle_occupation);
 	}
 
 	for_each_cpu(i, smt_mask) {
@@ -349,19 +349,28 @@ void __sched_core_account_forceidle(struct rq *rq)
 		 * Note: this will account forceidle to the current cpu, even
 		 * if it comes from our SMT sibling.
 		 */
-		__account_forceidle_time(p, delta);
+		__account_sibidle_time(p, delta, !!rq->core->core_forceidle_count);
+		account_ht_aware_quota(p, delta);
+	}
+
+out:
+#ifdef CONFIG_SCHED_ACPU
+	for_each_cpu(i, smt_mask) {
+		rq_i = cpu_rq(i);
+		rq_i->last_acpu_update_time = now;
 	}
+#endif
 }
 
 void __sched_core_tick(struct rq *rq)
 {
-	if (!rq->core->core_forceidle_count)
+	if (!rq->core->core_sibidle_count)
 		return;
 
 	if (rq != rq->core)
 		update_rq_clock(rq->core);
 
-	__sched_core_account_forceidle(rq);
+	__sched_core_account_sibidle(rq);
 }
 
 #endif /* CONFIG_SCHEDSTATS */
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 48a0a4fa02ef4405c6d8c38acfff468dcd0a361f..8c7221be42db604df745e7d12672af8f22657feb 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -225,17 +225,20 @@ void account_idle_time(u64 cputime)
 }
 
-#ifdef CONFIG_SCHED_CORE
+#if defined(CONFIG_SCHED_ACPU) || defined(CONFIG_SCHED_CORE)
 /*
- * Account for forceidle time due to core scheduling.
+ * Account for sibidle, and for forceidle time due to core scheduling.
  *
  * REQUIRES: schedstat is enabled.
  */
-void __account_forceidle_time(struct task_struct *p, u64 delta)
+void __account_sibidle_time(struct task_struct *p, u64 delta, bool fi)
 {
-	__schedstat_add(p->se.statistics.core_forceidle_sum, delta);
-
-	task_group_account_field(p, CPUTIME_FORCEIDLE, delta);
+	__schedstat_add(p->se.statistics.core_sibidle_sum, delta);
+	task_group_account_field(p, CPUTIME_SIBIDLE, delta);
+	if (fi) {
+		__schedstat_add(p->se.statistics.core_forceidle_sum, delta);
+		task_group_account_field(p, CPUTIME_FORCEIDLE, delta);
+	}
 }
 #endif
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 9d30ea76022e95dfda54cfd383f4056c63664803..3e1fd969ff9b8ab83cdfb5e95cabe392f3aba755 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -1025,6 +1025,9 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
 
 #ifdef CONFIG_SCHED_CORE
 		PN_SCHEDSTAT(se.statistics.core_forceidle_sum);
+#endif
+#ifdef CONFIG_SCHED_ACPU
+		PN_SCHEDSTAT(se.statistics.core_sibidle_sum);
 #endif
 	}
 
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 719c6e071d257daf082c556fb0ff69a9a47c0eec..61d0c89a1f2ac0a6153b71555075f7fa68cf1cc1 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -13015,6 +13015,25 @@ static int task_is_throttled_fair(struct task_struct *p, int cpu)
 #endif
 	return throttled_hierarchy(cfs_rq);
 }
+
+void account_ht_aware_quota(struct task_struct *p, u64 delta)
+{
+	struct sched_entity *se;
+	unsigned int ht_ratio;
+	struct cfs_rq *cfs_rq;
+
+	/* We only account ht_aware_quota for cookied tasks. */
+	if (sched_feat(SCHED_CORE_HT_AWARE_QUOTA) && p->core_cookie) {
+		se = &p->se;
+		cfs_rq = task_cfs_rq(p);
+
+		if (se->parent) {
+			ht_ratio = se->parent->ht_ratio;
+			if (ht_ratio >= 100 && ht_ratio <= 200)
+				account_cfs_rq_runtime(cfs_rq, delta * (ht_ratio - 100) / 100);
+		}
+	}
+}
 #else
 static inline void task_tick_core(struct rq *rq, struct task_struct *curr) {}
 #endif
@@ -13448,6 +13467,9 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
 		init_cfs_rq(cfs_rq);
 		init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]);
 		init_entity_runnable_average(se);
+#ifdef CONFIG_SCHED_CORE
+		se->ht_ratio = 100;
+#endif
 	}
 
 	return 1;
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index 79f43319ac7996bcc25bdef10f15018219d00730..1163f14f24a03e0ba99eb486415229f8a83a7085 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -106,3 +106,7 @@ SCHED_FEAT(ID_LAST_HIGHCLASS_STAY, false)
 SCHED_FEAT(ID_LOOSE_EXPEL, false)
 SCHED_FEAT(ID_EXPELLER_SHARE_CORE, true)
 #endif
+
+#ifdef CONFIG_SCHED_CORE
+SCHED_FEAT(SCHED_CORE_HT_AWARE_QUOTA, false)
+#endif
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 448c3fd61680d9dd6f15a0d1f0067c5461959607..bd65a2707cbc9adb6d9bf93ba0caca355df081f6 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -545,6 +545,9 @@ struct task_group {
 	struct sched_cgroup_lat_stat_cpu __percpu *lat_stat_cpu;
 #endif
 
+#ifdef CONFIG_SCHED_CORE
+	unsigned int		ht_ratio;
+#endif
 	CK_KABI_RESERVE(1)
 	CK_KABI_RESERVE(2)
 	CK_KABI_RESERVE(3)
@@ -1307,18 +1310,20 @@ struct rq {
 	unsigned long		core_cookie;
 	unsigned int		core_forceidle_count;
 	unsigned int		core_forceidle_seq;
-	unsigned int		core_forceidle_occupation;
-	u64			core_forceidle_start;
+	unsigned int		core_sibidle_occupation;
+	u64			core_sibidle_start;
 	unsigned int		core_id;
-	unsigned int		core_realidle_count;
-	unsigned int		core_realidle_occupation;
-	u64			core_realidle_start;
-	u64			rq_realidle_time;
+	unsigned int		core_sibidle_count;
 	bool			in_forceidle;
-	bool			in_realidle;
 	struct task_struct	*force_idled_core_pick;
 #endif
+#ifdef CONFIG_SCHED_ACPU
+	u64			acpu_idle_sum;
+	u64			sibidle_sum;
+	u64			last_acpu_update_time;
+#endif
+
 	CK_KABI_RESERVE(1)
 	CK_KABI_RESERVE(2)
 	CK_KABI_RESERVE(3)
@@ -1483,6 +1488,7 @@ extern void sched_core_dequeue(struct rq *rq, struct task_struct *p, int flags);
 
 extern void sched_core_get(void);
 extern void sched_core_put(void);
+extern void account_ht_aware_quota(struct task_struct *p, u64 delta);
 
 #else /* !CONFIG_SCHED_CORE */
 
 static inline bool sched_core_enabled(struct rq *rq)
@@ -2096,12 +2102,12 @@ static inline void flush_smp_call_function_from_idle(void) { }
 
 #if defined(CONFIG_SCHED_CORE) && defined(CONFIG_SCHEDSTATS)
 
-extern void __sched_core_account_forceidle(struct rq *rq);
+extern void __sched_core_account_sibidle(struct rq *rq);
 
-static inline void sched_core_account_forceidle(struct rq *rq)
+static inline void sched_core_account_sibidle(struct rq *rq)
 {
 	if (schedstat_enabled())
-		__sched_core_account_forceidle(rq);
+		__sched_core_account_sibidle(rq);
 }
 
 extern void __sched_core_tick(struct rq *rq);
@@ -2114,7 +2120,7 @@ static inline void sched_core_tick(struct rq *rq)
 
 #else
 
-static inline void sched_core_account_forceidle(struct rq *rq) {}
+static inline void sched_core_account_sibidle(struct rq *rq) {}
 
 static inline void sched_core_tick(struct rq *rq) {}
 
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index ec38014b9ac18df482d8c37cb115687605f06459..1fbce664ea314098d6a8ff00725b637936b2e1e5 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -2038,6 +2038,17 @@ static struct ctl_table kern_table[] = {
 		.extra2		= SYSCTL_ONE,
 	},
 #endif
+#ifdef CONFIG_SCHED_ACPU
+	{
+		.procname	= "sched_acpu",
+		.data		= &sysctl_sched_acpu_enabled,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= sched_acpu_enable_handler,
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= SYSCTL_ONE,
+	},
+#endif /* CONFIG_SCHED_ACPU */
 #ifdef CONFIG_PROVE_LOCKING
 	{
 		.procname	= "prove_locking",
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 7c63c6787adfa0e13ca8581f16bcefbc8273a31c..6684c5a2e71f730074cf035a34c74b1b88c0a4a8 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -1132,6 +1132,13 @@ config SCHEDSTATS_HOST
 	  this config. If Y here, the default value of it is 1, and if N,
 	  the value is 0.
 
+config SCHED_ACPU
+	bool "ACPU info: account SMT sibling idle time to tasks"
+	depends on DEBUG_KERNEL && PROC_FS && SMP && SCHED_SMT
+	default y
+	help
+	  Add ACPU info in /proc/<pid>/sched.
+
 endmenu
 
 config DEBUG_TIMEKEEPING
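
The interfaces added by this patch can be exercised from userspace once it is applied. The sketch below is illustrative only and is not part of the patch: it assumes a cgroup v2 hierarchy mounted at /sys/fs/cgroup (an assumption, adjust to the target system) and it requires schedstats to be enabled, since both update_acpu() and __sched_core_account_sibidle() bail out when schedstat_enabled() is false. It flips the kernel.sched_acpu sysctl registered in kernel/sysctl.c and then reads back the new "sibidle_usec" field that cgroup_base_stat_cputime_show() appends to cpu.stat.

/*
 * Hypothetical userspace sketch, not part of the patch: enable the
 * kernel.sched_acpu sysctl and print the "sibidle_usec" field from a
 * cgroup's cpu.stat.  The cgroup mount point below is an assumption.
 */
#include <stdio.h>
#include <string.h>

static int write_str(const char *path, const char *val)
{
	FILE *f = fopen(path, "w");

	if (!f)
		return -1;
	fputs(val, f);
	fclose(f);
	return 0;
}

int main(void)
{
	char line[256];
	FILE *f;

	/* Flip the switch added by this patch (0 = off, 1 = on). */
	if (write_str("/proc/sys/kernel/sched_acpu", "1"))
		perror("sched_acpu");

	/* "sibidle_usec" is appended to cpu.stat by cgroup_base_stat_cputime_show(). */
	f = fopen("/sys/fs/cgroup/cpu.stat", "r");	/* assumed cgroup2 mount point */
	if (!f) {
		perror("cpu.stat");
		return 1;
	}
	while (fgets(line, sizeof(line), f)) {
		if (!strncmp(line, "sibidle_usec", 12))
			fputs(line, stdout);
	}
	fclose(f);
	return 0;
}

Per-task data from the same accounting shows up in /proc/<pid>/sched as se.statistics.core_sibidle_sum, and the cpu.ht_ratio knob (valid range 100 to 200) charges cookied tasks an additional delta * (ht_ratio - 100) / 100 of CFS runtime when the SCHED_CORE_HT_AWARE_QUOTA scheduler feature is turned on.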