From a0c5474f1831ef5f9f5e3331b5b6e42d848b89b9 Mon Sep 17 00:00:00 2001 From: CruzZhao Date: Mon, 10 Nov 2025 21:04:55 +0800 Subject: [PATCH 01/11] anolis: sched: support Group Balancer to be aware of cpuset ANBZ: #8765 When attach a task group to a group balancer sched domain, check whether the intersection of gb_sd->span and cpuset->cpus_allowed (if the cgroup has cpuset), satisfies the quota of the task group. When the cpuset of cgroup changes, validate whether the group balancer sched domain still satisfies. Signed-off-by: CruzZhao --- include/linux/sched.h | 20 +++++++++++++ kernel/cgroup/cpuset.c | 41 ++++++++++++++++++++++++++ kernel/sched/core.c | 10 +++++++ kernel/sched/group_balancer.c | 54 +++++++++++++++++++++++++++++------ kernel/sched/sched.h | 8 ------ 5 files changed, 117 insertions(+), 16 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index a1d6559bdb1d..89913709a766 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -66,6 +66,7 @@ struct signal_struct; struct task_delay_info; struct task_group; struct io_uring_task; +struct cgroup; /* * Task state bitmask. NOTE! These bits are also @@ -2424,4 +2425,23 @@ static inline bool jbd2_proxy_exec_disabled(void) { return !static_branch_unlikely(&__jbd2_proxy_exec_enabled); } +#ifdef CONFIG_GROUP_BALANCER +extern bool group_balancer_enabled(void); +extern void tg_specs_change(struct task_group *tg); +extern bool tg_group_balancer_enabled(struct task_group *tg); +extern struct task_group *cgroup_tg(struct cgroup *cgrp); +extern struct cgroup *tg_cgroup(struct task_group *tg); +extern void lock_cfs_constraints_mutex(void); +extern void unlock_cfs_constraints_mutex(void); +#ifdef CONFIG_CPUSETS +extern struct cpumask *task_group_cpus_allowed(struct task_group *tg); +#else +static inline struct cpumask *task_group_cpus_allowed(struct task_group *tg) +{ + return NULL; +} +#endif +#else +static inline void tg_specs_change(struct task_group *tg) { } +#endif #endif diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 164f5bee99da..bb9da7a87371 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -205,6 +205,46 @@ static inline struct cpuset *css_cs(struct cgroup_subsys_state *css) return css ? 
container_of(css, struct cpuset, css) : NULL; } +#ifdef CONFIG_GROUP_BALANCER +static inline struct cpuset *cgroup_cs(struct cgroup *cgrp) +{ + return container_of(global_cgroup_css(cgrp, cpuset_cgrp_id), + struct cpuset, css); +} + +struct cpumask *task_group_cpus_allowed(struct task_group *tg) +{ + struct cgroup *cg = tg_cgroup(tg); + struct cpuset *cs = cgroup_cs(cg); + + if (cs) + return (struct cpumask *)cs->cpus_allowed; + + return NULL; +} + +static void update_cpumask_for_group_balancer(struct cpuset *cs) +{ + struct cgroup *cg = cs->css.cgroup; + struct task_group *tg; + + if (!group_balancer_enabled()) + return; + + tg = cgroup_tg(cg); + if (!tg) + return; + if (!tg_group_balancer_enabled(tg)) + return; + + lock_cfs_constraints_mutex(); + tg_specs_change(tg); + unlock_cfs_constraints_mutex(); +} +#else +static inline void update_cpumask_for_group_balancer(struct cpuset *cs) { } +#endif + /* Retrieve the cpuset for a task */ static inline struct cpuset *task_cs(struct task_struct *task) { @@ -1498,6 +1538,7 @@ static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp) /* deleted = old - new = old & (~new) */ cpumask_andnot(&deleted, &old_cpus, tmp->new_cpus); cpuacct_cpuset_changed(cs->css.cgroup, &deleted, NULL); + update_cpumask_for_group_balancer(cs); /* * On legacy hierarchy, if the effective cpumask of any non- diff --git a/kernel/sched/core.c b/kernel/sched/core.c index fefc9e372b4d..5820be54e496 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -10138,6 +10138,16 @@ static int validate_group_balancer(struct task_group *tg) return retval; } +void lock_cfs_constraints_mutex(void) +{ + mutex_lock(&cfs_constraints_mutex); +} + +void unlock_cfs_constraints_mutex(void) +{ + mutex_unlock(&cfs_constraints_mutex); +} + static int cpu_group_balancer_write_u64(struct cgroup_subsys_state *css, struct cftype *cftype, u64 new) { diff --git a/kernel/sched/group_balancer.c b/kernel/sched/group_balancer.c index cbbe57b8aefe..fa1ffd3f6c18 100644 --- a/kernel/sched/group_balancer.c +++ b/kernel/sched/group_balancer.c @@ -8,6 +8,7 @@ #include "sched.h" #include #include +#include struct gb_lb_env { int src_cpu; @@ -302,6 +303,40 @@ static void add_to_size_level(struct group_balancer_sched_domain *gb_sd) __add_to_size_level(gb_sd, size_level); } +bool tg_group_balancer_enabled(struct task_group *tg) +{ + return tg->group_balancer; +} + +struct cgroup *tg_cgroup(struct task_group *tg) +{ + return tg->css.cgroup; +} + +#ifdef CONFIG_CPUSETS +static inline bool +gb_sd_satisfies_task_group(struct task_group *tg, struct group_balancer_sched_domain *gb_sd) +{ + struct cpumask *cpus_allowed = task_group_cpus_allowed(tg); + struct cpumask soft_cpus_allowed; + unsigned int soft_cpus_weight; + + if (!cpus_allowed) { + soft_cpus_weight = gb_sd->span_weight; + } else { + cpumask_and(&soft_cpus_allowed, cpus_allowed, gb_sd_span(gb_sd)); + soft_cpus_weight = cpumask_weight(&soft_cpus_allowed); + } + return tg->specs_ratio <= 100 * soft_cpus_weight; +} +#else +static inline bool +gb_sd_satisfies_task_group(struct task_group *tg, struct group_balancer_sched_domain *gb_sd) +{ + return true; +} +#endif + static int group_balancer_seqfile_show(struct seq_file *m, void *arg) { struct kernfs_open_file *of = m->private; @@ -1402,8 +1437,9 @@ static unsigned long gb_sd_capacity(struct group_balancer_sched_domain *gb_sd) return cap; } -static struct group_balancer_sched_domain *select_idle_gb_sd(int specs) +static struct group_balancer_sched_domain *select_idle_gb_sd(struct task_group *tg) { 
+ int specs = tg->specs_ratio; struct group_balancer_sched_domain *gb_sd, *child; if (specs == -1 || specs > group_balancer_root_domain->span_weight * 100) @@ -1418,7 +1454,7 @@ static struct group_balancer_sched_domain *select_idle_gb_sd(int specs) int max_unsatisfied_free_specs = INT_MIN; for_each_gb_sd_child(child, gb_sd) { - if (child->span_weight * 100 >= specs && + if (gb_sd_satisfies_task_group(tg, child) && child->free_tg_specs > max_free_specs) { max_free_child = child; max_free_specs = child->free_tg_specs; @@ -1460,7 +1496,7 @@ check_task_group_leap_level(struct task_group *tg, struct group_balancer_sched_d int specs = tg->specs_ratio; for_each_gb_sd_child(child, gb_sd) { - if (specs <= 100 * child->span_weight) { + if (gb_sd_satisfies_task_group(tg, child)) { tg->leap_level = true; tg->leap_level_timestamp = jiffies; return; @@ -1553,7 +1589,7 @@ int attach_tg_to_group_balancer_sched_domain(struct task_group *tg, read_lock(&group_balancer_sched_domain_lock); if (enable) - gb_sd = select_idle_gb_sd(tg->specs_ratio); + gb_sd = select_idle_gb_sd(tg); else gb_sd = target; if (!gb_sd) { @@ -1617,6 +1653,9 @@ static bool tg_lower_level(struct task_group *tg) total_cap += child_cap; tg_child_load = tg_gb_sd_load(tg, child); + tg_load += tg_child_load; + if (!gb_sd_satisfies_task_group(tg, child)) + continue; if (!dst || tg_child_load > tg_dst_load) { dst = child; tg_dst_load = tg_child_load; @@ -1630,12 +1669,11 @@ static bool tg_lower_level(struct task_group *tg) dst_cap = child_cap; } } - tg_load += tg_child_load; } if (tg_load == 0) goto fail; - if (tg->specs_ratio > 100 * dst->span_weight) + if (!dst) goto fail; #ifdef CONFIG_NUMA /* We won't allow a task group span more than two numa nodes too long. */ @@ -1737,7 +1775,7 @@ void tg_specs_change(struct task_group *tg) return; /* This gb_sd still satisfy, don't do anything. */ - if (specs <= gb_sd->span_weight * 100 || gb_sd == group_balancer_root_domain) + if (gb_sd_satisfies_task_group(tg, gb_sd) || gb_sd == group_balancer_root_domain) return; /* The specs doesn't satisfy anymore, upper to find a satisfied gb_sd. 
*/ @@ -1748,7 +1786,7 @@ void tg_specs_change(struct task_group *tg) for (; gb_sd; gb_sd = gb_sd->parent) { - if (specs <= gb_sd->span_weight * 100) + if (gb_sd_satisfies_task_group(tg, gb_sd)) break; } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 89cb253c8c9c..a1673af253ff 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -3713,7 +3713,6 @@ extern void sched_dynamic_update(int mode); #endif #ifdef CONFIG_GROUP_BALANCER -extern bool group_balancer_enabled(void); extern bool group_balancer_rq_enabled(struct rq *rq); static inline const struct cpumask *task_allowed_cpu(struct task_struct *p) { @@ -3738,11 +3737,6 @@ static inline void tg_inc_soft_cpus_version(struct task_group *tg) tg->soft_cpus_version = 0; } -static inline bool tg_group_balancer_enabled(struct task_group *tg) -{ - return tg->group_balancer; -} - extern void sched_init_group_balancer_sched_domains(void); extern void sched_clear_group_balancer_sched_domains(void); extern void tg_set_specs_ratio(struct task_group *tg); @@ -3751,7 +3745,6 @@ extern int attach_tg_to_group_balancer_sched_domain(struct task_group *tg, bool enable); extern void detach_tg_from_group_balancer_sched_domain(struct task_group *tg, bool disable); extern void update_group_balancer_root_cpumask(void); -extern void tg_specs_change(struct task_group *tg); extern unsigned long cfs_h_load(struct cfs_rq *cfs_rq); extern bool gb_cpu_overutilized(int cpu); extern void gb_load_balance(struct lb_env *env); @@ -3766,7 +3759,6 @@ static inline const struct cpumask *task_allowed_cpu(struct task_struct *p) } static inline void tg_set_specs_ratio(struct task_group *tg) { } static inline void update_group_balancer_root_cpumask(void) { } -static inline void tg_specs_change(struct task_group *tg) { } #ifdef CONFIG_SMP static inline void gb_load_balance(struct lb_env *env) { } #endif -- Gitee From abdff4ddc6c3bb03845a130a332433bb33afa4f7 Mon Sep 17 00:00:00 2001 From: CruzZhao Date: Tue, 11 Nov 2025 16:42:55 +0800 Subject: [PATCH 02/11] anolis: sched: support Group Balancer to be aware of cpu burst ANBZ: #8765 For burstable task groups, the soft_cpus need to burst as well, which is done by moving the task group to a higher level. So we introduce another rb_tree, burstable_task_groups, to queue burstable task groups, and if the interval between load balances is too short, we migrate only the burstable task groups.
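As a rough illustration (a simplified sketch of the helper added by this patch; the CONFIG_CFS_BANDWIDTH=n stub and locking are omitted), a task group is queued on one of the two trees of its gb_sd depending on whether it is burstable:

  /* Pick the rb_root a task group is queued on within its gb_sd. */
  static struct rb_root *gb_rb_root(struct task_group *tg,
                                    struct group_balancer_sched_domain *gb_sd)
  {
          /*
           * Burstable groups sit on a separate tree so that they can still
           * be migrated when the normal balance interval has not expired.
           */
          if (tg->cfs_bandwidth.burst)
                  return &gb_sd->burstable_task_groups;
          return &gb_sd->task_groups;
  }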
Signed-off-by: CruzZhao --- kernel/sched/core.c | 1 + kernel/sched/group_balancer.c | 273 ++++++++++++++++++++++++++-------- kernel/sched/sched.h | 3 + 3 files changed, 217 insertions(+), 60 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 5820be54e496..12a2cf71ee04 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -9592,6 +9592,7 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota, if (runtime_enabled && !runtime_was_enabled) cfs_bandwidth_usage_inc(); raw_spin_lock_irq(&cfs_b->lock); + tg_burst_change(tg, burst); cfs_b->period = ns_to_ktime(period); cfs_b->quota = quota; cfs_b->burst = burst; diff --git a/kernel/sched/group_balancer.c b/kernel/sched/group_balancer.c index fa1ffd3f6c18..2ffb1e23cb65 100644 --- a/kernel/sched/group_balancer.c +++ b/kernel/sched/group_balancer.c @@ -19,6 +19,9 @@ struct gb_lb_env { unsigned long nr_balance_failed; enum migration_type migration_type; struct rb_root task_groups; +#ifdef CONFIG_CFS_BANDWIDTH + bool burst; +#endif CK_KABI_RESERVE(1) CK_KABI_RESERVE(2) @@ -43,6 +46,10 @@ struct group_balancer_sched_domain { unsigned int depth; raw_spinlock_t lock; struct rb_root task_groups; +#ifdef CONFIG_CFS_BANDWIDTH + struct rb_root burstable_task_groups; + atomic_t h_nr_burst_tg; +#endif struct kernfs_node *kn; unsigned long last_balance_timestamp; unsigned long lower_interval; @@ -337,6 +344,79 @@ gb_sd_satisfies_task_group(struct task_group *tg, struct group_balancer_sched_do } #endif +#ifdef CONFIG_CFS_BANDWIDTH +static inline bool is_burstable_task_group(struct task_group *tg) +{ + return !!tg->cfs_bandwidth.burst; +} + +static inline struct rb_root +*gb_rb_root(struct task_group *tg, struct group_balancer_sched_domain *gb_sd) +{ + if (unlikely(is_burstable_task_group(tg))) + return &gb_sd->burstable_task_groups; + return &gb_sd->task_groups; +} +static inline void update_h_nr_burst_tg(struct task_group *tg, bool add) +{ + struct group_balancer_sched_domain *gb_sd = tg->gb_sd; + + if (!is_burstable_task_group(tg)) + return; + + for (; gb_sd; gb_sd = gb_sd->parent) { + if (add) + atomic_inc(&gb_sd->h_nr_burst_tg); + else + atomic_dec(&gb_sd->h_nr_burst_tg); + } +} + +static inline bool tg_specs_less(struct rb_node *a, const struct rb_node *b); +void tg_burst_change(struct task_group *tg, u64 burst) +{ + bool burst_before, burst_now; + struct group_balancer_sched_domain *gb_sd; + + if (!group_balancer_enabled()) + return; + if (!tg_group_balancer_enabled(tg)) + return; + + gb_sd = tg->gb_sd; + burst_before = !!tg->cfs_bandwidth.burst; + burst_now = !!burst; + if (burst_before == burst_now) + return; + + read_lock(&group_balancer_sched_domain_lock); + raw_spin_lock(&gb_sd->lock); + if (!burst_before) { + rb_erase(&tg->gb_node, &gb_sd->task_groups); + rb_add(&tg->gb_node, &gb_sd->burstable_task_groups, tg_specs_less); + update_h_nr_burst_tg(tg, true); + } else { + rb_erase(&tg->gb_node, &gb_sd->burstable_task_groups); + rb_add(&tg->gb_node, &gb_sd->task_groups, tg_specs_less); + update_h_nr_burst_tg(tg, false); + } + raw_spin_unlock(&gb_sd->lock); + read_unlock(&group_balancer_sched_domain_lock); +} +#else +static inline bool is_burstable_task_group(struct task_group *tg) +{ + return false; +} + +static inline rb_root *gb_rb_root(struct task_group *tg, struct group_balancer_sched_domain *gb_sd) +{ + return &gb_sd->task_groups; +} + +static inline void update_h_nr_burst_tg(struct task_group *tg, bool add) { } +#endif + static int group_balancer_seqfile_show(struct seq_file *m, void *arg) { 
struct kernfs_open_file *of = m->private; @@ -632,6 +712,7 @@ static inline struct group_balancer_sched_domain raw_spin_lock_init(&new->lock); new->task_groups = RB_ROOT; + new->burstable_task_groups = RB_ROOT; new->imbalance_pct = 117; return new; @@ -679,12 +760,10 @@ static void add_to_tree(struct group_balancer_sched_domain *gb_sd, } } -#define __node_2_task_group(n) rb_entry((n), struct task_group, gb_node) - static inline bool tg_specs_less(struct rb_node *a, const struct rb_node *b) { - struct task_group *tg_a = __node_2_task_group(a); - struct task_group *tg_b = __node_2_task_group(b); + struct task_group *tg_a = __gb_node_2_tg(a); + struct task_group *tg_b = __gb_node_2_tg(b); int specs_a = tg_a->specs_ratio; int specs_b = tg_b->specs_ratio; @@ -718,17 +797,31 @@ static void free_group_balancer_sched_domain(struct group_balancer_sched_domain struct task_group *tg; struct group_balancer_sched_domain *parent = gb_sd->parent; struct rb_node *node; - struct rb_root *root = &gb_sd->task_groups; + struct rb_root *roots[2] = { +#ifdef CONFIG_CFS_BANDWIDTH + &gb_sd->burstable_task_groups, +#else + NULL, +#endif + &gb_sd->task_groups, + }; + struct rb_root *root; + int i; if (parent) { parent->nr_children--; /* Move the task_groups to parent. */ - while (!RB_EMPTY_ROOT(root)) { - node = root->rb_node; - tg = __node_2_task_group(node); - rb_erase(node, root); - rb_add(node, &parent->task_groups, tg_specs_less); - walk_tg_tree_from(tg, tg_set_gb_tg_down, tg_nop, tg); + for (i = 0; i < 2; i++) { + root = roots[i]; + if (!root) + continue; + while (!RB_EMPTY_ROOT(root)) { + node = root->rb_node; + tg = __gb_node_2_tg(node); + rb_erase(node, root); + rb_add(node, &parent->task_groups, tg_specs_less); + walk_tg_tree_from(tg, tg_set_gb_tg_down, tg_nop, tg); + } } } @@ -1533,9 +1626,12 @@ void add_tg_to_group_balancer_sched_domain_locked(struct task_group *tg, struct group_balancer_sched_domain *gb_sd, bool enable) { - tg->gb_sd = gb_sd; - rb_add(&tg->gb_node, &gb_sd->task_groups, tg_specs_less); + struct rb_root *root; + tg->gb_sd = gb_sd; + root = gb_rb_root(tg, gb_sd); + rb_add(&tg->gb_node, root, tg_specs_less); + update_h_nr_burst_tg(tg, true); tg->soft_cpus_allowed_ptr = gb_sd_span(gb_sd); tg_inc_soft_cpus_version(tg); if (enable) @@ -1560,9 +1656,12 @@ remove_tg_from_group_balancer_sched_domain_locked(struct task_group *tg, struct group_balancer_sched_domain *gb_sd, bool disable) { - tg->gb_sd = NULL; - rb_erase(&tg->gb_node, &gb_sd->task_groups); + struct rb_root *root = gb_rb_root(tg, gb_sd); + + rb_erase(&tg->gb_node, root); RB_CLEAR_NODE(&tg->gb_node); + tg->gb_sd = NULL; + update_h_nr_burst_tg(tg, false); if (disable) walk_tg_tree_from(tg, tg_unset_gb_tg_down, tg_nop, NULL); } @@ -1846,49 +1945,75 @@ gb_detach_task_groups_from_gb_sd(struct gb_lb_env *gb_env, struct task_group *tg, *n; unsigned long load, util; int detached = 0; + struct rb_root *roots[2] = { +#ifdef CONFIG_CFS_BANDWIDTH + &gb_sd->burstable_task_groups, +#else + NULL, +#endif + &gb_sd->task_groups, + }; + int i, max_idx = 1; + struct rb_root *root; raw_spin_lock(&gb_sd->lock); - /* Try the task cgroups with little specs first. */ - gb_for_each_tg_safe(tg, n, &gb_sd->task_groups) { - if (!time_after(jiffies, tg->adjust_level_timestamp + 2 * gb_sd->lower_interval)) - continue; - switch (gb_env->migration_type) { -#ifdef CONFIG_GROUP_IDENTITY - case migrate_identity: - fallthrough; +#ifdef CONFIG_CFS_BANDWIDTH + /* + * When burst if true, the interval of load balance is too short, + * we migrate burst task groups only. 
+ */ + if (gb_env->burst) + max_idx = 0; #endif - case migrate_load: - load = tg_gb_sd_load(tg, gb_sd); - if (load == 0) - continue; - if (shr_bound(load, gb_env->nr_balance_failed) > gb_env->imbalance) - continue; - gb_env->imbalance -= load; - break; - case migrate_util: - util = tg_gb_sd_util(tg, gb_sd); - if (util == 0) - continue; - if (shr_bound(util, gb_env->nr_balance_failed) > gb_env->imbalance) + for (i = 0; i <= max_idx; i++) { + root = roots[i]; + if (!root) + continue; + if (gb_env->burst && i == 1) + continue; + /* Try the task cgroups with little specs first. */ + gb_for_each_tg_safe(tg, n, root) { + if (i > 0 && !time_after(jiffies, + tg->adjust_level_timestamp + 2 * gb_sd->lower_interval)) continue; - gb_env->imbalance -= util; - break; - case migrate_task: - gb_env->imbalance = 0; - break; - /*TODO: Perfect strategy of migrate_misfit*/ - case migrate_misfit: - gb_env->imbalance = 0; - break; - default: - break; - } - remove_tg_from_group_balancer_sched_domain_locked(tg, gb_sd, false); - rb_add(&tg->gb_node, &gb_env->task_groups, tg_specs_less); - detached++; - if (gb_env->imbalance <= 0) { - raw_spin_unlock(&gb_sd->lock); - return detached; + switch (gb_env->migration_type) { + #ifdef CONFIG_GROUP_IDENTITY + case migrate_identity: + fallthrough; + #endif + case migrate_load: + load = tg_gb_sd_load(tg, gb_sd); + if (load == 0) + continue; + if (shr_bound(load, gb_env->nr_balance_failed) > gb_env->imbalance) + continue; + gb_env->imbalance -= load; + break; + case migrate_util: + util = tg_gb_sd_util(tg, gb_sd); + if (util == 0) + continue; + if (shr_bound(util, gb_env->nr_balance_failed) > gb_env->imbalance) + continue; + gb_env->imbalance -= util; + break; + case migrate_task: + gb_env->imbalance = 0; + break; + /*TODO: Perfect strategy of migrate_misfit*/ + case migrate_misfit: + gb_env->imbalance = 0; + break; + default: + break; + } + remove_tg_from_group_balancer_sched_domain_locked(tg, gb_sd, false); + rb_add(&tg->gb_node, &gb_env->task_groups, tg_specs_less); + detached++; + if (gb_env->imbalance <= 0) { + raw_spin_unlock(&gb_sd->lock); + return detached; + } } } raw_spin_unlock(&gb_sd->lock); @@ -1976,6 +2101,9 @@ void gb_load_balance(struct lb_env *env) int gb_sd_status = 0; struct cpumask *gb_mask = this_cpu_cpumask_var_ptr(group_balancer_mask); unsigned long src_load, src_cap, dst_load, dst_cap; +#ifdef CONFIG_CFS_BANDWIDTH + bool burst = false; +#endif if (!group_balancer_enabled()) return; @@ -1999,8 +2127,14 @@ void gb_load_balance(struct lb_env *env) if (!gb_sd) goto unlock; - if (!time_after(jiffies, gb_sd->last_balance_timestamp + 2 * gb_sd->lower_interval)) - goto unlock; + if (!time_after(jiffies, gb_sd->last_balance_timestamp + 2 * gb_sd->lower_interval)) { +#ifdef CONFIG_CFS_BANDWIDTH + if (atomic_read(&dst->h_nr_burst_tg)) + burst = true; + else +#endif + goto unlock; + } src_load = gb_sd_load(src); src_cap = gb_sd_capacity(src); @@ -2019,6 +2153,9 @@ void gb_load_balance(struct lb_env *env) .imbalance = env->imbalance, .nr_balance_failed = env->sd->nr_balance_failed, .task_groups = RB_ROOT, +#ifdef CONFIG_CFS_BANDWIDTH + .burst = burst, +#endif }; /* @@ -2026,10 +2163,26 @@ void gb_load_balance(struct lb_env *env) * and we don't migrate tg in this case. 
*/ for (parent = gb_sd; parent; parent = parent->parent) { - for (node = rb_first(&parent->task_groups); node; node = rb_next(node)) { - tg = __node_2_task_group(node); - if (tg->cfs_rq[env->src_cpu]->h_nr_running) - goto unlock; + struct rb_root *roots[2] = { +#ifdef CONFIG_CFS_BANDWIDTH + &gb_sd->burstable_task_groups, +#else + NULL, +#endif + &gb_sd->task_groups, + }; + struct rb_root *root; + int i; + + for (i = 0; i < 2; i++) { + root = roots[i]; + if (!root) + continue; + for (node = rb_first(root); node; node = rb_next(node)) { + tg = __gb_node_2_tg(node); + if (tg->cfs_rq[env->src_cpu]->h_nr_running) + goto unlock; + } } } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index a1673af253ff..b7ed5e0359ed 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -3751,6 +3751,9 @@ extern void gb_load_balance(struct lb_env *env); extern void task_tick_gb(struct task_struct *p); extern void util_est_reenqueue_all(void); extern void util_est_clear_all(void); +#ifdef CONFIG_CFS_BANDWIDTH +extern void tg_burst_change(struct task_group *tg, u64 burst); +#endif #else static inline bool group_balancer_rq_enabled(struct rq *rq) { return false; } static inline const struct cpumask *task_allowed_cpu(struct task_struct *p) -- Gitee From 296fcb6cbbd11db0b1e8f44632e1f04f3b107e4d Mon Sep 17 00:00:00 2001 From: CruzZhao Date: Tue, 11 Nov 2025 20:47:57 +0800 Subject: [PATCH 03/11] anolis: sched: record preferred gb_sd for task group ANBZ: #8765 If a group balancer sched domain just satisfies a task group, record it as preferred_gb_sd, and when the task group lower level after upper level, consider the preferred_gb_sd first. If the task group stays in the upper level too long, make the preferred_gb_sd expire. Signed-off-by: CruzZhao --- include/linux/sched/sysctl.h | 1 + kernel/sched/core.c | 2 ++ kernel/sched/group_balancer.c | 34 +++++++++++++++++++++++++++++++++- kernel/sched/sched.h | 2 ++ kernel/sysctl.c | 9 +++++++++ 5 files changed, 47 insertions(+), 1 deletion(-) diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h index 536765522aec..c4991a577807 100644 --- a/include/linux/sched/sysctl.h +++ b/include/linux/sched/sysctl.h @@ -131,6 +131,7 @@ extern int sched_acpu_enable_handler(struct ctl_table *table, int write, #endif #ifdef CONFIG_GROUP_BALANCER extern unsigned int sysctl_sched_group_balancer_enabled; +extern unsigned long sysctl_sched_gb_expiration_ms; extern int sched_group_balancer_enable_handler(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 12a2cf71ee04..c06619b98f99 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -9013,6 +9013,8 @@ struct task_group *sched_create_group(struct task_group *parent) tg->group_balancer = 0; tg->soft_cpus_version = 0; tg->gb_sd = NULL; + tg->preferred_gb_sd = NULL; + tg->expiration_start = 0; raw_spin_lock_init(&tg->gb_lock); #endif return tg; diff --git a/kernel/sched/group_balancer.c b/kernel/sched/group_balancer.c index 2ffb1e23cb65..d904ceb77669 100644 --- a/kernel/sched/group_balancer.c +++ b/kernel/sched/group_balancer.c @@ -276,6 +276,13 @@ struct group_balancer_sched_domain *group_balancer_root_domain; #define GB_OVERLOAD 0x1 #define GB_OVERUTILIZED 0x2 +/* + * The time threshold that the preferred gb_sd expires. 
+ * Unit: ms + * Default: 60000 + */ +unsigned long sysctl_sched_gb_expiration_ms = 60000; + static inline struct cpumask *gb_sd_span(struct group_balancer_sched_domain *gb_sd) { return to_cpumask(gb_sd->span); } @@ -417,6 +424,17 @@ static inline rb_root *gb_rb_root(struct task_group *tg, struct group_balancer_s static inline void update_h_nr_burst_tg(struct task_group *tg, bool add) { } #endif +static inline bool +is_preferred_gb_sd(struct task_group *tg, struct group_balancer_sched_domain *gb_sd) +{ + struct group_balancer_sched_domain *p_gb_sd = tg->preferred_gb_sd; + + if (!p_gb_sd) + return true; + + return cpumask_subset(gb_sd_span(p_gb_sd), gb_sd_span(gb_sd)); +} + static int group_balancer_seqfile_show(struct seq_file *m, void *arg) { struct kernfs_open_file *of = m->private; @@ -1596,6 +1614,7 @@ check_task_group_leap_level(struct task_group *tg, struct group_balancer_sched_d } } + tg->preferred_gb_sd = gb_sd; tg->leap_level = false; } @@ -1774,6 +1793,16 @@ static bool tg_lower_level(struct task_group *tg) goto fail; if (!dst) goto fail; + if (!is_preferred_gb_sd(tg, gb_sd)) { + /* + * If the task group stays in the upper level for too long, + * make the preferred gb_sd expire. + */ + if (!time_after(jiffies, + tg->expiration_start + msecs_to_jiffies(sysctl_sched_gb_expiration_ms))) + goto fail; + tg->preferred_gb_sd = NULL; + } #ifdef CONFIG_NUMA /* We won't allow a task group span more than two numa nodes too long. */ if (dst->gb_flags & GROUP_BALANCER_NUMA_FLAG) goto lower; #endif /* If we lower the level, we have to make sure that we will not cause imbalance. * * src_load dst_load @@ -1870,8 +1899,10 @@ void tg_specs_change(struct task_group *tg) /* If the task group leaps level after specs change, we will lower it later. */ check_task_group_leap_level(tg, gb_sd); - if (tg->leap_level) + if (tg->leap_level) { + tg->preferred_gb_sd = NULL; return; + } /* This gb_sd still satisfy, don't do anything.
*/ if (gb_sd_satisfies_task_group(tg, gb_sd) || gb_sd == group_balancer_root_domain) @@ -2008,6 +2039,7 @@ gb_detach_task_groups_from_gb_sd(struct gb_lb_env *gb_env, break; } remove_tg_from_group_balancer_sched_domain_locked(tg, gb_sd, false); + tg->expiration_start = jiffies; rb_add(&tg->gb_node, &gb_env->task_groups, tg_specs_less); detached++; if (gb_env->imbalance <= 0) { diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index b7ed5e0359ed..c064862c3a8e 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -618,11 +618,13 @@ struct task_group { int specs_ratio; struct rb_node gb_node; struct group_balancer_sched_domain *gb_sd; + struct group_balancer_sched_domain *preferred_gb_sd; struct task_group *gb_tg; bool group_balancer; bool leap_level; unsigned long leap_level_timestamp; unsigned long adjust_level_timestamp; + unsigned long expiration_start; raw_spinlock_t gb_lock; #endif long priority; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index ff72c63f6129..fc0da990ae37 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -2116,6 +2116,15 @@ static struct ctl_table kern_table[] = { .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, + { + .procname = "sched_gb_expiration_ms", + .data = &sysctl_sched_gb_expiration_ms, + .maxlen = sizeof(unsigned long), + .mode = 0644, + .proc_handler = proc_doulongvec_minmax, + .extra1 = &zero_ul, + .extra2 = &long_max, + }, #endif #ifdef CONFIG_PROVE_LOCKING { -- Gitee From 0548109dfbb868c9ac2fa78f7054e38ce292d98f Mon Sep 17 00:00:00 2001 From: CruzZhao Date: Tue, 11 Nov 2025 20:50:58 +0800 Subject: [PATCH 04/11] anolis: sched: force to lower level if a taskgroup spans LLC ANBZ: #8765 Force to lower level if a taskgroup spans LLC instead of NUMA. Signed-off-by: CruzZhao --- kernel/sched/group_balancer.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/kernel/sched/group_balancer.c b/kernel/sched/group_balancer.c index d904ceb77669..44225b259e83 100644 --- a/kernel/sched/group_balancer.c +++ b/kernel/sched/group_balancer.c @@ -1803,11 +1803,11 @@ static bool tg_lower_level(struct task_group *tg) goto fail; tg->preferred_gb_sd = NULL; } -#ifdef CONFIG_NUMA + /* We won't allow a task group span more than two numa nodes too long. */ - if (dst->gb_flags & GROUP_BALANCER_NUMA_FLAG) + if (dst->gb_flags & GROUP_BALANCER_LLC_FLAG) goto lower; -#endif + /* If we lower the level, we have to make sure that we will not cause imbalance. * * src_load dst_load -- Gitee From a928c3f8c638c51334d651fad2328ea6e4a24fa0 Mon Sep 17 00:00:00 2001 From: CruzZhao Date: Mon, 10 Nov 2025 11:23:32 +0800 Subject: [PATCH 05/11] anolis: sched: introduce rq->nr_gb_running ANBZ: #8765 Introduce rq->nr_gb_running to indicate how many tasks in rq are scheduled by group balancer. 
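As a rough sketch (simplified from the hooks added in this patch, which are called from enqueue_task_fair()/dequeue_task_fair()), the counter is only touched for task groups managed by the group balancer:

  static inline void gb_update_nr_running(struct task_group *tg,
                                          struct rq *rq, int delta)
  {
          if (!group_balancer_enabled())
                  return;
          if (!tg || !tg_group_balancer_enabled(tg))
                  return;
          /* delta is +1 on enqueue and -1 on dequeue. */
          rq->nr_gb_running += delta;
  }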
Signed-off-by: CruzZhao --- kernel/sched/core.c | 45 ++----------------------- kernel/sched/fair.c | 80 ++++++++++++++++++++++++++++++++++++++++++++ kernel/sched/sched.h | 16 +++++++++ 3 files changed, 99 insertions(+), 42 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index c06619b98f99..6dbe01ba4c4e 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -10109,38 +10109,6 @@ static u64 cpu_group_balancer_read_u64(struct cgroup_subsys_state *css, return tg->group_balancer; } -static int tg_validate_group_balancer_down(struct task_group *tg, void *data) -{ - if (tg->group_balancer) - return -EINVAL; - return 0; -} - -/* - * There is only one task group allowed to enable group balancer in the path from - * root_task_group to a certion leaf task group. - */ -static int validate_group_balancer(struct task_group *tg) -{ - int retval = 0; - - rcu_read_lock(); - retval = walk_tg_tree_from(tg, tg_validate_group_balancer_down, - tg_nop, NULL); - if (retval) - goto out; - - for (; tg != &root_task_group; tg = tg->parent) { - if (tg->group_balancer) { - retval = -EINVAL; - break; - } - } -out: - rcu_read_unlock(); - return retval; -} - void lock_cfs_constraints_mutex(void) { mutex_lock(&cfs_constraints_mutex); @@ -10174,16 +10142,9 @@ static int cpu_group_balancer_write_u64(struct cgroup_subsys_state *css, if (old == new) goto out; - if (new) { - retval = validate_group_balancer(tg); - if (retval) - goto out; - retval = attach_tg_to_group_balancer_sched_domain(tg, NULL, true); - if (retval) - goto out; - } else { - detach_tg_from_group_balancer_sched_domain(tg, true); - } + retval = update_group_balancer(tg, new); + if (retval) + goto out; tg->group_balancer = new; out: raw_spin_unlock(&tg->gb_lock); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index ca96537f9e46..a18bdc61b105 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -8571,6 +8571,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) /* At this point se is NULL and we are at root level*/ add_nr_running(rq, 1); id_update_nr_running(task_group(p), p, rq, 1); + gb_update_nr_running(task_group(p), rq, 1); /* * Since new tasks are assigned an initial util_avg equal to @@ -8697,6 +8698,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) /* At this point se is NULL and we are at root level*/ sub_nr_running(rq, 1); id_update_nr_running(task_group(p), p, rq, -1); + gb_update_nr_running(task_group(p), rq, -1); /* balance early to pull high priority tasks */ if (unlikely(!was_sched_idle && sched_idle_rq(rq))) @@ -15302,3 +15304,81 @@ int sched_trace_rq_nr_running(struct rq *rq) return rq ? rq->nr_running : -1; } EXPORT_SYMBOL_GPL(sched_trace_rq_nr_running); + +#ifdef CONFIG_GROUP_BALANCER +static int tg_validate_group_balancer_down(struct task_group *tg, void *data) +{ + if (tg->group_balancer) + return -EINVAL; + return 0; +} + +/* + * There is only one task group allowed to enable group balancer in the path from + * root_task_group to a certion leaf task group. 
+ */ +static int validate_group_balancer(struct task_group *tg) +{ + int retval = 0; + + rcu_read_lock(); + retval = walk_tg_tree_from(tg, tg_validate_group_balancer_down, + tg_nop, NULL); + if (retval) + goto out; + + for (; tg != &root_task_group; tg = tg->parent) { + if (tg->group_balancer) { + retval = -EINVAL; + break; + } + } +out: + rcu_read_unlock(); + return retval; +} + +int update_group_balancer(struct task_group *tg, u64 new) +{ + int cpu, retval; + struct rq_flags rf; + unsigned int delta; + + if (new) { + retval = validate_group_balancer(tg); + if (retval) + return retval; + retval = attach_tg_to_group_balancer_sched_domain(tg, NULL, true); + if (retval) + return retval; + } else { + detach_tg_from_group_balancer_sched_domain(tg, true); + } + + cpus_read_lock(); + for_each_online_cpu(cpu) { + bool on_rq, throttled; + struct rq *rq = cpu_rq(cpu); + struct cfs_rq *cfs_rq; + struct sched_entity *se; + + rq_lock_irq(rq, &rf); + se = tg->se[cpu]; + cfs_rq = cfs_rq_of(se); + throttled = throttled_hierarchy(cfs_rq); + delta = se->my_q->h_nr_running; + on_rq = se->on_rq; + + if (on_rq && !throttled) { + if (new) + rq->nr_gb_running += delta; + else + rq->nr_gb_running -= delta; + } + rq_unlock_irq(rq, &rf); + } + cpus_read_unlock(); + + return 0; +} +#endif diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index c064862c3a8e..4193bfd0a92a 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1549,6 +1549,8 @@ struct rq { #ifdef CONFIG_GROUP_BALANCER struct group_balancer_sched_domain *gb_sd; + unsigned int nr_gb_running; + long nr_gb_make_up; bool group_balancer_enabled; #endif @@ -2967,6 +2969,20 @@ static inline void sub_nr_running(struct rq *rq, unsigned count) sched_update_tick_dependency(rq); } +#ifdef CONFIG_GROUP_BALANCER +static inline void gb_update_nr_running(struct task_group *tg, struct rq *rq, int delta) +{ + if (!group_balancer_enabled()) + return; + if (!tg || !tg_group_balancer_enabled(tg)) + return; + rq->nr_gb_running += delta; +} +extern int update_group_balancer(struct task_group *tg, u64 new); +#else +static inline void gb_update_nr_running(struct task_group *tg, struct rq *rq, int delta) { } +#endif + extern void activate_task(struct rq *rq, struct task_struct *p, int flags); extern void deactivate_task(struct rq *rq, struct task_struct *p, int flags); -- Gitee From d0394a49e345ebfa063bce4a1b91b997d0a7face Mon Sep 17 00:00:00 2001 From: CruzZhao Date: Wed, 12 Nov 2025 16:18:23 +0800 Subject: [PATCH 06/11] anolis: sched: correct free specs account ANBZ: #8765 When the quota of the task group changes, update the free specs of gb_sd. Change the type free_tg_specs into atomic_t to reduce spin lock competition. When select a idle gb_sd for a task group, hold a mutex lock to avoid competition with other task groups. 
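Roughly, the accounting becomes a lockless walk up the hierarchy (a simplified sketch of the reworked helper; callers charge -specs_ratio on attach and +specs_ratio on detach, and skip task groups whose specs_ratio is -1):

  static void update_free_tg_specs(struct group_balancer_sched_domain *gb_sd,
                                   int specs)
  {
          struct group_balancer_sched_domain *parent;

          /* Propagate the delta to gb_sd and all of its ancestors. */
          for (parent = gb_sd; parent; parent = parent->parent)
                  atomic_add(specs, &parent->free_tg_specs);
  }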
Fixes: 7f8e0c71335f ("anolis: sched: maintain group balancer task groups") Signed-off-by: CruzZhao --- include/linux/sched.h | 3 ++- kernel/cgroup/cpuset.c | 2 +- kernel/sched/fair.c | 20 ++++++++------ kernel/sched/group_balancer.c | 51 +++++++++++++++++++++-------------- 4 files changed, 46 insertions(+), 30 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index 89913709a766..48fdda47b88d 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2427,7 +2427,8 @@ static inline bool jbd2_proxy_exec_disabled(void) } #ifdef CONFIG_GROUP_BALANCER extern bool group_balancer_enabled(void); -extern void tg_specs_change(struct task_group *tg); +extern int get_tg_specs(struct task_group *tg); +extern void tg_specs_change(struct task_group *tg, u64 specs_before); extern bool tg_group_balancer_enabled(struct task_group *tg); extern struct task_group *cgroup_tg(struct cgroup *cgrp); extern struct cgroup *tg_cgroup(struct task_group *tg); diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index bb9da7a87371..d48296132e71 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -238,7 +238,7 @@ static void update_cpumask_for_group_balancer(struct cpuset *cs) return; lock_cfs_constraints_mutex(); - tg_specs_change(tg); + tg_specs_change(tg, get_tg_specs(tg)); unlock_cfs_constraints_mutex(); } #else diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index a18bdc61b105..605af34adad7 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -14734,19 +14734,18 @@ void free_fair_sched_group(struct task_group *tg) void tg_set_specs_ratio(struct task_group *tg) { u64 quota = tg_cfs_bandwidth(tg)->hierarchical_quota; - u64 specs_ratio; + u64 specs_ratio, specs_before; + specs_before = tg->specs_ratio; if (quota == RUNTIME_INF) { tg->specs_ratio = -1; - return; + } else { + specs_ratio = quota / ((1 << BW_SHIFT) / 100); + /* If specs_ratio is bigger than INT_MAX, set specs_ratio -1. */ + tg->specs_ratio = specs_ratio > INT_MAX ? -1 : specs_ratio; } - - specs_ratio = quota / ((1 << BW_SHIFT) / 100); - - /* If specs_ratio is bigger than INT_MAX, set specs_ratio -1. */ - tg->specs_ratio = specs_ratio > INT_MAX ? -1 : specs_ratio; if (tg->group_balancer) - tg_specs_change(tg); + tg_specs_change(tg, specs_before); } #endif @@ -15381,4 +15380,9 @@ int update_group_balancer(struct task_group *tg, u64 new) return 0; } + +int get_tg_specs(struct task_group *tg) +{ + return tg->specs_ratio; +} #endif diff --git a/kernel/sched/group_balancer.c b/kernel/sched/group_balancer.c index 44225b259e83..6f536172152c 100644 --- a/kernel/sched/group_balancer.c +++ b/kernel/sched/group_balancer.c @@ -42,7 +42,7 @@ struct group_balancer_sched_domain { unsigned int span_weight; unsigned int nr_children; /* If free_tg_specs is less than zero, the gb_sd is overloaded. 
*/ - int free_tg_specs; + atomic_t free_tg_specs; unsigned int depth; raw_spinlock_t lock; struct rb_root task_groups; @@ -153,6 +153,7 @@ struct group_balancer_size_level { LIST_HEAD(group_balancer_sched_domains); DEFINE_RWLOCK(group_balancer_sched_domain_lock); +DEFINE_MUTEX(group_balancer_select_lock); struct cpumask root_cpumask; @@ -767,7 +768,7 @@ static void add_to_tree(struct group_balancer_sched_domain *gb_sd, } gb_sd->span_weight = cpumask_weight(gb_sd_span(gb_sd)); gb_sd->lower_interval = ilog2(gb_sd->span_weight) * gb_sd->span_weight; - gb_sd->free_tg_specs = 100 * gb_sd->span_weight; + atomic_set(&gb_sd->free_tg_specs, 100 * gb_sd->span_weight); add_to_size_level(gb_sd); if (!gb_sd->nr_children) { @@ -1176,8 +1177,8 @@ static int build_group_balancer_sched_domains(void) group_balancer_root_domain->lower_interval = ilog2(group_balancer_root_domain->span_weight) * group_balancer_root_domain->span_weight; - group_balancer_root_domain->free_tg_specs = - 100 * group_balancer_root_domain->span_weight; + atomic_set(&group_balancer_root_domain->free_tg_specs, + 100 * group_balancer_root_domain->span_weight); } if (!zalloc_cpumask_var(&trial_cpumask, GFP_KERNEL)) { @@ -1565,14 +1566,16 @@ static struct group_balancer_sched_domain *select_idle_gb_sd(struct task_group * int max_unsatisfied_free_specs = INT_MIN; for_each_gb_sd_child(child, gb_sd) { + int free_tg_specs = atomic_read(&child->free_tg_specs); + if (gb_sd_satisfies_task_group(tg, child) && - child->free_tg_specs > max_free_specs) { + free_tg_specs > max_free_specs) { max_free_child = child; - max_free_specs = child->free_tg_specs; + max_free_specs = free_tg_specs; } else if (child->span_weight * 100 < specs && - child->free_tg_specs > max_unsatisfied_free_specs) { + free_tg_specs > max_unsatisfied_free_specs) { max_unsatisfied_free_child = child; - max_unsatisfied_free_specs = child->free_tg_specs; + max_unsatisfied_free_specs = free_tg_specs; } } if (!max_free_child) @@ -1622,13 +1625,8 @@ void update_free_tg_specs(struct group_balancer_sched_domain *gb_sd, int specs) { struct group_balancer_sched_domain *parent; - if (specs != -1) { - for (parent = gb_sd; parent; parent = parent->parent) { - raw_spin_lock(&parent->lock); - parent->free_tg_specs += specs; - raw_spin_unlock(&parent->lock); - } - } + for (parent = gb_sd; parent; parent = parent->parent) + atomic_add(specs, &parent->free_tg_specs); } /* @@ -1658,6 +1656,8 @@ void add_tg_to_group_balancer_sched_domain_locked(struct task_group *tg, check_task_group_leap_level(tg, gb_sd); tg->adjust_level_timestamp = jiffies; + if (tg->specs_ratio != -1) + update_free_tg_specs(gb_sd, -tg->specs_ratio); } void add_tg_to_group_balancer_sched_domain(struct task_group *tg, @@ -1667,7 +1667,6 @@ void add_tg_to_group_balancer_sched_domain(struct task_group *tg, raw_spin_lock(&gb_sd->lock); add_tg_to_group_balancer_sched_domain_locked(tg, gb_sd, enable); raw_spin_unlock(&gb_sd->lock); - update_free_tg_specs(gb_sd, -tg->specs_ratio); } static void @@ -1683,6 +1682,8 @@ remove_tg_from_group_balancer_sched_domain_locked(struct task_group *tg, update_h_nr_burst_tg(tg, false); if (disable) walk_tg_tree_from(tg, tg_unset_gb_tg_down, tg_nop, NULL); + if (tg->specs_ratio != -1) + update_free_tg_specs(gb_sd, tg->specs_ratio); } static void @@ -1694,7 +1695,6 @@ remove_tg_from_group_balancer_sched_domain(struct task_group *tg, raw_spin_lock(&gb_sd->lock); remove_tg_from_group_balancer_sched_domain_locked(tg, gb_sd, disable); raw_spin_unlock(&gb_sd->lock); - update_free_tg_specs(gb_sd, 
tg->specs_ratio); read_unlock(&group_balancer_sched_domain_lock); } @@ -1706,16 +1706,20 @@ int attach_tg_to_group_balancer_sched_domain(struct task_group *tg, int ret = 0; read_lock(&group_balancer_sched_domain_lock); - if (enable) + if (enable) { + mutex_lock(&group_balancer_select_lock); gb_sd = select_idle_gb_sd(tg); - else + } else { gb_sd = target; + } if (!gb_sd) { ret = -ESRCH; goto out; } add_tg_to_group_balancer_sched_domain(tg, gb_sd, enable); out: + if (enable) + mutex_unlock(&group_balancer_select_lock); read_unlock(&group_balancer_sched_domain_lock); return ret; } @@ -1887,7 +1891,7 @@ void task_tick_gb(struct task_struct *p) raw_spin_unlock(&tg->gb_lock); } -void tg_specs_change(struct task_group *tg) +void tg_specs_change(struct task_group *tg, u64 specs_before) { struct group_balancer_sched_domain *gb_sd; int specs = tg->specs_ratio; @@ -1897,6 +1901,13 @@ void tg_specs_change(struct task_group *tg) /* tg->group_balancer is always true here, so find a gb_sd to attach. */ goto upper; + if (specs_before != specs) { + if (specs_before != -1) + update_free_tg_specs(gb_sd, specs_before); + if (specs != -1) + update_free_tg_specs(gb_sd, -specs); + } + /* If the task group leaps level after specs change, we will lower it later. */ check_task_group_leap_level(tg, gb_sd); if (tg->leap_level) { -- Gitee From 77f966fcfc58900165d9fa5960c5236b5b760413 Mon Sep 17 00:00:00 2001 From: CruzZhao Date: Thu, 13 Nov 2025 11:06:45 +0800 Subject: [PATCH 07/11] anolis: sched: fix some bugs of group balancer ANBZ: #8765 When reference max_unsatisfied_free_child, check whether it's NULL. Init dst as NULL in tg_lower_level() to avoid invalid pointer reference. Check whether the dst is leap level instead of parent in tg_lower_level(). Fixes: 7f8e0c71335f ("anolis: sched: maintain group balancer task groups") Fixes: bebcfc550d82 ("anolis: sched: introduce dynamical load balance for group balancer") Signed-off-by: CruzZhao --- kernel/sched/group_balancer.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/kernel/sched/group_balancer.c b/kernel/sched/group_balancer.c index 6f536172152c..37dc25a73261 100644 --- a/kernel/sched/group_balancer.c +++ b/kernel/sched/group_balancer.c @@ -1593,9 +1593,9 @@ static struct group_balancer_sched_domain *select_idle_gb_sd(struct task_group * * specs cannot fully represent the degree of idleness if the span weight is * different. */ - if (max_free_specs < specs && + if (max_free_specs < specs && (!max_unsatisfied_free_child || max_free_specs / max_free_child->span_weight < - max_unsatisfied_free_specs / max_unsatisfied_free_child->span_weight) + max_unsatisfied_free_specs / max_unsatisfied_free_child->span_weight)) break; gb_sd = max_free_child; } @@ -1744,7 +1744,7 @@ static void tg_upper_level(struct task_group *tg, struct group_balancer_sched_do static bool tg_lower_level(struct task_group *tg) { struct group_balancer_sched_domain *gb_sd = tg->gb_sd; - struct group_balancer_sched_domain *child, *dst; + struct group_balancer_sched_domain *child, *dst = NULL; unsigned long tg_child_load, tg_load = 0, tg_dst_load = 0; unsigned long child_load, src_load, dst_load, total_load = 0, migrate_load; unsigned long child_cap, total_cap = 0, src_cap, dst_cap = 0; @@ -1838,7 +1838,7 @@ static bool tg_lower_level(struct task_group *tg) detach_tg_from_group_balancer_sched_domain(tg, false); attach_tg_to_group_balancer_sched_domain(tg, dst, false); /* The task group maybe still leap level, check it. 
*/ - check_task_group_leap_level(tg, gb_sd); + check_task_group_leap_level(tg, dst); return true; fail: -- Gitee From 9be57cebd63e21190564e69e0e2b638cd519a72f Mon Sep 17 00:00:00 2001 From: CruzZhao Date: Thu, 13 Nov 2025 18:48:34 +0800 Subject: [PATCH 08/11] anolis: sched: optimize load balance with nr_gb_running ANBZ: #8765 When judge whether there is overload in a group balancer sched domain, we'd better compare the sum of rq->nr_gb_running with gb_sd->span_weight, where rq->nr_gb_running indicates the running tasks in group balancer task groups. If the lower of the task group won't cause overload, just lower it. Signed-off-by: CruzZhao --- kernel/sched/group_balancer.c | 64 ++++++++++++++++++++++++++++++----- 1 file changed, 56 insertions(+), 8 deletions(-) diff --git a/kernel/sched/group_balancer.c b/kernel/sched/group_balancer.c index 37dc25a73261..37033980c716 100644 --- a/kernel/sched/group_balancer.c +++ b/kernel/sched/group_balancer.c @@ -1549,6 +1549,32 @@ static unsigned long gb_sd_capacity(struct group_balancer_sched_domain *gb_sd) return cap; } +static unsigned int gb_sd_nr_running(struct group_balancer_sched_domain *gb_sd) +{ + int cpu; + int nr_running = 0; + + for_each_cpu(cpu, gb_sd_span(gb_sd)) + nr_running += cpu_rq(cpu)->nr_gb_running; + + return nr_running; +} + +static unsigned int +tg_gb_sd_nr_running(struct task_group *tg, struct group_balancer_sched_domain *gb_sd) +{ + int cpu; + int nr_running = 0; + struct cfs_rq *cfs_rq; + + for_each_cpu(cpu, gb_sd_span(gb_sd)) { + cfs_rq = tg->cfs_rq[cpu]; + nr_running += cfs_rq->h_nr_running; + } + + return nr_running; +} + static struct group_balancer_sched_domain *select_idle_gb_sd(struct task_group *tg) { int specs = tg->specs_ratio; @@ -1748,6 +1774,8 @@ static bool tg_lower_level(struct task_group *tg) unsigned long tg_child_load, tg_load = 0, tg_dst_load = 0; unsigned long child_load, src_load, dst_load, total_load = 0, migrate_load; unsigned long child_cap, total_cap = 0, src_cap, dst_cap = 0; + unsigned int child_nr_running, dst_nr_running = 0, tg_child_nr_running; + unsigned int tg_nr_running = 0, tg_dst_nr_running = 0, migrate_nr_running; unsigned long src_imb, dst_imb; if (!gb_sd) @@ -1770,12 +1798,15 @@ static bool tg_lower_level(struct task_group *tg) for_each_gb_sd_child(child, gb_sd) { child_load = gb_sd_load(child); total_load += child_load; + child_nr_running = gb_sd_nr_running(child); child_cap = gb_sd_capacity(child); total_cap += child_cap; tg_child_load = tg_gb_sd_load(tg, child); tg_load += tg_child_load; + tg_child_nr_running = tg_gb_sd_nr_running(tg, child); + tg_nr_running += tg_child_nr_running; if (!gb_sd_satisfies_task_group(tg, child)) continue; if (!dst || tg_child_load > tg_dst_load) { @@ -1783,12 +1814,16 @@ static bool tg_lower_level(struct task_group *tg) tg_dst_load = tg_child_load; dst_load = child_load; dst_cap = child_cap; + tg_dst_nr_running = tg_child_nr_running; + dst_nr_running = child_nr_running; } else if (tg_child_load == tg_dst_load) { if (dst_load * child_cap > child_load * dst_cap) { dst = child; tg_dst_load = tg_child_load; dst_load = child_load; dst_cap = child_cap; + tg_dst_nr_running = tg_child_nr_running; + dst_nr_running = child_nr_running; } } } @@ -1812,6 +1847,11 @@ static bool tg_lower_level(struct task_group *tg) if (dst->gb_flags & GROUP_BALANCER_LLC_FLAG) goto lower; + /* If migration won't cause overload, do migrate.*/ + migrate_nr_running = tg_nr_running - tg_dst_nr_running; + if (dst_nr_running + migrate_nr_running <= dst->span_weight) + goto lower; + /* If we 
lower the level, we have to make sure that we will not cause imbalance. * * src_load dst_load @@ -2112,18 +2152,18 @@ static void gb_attach_task_groups(struct gb_lb_env *gb_env) static void __update_gb_sd_status(struct group_balancer_sched_domain *gb_sd, int *gb_sd_status) { - int i, nr_running; + int i, nr_gb_running = 0; for_each_cpu(i, gb_sd_span(gb_sd)) { struct rq *rq = cpu_rq(i); - nr_running = rq->nr_running; - if (nr_running > 1) - *gb_sd_status |= GB_OVERLOAD; - - if (gb_cpu_overutilized(i)) - *gb_sd_status |= GB_OVERUTILIZED; + nr_gb_running += rq->nr_gb_running; + /* TODO: Improve the utilization of GB_OVERUTILIZED.*/ +// if (gb_cpu_overutilized(i)) +// *gb_sd_status |= GB_OVERUTILIZED; } + if (nr_gb_running > gb_sd->span_weight) + *gb_sd_status |= GB_OVERLOAD; } static void update_gb_sd_status(struct gb_lb_env *gb_env, int *gb_sd_status) @@ -2147,6 +2187,7 @@ void gb_load_balance(struct lb_env *env) #ifdef CONFIG_CFS_BANDWIDTH bool burst = false; #endif + int src_status = 0; if (!group_balancer_enabled()) return; @@ -2178,13 +2219,20 @@ #endif goto unlock; } + gb_sd->last_balance_timestamp = jiffies; src_load = gb_sd_load(src); src_cap = gb_sd_capacity(src); dst_load = gb_sd_load(dst); dst_cap = gb_sd_capacity(dst); + __update_gb_sd_status(src, &src_status); - if (dst_load * src_cap * gb_sd->imbalance_pct >= src_load * dst_cap * 100) + /* + * If the imbalance isn't larger than imbalance_pct, and it isn't the case that + * dst is idle and src is overloaded, don't do the balance. + */ + if (dst_load * src_cap * gb_sd->imbalance_pct >= src_load * dst_cap * 100 && + !(available_idle_cpu(env->dst_cpu) && src_status)) goto unlock; gb_env = (struct gb_lb_env){ -- Gitee From 3378cc1120851c585c5ba15010692cdcc0f8db89 Mon Sep 17 00:00:00 2001 From: CruzZhao Date: Thu, 13 Nov 2025 23:22:35 +0800 Subject: [PATCH 09/11] anolis: sched: support cpu.group_balancer to be set to 2 ANBZ: #8765 Setting cpu.group_balancer to 2 means that the tg acquires double the logical cpus.
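For example (numbers for illustration only): gb_sd_satisfies_task_group() now checks tg->group_balancer * tg->specs_ratio <= 100 * soft_cpus_weight, so a task group whose quota is equivalent to 4 CPUs (i.e. specs_ratio = 400) and whose cpu.group_balancer is set to 2 needs a gb_sd with at least 8 allowed CPUs in its span, instead of 4.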
Signed-off-by: CruzZhao --- kernel/sched/core.c | 10 +++++++++- kernel/sched/group_balancer.c | 8 +++++--- kernel/sched/sched.h | 2 +- 3 files changed, 15 insertions(+), 5 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 6dbe01ba4c4e..68c95c75c8c5 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -10132,7 +10132,7 @@ static int cpu_group_balancer_write_u64(struct cgroup_subsys_state *css, if (tg == &root_task_group || task_group_is_autogroup(tg)) return -EACCES; - if (new > 1) + if (new > 2) return -EINVAL; write_lock(&group_balancer_lock); @@ -10142,6 +10142,14 @@ static int cpu_group_balancer_write_u64(struct cgroup_subsys_state *css, if (old == new) goto out; + if (!!old == !!new) { + mutex_lock(&cfs_constraints_mutex); + tg_specs_change(tg, tg->specs_ratio); + mutex_unlock(&cfs_constraints_mutex); + tg->group_balancer = new; + goto out; + } + retval = update_group_balancer(tg, new); if (retval) goto out; diff --git a/kernel/sched/group_balancer.c b/kernel/sched/group_balancer.c index 37033980c716..d200284432bc 100644 --- a/kernel/sched/group_balancer.c +++ b/kernel/sched/group_balancer.c @@ -320,7 +320,7 @@ static void add_to_size_level(struct group_balancer_sched_domain *gb_sd) bool tg_group_balancer_enabled(struct task_group *tg) { - return tg->group_balancer; + return !!tg->group_balancer; } struct cgroup *tg_cgroup(struct task_group *tg) @@ -342,7 +342,8 @@ gb_sd_satisfies_task_group(struct task_group *tg, struct group_balancer_sched_do cpumask_and(&soft_cpus_allowed, cpus_allowed, gb_sd_span(gb_sd)); soft_cpus_weight = cpumask_weight(&soft_cpus_allowed); } - return tg->specs_ratio <= 100 * soft_cpus_weight; + /* tg->group_balancer = 2 means that tg aquires double logical cpus. */ + return tg->group_balancer * tg->specs_ratio <= 100 * soft_cpus_weight; } #else static inline bool @@ -1961,7 +1962,8 @@ void tg_specs_change(struct task_group *tg, u64 specs_before) /* The specs doesn't satisfy anymore, upper to find a satisfied gb_sd. */ /* Fast path, if the specs is -1 or too large, move it to root domain. */ - if (specs == -1 || specs > group_balancer_root_domain->span_weight * 100) { + if (specs == -1 || + tg->group_balancer * specs > group_balancer_root_domain->span_weight * 100) { gb_sd = group_balancer_root_domain; goto upper; } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 4193bfd0a92a..6b21ea05b630 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -620,7 +620,7 @@ struct task_group { struct group_balancer_sched_domain *gb_sd; struct group_balancer_sched_domain *preferred_gb_sd; struct task_group *gb_tg; - bool group_balancer; + unsigned int group_balancer; bool leap_level; unsigned long leap_level_timestamp; unsigned long adjust_level_timestamp; -- Gitee From 1891410aa4877ad658ce17bde91b075714e107c3 Mon Sep 17 00:00:00 2001 From: CruzZhao Date: Fri, 14 Nov 2025 10:23:55 +0800 Subject: [PATCH 10/11] anolis: sched: consider about specs balance for group balancer ANBZ: #8765 When lower the level of a task group, consider whether it will cause more imbalance between src free specs and dst free specs, to avoid imbalance. However, consider the case following: some task groups have large specs but have low load, and in this case the specs loses its reference value. So we introduce a sched feat to control whether we consider about specs balance: GB_SPECS_BALANCE. 
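A worked example of the GB_SPECS_BALANCE check (numbers are for illustration only): suppose gb_sd has two children of 16 CPUs each, with src_free_specs = 800 and dst_free_specs = 400, and the task group has specs_ratio = 300. Then src_imb = |800*16 - 400*16| = 6400, while after the move dst would drop to 100 free specs, giving dst_imb = |800*16 - 100*16| = 11200. Since dst_imb > src_imb, the lowering is rejected. With the feature disabled (the default), this check is skipped entirely.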
Signed-off-by: CruzZhao --- kernel/sched/features.h | 4 ++++ kernel/sched/group_balancer.c | 33 +++++++++++++++++++++++++++++++++ 2 files changed, 37 insertions(+) diff --git a/kernel/sched/features.h b/kernel/sched/features.h index 8f30a023365f..d45ae1e86d16 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h @@ -119,6 +119,10 @@ SCHED_FEAT(SCHED_CORE_HT_AWARE_QUOTA, false) SCHED_FEAT(SCHED_CORE_VRUNTIME, false) #endif +#ifdef CONFIG_GROUP_BALANCER +SCHED_FEAT(GB_SPECS_BALANCE, false) +#endif + SCHED_FEAT(SCHED_FEAT_RESERVE1, false) SCHED_FEAT(SCHED_FEAT_RESERVE2, false) SCHED_FEAT(SCHED_FEAT_RESERVE3, false) diff --git a/kernel/sched/group_balancer.c b/kernel/sched/group_balancer.c index d200284432bc..1a4a02f98991 100644 --- a/kernel/sched/group_balancer.c +++ b/kernel/sched/group_balancer.c @@ -1778,6 +1778,9 @@ static bool tg_lower_level(struct task_group *tg) unsigned int child_nr_running, dst_nr_running = 0, tg_child_nr_running; unsigned int tg_nr_running = 0, tg_dst_nr_running = 0, migrate_nr_running; unsigned long src_imb, dst_imb; + int total_free_specs = 0, child_free_specs = 0, dst_free_specs = 0, src_free_specs = 0; + int tg_specs; + unsigned int src_span_weight, dst_span_weight; if (!gb_sd) goto fail; @@ -1808,6 +1811,9 @@ static bool tg_lower_level(struct task_group *tg) tg_load += tg_child_load; tg_child_nr_running = tg_gb_sd_nr_running(tg, child); tg_nr_running += tg_child_nr_running; + + child_free_specs = atomic_read(&child->free_tg_specs); + total_free_specs += child_free_specs; if (!gb_sd_satisfies_task_group(tg, child)) continue; if (!dst || tg_child_load > tg_dst_load) { @@ -1817,6 +1823,7 @@ static bool tg_lower_level(struct task_group *tg) dst_cap = child_cap; tg_dst_nr_running = tg_child_nr_running; dst_nr_running = child_nr_running; + dst_free_specs = child_free_specs; } else if (tg_child_load == tg_dst_load) { if (dst_load * child_cap > child_load * dst_cap) { dst = child; @@ -1825,6 +1832,7 @@ static bool tg_lower_level(struct task_group *tg) dst_cap = child_cap; tg_dst_nr_running = tg_child_nr_running; dst_nr_running = child_nr_running; + dst_free_specs = child_free_specs; } } } @@ -1873,6 +1881,31 @@ static bool tg_lower_level(struct task_group *tg) if (dst_imb > src_imb) goto fail; + + if (!sched_feat(GB_SPECS_BALANCE)) + goto lower; + /* + * If we lower the level, we'd better guarantee that free specs won't be more imbalance. + * + * src_free_specs dst_free_specs + * --------------- vs -------------- + * src_span_weight dst_span_weight + * + */ + tg_specs = tg->specs_ratio; + src_free_specs = total_free_specs - dst_free_specs; + dst_span_weight = dst->span_weight; + src_span_weight = gb_sd->span_weight - dst_span_weight; + src_imb = abs(src_free_specs * dst_span_weight - dst_free_specs * src_span_weight); + dst_imb = abs(src_free_specs * dst_span_weight - + (dst_free_specs - tg_specs) * src_span_weight); + + if (dst_free_specs * src_span_weight > src_free_specs * dst_span_weight) + goto fail; + + if (dst_imb > src_imb) + goto fail; + #ifdef CONFIG_NUMA lower: #endif -- Gitee From cdc1ec5fdd9300a7c8d3888b63aa72a168c6b5ce Mon Sep 17 00:00:00 2001 From: CruzZhao Date: Fri, 14 Nov 2025 14:37:16 +0800 Subject: [PATCH 11/11] anolis: sched: try to find idle cpu in preferred_gb_sd first ANBZ: #8765 When select idle cpu, try to find an idle cpu in preferred_gb_sd first, if no, try to find one in llc. 
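The resulting flow in select_idle_cpu() is roughly the following sketch (gb_applicable() and scan_for_idle() are just stand-ins for the existing enable checks and scan loop, not real functions):

  /* First try the span of the task group's preferred gb_sd, if any. */
  if (gb_applicable(p) && preferred)
          cpumask_and(cpus, get_gb_sd_span(preferred), task_allowed_cpu(p));
  else
          cpumask_and(cpus, sched_domain_span(sd), task_allowed_cpu(p));

  idle_cpu = scan_for_idle(cpus);
  if (idle_cpu < 0 && !gb_tried) {
          /* Nothing idle in the preferred span: retry over the whole LLC. */
          gb_tried = true;
          cpumask_and(cpus, sched_domain_span(sd), task_allowed_cpu(p));
          idle_cpu = scan_for_idle(cpus);
  }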
Signed-off-by: CruzZhao --- kernel/sched/fair.c | 34 ++++++++++++++++++++++++++++++++-- kernel/sched/group_balancer.c | 5 +++++ kernel/sched/sched.h | 1 + 3 files changed, 38 insertions(+), 2 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 605af34adad7..a3fdcca1bd57 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -9242,12 +9242,27 @@ select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool has_idle_co struct sched_domain *this_sd; u64 time; bool is_seeker; +#ifdef CONFIG_GROUP_BALANCER + struct task_group *tg = task_group(p); + bool gb_tried = false; + struct group_balancer_sched_domain *preferred = tg->preferred_gb_sd; +#endif this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc)); if (!this_sd) return -1; +#ifdef CONFIG_GROUP_BALANCER +retry: + if (group_balancer_enabled() && !gb_tried && tg_group_balancer_enabled(tg) && preferred) { + cpumask_and(cpus, get_gb_sd_span(preferred), task_allowed_cpu(p)); + } else { + gb_tried = true; + cpumask_and(cpus, sched_domain_span(sd), task_allowed_cpu(p)); + } +#else cpumask_and(cpus, sched_domain_span(sd), task_allowed_cpu(p)); +#endif if (sched_feat(SIS_PROP) && !has_idle_core) { u64 avg_cost, avg_idle, span_avg; @@ -9284,7 +9299,7 @@ select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool has_idle_co return i; } else { if (--nr <= 0) - return -1; + goto out; idle_cpu = __select_idle_cpu(cpu, p, &id_backup); if ((unsigned int)idle_cpu < nr_cpumask_bits) return idle_cpu; @@ -9301,13 +9316,20 @@ select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool has_idle_co return i; } else { if (--nr <= 0) - return -1; + goto out; idle_cpu = __select_idle_cpu(cpu, p, &id_backup); if ((unsigned int)idle_cpu < nr_cpumask_bits) break; } } +#ifdef CONFIG_GROUP_BALANCER + if (!gb_tried) { + gb_tried = true; + goto retry; + } +#endif + if (has_idle_core) set_idle_cores(target, false); @@ -9319,6 +9341,14 @@ select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool has_idle_co if (!group_identity_disabled()) return (unsigned int)idle_cpu < nr_cpumask_bits ? idle_cpu : id_backup; return idle_cpu; +out: +#ifdef CONFIG_GROUP_BALANCER + if (!gb_tried) { + gb_tried = true; + goto retry; + } +#endif + return -1; } /* diff --git a/kernel/sched/group_balancer.c b/kernel/sched/group_balancer.c index 1a4a02f98991..3ef722098355 100644 --- a/kernel/sched/group_balancer.c +++ b/kernel/sched/group_balancer.c @@ -289,6 +289,11 @@ static inline struct cpumask *gb_sd_span(struct group_balancer_sched_domain *gb_ return to_cpumask(gb_sd->span); } +struct cpumask *get_gb_sd_span(struct group_balancer_sched_domain *gb_sd) +{ + return gb_sd_span(gb_sd); +} + static unsigned int get_size_level(struct group_balancer_sched_domain *gb_sd) { int size_level = ilog2(gb_sd->span_weight); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 6b21ea05b630..8b9eb580b9dc 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -3769,6 +3769,7 @@ extern void gb_load_balance(struct lb_env *env); extern void task_tick_gb(struct task_struct *p); extern void util_est_reenqueue_all(void); extern void util_est_clear_all(void); +extern struct cpumask *get_gb_sd_span(struct group_balancer_sched_domain *gb_sd); #ifdef CONFIG_CFS_BANDWIDTH extern void tg_burst_change(struct task_group *tg, u64 burst); #endif -- Gitee