From a0c5474f1831ef5f9f5e3331b5b6e42d848b89b9 Mon Sep 17 00:00:00 2001 From: CruzZhao Date: Mon, 10 Nov 2025 21:04:55 +0800 Subject: [PATCH 01/11] anolis: sched: support Group Balancer to be aware of cpuset ANBZ: #8765 When attach a task group to a group balancer sched domain, check whether the intersection of gb_sd->span and cpuset->cpus_allowed (if the cgroup has cpuset), satisfies the quota of the task group. When the cpuset of cgroup changes, validate whether the group balancer sched domain still satisfies. Signed-off-by: CruzZhao --- include/linux/sched.h | 20 +++++++++++++ kernel/cgroup/cpuset.c | 41 ++++++++++++++++++++++++++ kernel/sched/core.c | 10 +++++++ kernel/sched/group_balancer.c | 54 +++++++++++++++++++++++++++++------ kernel/sched/sched.h | 8 ------ 5 files changed, 117 insertions(+), 16 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index a1d6559bdb1d..89913709a766 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -66,6 +66,7 @@ struct signal_struct; struct task_delay_info; struct task_group; struct io_uring_task; +struct cgroup; /* * Task state bitmask. NOTE! These bits are also @@ -2424,4 +2425,23 @@ static inline bool jbd2_proxy_exec_disabled(void) { return !static_branch_unlikely(&__jbd2_proxy_exec_enabled); } +#ifdef CONFIG_GROUP_BALANCER +extern bool group_balancer_enabled(void); +extern void tg_specs_change(struct task_group *tg); +extern bool tg_group_balancer_enabled(struct task_group *tg); +extern struct task_group *cgroup_tg(struct cgroup *cgrp); +extern struct cgroup *tg_cgroup(struct task_group *tg); +extern void lock_cfs_constraints_mutex(void); +extern void unlock_cfs_constraints_mutex(void); +#ifdef CONFIG_CPUSETS +extern struct cpumask *task_group_cpus_allowed(struct task_group *tg); +#else +static inline struct cpumask *task_group_cpus_allowed(struct task_group *tg) +{ + return NULL; +} +#endif +#else +static inline void tg_specs_change(struct task_group *tg) { } +#endif #endif diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 164f5bee99da..bb9da7a87371 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -205,6 +205,46 @@ static inline struct cpuset *css_cs(struct cgroup_subsys_state *css) return css ? 
container_of(css, struct cpuset, css) : NULL; } +#ifdef CONFIG_GROUP_BALANCER +static inline struct cpuset *cgroup_cs(struct cgroup *cgrp) +{ + return container_of(global_cgroup_css(cgrp, cpuset_cgrp_id), + struct cpuset, css); +} + +struct cpumask *task_group_cpus_allowed(struct task_group *tg) +{ + struct cgroup *cg = tg_cgroup(tg); + struct cpuset *cs = cgroup_cs(cg); + + if (cs) + return (struct cpumask *)cs->cpus_allowed; + + return NULL; +} + +static void update_cpumask_for_group_balancer(struct cpuset *cs) +{ + struct cgroup *cg = cs->css.cgroup; + struct task_group *tg; + + if (!group_balancer_enabled()) + return; + + tg = cgroup_tg(cg); + if (!tg) + return; + if (!tg_group_balancer_enabled(tg)) + return; + + lock_cfs_constraints_mutex(); + tg_specs_change(tg); + unlock_cfs_constraints_mutex(); +} +#else +static inline void update_cpumask_for_group_balancer(struct cpuset *cs) { } +#endif + /* Retrieve the cpuset for a task */ static inline struct cpuset *task_cs(struct task_struct *task) { @@ -1498,6 +1538,7 @@ static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp) /* deleted = old - new = old & (~new) */ cpumask_andnot(&deleted, &old_cpus, tmp->new_cpus); cpuacct_cpuset_changed(cs->css.cgroup, &deleted, NULL); + update_cpumask_for_group_balancer(cs); /* * On legacy hierarchy, if the effective cpumask of any non- diff --git a/kernel/sched/core.c b/kernel/sched/core.c index fefc9e372b4d..5820be54e496 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -10138,6 +10138,16 @@ static int validate_group_balancer(struct task_group *tg) return retval; } +void lock_cfs_constraints_mutex(void) +{ + mutex_lock(&cfs_constraints_mutex); +} + +void unlock_cfs_constraints_mutex(void) +{ + mutex_unlock(&cfs_constraints_mutex); +} + static int cpu_group_balancer_write_u64(struct cgroup_subsys_state *css, struct cftype *cftype, u64 new) { diff --git a/kernel/sched/group_balancer.c b/kernel/sched/group_balancer.c index cbbe57b8aefe..fa1ffd3f6c18 100644 --- a/kernel/sched/group_balancer.c +++ b/kernel/sched/group_balancer.c @@ -8,6 +8,7 @@ #include "sched.h" #include #include +#include struct gb_lb_env { int src_cpu; @@ -302,6 +303,40 @@ static void add_to_size_level(struct group_balancer_sched_domain *gb_sd) __add_to_size_level(gb_sd, size_level); } +bool tg_group_balancer_enabled(struct task_group *tg) +{ + return tg->group_balancer; +} + +struct cgroup *tg_cgroup(struct task_group *tg) +{ + return tg->css.cgroup; +} + +#ifdef CONFIG_CPUSETS +static inline bool +gb_sd_satisfies_task_group(struct task_group *tg, struct group_balancer_sched_domain *gb_sd) +{ + struct cpumask *cpus_allowed = task_group_cpus_allowed(tg); + struct cpumask soft_cpus_allowed; + unsigned int soft_cpus_weight; + + if (!cpus_allowed) { + soft_cpus_weight = gb_sd->span_weight; + } else { + cpumask_and(&soft_cpus_allowed, cpus_allowed, gb_sd_span(gb_sd)); + soft_cpus_weight = cpumask_weight(&soft_cpus_allowed); + } + return tg->specs_ratio <= 100 * soft_cpus_weight; +} +#else +static inline bool +gb_sd_satisfies_task_group(struct task_group *tg, struct group_balancer_sched_domain *gb_sd) +{ + return true; +} +#endif + static int group_balancer_seqfile_show(struct seq_file *m, void *arg) { struct kernfs_open_file *of = m->private; @@ -1402,8 +1437,9 @@ static unsigned long gb_sd_capacity(struct group_balancer_sched_domain *gb_sd) return cap; } -static struct group_balancer_sched_domain *select_idle_gb_sd(int specs) +static struct group_balancer_sched_domain *select_idle_gb_sd(struct task_group *tg) { 
+ int specs = tg->specs_ratio; struct group_balancer_sched_domain *gb_sd, *child; if (specs == -1 || specs > group_balancer_root_domain->span_weight * 100) @@ -1418,7 +1454,7 @@ static struct group_balancer_sched_domain *select_idle_gb_sd(int specs) int max_unsatisfied_free_specs = INT_MIN; for_each_gb_sd_child(child, gb_sd) { - if (child->span_weight * 100 >= specs && + if (gb_sd_satisfies_task_group(tg, child) && child->free_tg_specs > max_free_specs) { max_free_child = child; max_free_specs = child->free_tg_specs; @@ -1460,7 +1496,7 @@ check_task_group_leap_level(struct task_group *tg, struct group_balancer_sched_d int specs = tg->specs_ratio; for_each_gb_sd_child(child, gb_sd) { - if (specs <= 100 * child->span_weight) { + if (gb_sd_satisfies_task_group(tg, child)) { tg->leap_level = true; tg->leap_level_timestamp = jiffies; return; @@ -1553,7 +1589,7 @@ int attach_tg_to_group_balancer_sched_domain(struct task_group *tg, read_lock(&group_balancer_sched_domain_lock); if (enable) - gb_sd = select_idle_gb_sd(tg->specs_ratio); + gb_sd = select_idle_gb_sd(tg); else gb_sd = target; if (!gb_sd) { @@ -1617,6 +1653,9 @@ static bool tg_lower_level(struct task_group *tg) total_cap += child_cap; tg_child_load = tg_gb_sd_load(tg, child); + tg_load += tg_child_load; + if (!gb_sd_satisfies_task_group(tg, child)) + continue; if (!dst || tg_child_load > tg_dst_load) { dst = child; tg_dst_load = tg_child_load; @@ -1630,12 +1669,11 @@ static bool tg_lower_level(struct task_group *tg) dst_cap = child_cap; } } - tg_load += tg_child_load; } if (tg_load == 0) goto fail; - if (tg->specs_ratio > 100 * dst->span_weight) + if (!dst) goto fail; #ifdef CONFIG_NUMA /* We won't allow a task group span more than two numa nodes too long. */ @@ -1737,7 +1775,7 @@ void tg_specs_change(struct task_group *tg) return; /* This gb_sd still satisfy, don't do anything. */ - if (specs <= gb_sd->span_weight * 100 || gb_sd == group_balancer_root_domain) + if (gb_sd_satisfies_task_group(tg, gb_sd) || gb_sd == group_balancer_root_domain) return; /* The specs doesn't satisfy anymore, upper to find a satisfied gb_sd. 
*/ @@ -1748,7 +1786,7 @@ void tg_specs_change(struct task_group *tg) for (; gb_sd; gb_sd = gb_sd->parent) { - if (specs <= gb_sd->span_weight * 100) + if (gb_sd_satisfies_task_group(tg, gb_sd)) break; } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 89cb253c8c9c..a1673af253ff 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -3713,7 +3713,6 @@ extern void sched_dynamic_update(int mode); #endif #ifdef CONFIG_GROUP_BALANCER -extern bool group_balancer_enabled(void); extern bool group_balancer_rq_enabled(struct rq *rq); static inline const struct cpumask *task_allowed_cpu(struct task_struct *p) { @@ -3738,11 +3737,6 @@ static inline void tg_inc_soft_cpus_version(struct task_group *tg) tg->soft_cpus_version = 0; } -static inline bool tg_group_balancer_enabled(struct task_group *tg) -{ - return tg->group_balancer; -} - extern void sched_init_group_balancer_sched_domains(void); extern void sched_clear_group_balancer_sched_domains(void); extern void tg_set_specs_ratio(struct task_group *tg); @@ -3751,7 +3745,6 @@ extern int attach_tg_to_group_balancer_sched_domain(struct task_group *tg, bool enable); extern void detach_tg_from_group_balancer_sched_domain(struct task_group *tg, bool disable); extern void update_group_balancer_root_cpumask(void); -extern void tg_specs_change(struct task_group *tg); extern unsigned long cfs_h_load(struct cfs_rq *cfs_rq); extern bool gb_cpu_overutilized(int cpu); extern void gb_load_balance(struct lb_env *env); @@ -3766,7 +3759,6 @@ static inline const struct cpumask *task_allowed_cpu(struct task_struct *p) } static inline void tg_set_specs_ratio(struct task_group *tg) { } static inline void update_group_balancer_root_cpumask(void) { } -static inline void tg_specs_change(struct task_group *tg) { } #ifdef CONFIG_SMP static inline void gb_load_balance(struct lb_env *env) { } #endif -- Gitee From abdff4ddc6c3bb03845a130a332433bb33afa4f7 Mon Sep 17 00:00:00 2001 From: CruzZhao Date: Tue, 11 Nov 2025 16:42:55 +0800 Subject: [PATCH 02/11] anolis: sched: support Group Balancer to be aware of cpu burst ANBZ: #8765 For burstable task groups, the soft_cpus need to burst as well, which is done by moving the task group to a higher level. So we introduce another rb_tree, burstable_task_groups, to queue burstable task groups, and if the interval between load balances is too short, we migrate only the burstable task groups.
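As a rough illustration (a simplified sketch of the helper added by this patch; the CONFIG_CFS_BANDWIDTH=n stub and locking are omitted), a task group is queued on one of the two trees of its gb_sd depending on whether it is burstable:

  /* Pick the rb_root a task group is queued on within its gb_sd. */
  static struct rb_root *gb_rb_root(struct task_group *tg,
                                    struct group_balancer_sched_domain *gb_sd)
  {
          /*
           * Burstable groups sit on a separate tree so that they can still
           * be migrated when the normal balance interval has not expired.
           */
          if (tg->cfs_bandwidth.burst)
                  return &gb_sd->burstable_task_groups;
          return &gb_sd->task_groups;
  }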
Signed-off-by: CruzZhao --- kernel/sched/core.c | 1 + kernel/sched/group_balancer.c | 273 ++++++++++++++++++++++++++-------- kernel/sched/sched.h | 3 + 3 files changed, 217 insertions(+), 60 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 5820be54e496..12a2cf71ee04 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -9592,6 +9592,7 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota, if (runtime_enabled && !runtime_was_enabled) cfs_bandwidth_usage_inc(); raw_spin_lock_irq(&cfs_b->lock); + tg_burst_change(tg, burst); cfs_b->period = ns_to_ktime(period); cfs_b->quota = quota; cfs_b->burst = burst; diff --git a/kernel/sched/group_balancer.c b/kernel/sched/group_balancer.c index fa1ffd3f6c18..2ffb1e23cb65 100644 --- a/kernel/sched/group_balancer.c +++ b/kernel/sched/group_balancer.c @@ -19,6 +19,9 @@ struct gb_lb_env { unsigned long nr_balance_failed; enum migration_type migration_type; struct rb_root task_groups; +#ifdef CONFIG_CFS_BANDWIDTH + bool burst; +#endif CK_KABI_RESERVE(1) CK_KABI_RESERVE(2) @@ -43,6 +46,10 @@ struct group_balancer_sched_domain { unsigned int depth; raw_spinlock_t lock; struct rb_root task_groups; +#ifdef CONFIG_CFS_BANDWIDTH + struct rb_root burstable_task_groups; + atomic_t h_nr_burst_tg; +#endif struct kernfs_node *kn; unsigned long last_balance_timestamp; unsigned long lower_interval; @@ -337,6 +344,79 @@ gb_sd_satisfies_task_group(struct task_group *tg, struct group_balancer_sched_do } #endif +#ifdef CONFIG_CFS_BANDWIDTH +static inline bool is_burstable_task_group(struct task_group *tg) +{ + return !!tg->cfs_bandwidth.burst; +} + +static inline struct rb_root +*gb_rb_root(struct task_group *tg, struct group_balancer_sched_domain *gb_sd) +{ + if (unlikely(is_burstable_task_group(tg))) + return &gb_sd->burstable_task_groups; + return &gb_sd->task_groups; +} +static inline void update_h_nr_burst_tg(struct task_group *tg, bool add) +{ + struct group_balancer_sched_domain *gb_sd = tg->gb_sd; + + if (!is_burstable_task_group(tg)) + return; + + for (; gb_sd; gb_sd = gb_sd->parent) { + if (add) + atomic_inc(&gb_sd->h_nr_burst_tg); + else + atomic_dec(&gb_sd->h_nr_burst_tg); + } +} + +static inline bool tg_specs_less(struct rb_node *a, const struct rb_node *b); +void tg_burst_change(struct task_group *tg, u64 burst) +{ + bool burst_before, burst_now; + struct group_balancer_sched_domain *gb_sd; + + if (!group_balancer_enabled()) + return; + if (!tg_group_balancer_enabled(tg)) + return; + + gb_sd = tg->gb_sd; + burst_before = !!tg->cfs_bandwidth.burst; + burst_now = !!burst; + if (burst_before == burst_now) + return; + + read_lock(&group_balancer_sched_domain_lock); + raw_spin_lock(&gb_sd->lock); + if (!burst_before) { + rb_erase(&tg->gb_node, &gb_sd->task_groups); + rb_add(&tg->gb_node, &gb_sd->burstable_task_groups, tg_specs_less); + update_h_nr_burst_tg(tg, true); + } else { + rb_erase(&tg->gb_node, &gb_sd->burstable_task_groups); + rb_add(&tg->gb_node, &gb_sd->task_groups, tg_specs_less); + update_h_nr_burst_tg(tg, false); + } + raw_spin_unlock(&gb_sd->lock); + read_unlock(&group_balancer_sched_domain_lock); +} +#else +static inline bool is_burstable_task_group(struct task_group *tg) +{ + return false; +} + +static inline rb_root *gb_rb_root(struct task_group *tg, struct group_balancer_sched_domain *gb_sd) +{ + return &gb_sd->task_groups; +} + +static inline void update_h_nr_burst_tg(struct task_group *tg, bool add) { } +#endif + static int group_balancer_seqfile_show(struct seq_file *m, void *arg) { 
struct kernfs_open_file *of = m->private; @@ -632,6 +712,7 @@ static inline struct group_balancer_sched_domain raw_spin_lock_init(&new->lock); new->task_groups = RB_ROOT; + new->burstable_task_groups = RB_ROOT; new->imbalance_pct = 117; return new; @@ -679,12 +760,10 @@ static void add_to_tree(struct group_balancer_sched_domain *gb_sd, } } -#define __node_2_task_group(n) rb_entry((n), struct task_group, gb_node) - static inline bool tg_specs_less(struct rb_node *a, const struct rb_node *b) { - struct task_group *tg_a = __node_2_task_group(a); - struct task_group *tg_b = __node_2_task_group(b); + struct task_group *tg_a = __gb_node_2_tg(a); + struct task_group *tg_b = __gb_node_2_tg(b); int specs_a = tg_a->specs_ratio; int specs_b = tg_b->specs_ratio; @@ -718,17 +797,31 @@ static void free_group_balancer_sched_domain(struct group_balancer_sched_domain struct task_group *tg; struct group_balancer_sched_domain *parent = gb_sd->parent; struct rb_node *node; - struct rb_root *root = &gb_sd->task_groups; + struct rb_root *roots[2] = { +#ifdef CONFIG_CFS_BANDWIDTH + &gb_sd->burstable_task_groups, +#else + NULL, +#endif + &gb_sd->task_groups, + }; + struct rb_root *root; + int i; if (parent) { parent->nr_children--; /* Move the task_groups to parent. */ - while (!RB_EMPTY_ROOT(root)) { - node = root->rb_node; - tg = __node_2_task_group(node); - rb_erase(node, root); - rb_add(node, &parent->task_groups, tg_specs_less); - walk_tg_tree_from(tg, tg_set_gb_tg_down, tg_nop, tg); + for (i = 0; i < 2; i++) { + root = roots[i]; + if (!root) + continue; + while (!RB_EMPTY_ROOT(root)) { + node = root->rb_node; + tg = __gb_node_2_tg(node); + rb_erase(node, root); + rb_add(node, &parent->task_groups, tg_specs_less); + walk_tg_tree_from(tg, tg_set_gb_tg_down, tg_nop, tg); + } } } @@ -1533,9 +1626,12 @@ void add_tg_to_group_balancer_sched_domain_locked(struct task_group *tg, struct group_balancer_sched_domain *gb_sd, bool enable) { - tg->gb_sd = gb_sd; - rb_add(&tg->gb_node, &gb_sd->task_groups, tg_specs_less); + struct rb_root *root; + tg->gb_sd = gb_sd; + root = gb_rb_root(tg, gb_sd); + rb_add(&tg->gb_node, root, tg_specs_less); + update_h_nr_burst_tg(tg, true); tg->soft_cpus_allowed_ptr = gb_sd_span(gb_sd); tg_inc_soft_cpus_version(tg); if (enable) @@ -1560,9 +1656,12 @@ remove_tg_from_group_balancer_sched_domain_locked(struct task_group *tg, struct group_balancer_sched_domain *gb_sd, bool disable) { - tg->gb_sd = NULL; - rb_erase(&tg->gb_node, &gb_sd->task_groups); + struct rb_root *root = gb_rb_root(tg, gb_sd); + + rb_erase(&tg->gb_node, root); RB_CLEAR_NODE(&tg->gb_node); + tg->gb_sd = NULL; + update_h_nr_burst_tg(tg, false); if (disable) walk_tg_tree_from(tg, tg_unset_gb_tg_down, tg_nop, NULL); } @@ -1846,49 +1945,75 @@ gb_detach_task_groups_from_gb_sd(struct gb_lb_env *gb_env, struct task_group *tg, *n; unsigned long load, util; int detached = 0; + struct rb_root *roots[2] = { +#ifdef CONFIG_CFS_BANDWIDTH + &gb_sd->burstable_task_groups, +#else + NULL, +#endif + &gb_sd->task_groups, + }; + int i, max_idx = 1; + struct rb_root *root; raw_spin_lock(&gb_sd->lock); - /* Try the task cgroups with little specs first. */ - gb_for_each_tg_safe(tg, n, &gb_sd->task_groups) { - if (!time_after(jiffies, tg->adjust_level_timestamp + 2 * gb_sd->lower_interval)) - continue; - switch (gb_env->migration_type) { -#ifdef CONFIG_GROUP_IDENTITY - case migrate_identity: - fallthrough; +#ifdef CONFIG_CFS_BANDWIDTH + /* + * When burst if true, the interval of load balance is too short, + * we migrate burst task groups only. 
+ */ + if (gb_env->burst) + max_idx = 0; #endif - case migrate_load: - load = tg_gb_sd_load(tg, gb_sd); - if (load == 0) - continue; - if (shr_bound(load, gb_env->nr_balance_failed) > gb_env->imbalance) - continue; - gb_env->imbalance -= load; - break; - case migrate_util: - util = tg_gb_sd_util(tg, gb_sd); - if (util == 0) - continue; - if (shr_bound(util, gb_env->nr_balance_failed) > gb_env->imbalance) + for (i = 0; i <= max_idx; i++) { + root = roots[i]; + if (!root) + continue; + if (gb_env->burst && i == 1) + continue; + /* Try the task cgroups with little specs first. */ + gb_for_each_tg_safe(tg, n, root) { + if (i > 0 && !time_after(jiffies, + tg->adjust_level_timestamp + 2 * gb_sd->lower_interval)) continue; - gb_env->imbalance -= util; - break; - case migrate_task: - gb_env->imbalance = 0; - break; - /*TODO: Perfect strategy of migrate_misfit*/ - case migrate_misfit: - gb_env->imbalance = 0; - break; - default: - break; - } - remove_tg_from_group_balancer_sched_domain_locked(tg, gb_sd, false); - rb_add(&tg->gb_node, &gb_env->task_groups, tg_specs_less); - detached++; - if (gb_env->imbalance <= 0) { - raw_spin_unlock(&gb_sd->lock); - return detached; + switch (gb_env->migration_type) { + #ifdef CONFIG_GROUP_IDENTITY + case migrate_identity: + fallthrough; + #endif + case migrate_load: + load = tg_gb_sd_load(tg, gb_sd); + if (load == 0) + continue; + if (shr_bound(load, gb_env->nr_balance_failed) > gb_env->imbalance) + continue; + gb_env->imbalance -= load; + break; + case migrate_util: + util = tg_gb_sd_util(tg, gb_sd); + if (util == 0) + continue; + if (shr_bound(util, gb_env->nr_balance_failed) > gb_env->imbalance) + continue; + gb_env->imbalance -= util; + break; + case migrate_task: + gb_env->imbalance = 0; + break; + /*TODO: Perfect strategy of migrate_misfit*/ + case migrate_misfit: + gb_env->imbalance = 0; + break; + default: + break; + } + remove_tg_from_group_balancer_sched_domain_locked(tg, gb_sd, false); + rb_add(&tg->gb_node, &gb_env->task_groups, tg_specs_less); + detached++; + if (gb_env->imbalance <= 0) { + raw_spin_unlock(&gb_sd->lock); + return detached; + } } } raw_spin_unlock(&gb_sd->lock); @@ -1976,6 +2101,9 @@ void gb_load_balance(struct lb_env *env) int gb_sd_status = 0; struct cpumask *gb_mask = this_cpu_cpumask_var_ptr(group_balancer_mask); unsigned long src_load, src_cap, dst_load, dst_cap; +#ifdef CONFIG_CFS_BANDWIDTH + bool burst = false; +#endif if (!group_balancer_enabled()) return; @@ -1999,8 +2127,14 @@ void gb_load_balance(struct lb_env *env) if (!gb_sd) goto unlock; - if (!time_after(jiffies, gb_sd->last_balance_timestamp + 2 * gb_sd->lower_interval)) - goto unlock; + if (!time_after(jiffies, gb_sd->last_balance_timestamp + 2 * gb_sd->lower_interval)) { +#ifdef CONFIG_CFS_BANDWIDTH + if (atomic_read(&dst->h_nr_burst_tg)) + burst = true; + else +#endif + goto unlock; + } src_load = gb_sd_load(src); src_cap = gb_sd_capacity(src); @@ -2019,6 +2153,9 @@ void gb_load_balance(struct lb_env *env) .imbalance = env->imbalance, .nr_balance_failed = env->sd->nr_balance_failed, .task_groups = RB_ROOT, +#ifdef CONFIG_CFS_BANDWIDTH + .burst = burst, +#endif }; /* @@ -2026,10 +2163,26 @@ void gb_load_balance(struct lb_env *env) * and we don't migrate tg in this case. 
*/ for (parent = gb_sd; parent; parent = parent->parent) { - for (node = rb_first(&parent->task_groups); node; node = rb_next(node)) { - tg = __node_2_task_group(node); - if (tg->cfs_rq[env->src_cpu]->h_nr_running) - goto unlock; + struct rb_root *roots[2] = { +#ifdef CONFIG_CFS_BANDWIDTH + &gb_sd->burstable_task_groups, +#else + NULL, +#endif + &gb_sd->task_groups, + }; + struct rb_root *root; + int i; + + for (i = 0; i < 2; i++) { + root = roots[i]; + if (!root) + continue; + for (node = rb_first(root); node; node = rb_next(node)) { + tg = __gb_node_2_tg(node); + if (tg->cfs_rq[env->src_cpu]->h_nr_running) + goto unlock; + } } } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index a1673af253ff..b7ed5e0359ed 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -3751,6 +3751,9 @@ extern void gb_load_balance(struct lb_env *env); extern void task_tick_gb(struct task_struct *p); extern void util_est_reenqueue_all(void); extern void util_est_clear_all(void); +#ifdef CONFIG_CFS_BANDWIDTH +extern void tg_burst_change(struct task_group *tg, u64 burst); +#endif #else static inline bool group_balancer_rq_enabled(struct rq *rq) { return false; } static inline const struct cpumask *task_allowed_cpu(struct task_struct *p) -- Gitee From 296fcb6cbbd11db0b1e8f44632e1f04f3b107e4d Mon Sep 17 00:00:00 2001 From: CruzZhao Date: Tue, 11 Nov 2025 20:47:57 +0800 Subject: [PATCH 03/11] anolis: sched: record preferred gb_sd for task group ANBZ: #8765 If a group balancer sched domain just satisfies a task group, record it as preferred_gb_sd, and when the task group lower level after upper level, consider the preferred_gb_sd first. If the task group stays in the upper level too long, make the preferred_gb_sd expire. Signed-off-by: CruzZhao --- include/linux/sched/sysctl.h | 1 + kernel/sched/core.c | 2 ++ kernel/sched/group_balancer.c | 34 +++++++++++++++++++++++++++++++++- kernel/sched/sched.h | 2 ++ kernel/sysctl.c | 9 +++++++++ 5 files changed, 47 insertions(+), 1 deletion(-) diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h index 536765522aec..c4991a577807 100644 --- a/include/linux/sched/sysctl.h +++ b/include/linux/sched/sysctl.h @@ -131,6 +131,7 @@ extern int sched_acpu_enable_handler(struct ctl_table *table, int write, #endif #ifdef CONFIG_GROUP_BALANCER extern unsigned int sysctl_sched_group_balancer_enabled; +extern unsigned long sysctl_sched_gb_expiration_ms; extern int sched_group_balancer_enable_handler(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 12a2cf71ee04..c06619b98f99 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -9013,6 +9013,8 @@ struct task_group *sched_create_group(struct task_group *parent) tg->group_balancer = 0; tg->soft_cpus_version = 0; tg->gb_sd = NULL; + tg->preferred_gb_sd = NULL; + tg->expiration_start = 0; raw_spin_lock_init(&tg->gb_lock); #endif return tg; diff --git a/kernel/sched/group_balancer.c b/kernel/sched/group_balancer.c index 2ffb1e23cb65..d904ceb77669 100644 --- a/kernel/sched/group_balancer.c +++ b/kernel/sched/group_balancer.c @@ -276,6 +276,13 @@ struct group_balancer_sched_domain *group_balancer_root_domain; #define GB_OVERLOAD 0x1 #define GB_OVERUTILIZED 0x2 +/* + * The time threshold that the preferred gb_sd expires. 
+ * Unit: ms + * Default: 60000 + */ +unsigned long sysctl_sched_gb_expiration_ms = 60000; + static inline struct cpumask *gb_sd_span(struct group_balancer_sched_domain *gb_sd) { return to_cpumask(gb_sd->span); } @@ -417,6 +424,17 @@ static inline rb_root *gb_rb_root(struct task_group *tg, struct group_balancer_s static inline void update_h_nr_burst_tg(struct task_group *tg, bool add) { } #endif +static inline bool +is_preferred_gb_sd(struct task_group *tg, struct group_balancer_sched_domain *gb_sd) +{ + struct group_balancer_sched_domain *p_gb_sd = tg->preferred_gb_sd; + + if (!p_gb_sd) + return true; + + return cpumask_subset(gb_sd_span(p_gb_sd), gb_sd_span(gb_sd)); +} + static int group_balancer_seqfile_show(struct seq_file *m, void *arg) { struct kernfs_open_file *of = m->private; @@ -1596,6 +1614,7 @@ check_task_group_leap_level(struct task_group *tg, struct group_balancer_sched_d } } + tg->preferred_gb_sd = gb_sd; tg->leap_level = false; } @@ -1774,6 +1793,16 @@ static bool tg_lower_level(struct task_group *tg) goto fail; if (!dst) goto fail; + if (!is_preferred_gb_sd(tg, gb_sd)) { + /* + * If the task group stays in the upper level for too long, + * make the preferred gb_sd expire. + */ + if (!time_after(jiffies, + tg->expiration_start + msecs_to_jiffies(sysctl_sched_gb_expiration_ms))) + goto fail; + tg->preferred_gb_sd = NULL; + } #ifdef CONFIG_NUMA /* We won't allow a task group span more than two numa nodes too long. */ if (dst->gb_flags & GROUP_BALANCER_NUMA_FLAG) goto lower; #endif /* If we lower the level, we have to make sure that we will not cause imbalance. * * src_load dst_load @@ -1870,8 +1899,10 @@ void tg_specs_change(struct task_group *tg) /* If the task group leaps level after specs change, we will lower it later. */ check_task_group_leap_level(tg, gb_sd); - if (tg->leap_level) + if (tg->leap_level) { + tg->preferred_gb_sd = NULL; return; + } /* This gb_sd still satisfy, don't do anything.
*/ if (gb_sd_satisfies_task_group(tg, gb_sd) || gb_sd == group_balancer_root_domain) @@ -2008,6 +2039,7 @@ gb_detach_task_groups_from_gb_sd(struct gb_lb_env *gb_env, break; } remove_tg_from_group_balancer_sched_domain_locked(tg, gb_sd, false); + tg->expiration_start = jiffies; rb_add(&tg->gb_node, &gb_env->task_groups, tg_specs_less); detached++; if (gb_env->imbalance <= 0) { diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index b7ed5e0359ed..c064862c3a8e 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -618,11 +618,13 @@ struct task_group { int specs_ratio; struct rb_node gb_node; struct group_balancer_sched_domain *gb_sd; + struct group_balancer_sched_domain *preferred_gb_sd; struct task_group *gb_tg; bool group_balancer; bool leap_level; unsigned long leap_level_timestamp; unsigned long adjust_level_timestamp; + unsigned long expiration_start; raw_spinlock_t gb_lock; #endif long priority; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index ff72c63f6129..fc0da990ae37 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -2116,6 +2116,15 @@ static struct ctl_table kern_table[] = { .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, + { + .procname = "sched_gb_expiration_ms", + .data = &sysctl_sched_gb_expiration_ms, + .maxlen = sizeof(unsigned long), + .mode = 0644, + .proc_handler = proc_doulongvec_minmax, + .extra1 = &zero_ul, + .extra2 = &long_max, + }, #endif #ifdef CONFIG_PROVE_LOCKING { -- Gitee From 0548109dfbb868c9ac2fa78f7054e38ce292d98f Mon Sep 17 00:00:00 2001 From: CruzZhao Date: Tue, 11 Nov 2025 20:50:58 +0800 Subject: [PATCH 04/11] anolis: sched: force to lower level if a taskgroup spans LLC ANBZ: #8765 Force to lower level if a taskgroup spans LLC instead of NUMA. Signed-off-by: CruzZhao --- kernel/sched/group_balancer.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/kernel/sched/group_balancer.c b/kernel/sched/group_balancer.c index d904ceb77669..44225b259e83 100644 --- a/kernel/sched/group_balancer.c +++ b/kernel/sched/group_balancer.c @@ -1803,11 +1803,11 @@ static bool tg_lower_level(struct task_group *tg) goto fail; tg->preferred_gb_sd = NULL; } -#ifdef CONFIG_NUMA + /* We won't allow a task group span more than two numa nodes too long. */ - if (dst->gb_flags & GROUP_BALANCER_NUMA_FLAG) + if (dst->gb_flags & GROUP_BALANCER_LLC_FLAG) goto lower; -#endif + /* If we lower the level, we have to make sure that we will not cause imbalance. * * src_load dst_load -- Gitee From a928c3f8c638c51334d651fad2328ea6e4a24fa0 Mon Sep 17 00:00:00 2001 From: CruzZhao Date: Mon, 10 Nov 2025 11:23:32 +0800 Subject: [PATCH 05/11] anolis: sched: introduce rq->nr_gb_running ANBZ: #8765 Introduce rq->nr_gb_running to indicate how many tasks in rq are scheduled by group balancer. 
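As a rough sketch (simplified from the hooks added in this patch, which are called from enqueue_task_fair()/dequeue_task_fair()), the counter is only touched for task groups managed by the group balancer:

  static inline void gb_update_nr_running(struct task_group *tg,
                                          struct rq *rq, int delta)
  {
          if (!group_balancer_enabled())
                  return;
          if (!tg || !tg_group_balancer_enabled(tg))
                  return;
          /* delta is +1 on enqueue and -1 on dequeue. */
          rq->nr_gb_running += delta;
  }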
Signed-off-by: CruzZhao --- kernel/sched/core.c | 45 ++----------------------- kernel/sched/fair.c | 80 ++++++++++++++++++++++++++++++++++++++++++++ kernel/sched/sched.h | 16 +++++++++ 3 files changed, 99 insertions(+), 42 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index c06619b98f99..6dbe01ba4c4e 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -10109,38 +10109,6 @@ static u64 cpu_group_balancer_read_u64(struct cgroup_subsys_state *css, return tg->group_balancer; } -static int tg_validate_group_balancer_down(struct task_group *tg, void *data) -{ - if (tg->group_balancer) - return -EINVAL; - return 0; -} - -/* - * There is only one task group allowed to enable group balancer in the path from - * root_task_group to a certion leaf task group. - */ -static int validate_group_balancer(struct task_group *tg) -{ - int retval = 0; - - rcu_read_lock(); - retval = walk_tg_tree_from(tg, tg_validate_group_balancer_down, - tg_nop, NULL); - if (retval) - goto out; - - for (; tg != &root_task_group; tg = tg->parent) { - if (tg->group_balancer) { - retval = -EINVAL; - break; - } - } -out: - rcu_read_unlock(); - return retval; -} - void lock_cfs_constraints_mutex(void) { mutex_lock(&cfs_constraints_mutex); @@ -10174,16 +10142,9 @@ static int cpu_group_balancer_write_u64(struct cgroup_subsys_state *css, if (old == new) goto out; - if (new) { - retval = validate_group_balancer(tg); - if (retval) - goto out; - retval = attach_tg_to_group_balancer_sched_domain(tg, NULL, true); - if (retval) - goto out; - } else { - detach_tg_from_group_balancer_sched_domain(tg, true); - } + retval = update_group_balancer(tg, new); + if (retval) + goto out; tg->group_balancer = new; out: raw_spin_unlock(&tg->gb_lock); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index ca96537f9e46..a18bdc61b105 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -8571,6 +8571,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) /* At this point se is NULL and we are at root level*/ add_nr_running(rq, 1); id_update_nr_running(task_group(p), p, rq, 1); + gb_update_nr_running(task_group(p), rq, 1); /* * Since new tasks are assigned an initial util_avg equal to @@ -8697,6 +8698,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) /* At this point se is NULL and we are at root level*/ sub_nr_running(rq, 1); id_update_nr_running(task_group(p), p, rq, -1); + gb_update_nr_running(task_group(p), rq, -1); /* balance early to pull high priority tasks */ if (unlikely(!was_sched_idle && sched_idle_rq(rq))) @@ -15302,3 +15304,81 @@ int sched_trace_rq_nr_running(struct rq *rq) return rq ? rq->nr_running : -1; } EXPORT_SYMBOL_GPL(sched_trace_rq_nr_running); + +#ifdef CONFIG_GROUP_BALANCER +static int tg_validate_group_balancer_down(struct task_group *tg, void *data) +{ + if (tg->group_balancer) + return -EINVAL; + return 0; +} + +/* + * There is only one task group allowed to enable group balancer in the path from + * root_task_group to a certion leaf task group. 
+ */ +static int validate_group_balancer(struct task_group *tg) +{ + int retval = 0; + + rcu_read_lock(); + retval = walk_tg_tree_from(tg, tg_validate_group_balancer_down, + tg_nop, NULL); + if (retval) + goto out; + + for (; tg != &root_task_group; tg = tg->parent) { + if (tg->group_balancer) { + retval = -EINVAL; + break; + } + } +out: + rcu_read_unlock(); + return retval; +} + +int update_group_balancer(struct task_group *tg, u64 new) +{ + int cpu, retval; + struct rq_flags rf; + unsigned int delta; + + if (new) { + retval = validate_group_balancer(tg); + if (retval) + return retval; + retval = attach_tg_to_group_balancer_sched_domain(tg, NULL, true); + if (retval) + return retval; + } else { + detach_tg_from_group_balancer_sched_domain(tg, true); + } + + cpus_read_lock(); + for_each_online_cpu(cpu) { + bool on_rq, throttled; + struct rq *rq = cpu_rq(cpu); + struct cfs_rq *cfs_rq; + struct sched_entity *se; + + rq_lock_irq(rq, &rf); + se = tg->se[cpu]; + cfs_rq = cfs_rq_of(se); + throttled = throttled_hierarchy(cfs_rq); + delta = se->my_q->h_nr_running; + on_rq = se->on_rq; + + if (on_rq && !throttled) { + if (new) + rq->nr_gb_running += delta; + else + rq->nr_gb_running -= delta; + } + rq_unlock_irq(rq, &rf); + } + cpus_read_unlock(); + + return 0; +} +#endif diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index c064862c3a8e..4193bfd0a92a 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1549,6 +1549,8 @@ struct rq { #ifdef CONFIG_GROUP_BALANCER struct group_balancer_sched_domain *gb_sd; + unsigned int nr_gb_running; + long nr_gb_make_up; bool group_balancer_enabled; #endif @@ -2967,6 +2969,20 @@ static inline void sub_nr_running(struct rq *rq, unsigned count) sched_update_tick_dependency(rq); } +#ifdef CONFIG_GROUP_BALANCER +static inline void gb_update_nr_running(struct task_group *tg, struct rq *rq, int delta) +{ + if (!group_balancer_enabled()) + return; + if (!tg || !tg_group_balancer_enabled(tg)) + return; + rq->nr_gb_running += delta; +} +extern int update_group_balancer(struct task_group *tg, u64 new); +#else +static inline void gb_update_nr_running(struct task_group *tg, struct rq *rq, int delta) { } +#endif + extern void activate_task(struct rq *rq, struct task_struct *p, int flags); extern void deactivate_task(struct rq *rq, struct task_struct *p, int flags); -- Gitee From d0394a49e345ebfa063bce4a1b91b997d0a7face Mon Sep 17 00:00:00 2001 From: CruzZhao Date: Wed, 12 Nov 2025 16:18:23 +0800 Subject: [PATCH 06/11] anolis: sched: correct free specs account ANBZ: #8765 When the quota of the task group changes, update the free specs of gb_sd. Change the type free_tg_specs into atomic_t to reduce spin lock competition. When select a idle gb_sd for a task group, hold a mutex lock to avoid competition with other task groups. 
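Roughly, the accounting becomes a lockless walk up the hierarchy (a simplified sketch of the reworked helper; callers charge -specs_ratio on attach and +specs_ratio on detach, and skip task groups whose specs_ratio is -1):

  static void update_free_tg_specs(struct group_balancer_sched_domain *gb_sd,
                                   int specs)
  {
          struct group_balancer_sched_domain *parent;

          /* Propagate the delta to gb_sd and all of its ancestors. */
          for (parent = gb_sd; parent; parent = parent->parent)
                  atomic_add(specs, &parent->free_tg_specs);
  }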
Fixes: 7f8e0c71335f ("anolis: sched: maintain group balancer task groups") Signed-off-by: CruzZhao --- include/linux/sched.h | 3 ++- kernel/cgroup/cpuset.c | 2 +- kernel/sched/fair.c | 20 ++++++++------ kernel/sched/group_balancer.c | 51 +++++++++++++++++++++-------------- 4 files changed, 46 insertions(+), 30 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index 89913709a766..48fdda47b88d 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2427,7 +2427,8 @@ static inline bool jbd2_proxy_exec_disabled(void) } #ifdef CONFIG_GROUP_BALANCER extern bool group_balancer_enabled(void); -extern void tg_specs_change(struct task_group *tg); +extern int get_tg_specs(struct task_group *tg); +extern void tg_specs_change(struct task_group *tg, u64 specs_before); extern bool tg_group_balancer_enabled(struct task_group *tg); extern struct task_group *cgroup_tg(struct cgroup *cgrp); extern struct cgroup *tg_cgroup(struct task_group *tg); diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index bb9da7a87371..d48296132e71 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -238,7 +238,7 @@ static void update_cpumask_for_group_balancer(struct cpuset *cs) return; lock_cfs_constraints_mutex(); - tg_specs_change(tg); + tg_specs_change(tg, get_tg_specs(tg)); unlock_cfs_constraints_mutex(); } #else diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index a18bdc61b105..605af34adad7 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -14734,19 +14734,18 @@ void free_fair_sched_group(struct task_group *tg) void tg_set_specs_ratio(struct task_group *tg) { u64 quota = tg_cfs_bandwidth(tg)->hierarchical_quota; - u64 specs_ratio; + u64 specs_ratio, specs_before; + specs_before = tg->specs_ratio; if (quota == RUNTIME_INF) { tg->specs_ratio = -1; - return; + } else { + specs_ratio = quota / ((1 << BW_SHIFT) / 100); + /* If specs_ratio is bigger than INT_MAX, set specs_ratio -1. */ + tg->specs_ratio = specs_ratio > INT_MAX ? -1 : specs_ratio; } - - specs_ratio = quota / ((1 << BW_SHIFT) / 100); - - /* If specs_ratio is bigger than INT_MAX, set specs_ratio -1. */ - tg->specs_ratio = specs_ratio > INT_MAX ? -1 : specs_ratio; if (tg->group_balancer) - tg_specs_change(tg); + tg_specs_change(tg, specs_before); } #endif @@ -15381,4 +15380,9 @@ int update_group_balancer(struct task_group *tg, u64 new) return 0; } + +int get_tg_specs(struct task_group *tg) +{ + return tg->specs_ratio; +} #endif diff --git a/kernel/sched/group_balancer.c b/kernel/sched/group_balancer.c index 44225b259e83..6f536172152c 100644 --- a/kernel/sched/group_balancer.c +++ b/kernel/sched/group_balancer.c @@ -42,7 +42,7 @@ struct group_balancer_sched_domain { unsigned int span_weight; unsigned int nr_children; /* If free_tg_specs is less than zero, the gb_sd is overloaded. 
*/ - int free_tg_specs; + atomic_t free_tg_specs; unsigned int depth; raw_spinlock_t lock; struct rb_root task_groups; @@ -153,6 +153,7 @@ struct group_balancer_size_level { LIST_HEAD(group_balancer_sched_domains); DEFINE_RWLOCK(group_balancer_sched_domain_lock); +DEFINE_MUTEX(group_balancer_select_lock); struct cpumask root_cpumask; @@ -767,7 +768,7 @@ static void add_to_tree(struct group_balancer_sched_domain *gb_sd, } gb_sd->span_weight = cpumask_weight(gb_sd_span(gb_sd)); gb_sd->lower_interval = ilog2(gb_sd->span_weight) * gb_sd->span_weight; - gb_sd->free_tg_specs = 100 * gb_sd->span_weight; + atomic_set(&gb_sd->free_tg_specs, 100 * gb_sd->span_weight); add_to_size_level(gb_sd); if (!gb_sd->nr_children) { @@ -1176,8 +1177,8 @@ static int build_group_balancer_sched_domains(void) group_balancer_root_domain->lower_interval = ilog2(group_balancer_root_domain->span_weight) * group_balancer_root_domain->span_weight; - group_balancer_root_domain->free_tg_specs = - 100 * group_balancer_root_domain->span_weight; + atomic_set(&group_balancer_root_domain->free_tg_specs, + 100 * group_balancer_root_domain->span_weight); } if (!zalloc_cpumask_var(&trial_cpumask, GFP_KERNEL)) { @@ -1565,14 +1566,16 @@ static struct group_balancer_sched_domain *select_idle_gb_sd(struct task_group * int max_unsatisfied_free_specs = INT_MIN; for_each_gb_sd_child(child, gb_sd) { + int free_tg_specs = atomic_read(&child->free_tg_specs); + if (gb_sd_satisfies_task_group(tg, child) && - child->free_tg_specs > max_free_specs) { + free_tg_specs > max_free_specs) { max_free_child = child; - max_free_specs = child->free_tg_specs; + max_free_specs = free_tg_specs; } else if (child->span_weight * 100 < specs && - child->free_tg_specs > max_unsatisfied_free_specs) { + free_tg_specs > max_unsatisfied_free_specs) { max_unsatisfied_free_child = child; - max_unsatisfied_free_specs = child->free_tg_specs; + max_unsatisfied_free_specs = free_tg_specs; } } if (!max_free_child) @@ -1622,13 +1625,8 @@ void update_free_tg_specs(struct group_balancer_sched_domain *gb_sd, int specs) { struct group_balancer_sched_domain *parent; - if (specs != -1) { - for (parent = gb_sd; parent; parent = parent->parent) { - raw_spin_lock(&parent->lock); - parent->free_tg_specs += specs; - raw_spin_unlock(&parent->lock); - } - } + for (parent = gb_sd; parent; parent = parent->parent) + atomic_add(specs, &parent->free_tg_specs); } /* @@ -1658,6 +1656,8 @@ void add_tg_to_group_balancer_sched_domain_locked(struct task_group *tg, check_task_group_leap_level(tg, gb_sd); tg->adjust_level_timestamp = jiffies; + if (tg->specs_ratio != -1) + update_free_tg_specs(gb_sd, -tg->specs_ratio); } void add_tg_to_group_balancer_sched_domain(struct task_group *tg, @@ -1667,7 +1667,6 @@ void add_tg_to_group_balancer_sched_domain(struct task_group *tg, raw_spin_lock(&gb_sd->lock); add_tg_to_group_balancer_sched_domain_locked(tg, gb_sd, enable); raw_spin_unlock(&gb_sd->lock); - update_free_tg_specs(gb_sd, -tg->specs_ratio); } static void @@ -1683,6 +1682,8 @@ remove_tg_from_group_balancer_sched_domain_locked(struct task_group *tg, update_h_nr_burst_tg(tg, false); if (disable) walk_tg_tree_from(tg, tg_unset_gb_tg_down, tg_nop, NULL); + if (tg->specs_ratio != -1) + update_free_tg_specs(gb_sd, tg->specs_ratio); } static void @@ -1694,7 +1695,6 @@ remove_tg_from_group_balancer_sched_domain(struct task_group *tg, raw_spin_lock(&gb_sd->lock); remove_tg_from_group_balancer_sched_domain_locked(tg, gb_sd, disable); raw_spin_unlock(&gb_sd->lock); - update_free_tg_specs(gb_sd, 
tg->specs_ratio); read_unlock(&group_balancer_sched_domain_lock); } @@ -1706,16 +1706,20 @@ int attach_tg_to_group_balancer_sched_domain(struct task_group *tg, int ret = 0; read_lock(&group_balancer_sched_domain_lock); - if (enable) + if (enable) { + mutex_lock(&group_balancer_select_lock); gb_sd = select_idle_gb_sd(tg); - else + } else { gb_sd = target; + } if (!gb_sd) { ret = -ESRCH; goto out; } add_tg_to_group_balancer_sched_domain(tg, gb_sd, enable); out: + if (enable) + mutex_unlock(&group_balancer_select_lock); read_unlock(&group_balancer_sched_domain_lock); return ret; } @@ -1887,7 +1891,7 @@ void task_tick_gb(struct task_struct *p) raw_spin_unlock(&tg->gb_lock); } -void tg_specs_change(struct task_group *tg) +void tg_specs_change(struct task_group *tg, u64 specs_before) { struct group_balancer_sched_domain *gb_sd; int specs = tg->specs_ratio; @@ -1897,6 +1901,13 @@ void tg_specs_change(struct task_group *tg) /* tg->group_balancer is always true here, so find a gb_sd to attach. */ goto upper; + if (specs_before != specs) { + if (specs_before != -1) + update_free_tg_specs(gb_sd, specs_before); + if (specs != -1) + update_free_tg_specs(gb_sd, -specs); + } + /* If the task group leaps level after specs change, we will lower it later. */ check_task_group_leap_level(tg, gb_sd); if (tg->leap_level) { -- Gitee From 77f966fcfc58900165d9fa5960c5236b5b760413 Mon Sep 17 00:00:00 2001 From: CruzZhao Date: Thu, 13 Nov 2025 11:06:45 +0800 Subject: [PATCH 07/11] anolis: sched: fix some bugs of group balancer ANBZ: #8765 When reference max_unsatisfied_free_child, check whether it's NULL. Init dst as NULL in tg_lower_level() to avoid invalid pointer reference. Check whether the dst is leap level instead of parent in tg_lower_level(). Fixes: 7f8e0c71335f ("anolis: sched: maintain group balancer task groups") Fixes: bebcfc550d82 ("anolis: sched: introduce dynamical load balance for group balancer") Signed-off-by: CruzZhao --- kernel/sched/group_balancer.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/kernel/sched/group_balancer.c b/kernel/sched/group_balancer.c index 6f536172152c..37dc25a73261 100644 --- a/kernel/sched/group_balancer.c +++ b/kernel/sched/group_balancer.c @@ -1593,9 +1593,9 @@ static struct group_balancer_sched_domain *select_idle_gb_sd(struct task_group * * specs cannot fully represent the degree of idleness if the span weight is * different. */ - if (max_free_specs < specs && + if (max_free_specs < specs && (!max_unsatisfied_free_child || max_free_specs / max_free_child->span_weight < - max_unsatisfied_free_specs / max_unsatisfied_free_child->span_weight) + max_unsatisfied_free_specs / max_unsatisfied_free_child->span_weight)) break; gb_sd = max_free_child; } @@ -1744,7 +1744,7 @@ static void tg_upper_level(struct task_group *tg, struct group_balancer_sched_do static bool tg_lower_level(struct task_group *tg) { struct group_balancer_sched_domain *gb_sd = tg->gb_sd; - struct group_balancer_sched_domain *child, *dst; + struct group_balancer_sched_domain *child, *dst = NULL; unsigned long tg_child_load, tg_load = 0, tg_dst_load = 0; unsigned long child_load, src_load, dst_load, total_load = 0, migrate_load; unsigned long child_cap, total_cap = 0, src_cap, dst_cap = 0; @@ -1838,7 +1838,7 @@ static bool tg_lower_level(struct task_group *tg) detach_tg_from_group_balancer_sched_domain(tg, false); attach_tg_to_group_balancer_sched_domain(tg, dst, false); /* The task group maybe still leap level, check it. 
*/ - check_task_group_leap_level(tg, gb_sd); + check_task_group_leap_level(tg, dst); return true; fail: -- Gitee From 9be57cebd63e21190564e69e0e2b638cd519a72f Mon Sep 17 00:00:00 2001 From: CruzZhao Date: Thu, 13 Nov 2025 18:48:34 +0800 Subject: [PATCH 08/11] anolis: sched: optimize load balance with nr_gb_running ANBZ: #8765 When judge whether there is overload in a group balancer sched domain, we'd better compare the sum of rq->nr_gb_running with gb_sd->span_weight, where rq->nr_gb_running indicates the running tasks in group balancer task groups. If the lower of the task group won't cause overload, just lower it. Signed-off-by: CruzZhao --- kernel/sched/group_balancer.c | 64 ++++++++++++++++++++++++++++++----- 1 file changed, 56 insertions(+), 8 deletions(-) diff --git a/kernel/sched/group_balancer.c b/kernel/sched/group_balancer.c index 37dc25a73261..37033980c716 100644 --- a/kernel/sched/group_balancer.c +++ b/kernel/sched/group_balancer.c @@ -1549,6 +1549,32 @@ static unsigned long gb_sd_capacity(struct group_balancer_sched_domain *gb_sd) return cap; } +static unsigned int gb_sd_nr_running(struct group_balancer_sched_domain *gb_sd) +{ + int cpu; + int nr_running = 0; + + for_each_cpu(cpu, gb_sd_span(gb_sd)) + nr_running += cpu_rq(cpu)->nr_gb_running; + + return nr_running; +} + +static unsigned int +tg_gb_sd_nr_running(struct task_group *tg, struct group_balancer_sched_domain *gb_sd) +{ + int cpu; + int nr_running = 0; + struct cfs_rq *cfs_rq; + + for_each_cpu(cpu, gb_sd_span(gb_sd)) { + cfs_rq = tg->cfs_rq[cpu]; + nr_running += cfs_rq->h_nr_running; + } + + return nr_running; +} + static struct group_balancer_sched_domain *select_idle_gb_sd(struct task_group *tg) { int specs = tg->specs_ratio; @@ -1748,6 +1774,8 @@ static bool tg_lower_level(struct task_group *tg) unsigned long tg_child_load, tg_load = 0, tg_dst_load = 0; unsigned long child_load, src_load, dst_load, total_load = 0, migrate_load; unsigned long child_cap, total_cap = 0, src_cap, dst_cap = 0; + unsigned int child_nr_running, dst_nr_running = 0, tg_child_nr_running; + unsigned int tg_nr_running = 0, tg_dst_nr_running = 0, migrate_nr_running; unsigned long src_imb, dst_imb; if (!gb_sd) @@ -1770,12 +1798,15 @@ static bool tg_lower_level(struct task_group *tg) for_each_gb_sd_child(child, gb_sd) { child_load = gb_sd_load(child); total_load += child_load; + child_nr_running = gb_sd_nr_running(child); child_cap = gb_sd_capacity(child); total_cap += child_cap; tg_child_load = tg_gb_sd_load(tg, child); tg_load += tg_child_load; + tg_child_nr_running = tg_gb_sd_nr_running(tg, child); + tg_nr_running += tg_child_nr_running; if (!gb_sd_satisfies_task_group(tg, child)) continue; if (!dst || tg_child_load > tg_dst_load) { @@ -1783,12 +1814,16 @@ static bool tg_lower_level(struct task_group *tg) tg_dst_load = tg_child_load; dst_load = child_load; dst_cap = child_cap; + tg_dst_nr_running = tg_child_nr_running; + dst_nr_running = child_nr_running; } else if (tg_child_load == tg_dst_load) { if (dst_load * child_cap > child_load * dst_cap) { dst = child; tg_dst_load = tg_child_load; dst_load = child_load; dst_cap = child_cap; + tg_dst_nr_running = tg_child_nr_running; + dst_nr_running = child_nr_running; } } } @@ -1812,6 +1847,11 @@ static bool tg_lower_level(struct task_group *tg) if (dst->gb_flags & GROUP_BALANCER_LLC_FLAG) goto lower; + /* If migration won't cause overload, do migrate.*/ + migrate_nr_running = tg_nr_running - tg_dst_nr_running; + if (dst_nr_running + migrate_nr_running <= dst->span_weight) + goto lower; + /* If we 
lower the level, we have to make sure that we will not cause imbalance. * * src_load dst_load @@ -2112,18 +2152,18 @@ static void gb_attach_task_groups(struct gb_lb_env *gb_env) static void __update_gb_sd_status(struct group_balancer_sched_domain *gb_sd, int *gb_sd_status) { - int i, nr_running; + int i, nr_gb_running = 0; for_each_cpu(i, gb_sd_span(gb_sd)) { struct rq *rq = cpu_rq(i); - nr_running = rq->nr_running; - if (nr_running > 1) - *gb_sd_status |= GB_OVERLOAD; - - if (gb_cpu_overutilized(i)) - *gb_sd_status |= GB_OVERUTILIZED; + nr_gb_running += rq->nr_gb_running; + /* TODO: Improve the utilization of GB_OVERUTILIZED.*/ +// if (gb_cpu_overutilized(i)) +// *gb_sd_status |= GB_OVERUTILIZED; } + if (nr_gb_running > gb_sd->span_weight) + *gb_sd_status |= GB_OVERLOAD; } static void update_gb_sd_status(struct gb_lb_env *gb_env, int *gb_sd_status) @@ -2147,6 +2187,7 @@ void gb_load_balance(struct lb_env *env) #ifdef CONFIG_CFS_BANDWIDTH bool burst = false; #endif + int src_status = 0; if (!group_balancer_enabled()) return; @@ -2178,13 +2219,20 @@ #endif goto unlock; } + gb_sd->last_balance_timestamp = jiffies; src_load = gb_sd_load(src); src_cap = gb_sd_capacity(src); dst_load = gb_sd_load(dst); dst_cap = gb_sd_capacity(dst); + __update_gb_sd_status(src, &src_status); - if (dst_load * src_cap * gb_sd->imbalance_pct >= src_load * dst_cap * 100) + /* + * If the imbalance isn't larger than imbalance_pct, and it isn't the case that + * dst is idle and src is overloaded, don't do the balance. + */ + if (dst_load * src_cap * gb_sd->imbalance_pct >= src_load * dst_cap * 100 && + !(available_idle_cpu(env->dst_cpu) && src_status)) goto unlock; gb_env = (struct gb_lb_env){ -- Gitee From 3378cc1120851c585c5ba15010692cdcc0f8db89 Mon Sep 17 00:00:00 2001 From: CruzZhao Date: Thu, 13 Nov 2025 23:22:35 +0800 Subject: [PATCH 09/11] anolis: sched: support cpu.group_balancer to be set to 2 ANBZ: #8765 Setting cpu.group_balancer to 2 means that the tg acquires double the logical cpus.
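For example (numbers for illustration only): gb_sd_satisfies_task_group() now checks tg->group_balancer * tg->specs_ratio <= 100 * soft_cpus_weight, so a task group whose quota is equivalent to 4 CPUs (i.e. specs_ratio = 400) and whose cpu.group_balancer is set to 2 needs a gb_sd with at least 8 allowed CPUs in its span, instead of 4.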
Signed-off-by: CruzZhao --- kernel/sched/core.c | 10 +++++++++- kernel/sched/group_balancer.c | 8 +++++--- kernel/sched/sched.h | 2 +- 3 files changed, 15 insertions(+), 5 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 6dbe01ba4c4e..68c95c75c8c5 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -10132,7 +10132,7 @@ static int cpu_group_balancer_write_u64(struct cgroup_subsys_state *css, if (tg == &root_task_group || task_group_is_autogroup(tg)) return -EACCES; - if (new > 1) + if (new > 2) return -EINVAL; write_lock(&group_balancer_lock); @@ -10142,6 +10142,14 @@ static int cpu_group_balancer_write_u64(struct cgroup_subsys_state *css, if (old == new) goto out; + if (!!old == !!new) { + mutex_lock(&cfs_constraints_mutex); + tg_specs_change(tg, tg->specs_ratio); + mutex_unlock(&cfs_constraints_mutex); + tg->group_balancer = new; + goto out; + } + retval = update_group_balancer(tg, new); if (retval) goto out; diff --git a/kernel/sched/group_balancer.c b/kernel/sched/group_balancer.c index 37033980c716..d200284432bc 100644 --- a/kernel/sched/group_balancer.c +++ b/kernel/sched/group_balancer.c @@ -320,7 +320,7 @@ static void add_to_size_level(struct group_balancer_sched_domain *gb_sd) bool tg_group_balancer_enabled(struct task_group *tg) { - return tg->group_balancer; + return !!tg->group_balancer; } struct cgroup *tg_cgroup(struct task_group *tg) @@ -342,7 +342,8 @@ gb_sd_satisfies_task_group(struct task_group *tg, struct group_balancer_sched_do cpumask_and(&soft_cpus_allowed, cpus_allowed, gb_sd_span(gb_sd)); soft_cpus_weight = cpumask_weight(&soft_cpus_allowed); } - return tg->specs_ratio <= 100 * soft_cpus_weight; + /* tg->group_balancer = 2 means that tg aquires double logical cpus. */ + return tg->group_balancer * tg->specs_ratio <= 100 * soft_cpus_weight; } #else static inline bool @@ -1961,7 +1962,8 @@ void tg_specs_change(struct task_group *tg, u64 specs_before) /* The specs doesn't satisfy anymore, upper to find a satisfied gb_sd. */ /* Fast path, if the specs is -1 or too large, move it to root domain. */ - if (specs == -1 || specs > group_balancer_root_domain->span_weight * 100) { + if (specs == -1 || + tg->group_balancer * specs > group_balancer_root_domain->span_weight * 100) { gb_sd = group_balancer_root_domain; goto upper; } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 4193bfd0a92a..6b21ea05b630 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -620,7 +620,7 @@ struct task_group { struct group_balancer_sched_domain *gb_sd; struct group_balancer_sched_domain *preferred_gb_sd; struct task_group *gb_tg; - bool group_balancer; + unsigned int group_balancer; bool leap_level; unsigned long leap_level_timestamp; unsigned long adjust_level_timestamp; -- Gitee From 1891410aa4877ad658ce17bde91b075714e107c3 Mon Sep 17 00:00:00 2001 From: CruzZhao Date: Fri, 14 Nov 2025 10:23:55 +0800 Subject: [PATCH 10/11] anolis: sched: consider about specs balance for group balancer ANBZ: #8765 When lower the level of a task group, consider whether it will cause more imbalance between src free specs and dst free specs, to avoid imbalance. However, consider the case following: some task groups have large specs but have low load, and in this case the specs loses its reference value. So we introduce a sched feat to control whether we consider about specs balance: GB_SPECS_BALANCE. 
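A worked example of the GB_SPECS_BALANCE check (numbers are for illustration only): suppose gb_sd has two children of 16 CPUs each, with src_free_specs = 800 and dst_free_specs = 400, and the task group has specs_ratio = 300. Then src_imb = |800*16 - 400*16| = 6400, while after the move dst would drop to 100 free specs, giving dst_imb = |800*16 - 100*16| = 11200. Since dst_imb > src_imb, the lowering is rejected. With the feature disabled (the default), this check is skipped entirely.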
Signed-off-by: CruzZhao --- kernel/sched/features.h | 4 ++++ kernel/sched/group_balancer.c | 33 +++++++++++++++++++++++++++++++++ 2 files changed, 37 insertions(+) diff --git a/kernel/sched/features.h b/kernel/sched/features.h index 8f30a023365f..d45ae1e86d16 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h @@ -119,6 +119,10 @@ SCHED_FEAT(SCHED_CORE_HT_AWARE_QUOTA, false) SCHED_FEAT(SCHED_CORE_VRUNTIME, false) #endif +#ifdef CONFIG_GROUP_BALANCER +SCHED_FEAT(GB_SPECS_BALANCE, false) +#endif + SCHED_FEAT(SCHED_FEAT_RESERVE1, false) SCHED_FEAT(SCHED_FEAT_RESERVE2, false) SCHED_FEAT(SCHED_FEAT_RESERVE3, false) diff --git a/kernel/sched/group_balancer.c b/kernel/sched/group_balancer.c index d200284432bc..1a4a02f98991 100644 --- a/kernel/sched/group_balancer.c +++ b/kernel/sched/group_balancer.c @@ -1778,6 +1778,9 @@ static bool tg_lower_level(struct task_group *tg) unsigned int child_nr_running, dst_nr_running = 0, tg_child_nr_running; unsigned int tg_nr_running = 0, tg_dst_nr_running = 0, migrate_nr_running; unsigned long src_imb, dst_imb; + int total_free_specs = 0, child_free_specs = 0, dst_free_specs = 0, src_free_specs = 0; + int tg_specs; + unsigned int src_span_weight, dst_span_weight; if (!gb_sd) goto fail; @@ -1808,6 +1811,9 @@ static bool tg_lower_level(struct task_group *tg) tg_load += tg_child_load; tg_child_nr_running = tg_gb_sd_nr_running(tg, child); tg_nr_running += tg_child_nr_running; + + child_free_specs = atomic_read(&child->free_tg_specs); + total_free_specs += child_free_specs; if (!gb_sd_satisfies_task_group(tg, child)) continue; if (!dst || tg_child_load > tg_dst_load) { @@ -1817,6 +1823,7 @@ static bool tg_lower_level(struct task_group *tg) dst_cap = child_cap; tg_dst_nr_running = tg_child_nr_running; dst_nr_running = child_nr_running; + dst_free_specs = child_free_specs; } else if (tg_child_load == tg_dst_load) { if (dst_load * child_cap > child_load * dst_cap) { dst = child; @@ -1825,6 +1832,7 @@ static bool tg_lower_level(struct task_group *tg) dst_cap = child_cap; tg_dst_nr_running = tg_child_nr_running; dst_nr_running = child_nr_running; + dst_free_specs = child_free_specs; } } } @@ -1873,6 +1881,31 @@ static bool tg_lower_level(struct task_group *tg) if (dst_imb > src_imb) goto fail; + + if (!sched_feat(GB_SPECS_BALANCE)) + goto lower; + /* + * If we lower the level, we'd better guarantee that free specs won't be more imbalance. + * + * src_free_specs dst_free_specs + * --------------- vs -------------- + * src_span_weight dst_span_weight + * + */ + tg_specs = tg->specs_ratio; + src_free_specs = total_free_specs - dst_free_specs; + dst_span_weight = dst->span_weight; + src_span_weight = gb_sd->span_weight - dst_span_weight; + src_imb = abs(src_free_specs * dst_span_weight - dst_free_specs * src_span_weight); + dst_imb = abs(src_free_specs * dst_span_weight - + (dst_free_specs - tg_specs) * src_span_weight); + + if (dst_free_specs * src_span_weight > src_free_specs * dst_span_weight) + goto fail; + + if (dst_imb > src_imb) + goto fail; + #ifdef CONFIG_NUMA lower: #endif -- Gitee From cdc1ec5fdd9300a7c8d3888b63aa72a168c6b5ce Mon Sep 17 00:00:00 2001 From: CruzZhao Date: Fri, 14 Nov 2025 14:37:16 +0800 Subject: [PATCH 11/11] anolis: sched: try to find idle cpu in preferred_gb_sd first ANBZ: #8765 When select idle cpu, try to find an idle cpu in preferred_gb_sd first, if no, try to find one in llc. 
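The resulting flow in select_idle_cpu() is roughly the following sketch (gb_applicable() and scan_for_idle() are just stand-ins for the existing enable checks and scan loop, not real functions):

  /* First try the span of the task group's preferred gb_sd, if any. */
  if (gb_applicable(p) && preferred)
          cpumask_and(cpus, get_gb_sd_span(preferred), task_allowed_cpu(p));
  else
          cpumask_and(cpus, sched_domain_span(sd), task_allowed_cpu(p));

  idle_cpu = scan_for_idle(cpus);
  if (idle_cpu < 0 && !gb_tried) {
          /* Nothing idle in the preferred span: retry over the whole LLC. */
          gb_tried = true;
          cpumask_and(cpus, sched_domain_span(sd), task_allowed_cpu(p));
          idle_cpu = scan_for_idle(cpus);
  }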
Signed-off-by: CruzZhao --- kernel/sched/fair.c | 34 ++++++++++++++++++++++++++++++++-- kernel/sched/group_balancer.c | 5 +++++ kernel/sched/sched.h | 1 + 3 files changed, 38 insertions(+), 2 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 605af34adad7..a3fdcca1bd57 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -9242,12 +9242,27 @@ select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool has_idle_co struct sched_domain *this_sd; u64 time; bool is_seeker; +#ifdef CONFIG_GROUP_BALANCER + struct task_group *tg = task_group(p); + bool gb_tried = false; + struct group_balancer_sched_domain *preferred = tg->preferred_gb_sd; +#endif this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc)); if (!this_sd) return -1; +#ifdef CONFIG_GROUP_BALANCER +retry: + if (group_balancer_enabled() && !gb_tried && tg_group_balancer_enabled(tg) && preferred) { + cpumask_and(cpus, get_gb_sd_span(preferred), task_allowed_cpu(p)); + } else { + gb_tried = true; + cpumask_and(cpus, sched_domain_span(sd), task_allowed_cpu(p)); + } +#else cpumask_and(cpus, sched_domain_span(sd), task_allowed_cpu(p)); +#endif if (sched_feat(SIS_PROP) && !has_idle_core) { u64 avg_cost, avg_idle, span_avg; @@ -9284,7 +9299,7 @@ select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool has_idle_co return i; } else { if (--nr <= 0) - return -1; + goto out; idle_cpu = __select_idle_cpu(cpu, p, &id_backup); if ((unsigned int)idle_cpu < nr_cpumask_bits) return idle_cpu; @@ -9301,13 +9316,20 @@ select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool has_idle_co return i; } else { if (--nr <= 0) - return -1; + goto out; idle_cpu = __select_idle_cpu(cpu, p, &id_backup); if ((unsigned int)idle_cpu < nr_cpumask_bits) break; } } +#ifdef CONFIG_GROUP_BALANCER + if (!gb_tried) { + gb_tried = true; + goto retry; + } +#endif + if (has_idle_core) set_idle_cores(target, false); @@ -9319,6 +9341,14 @@ select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool has_idle_co if (!group_identity_disabled()) return (unsigned int)idle_cpu < nr_cpumask_bits ? idle_cpu : id_backup; return idle_cpu; +out: +#ifdef CONFIG_GROUP_BALANCER + if (!gb_tried) { + gb_tried = true; + goto retry; + } +#endif + return -1; } /* diff --git a/kernel/sched/group_balancer.c b/kernel/sched/group_balancer.c index 1a4a02f98991..3ef722098355 100644 --- a/kernel/sched/group_balancer.c +++ b/kernel/sched/group_balancer.c @@ -289,6 +289,11 @@ static inline struct cpumask *gb_sd_span(struct group_balancer_sched_domain *gb_ return to_cpumask(gb_sd->span); } +struct cpumask *get_gb_sd_span(struct group_balancer_sched_domain *gb_sd) +{ + return gb_sd_span(gb_sd); +} + static unsigned int get_size_level(struct group_balancer_sched_domain *gb_sd) { int size_level = ilog2(gb_sd->span_weight); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 6b21ea05b630..8b9eb580b9dc 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -3769,6 +3769,7 @@ extern void gb_load_balance(struct lb_env *env); extern void task_tick_gb(struct task_struct *p); extern void util_est_reenqueue_all(void); extern void util_est_clear_all(void); +extern struct cpumask *get_gb_sd_span(struct group_balancer_sched_domain *gb_sd); #ifdef CONFIG_CFS_BANDWIDTH extern void tg_burst_change(struct task_group *tg, u64 burst); #endif -- Gitee