From e5bf95d5b9168670dd4fef0b360b3e66361529e6 Mon Sep 17 00:00:00 2001 From: Tianchen Ding Date: Wed, 9 Mar 2022 14:58:23 +0800 Subject: [PATCH 1/6] anolis: sched: introduce ACPU accounting ANBZ: #6729 When SMT is on, tasks will be disturbed by the tasks on their SMT siblings, which makes them run fast at some times and slowly at others. So far, there isn't any way to assess how much disturbance a task has received. To assess the SMT disturbance, we introduce ACPU (assess CPU), which accounts how long the task runs with its SMT sibling idle. The statistical data is shown in /proc/<pid>/sched, row se.statistics.core_sibidle_sum. The data is counted and shown only when kernel.sched_schedstats is on. Co-developed-by: Cruz Zhao Signed-off-by: Tianchen Ding Signed-off-by: Cruz Zhao --- include/linux/kernel_stat.h | 6 +++ include/linux/sched.h | 10 +++++ kernel/sched/core.c | 90 +++++++++++++++++++++++++++++++++++++ kernel/sched/cputime.c | 8 ++++ kernel/sched/debug.c | 3 ++ kernel/sched/sched.h | 6 +++ kernel/smpboot.c | 1 + lib/Kconfig.debug | 7 +++ 8 files changed, 131 insertions(+) diff --git a/include/linux/kernel_stat.h b/include/linux/kernel_stat.h index ca7ac6734c41..90d4b8c4c94d 100644 --- a/include/linux/kernel_stat.h +++ b/include/linux/kernel_stat.h @@ -30,6 +30,9 @@ enum cpu_usage_stat { CPUTIME_GUEST_NICE, #ifdef CONFIG_SCHED_CORE CPUTIME_FORCEIDLE, +#endif +#ifdef CONFIG_SCHED_ACPU + CPUTIME_SIBIDLE, #endif NR_STATS, }; @@ -122,5 +125,8 @@ extern void account_idle_ticks(unsigned long ticks); #ifdef CONFIG_SCHED_CORE extern void __account_forceidle_time(struct task_struct *tsk, u64 delta); #endif +#ifdef CONFIG_SCHED_ACPU +extern void __account_sibidle_time(struct task_struct *tsk, u64 delta); +#endif #endif /* _LINUX_KERNEL_STAT_H */ diff --git a/include/linux/sched.h b/include/linux/sched.h index 5b04709f47b2..260568830e3d 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -500,6 +500,10 @@ struct sched_statistics { u64 core_forceidle_sum; #endif +#ifdef CONFIG_SCHED_ACPU + u64 core_sibidle_sum; +#endif + CK_KABI_RESERVE(1) CK_KABI_RESERVE(2) CK_KABI_RESERVE(3) @@ -2347,4 +2351,10 @@ static inline void sched_core_fork(struct task_struct *p) { } static inline int sched_core_idle_cpu(int cpu) { return idle_cpu(cpu); } #endif +#ifdef CONFIG_SCHED_ACPU +extern void acpu_enable(void); +#else +static inline void acpu_enable(void) { } +#endif + #endif diff --git a/kernel/sched/core.c b/kernel/sched/core.c index a55f28d876d0..ae32d0780f26 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -77,6 +77,10 @@ unsigned int sysctl_sched_cfs_bw_burst_onset_percent; unsigned int sysctl_sched_cfs_bw_burst_enabled = 1; #endif +#ifdef CONFIG_SCHED_ACPU +DEFINE_STATIC_KEY_FALSE(acpu_enabled); +#endif + /* * period over which we measure -rt task CPU usage in us. * default: 1s */ @@ -3913,6 +3917,84 @@ fire_sched_out_preempt_notifiers(struct task_struct *curr, #endif /* CONFIG_PREEMPT_NOTIFIERS */ +#ifdef CONFIG_SCHED_ACPU +void acpu_enable(void) +{ + int i; + + for_each_possible_cpu(i) { + struct rq *rq = cpu_rq(i); + + /* It may not be that accurate, but useful enough. 
*/ + rq->last_acpu_update_time = rq->clock; + } + static_branch_enable(&acpu_enabled); +} + +static void update_acpu(struct rq *rq, struct task_struct *prev, struct task_struct *next) +{ + const int cpu = cpu_of(rq); + const struct cpumask *smt_mask = cpu_smt_mask(cpu); + u64 now = rq_clock(rq); + u64 sibidle_sum, last_update_time; + s64 delta, last; + int i; + + if (!static_branch_likely(&acpu_enabled) || !schedstat_enabled()) + return; + + /* Update idle sum and busy sum for current rq. */ + delta = now - rq->last_acpu_update_time; + if (prev == rq->idle) + rq->acpu_idle_sum += delta; + + /* + * Be careful, smt_mask may be NULL. + * We only consider the case of two SMT siblings at this stage. + */ + if (unlikely(!smt_mask) || unlikely(cpumask_weight(smt_mask) != 2)) + goto out; + + for_each_cpu(i, smt_mask) { + if (i != cpu) { + struct rq *rq_i = cpu_rq(i); + struct task_struct *curr_i = rq_i->curr; + + last = (s64)(rq->last_acpu_update_time - + rq_i->last_acpu_update_time); + last_update_time = last >= 0 ? rq->last_acpu_update_time : + rq_i->last_acpu_update_time; + /* + * Sibling may update acpu at the same time, and its + * timestamp may be newer than this rq's. + */ + delta = now - last_update_time; + delta = delta > 0 ? delta : 0; + + /* Add the delta to improve accuracy. */ + sibidle_sum = last >= 0 ? rq->sibidle_sum : rq_i->acpu_idle_sum; + if (curr_i == rq_i->idle) + sibidle_sum += delta; + } + } + + if (prev != rq->idle) { + delta = sibidle_sum - rq->sibidle_sum; + delta = delta > 0 ? delta : 0; + __account_sibidle_time(prev, delta); + } + + if (next != rq->idle) + rq->sibidle_sum = sibidle_sum; +out: + rq->last_acpu_update_time = now; +} +#else +static inline void update_acpu(struct rq *rq, struct task_struct *prev, struct task_struct *next) +{ +} +#endif /* CONFIG_SCHED_ACPU */ + static inline void prepare_task(struct task_struct *next) { #ifdef CONFIG_SMP @@ -4004,6 +4086,7 @@ prepare_task_switch(struct rq *rq, struct task_struct *prev, { kcov_prepare_switch(prev); sched_info_switch(rq, prev, next); + update_acpu(rq, prev, next); perf_event_task_sched_out(prev, next); rseq_preempt(prev); fire_sched_out_preempt_notifiers(prev, next); @@ -4456,6 +4539,7 @@ void scheduler_tick(void) thermal_pressure = arch_scale_thermal_pressure(cpu_of(rq)); update_thermal_load_avg(rq_clock_thermal(rq), rq, thermal_pressure); curr->sched_class->task_tick(rq, curr, 0); + update_acpu(rq, curr, curr); calc_global_load_tick(rq); psi_task_tick(rq); sched_core_tick(rq); @@ -8171,6 +8255,12 @@ void __init sched_init(void) rq_csd_init(rq, &rq->nohz_csd, nohz_csd_func); #endif #endif /* CONFIG_SMP */ + +#ifdef CONFIG_SCHED_ACPU + rq->acpu_idle_sum = 0; + rq->sibidle_sum = 0; + rq->last_acpu_update_time = rq->clock; +#endif hrtick_rq_init(rq); atomic_set(&rq->nr_iowait, 0); #if defined(CONFIG_GROUP_IDENTITY) && defined(CONFIG_SCHED_SMT) diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 48a0a4fa02ef..56b446412467 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -238,6 +238,14 @@ void __account_forceidle_time(struct task_struct *p, u64 delta) task_group_account_field(p, CPUTIME_FORCEIDLE, delta); } #endif +#ifdef CONFIG_SCHED_ACPU +void __account_sibidle_time(struct task_struct *p, u64 delta) +{ + __schedstat_add(p->se.statistics.core_sibidle_sum, delta); + + task_group_account_field(p, CPUTIME_SIBIDLE, delta); +} +#endif /* * When a guest is interrupted for a longer amount of time, missed clock diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 
9d30ea76022e..3e1fd969ff9b 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -1025,6 +1025,9 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) #ifdef CONFIG_SCHED_CORE PN_SCHEDSTAT(se.statistics.core_forceidle_sum); +#endif +#ifdef CONFIG_SCHED_ACPU + PN_SCHEDSTAT(se.statistics.core_sibidle_sum); #endif } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 448c3fd61680..c27595620fb6 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1319,6 +1319,12 @@ struct rq { struct task_struct *force_idled_core_pick; #endif +#ifdef CONFIG_SCHED_ACPU + u64 acpu_idle_sum; + u64 sibidle_sum; + u64 last_acpu_update_time; +#endif + CK_KABI_RESERVE(1) CK_KABI_RESERVE(2) CK_KABI_RESERVE(3) diff --git a/kernel/smpboot.c b/kernel/smpboot.c index e4163042c4d6..cd94ea587d3e 100644 --- a/kernel/smpboot.c +++ b/kernel/smpboot.c @@ -73,6 +73,7 @@ void __init idle_threads_init(void) if (cpu != boot_cpu) idle_init(cpu); } + acpu_enable(); } #endif diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 7c63c6787adf..6684c5a2e71f 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -1132,6 +1132,13 @@ config SCHEDSTATS_HOST this config. If Y here, the default value of it is 1, and if N, the value is 0. +config SCHED_ACPU + bool "ACPU info: account idle time of smt to task" + depends on DEBUG_KERNEL && PROC_FS && SMP && SCHED_SMT + default y + help + Add ACPU info in /proc/<pid>/sched. + endmenu config DEBUG_TIMEKEEPING -- Gitee From bf3d3eb3483fe1594772dc47bb84199d412b3492 Mon Sep 17 00:00:00 2001 From: Cruz Zhao Date: Tue, 19 Sep 2023 17:22:19 +0800 Subject: [PATCH 2/6] anolis: sched: introduce sysctl_sched_acpu_enabled ANBZ: #6729 To allow ACPU accounting to be turned on and off dynamically, we introduce sysctl_sched_acpu_enabled instead of enabling it by default. 
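For example, the accounting can be toggled at run time from user space roughly as follows (an illustrative sketch, not part of this patch; it only assumes the procnames above, i.e. /proc/sys/kernel/sched_acpu and the existing /proc/sys/kernel/sched_schedstats):

	#include <fcntl.h>
	#include <stdio.h>
	#include <unistd.h>

	/* Write a single-character value such as "1" to a sysctl file. */
	static int write_sysctl(const char *path, const char *val)
	{
		int fd = open(path, O_WRONLY);

		if (fd < 0)
			return -1;
		if (write(fd, val, 1) != 1) {
			close(fd);
			return -1;
		}
		return close(fd);
	}

	int main(void)
	{
		/* ACPU data is only collected while schedstats is also on. */
		if (write_sysctl("/proc/sys/kernel/sched_schedstats", "1"))
			perror("sched_schedstats");
		if (write_sysctl("/proc/sys/kernel/sched_acpu", "1"))
			perror("sched_acpu");
		return 0;
	}
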
Signed-off-by: Cruz Zhao --- include/linux/sched.h | 6 ------ include/linux/sched/sysctl.h | 6 ++++++ kernel/sched/core.c | 32 +++++++++++++++++++++++++++++++- kernel/smpboot.c | 1 - kernel/sysctl.c | 11 +++++++++++ 5 files changed, 48 insertions(+), 8 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index 260568830e3d..16eebd11f3ae 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2351,10 +2351,4 @@ static inline void sched_core_fork(struct task_struct *p) { } static inline int sched_core_idle_cpu(int cpu) { return idle_cpu(cpu); } #endif -#ifdef CONFIG_SCHED_ACPU -extern void acpu_enable(void); -#else -static inline void acpu_enable(void) { } -#endif - #endif diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h index e6dbcbd03b8d..9fcea8ed4068 100644 --- a/include/linux/sched/sysctl.h +++ b/include/linux/sched/sysctl.h @@ -119,4 +119,10 @@ int sched_energy_aware_handler(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos); #endif +#ifdef CONFIG_SCHED_ACPU +extern unsigned int sysctl_sched_acpu_enabled; +extern int sched_acpu_enable_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos); +#endif #endif /* _LINUX_SCHED_SYSCTL_H */ diff --git a/kernel/sched/core.c b/kernel/sched/core.c index ae32d0780f26..419e0ae7d0d6 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -79,6 +79,7 @@ unsigned int sysctl_sched_cfs_bw_burst_enabled = 1; #ifdef CONFIG_SCHED_ACPU DEFINE_STATIC_KEY_FALSE(acpu_enabled); +unsigned int sysctl_sched_acpu_enabled; #endif /* @@ -3918,7 +3919,7 @@ fire_sched_out_preempt_notifiers(struct task_struct *curr, #endif /* CONFIG_PREEMPT_NOTIFIERS */ #ifdef CONFIG_SCHED_ACPU -void acpu_enable(void) +static void acpu_enable(void) { int i; @@ -3931,6 +3932,35 @@ void acpu_enable(void) static_branch_enable(&acpu_enabled); } +static void acpu_disable(void) +{ + static_branch_disable(&acpu_enabled); +} + +int sched_acpu_enable_handler(struct ctl_table *table, int write, void __user *buffer, + size_t *lenp, loff_t *ppos) +{ + int ret; + unsigned int old, new; + + if (!write) { + ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); + return ret; + } + + old = sysctl_sched_acpu_enabled; + ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); + new = sysctl_sched_acpu_enabled; + if (!ret && write && (old != new)) { + if (new) + acpu_enable(); + else + acpu_disable(); + } + + return ret; +} + static void update_acpu(struct rq *rq, struct task_struct *prev, struct task_struct *next) { const int cpu = cpu_of(rq); diff --git a/kernel/smpboot.c b/kernel/smpboot.c index cd94ea587d3e..e4163042c4d6 100644 --- a/kernel/smpboot.c +++ b/kernel/smpboot.c @@ -73,7 +73,6 @@ void __init idle_threads_init(void) if (cpu != boot_cpu) idle_init(cpu); } - acpu_enable(); } #endif diff --git a/kernel/sysctl.c b/kernel/sysctl.c index ec38014b9ac1..1fbce664ea31 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -2038,6 +2038,17 @@ static struct ctl_table kern_table[] = { .extra2 = SYSCTL_ONE, }, #endif +#ifdef CONFIG_SCHED_ACPU + { + .procname = "sched_acpu", + .data = &sysctl_sched_acpu_enabled, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = sched_acpu_enable_handler, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, +#endif /* CONFIG_SCHED_ACPU*/ #ifdef CONFIG_PROVE_LOCKING { .procname = "prove_locking", -- Gitee From 74c97b025e5af384395ed4d90711f9f971077d74 Mon Sep 17 00:00:00 2001 From: Cruz Zhao Date: Sat, 30 Sep 2023 
14:52:52 +0800 Subject: [PATCH 3/6] anolis: sched: account sibidle for core scheduling ANBZ: #6729 Account sibling idle time for core scheduling, which is the time a cookie'd task runs while its SMT sibling is idle, including sibling forced idle time and sibling real idle time, collectively called sibidle. A few details: - For SMT > 2, we scale the amount of idle charged based on the number of idle siblings in function __sched_core_account_sibidle(). Additionally, we split the time up and evenly charge it to all running tasks, as each is equally responsible for the idle. - When core sched is enabled and the sibidle count is not zero, we account sibidle in function __sched_core_account_sibidle(), otherwise in function update_acpu(). Signed-off-by: Cruz Zhao --- include/linux/kernel_stat.h | 9 +++---- include/linux/sched.h | 2 +- kernel/sched/core.c | 50 ++++++++++++++++++++++++------------- kernel/sched/core_sched.c | 38 +++++++++++++++++----------- kernel/sched/cputime.c | 19 ++++++-------- kernel/sched/sched.h | 18 ++++++------- 6 files changed, 74 insertions(+), 62 deletions(-) diff --git a/include/linux/kernel_stat.h b/include/linux/kernel_stat.h index 90d4b8c4c94d..3b3a4467e006 100644 --- a/include/linux/kernel_stat.h +++ b/include/linux/kernel_stat.h @@ -31,7 +31,7 @@ enum cpu_usage_stat { #ifdef CONFIG_SCHED_CORE CPUTIME_FORCEIDLE, #endif -#ifdef CONFIG_SCHED_ACPU +#if defined(CONFIG_SCHED_ACPU) || defined(CONFIG_SCHED_CORE) CPUTIME_SIBIDLE, #endif NR_STATS, @@ -122,11 +122,8 @@ extern void account_process_tick(struct task_struct *, int user); extern void account_idle_ticks(unsigned long ticks); -#ifdef CONFIG_SCHED_CORE -extern void __account_forceidle_time(struct task_struct *tsk, u64 delta); -#endif -#ifdef CONFIG_SCHED_ACPU -extern void __account_sibidle_time(struct task_struct *tsk, u64 delta); +#if defined(CONFIG_SCHED_ACPU) || defined(CONFIG_SCHED_CORE) +extern void __account_sibidle_time(struct task_struct *tsk, u64 delta, bool fi); #endif #endif /* _LINUX_KERNEL_STAT_H */ diff --git a/include/linux/sched.h b/include/linux/sched.h index 16eebd11f3ae..9dcfe12be015 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -500,7 +500,7 @@ struct sched_statistics { u64 core_forceidle_sum; #endif -#ifdef CONFIG_SCHED_ACPU +#if defined(CONFIG_SCHED_ACPU) || defined(CONFIG_SCHED_CORE) u64 core_sibidle_sum; #endif diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 419e0ae7d0d6..f803bef95640 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -305,7 +305,7 @@ static void __sched_core_flip(bool enabled) for_each_cpu(t, smt_mask) cpu_rq(t)->core_enabled = enabled; - cpu_rq(cpu)->core->core_forceidle_start = 0; + cpu_rq(cpu)->core->core_sibidle_start = 0; sched_core_unlock(cpu, &flags); @@ -3973,6 +3973,15 @@ static void update_acpu(struct rq *rq, struct task_struct *prev, struct task_str if (!static_branch_likely(&acpu_enabled) || !schedstat_enabled()) return; + /* + * If core sched is enabled and core_sibidle_count is not zero, we update sibidle + * time in function __sched_core_account_sibidle(). + */ +#ifdef CONFIG_SCHED_CORE + if (rq->core->core_sibidle_count) + goto out; +#endif + /* Update idle sum and busy sum for current rq. */ delta = now - rq->last_acpu_update_time; if (prev == rq->idle) @@ -4011,7 +4020,7 @@ static void update_acpu(struct rq *rq, struct task_struct *prev, struct task_str if (prev != rq->idle) { delta = sibidle_sum - rq->sibidle_sum; delta = delta > 0 ? 
delta : 0; - __account_sibidle_time(prev, delta); + __account_sibidle_time(prev, delta, false); } if (next != rq->idle) @@ -5036,18 +5045,21 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) /* reset state */ rq->core->core_cookie = 0UL; - if (rq->core->core_forceidle_count) { + if (rq->core->core_sibidle_count) { if (!core_clock_updated) { update_rq_clock(rq->core); core_clock_updated = true; } - sched_core_account_forceidle(rq); + sched_core_account_sibidle(rq); /* reset after accounting force idle */ - rq->core->core_forceidle_start = 0; - rq->core->core_forceidle_count = 0; - rq->core->core_forceidle_occupation = 0; - need_sync = true; - fi_before = true; + rq->core->core_sibidle_start = 0; + rq->core->core_sibidle_count = 0; + rq->core->core_sibidle_occupation = 0; + if (rq->core->core_forceidle_count) { + rq->core->core_forceidle_count = 0; + need_sync = true; + fi_before = true; + } } /* @@ -5123,6 +5135,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) rq_i->core_pick = p; if (p == rq_i->idle) { + rq->core->core_sibidle_count++; if (rq_i->nr_running) { rq->core->core_forceidle_count++; if (!fi_before) @@ -5133,9 +5146,9 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) } } - if (schedstat_enabled() && rq->core->core_forceidle_count) { - rq->core->core_forceidle_start = rq_clock(rq->core); - rq->core->core_forceidle_occupation = occ; + if (schedstat_enabled() && rq->core->core_sibidle_count) { + rq->core->core_sibidle_start = rq_clock(rq->core); + rq->core->core_sibidle_occupation = occ; } rq->core->core_pick_seq = rq->core->core_task_seq; @@ -5177,7 +5190,8 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) if (!(fi_before && rq->core->core_forceidle_count)) task_vruntime_update(rq_i, rq_i->core_pick, !!rq->core->core_forceidle_count); - rq_i->core_pick->core_occupation = occ; + if (rq->core->core_forceidle_count) + rq_i->core_pick->core_occupation = occ; if (i == cpu) { rq_i->core_pick = NULL; @@ -5397,14 +5411,15 @@ static void sched_core_cpu_deactivate(unsigned int cpu) core_rq->core_cookie = rq->core_cookie; core_rq->core_forceidle_count = rq->core_forceidle_count; core_rq->core_forceidle_seq = rq->core_forceidle_seq; - core_rq->core_forceidle_occupation = rq->core_forceidle_occupation; + core_rq->core_sibidle_occupation = rq->core_sibidle_occupation; + core_rq->core_sibidle_count = rq->core_sibidle_count; /* * Accounting edge for forced idle is handled in pick_next_task(). * Don't need another one here, since the hotplug thread shouldn't * have a cookie. */ - core_rq->core_forceidle_start = 0; + core_rq->core_sibidle_start = 0; /* install new leader */ for_each_cpu(t, smt_mask) { @@ -8305,8 +8320,9 @@ void __init sched_init(void) rq->core_enabled = 0; rq->core_tree = RB_ROOT; rq->core_forceidle_count = 0; - rq->core_forceidle_occupation = 0; - rq->core_forceidle_start = 0; + rq->core_sibidle_count = 0; + rq->core_sibidle_occupation = 0; + rq->core_sibidle_start = 0; rq->core_cookie = 0UL; #endif diff --git a/kernel/sched/core_sched.c b/kernel/sched/core_sched.c index 18c5b298b481..19654cc5c113 100644 --- a/kernel/sched/core_sched.c +++ b/kernel/sched/core_sched.c @@ -304,7 +304,7 @@ int sysctl_sched_core_handler(struct ctl_table *table, int write, #ifdef CONFIG_SCHEDSTATS /* REQUIRES: rq->core's clock recently updated. 
*/ -void __sched_core_account_forceidle(struct rq *rq) +void __sched_core_account_sibidle(struct rq *rq) { const struct cpumask *smt_mask = cpu_smt_mask(cpu_of(rq)); u64 delta, now = rq_clock(rq->core); @@ -314,28 +314,28 @@ void __sched_core_account_forceidle(struct rq *rq) lockdep_assert_rq_held(rq); - WARN_ON_ONCE(!rq->core->core_forceidle_count); + WARN_ON_ONCE(!rq->core->core_sibidle_count); - if (rq->core->core_forceidle_start == 0) - return; + if (rq->core->core_sibidle_start == 0) + goto out; - delta = now - rq->core->core_forceidle_start; + delta = now - rq->core->core_sibidle_start; if (unlikely((s64)delta <= 0)) - return; + goto out; - rq->core->core_forceidle_start = now; + rq->core->core_sibidle_start = now; - if (WARN_ON_ONCE(!rq->core->core_forceidle_occupation)) { + if (WARN_ON_ONCE(!rq->core->core_sibidle_occupation)) { /* can't be forced idle without a running task */ - } else if (rq->core->core_forceidle_count > 1 || - rq->core->core_forceidle_occupation > 1) { + } else if (rq->core->core_sibidle_count > 1 || + rq->core->core_sibidle_occupation > 1) { /* * For larger SMT configurations, we need to scale the charged * forced idle amount since there can be more than one forced * idle sibling and more than one running cookied task. */ - delta *= rq->core->core_forceidle_count; - delta = div_u64(delta, rq->core->core_forceidle_occupation); + delta *= rq->core->core_sibidle_count; + delta = div_u64(delta, rq->core->core_sibidle_occupation); } for_each_cpu(i, smt_mask) { @@ -349,19 +349,27 @@ void __sched_core_account_forceidle(struct rq *rq) * Note: this will account forceidle to the current cpu, even * if it comes from our SMT sibling. */ - __account_forceidle_time(p, delta); + __account_sibidle_time(p, delta, !!rq->core->core_forceidle_count); + } + +out: +#ifdef CONFIG_SCHED_ACPU + for_each_cpu(i, smt_mask) { + rq_i = cpu_rq(i); + rq_i->last_acpu_update_time = now; } +#endif } void __sched_core_tick(struct rq *rq) { - if (!rq->core->core_forceidle_count) + if (!rq->core->core_sibidle_count) return; if (rq != rq->core) update_rq_clock(rq->core); - __sched_core_account_forceidle(rq); + __sched_core_account_sibidle(rq); } #endif /* CONFIG_SCHEDSTATS */ diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 56b446412467..8c7221be42db 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -225,25 +225,20 @@ void account_idle_time(u64 cputime) } -#ifdef CONFIG_SCHED_CORE +#if defined(CONFIG_SCHED_ACPU) || defined(CONFIG_SCHED_CORE) /* - * Account for forceidle time due to core scheduling. + * Account for sibidle, and for forceidle time due to core scheduling. * * REQUIRES: schedstat is enabled. 
*/ -void __account_forceidle_time(struct task_struct *p, u64 delta) -{ - __schedstat_add(p->se.statistics.core_forceidle_sum, delta); - - task_group_account_field(p, CPUTIME_FORCEIDLE, delta); -} -#endif -#ifdef CONFIG_SCHED_ACPU -void __account_sibidle_time(struct task_struct *p, u64 delta) +void __account_sibidle_time(struct task_struct *p, u64 delta, bool fi) { __schedstat_add(p->se.statistics.core_sibidle_sum, delta); - task_group_account_field(p, CPUTIME_SIBIDLE, delta); + if (fi) { + __schedstat_add(p->se.statistics.core_forceidle_sum, delta); + task_group_account_field(p, CPUTIME_FORCEIDLE, delta); + } } #endif diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index c27595620fb6..a8a13adbf7af 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1307,15 +1307,11 @@ struct rq { unsigned long core_cookie; unsigned int core_forceidle_count; unsigned int core_forceidle_seq; - unsigned int core_forceidle_occupation; - u64 core_forceidle_start; + unsigned int core_sibidle_occupation; + u64 core_sibidle_start; unsigned int core_id; - unsigned int core_realidle_count; - unsigned int core_realidle_occupation; - u64 core_realidle_start; - u64 rq_realidle_time; + unsigned int core_sibidle_count; bool in_forceidle; - bool in_realidle; struct task_struct *force_idled_core_pick; #endif @@ -2102,12 +2098,12 @@ static inline void flush_smp_call_function_from_idle(void) { } #if defined(CONFIG_SCHED_CORE) && defined(CONFIG_SCHEDSTATS) -extern void __sched_core_account_forceidle(struct rq *rq); +extern void __sched_core_account_sibidle(struct rq *rq); -static inline void sched_core_account_forceidle(struct rq *rq) +static inline void sched_core_account_sibidle(struct rq *rq) { if (schedstat_enabled()) - __sched_core_account_forceidle(rq); + __sched_core_account_sibidle(rq); } extern void __sched_core_tick(struct rq *rq); @@ -2120,7 +2116,7 @@ static inline void sched_core_tick(struct rq *rq) #else -static inline void sched_core_account_forceidle(struct rq *rq) {} +static inline void sched_core_account_sibidle(struct rq *rq) {} static inline void sched_core_tick(struct rq *rq) {} -- Gitee From e615948eabcd632b5f4028547790afc1a5fc5663 Mon Sep 17 00:00:00 2001 From: Cruz Zhao Date: Mon, 18 Sep 2023 10:24:20 +0800 Subject: [PATCH 4/6] anolis: sched: introduce per cgroup sibidle accounting ANBZ: #6729 This patch extends per-task sibidle accounting into cgroups. rstat is used for cgroup accounting, except for the root, which uses kcpustat in order to bypass the need for doing an rstat flush when reading root stats. Data is displayed via /sys/fs/cgroup/cpu/<cgroup>/cpu.stat, row sibidle_usec. Similar to the task accounting, the cgroup accounting requires that schedstats is enabled. 
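For example, the new row can be read back with a small user-space sketch like the following (illustrative only, not part of this patch; the cgroup name "mygroup" is an assumption, and the row name matches the sibidle_usec line printed by cgroup_base_stat_cputime_show() below):

	#include <stdio.h>

	/* Print the sibidle_usec row of a cgroup's cpu.stat. */
	int main(void)
	{
		char line[256];
		FILE *fp = fopen("/sys/fs/cgroup/cpu/mygroup/cpu.stat", "r");

		if (!fp) {
			perror("cpu.stat");
			return 1;
		}
		while (fgets(line, sizeof(line), fp)) {
			unsigned long long usec;

			if (sscanf(line, "sibidle_usec %llu", &usec) == 1)
				printf("sibidle: %llu usec\n", usec);
		}
		fclose(fp);
		return 0;
	}
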
Signed-off-by: Cruz Zhao --- include/linux/cgroup-defs.h | 3 +++ kernel/cgroup/rstat.c | 29 +++++++++++++++++++++++++++++ 2 files changed, 32 insertions(+) diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index 24884a4626c9..d702ed7d2268 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -323,6 +323,9 @@ struct cgroup_base_stat { #ifdef CONFIG_SCHED_CORE u64 forceidle_sum; #endif +#if defined(CONFIG_SCHED_ACPU) || defined(CONFIG_SCHED_CORE) + u64 sibidle_sum; +#endif }; /* diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c index 80bf45524200..cba6fdc3545d 100644 --- a/kernel/cgroup/rstat.c +++ b/kernel/cgroup/rstat.c @@ -302,6 +302,9 @@ static void cgroup_base_stat_add(struct cgroup_base_stat *dst_bstat, #ifdef CONFIG_SCHED_CORE dst_bstat->forceidle_sum += src_bstat->forceidle_sum; #endif +#if defined(CONFIG_SCHED_ACPU) || defined(CONFIG_SCHED_CORE) + dst_bstat->sibidle_sum += src_bstat->sibidle_sum; +#endif } static void cgroup_base_stat_sub(struct cgroup_base_stat *dst_bstat, @@ -313,6 +316,9 @@ static void cgroup_base_stat_sub(struct cgroup_base_stat *dst_bstat, #ifdef CONFIG_SCHED_CORE dst_bstat->forceidle_sum -= src_bstat->forceidle_sum; #endif +#if defined(CONFIG_SCHED_ACPU) || defined(CONFIG_SCHED_CORE) + dst_bstat->sibidle_sum -= src_bstat->sibidle_sum; +#endif } static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu) @@ -391,6 +397,11 @@ void __cgroup_account_cputime_field(struct cgroup *cgrp, case CPUTIME_FORCEIDLE: rstatc->bstat.forceidle_sum += delta_exec; break; +#endif +#if defined(CONFIG_SCHED_ACPU) || defined(CONFIG_SCHED_CORE) + case CPUTIME_SIBIDLE: + rstatc->bstat.sibidle_sum += delta_exec; + break; #endif default: break; @@ -434,6 +445,9 @@ static void root_cgroup_cputime(struct cgroup_base_stat *bstat) #ifdef CONFIG_SCHED_CORE bstat->forceidle_sum += cpustat[CPUTIME_FORCEIDLE]; +#endif +#if defined(CONFIG_SCHED_ACPU) || defined(CONFIG_SCHED_CORE) + bstat->sibidle_sum += cpustat[CPUTIME_SIBIDLE]; #endif } } @@ -446,6 +460,9 @@ void cgroup_base_stat_cputime_show(struct seq_file *seq) #ifdef CONFIG_SCHED_CORE u64 forceidle_time; #endif +#if defined(CONFIG_SCHED_ACPU) || defined(CONFIG_SCHED_CORE) + u64 sibidle_time; +#endif if (cgroup_parent(cgrp)) { cgroup_rstat_flush_hold(cgrp); @@ -454,6 +471,9 @@ void cgroup_base_stat_cputime_show(struct seq_file *seq) &utime, &stime); #ifdef CONFIG_SCHED_CORE forceidle_time = cgrp->bstat.forceidle_sum; +#endif +#if defined(CONFIG_SCHED_ACPU) || defined(CONFIG_SCHED_CORE) + sibidle_time = cgrp->bstat.sibidle_sum; #endif cgroup_rstat_flush_release(); } else { @@ -463,6 +483,9 @@ void cgroup_base_stat_cputime_show(struct seq_file *seq) stime = bstat.cputime.stime; #ifdef CONFIG_SCHED_CORE forceidle_time = bstat.forceidle_sum; +#endif +#if defined(CONFIG_SCHED_ACPU) || defined(CONFIG_SCHED_CORE) + sibidle_time = bstat.sibidle_sum; #endif } @@ -472,6 +495,9 @@ void cgroup_base_stat_cputime_show(struct seq_file *seq) #ifdef CONFIG_SCHED_CORE do_div(forceidle_time, NSEC_PER_USEC); #endif +#if defined(CONFIG_SCHED_ACPU) || defined(CONFIG_SCHED_CORE) + do_div(sibidle_time, NSEC_PER_USEC); +#endif seq_printf(seq, "usage_usec %llu\n" "user_usec %llu\n" @@ -481,4 +507,7 @@ void cgroup_base_stat_cputime_show(struct seq_file *seq) #ifdef CONFIG_SCHED_CORE seq_printf(seq, "core_sched.force_idle_usec %llu\n", forceidle_time); #endif +#if defined(CONFIG_SCHED_ACPU) || defined(CONFIG_SCHED_CORE) + seq_printf(seq, "sibidle_usec %llu\n", sibidle_time); +#endif } -- Gitee From 
e815f6e37203b469c3129818443f3e226bfe34ac Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 29 Apr 2021 22:56:20 -0700 Subject: [PATCH 5/6] cgroup: rstat: support cgroup1 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ANBZ: #6729 commit a7df69b81aac5bdeb5c5aef9addd680ce22feebf upstream. Rstat currently only supports the default hierarchy in cgroup2. In order to replace memcg's private stats infrastructure - used in both cgroup1 and cgroup2 - with rstat, the latter needs to support cgroup1. The initialization and destruction callbacks for regular cgroups are already in place. Remove the cgroup_on_dfl() guards to handle cgroup1. The initialization of the root cgroup is currently hardcoded to only handle cgrp_dfl_root.cgrp. Move those callbacks to cgroup_setup_root() and cgroup_destroy_root() to handle the default root as well as the various cgroup1 roots we may set up during mounting. The linking of css to cgroups happens in code shared between cgroup1 and cgroup2 as well. Simply remove the cgroup_on_dfl() guard. Linkage of the root css to the root cgroup is a bit trickier: per default, the root css of a subsystem controller belongs to the default hierarchy (i.e. the cgroup2 root). When a controller is mounted in its cgroup1 version, the root css is stolen and moved to the cgroup1 root; on unmount, the css moves back to the default hierarchy. Annotate rebind_subsystems() to move the root css linkage along between roots. Link: https://lkml.kernel.org/r/20210209163304.77088-5-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Reviewed-by: Roman Gushchin Reviewed-by: Shakeel Butt Acked-by: Tejun Heo Reviewed-by: Michal Koutný Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Cruz Zhao --- kernel/cgroup/cgroup.c | 34 +++++++++++++++++++++------------- kernel/cgroup/rstat.c | 2 -- 2 files changed, 21 insertions(+), 15 deletions(-) diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 5b1fdfd2fd4c..7fb89841ab57 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -1351,6 +1351,7 @@ static void cgroup_destroy_root(struct cgroup_root *root) mutex_unlock(&cgroup_mutex); + cgroup_rstat_exit(cgrp); kernfs_destroy_root(root->kf_root); cgroup_free_root(root); } @@ -1786,6 +1787,12 @@ int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask) &dcgrp->e_csets[ss->id]); spin_unlock_irq(&css_set_lock); + if (ss->css_rstat_flush) { + list_del_rcu(&css->rstat_css_node); + list_add_rcu(&css->rstat_css_node, + &dcgrp->rstat_css_list); + } + /* default hierarchy doesn't enable controllers by default */ dst_root->subsys_mask |= 1 << ssid; if (dst_root == &cgrp_dfl_root) { @@ -2010,10 +2017,14 @@ int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask) if (ret) goto destroy_root; - ret = rebind_subsystems(root, ss_mask); + ret = cgroup_rstat_init(root_cgrp); if (ret) goto destroy_root; + ret = rebind_subsystems(root, ss_mask); + if (ret) + goto exit_stats; + ret = cgroup_bpf_inherit(root_cgrp); WARN_ON_ONCE(ret); @@ -2045,6 +2056,8 @@ int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask) ret = 0; goto out; +exit_stats: + cgroup_rstat_exit(root_cgrp); destroy_root: kernfs_destroy_root(root->kf_root); root->kf_root = NULL; @@ -5119,8 +5132,7 @@ static void css_free_rwork_fn(struct work_struct *work) cgroup_put(cgroup_parent(cgrp)); kernfs_put(cgrp->kn); psi_cgroup_free(cgrp); - if (cgroup_on_dfl(cgrp)) - cgroup_rstat_exit(cgrp); + cgroup_rstat_exit(cgrp); kfree(cgrp); } else { /* @@ 
-5161,8 +5173,7 @@ static void css_release_work_fn(struct work_struct *work) /* cgroup release path */ TRACE_CGROUP_PATH(release, cgrp); - if (cgroup_on_dfl(cgrp)) - cgroup_rstat_flush(cgrp); + cgroup_rstat_flush(cgrp); spin_lock_irq(&css_set_lock); for (tcgrp = cgroup_parent(cgrp); tcgrp; @@ -5219,7 +5230,7 @@ static void init_and_link_css(struct cgroup_subsys_state *css, css_get(css->parent); } - if (cgroup_on_dfl(cgrp) && ss->css_rstat_flush) + if (ss->css_rstat_flush) list_add_rcu(&css->rstat_css_node, &cgrp->rstat_css_list); BUG_ON(cgroup_css(cgrp, ss)); @@ -5353,11 +5364,9 @@ static struct cgroup *cgroup_create(struct cgroup *parent, const char *name, if (ret) goto out_free_cgrp; - if (cgroup_on_dfl(parent)) { - ret = cgroup_rstat_init(cgrp); - if (ret) - goto out_cancel_ref; - } + ret = cgroup_rstat_init(cgrp); + if (ret) + goto out_cancel_ref; /* create the directory */ kn = kernfs_create_dir(parent->kn, name, mode, cgrp); @@ -5444,8 +5453,7 @@ static struct cgroup *cgroup_create(struct cgroup *parent, const char *name, out_kernfs_remove: kernfs_remove(cgrp->kn); out_stat_exit: - if (cgroup_on_dfl(parent)) - cgroup_rstat_exit(cgrp); + cgroup_rstat_exit(cgrp); out_cancel_ref: percpu_ref_exit(&cgrp->self.refcnt); out_free_cgrp: diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c index cba6fdc3545d..ab83d654b7df 100644 --- a/kernel/cgroup/rstat.c +++ b/kernel/cgroup/rstat.c @@ -285,8 +285,6 @@ void __init cgroup_rstat_boot(void) for_each_possible_cpu(cpu) raw_spin_lock_init(per_cpu_ptr(&cgroup_rstat_cpu_lock, cpu)); - - BUG_ON(cgroup_rstat_init(&cgrp_dfl_root.cgrp)); } /* -- Gitee From 76730757c42b4b1d6a52667787eed373f172c8fe Mon Sep 17 00:00:00 2001 From: Cruz Zhao Date: Tue, 28 Feb 2023 09:42:36 +0000 Subject: [PATCH 6/6] anolis: sched/core: introduce ht-aware-quota ANBZ: #6729 With ACPU accounting, we are able to assess how long a task runs with its sibling idle and how long with its sibling busy. To make the computing power of tasks stable, we need the tasks to execute a similar number of instructions in each scheduling cycle. To achieve this goal, we introduce ht-aware-quota. When a task is running with its sibling idle, we consider the task to have executed more instructions, by a certain ratio, and sibling idle time * ratio will be accounted to its cfs_rq runtime, not just the sibling idle time. The ratio can be configured via /sys/fs/cgroup/<cgroup>/cpu.ht_ratio, unit: percentage, range: [100, 200], default: 100. For now, ht-aware-quota is only valid for cookie'd tasks, as when the sibling is busy, we know which task is running. The sched_feat SCHED_CORE_HT_AWARE_QUOTA is also required to be enabled. 
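As a concrete illustration of the math (an illustrative sketch, not part of the patch; the numbers and the user-space helper below are examples mirroring the delta * (ht_ratio - 100) / 100 charge added to fair.c in this patch): with cpu.ht_ratio set to 150, a task that runs 10 ms with its sibling idle is charged an extra 10 ms * (150 - 100) / 100 = 5 ms of quota, i.e. 15 ms in total for that span.

	#include <stdio.h>

	/* Extra quota charged for a span run with the sibling idle; plain
	 * unsigned long long stands in for the kernel's u64 in this sketch. */
	static unsigned long long ht_extra_charge(unsigned long long sibidle_delta_ns,
						  unsigned int ht_ratio)
	{
		if (ht_ratio < 100 || ht_ratio > 200)
			return 0;
		return sibidle_delta_ns * (ht_ratio - 100) / 100;
	}

	int main(void)
	{
		/* 10 ms of sibling-idle runtime at ht_ratio 150 costs 5 ms extra. */
		printf("extra charge: %llu ns\n", ht_extra_charge(10000000ULL, 150));
		return 0;
	}
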
Signed-off-by: Cruz Zhao --- include/linux/sched.h | 2 +- kernel/sched/core.c | 49 +++++++++++++++++++++++++++++++++++++++ kernel/sched/core_sched.c | 1 + kernel/sched/fair.c | 22 ++++++++++++++++++ kernel/sched/features.h | 4 ++++ kernel/sched/sched.h | 4 ++++ 6 files changed, 81 insertions(+), 1 deletion(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index 9dcfe12be015..3dad0518a36b 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -579,7 +579,7 @@ struct sched_entity { #ifdef CONFIG_SCHED_CORE u64 core_vruntime; - unsigned int ht_aware_quota_coefficient; + unsigned int ht_ratio; #endif CK_KABI_RESERVE(1) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index f803bef95640..f0d6fa895d03 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -8604,6 +8604,9 @@ struct task_group *sched_create_group(struct task_group *parent) alloc_uclamp_sched_group(tg, parent); +#ifdef CONFIG_SCHED_CORE + tg->ht_ratio = 100; +#endif return tg; err: @@ -9534,6 +9537,38 @@ static s64 cpu_identity_read_s64(struct cgroup_subsys_state *css, } #endif +#ifdef CONFIG_SCHED_CORE +static int cpu_ht_ratio_write(struct cgroup_subsys_state *css, + struct cftype *cftype, u64 ht_ratio) +{ + struct task_group *tg = css_tg(css); + int cpu; + + if (ht_ratio < 100 || ht_ratio > 200) + return -1; + + if (tg == &root_task_group) + return -1; + + tg->ht_ratio = ht_ratio; + for_each_online_cpu(cpu) { + struct sched_entity *se = tg->se[cpu]; + + se->ht_ratio = ht_ratio; + } + + return 0; +} + +static u64 cpu_ht_ratio_read(struct cgroup_subsys_state *css, + struct cftype *cft) +{ + struct task_group *tg = css_tg(css); + + return tg->ht_ratio; +} +#endif + static struct cftype cpu_legacy_files[] = { #ifdef CONFIG_FAIR_GROUP_SCHED { @@ -9641,6 +9676,13 @@ static struct cftype cpu_legacy_files[] = { .read_u64 = cpu_ht_stable_read_u64, .write_u64 = cpu_ht_stable_write_u64, }, +#endif +#ifdef CONFIG_SCHED_CORE + { + .name = "ht_ratio", + .read_u64 = cpu_ht_ratio_read, + .write_u64 = cpu_ht_ratio_write, + }, #endif { } /* Terminate */ }; @@ -10204,6 +10246,13 @@ static struct cftype cpu_files[] = { .write_u64 = sched_lat_stat_write, .seq_show = sched_lat_stat_show }, +#endif +#ifdef CONFIG_SCHED_CORE + { + .name = "ht_ratio", + .read_u64 = cpu_ht_ratio_read, + .write_u64 = cpu_ht_ratio_write, + }, #endif { } /* terminate */ }; diff --git a/kernel/sched/core_sched.c b/kernel/sched/core_sched.c index 19654cc5c113..bc1f23bc14c5 100644 --- a/kernel/sched/core_sched.c +++ b/kernel/sched/core_sched.c @@ -350,6 +350,7 @@ void __sched_core_account_sibidle(struct rq *rq) * if it comes from our SMT sibling. */ __account_sibidle_time(p, delta, !!rq->core->core_forceidle_count); + account_ht_aware_quota(p, delta); } out: diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 719c6e071d25..61d0c89a1f2a 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -13015,6 +13015,25 @@ static int task_is_throttled_fair(struct task_struct *p, int cpu) #endif return throttled_hierarchy(cfs_rq); } + +void account_ht_aware_quota(struct task_struct *p, u64 delta) +{ + struct sched_entity *se; + unsigned int ht_ratio; + struct cfs_rq *cfs_rq; + + /* We only account ht_aware_quota for cookied task. 
*/ + if (sched_feat(SCHED_CORE_HT_AWARE_QUOTA) && p->core_cookie) { + se = &p->se; + cfs_rq = task_cfs_rq(p); + + if (se->parent) { + ht_ratio = se->parent->ht_ratio; + if (ht_ratio >= 100 && ht_ratio <= 200) + account_cfs_rq_runtime(cfs_rq, delta * (ht_ratio - 100) / 100); + } + } +} #else static inline void task_tick_core(struct rq *rq, struct task_struct *curr) {} #endif @@ -13448,6 +13467,9 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) init_cfs_rq(cfs_rq); init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]); init_entity_runnable_average(se); +#ifdef CONFIG_SCHED_CORE + se->ht_ratio = 100; +#endif } return 1; diff --git a/kernel/sched/features.h b/kernel/sched/features.h index 79f43319ac79..1163f14f24a0 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h @@ -106,3 +106,7 @@ SCHED_FEAT(ID_LAST_HIGHCLASS_STAY, false) SCHED_FEAT(ID_LOOSE_EXPEL, false) SCHED_FEAT(ID_EXPELLER_SHARE_CORE, true) #endif + +#ifdef CONFIG_SCHED_CORE +SCHED_FEAT(SCHED_CORE_HT_AWARE_QUOTA, false) +#endif diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index a8a13adbf7af..bd65a2707cbc 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -545,6 +545,9 @@ struct task_group { struct sched_cgroup_lat_stat_cpu __percpu *lat_stat_cpu; #endif +#ifdef CONFIG_SCHED_CORE + unsigned int ht_ratio; +#endif CK_KABI_RESERVE(1) CK_KABI_RESERVE(2) CK_KABI_RESERVE(3) @@ -1485,6 +1488,7 @@ extern void sched_core_dequeue(struct rq *rq, struct task_struct *p, int flags); extern void sched_core_get(void); extern void sched_core_put(void); +extern void account_ht_aware_quota(struct task_struct *p, u64 delta); #else /* !CONFIG_SCHED_CORE */ static inline bool sched_core_enabled(struct rq *rq) -- Gitee
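Closing usage note for the series (an illustrative sketch, not part of the patches): once kernel.sched_acpu and kernel.sched_schedstats are enabled, the per-task statistic can be read back from /proc/<pid>/sched. The row name below follows the PN_SCHEDSTAT(se.statistics.core_sibidle_sum) call added in patch 1 and may differ if the debug output format changes.

	#include <stdio.h>
	#include <string.h>

	/* Dump the sibidle row of /proc/<pid>/sched for the pid given as argv[1]. */
	int main(int argc, char **argv)
	{
		char path[64], line[256];
		FILE *fp;

		if (argc != 2) {
			fprintf(stderr, "usage: %s <pid>\n", argv[0]);
			return 1;
		}
		snprintf(path, sizeof(path), "/proc/%s/sched", argv[1]);
		fp = fopen(path, "r");
		if (!fp) {
			perror(path);
			return 1;
		}
		while (fgets(line, sizeof(line), fp)) {
			if (strstr(line, "core_sibidle_sum"))
				fputs(line, stdout);
		}
		fclose(fp);
		return 0;
	}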