From 493f855d8a63755dc327faa0204ae4a8b757b340 Mon Sep 17 00:00:00 2001 From: Xia Fukun Date: Tue, 5 Sep 2023 10:58:26 +0000 Subject: [PATCH 1/8] sched: Introduce qos smt expeller for co-location hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I7YRZR CVE: NA -------------------------------- We introduce the qos smt expeller, which lets online tasks expel offline tasks on the smt sibling cpus, and exclusively occupy CPU resources. In this way we are able to improve QOS of online tasks in co-location. Signed-off-by: Guan Jing Reviewed-by: Chen Hui Signed-off-by: Zheng Zengkai Signed-off-by: Xia Fukun --- init/Kconfig | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/init/Kconfig b/init/Kconfig index 87941b608911..f57943f7b6d0 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1001,6 +1001,15 @@ config QOS_SCHED If in doubt, say N. +config QOS_SCHED_SMT_EXPELLER + bool "Qos smt expeller" + depends on SCHED_SMT + depends on QOS_SCHED + default n + help + This feature enables online tasks to expel offline tasks + on the smt sibling cpus, and exclusively occupy CPU resources. + config FAIR_GROUP_SCHED bool "Group scheduling for SCHED_OTHER" depends on CGROUP_SCHED -- Gitee From 37d0f2389d0cbc549bf64348d647d4f24ccb3a3f Mon Sep 17 00:00:00 2001 From: Xia Fukun Date: Tue, 5 Sep 2023 10:58:27 +0000 Subject: [PATCH 2/8] sched: Implement the function of qos smt expeller MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I7YRZR CVE: NA -------------------------------- We implement the function of qos smt expeller by the following two points: a) when online tasks and offline tasks are running on the same physical cpu, online tasks will send ipi to expel offline tasks on the smt sibling cpus. b) when online tasks are running, the smt sibling cpus will not allow offline tasks to be selected. 
Adapted to openEuler-6.4. Signed-off-by: Guan Jing Reviewed-by: Chen Hui Signed-off-by: Zheng Zengkai Signed-off-by: Xia Fukun --- include/linux/sched.h | 7 ++ kernel/sched/fair.c | 189 +++++++++++++++++++++++++++++++++++++++++- kernel/sched/sched.h | 5 ++ 3 files changed, 199 insertions(+), 2 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index 73ddff132493..b97ca0a9fd08 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2012,9 +2012,16 @@ extern char *__get_task_comm(char *to, size_t len, struct task_struct *tsk); __get_task_comm(buf, sizeof(buf), tsk); \ }) +#ifdef CONFIG_QOS_SCHED_SMT_EXPELLER +void qos_smt_check_need_resched(void); +#endif + #ifdef CONFIG_SMP static __always_inline void scheduler_ipi(void) { +#ifdef CONFIG_QOS_SCHED_SMT_EXPELLER + qos_smt_check_need_resched(); +#endif /* * Fold TIF_NEED_RESCHED into the preempt_count; anybody setting * TIF_NEED_RESCHED remotely (for the first time) will also send diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index d9af04551788..3626c6af3556 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -61,6 +61,10 @@ #include #endif +#ifdef CONFIG_QOS_SCHED_SMT_EXPELLER +#include +#endif + /* * Targeted preemption latency for CPU-bound tasks: * @@ -190,6 +194,10 @@ unsigned int sysctl_offline_wait_interval = 100; /* in ms */ static int unthrottle_qos_cfs_rqs(int cpu); #endif +#ifdef CONFIG_QOS_SCHED_SMT_EXPELLER +static DEFINE_PER_CPU(int, qos_smt_status); +#endif + #ifdef CONFIG_CFS_BANDWIDTH /* * Amount of runtime to allocate from global (tg) to local (per-cfs_rq) pool @@ -8523,6 +8531,131 @@ static void qos_schedule_throttle(struct task_struct *p) #endif +#ifdef CONFIG_QOS_SCHED_SMT_EXPELLER +static bool qos_smt_check_siblings_status(int this_cpu) +{ + int cpu; + + if (!sched_smt_active()) + return false; + + for_each_cpu(cpu, cpu_smt_mask(this_cpu)) { + if (cpu == this_cpu) + continue; + + if (per_cpu(qos_smt_status, cpu) == QOS_LEVEL_ONLINE) + return true; 
+ } + + return false; +} + +static bool qos_smt_expelled(int this_cpu) +{ + /* + * The qos_smt_status of siblings cpu is online, and current cpu only has + * offline tasks enqueued, there is not suitable task, + * so pick_next_task_fair return null. + */ + if (qos_smt_check_siblings_status(this_cpu) && sched_idle_cpu(this_cpu)) + return true; + + return false; +} + +static bool qos_smt_update_status(struct task_struct *p) +{ + int status = QOS_LEVEL_OFFLINE; + + if (p != NULL && task_group(p)->qos_level >= QOS_LEVEL_ONLINE) + status = QOS_LEVEL_ONLINE; + + if (__this_cpu_read(qos_smt_status) == status) + return false; + + __this_cpu_write(qos_smt_status, status); + + return true; +} + +static void qos_smt_send_ipi(int this_cpu) +{ + int cpu; + struct rq *rq = NULL; + + if (!sched_smt_active()) + return; + + for_each_cpu(cpu, cpu_smt_mask(this_cpu)) { + if (cpu == this_cpu) + continue; + + rq = cpu_rq(cpu); + + /* + * There are two cases where current don't need to send scheduler_ipi: + * a) The qos_smt_status of siblings cpu is online; + * b) The cfs.h_nr_running of siblings cpu is 0. + */ + if (per_cpu(qos_smt_status, cpu) == QOS_LEVEL_ONLINE || + rq->cfs.h_nr_running == 0) + continue; + + smp_send_reschedule(cpu); + } +} + +static void qos_smt_expel(int this_cpu, struct task_struct *p) +{ + if (qos_smt_update_status(p)) + qos_smt_send_ipi(this_cpu); +} + +static bool _qos_smt_check_need_resched(int this_cpu, struct rq *rq) +{ + int cpu; + + if (!sched_smt_active()) + return false; + + for_each_cpu(cpu, cpu_smt_mask(this_cpu)) { + if (cpu == this_cpu) + continue; + + /* + * There are two cases rely on the set need_resched to drive away + * offline task: + * a) The qos_smt_status of siblings cpu is online, the task of current cpu is offline; + * b) The qos_smt_status of siblings cpu is offline, the task of current cpu is idle, + * and current cpu only has SCHED_IDLE tasks enqueued. 
+ */ + if (per_cpu(qos_smt_status, cpu) == QOS_LEVEL_ONLINE && + task_group(current)->qos_level < QOS_LEVEL_ONLINE) + return true; + + if (per_cpu(qos_smt_status, cpu) == QOS_LEVEL_OFFLINE && + rq->curr == rq->idle && sched_idle_cpu(this_cpu)) + return true; + } + + return false; +} + +void qos_smt_check_need_resched(void) +{ + struct rq *rq = this_rq(); + int this_cpu = rq->cpu; + + if (test_tsk_need_resched(current)) + return; + + if (_qos_smt_check_need_resched(this_cpu, rq)) { + set_tsk_need_resched(current); + set_preempt_need_resched(); + } +} +#endif + #ifdef CONFIG_SMP static struct task_struct *pick_task_fair(struct rq *rq) { @@ -8563,14 +8696,30 @@ pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf struct sched_entity *se; struct task_struct *p; int new_tasks; +#ifdef CONFIG_QOS_SCHED_SMT_EXPELLER + int this_cpu = rq->cpu; +#endif again: +#ifdef CONFIG_QOS_SCHED_SMT_EXPELLER + if (qos_smt_expelled(this_cpu)) { + __this_cpu_write(qos_smt_status, QOS_LEVEL_OFFLINE); + return NULL; + } +#endif + if (!sched_fair_runnable(rq)) goto idle; #ifdef CONFIG_FAIR_GROUP_SCHED - if (!prev || prev->sched_class != &fair_sched_class) - goto simple; + if (!prev || prev->sched_class != &fair_sched_class) { +#ifdef CONFIG_QOS_SCHED + if (cfs_rq->idle_h_nr_running != 0 && rq->online) + goto qos_simple; + else +#endif + goto simple; + } /* * Because of the set_next_buddy() in dequeue_task_fair() it is rather @@ -8654,6 +8803,34 @@ pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf } goto done; + +#ifdef CONFIG_QOS_SCHED +qos_simple: + if (prev) + put_prev_task(rq, prev); + + do { + se = pick_next_entity(cfs_rq, NULL); + if (check_qos_cfs_rq(group_cfs_rq(se))) { + cfs_rq = &rq->cfs; + if (!cfs_rq->nr_running) + goto idle; + continue; + } + + cfs_rq = group_cfs_rq(se); + } while (cfs_rq); + + p = task_of(se); + + while (se) { + set_next_entity(cfs_rq_of(se), se); + se = parent_entity(se); + } + + goto done; +#endif + 
simple: #endif if (prev) @@ -8686,6 +8863,10 @@ done: __maybe_unused; qos_schedule_throttle(p); #endif +#ifdef CONFIG_QOS_SCHED_SMT_EXPELLER + qos_smt_expel(this_cpu, p); +#endif + return p; idle: @@ -8719,6 +8900,10 @@ done: __maybe_unused; */ update_idle_rq_clock_pelt(rq); +#ifdef CONFIG_QOS_SCHED_SMT_EXPELLER + qos_smt_expel(this_cpu, NULL); +#endif + return NULL; } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index f4e65a5e3009..19069d2b050d 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1412,6 +1412,11 @@ do { \ } while (0) #ifdef CONFIG_QOS_SCHED +enum task_qos_level { + QOS_LEVEL_OFFLINE = -1, + QOS_LEVEL_ONLINE = 0, + QOS_LEVEL_MAX +}; void init_qos_hrtimer(int cpu); #endif -- Gitee From 66bf52e07235bb5736c410e89246bb61bae4ed29 Mon Sep 17 00:00:00 2001 From: Xia Fukun Date: Tue, 5 Sep 2023 10:58:28 +0000 Subject: [PATCH 3/8] sched: Add statistics for qos smt expeller hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I7YRZR CVE: NA -------------------------------- We have added two statistics for qos smt expeller: a) nr_qos_smt_send_ipi:the times of ipi which online task expel offline tasks; b) nr_qos_smt_expelled:the statistics that offline task will not be picked times. Adapted to 6.4 kernel. 
Signed-off-by: Guan Jing Reviewed-by: Chen Hui Signed-off-by: Zheng Zengkai Signed-off-by: Xia Fukun --- include/linux/sched.h | 6 ++++++ kernel/sched/debug.c | 6 ++++++ kernel/sched/fair.c | 2 ++ 3 files changed, 14 insertions(+) diff --git a/include/linux/sched.h b/include/linux/sched.h index b97ca0a9fd08..86278255c494 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -549,6 +549,12 @@ struct sched_statistics { u64 nr_wakeups_preferred_cpus; u64 nr_wakeups_force_preferred_cpus; #endif + +#if defined(CONFIG_QOS_SCHED_SMT_EXPELLER) && !defined(__GENKSYMS__) + u64 nr_qos_smt_send_ipi; + u64 nr_qos_smt_expelled; +#endif + #endif /* CONFIG_SCHEDSTATS */ } ____cacheline_aligned; diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 940e191d7722..37e71fdd879d 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -1049,6 +1049,12 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns, } #endif +#ifdef CONFIG_QOS_SCHED_SMT_EXPELLER + P_SCHEDSTAT(nr_qos_smt_send_ipi); + P_SCHEDSTAT(nr_qos_smt_expelled); +#endif + + avg_atom = p->se.sum_exec_runtime; if (nr_switches) avg_atom = div64_ul(avg_atom, nr_switches); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 3626c6af3556..603625e1c974 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -8601,6 +8601,7 @@ static void qos_smt_send_ipi(int this_cpu) rq->cfs.h_nr_running == 0) continue; + schedstat_inc(current->stats.nr_qos_smt_send_ipi); smp_send_reschedule(cpu); } } @@ -8704,6 +8705,7 @@ pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf #ifdef CONFIG_QOS_SCHED_SMT_EXPELLER if (qos_smt_expelled(this_cpu)) { __this_cpu_write(qos_smt_status, QOS_LEVEL_OFFLINE); + schedstat_inc(rq->curr->stats.nr_qos_smt_expelled); return NULL; } #endif -- Gitee From f6f61e7c1adaea99aa6862e2a9357bc7f2138bd5 Mon Sep 17 00:00:00 2001 From: Xia Fukun Date: Tue, 5 Sep 2023 10:58:29 +0000 Subject: [PATCH 4/8] sched: Add tracepoint for qos smt 
expeller hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I7YRZR CVE: NA -------------------------------- There are two caces that we add tracepoint: a) while online task of sibling cpu is running, it is running that offline task of local cpu will be set TIF_NEED_RESCHED; b) while online task of sibling cpu is running, it will expell that next picked offline task of local cpu. Signed-off-by: Guan Jing Reviewed-by: Chen Hui Signed-off-by: Zheng Zengkai Signed-off-by: Xia Fukun --- include/trace/events/sched.h | 55 ++++++++++++++++++++++++++++++++++++ kernel/sched/fair.c | 9 ++++-- 2 files changed, 62 insertions(+), 2 deletions(-) diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h index fbb99a61f714..4bafb70dfafc 100644 --- a/include/trace/events/sched.h +++ b/include/trace/events/sched.h @@ -268,6 +268,61 @@ TRACE_EVENT(sched_switch, __entry->next_comm, __entry->next_pid, __entry->next_prio) ); +#ifdef CONFIG_QOS_SCHED_SMT_EXPELLER +/* + * Tracepoint for a offline task being resched: + */ +TRACE_EVENT(sched_qos_smt_expel, + + TP_PROTO(struct task_struct *sibling_p, int qos_smt_status), + + TP_ARGS(sibling_p, qos_smt_status), + + TP_STRUCT__entry( + __array( char, sibling_comm, TASK_COMM_LEN ) + __field( pid_t, sibling_pid ) + __field( int, sibling_qos_status ) + __field( int, sibling_cpu ) + ), + + TP_fast_assign( + memcpy(__entry->sibling_comm, sibling_p->comm, TASK_COMM_LEN); + __entry->sibling_pid = sibling_p->pid; + __entry->sibling_qos_status = qos_smt_status; + __entry->sibling_cpu = task_cpu(sibling_p); + ), + + TP_printk("sibling_comm=%s sibling_pid=%d sibling_qos_status=%d sibling_cpu=%d", + __entry->sibling_comm, __entry->sibling_pid, __entry->sibling_qos_status, + __entry->sibling_cpu) +); + +/* + * Tracepoint for a offline task being expelled: + */ +TRACE_EVENT(sched_qos_smt_expelled, + + TP_PROTO(struct task_struct *p, int qos_smt_status), + + TP_ARGS(p, qos_smt_status), + + TP_STRUCT__entry( + 
__array( char, comm, TASK_COMM_LEN ) + __field( pid_t, pid ) + __field( int, qos_status ) + ), + + TP_fast_assign( + memcpy(__entry->comm, p->comm, TASK_COMM_LEN); + __entry->pid = p->pid; + __entry->qos_status = qos_smt_status; + ), + + TP_printk("comm=%s pid=%d qos_status=%d", + __entry->comm, __entry->pid, __entry->qos_status) +); +#endif + /* * Tracepoint for a task being migrated: */ diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 603625e1c974..799ffa8eea89 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -8631,12 +8631,16 @@ static bool _qos_smt_check_need_resched(int this_cpu, struct rq *rq) * and current cpu only has SCHED_IDLE tasks enqueued. */ if (per_cpu(qos_smt_status, cpu) == QOS_LEVEL_ONLINE && - task_group(current)->qos_level < QOS_LEVEL_ONLINE) + task_group(current)->qos_level < QOS_LEVEL_ONLINE) { + trace_sched_qos_smt_expel(cpu_curr(cpu), per_cpu(qos_smt_status, cpu)); return true; + } if (per_cpu(qos_smt_status, cpu) == QOS_LEVEL_OFFLINE && - rq->curr == rq->idle && sched_idle_cpu(this_cpu)) + rq->curr == rq->idle && sched_idle_cpu(this_cpu)) { + trace_sched_qos_smt_expel(cpu_curr(cpu), per_cpu(qos_smt_status, cpu)); return true; + } } return false; @@ -8706,6 +8710,7 @@ pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf if (qos_smt_expelled(this_cpu)) { __this_cpu_write(qos_smt_status, QOS_LEVEL_OFFLINE); schedstat_inc(rq->curr->stats.nr_qos_smt_expelled); + trace_sched_qos_smt_expelled(rq->curr, per_cpu(qos_smt_status, this_cpu)); return NULL; } #endif -- Gitee From 8c78eeca4574ce0a2391acff7325454367985e39 Mon Sep 17 00:00:00 2001 From: Xia Fukun Date: Tue, 5 Sep 2023 10:58:30 +0000 Subject: [PATCH 5/8] config: enable CONFIG_QOS_SCHED_SMT_EXPELLER by hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I52611 CVE: NA Signed-off-by: Guan Jing Reviewed-by: Chen Hui Signed-off-by: Zheng Zengkai Signed-off-by: Xia Fukun --- 
arch/arm64/configs/openeuler_defconfig | 1 + arch/x86/configs/openeuler_defconfig | 1 + 2 files changed, 2 insertions(+) diff --git a/arch/arm64/configs/openeuler_defconfig b/arch/arm64/configs/openeuler_defconfig index d187c7c6b84b..c1079475eb47 100644 --- a/arch/arm64/configs/openeuler_defconfig +++ b/arch/arm64/configs/openeuler_defconfig @@ -163,6 +163,7 @@ CONFIG_BLK_CGROUP=y CONFIG_CGROUP_WRITEBACK=y CONFIG_CGROUP_SCHED=y CONFIG_FAIR_GROUP_SCHED=y +CONFIG_QOS_SCHED_SMT_EXPELLER=y CONFIG_CFS_BANDWIDTH=y CONFIG_RT_GROUP_SCHED=y CONFIG_SCHED_MM_CID=y diff --git a/arch/x86/configs/openeuler_defconfig b/arch/x86/configs/openeuler_defconfig index 9c216e1104ef..2ce540ce01ec 100644 --- a/arch/x86/configs/openeuler_defconfig +++ b/arch/x86/configs/openeuler_defconfig @@ -186,6 +186,7 @@ CONFIG_BLK_CGROUP=y CONFIG_CGROUP_WRITEBACK=y CONFIG_CGROUP_SCHED=y CONFIG_FAIR_GROUP_SCHED=y +CONFIG_QOS_SCHED_SMT_EXPELLER=y CONFIG_CFS_BANDWIDTH=y CONFIG_QOS_SCHED=y CONFIG_RT_GROUP_SCHED=y -- Gitee From 064fcd12ef369d0beeb7aa5c09d156819f472c7b Mon Sep 17 00:00:00 2001 From: Xia Fukun Date: Tue, 5 Sep 2023 10:58:32 +0000 Subject: [PATCH 6/8] sched/fair: Start tracking qos_offline tasks count in cfs_rq hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I7YRZR ------------------------------- Track how many tasks are present with qos_offline_policy in each cfs_rq. This will be used by later commits. 
Signed-off-by: Guan Jing Signed-off-by: Xia Fukun --- kernel/sched/fair.c | 82 +++++++++++++++++++++++++++++++++++++++++--- kernel/sched/sched.h | 22 ++++++++++++ 2 files changed, 99 insertions(+), 5 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 799ffa8eea89..cf439444b073 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5461,6 +5461,9 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq) struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); struct sched_entity *se; long task_delta, idle_task_delta, dequeue = 1; +#ifdef CONFIG_QOS_SCHED_SMT_EXPELLER + long qos_idle_delta; +#endif raw_spin_lock(&cfs_b->lock); /* This will start the period timer if necessary */ @@ -5492,6 +5495,10 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq) task_delta = cfs_rq->h_nr_running; idle_task_delta = cfs_rq->idle_h_nr_running; +#ifdef CONFIG_QOS_SCHED_SMT_EXPELLER + qos_idle_delta = cfs_rq->qos_idle_h_nr_running; +#endif + for_each_sched_entity(se) { struct cfs_rq *qcfs_rq = cfs_rq_of(se); /* throttled entity or throttle-on-deactivate */ @@ -5505,6 +5512,9 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq) qcfs_rq->h_nr_running -= task_delta; qcfs_rq->idle_h_nr_running -= idle_task_delta; +#ifdef CONFIG_QOS_SCHED_SMT_EXPELLER + qcfs_rq->qos_idle_h_nr_running -= qos_idle_delta; +#endif if (qcfs_rq->load.weight) { /* Avoid re-evaluating load for this entity: */ @@ -5527,6 +5537,9 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq) qcfs_rq->h_nr_running -= task_delta; qcfs_rq->idle_h_nr_running -= idle_task_delta; +#ifdef CONFIG_QOS_SCHED_SMT_EXPELLER + qcfs_rq->qos_idle_h_nr_running -= qos_idle_delta; +#endif } /* At this point se is NULL and we are at root level*/ @@ -5548,6 +5561,9 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); struct sched_entity *se; long task_delta, idle_task_delta; +#ifdef CONFIG_QOS_SCHED_SMT_EXPELLER + long qos_idle_delta; +#endif se = 
cfs_rq->tg->se[cpu_of(rq)]; @@ -5587,6 +5603,9 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) task_delta = cfs_rq->h_nr_running; idle_task_delta = cfs_rq->idle_h_nr_running; +#ifdef CONFIG_QOS_SCHED_SMT_EXPELLER + qos_idle_delta = cfs_rq->qos_idle_h_nr_running; +#endif for_each_sched_entity(se) { struct cfs_rq *qcfs_rq = cfs_rq_of(se); @@ -5599,6 +5618,9 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) qcfs_rq->h_nr_running += task_delta; qcfs_rq->idle_h_nr_running += idle_task_delta; +#ifdef CONFIG_QOS_SCHED_SMT_EXPELLER + qcfs_rq->qos_idle_h_nr_running += qos_idle_delta; +#endif /* end evaluation on encountering a throttled cfs_rq */ if (cfs_rq_throttled(qcfs_rq)) @@ -5616,6 +5638,9 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) qcfs_rq->h_nr_running += task_delta; qcfs_rq->idle_h_nr_running += idle_task_delta; +#ifdef CONFIG_QOS_SCHED_SMT_EXPELLER + qcfs_rq->qos_idle_h_nr_running += qos_idle_delta; +#endif /* end evaluation on encountering a throttled cfs_rq */ if (cfs_rq_throttled(qcfs_rq)) @@ -6366,6 +6391,9 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) struct cfs_rq *cfs_rq; struct sched_entity *se = &p->se; int idle_h_nr_running = task_has_idle_policy(p); +#ifdef CONFIG_QOS_SCHED_SMT_EXPELLER + int qos_idle_h_nr_running = task_has_qos_idle_policy(p); +#endif int task_new = !(flags & ENQUEUE_WAKEUP); /* @@ -6392,6 +6420,9 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) cfs_rq->h_nr_running++; cfs_rq->idle_h_nr_running += idle_h_nr_running; +#ifdef CONFIG_QOS_SCHED_SMT_EXPELLER + cfs_rq->qos_idle_h_nr_running += qos_idle_h_nr_running; +#endif if (cfs_rq_is_idle(cfs_rq)) idle_h_nr_running = 1; @@ -6412,7 +6443,9 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) cfs_rq->h_nr_running++; cfs_rq->idle_h_nr_running += idle_h_nr_running; - +#ifdef CONFIG_QOS_SCHED_SMT_EXPELLER + cfs_rq->qos_idle_h_nr_running += qos_idle_h_nr_running; +#endif if (cfs_rq_is_idle(cfs_rq)) idle_h_nr_running = 1; @@ 
-6465,6 +6498,9 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) struct sched_entity *se = &p->se; int task_sleep = flags & DEQUEUE_SLEEP; int idle_h_nr_running = task_has_idle_policy(p); +#ifdef CONFIG_QOS_SCHED_SMT_EXPELLER + int qos_idle_h_nr_running = task_has_qos_idle_policy(p); +#endif bool was_sched_idle = sched_idle_rq(rq); util_est_dequeue(&rq->cfs, p); @@ -6475,6 +6511,9 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) cfs_rq->h_nr_running--; cfs_rq->idle_h_nr_running -= idle_h_nr_running; +#ifdef CONFIG_QOS_SCHED_SMT_EXPELLER + cfs_rq->qos_idle_h_nr_running -= qos_idle_h_nr_running; +#endif if (cfs_rq_is_idle(cfs_rq)) idle_h_nr_running = 1; @@ -6507,7 +6546,9 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) cfs_rq->h_nr_running--; cfs_rq->idle_h_nr_running -= idle_h_nr_running; - +#ifdef CONFIG_QOS_SCHED_SMT_EXPELLER + cfs_rq->qos_idle_h_nr_running -= qos_idle_h_nr_running; +#endif if (cfs_rq_is_idle(cfs_rq)) idle_h_nr_running = 1; @@ -8276,7 +8317,9 @@ static void throttle_qos_cfs_rq(struct cfs_rq *cfs_rq) struct rq *rq = rq_of(cfs_rq); struct sched_entity *se; long task_delta, idle_task_delta; - +#ifdef CONFIG_QOS_SCHED_SMT_EXPELLER + long qos_idle_delta; +#endif se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))]; /* freeze hierarchy runnable averages while throttled */ @@ -8286,6 +8329,9 @@ static void throttle_qos_cfs_rq(struct cfs_rq *cfs_rq) task_delta = cfs_rq->h_nr_running; idle_task_delta = cfs_rq->idle_h_nr_running; +#ifdef CONFIG_QOS_SCHED_SMT_EXPELLER + qos_idle_delta = cfs_rq->qos_idle_h_nr_running; +#endif for_each_sched_entity(se) { struct cfs_rq *qcfs_rq = cfs_rq_of(se); /* throttled entity or throttle-on-deactivate */ @@ -8296,6 +8342,9 @@ static void throttle_qos_cfs_rq(struct cfs_rq *cfs_rq) qcfs_rq->h_nr_running -= task_delta; qcfs_rq->idle_h_nr_running -= idle_task_delta; +#ifdef CONFIG_QOS_SCHED_SMT_EXPELLER + qcfs_rq->qos_idle_h_nr_running -= 
qos_idle_delta; +#endif if (qcfs_rq->load.weight) { /* Avoid re-evaluating load for this entity: */ @@ -8318,6 +8367,9 @@ static void throttle_qos_cfs_rq(struct cfs_rq *cfs_rq) qcfs_rq->h_nr_running -= task_delta; qcfs_rq->idle_h_nr_running -= idle_task_delta; +#ifdef CONFIG_QOS_SCHED_SMT_EXPELLER + qcfs_rq->qos_idle_h_nr_running -= qos_idle_delta; +#endif } /* At this point se is NULL and we are at root level*/ @@ -8338,6 +8390,9 @@ static void unthrottle_qos_cfs_rq(struct cfs_rq *cfs_rq) struct rq *rq = rq_of(cfs_rq); struct sched_entity *se; long task_delta, idle_task_delta; +#ifdef CONFIG_QOS_SCHED_SMT_EXPELLER + long qos_idle_delta; +#endif se = cfs_rq->tg->se[cpu_of(rq)]; @@ -8370,6 +8425,9 @@ static void unthrottle_qos_cfs_rq(struct cfs_rq *cfs_rq) task_delta = cfs_rq->h_nr_running; idle_task_delta = cfs_rq->idle_h_nr_running; +#ifdef CONFIG_QOS_SCHED_SMT_EXPELLER + qos_idle_delta = cfs_rq->qos_idle_h_nr_running; +#endif for_each_sched_entity(se) { if (se->on_rq) break; @@ -8379,6 +8437,9 @@ static void unthrottle_qos_cfs_rq(struct cfs_rq *cfs_rq) cfs_rq->h_nr_running += task_delta; cfs_rq->idle_h_nr_running += idle_task_delta; +#ifdef CONFIG_QOS_SCHED_SMT_EXPELLER + cfs_rq->qos_idle_h_nr_running += qos_idle_delta; +#endif if (cfs_rq_throttled(cfs_rq)) goto unthrottle_throttle; @@ -8392,6 +8453,9 @@ static void unthrottle_qos_cfs_rq(struct cfs_rq *cfs_rq) cfs_rq->h_nr_running += task_delta; cfs_rq->idle_h_nr_running += idle_task_delta; +#ifdef CONFIG_QOS_SCHED_SMT_EXPELLER + cfs_rq->qos_idle_h_nr_running += qos_idle_delta; +#endif /* end evaluation on encountering a throttled cfs_rq */ if (cfs_rq_throttled(cfs_rq)) @@ -8550,6 +8614,14 @@ static bool qos_smt_check_siblings_status(int this_cpu) return false; } +static bool qos_sched_idle_cpu(int this_cpu) +{ + struct rq *rq = cpu_rq(this_cpu); + + return unlikely(rq->nr_running == rq->cfs.qos_idle_h_nr_running && + rq->nr_running); +} + static bool qos_smt_expelled(int this_cpu) { /* @@ -8557,7 +8629,7 @@ 
static bool qos_smt_expelled(int this_cpu) * offline tasks enqueued, there is not suitable task, * so pick_next_task_fair return null. */ - if (qos_smt_check_siblings_status(this_cpu) && sched_idle_cpu(this_cpu)) + if (qos_smt_check_siblings_status(this_cpu) && qos_sched_idle_cpu(this_cpu)) return true; return false; @@ -8637,7 +8709,7 @@ static bool _qos_smt_check_need_resched(int this_cpu, struct rq *rq) } if (per_cpu(qos_smt_status, cpu) == QOS_LEVEL_OFFLINE && - rq->curr == rq->idle && sched_idle_cpu(this_cpu)) { + rq->curr == rq->idle && qos_sched_idle_cpu(this_cpu)) { trace_sched_qos_smt_expel(cpu_curr(cpu), per_cpu(qos_smt_status, cpu)); return true; } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 19069d2b050d..0d981063bf48 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -664,6 +664,14 @@ struct cfs_rq { #if defined(CONFIG_QOS_SCHED) struct list_head qos_throttled_list; #endif + +#ifdef CONFIG_QOS_SCHED_SMT_EXPELLER + union { + unsigned int qos_idle_h_nr_running; /* qos_level:-1 */ + unsigned long qos_idle_h_nr_running_padding; + }; +#endif + }; static inline int rt_bandwidth_enabled(void) @@ -3256,6 +3264,20 @@ static inline bool is_per_cpu_kthread(struct task_struct *p) } #endif +#ifdef CONFIG_QOS_SCHED +static inline int qos_idle_policy(int policy) +{ + return policy == QOS_LEVEL_OFFLINE; +} +#endif + +#ifdef CONFIG_QOS_SCHED_SMT_EXPELLER +static inline int task_has_qos_idle_policy(struct task_struct *p) +{ + return qos_idle_policy(task_group(p)->qos_level) && p->policy == SCHED_IDLE; +} +#endif + extern void swake_up_all_locked(struct swait_queue_head *q); extern void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait); -- Gitee From d8e7cc940fff12ba8a931febbb4b6d0eb2b0cc58 Mon Sep 17 00:00:00 2001 From: Xia Fukun Date: Tue, 5 Sep 2023 10:58:33 +0000 Subject: [PATCH 7/8] sched/fair: Introduce QOS_SMT_EXPELL priority reversion mechanism hulk inclusion category: feature bugzilla: 
https://gitee.com/openeuler/kernel/issues/I7YRZR ------------------------------- Here is the typical case where priority inversion will be caused occasionally by SMT expelling: Assuming that there are two SMT cores, cA and cB, online tasks are running on cA while offline tasks run on cB. With SMT expelling, an online task will drive off offline tasks to occupy all SMT cores exclusively, which, in turn, will starve the offline task and prevent it from releasing the related resources that other tasks with higher priority need. Hence, this patch will introduce another mechanism to alleviate this situation. For all offline tasks, one metric profiling the maximum task expelling duration is set up, and the default value is 5 seconds. If such an offline task exists, all offline tasks will be allowed to run into one small sleep(msleep) loop in kernel before they go into usermode; and further, if the two SMT cores (such as cA and cB) are idle or don't get any online tasks to run, these offline tasks will continue to run in usermode for the next schedule. Adapted to 6.4 kernel. 
Signed-off-by: Guan Jing Signed-off-by: Xia Fukun --- kernel/sched/fair.c | 46 +++++++++++++++++++++++++++++++++------------ 1 file changed, 34 insertions(+), 12 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index cf439444b073..4d174f21ceb8 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -192,6 +192,7 @@ static DEFINE_PER_CPU(int, qos_cpu_overload); unsigned int sysctl_overload_detect_period = 5000; /* in ms */ unsigned int sysctl_offline_wait_interval = 100; /* in ms */ static int unthrottle_qos_cfs_rqs(int cpu); +static bool qos_smt_expelled(int this_cpu); #endif #ifdef CONFIG_QOS_SCHED_SMT_EXPELLER @@ -8304,6 +8305,15 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ } #ifdef CONFIG_QOS_SCHED +static inline bool qos_timer_is_activated(int cpu) +{ + return hrtimer_active(per_cpu_ptr(&qos_overload_timer, cpu)); +} + +static inline void cancel_qos_timer(int cpu) +{ + hrtimer_cancel(per_cpu_ptr(&qos_overload_timer, cpu)); +} static inline bool is_offline_task(struct task_struct *p) { @@ -8376,7 +8386,7 @@ static void throttle_qos_cfs_rq(struct cfs_rq *cfs_rq) sub_nr_running(rq, task_delta); done: - if (list_empty(&per_cpu(qos_throttled_cfs_rq, cpu_of(rq)))) + if (!qos_timer_is_activated(cpu_of(rq))) start_qos_hrtimer(cpu_of(rq)); cfs_rq->throttled = QOS_THROTTLED; @@ -8467,10 +8477,6 @@ static void unthrottle_qos_cfs_rq(struct cfs_rq *cfs_rq) unthrottle_throttle: assert_list_leaf_cfs_rq(rq); - - /* Determine whether we need to wake up potentially idle CPU: */ - if (rq->curr == rq->idle && rq->cfs.nr_running) - resched_curr(rq); } static int __unthrottle_qos_cfs_rqs(int cpu) @@ -8492,11 +8498,10 @@ static int __unthrottle_qos_cfs_rqs(int cpu) static int unthrottle_qos_cfs_rqs(int cpu) { int res; - res = __unthrottle_qos_cfs_rqs(cpu); - if (res) - hrtimer_cancel(&(per_cpu(qos_overload_timer, cpu))); + if (qos_timer_is_activated(cpu) && !qos_smt_expelled(cpu)) + cancel_qos_timer(cpu); return res; } 
@@ -8552,8 +8557,13 @@ static enum hrtimer_restart qos_overload_timer_handler(struct hrtimer *timer) struct rq *rq = this_rq(); rq_lock_irqsave(rq, &rf); - if (__unthrottle_qos_cfs_rqs(smp_processor_id())) - __this_cpu_write(qos_cpu_overload, 1); + __unthrottle_qos_cfs_rqs(smp_processor_id()); + __this_cpu_write(qos_cpu_overload, 1); + + /* Determine whether we need to wake up potentially idle CPU. */ + if (rq->curr == rq->idle && rq->cfs.nr_running) + resched_curr(rq); + rq_unlock_irqrestore(rq, &rf); return HRTIMER_NORESTART; @@ -8593,6 +8603,13 @@ static void qos_schedule_throttle(struct task_struct *p) } } +#ifndef CONFIG_QOS_SCHED_SMT_EXPELLER +static bool qos_smt_expelled(int this_cpu) +{ + return false; +} +#endif + #endif #ifdef CONFIG_QOS_SCHED_SMT_EXPELLER @@ -8779,8 +8796,12 @@ pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf again: #ifdef CONFIG_QOS_SCHED_SMT_EXPELLER - if (qos_smt_expelled(this_cpu)) { + if (qos_smt_expelled(this_cpu) && !__this_cpu_read(qos_cpu_overload)) { __this_cpu_write(qos_smt_status, QOS_LEVEL_OFFLINE); + + if (!qos_timer_is_activated(this_cpu)) + start_qos_hrtimer(this_cpu); + schedstat_inc(rq->curr->stats.nr_qos_smt_expelled); trace_sched_qos_smt_expelled(rq->curr, per_cpu(qos_smt_status, this_cpu)); return NULL; @@ -8971,7 +8992,8 @@ done: __maybe_unused; goto again; } - __this_cpu_write(qos_cpu_overload, 0); + if (!qos_smt_expelled(cpu_of(rq))) + __this_cpu_write(qos_cpu_overload, 0); #endif /* * rq is about to be idle, check if we need to update the -- Gitee From 16c0701b6220c27793bc922c80312c9cf23072b3 Mon Sep 17 00:00:00 2001 From: Xia Fukun Date: Tue, 5 Sep 2023 10:58:34 +0000 Subject: [PATCH 8/8] sched/fair: Add cmdline nosmtexpell hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I6SIY2 ------------------------------- Add cmdline nosmtexpell to disable qos_smt_expell when we want to close. 
Signed-off-by: Guan Jing Signed-off-by: Xia Fukun --- kernel/sched/fair.c | 28 +++++++++++++++++++++++++++- 1 file changed, 27 insertions(+), 1 deletion(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 4d174f21ceb8..c5a6f72212d9 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -8613,6 +8613,15 @@ static bool qos_smt_expelled(int this_cpu) #endif #ifdef CONFIG_QOS_SCHED_SMT_EXPELLER +DEFINE_STATIC_KEY_TRUE(qos_smt_expell_switch); + +static int __init qos_sched_smt_noexpell_setup(char *__unused) +{ + static_branch_disable(&qos_smt_expell_switch); + return 1; +} +__setup("nosmtexpell", qos_sched_smt_noexpell_setup); + static bool qos_smt_check_siblings_status(int this_cpu) { int cpu; @@ -8641,6 +8650,9 @@ static bool qos_sched_idle_cpu(int this_cpu) static bool qos_smt_expelled(int this_cpu) { + if (!static_branch_likely(&qos_smt_expell_switch)) + return false; + /* * The qos_smt_status of siblings cpu is online, and current cpu only has * offline tasks enqueued, there is not suitable task, @@ -8697,15 +8709,29 @@ static void qos_smt_send_ipi(int this_cpu) static void qos_smt_expel(int this_cpu, struct task_struct *p) { + if (!static_branch_likely(&qos_smt_expell_switch)) + return; + if (qos_smt_update_status(p)) qos_smt_send_ipi(this_cpu); } +static inline bool qos_smt_enabled(void) +{ + if (!static_branch_likely(&qos_smt_expell_switch)) + return false; + + if (!sched_smt_active()) + return false; + + return true; +} + static bool _qos_smt_check_need_resched(int this_cpu, struct rq *rq) { int cpu; - if (!sched_smt_active()) + if (!qos_smt_enabled()) return false; for_each_cpu(cpu, cpu_smt_mask(this_cpu)) { -- Gitee