From e1a9be8cba30ae8674db3e138b3ee8217c96eb5e Mon Sep 17 00:00:00 2001 From: Zhang Qiao Date: Wed, 30 Aug 2023 20:57:48 +0800 Subject: [PATCH 01/15] sched: Introduce qos scheduler for co-location hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I7WWMX CVE: NA -------------------------------- In cloud-native hybrid deployment scenarios, online tasks must preempt offline tasks in a timely and offline tasks can't affect the QoS of online tasks, so we introduce the idea of qos level to scheduler, which now is supported with different scheduler policies. The qos scheduler will change the policy of correlative tasks when the qos level of a task group is modified with cpu.qos_level cpu cgroup file. In this way we are able to satisfy different needs of tasks in different qos levels. The value of qos_level can be 0 or -1, default value is 0. If qos_level is 0, the group is an online group. otherwise it is an offline group. Signed-off-by: Zhang Qiao --- init/Kconfig | 12 +++++ kernel/sched/core.c | 116 +++++++++++++++++++++++++++++++++++++++++++ kernel/sched/sched.h | 4 ++ 3 files changed, 132 insertions(+) diff --git a/init/Kconfig b/init/Kconfig index 7a3299a632e0..d96c76143610 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -984,6 +984,18 @@ menuconfig CGROUP_SCHED tasks. if CGROUP_SCHED +config QOS_SCHED + bool "Qos task scheduling" + depends on CGROUP_SCHED + depends on CFS_BANDWIDTH + default n + help + This option enable qos scheduler, and support co-location online + services (Latency Sensitive) and offline tasks. colocation can + effectively improve the resource utilization. + + If in doubt, say N. 
+ config FAIR_GROUP_SCHED bool "Group scheduling for SCHED_OTHER" depends on CGROUP_SCHED diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 89ac3c89e39e..ac7abb6c5c64 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7680,6 +7680,18 @@ static int __sched_setscheduler(struct task_struct *p, } change: +#ifdef CONFIG_QOS_SCHED + /* + * If the scheduling policy of an offline task is set to a policy + * other than SCHED_IDLE, the online task preemption and cpu resource + * isolation will be invalid, so return -EINVAL in this case. + */ + if (unlikely(task_group(p)->qos_level == -1 && !idle_policy(policy))) { + task_rq_unlock(rq, p, &rf); + return -EINVAL; + } +#endif + if (user) { #ifdef CONFIG_RT_GROUP_SCHED /* @@ -10308,6 +10320,35 @@ void ia64_set_curr_task(int cpu, struct task_struct *p) /* task_group_lock serializes the addition/removal of task groups */ static DEFINE_SPINLOCK(task_group_lock); +#ifdef CONFIG_QOS_SCHED +static inline int alloc_qos_sched_group(struct task_group *tg, + struct task_group *parent) +{ + tg->qos_level = parent->qos_level; + + return 1; +} + +static void sched_change_qos_group(struct task_struct *tsk, struct task_group *tg) +{ + struct sched_attr attr = {0}; + + /* + * No need to re-setcheduler when a task is exiting or the task + * is in an autogroup. 
+ */ + if (!(tsk->flags & PF_EXITING) && + !task_group_is_autogroup(tg) && + (tg->qos_level == -1)) { + attr.sched_priority = 0; + attr.sched_policy = SCHED_IDLE; + attr.sched_nice = PRIO_TO_NICE(tsk->static_prio); + __setscheduler_params(tsk, &attr); + __setscheduler_prio(tsk, normal_prio(tsk)); + } +} +#endif + static inline void alloc_uclamp_sched_group(struct task_group *tg, struct task_group *parent) { @@ -10358,6 +10399,11 @@ struct task_group *sched_create_group(struct task_group *parent) if (!alloc_fair_sched_group(tg, parent)) goto err; +#ifdef CONFIG_QOS_SCHED + if (!alloc_qos_sched_group(tg, parent)) + goto err; +#endif + if (!alloc_rt_sched_group(tg, parent)) goto err; @@ -10444,6 +10490,10 @@ static void sched_change_group(struct task_struct *tsk, struct task_group *group { tsk->sched_task_group = group; +#ifdef CONFIG_QOS_SCHED + sched_change_qos_group(tsk, group); +#endif + #ifdef CONFIG_FAIR_GROUP_SCHED if (tsk->sched_class->task_change_group) tsk->sched_class->task_change_group(tsk); @@ -11146,6 +11196,65 @@ static int cpu_idle_write_s64(struct cgroup_subsys_state *css, } #endif +#ifdef CONFIG_QOS_SCHED +static int tg_change_scheduler(struct task_group *tg, void *data) +{ + int policy; + struct css_task_iter it; + struct sched_param param; + struct task_struct *tsk; + s64 qos_level = *(s64 *)data; + struct cgroup_subsys_state *css = &tg->css; + + tg->qos_level = qos_level; + if (qos_level == -1) { + policy = SCHED_IDLE; + cfs_bandwidth_usage_inc(); + } else { + policy = SCHED_NORMAL; + cfs_bandwidth_usage_dec(); + } + + param.sched_priority = 0; + css_task_iter_start(css, 0, &it); + while ((tsk = css_task_iter_next(&it))) + sched_setscheduler(tsk, policy, ¶m); + css_task_iter_end(&it); + + return 0; +} + +static int cpu_qos_write(struct cgroup_subsys_state *css, + struct cftype *cftype, s64 qos_level) +{ + struct task_group *tg = css_tg(css); + + if (!tg->se[0]) + return -EINVAL; + + if (qos_level != -1 && qos_level != 0) + return -EINVAL; + + if 
(tg->qos_level == qos_level) + goto done; + + if (tg->qos_level == -1 && qos_level == 0) + return -EINVAL; + + rcu_read_lock(); + walk_tg_tree_from(tg, tg_change_scheduler, tg_nop, (void *)(&qos_level)); + rcu_read_unlock(); +done: + return 0; +} + +static inline s64 cpu_qos_read(struct cgroup_subsys_state *css, + struct cftype *cft) +{ + return css_tg(css)->qos_level; +} +#endif + static struct cftype cpu_legacy_files[] = { #ifdef CONFIG_FAIR_GROUP_SCHED { @@ -11205,6 +11314,13 @@ static struct cftype cpu_legacy_files[] = { .seq_show = cpu_uclamp_max_show, .write = cpu_uclamp_max_write, }, +#endif +#ifdef CONFIG_QOS_SCHED + { + .name = "qos_level", + .read_s64 = cpu_qos_read, + .write_s64 = cpu_qos_write, + }, #endif { } /* Terminate */ }; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index ec7b3e0a2b20..5a8afed85a10 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -409,6 +409,10 @@ struct task_group { struct cfs_bandwidth cfs_bandwidth; +#ifdef CONFIG_QOS_SCHED + long qos_level; +#endif + #ifdef CONFIG_UCLAMP_TASK_GROUP /* The two decimal precision [%] value requested from user-space */ unsigned int uclamp_pct[UCLAMP_CNT]; -- Gitee From 0992921d86b3bfe778c7aec0c4a594e19a0f945e Mon Sep 17 00:00:00 2001 From: Zhang Qiao Date: Wed, 30 Aug 2023 20:57:49 +0800 Subject: [PATCH 02/15] sched: Throttle qos cfs_rq when current cpu is running online task hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I7WWMX CVE: NA -------------------------------- In a co-location scenario, we usually deploy online and offline task groups in the same server. The online tasks are more important than offline tasks and to avoid offline tasks affects online tasks, we will throttle the offline tasks group when some online task groups running in the same cpu and unthrottle offline tasks when the cpu is about to enter idle state. 
Signed-off-by: Zhang Qiao --- kernel/sched/fair.c | 145 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 145 insertions(+) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index cf9933c0158a..94290eefba36 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -164,6 +164,10 @@ int __weak arch_asym_cpu_priority(int cpu) #define capacity_greater(cap1, cap2) ((cap1) * 1024 > (cap2) * 1078) #endif +#ifdef CONFIG_QOS_SCHED +static DEFINE_PER_CPU_SHARED_ALIGNED(struct list_head, qos_throttled_cfs_rq); +#endif + #ifdef CONFIG_CFS_BANDWIDTH /* * Amount of runtime to allocate from global (tg) to local (per-cfs_rq) pool @@ -8097,6 +8101,124 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ set_last_buddy(se); } +#ifdef CONFIG_QOS_SCHED +static void throttle_qos_cfs_rq(struct cfs_rq *cfs_rq) +{ + struct rq *rq = rq_of(cfs_rq); + struct sched_entity *se; + long task_delta, idle_task_delta, dequeue = 1; + + se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))]; + + /* freeze hierarchy runnable averages while throttled */ + rcu_read_lock(); + walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq); + rcu_read_unlock(); + + task_delta = cfs_rq->h_nr_running; + idle_task_delta = cfs_rq->idle_h_nr_running; + for_each_sched_entity(se) { + struct cfs_rq *qcfs_rq = cfs_rq_of(se); + /* throttled entity or throttle-on-deactivate */ + if (!se->on_rq) + break; + + if (dequeue) + dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP); + qcfs_rq->h_nr_running -= task_delta; + qcfs_rq->idle_h_nr_running -= idle_task_delta; + + if (qcfs_rq->load.weight) + dequeue = 0; + } + + if (!se) + sub_nr_running(rq, task_delta); + + cfs_rq->throttled = 1; + cfs_rq->throttled_clock = rq_clock(rq); + + list_add(&cfs_rq->throttled_list, &per_cpu(qos_throttled_cfs_rq, cpu_of(rq))); +} + +static void unthrottle_qos_cfs_rq(struct cfs_rq *cfs_rq) +{ + struct rq *rq = rq_of(cfs_rq); + struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); + struct 
sched_entity *se; + int enqueue = 1; + long task_delta, idle_task_delta; + + se = cfs_rq->tg->se[cpu_of(rq)]; + + cfs_rq->throttled = 0; + + update_rq_clock(rq); + + cfs_b->throttled_time += rq_clock(rq) - cfs_rq->throttled_clock; + list_del_init(&cfs_rq->throttled_list); + + /* update hierarchical throttle state */ + walk_tg_tree_from(cfs_rq->tg, tg_nop, tg_unthrottle_up, (void *)rq); + + if (!cfs_rq->load.weight) + return; + + task_delta = cfs_rq->h_nr_running; + idle_task_delta = cfs_rq->idle_h_nr_running; + for_each_sched_entity(se) { + if (se->on_rq) + enqueue = 0; + + cfs_rq = cfs_rq_of(se); + if (enqueue) + enqueue_entity(cfs_rq, se, ENQUEUE_WAKEUP); + cfs_rq->h_nr_running += task_delta; + cfs_rq->idle_h_nr_running += idle_task_delta; + + if (cfs_rq_throttled(cfs_rq)) + break; + } + + assert_list_leaf_cfs_rq(rq); + + if (!se) + add_nr_running(rq, task_delta); + + /* Determine whether we need to wake up potentially idle CPU: */ + if (rq->curr == rq->idle && rq->cfs.nr_running) + resched_curr(rq); +} + +static int unthrottle_qos_cfs_rqs(int cpu) +{ + struct cfs_rq *cfs_rq, *tmp_rq; + int res = 0; + + list_for_each_entry_safe(cfs_rq, tmp_rq, &per_cpu(qos_throttled_cfs_rq, cpu), + throttled_list) { + if (cfs_rq_throttled(cfs_rq)) { + unthrottle_qos_cfs_rq(cfs_rq); + res++; + } + } + + return res; +} + +static bool check_qos_cfs_rq(struct cfs_rq *cfs_rq) +{ + if (unlikely(cfs_rq && cfs_rq->tg->qos_level < 0 && + !sched_idle_cpu(smp_processor_id()) && + cfs_rq->h_nr_running == cfs_rq->idle_h_nr_running)) { + throttle_qos_cfs_rq(cfs_rq); + return true; + } + + return false; +} +#endif + #ifdef CONFIG_SMP static struct task_struct *pick_task_fair(struct rq *rq) { @@ -8187,6 +8309,16 @@ pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf se = pick_next_entity(cfs_rq, curr); cfs_rq = group_cfs_rq(se); +#ifdef CONFIG_QOS_SCHED + if (check_qos_cfs_rq(cfs_rq)) { + cfs_rq = &rq->cfs; + WARN(cfs_rq->nr_running == 0, + "rq->nr_running=%u, 
cfs_rq->idle_h_nr_running=%u\n", + rq->nr_running, cfs_rq->idle_h_nr_running); + if (unlikely(!cfs_rq->nr_running)) + return NULL; + } +#endif } while (cfs_rq); p = task_of(se); @@ -8265,6 +8397,12 @@ done: __maybe_unused; if (new_tasks > 0) goto again; +#ifdef CONFIG_QOS_SCHED + if (unthrottle_qos_cfs_rqs(cpu_of(rq))) { + rq->idle_stamp = 0; + goto again; + } +#endif /* * rq is about to be idle, check if we need to update the * lost_idle_time of clock_pelt @@ -12884,6 +13022,13 @@ void show_numa_stats(struct task_struct *p, struct seq_file *m) __init void init_sched_fair_class(void) { +#ifdef CONFIG_QOS_SCHED + int i; + + for_each_possible_cpu(i) + INIT_LIST_HEAD(&per_cpu(qos_throttled_cfs_rq, i)); +#endif + #ifdef CONFIG_SMP int i; -- Gitee From d584ec43b78139d1dd93934c0a5696f50293766a Mon Sep 17 00:00:00 2001 From: Zhang Qiao Date: Wed, 30 Aug 2023 20:57:50 +0800 Subject: [PATCH 03/15] sched: Fix offline task can't be killed in a timely hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I7WWMX CVE: NA -------------------------------- If online tasks occupy 100% CPU resources, offline tasks can't be scheduled since offline tasks are throttled, as a result, offline task can't timely respond after receiving SIGKILL signal. 
Signed-off-by: Zhang Qiao --- include/linux/cgroup.h | 4 ++++ include/linux/sched.h | 4 ++++ kernel/cgroup/cgroup.c | 22 ++++++++++++++++++++++ kernel/sched/core.c | 32 ++++++++++++++++++++++++++++++++ kernel/signal.c | 3 +++ 5 files changed, 65 insertions(+) diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 885f5395fcd0..b400d8d278c0 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -857,4 +857,8 @@ static inline void cgroup_bpf_put(struct cgroup *cgrp) {} #endif /* CONFIG_CGROUP_BPF */ +#ifdef CONFIG_QOS_SCHED +void cgroup_move_task_to_root(struct task_struct *tsk); +#endif + #endif /* _LINUX_CGROUP_H */ diff --git a/include/linux/sched.h b/include/linux/sched.h index 2bdd7156564a..bdc4db75ff1c 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2467,4 +2467,8 @@ static inline bool dynamic_affinity_enabled(void) return static_branch_unlikely(&__dynamic_affinity_switch); } #endif +#ifdef CONFIG_QOS_SCHED +void sched_move_offline_task(struct task_struct *p); +#endif + #endif diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 4d42f0cbc11e..d65411063781 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -3013,6 +3013,28 @@ void cgroup_procs_write_finish(struct task_struct *task, bool threadgroup_locked ss->post_attach(); } +#ifdef CONFIG_QOS_SCHED +void cgroup_move_task_to_root(struct task_struct *tsk) +{ + struct css_set *css; + struct cgroup *cpu_cgrp; + struct cgroup *cpu_root_cgrp; + + mutex_lock(&cgroup_mutex); + percpu_down_write(&cgroup_threadgroup_rwsem); + + spin_lock_irq(&css_set_lock); + css = task_css_set(tsk); + cpu_cgrp = css->subsys[cpu_cgrp_id]->cgroup; + cpu_root_cgrp = &cpu_cgrp->root->cgrp; + spin_unlock_irq(&css_set_lock); + + (void)cgroup_attach_task(cpu_root_cgrp, tsk, false); + percpu_up_write(&cgroup_threadgroup_rwsem); + mutex_unlock(&cgroup_mutex); +} +#endif + static void cgroup_print_ss_mask(struct seq_file *seq, u16 ss_mask) { struct cgroup_subsys *ss; diff 
--git a/kernel/sched/core.c b/kernel/sched/core.c index ac7abb6c5c64..2309a14e81de 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -10347,6 +10347,38 @@ static void sched_change_qos_group(struct task_struct *tsk, struct task_group *t __setscheduler_prio(tsk, normal_prio(tsk)); } } + +struct offline_args { + struct work_struct work; + struct task_struct *p; +}; + +static void sched_move_work(struct work_struct *work) +{ + struct sched_param param = { .sched_priority = 0 }; + struct offline_args *args = container_of(work, struct offline_args, work); + + cgroup_move_task_to_root(args->p); + sched_setscheduler(args->p, SCHED_NORMAL, ¶m); + put_task_struct(args->p); + kfree(args); +} + +void sched_move_offline_task(struct task_struct *p) +{ + struct offline_args *args; + + if (unlikely(task_group(p)->qos_level != -1)) + return; + + args = kmalloc(sizeof(struct offline_args), GFP_ATOMIC); + if (args) { + get_task_struct(p); + args->p = p; + INIT_WORK(&args->work, sched_move_work); + queue_work(system_highpri_wq, &args->work); + } +} #endif static inline void alloc_uclamp_sched_group(struct task_group *tg, diff --git a/kernel/signal.c b/kernel/signal.c index 2547fa73bde5..e50383449808 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1054,6 +1054,9 @@ static void complete_signal(int sig, struct task_struct *p, enum pid_type type) signal->group_stop_count = 0; t = p; do { +#ifdef CONFIG_QOS_SCHED + sched_move_offline_task(t); +#endif task_clear_jobctl_pending(t, JOBCTL_PENDING_MASK); sigaddset(&t->pending.signal, SIGKILL); signal_wake_up(t, 1); -- Gitee From 0c52eb39ca30e0fe971d33025166daacf53859c3 Mon Sep 17 00:00:00 2001 From: Zhang Qiao Date: Wed, 30 Aug 2023 20:57:51 +0800 Subject: [PATCH 04/15] sched: Unthrottle the throttled cfs rq when offline rq hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I7WWMX CVE: NA -------------------------------- In cpu hotplug case, when a cpu go to offline, we should unthrottle 
cfs_rq which be throttled on this cpu, so they will be migrated to online cpu. Signed-off-by: Zhang Qiao --- kernel/sched/fair.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 94290eefba36..eff81e480604 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -166,6 +166,7 @@ int __weak arch_asym_cpu_priority(int cpu) #ifdef CONFIG_QOS_SCHED static DEFINE_PER_CPU_SHARED_ALIGNED(struct list_head, qos_throttled_cfs_rq); +static int unthrottle_qos_cfs_rqs(int cpu); #endif #ifdef CONFIG_CFS_BANDWIDTH @@ -6141,6 +6142,10 @@ static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq) unthrottle_cfs_rq(cfs_rq); } rcu_read_unlock(); + +#ifdef CONFIG_QOS_SCHED + unthrottle_qos_cfs_rqs(cpu_of(rq)); +#endif } #else /* CONFIG_CFS_BANDWIDTH */ -- Gitee From fc549e93f14dad1f429b74e402fe9cacb7ff82e3 Mon Sep 17 00:00:00 2001 From: Zhang Qiao Date: Wed, 30 Aug 2023 20:57:52 +0800 Subject: [PATCH 05/15] sched: Unthrottle qos cfs rq when free a task group hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I7WWMX CVE: NA -------------------------------- When freeing a taskgroup, we will free cfs rqs of the group, even if cfs rqs have been throttled, otherwise it will cause a UAF Bug. Therefore before freeing a taskgroup, we should unthrottle all cfs rqs belonging to the taskgroup. 
Signed-off-by: Zhang Qiao --- kernel/sched/fair.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index eff81e480604..fee251393313 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -8222,6 +8222,17 @@ static bool check_qos_cfs_rq(struct cfs_rq *cfs_rq) return false; } + +static inline void unthrottle_qos_sched_group(struct cfs_rq *cfs_rq) +{ + struct rq *rq = rq_of(cfs_rq); + struct rq_flags rf; + + rq_lock_irqsave(rq, &rf); + if (cfs_rq->tg->qos_level == -1 && cfs_rq_throttled(cfs_rq)) + unthrottle_qos_cfs_rq(cfs_rq); + rq_unlock_irqrestore(rq, &rf); +} #endif #ifdef CONFIG_SMP @@ -12653,6 +12664,10 @@ void free_fair_sched_group(struct task_group *tg) int i; for_each_possible_cpu(i) { +#ifdef CONFIG_QOS_SCHED + if (tg->cfs_rq) + unthrottle_qos_sched_group(tg->cfs_rq[i]); +#endif if (tg->cfs_rq) kfree(tg->cfs_rq[i]); if (tg->se) -- Gitee From 46469707e66b730189fa6c3884d9c733a6d4ae25 Mon Sep 17 00:00:00 2001 From: Zhang Qiao Date: Wed, 30 Aug 2023 20:57:53 +0800 Subject: [PATCH 06/15] sched: bugfix setscheduler unlock cpuset_rwsem hulk inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I7WWMX CVE: NA -------------------------------- Signed-off-by: Zhang Qiao --- kernel/sched/core.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 2309a14e81de..b82bddac1352 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7687,8 +7687,8 @@ static int __sched_setscheduler(struct task_struct *p, * isolation will be invalid, so return -EINVAL in this case. 
*/ if (unlikely(task_group(p)->qos_level == -1 && !idle_policy(policy))) { - task_rq_unlock(rq, p, &rf); - return -EINVAL; + retval = -EINVAL; + goto unlock; } #endif -- Gitee From 7d0a7e79a7114ac65b9fbab1c9d178fc2fe52197 Mon Sep 17 00:00:00 2001 From: Zhang Qiao Date: Wed, 30 Aug 2023 20:57:54 +0800 Subject: [PATCH 07/15] sched: Introduce handle priority reversion mechanism hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I7WWMX CVE: NA -------------------------------- When online tasks occupy cpu long time, offline task will not get cpu to run, the priority inversion issue may be triggered in this case. If the above case occurs, we will unthrottle offline tasks and let its get a chance to run. When online tasks occupy cpu over 5s(defaule value), we will unthrottle offline tasks and enter a msleep loop before exit to usermode util the cpu goto idle. Signed-off-by: Zhang Qiao --- include/linux/sched.h | 7 +++ include/linux/sched/sysctl.h | 5 +++ kernel/entry/common.c | 7 ++- kernel/sched/core.c | 3 ++ kernel/sched/fair.c | 84 ++++++++++++++++++++++++++++++++++-- kernel/sched/sched.h | 4 ++ kernel/sysctl.c | 24 +++++++++++ 7 files changed, 130 insertions(+), 4 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index bdc4db75ff1c..6db0879089df 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2469,6 +2469,13 @@ static inline bool dynamic_affinity_enabled(void) #endif #ifdef CONFIG_QOS_SCHED void sched_move_offline_task(struct task_struct *p); +void sched_qos_offline_wait(void); +int sched_qos_cpu_overload(void); +#else +static inline int sched_qos_cpu_overload(void) +{ + return 0; +} #endif #endif diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h index ede157a678f8..28d9be8e4614 100644 --- a/include/linux/sched/sysctl.h +++ b/include/linux/sched/sysctl.h @@ -32,4 +32,9 @@ extern int sysctl_numa_balancing_mode; #ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY extern int 
sysctl_sched_util_low_pct; #endif +#ifdef CONFIG_QOS_SCHED +extern unsigned int sysctl_overload_detect_period; +extern unsigned int sysctl_offline_wait_interval; +#endif + #endif /* _LINUX_SCHED_SYSCTL_H */ diff --git a/kernel/entry/common.c b/kernel/entry/common.c index be61332c66b5..e3df7fdfd901 100644 --- a/kernel/entry/common.c +++ b/kernel/entry/common.c @@ -170,6 +170,10 @@ static unsigned long exit_to_user_mode_loop(struct pt_regs *regs, if (ti_work & _TIF_NOTIFY_RESUME) resume_user_mode_work(regs); +#ifdef CONFIG_QOS_SCHED + sched_qos_offline_wait(); +#endif + /* Architecture specific TIF work */ arch_exit_to_user_mode_work(regs, ti_work); @@ -200,7 +204,8 @@ static void exit_to_user_mode_prepare(struct pt_regs *regs) tick_nohz_user_enter_prepare(); ti_work = read_thread_flags(); - if (unlikely(ti_work & EXIT_TO_USER_MODE_WORK)) + if (unlikely((ti_work & EXIT_TO_USER_MODE_WORK) || + sched_qos_cpu_overload())) ti_work = exit_to_user_mode_loop(regs, ti_work); arch_exit_to_user_mode_prepare(regs, ti_work); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index b82bddac1352..f72cd213a784 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -9981,6 +9981,9 @@ void __init sched_init(void) * We achieve this by letting root_task_group's tasks sit * directly in rq->cfs (i.e root_task_group->se[] = NULL). 
*/ +#ifdef CONFIG_QOS_SCHED + init_qos_hrtimer(i); +#endif init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, NULL); #endif /* CONFIG_FAIR_GROUP_SCHED */ diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index fee251393313..1fe68608877a 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -56,6 +56,10 @@ #include "stats.h" #include "autogroup.h" +#ifdef CONFIG_QOS_SCHED +#include +#endif + /* * Targeted preemption latency for CPU-bound tasks: * @@ -166,6 +170,10 @@ int __weak arch_asym_cpu_priority(int cpu) #ifdef CONFIG_QOS_SCHED static DEFINE_PER_CPU_SHARED_ALIGNED(struct list_head, qos_throttled_cfs_rq); +static DEFINE_PER_CPU_SHARED_ALIGNED(struct hrtimer, qos_overload_timer); +static DEFINE_PER_CPU(int, qos_cpu_overload); +unsigned int sysctl_overload_detect_period = 5000; /* in ms */ +unsigned int sysctl_offline_wait_interval = 100; /* in ms */ static int unthrottle_qos_cfs_rqs(int cpu); #endif @@ -8107,6 +8115,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ } #ifdef CONFIG_QOS_SCHED +static void start_qos_hrtimer(int cpu); static void throttle_qos_cfs_rq(struct cfs_rq *cfs_rq) { struct rq *rq = rq_of(cfs_rq); @@ -8140,6 +8149,9 @@ static void throttle_qos_cfs_rq(struct cfs_rq *cfs_rq) if (!se) sub_nr_running(rq, task_delta); + if (list_empty(&per_cpu(qos_throttled_cfs_rq, cpu_of(rq)))) + start_qos_hrtimer(cpu_of(rq)); + cfs_rq->throttled = 1; cfs_rq->throttled_clock = rq_clock(rq); @@ -8195,7 +8207,7 @@ static void unthrottle_qos_cfs_rq(struct cfs_rq *cfs_rq) resched_curr(rq); } -static int unthrottle_qos_cfs_rqs(int cpu) +static int __unthrottle_qos_cfs_rqs(int cpu) { struct cfs_rq *cfs_rq, *tmp_rq; int res = 0; @@ -8211,11 +8223,25 @@ static int unthrottle_qos_cfs_rqs(int cpu) return res; } +static int unthrottle_qos_cfs_rqs(int cpu) +{ + int res; + + res = __unthrottle_qos_cfs_rqs(cpu); + if (res) + hrtimer_cancel(&(per_cpu(qos_overload_timer, cpu))); + + return res; +} + static bool 
check_qos_cfs_rq(struct cfs_rq *cfs_rq) { + if (unlikely(__this_cpu_read(qos_cpu_overload))) + return false; + if (unlikely(cfs_rq && cfs_rq->tg->qos_level < 0 && - !sched_idle_cpu(smp_processor_id()) && - cfs_rq->h_nr_running == cfs_rq->idle_h_nr_running)) { + !sched_idle_cpu(smp_processor_id()) && + cfs_rq->h_nr_running == cfs_rq->idle_h_nr_running)) { throttle_qos_cfs_rq(cfs_rq); return true; } @@ -8233,6 +8259,56 @@ static inline void unthrottle_qos_sched_group(struct cfs_rq *cfs_rq) unthrottle_qos_cfs_rq(cfs_rq); rq_unlock_irqrestore(rq, &rf); } + +void sched_qos_offline_wait(void) +{ + long qos_level; + + while (unlikely(this_cpu_read(qos_cpu_overload))) { + rcu_read_lock(); + qos_level = task_group(current)->qos_level; + rcu_read_unlock(); + if (qos_level != -1 || signal_pending(current)) + break; + msleep_interruptible(sysctl_offline_wait_interval); + } +} + +int sched_qos_cpu_overload(void) +{ + return __this_cpu_read(qos_cpu_overload); +} + +static enum hrtimer_restart qos_overload_timer_handler(struct hrtimer *timer) +{ + struct rq_flags rf; + struct rq *rq = this_rq(); + + rq_lock_irqsave(rq, &rf); + if (__unthrottle_qos_cfs_rqs(smp_processor_id())) + __this_cpu_write(qos_cpu_overload, 1); + rq_unlock_irqrestore(rq, &rf); + + return HRTIMER_NORESTART; +} + +static void start_qos_hrtimer(int cpu) +{ + ktime_t time; + struct hrtimer *hrtimer = &(per_cpu(qos_overload_timer, cpu)); + + time = ktime_add_ms(hrtimer->base->get_time(), (u64)sysctl_overload_detect_period); + hrtimer_set_expires(hrtimer, time); + hrtimer_start_expires(hrtimer, HRTIMER_MODE_ABS_PINNED); +} + +void init_qos_hrtimer(int cpu) +{ + struct hrtimer *hrtimer = &(per_cpu(qos_overload_timer, cpu)); + + hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED); + hrtimer->function = qos_overload_timer_handler; +} #endif #ifdef CONFIG_SMP @@ -8418,6 +8494,8 @@ done: __maybe_unused; rq->idle_stamp = 0; goto again; } + + __this_cpu_write(qos_cpu_overload, 0); #endif /* * rq is about to 
be idle, check if we need to update the diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 5a8afed85a10..feafe8661075 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1401,6 +1401,10 @@ do { \ flags = _raw_spin_rq_lock_irqsave(rq); \ } while (0) +#ifdef CONFIG_QOS_SCHED +void init_qos_hrtimer(int cpu); +#endif + #ifdef CONFIG_SCHED_SMT extern void __update_idle_core(struct rq *rq); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index acc20b417dc8..e9af234bf882 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -96,6 +96,10 @@ EXPORT_SYMBOL_GPL(sysctl_long_vals); static const int six_hundred_forty_kb = 640 * 1024; #endif +#ifdef CONFIG_QOS_SCHED +static int one_thousand = 1000; +static int hundred_thousand = 100000; +#endif static const int ngroups_max = NGROUPS_MAX; static const int cap_last_cap = CAP_LAST_CAP; @@ -2045,6 +2049,26 @@ static struct ctl_table kern_table[] = { .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, +#ifdef CONFIG_QOS_SCHED + { + .procname = "qos_overload_detect_period_ms", + .data = &sysctl_overload_detect_period, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ONE_HUNDRED, + .extra2 = &hundred_thousand, + }, + { + .procname = "qos_offline_wait_interval_ms", + .data = &sysctl_offline_wait_interval, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ONE_HUNDRED, + .extra2 = &one_thousand, + }, +#endif { .procname = "max_rcu_stall_to_panic", .data = &sysctl_max_rcu_stall_to_panic, -- Gitee From fa5bd62e9bea221fbfa8cfd94a7951145f737dea Mon Sep 17 00:00:00 2001 From: Zhang Qiao Date: Wed, 30 Aug 2023 20:57:55 +0800 Subject: [PATCH 08/15] sched: enable CONFIG_QOS_SCHED on arm64 hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I7WWMX CVE: NA -------------------------------- Signed-off-by: Zhang Qiao --- arch/arm64/configs/openeuler_defconfig | 1 + 1 file changed, 1 
insertion(+) diff --git a/arch/arm64/configs/openeuler_defconfig b/arch/arm64/configs/openeuler_defconfig index 081a223bc65b..c16cb045d3a4 100644 --- a/arch/arm64/configs/openeuler_defconfig +++ b/arch/arm64/configs/openeuler_defconfig @@ -176,6 +176,7 @@ CONFIG_CGROUP_CPUACCT=y CONFIG_CGROUP_PERF=y CONFIG_CGROUP_BPF=y # CONFIG_CGROUP_MISC is not set +CONFIG_QOS_SCHED=y # CONFIG_CGROUP_DEBUG is not set CONFIG_SOCK_CGROUP_DATA=y CONFIG_NAMESPACES=y -- Gitee From 9f900b04847ef122c5273dcefc02424980468428 Mon Sep 17 00:00:00 2001 From: Zhang Qiao Date: Wed, 30 Aug 2023 20:57:56 +0800 Subject: [PATCH 09/15] sched: Fix sleeping in atomic context at cpu_qos_write() hulk inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I7WWMX CVE: NA -------------------------------- cfs_bandwidth_usage_inc() need hold jump_label_mutex and might sleep, so we can not call it in atomic context. Fix this by moving cfs_bandwidth_usage_{inc,dec}() out of rcu read critical section. Signed-off-by: Zhang Qiao --- kernel/sched/core.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index f72cd213a784..08ce8aada0b0 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -11242,13 +11242,10 @@ static int tg_change_scheduler(struct task_group *tg, void *data) struct cgroup_subsys_state *css = &tg->css; tg->qos_level = qos_level; - if (qos_level == -1) { + if (qos_level == -1) policy = SCHED_IDLE; - cfs_bandwidth_usage_inc(); - } else { + else policy = SCHED_NORMAL; - cfs_bandwidth_usage_dec(); - } param.sched_priority = 0; css_task_iter_start(css, 0, &it); @@ -11276,6 +11273,13 @@ static int cpu_qos_write(struct cgroup_subsys_state *css, if (tg->qos_level == -1 && qos_level == 0) return -EINVAL; + cpus_read_lock(); + if (qos_level == -1) + cfs_bandwidth_usage_inc(); + else + cfs_bandwidth_usage_dec(); + cpus_read_unlock(); + rcu_read_lock(); walk_tg_tree_from(tg, tg_change_scheduler, tg_nop, 
(void *)(&qos_level)); rcu_read_unlock(); -- Gitee From 432f11e7323328aebc351562e062a8bd264f63d6 Mon Sep 17 00:00:00 2001 From: Zhang Qiao Date: Wed, 30 Aug 2023 20:57:57 +0800 Subject: [PATCH 10/15] sched/fair: Add qos_throttle_list node in struct cfs_rq hulk inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I7WWMX CVE: NA ----------------------------------------------------------------- when unthrottle a cfs_rq at distribute_cfs_runtime(), another cpu may re-throttle this cfs_rq at qos_throttle_cfs_rq() before access the cfs_rq->throttle_list.next, but meanwhile, qos throttle will attach the cfs_rq throttle_list node to percpu qos_throttled_cfs_rq, it will change cfs_rq->throttle_list.next and cause panic or hardlockup at distribute_cfs_runtime(). Fix it by adding a qos_throttle_list node in struct cfs_rq, and qos throttle disuse the cfs_rq->throttle_list. Signed-off-by: Zhang Qiao --- kernel/sched/fair.c | 10 +++++++--- kernel/sched/sched.h | 4 ++++ 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 1fe68608877a..cf75caae5911 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6045,6 +6045,9 @@ static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) #ifdef CONFIG_SMP INIT_LIST_HEAD(&cfs_rq->throttled_csd_list); #endif +#ifdef CONFIG_QOS_SCHED + INIT_LIST_HEAD(&cfs_rq->qos_throttled_list); +#endif } void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b) @@ -8155,7 +8158,8 @@ static void throttle_qos_cfs_rq(struct cfs_rq *cfs_rq) cfs_rq->throttled = 1; cfs_rq->throttled_clock = rq_clock(rq); - list_add(&cfs_rq->throttled_list, &per_cpu(qos_throttled_cfs_rq, cpu_of(rq))); + list_add(&cfs_rq->qos_throttled_list, + &per_cpu(qos_throttled_cfs_rq, cpu_of(rq))); } static void unthrottle_qos_cfs_rq(struct cfs_rq *cfs_rq) @@ -8173,7 +8177,7 @@ static void unthrottle_qos_cfs_rq(struct cfs_rq *cfs_rq) update_rq_clock(rq); cfs_b->throttled_time += rq_clock(rq) - 
cfs_rq->throttled_clock; - list_del_init(&cfs_rq->throttled_list); + list_del_init(&cfs_rq->qos_throttled_list); /* update hierarchical throttle state */ walk_tg_tree_from(cfs_rq->tg, tg_nop, tg_unthrottle_up, (void *)rq); @@ -8213,7 +8217,7 @@ static int __unthrottle_qos_cfs_rqs(int cpu) int res = 0; list_for_each_entry_safe(cfs_rq, tmp_rq, &per_cpu(qos_throttled_cfs_rq, cpu), - throttled_list) { + qos_throttled_list) { if (cfs_rq_throttled(cfs_rq)) { unthrottle_qos_cfs_rq(cfs_rq); res++; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index feafe8661075..7393c1a62513 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -654,6 +654,10 @@ struct cfs_rq { #endif #endif /* CONFIG_CFS_BANDWIDTH */ #endif /* CONFIG_FAIR_GROUP_SCHED */ + +#if defined(CONFIG_QOS_SCHED) + struct list_head qos_throttled_list; +#endif }; static inline int rt_bandwidth_enabled(void) -- Gitee From 505a134aa527337f885a648c65a4d0e4c8398509 Mon Sep 17 00:00:00 2001 From: Zhang Qiao Date: Wed, 30 Aug 2023 20:57:58 +0800 Subject: [PATCH 11/15] sched: Throttle offline task at tracehook_notify_resume() hulk inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I7WWMX CVE: NA -------------------------------- Before, when we detect that the cpu is overloaded, we throttle offline tasks at exit_to_user_mode_loop() before returning to user mode. Some architectures (e.g., arm64) do not support the QOS scheduler because a task does not return to userspace via exit_to_user_mode_loop() on these platforms. In order to solve this problem and support the qos scheduler on all architectures, if we require throttling offline tasks, we set flag TIF_NOTIFY_RESUME to an offline task when it is picked and throttle it at tracehook_notify_resume(). 
Signed-off-by: Zhang Qiao --- include/linux/resume_user_mode.h | 5 +++++ kernel/entry/common.c | 7 +------ kernel/sched/fair.c | 33 ++++++++++++++++++++++++++++++-- 3 files changed, 37 insertions(+), 8 deletions(-) diff --git a/include/linux/resume_user_mode.h b/include/linux/resume_user_mode.h index 285189454449..a868cb5f4303 100644 --- a/include/linux/resume_user_mode.h +++ b/include/linux/resume_user_mode.h @@ -59,6 +59,11 @@ static inline void resume_user_mode_work(struct pt_regs *regs) blkcg_maybe_throttle_current(); rseq_handle_notify_resume(NULL, regs); + +#ifdef CONFIG_QOS_SCHED + sched_qos_offline_wait(); +#endif + } #endif /* LINUX_RESUME_USER_MODE_H */ diff --git a/kernel/entry/common.c b/kernel/entry/common.c index e3df7fdfd901..be61332c66b5 100644 --- a/kernel/entry/common.c +++ b/kernel/entry/common.c @@ -170,10 +170,6 @@ static unsigned long exit_to_user_mode_loop(struct pt_regs *regs, if (ti_work & _TIF_NOTIFY_RESUME) resume_user_mode_work(regs); -#ifdef CONFIG_QOS_SCHED - sched_qos_offline_wait(); -#endif - /* Architecture specific TIF work */ arch_exit_to_user_mode_work(regs, ti_work); @@ -204,8 +200,7 @@ static void exit_to_user_mode_prepare(struct pt_regs *regs) tick_nohz_user_enter_prepare(); ti_work = read_thread_flags(); - if (unlikely((ti_work & EXIT_TO_USER_MODE_WORK) || - sched_qos_cpu_overload())) + if (unlikely(ti_work & EXIT_TO_USER_MODE_WORK)) ti_work = exit_to_user_mode_loop(regs, ti_work); arch_exit_to_user_mode_prepare(regs, ti_work); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index cf75caae5911..ecb9dab2d1ed 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -58,6 +58,7 @@ #ifdef CONFIG_QOS_SCHED #include +#include #endif /* @@ -8118,6 +8119,12 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ } #ifdef CONFIG_QOS_SCHED + +static inline bool is_offline_task(struct task_struct *p) +{ + return task_group(p)->qos_level == QOS_LEVEL_OFFLINE; +} + static void start_qos_hrtimer(int 
cpu); static void throttle_qos_cfs_rq(struct cfs_rq *cfs_rq) { @@ -8272,9 +8279,10 @@ void sched_qos_offline_wait(void) rcu_read_lock(); qos_level = task_group(current)->qos_level; rcu_read_unlock(); - if (qos_level != -1 || signal_pending(current)) + if (qos_level != -1 || fatal_signal_pending(current)) break; - msleep_interruptible(sysctl_offline_wait_interval); + + schedule_timeout_killable(msecs_to_jiffies(sysctl_offline_wait_interval)); } } @@ -8313,6 +8321,23 @@ void init_qos_hrtimer(int cpu) hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED); hrtimer->function = qos_overload_timer_handler; } + +/* + * To avoid Priority inversion issues, when this cpu is qos_cpu_overload, + * we should schedule offline tasks to run so that they can leave kernel + * critical sections, and throttle them before returning to user mode. + */ +static void qos_schedule_throttle(struct task_struct *p) +{ + if (unlikely(current->flags & PF_KTHREAD)) + return; + + if (unlikely(this_cpu_read(qos_cpu_overload))) { + if (is_offline_task(p)) + set_notify_resume(p); + } +} + #endif #ifdef CONFIG_SMP @@ -8474,6 +8499,10 @@ done: __maybe_unused; update_misfit_status(p, rq); +#ifdef CONFIG_QOS_SCHED + qos_schedule_throttle(p); +#endif + return p; idle: -- Gitee From 1de5d5f0f391776267cd71cc4da787aade3c7537 Mon Sep 17 00:00:00 2001 From: Zhang Qiao Date: Wed, 30 Aug 2023 20:57:59 +0800 Subject: [PATCH 12/15] sched/qos: Add qos_tg_{throttle,unthrottle}_{up,down} hulk inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I7WWMX CVE: NA -------------------------------- 1. Qos throttle reuse tg_{throttle,unthrottle}_{up,down} that can write some cfs-bandwidth fields, it may cause some unknown data error. So add qos_tg_{throttle,unthrottle}_{up,down} for qos throttle. 2. walk_tg_tree_from() caller must hold rcu_lock, currently there is none, so add it now. 
Signed-off-by: Zhang Qiao --- kernel/sched/fair.c | 40 +++++++++++++++++++++++++++++----------- 1 file changed, 29 insertions(+), 11 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index ecb9dab2d1ed..bf5aa616ee0d 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6132,6 +6132,10 @@ static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq) lockdep_assert_rq_held(rq); +#ifdef CONFIG_QOS_SCHED + unthrottle_qos_cfs_rqs(cpu_of(rq)); +#endif + rcu_read_lock(); list_for_each_entry_rcu(tg, &task_groups, list) { struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)]; @@ -6154,10 +6158,6 @@ static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq) unthrottle_cfs_rq(cfs_rq); } rcu_read_unlock(); - -#ifdef CONFIG_QOS_SCHED - unthrottle_qos_cfs_rqs(cpu_of(rq)); -#endif } #else /* CONFIG_CFS_BANDWIDTH */ @@ -8126,6 +8126,27 @@ static inline bool is_offline_task(struct task_struct *p) } static void start_qos_hrtimer(int cpu); + +static int qos_tg_unthrottle_up(struct task_group *tg, void *data) +{ + struct rq *rq = data; + struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)]; + + cfs_rq->throttle_count--; + + return 0; +} + +static int qos_tg_throttle_down(struct task_group *tg, void *data) +{ + struct rq *rq = data; + struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)]; + + cfs_rq->throttle_count++; + + return 0; +} + static void throttle_qos_cfs_rq(struct cfs_rq *cfs_rq) { struct rq *rq = rq_of(cfs_rq); @@ -8136,7 +8157,7 @@ static void throttle_qos_cfs_rq(struct cfs_rq *cfs_rq) /* freeze hierarchy runnable averages while throttled */ rcu_read_lock(); - walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq); + walk_tg_tree_from(cfs_rq->tg, qos_tg_throttle_down, tg_nop, (void *)rq); rcu_read_unlock(); task_delta = cfs_rq->h_nr_running; @@ -8163,7 +8184,6 @@ static void throttle_qos_cfs_rq(struct cfs_rq *cfs_rq) start_qos_hrtimer(cpu_of(rq)); cfs_rq->throttled = 1; - cfs_rq->throttled_clock = rq_clock(rq); 
list_add(&cfs_rq->qos_throttled_list, &per_cpu(qos_throttled_cfs_rq, cpu_of(rq))); @@ -8172,7 +8192,6 @@ static void throttle_qos_cfs_rq(struct cfs_rq *cfs_rq) static void unthrottle_qos_cfs_rq(struct cfs_rq *cfs_rq) { struct rq *rq = rq_of(cfs_rq); - struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); struct sched_entity *se; int enqueue = 1; long task_delta, idle_task_delta; @@ -8181,13 +8200,12 @@ static void unthrottle_qos_cfs_rq(struct cfs_rq *cfs_rq) cfs_rq->throttled = 0; - update_rq_clock(rq); - - cfs_b->throttled_time += rq_clock(rq) - cfs_rq->throttled_clock; list_del_init(&cfs_rq->qos_throttled_list); /* update hierarchical throttle state */ - walk_tg_tree_from(cfs_rq->tg, tg_nop, tg_unthrottle_up, (void *)rq); + rcu_read_lock(); + walk_tg_tree_from(cfs_rq->tg, tg_nop, qos_tg_unthrottle_up, (void *)rq); + rcu_read_unlock(); if (!cfs_rq->load.weight) return; -- Gitee From 969507e9f3f5243fdf68aa1c6db879b8c4b0a5c5 Mon Sep 17 00:00:00 2001 From: Zhang Qiao Date: Wed, 30 Aug 2023 20:58:00 +0800 Subject: [PATCH 13/15] sched/fair: Update rq clock before unthrottle a qos cfs_rq hulk inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I7WWMX CVE: NA -------------------------------- ------------[ cut here]------------ rq->clock_update_flags < RQCF_ACT_SKIP WARNING: CPU: 5 PID: 3312 at kernel/sched/sched.h:1223 update_curr+0x1e5/0x210 CPU: 5 PID: 3312 Comm: a.out Tainted: G S5.10.0.zq+ #1 Hardware name: Huawei RH2288H V3/BC11HGSA0, BIOS 3.35 10/20/2016 RIP: 0010:update_curr+0x1e5/0x210 enqueue_entity+0x378/0xd00 unthrottle_qos_cfs_rq+0x1bc/0x2a0 __unthrottle_qos_cfs_rqs+0x87/0xa0 qos_overload_timer_handler+0x35/0x60 __run_hrtimer+0x5e/0x190 __hrtimer_run_queues+0x81/0xe0 hrtimer_interrupt+0x110/0x2c0 __sysvec_apic_timer_interrupt+0x5f/0xd0 sysvec_apic_timer_interrupt+0x31/0x80 asm_sysvec_apic_timer_interrupt+0x12/0x20 After the last rq_pin_lock(), there is no rq clock update before calling enqueue_entity() at 
unthrottle_qos_cfs_rq(); This patch fixes it by updating rq clock before calling enqueue_entity(). Signed-off-by: Zhang Qiao --- kernel/sched/fair.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index bf5aa616ee0d..2add3d9c19b7 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -8200,6 +8200,7 @@ static void unthrottle_qos_cfs_rq(struct cfs_rq *cfs_rq) cfs_rq->throttled = 0; + update_rq_clock(rq); list_del_init(&cfs_rq->qos_throttled_list); /* update hierarchical throttle state */ -- Gitee From 08e4707c331b4d76dd06cece40da23366b8c69b2 Mon Sep 17 00:00:00 2001 From: Zhang Qiao Date: Wed, 30 Aug 2023 20:58:01 +0800 Subject: [PATCH 14/15] sched/qos: Don't unthrottle cfs_rq when cfs_rq is throttled by qos hulk inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I7WWMX CVE: NA ------------------------------- When a cfs_rq throttled by qos, mark cfs_rq->throttled as 1, and cfs bw will unthrottled this cfs_rq by mistake, it cause a list_del_valid warning. So add macro QOS_THROTTLED(=2), when a cfs_rq is throttled by qos, we mark the cfs_rq->throttled as QOS_THROTTLED, will check the value of cfs_rq->throttled before unthrottle a cfs_rq. Signed-off-by: Zhang Qiao --- kernel/sched/fair.c | 139 ++++++++++++++++++++++++++++---------------- 1 file changed, 90 insertions(+), 49 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 2add3d9c19b7..bc58182b201f 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -170,6 +170,13 @@ int __weak arch_asym_cpu_priority(int cpu) #endif #ifdef CONFIG_QOS_SCHED + +/* + * To distinguish cfs bw, use QOS_THROTTLED mark cfs_rq->throttled + * when qos throttled(and cfs bw throttle mark cfs_rq->throttled as 1). 
+ */ +#define QOS_THROTTLED 2 + static DEFINE_PER_CPU_SHARED_ALIGNED(struct list_head, qos_throttled_cfs_rq); static DEFINE_PER_CPU_SHARED_ALIGNED(struct hrtimer, qos_overload_timer); static DEFINE_PER_CPU(int, qos_cpu_overload); @@ -5507,6 +5514,14 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) se = cfs_rq->tg->se[cpu_of(rq)]; +#ifdef CONFIG_QOS_SCHED + /* + * if this cfs_rq throttled by qos, not need unthrottle it. + */ + if (cfs_rq->throttled == QOS_THROTTLED) + return; +#endif + cfs_rq->throttled = 0; update_rq_clock(rq); @@ -8122,42 +8137,22 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ static inline bool is_offline_task(struct task_struct *p) { - return task_group(p)->qos_level == QOS_LEVEL_OFFLINE; + return task_group(p)->qos_level == -1; } static void start_qos_hrtimer(int cpu); -static int qos_tg_unthrottle_up(struct task_group *tg, void *data) -{ - struct rq *rq = data; - struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)]; - - cfs_rq->throttle_count--; - - return 0; -} - -static int qos_tg_throttle_down(struct task_group *tg, void *data) -{ - struct rq *rq = data; - struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)]; - - cfs_rq->throttle_count++; - - return 0; -} - static void throttle_qos_cfs_rq(struct cfs_rq *cfs_rq) { struct rq *rq = rq_of(cfs_rq); struct sched_entity *se; - long task_delta, idle_task_delta, dequeue = 1; + long task_delta, idle_task_delta; se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))]; /* freeze hierarchy runnable averages while throttled */ rcu_read_lock(); - walk_tg_tree_from(cfs_rq->tg, qos_tg_throttle_down, tg_nop, (void *)rq); + walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq); rcu_read_unlock(); task_delta = cfs_rq->h_nr_running; @@ -8166,24 +8161,44 @@ static void throttle_qos_cfs_rq(struct cfs_rq *cfs_rq) struct cfs_rq *qcfs_rq = cfs_rq_of(se); /* throttled entity or throttle-on-deactivate */ if (!se->on_rq) - break; + goto done; + + dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP); - 
if (dequeue) - dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP); qcfs_rq->h_nr_running -= task_delta; qcfs_rq->idle_h_nr_running -= idle_task_delta; - if (qcfs_rq->load.weight) - dequeue = 0; + if (qcfs_rq->load.weight) { + /* Avoid re-evaluating load for this entity: */ + se = parent_entity(se); + break; + } } - if (!se) - sub_nr_running(rq, task_delta); + for_each_sched_entity(se) { + struct cfs_rq *qcfs_rq = cfs_rq_of(se); + /* throttled entity or throttle-on-deactivate */ + if (!se->on_rq) + goto done; + + update_load_avg(qcfs_rq, se, 0); + se_update_runnable(se); + + if (cfs_rq_is_idle(group_cfs_rq(se))) + idle_task_delta = cfs_rq->h_nr_running; + + qcfs_rq->h_nr_running -= task_delta; + qcfs_rq->idle_h_nr_running -= idle_task_delta; + } + + /* At this point se is NULL and we are at root level*/ + sub_nr_running(rq, task_delta); +done: if (list_empty(&per_cpu(qos_throttled_cfs_rq, cpu_of(rq)))) start_qos_hrtimer(cpu_of(rq)); - cfs_rq->throttled = 1; + cfs_rq->throttled = QOS_THROTTLED; list_add(&cfs_rq->qos_throttled_list, &per_cpu(qos_throttled_cfs_rq, cpu_of(rq))); @@ -8193,11 +8208,13 @@ static void unthrottle_qos_cfs_rq(struct cfs_rq *cfs_rq) { struct rq *rq = rq_of(cfs_rq); struct sched_entity *se; - int enqueue = 1; long task_delta, idle_task_delta; se = cfs_rq->tg->se[cpu_of(rq)]; + if (cfs_rq->throttled != QOS_THROTTLED) + return; + cfs_rq->throttled = 0; update_rq_clock(rq); @@ -8205,32 +8222,58 @@ static void unthrottle_qos_cfs_rq(struct cfs_rq *cfs_rq) /* update hierarchical throttle state */ rcu_read_lock(); - walk_tg_tree_from(cfs_rq->tg, tg_nop, qos_tg_unthrottle_up, (void *)rq); + walk_tg_tree_from(cfs_rq->tg, tg_nop, tg_unthrottle_up, (void *)rq); rcu_read_unlock(); - if (!cfs_rq->load.weight) - return; + if (!cfs_rq->load.weight) { + if (!cfs_rq->on_list) + return; + /* + * Nothing to run but something to decay (on_list)? + * Complete the branch. 
+ */ + for_each_sched_entity(se) { + if (list_add_leaf_cfs_rq(cfs_rq_of(se))) + break; + } + goto unthrottle_throttle; + } task_delta = cfs_rq->h_nr_running; idle_task_delta = cfs_rq->idle_h_nr_running; for_each_sched_entity(se) { if (se->on_rq) - enqueue = 0; + break; cfs_rq = cfs_rq_of(se); - if (enqueue) - enqueue_entity(cfs_rq, se, ENQUEUE_WAKEUP); + enqueue_entity(cfs_rq, se, ENQUEUE_WAKEUP); + cfs_rq->h_nr_running += task_delta; cfs_rq->idle_h_nr_running += idle_task_delta; if (cfs_rq_throttled(cfs_rq)) - break; + goto unthrottle_throttle; } - assert_list_leaf_cfs_rq(rq); + for_each_sched_entity(se) { + cfs_rq = cfs_rq_of(se); - if (!se) - add_nr_running(rq, task_delta); + update_load_avg(cfs_rq, se, UPDATE_TG); + se_update_runnable(se); + + cfs_rq->h_nr_running += task_delta; + cfs_rq->idle_h_nr_running += idle_task_delta; + + /* end evaluation on encountering a throttled cfs_rq */ + if (cfs_rq_throttled(cfs_rq)) + goto unthrottle_throttle; + } + + add_nr_running(rq, task_delta); + +unthrottle_throttle: + + assert_list_leaf_cfs_rq(rq); /* Determine whether we need to wake up potentially idle CPU: */ if (rq->curr == rq->idle && rq->cfs.nr_running) @@ -13172,13 +13215,6 @@ void show_numa_stats(struct task_struct *p, struct seq_file *m) __init void init_sched_fair_class(void) { -#ifdef CONFIG_QOS_SCHED - int i; - - for_each_possible_cpu(i) - INIT_LIST_HEAD(&per_cpu(qos_throttled_cfs_rq, i)); -#endif - #ifdef CONFIG_SMP int i; @@ -13192,6 +13228,11 @@ __init void init_sched_fair_class(void) #endif } +#ifdef CONFIG_QOS_SCHED + for_each_possible_cpu(i) + INIT_LIST_HEAD(&per_cpu(qos_throttled_cfs_rq, i)); +#endif + open_softirq(SCHED_SOFTIRQ, run_rebalance_domains); #ifdef CONFIG_NO_HZ_COMMON -- Gitee From bc4c9ed70ea98670dba1925ad3d4ec3ec7191c0b Mon Sep 17 00:00:00 2001 From: Zhang Qiao Date: Wed, 30 Aug 2023 20:58:02 +0800 Subject: [PATCH 15/15] sched: Enable qos scheduler config hulk inclusion category: feature bugzilla: 
https://gitee.com/openeuler/kernel/issues/I7WWMX CVE: NA -------------------------------- Enable CONFIG_QOS_SCHED to support qos scheduler. Signed-off-by: Zhang Qiao --- arch/x86/configs/openeuler_defconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/configs/openeuler_defconfig b/arch/x86/configs/openeuler_defconfig index f6140635690e..d553883c262d 100644 --- a/arch/x86/configs/openeuler_defconfig +++ b/arch/x86/configs/openeuler_defconfig @@ -185,6 +185,7 @@ CONFIG_CGROUP_WRITEBACK=y CONFIG_CGROUP_SCHED=y CONFIG_FAIR_GROUP_SCHED=y CONFIG_CFS_BANDWIDTH=y +CONFIG_QOS_SCHED=y CONFIG_RT_GROUP_SCHED=y CONFIG_SCHED_MM_CID=y CONFIG_QOS_SCHED_DYNAMIC_AFFINITY=y -- Gitee