From 21089178193c6416748c4cfaebfe5864c3ce1ac8 Mon Sep 17 00:00:00 2001
From: Hui Tang <tanghui20@huawei.com>
Date: Tue, 19 Dec 2023 07:56:38 +0000
Subject: [PATCH 1/9] sched: Introduce dynamic affinity for cfs scheduler

hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I8LL9S

--------------------------------

Dynamic affinity sets preferred cpus for a task. When the utilization of
the taskgroup's preferred cpus is low, the task only runs on its preferred
cpus to enhance cpu resource locality and reduce interference between task
cgroups; otherwise the task can burst out of the preferred cpus and use any
other cpu within its allowed cpus.

Signed-off-by: Hui Tang <tanghui20@huawei.com>
---
 init/Kconfig | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/init/Kconfig b/init/Kconfig
index 2ee1384c4f81..9bae5fd6641e 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1031,6 +1031,16 @@ config RT_GROUP_SCHED
 
 endif #CGROUP_SCHED
 
+config QOS_SCHED_DYNAMIC_AFFINITY
+	bool "qos dynamic affinity"
+	depends on CPUSETS
+	default n
+	help
+	 This feature lets you allocate preferred cpus to a taskgroup. If enabled,
+	 the taskgroup only uses its preferred cpus while its cpu utilization on
+	 them is below the configured threshold; otherwise the taskgroup may use
+	 all of its allowed cpus.
+
 config SCHED_MM_CID
 	def_bool y
 	depends on SMP && RSEQ
-- 
Gitee


From 72e926932f5ecab4ac9c66c8d09d2e534680e3f6 Mon Sep 17 00:00:00 2001
From: Hui Tang <tanghui20@huawei.com>
Date: Tue, 19 Dec 2023 07:56:39 +0000
Subject: [PATCH 2/9] cpuset: Introduce new interface for scheduler dynamic
 affinity

hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I8LL9S

--------------------------------

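Add the 'preferred_cpus' file to the legacy cpuset cgroup and a per-thread
'preferred_cpuset' file under /proc/PID/task/TID/, together with the
scheduler helpers that store the preferred cpus in each task.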

Signed-off-by: Hui Tang <tanghui20@huawei.com>
---
 fs/proc/base.c         |  73 ++++++++++++++++++++
 include/linux/sched.h  |  10 +++
 init/init_task.c       |   3 +
 kernel/cgroup/cpuset.c | 150 ++++++++++++++++++++++++++++++++++++++++-
 kernel/fork.c          |  13 ++++
 kernel/sched/core.c    |  95 ++++++++++++++++++++++++++
 6 files changed, 343 insertions(+), 1 deletion(-)

diff --git a/fs/proc/base.c b/fs/proc/base.c
index ffd54617c354..243c15919e18 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -3165,6 +3165,76 @@ static const struct file_operations proc_setgroups_operations = {
 };
 #endif /* CONFIG_USER_NS */
 
+#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
+/* seq_file show: print the task's prefer_cpus as a cpu list ("%*pbl"). */
+static int preferred_cpuset_show(struct seq_file *m, void *v)
+{
+	struct inode *inode = m->private;
+	struct task_struct *p;
+
+	p = get_proc_task(inode);
+	if (!p)
+		return -ESRCH;
+
+	if (p->prefer_cpus)
+		seq_printf(m, "%*pbl\n", cpumask_pr_args(p->prefer_cpus));
+	else
+		seq_putc(m, '\n');	/* no mask allocated: emit an empty line */
+
+	put_task_struct(p);	/* pairs with get_proc_task() above */
+
+	return 0;
+}
+/* Parse a cpu list written by userspace and make it the task's prefer_cpus. */
+static ssize_t preferred_cpuset_write(struct file *file, const char __user *buf,
+					size_t count, loff_t *offset)
+{
+	cpumask_var_t new_mask;
+	int retval;
+	struct inode *inode = file_inode(file);
+	struct task_struct *p;
+
+	p = get_proc_task(inode);
+	if (!p)
+		return -ESRCH;
+
+	if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) {
+		retval = -ENOMEM;
+		goto out_put_task;
+	}
+
+	retval = cpumask_parselist_user(buf, count, new_mask);
+	if (retval < 0)
+		goto out_free_cpumask;
+
+	retval = set_prefer_cpus_ptr(p, new_mask);	/* copies new_mask */
+	if (retval < 0)
+		goto out_free_cpumask;
+
+	retval = count;	/* success: report the whole buffer as consumed */
+
+out_free_cpumask:
+	free_cpumask_var(new_mask);
+out_put_task:
+	put_task_struct(p);
+
+	return retval;
+}
+/* Open as a single-record seq_file; the inode becomes m->private for show. */
+static int preferred_cpuset_open(struct inode *inode, struct file *filp)
+{
+	return single_open(filp, preferred_cpuset_show, inode);
+}
+/* File operations for the per-thread "preferred_cpuset" proc file. */
+static const struct file_operations proc_preferred_cpuset_operations = {
+	.open		= preferred_cpuset_open,
+	.write		= preferred_cpuset_write,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
+#endif
+
 static int proc_pid_personality(struct seq_file *m, struct pid_namespace *ns,
 				struct pid *pid, struct task_struct *task)
 {
@@ -3691,6 +3761,9 @@ static const struct pid_entry tid_base_stuff[] = {
 	ONE("ksm_merging_pages",  S_IRUSR, proc_pid_ksm_merging_pages),
 	ONE("ksm_stat",  S_IRUSR, proc_pid_ksm_stat),
 #endif
+#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
+	REG("preferred_cpuset", 0644, proc_preferred_cpuset_operations),
+#endif
 };
 
 static int proc_tid_base_readdir(struct file *file, struct dir_context *ctx)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 3520e3fbaa91..9ae33ae2b6e9 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1537,6 +1537,10 @@ struct task_struct {
 	struct user_event_mm		*user_event_mm;
 #endif
 
+#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
+	cpumask_t			*prefer_cpus;
+#endif
+
 	/*
 	 * New fields for task_struct should be added above here, so that
 	 * they are included in the randomized portion of task_struct.
@@ -2469,4 +2473,10 @@ static inline int sched_qos_cpu_overload(void)
 }
 #endif
 
+#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
+int set_prefer_cpus_ptr(struct task_struct *p,
+			const struct cpumask *new_mask);
+int sched_prefer_cpus_fork(struct task_struct *p, struct cpumask *mask);
+void sched_prefer_cpus_free(struct task_struct *p);
+#endif
 #endif
diff --git a/init/init_task.c b/init/init_task.c
index ff6c4b9bfe6b..ac0c5850f74b 100644
--- a/init/init_task.c
+++ b/init/init_task.c
@@ -207,6 +207,9 @@ struct task_struct init_task
 #ifdef CONFIG_SECURITY
 	.security	= NULL,
 #endif
+#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
+	.prefer_cpus	= NULL,
+#endif
 #ifdef CONFIG_SECCOMP_FILTER
 	.seccomp	= { .filter_count = ATOMIC_INIT(0) },
 #endif
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index 4749e0c86c62..01f4ff02e7b2 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -115,6 +115,9 @@ struct cpuset {
 	/* user-configured CPUs and Memory Nodes allow to tasks */
 	cpumask_var_t cpus_allowed;
 	nodemask_t mems_allowed;
+#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
+	cpumask_var_t prefer_cpus;
+#endif
 
 	/* effective CPUs and Memory Nodes allow to tasks */
 	cpumask_var_t effective_cpus;
@@ -212,6 +215,9 @@ static inline bool is_prs_invalid(int prs_state)
 struct tmpmasks {
 	cpumask_var_t addmask, delmask;	/* For partition root */
 	cpumask_var_t new_cpus;		/* For update_cpumasks_hier() */
+#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
+	cpumask_var_t prefer_cpus;
+#endif
 };
 
 static inline struct cpuset *css_cs(struct cgroup_subsys_state *css)
@@ -597,15 +603,24 @@ static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q)
 static inline int alloc_cpumasks(struct cpuset *cs, struct tmpmasks *tmp)
 {
 	cpumask_var_t *pmask1, *pmask2, *pmask3;
+#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
+	cpumask_var_t *pmask4;
+#endif
 
 	if (cs) {
 		pmask1 = &cs->cpus_allowed;
 		pmask2 = &cs->effective_cpus;
 		pmask3 = &cs->subparts_cpus;
+#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
+		pmask4 = &cs->prefer_cpus;
+#endif
 	} else {
 		pmask1 = &tmp->new_cpus;
 		pmask2 = &tmp->addmask;
 		pmask3 = &tmp->delmask;
+#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
+		pmask4 = &tmp->prefer_cpus;
+#endif
 	}
 
 	if (!zalloc_cpumask_var(pmask1, GFP_KERNEL))
@@ -616,9 +631,17 @@ static inline int alloc_cpumasks(struct cpuset *cs, struct tmpmasks *tmp)
 
 	if (!zalloc_cpumask_var(pmask3, GFP_KERNEL))
 		goto free_two;
+#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
+	if (!zalloc_cpumask_var(pmask4, GFP_KERNEL))
+		goto free_three;
+#endif
 
 	return 0;
 
+#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
+free_three:
+	free_cpumask_var(*pmask3);
+#endif
 free_two:
 	free_cpumask_var(*pmask2);
 free_one:
@@ -634,11 +657,17 @@ static inline int alloc_cpumasks(struct cpuset *cs, struct tmpmasks *tmp)
 static inline void free_cpumasks(struct cpuset *cs, struct tmpmasks *tmp)
 {
 	if (cs) {
+#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
+		free_cpumask_var(cs->prefer_cpus);
+#endif
 		free_cpumask_var(cs->cpus_allowed);
 		free_cpumask_var(cs->effective_cpus);
 		free_cpumask_var(cs->subparts_cpus);
 	}
 	if (tmp) {
+#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
+		free_cpumask_var(tmp->prefer_cpus);
+#endif
 		free_cpumask_var(tmp->new_cpus);
 		free_cpumask_var(tmp->addmask);
 		free_cpumask_var(tmp->delmask);
@@ -662,6 +691,9 @@ static struct cpuset *alloc_trial_cpuset(struct cpuset *cs)
 		return NULL;
 	}
 
+#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
+	cpumask_copy(trial->prefer_cpus, cs->prefer_cpus);
+#endif
 	cpumask_copy(trial->cpus_allowed, cs->cpus_allowed);
 	cpumask_copy(trial->effective_cpus, cs->effective_cpus);
 	return trial;
@@ -743,6 +775,12 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial)
 	if (cur == &top_cpuset)
 		goto out;
 
+#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
+	ret = -EINVAL;
+	if (!cpumask_subset(cur->prefer_cpus, trial->cpus_allowed))
+		goto out;
+#endif
+
 	par = parent_cs(cur);
 
 	/*
@@ -791,6 +829,66 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial)
 	return ret;
 }
 
+#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
+static cpumask_var_t prefer_cpus_attach;
+/* Push cs->prefer_cpus to every task in @cs; per-task errors are ignored. */
+static void update_tasks_prefer_cpumask(struct cpuset *cs)
+{
+	struct css_task_iter it;
+	struct task_struct *task;
+
+	css_task_iter_start(&cs->css, 0, &it);
+	while ((task = css_task_iter_next(&it)))
+		set_prefer_cpus_ptr(task, cs->prefer_cpus);
+	css_task_iter_end(&it);
+}
+
+/*
+ * update_prefer_cpumask - set a cpuset's prefer_cpus and push it to its tasks
+ * @cs: the cpuset to consider
+ * @trialcs: trial cpuset; its prefer_cpus receives the parsed mask
+ * @buf: buffer of cpu numbers written to this cpuset
+ * Return: 0 if applied/unchanged; -EACCES (top cpuset), -EINVAL or parse error
+ */
+static int update_prefer_cpumask(struct cpuset *cs, struct cpuset *trialcs,
+				 const char *buf)
+{
+	int retval;
+
+	if (cs == &top_cpuset)
+		return -EACCES;
+
+	/*
+	 * An empty prefer_cpus is OK: it means dynamic affinity is disabled
+	 * for the tasks in this cpuset.
+	 * Since cpulist_parse() fails on an empty mask, we special case
+	 * that parsing.
+	 */
+	if (!*buf) {
+		cpumask_clear(trialcs->prefer_cpus);
+	} else {
+		retval = cpulist_parse(buf, trialcs->prefer_cpus);
+		if (retval < 0)
+			return retval;
+	}
+
+	/* Nothing to do if the cpus didn't change */
+	if (cpumask_equal(cs->prefer_cpus, trialcs->prefer_cpus))
+		return 0;
+
+	if (!cpumask_subset(trialcs->prefer_cpus, cs->cpus_allowed))
+		return -EINVAL;
+
+	update_tasks_prefer_cpumask(trialcs);	/* tasks first, then cs below */
+
+	spin_lock_irq(&callback_lock);
+	cpumask_copy(cs->prefer_cpus, trialcs->prefer_cpus);
+	spin_unlock_irq(&callback_lock);
+
+	return 0;
+}
+#endif
+
 #ifdef CONFIG_SMP
 /*
  * Helper routine for generate_sched_domains().
@@ -2655,6 +2753,10 @@ static void cpuset_attach_task(struct cpuset *cs, struct task_struct *task)
 	 * fail.  TODO: have a better way to handle failure here
 	 */
 	WARN_ON_ONCE(set_cpus_allowed_ptr(task, cpus_attach));
+#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
+	cpumask_copy(prefer_cpus_attach, cs->prefer_cpus);
+	set_prefer_cpus_ptr(task, prefer_cpus_attach);
+#endif
 
 	cpuset_change_task_nodemask(task, &cpuset_attach_nodemask_to);
 	cpuset_update_task_spread_flags(cs, task);
@@ -2762,6 +2864,9 @@ typedef enum {
 	FILE_MEMORY_PRESSURE,
 	FILE_SPREAD_PAGE,
 	FILE_SPREAD_SLAB,
+#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
+	FILE_DYNAMIC_CPULIST,
+#endif
 } cpuset_filetype_t;
 
 static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft,
@@ -2892,6 +2997,11 @@ static ssize_t cpuset_write_resmask(struct kernfs_open_file *of,
 	case FILE_MEMLIST:
 		retval = update_nodemask(cs, trialcs, buf);
 		break;
+#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
+	case FILE_DYNAMIC_CPULIST:
+		retval = update_prefer_cpumask(cs, trialcs, buf);
+		break;
+#endif
 	default:
 		retval = -EINVAL;
 		break;
@@ -2939,6 +3049,11 @@ static int cpuset_common_seq_show(struct seq_file *sf, void *v)
 	case FILE_SUBPARTS_CPULIST:
 		seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->subparts_cpus));
 		break;
+#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
+	case FILE_DYNAMIC_CPULIST:
+		seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->prefer_cpus));
+		break;
+#endif
 	default:
 		ret = -EINVAL;
 	}
@@ -3161,7 +3276,15 @@ static struct cftype legacy_files[] = {
 		.write_u64 = cpuset_write_u64,
 		.private = FILE_MEMORY_PRESSURE_ENABLED,
 	},
-
+#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
+	{
+		.name = "preferred_cpus",
+		.seq_show = cpuset_common_seq_show,
+		.write = cpuset_write_resmask,
+		.max_write_len = (100U + 6 * NR_CPUS),
+		.private = FILE_DYNAMIC_CPULIST,
+	},
+#endif
 	{ }	/* terminate */
 };
 
@@ -3327,6 +3450,9 @@ static int cpuset_css_online(struct cgroup_subsys_state *css)
 	cs->effective_mems = parent->mems_allowed;
 	cpumask_copy(cs->cpus_allowed, parent->cpus_allowed);
 	cpumask_copy(cs->effective_cpus, parent->cpus_allowed);
+#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
+	cpumask_copy(cs->prefer_cpus, parent->prefer_cpus);
+#endif
 	spin_unlock_irq(&callback_lock);
 out_unlock:
 	mutex_unlock(&cpuset_mutex);
@@ -3480,6 +3606,9 @@ static void cpuset_fork(struct task_struct *task)
 			return;
 
 		set_cpus_allowed_ptr(task, current->cpus_ptr);
+#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
+		set_prefer_cpus_ptr(task, current->prefer_cpus);
+#endif
 		task->mems_allowed = current->mems_allowed;
 		return;
 	}
@@ -3526,17 +3655,26 @@ int __init cpuset_init(void)
 	BUG_ON(!alloc_cpumask_var(&top_cpuset.cpus_allowed, GFP_KERNEL));
 	BUG_ON(!alloc_cpumask_var(&top_cpuset.effective_cpus, GFP_KERNEL));
 	BUG_ON(!zalloc_cpumask_var(&top_cpuset.subparts_cpus, GFP_KERNEL));
+#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
+	BUG_ON(!alloc_cpumask_var(&top_cpuset.prefer_cpus, GFP_KERNEL));
+#endif
 
 	cpumask_setall(top_cpuset.cpus_allowed);
 	nodes_setall(top_cpuset.mems_allowed);
 	cpumask_setall(top_cpuset.effective_cpus);
 	nodes_setall(top_cpuset.effective_mems);
+#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
+	cpumask_clear(top_cpuset.prefer_cpus);
+#endif
 
 	fmeter_init(&top_cpuset.fmeter);
 	set_bit(CS_SCHED_LOAD_BALANCE, &top_cpuset.flags);
 	top_cpuset.relax_domain_level = -1;
 
 	BUG_ON(!alloc_cpumask_var(&cpus_attach, GFP_KERNEL));
+#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
+	BUG_ON(!alloc_cpumask_var(&prefer_cpus_attach, GFP_KERNEL));
+#endif
 
 	return 0;
 }
@@ -3573,6 +3711,9 @@ hotplug_update_tasks_legacy(struct cpuset *cs,
 			    struct cpumask *new_cpus, nodemask_t *new_mems,
 			    bool cpus_updated, bool mems_updated)
 {
+#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
+	cpumask_t prefer_cpus;
+#endif
 	bool is_empty;
 
 	spin_lock_irq(&callback_lock);
@@ -3591,6 +3732,13 @@ hotplug_update_tasks_legacy(struct cpuset *cs,
 	if (mems_updated && !nodes_empty(cs->mems_allowed))
 		update_tasks_nodemask(cs);
 
+#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
+	if (!cpumask_subset(cs->prefer_cpus, cs->cpus_allowed)) {
+		cpumask_and(&prefer_cpus, cs->prefer_cpus, cs->cpus_allowed);
+		cpumask_copy(cs->prefer_cpus, &prefer_cpus);
+		update_tasks_prefer_cpumask(cs);
+	}
+#endif
 	is_empty = cpumask_empty(cs->cpus_allowed) ||
 		   nodes_empty(cs->mems_allowed);
 
diff --git a/kernel/fork.c b/kernel/fork.c
index 391d81cf0943..84d829613f6e 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -625,6 +625,9 @@ void free_task(struct task_struct *tsk)
 	if (tsk->flags & PF_KTHREAD)
 		free_kthread_struct(tsk);
 	bpf_task_storage_free(tsk);
+#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
+	sched_prefer_cpus_free(tsk);
+#endif
 	free_task_struct(tsk);
 }
 EXPORT_SYMBOL(free_task);
@@ -1139,6 +1142,10 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
 	tsk->seccomp.filter = NULL;
 #endif
 
+#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
+	tsk->prefer_cpus = NULL;
+#endif
+
 	setup_thread_stack(tsk, orig);
 	clear_user_return_notifier(tsk);
 	clear_tsk_need_resched(tsk);
@@ -2357,6 +2364,12 @@ __latent_entropy struct task_struct *copy_process(
 
 	rt_mutex_init_task(p);
 
+#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
+	retval = sched_prefer_cpus_fork(p, current->prefer_cpus);
+	if (retval)
+		goto bad_fork_free;
+#endif
+
 	lockdep_assert_irqs_enabled();
 #ifdef CONFIG_PROVE_LOCKING
 	DEBUG_LOCKS_WARN_ON(!p->softirqs_enabled);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index a1c73dea1f77..a1cebed8dae8 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -11570,6 +11570,101 @@ static int __maybe_unused cpu_period_quota_parse(char *buf,
 	return 0;
 }
 
+#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
+int sched_prefer_cpus_fork(struct task_struct *p, struct cpumask *mask)
+{
+	p->prefer_cpus = kmalloc(sizeof(cpumask_t), GFP_KERNEL);
+	if (!p->prefer_cpus)
+		return -ENOMEM;
+	/* New mask starts as a copy of @mask, or empty when @mask is NULL */
+	if (mask)
+		cpumask_copy(p->prefer_cpus, mask);
+	else
+		cpumask_clear(p->prefer_cpus);
+
+	return 0;	/* buffer is released by sched_prefer_cpus_free() */
+}
+/* Release the mask allocated by sched_prefer_cpus_fork() (may be NULL). */
+void sched_prefer_cpus_free(struct task_struct *p)
+{
+	kfree(p->prefer_cpus);
+}
+/* Copy @new_mask into p->prefer_cpus; called under task_rq_lock(). */
+static void do_set_prefer_cpus(struct task_struct *p,
+				const struct cpumask *new_mask)
+{
+	struct rq *rq = task_rq(p);
+	bool queued, running;
+
+	lockdep_assert_held(&p->pi_lock);
+
+	queued = task_on_rq_queued(p);
+	running = task_current(rq, p);
+
+	if (queued) {
+		/*
+		 * Use the rq-lock helper rather than &rq->__lock: with core
+		 * scheduling the lock actually held may be the core-wide one.
+		 */
+		lockdep_assert_rq_held(rq);
+		dequeue_task(rq, p, DEQUEUE_SAVE | DEQUEUE_NOCLOCK);
+	}
+	if (running)
+		put_prev_task(rq, p);
+
+	cpumask_copy(p->prefer_cpus, new_mask);
+
+	if (queued)
+		enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);
+	if (running)
+		set_next_task(rq, p);
+}
+
+/*
+ * Change a given task's preferred CPU mask. Returns -EINVAL if the task has
+ * no prefer_cpus buffer or @new_mask is not a subset of p->cpus_ptr, else 0.
+ *
+ * NOTE: the caller must have a valid reference to the task, the
+ * task must not exit() & deallocate itself prematurely. The
+ * call is not atomic; no spinlocks may be held.
+ */
+static int __set_prefer_cpus_ptr(struct task_struct *p,
+				 const struct cpumask *new_mask)
+{
+	struct rq_flags rf;
+	struct rq *rq;
+	int ret = 0;
+
+	if (unlikely(!p->prefer_cpus))
+		return -EINVAL;
+
+	rq = task_rq_lock(p, &rf);
+	update_rq_clock(rq);	/* dequeue/enqueue below pass *_NOCLOCK */
+
+	if (cpumask_equal(p->prefer_cpus, new_mask))
+		goto out;
+
+	if (!cpumask_subset(new_mask, p->cpus_ptr)) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	do_set_prefer_cpus(p, new_mask);
+out:
+	task_rq_unlock(rq, p, &rf);
+
+	return ret;
+}
+/* Public entry point; silently a no-op (returns 0) for non-CFS tasks. */
+int set_prefer_cpus_ptr(struct task_struct *p, const struct cpumask *new_mask)
+{
+	if (p->sched_class != &fair_sched_class)
+		return 0;
+
+	return __set_prefer_cpus_ptr(p, new_mask);
+}
+#endif
+
 #ifdef CONFIG_CFS_BANDWIDTH
 static int cpu_max_show(struct seq_file *sf, void *v)
 {
-- 
Gitee


From 1c5626e158ba21a4caafe45027d1dea34298b107 Mon Sep 17 00:00:00 2001
From: Hui Tang <tanghui20@huawei.com>
Date: Tue, 19 Dec 2023 07:56:40 +0000
Subject: [PATCH 3/9] sched: Adjust wakeup cpu range according to CPU util
 dynamically

hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I8LL9S

--------------------------------

Compare the taskgroup's 'util_avg' on its preferred cpus with the capacity
of those cpus, and dynamically adjust the cpu range used when waking up a task.

Signed-off-by: Hui Tang <tanghui20@huawei.com>
---
 include/linux/sched.h        |   1 +
 include/linux/sched/sysctl.h |   3 +
 kernel/sched/fair.c          | 141 +++++++++++++++++++++++++++++++++++
 kernel/sched/features.h      |   7 ++
 kernel/sysctl.c              |  11 +++
 5 files changed, 163 insertions(+)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 9ae33ae2b6e9..de8f02515715 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1539,6 +1539,7 @@ struct task_struct {
 
 #ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
 	cpumask_t			*prefer_cpus;
+	const cpumask_t			*select_cpus;
 #endif
 
 	/*
diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h
index 5a64582b086b..ede157a678f8 100644
--- a/include/linux/sched/sysctl.h
+++ b/include/linux/sched/sysctl.h
@@ -29,4 +29,7 @@ extern int sysctl_numa_balancing_mode;
 #define sysctl_numa_balancing_mode	0
 #endif
 
+#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
+extern int sysctl_sched_util_low_pct;
+#endif
 #endif /* _LINUX_SCHED_SYSCTL_H */
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 0de55884f9da..1757145bd6fd 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -7086,7 +7086,11 @@ find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this
 		return cpumask_first(sched_group_span(group));
 
 	/* Traverse only the allowed CPUs */
+#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
+	for_each_cpu_and(i, sched_group_span(group), p->select_cpus) {
+#else
 	for_each_cpu_and(i, sched_group_span(group), p->cpus_ptr) {
+#endif
 		struct rq *rq = cpu_rq(i);
 
 		if (!sched_core_cookie_match(rq, p))
@@ -7133,7 +7137,11 @@ static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p
 {
 	int new_cpu = cpu;
 
+#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
+	if (!cpumask_intersects(sched_domain_span(sd), p->select_cpus))
+#else
 	if (!cpumask_intersects(sched_domain_span(sd), p->cpus_ptr))
+#endif
 		return prev_cpu;
 
 	/*
@@ -7257,7 +7265,11 @@ static int select_idle_core(struct task_struct *p, int core, struct cpumask *cpu
 		if (!available_idle_cpu(cpu)) {
 			idle = false;
 			if (*idle_cpu == -1) {
+#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
+				if (sched_idle_cpu(cpu) && cpumask_test_cpu(cpu, p->select_cpus)) {
+#else
 				if (sched_idle_cpu(cpu) && cpumask_test_cpu(cpu, p->cpus_ptr)) {
+#endif
 					*idle_cpu = cpu;
 					break;
 				}
@@ -7283,7 +7295,11 @@ static int select_idle_smt(struct task_struct *p, int target)
 {
 	int cpu;
 
+#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
+	for_each_cpu_and(cpu, cpu_smt_mask(target), p->select_cpus) {
+#else
 	for_each_cpu_and(cpu, cpu_smt_mask(target), p->cpus_ptr) {
+#endif
 		if (cpu == target)
 			continue;
 		if (available_idle_cpu(cpu) || sched_idle_cpu(cpu))
@@ -7331,7 +7347,11 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool
 	struct sched_domain *this_sd = NULL;
 	u64 time = 0;
 
+#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
+	cpumask_and(cpus, sched_domain_span(sd), p->select_cpus);
+#else
 	cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr);
+#endif
 
 	if (sched_feat(SIS_PROP) && !has_idle_core) {
 		u64 avg_cost, avg_idle, span_avg;
@@ -7504,6 +7524,9 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
 	lockdep_assert_irqs_disabled();
 
 	if ((available_idle_cpu(target) || sched_idle_cpu(target)) &&
+#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
+	    cpumask_test_cpu(target, p->select_cpus) &&
+#endif
 	    asym_fits_cpu(task_util, util_min, util_max, target))
 		return target;
 
@@ -7512,6 +7535,9 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
 	 */
 	if (prev != target && cpus_share_cache(prev, target) &&
 	    (available_idle_cpu(prev) || sched_idle_cpu(prev)) &&
+#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
+	    cpumask_test_cpu(prev, p->select_cpus) &&
+#endif
 	    asym_fits_cpu(task_util, util_min, util_max, prev))
 		return prev;
 
@@ -7538,7 +7564,11 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
 	    recent_used_cpu != target &&
 	    cpus_share_cache(recent_used_cpu, target) &&
 	    (available_idle_cpu(recent_used_cpu) || sched_idle_cpu(recent_used_cpu)) &&
+#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
+	    cpumask_test_cpu(recent_used_cpu, p->select_cpus) &&
+#else
 	    cpumask_test_cpu(recent_used_cpu, p->cpus_ptr) &&
+#endif
 	    asym_fits_cpu(task_util, util_min, util_max, recent_used_cpu)) {
 		return recent_used_cpu;
 	}
@@ -8073,6 +8103,90 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
 	return target;
 }
 
+#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
+/*
+ * Low utilization threshold for the preferred CPUs
+ *
+ * (default: 85%, units: percentage of the preferred CPUs' capacity)
+ */
+int sysctl_sched_util_low_pct = 85;
+/* True if p->prefer_cpus is a non-empty, proper subset of p->cpus_ptr. */
+static inline bool prefer_cpus_valid(struct task_struct *p)
+{
+	return p->prefer_cpus &&
+	       !cpumask_empty(p->prefer_cpus) &&
+	       !cpumask_equal(p->prefer_cpus, p->cpus_ptr) &&
+	       cpumask_subset(p->prefer_cpus, p->cpus_ptr);
+}
+/* util_avg of @tg's entity on @cpu, falling back to cpu_util_cfs(@cpu). */
+static inline unsigned long taskgroup_cpu_util(struct task_group *tg,
+					       int cpu)
+{
+#ifdef CONFIG_FAIR_GROUP_SCHED
+	if (tg->se[cpu] && sched_feat(DA_UTIL_TASKGROUP))
+		return tg->se[cpu]->avg.util_avg;
+#endif
+	return cpu_util_cfs(cpu);
+}
+
+/*
+ * set_task_select_cpus: select the cpu range for task
+ * @p: the task whose available cpu range is to be set
+ * @idlest_cpu: if non-NULL, set to the idlest cpu found in prefer_cpus
+ * @sd_flag: balance flags of the wakeup (wake_flags & 0xF in the caller)
+ *
+ * If any preferred cpu is idle, or the sum of 'util_avg' over 'prefer_cpus'
+ * is no higher than 'sysctl_sched_util_low_pct' percent of their capacity,
+ * select 'prefer_cpus' for the task, otherwise select 'cpus_ptr'.
+ *
+ * The chosen range is stored in p->select_cpus. @idlest_cpu is the fallback
+ * wakeup cpu used when the cpu picked elsewhere is not in p->select_cpus.
+ */
+static void set_task_select_cpus(struct task_struct *p, int *idlest_cpu,
+				 int sd_flag)
+{
+	unsigned long util_avg_sum = 0;
+	unsigned long tg_capacity = 0;
+	long min_util = INT_MIN;	/* largest spare capacity seen so far */
+	struct task_group *tg;
+	long spare;
+	int cpu;
+
+	p->select_cpus = p->cpus_ptr;
+	if (!prefer_cpus_valid(p))
+		return;
+
+	rcu_read_lock();
+	tg = task_group(p);
+	for_each_cpu(cpu, p->prefer_cpus) {
+		if (idlest_cpu && available_idle_cpu(cpu)) {
+			*idlest_cpu = cpu;
+		} else if (idlest_cpu) {
+			spare = (long)(capacity_of(cpu) -
+				taskgroup_cpu_util(tg, cpu));
+			if (spare > min_util) {
+				min_util = spare;
+				*idlest_cpu = cpu;
+			}
+		}
+
+		if (available_idle_cpu(cpu)) {
+			rcu_read_unlock();
+			p->select_cpus = p->prefer_cpus;
+			return;
+		}
+
+		util_avg_sum += taskgroup_cpu_util(tg, cpu);
+		tg_capacity += capacity_of(cpu);
+	}
+	rcu_read_unlock();
+
+	if (tg_capacity > cpumask_weight(p->prefer_cpus) &&
+	    util_avg_sum * 100 <= tg_capacity * sysctl_sched_util_low_pct)
+		p->select_cpus = p->prefer_cpus;
+}
+#endif
+
 /*
  * select_task_rq_fair: Select target runqueue for the waking task in domains
  * that have the relevant SD flag set. In practice, this is SD_BALANCE_WAKE,
@@ -8093,11 +8207,19 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags)
 	int want_affine = 0;
 	/* SD_flags and WF_flags share the first nibble */
 	int sd_flag = wake_flags & 0xF;
+#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
+	int idlest_cpu = 0;
+#endif
 
 	/*
 	 * required for stable ->cpus_allowed
 	 */
 	lockdep_assert_held(&p->pi_lock);
+
+#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
+	set_task_select_cpus(p, &idlest_cpu, sd_flag);
+#endif
+
 	if (wake_flags & WF_TTWU) {
 		record_wakee(p);
 
@@ -8112,7 +8234,11 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags)
 			new_cpu = prev_cpu;
 		}
 
+#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
+		want_affine = !wake_wide(p) && cpumask_test_cpu(cpu, p->select_cpus);
+#else
 		want_affine = !wake_wide(p) && cpumask_test_cpu(cpu, p->cpus_ptr);
+#endif
 	}
 
 	rcu_read_lock();
@@ -8123,7 +8249,13 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags)
 		 */
 		if (want_affine && (tmp->flags & SD_WAKE_AFFINE) &&
 		    cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) {
+#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
+			new_cpu = cpu;
+			if (cpu != prev_cpu &&
+			    cpumask_test_cpu(prev_cpu, p->select_cpus))
+#else
 			if (cpu != prev_cpu)
+#endif
 				new_cpu = wake_affine(tmp, p, cpu, prev_cpu, sync);
 
 			sd = NULL; /* Prefer wake_affine over balance flags */
@@ -8150,6 +8282,10 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags)
 	}
 	rcu_read_unlock();
 
+#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
+	if (!cpumask_test_cpu(new_cpu, p->select_cpus))
+		new_cpu = idlest_cpu;
+#endif
 	return new_cpu;
 }
 
@@ -10573,8 +10709,13 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
 		int local_group;
 
 		/* Skip over this group if it has no CPUs allowed */
+#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
+		if (!cpumask_intersects(sched_group_span(group),
+					p->select_cpus))
+#else
 		if (!cpumask_intersects(sched_group_span(group),
 					p->cpus_ptr))
+#endif
 			continue;
 
 		/* Skip over this group if no cookie matched */
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index f770168230ae..4dd46de2f827 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -89,3 +89,10 @@ SCHED_FEAT(UTIL_EST_FASTUP, true)
 SCHED_FEAT(LATENCY_WARN, false)
 
 SCHED_FEAT(HZ_BW, true)
+
+#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
+/*
+ * Use the util_avg of the task's own taskgroup instead of cpu_util_cfs()
+ */
+SCHED_FEAT(DA_UTIL_TASKGROUP, true)
+#endif
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 354a2d294f52..80a4a5254209 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -2042,6 +2042,17 @@ static struct ctl_table kern_table[] = {
 		.extra1		= SYSCTL_ONE,
 		.extra2		= SYSCTL_INT_MAX,
 	},
+#endif
+#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
+	{
+		.procname       = "sched_util_low_pct",
+		.data           = &sysctl_sched_util_low_pct,
+		.maxlen         = sizeof(sysctl_sched_util_low_pct),
+		.mode           = 0644,
+		.proc_handler   = proc_dointvec_minmax,
+		.extra1         = SYSCTL_ZERO,
+		.extra2		= SYSCTL_ONE_HUNDRED,
+	},
 #endif
 	{ }
 };
-- 
Gitee


From 989744803debbff91382e5ff566bf6a1f4c048a1 Mon Sep 17 00:00:00 2001
From: Hui Tang <tanghui20@huawei.com>
Date: Tue, 19 Dec 2023 07:56:41 +0000
Subject: [PATCH 4/9] sched: Adjust cpu allowed in load balance dynamically

hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I8LL9S

--------------------------------

Do not allow load balancing to migrate a task to a cpu outside its
currently selected cpu range.

Signed-off-by: Hui Tang <tanghui20@huawei.com>
---
 kernel/sched/fair.c | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 1757145bd6fd..d48ceabeb1ff 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -9302,7 +9302,12 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
 	if (kthread_is_per_cpu(p))
 		return 0;
 
+#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
+	set_task_select_cpus(p, NULL, 0);
+	if (!cpumask_test_cpu(env->dst_cpu, p->select_cpus)) {
+#else
 	if (!cpumask_test_cpu(env->dst_cpu, p->cpus_ptr)) {
+#endif
 		int cpu;
 
 		schedstat_inc(p->stats.nr_failed_migrations_affine);
@@ -9325,7 +9330,11 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
 
 		/* Prevent to re-select dst_cpu via env's CPUs: */
 		for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) {
+#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
+			if (cpumask_test_cpu(cpu, p->select_cpus)) {
+#else
 			if (cpumask_test_cpu(cpu, p->cpus_ptr)) {
+#endif
 				env->flags |= LBF_DST_PINNED;
 				env->new_dst_cpu = cpu;
 				break;
-- 
Gitee


From 2db0a0d7452ca644b6d6e0a37a2a186ed57478a8 Mon Sep 17 00:00:00 2001
From: Hui Tang <tanghui20@huawei.com>
Date: Tue, 19 Dec 2023 07:56:42 +0000
Subject: [PATCH 5/9] sched: Add statistics for scheduler dynamic affinity

hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I8LL9S

--------------------------------

Signed-off-by: Hui Tang <tanghui20@huawei.com>
---
 include/linux/sched.h |  5 +++++
 kernel/sched/debug.c  |  4 ++++
 kernel/sched/fair.c   | 11 +++++++++--
 3 files changed, 18 insertions(+), 2 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index de8f02515715..479ee3cece5d 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -543,6 +543,11 @@ struct sched_statistics {
 #ifdef CONFIG_SCHED_CORE
 	u64				core_forceidle_sum;
 #endif
+
+#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
+	u64				nr_wakeups_preferred_cpus;
+	u64				nr_wakeups_force_preferred_cpus;
+#endif
 #endif /* CONFIG_SCHEDSTATS */
 } ____cacheline_aligned;
 
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 4c3d0d9f3db6..1fe9aefc7baf 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -1039,6 +1039,10 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns,
 		P_SCHEDSTAT(nr_wakeups_affine_attempts);
 		P_SCHEDSTAT(nr_wakeups_passive);
 		P_SCHEDSTAT(nr_wakeups_idle);
+#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
+		P_SCHEDSTAT(nr_wakeups_preferred_cpus);
+		P_SCHEDSTAT(nr_wakeups_force_preferred_cpus);
+#endif
 
 		avg_atom = p->se.sum_exec_runtime;
 		if (nr_switches)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index d48ceabeb1ff..e9f87810d95a 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -8173,6 +8173,8 @@ static void set_task_select_cpus(struct task_struct *p, int *idlest_cpu,
 		if (available_idle_cpu(cpu)) {
 			rcu_read_unlock();
 			p->select_cpus = p->prefer_cpus;
+			if (sd_flag & SD_BALANCE_WAKE)
+				schedstat_inc(p->stats.nr_wakeups_preferred_cpus);
 			return;
 		}
 
@@ -8182,8 +8184,11 @@ static void set_task_select_cpus(struct task_struct *p, int *idlest_cpu,
 	rcu_read_unlock();
 
 	if (tg_capacity > cpumask_weight(p->prefer_cpus) &&
-	    util_avg_sum * 100 <= tg_capacity * sysctl_sched_util_low_pct)
+	    util_avg_sum * 100 <= tg_capacity * sysctl_sched_util_low_pct) {
 		p->select_cpus = p->prefer_cpus;
+		if (sd_flag & SD_BALANCE_WAKE)
+			schedstat_inc(p->stats.nr_wakeups_preferred_cpus);
+	}
 }
 #endif
 
@@ -8283,8 +8288,10 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags)
 	rcu_read_unlock();
 
 #ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
-	if (!cpumask_test_cpu(new_cpu, p->select_cpus))
+	if (!cpumask_test_cpu(new_cpu, p->select_cpus)) {
 		new_cpu = idlest_cpu;
+		schedstat_inc(p->stats.nr_wakeups_force_preferred_cpus);
+	}
 #endif
 	return new_cpu;
 }
-- 
Gitee


From f9ed9e2802c39477704f7089a1b6e663c81bacd4 Mon Sep 17 00:00:00 2001
From: Hui Tang <tanghui20@huawei.com>
Date: Tue, 19 Dec 2023 07:56:43 +0000
Subject: [PATCH 6/9] sched: Add cmdline for dynamic affinity

hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I8LL9S

--------------------------------

Add the 'dynamic_affinity' kernel command-line parameter to enable the
dynamic affinity feature at boot. The feature is disabled by default.

Signed-off-by: Hui Tang <tanghui20@huawei.com>
---
 include/linux/sched.h  |  6 ++++++
 kernel/cgroup/cpuset.c |  3 +++
 kernel/fork.c          | 11 +++++++----
 kernel/sched/core.c    |  3 +++
 kernel/sched/debug.c   |  6 ++++--
 kernel/sched/fair.c    | 13 +++++++++++++
 6 files changed, 36 insertions(+), 6 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 479ee3cece5d..fe8556ff7fb3 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2484,5 +2484,11 @@ int set_prefer_cpus_ptr(struct task_struct *p,
 			const struct cpumask *new_mask);
 int sched_prefer_cpus_fork(struct task_struct *p, struct cpumask *mask);
 void sched_prefer_cpus_free(struct task_struct *p);
+
+extern struct static_key_false __dynamic_affinity_switch;
+static inline bool dynamic_affinity_enabled(void)
+{
+	return static_branch_unlikely(&__dynamic_affinity_switch);
+}
 #endif
 #endif
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index 01f4ff02e7b2..cfdca8aeabda 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -858,6 +858,9 @@ static int update_prefer_cpumask(struct cpuset *cs, struct cpuset *trialcs,
 	if (cs == &top_cpuset)
 		return -EACCES;
 
+	if (!dynamic_affinity_enabled())
+		return -EPERM;
+
 	/*
 	 * An empty prefer_cpus is ok which mean that the cpuset tasks disable
 	 * dynamic affinity feature.
diff --git a/kernel/fork.c b/kernel/fork.c
index 84d829613f6e..a1cd8930c3e1 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -626,7 +626,8 @@ void free_task(struct task_struct *tsk)
 		free_kthread_struct(tsk);
 	bpf_task_storage_free(tsk);
 #ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
-	sched_prefer_cpus_free(tsk);
+	if (dynamic_affinity_enabled())
+		sched_prefer_cpus_free(tsk);
 #endif
 	free_task_struct(tsk);
 }
@@ -2365,9 +2366,11 @@ __latent_entropy struct task_struct *copy_process(
 	rt_mutex_init_task(p);
 
 #ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
-	retval = sched_prefer_cpus_fork(p, current->prefer_cpus);
-	if (retval)
-		goto bad_fork_free;
+	if (dynamic_affinity_enabled()) {
+		retval = sched_prefer_cpus_fork(p, current->prefer_cpus);
+		if (retval)
+			goto bad_fork_free;
+	}
 #endif
 
 	lockdep_assert_irqs_enabled();
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index a1cebed8dae8..58c274b655ab 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -11635,6 +11635,9 @@ static int __set_prefer_cpus_ptr(struct task_struct *p,
 	struct rq *rq;
 	int ret = 0;
 
+	if (!dynamic_affinity_enabled())
+		return -EPERM;
+
 	if (unlikely(!p->prefer_cpus))
 		return -EINVAL;
 
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 1fe9aefc7baf..eee2d05dc90a 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -1040,8 +1040,10 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns,
 		P_SCHEDSTAT(nr_wakeups_passive);
 		P_SCHEDSTAT(nr_wakeups_idle);
 #ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
-		P_SCHEDSTAT(nr_wakeups_preferred_cpus);
-		P_SCHEDSTAT(nr_wakeups_force_preferred_cpus);
+		if (dynamic_affinity_enabled()) {
+			P_SCHEDSTAT(nr_wakeups_preferred_cpus);
+			P_SCHEDSTAT(nr_wakeups_force_preferred_cpus);
+		}
 #endif
 
 		avg_atom = p->se.sum_exec_runtime;
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index e9f87810d95a..ae5f00dee274 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -8104,6 +8104,16 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
 }
 
 #ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
+
+DEFINE_STATIC_KEY_FALSE(__dynamic_affinity_switch);
+
+static int __init dynamic_affinity_switch_setup(char *__unused)
+{
+	static_branch_enable(&__dynamic_affinity_switch);
+	return 1;
+}
+__setup("dynamic_affinity", dynamic_affinity_switch_setup);
+
 /*
  * Low utilization threshold for CPU
  *
@@ -8113,6 +8123,9 @@ int sysctl_sched_util_low_pct = 85;
 
 static inline bool prefer_cpus_valid(struct task_struct *p)
 {
+	if (!dynamic_affinity_enabled())
+		return false;
+
 	return p->prefer_cpus &&
 	       !cpumask_empty(p->prefer_cpus) &&
 	       !cpumask_equal(p->prefer_cpus, p->cpus_ptr) &&
-- 
Gitee


From de818e491ad5a507bc569fea8024eeb772f61684 Mon Sep 17 00:00:00 2001
From: Hui Tang <tanghui20@huawei.com>
Date: Tue, 19 Dec 2023 07:56:44 +0000
Subject: [PATCH 7/9] config: enable CONFIG_QOS_SCHED_DYNAMIC_AFFINITY by
 default

hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I8LL9S

--------------------------------

Signed-off-by: Hui Tang <tanghui20@huawei.com>
---
 arch/arm64/configs/openeuler_defconfig | 1 +
 arch/x86/configs/openeuler_defconfig   | 1 +
 2 files changed, 2 insertions(+)

diff --git a/arch/arm64/configs/openeuler_defconfig b/arch/arm64/configs/openeuler_defconfig
index 33ba39711884..47160d1b2abf 100644
--- a/arch/arm64/configs/openeuler_defconfig
+++ b/arch/arm64/configs/openeuler_defconfig
@@ -167,6 +167,7 @@ CONFIG_FAIR_GROUP_SCHED=y
 CONFIG_CFS_BANDWIDTH=y
 CONFIG_RT_GROUP_SCHED=y
 CONFIG_SCHED_MM_CID=y
+CONFIG_QOS_SCHED_DYNAMIC_AFFINITY=y
 CONFIG_CGROUP_PIDS=y
 CONFIG_CGROUP_RDMA=y
 CONFIG_CGROUP_FREEZER=y
diff --git a/arch/x86/configs/openeuler_defconfig b/arch/x86/configs/openeuler_defconfig
index 44040b835333..a76cc8fa6fcb 100644
--- a/arch/x86/configs/openeuler_defconfig
+++ b/arch/x86/configs/openeuler_defconfig
@@ -189,6 +189,7 @@ CONFIG_FAIR_GROUP_SCHED=y
 CONFIG_CFS_BANDWIDTH=y
 CONFIG_RT_GROUP_SCHED=y
 CONFIG_SCHED_MM_CID=y
+CONFIG_QOS_SCHED_DYNAMIC_AFFINITY=y
 CONFIG_CGROUP_PIDS=y
 CONFIG_CGROUP_RDMA=y
 CONFIG_CGROUP_FREEZER=y
-- 
Gitee


From 7ff95c5d46fa363e689b8b62f6e29551eee6d077 Mon Sep 17 00:00:00 2001
From: zhangwei123171 <zhangwei123171@jd.com>
Date: Tue, 19 Dec 2023 07:56:45 +0000
Subject: [PATCH 8/9] sched/fair: Remove invalid cpu selection logic in dynamic
 affinity

jingdong inclusion
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I8PGQ2

--------------------------------

The CPU selected by select_task_rq_fair() may not be in the
corresponding cpuset. This is corrected in the subsequent fallback
process.

Dynamic affinity should not break this logic. Initialize idlest_cpu to
-1 and only override the selected CPU when an idlest CPU was actually
found.

Fixes: f6cee1481527 ("sched: Adjust wakeup cpu range according CPU util dynamicly")
Signed-off-by: zhangwei123171 <zhangwei123171@jd.com>
Signed-off-by: Hui Tang <tanghui20@huawei.com>
---
 kernel/sched/fair.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index ae5f00dee274..c46504c96cfb 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -8226,7 +8226,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags)
 	/* SD_flags and WF_flags share the first nibble */
 	int sd_flag = wake_flags & 0xF;
 #ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
-	int idlest_cpu = 0;
+	int idlest_cpu = -1;
 #endif
 
 	/*
@@ -8301,7 +8301,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags)
 	rcu_read_unlock();
 
 #ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
-	if (!cpumask_test_cpu(new_cpu, p->select_cpus)) {
+	if (idlest_cpu != -1 && !cpumask_test_cpu(new_cpu, p->select_cpus)) {
 		new_cpu = idlest_cpu;
 		schedstat_inc(p->stats.nr_wakeups_force_preferred_cpus);
 	}
-- 
Gitee


From f504d191ac6c5726617598abf544bf4a1e9d0107 Mon Sep 17 00:00:00 2001
From: zhangwei123171 <zhangwei123171@jd.com>
Date: Tue, 19 Dec 2023 07:56:46 +0000
Subject: [PATCH 9/9] sched/fair: Modify idle cpu judgment in dynamic affinity

jingdong inclusion
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I8PGQ3

--------------------------------

In a co-location scenario, when an online container uses the dynamic
affinity capability, a CPU that is fully occupied by offline tasks
should also be treated as an idle CPU, which is friendlier to online
tasks. Check sched_idle_cpu() in addition to available_idle_cpu().

Fixes: f6cee1481527 ("sched: Adjust wakeup cpu range according CPU util dynamicly")
Signed-off-by: zhangwei123171 <zhangwei123171@jd.com>
Signed-off-by: Hui Tang <tanghui20@huawei.com>
---
 kernel/sched/fair.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index c46504c96cfb..ead7a02a145c 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -8172,7 +8172,7 @@ static void set_task_select_cpus(struct task_struct *p, int *idlest_cpu,
 	rcu_read_lock();
 	tg = task_group(p);
 	for_each_cpu(cpu, p->prefer_cpus) {
-		if (idlest_cpu && available_idle_cpu(cpu)) {
+		if (idlest_cpu && (available_idle_cpu(cpu) || sched_idle_cpu(cpu))) {
 			*idlest_cpu = cpu;
 		} else if (idlest_cpu) {
 			spare = (long)(capacity_of(cpu) -
@@ -8183,7 +8183,7 @@ static void set_task_select_cpus(struct task_struct *p, int *idlest_cpu,
 			}
 		}
 
-		if (available_idle_cpu(cpu)) {
+		if (available_idle_cpu(cpu) || sched_idle_cpu(cpu)) {
 			rcu_read_unlock();
 			p->select_cpus = p->prefer_cpus;
 			if (sd_flag & SD_BALANCE_WAKE)
-- 
Gitee