From 229f5c1f386887d8706d99cb98a8def7d2f20a08 Mon Sep 17 00:00:00 2001 From: tanghui Date: Wed, 16 Nov 2022 15:12:16 +0800 Subject: [PATCH 1/6] sched: Introduce dynamic affinity for cfs scheduler hulk inclusion category: feature bugzilla: 186575, https://gitee.com/openeuler/kernel/issues/I526XC -------------------------------- Dynamic affinity sets preferred cpus for a task. When the utilization of the taskgroup's preferred cpus is low, the task runs only on its preferred cpus to enhance cpu resource locality and reduce interference between task cgroups; otherwise the task may burst beyond the preferred cpus and use external cpus within its cpus allowed. Signed-off-by: tanghui Signed-off-by: Zheng Zucheng Reviewed-by: Zhang Qiao --- init/Kconfig | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/init/Kconfig b/init/Kconfig index 85225fd2dfe4..52115d61b88f 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1024,6 +1024,16 @@ config RT_GROUP_SCHED endif #CGROUP_SCHED +config QOS_SCHED_DYNAMIC_AFFINITY + bool "qos dynamic affinity" + depends on CPUSETS + default n + help + This feature lets you allocate preferred cpus to a taskgroup. If enabled, + the taskgroup uses only its preferred cpus while its cpu utilization is + below the configured threshold; otherwise the taskgroup falls back to all + cpus allowed. + config UCLAMP_TASK_GROUP bool "Utilization clamping per group of tasks" depends on CGROUP_SCHED -- Gitee From ebeb84ad373d442f875b7a6909e64dbac82a0e91 Mon Sep 17 00:00:00 2001 From: tanghui Date: Wed, 16 Nov 2022 15:12:17 +0800 Subject: [PATCH 2/6] cpuset: Introduce new interface for scheduler dynamic affinity hulk inclusion category: feature bugzilla: 186575, https://gitee.com/openeuler/kernel/issues/I526XC -------------------------------- Add a 'preferred_cpus' file to the cgroup cpuset and a related per-thread 'preferred_cpuset' interface in procfs. 
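A minimal userspace sketch of the intended usage (illustrative only, not part of the patch: the helper name and error handling are hypothetical, while the file path and cpulist format follow the procfs interface added below):

  /* Sketch: write a cpulist to a thread's preferred_cpuset file. */
  #include <stdio.h>

  static int set_preferred_cpuset(int tgid, int tid, const char *cpulist)
  {
          char path[64];
          FILE *fp;

          /* per-thread file registered in tid_base_stuff below */
          snprintf(path, sizeof(path),
                   "/proc/%d/task/%d/preferred_cpuset", tgid, tid);
          fp = fopen(path, "w");
          if (!fp)
                  return -1;
          /* same cpulist format as cpuset.cpus, e.g. "0-3,8" */
          if (fputs(cpulist, fp) == EOF) {
                  fclose(fp);
                  return -1;
          }
          return fclose(fp);
  }

The kernel rejects a list that is not a subset of the thread's cpus allowed, and the setting only takes effect for CFS tasks (see set_prefer_cpus_ptr below).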
Signed-off-by: tanghui Signed-off-by: Zheng Zucheng Reviewed-by: Zhang Qiao --- fs/proc/base.c | 73 ++++++++++++++++++++ include/linux/sched.h | 11 +++ init/init_task.c | 3 + kernel/cgroup/cpuset.c | 152 ++++++++++++++++++++++++++++++++++++++++- kernel/fork.c | 13 ++++ kernel/sched/core.c | 95 ++++++++++++++++++++++++++ 6 files changed, 346 insertions(+), 1 deletion(-) diff --git a/fs/proc/base.c b/fs/proc/base.c index 98bfd18e61bc..fe64234ce264 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -3251,6 +3251,76 @@ static const struct file_operations proc_setgroups_operations = { }; #endif /* CONFIG_USER_NS */ +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + +static int preferred_cpuset_show(struct seq_file *m, void *v) +{ + struct inode *inode = m->private; + struct task_struct *p; + + p = get_proc_task(inode); + if (!p) + return -ESRCH; + + if (p->prefer_cpus) + seq_printf(m, "%*pbl\n", cpumask_pr_args(p->prefer_cpus)); + else + seq_putc(m, '\n'); + + put_task_struct(p); + + return 0; +} + +static ssize_t preferred_cpuset_write(struct file *file, const char __user *buf, + size_t count, loff_t *offset) +{ + cpumask_var_t new_mask; + int retval; + struct inode *inode = file_inode(file); + struct task_struct *p; + + p = get_proc_task(inode); + if (!p) + return -ESRCH; + + if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) { + retval = -ENOMEM; + goto out_put_task; + } + + retval = cpumask_parselist_user(buf, count, new_mask); + if (retval < 0) + goto out_free_cpumask; + + retval = set_prefer_cpus_ptr(p, new_mask); + if (retval < 0) + goto out_free_cpumask; + + retval = count; + +out_free_cpumask: + free_cpumask_var(new_mask); +out_put_task: + put_task_struct(p); + + return retval; +} + +static int preferred_cpuset_open(struct inode *inode, struct file *filp) +{ + return single_open(filp, preferred_cpuset_show, inode); +} + +static const struct file_operations proc_preferred_cpuset_operations = { + .open = preferred_cpuset_open, + .write = preferred_cpuset_write, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; +#endif + static int proc_pid_personality(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *task) { @@ -3820,6 +3890,9 @@ static const struct pid_entry tid_base_stuff[] = { #ifdef CONFIG_BPF_SCHED REG("tag", 0644, proc_pid_tag_operations), #endif +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + REG("preferred_cpuset", 0644, proc_preferred_cpuset_operations), +#endif }; static int proc_tid_base_readdir(struct file *file, struct dir_context *ctx) diff --git a/include/linux/sched.h b/include/linux/sched.h index 4c6e8c5183fb..685c813b8850 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1422,7 +1422,11 @@ struct task_struct { KABI_RESERVE(6) #endif KABI_USE(7, void *pf_io_worker) +#if defined(CONFIG_QOS_SCHED_DYNAMIC_AFFINITY) && !defined(__GENKSYMS__) + KABI_USE(8, cpumask_t *prefer_cpus) +#else KABI_RESERVE(8) +#endif KABI_RESERVE(9) KABI_RESERVE(10) KABI_RESERVE(11) @@ -2206,6 +2210,13 @@ static inline int sched_qos_cpu_overload(void) } #endif +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY +int set_prefer_cpus_ptr(struct task_struct *p, + const struct cpumask *new_mask); +int sched_prefer_cpus_fork(struct task_struct *p, struct cpumask *mask); +void sched_prefer_cpus_free(struct task_struct *p); +#endif + #ifdef CONFIG_BPF_SCHED extern void sched_settag(struct task_struct *tsk, s64 tag); diff --git a/init/init_task.c b/init/init_task.c index 49418809de26..fa8838c2c203 100644 --- a/init/init_task.c +++ b/init/init_task.c @@ 
-214,6 +214,9 @@ struct task_struct init_task #ifdef CONFIG_SECURITY .security = NULL, #endif +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + .prefer_cpus = NULL, +#endif #ifdef CONFIG_SECCOMP_FILTER .seccomp = { .filter_count = ATOMIC_INIT(0) }, #endif diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 60489cc5a92a..637e01c91e36 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -107,6 +107,9 @@ struct cpuset { /* user-configured CPUs and Memory Nodes allow to tasks */ cpumask_var_t cpus_allowed; nodemask_t mems_allowed; +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + cpumask_var_t prefer_cpus; +#endif /* effective CPUs and Memory Nodes allow to tasks */ cpumask_var_t effective_cpus; @@ -193,6 +196,9 @@ struct cpuset { struct tmpmasks { cpumask_var_t addmask, delmask; /* For partition root */ cpumask_var_t new_cpus; /* For update_cpumasks_hier() */ +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + cpumask_var_t prefer_cpus; +#endif }; static inline struct cpuset *css_cs(struct cgroup_subsys_state *css) @@ -472,15 +478,24 @@ static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q) static inline int alloc_cpumasks(struct cpuset *cs, struct tmpmasks *tmp) { cpumask_var_t *pmask1, *pmask2, *pmask3; +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + cpumask_var_t *pmask4; +#endif if (cs) { pmask1 = &cs->cpus_allowed; pmask2 = &cs->effective_cpus; pmask3 = &cs->subparts_cpus; +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + pmask4 = &cs->prefer_cpus; +#endif } else { pmask1 = &tmp->new_cpus; pmask2 = &tmp->addmask; pmask3 = &tmp->delmask; +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + pmask4 = &tmp->prefer_cpus; +#endif } if (!zalloc_cpumask_var(pmask1, GFP_KERNEL)) @@ -491,9 +506,17 @@ static inline int alloc_cpumasks(struct cpuset *cs, struct tmpmasks *tmp) if (!zalloc_cpumask_var(pmask3, GFP_KERNEL)) goto free_two; +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + if (!zalloc_cpumask_var(pmask4, GFP_KERNEL)) + goto free_three; +#endif return 0; +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY +free_three: + free_cpumask_var(*pmask3); +#endif free_two: free_cpumask_var(*pmask2); free_one: @@ -509,11 +532,17 @@ static inline int alloc_cpumasks(struct cpuset *cs, struct tmpmasks *tmp) static inline void free_cpumasks(struct cpuset *cs, struct tmpmasks *tmp) { if (cs) { +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + free_cpumask_var(cs->prefer_cpus); +#endif free_cpumask_var(cs->cpus_allowed); free_cpumask_var(cs->effective_cpus); free_cpumask_var(cs->subparts_cpus); } if (tmp) { +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + free_cpumask_var(tmp->prefer_cpus); +#endif free_cpumask_var(tmp->new_cpus); free_cpumask_var(tmp->addmask); free_cpumask_var(tmp->delmask); @@ -537,6 +566,9 @@ static struct cpuset *alloc_trial_cpuset(struct cpuset *cs) return NULL; } +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + cpumask_copy(trial->prefer_cpus, cs->prefer_cpus); +#endif cpumask_copy(trial->cpus_allowed, cs->cpus_allowed); cpumask_copy(trial->effective_cpus, cs->effective_cpus); return trial; @@ -580,6 +612,11 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial) rcu_read_lock(); +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + ret = -EINVAL; + if (!cpumask_subset(cur->prefer_cpus, trial->cpus_allowed)) + goto out; +#endif /* Each of our child cpusets must be a subset of us */ ret = -EBUSY; cpuset_for_each_child(c, css, cur) @@ -644,6 +681,66 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial) return ret; } +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY +static cpumask_var_t 
prefer_cpus_attach; + +static void update_tasks_prefer_cpumask(struct cpuset *cs) +{ + struct css_task_iter it; + struct task_struct *task; + + css_task_iter_start(&cs->css, 0, &it); + while ((task = css_task_iter_next(&it))) + set_prefer_cpus_ptr(task, cs->prefer_cpus); + css_task_iter_end(&it); +} + +/* + * update_prefer_cpumask - update the prefer_cpus mask of a cpuset and + * all tasks in it + * @cs: the cpuset to consider + * @trialcs: trial cpuset + * @buf: buffer of cpu numbers written to this cpuset + */ +static int update_prefer_cpumask(struct cpuset *cs, struct cpuset *trialcs, + const char *buf) +{ + int retval; + + if (cs == &top_cpuset) + return -EACCES; + + /* + * An empty prefer_cpus is OK, which means that tasks in this cpuset + * disable the dynamic affinity feature. + * Since cpulist_parse() fails on an empty mask, we special case + * that parsing. + */ + if (!*buf) { + cpumask_clear(trialcs->prefer_cpus); + } else { + retval = cpulist_parse(buf, trialcs->prefer_cpus); + if (retval < 0) + return retval; + } + + /* Nothing to do if the cpus didn't change */ + if (cpumask_equal(cs->prefer_cpus, trialcs->prefer_cpus)) + return 0; + + if (!cpumask_subset(trialcs->prefer_cpus, cs->cpus_allowed)) + return -EINVAL; + + update_tasks_prefer_cpumask(trialcs); + + spin_lock_irq(&callback_lock); + cpumask_copy(cs->prefer_cpus, trialcs->prefer_cpus); + spin_unlock_irq(&callback_lock); + + return 0; +} +#endif + #ifdef CONFIG_SMP /* * Helper routine for generate_sched_domains(). @@ -2229,6 +2326,10 @@ static void cpuset_attach(struct cgroup_taskset *tset) else guarantee_online_cpus(cs, cpus_attach); +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + cpumask_copy(prefer_cpus_attach, cs->prefer_cpus); +#endif + guarantee_online_mems(cs, &cpuset_attach_nodemask_to); cgroup_taskset_for_each(task, css, tset) { @@ -2237,6 +2338,9 @@ static void cpuset_attach(struct cgroup_taskset *tset) * fail. 
TODO: have a better way to handle failure here */ WARN_ON_ONCE(set_cpus_allowed_ptr(task, cpus_attach)); +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + set_prefer_cpus_ptr(task, prefer_cpus_attach); +#endif cpuset_change_task_nodemask(task, &cpuset_attach_nodemask_to); cpuset_update_task_spread_flag(cs, task); @@ -2297,6 +2401,9 @@ typedef enum { FILE_MEMORY_PRESSURE, FILE_SPREAD_PAGE, FILE_SPREAD_SLAB, +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + FILE_DYNAMIC_CPULIST, +#endif } cpuset_filetype_t; static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft, @@ -2427,6 +2534,11 @@ static ssize_t cpuset_write_resmask(struct kernfs_open_file *of, case FILE_MEMLIST: retval = update_nodemask(cs, trialcs, buf); break; +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + case FILE_DYNAMIC_CPULIST: + retval = update_prefer_cpumask(cs, trialcs, buf); + break; +#endif default: retval = -EINVAL; break; @@ -2474,6 +2586,11 @@ static int cpuset_common_seq_show(struct seq_file *sf, void *v) case FILE_SUBPARTS_CPULIST: seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->subparts_cpus)); break; +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + case FILE_DYNAMIC_CPULIST: + seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->prefer_cpus)); + break; +#endif default: ret = -EINVAL; } @@ -2681,7 +2798,15 @@ static struct cftype legacy_files[] = { .write_u64 = cpuset_write_u64, .private = FILE_MEMORY_PRESSURE_ENABLED, }, - +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + { + .name = "preferred_cpus", + .seq_show = cpuset_common_seq_show, + .write = cpuset_write_resmask, + .max_write_len = (100U + 6 * NR_CPUS), + .private = FILE_DYNAMIC_CPULIST, + }, +#endif { } /* terminate */ }; @@ -2830,6 +2955,9 @@ static int cpuset_css_online(struct cgroup_subsys_state *css) cs->effective_mems = parent->mems_allowed; cpumask_copy(cs->cpus_allowed, parent->cpus_allowed); cpumask_copy(cs->effective_cpus, parent->cpus_allowed); +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + cpumask_copy(cs->prefer_cpus, parent->prefer_cpus); +#endif spin_unlock_irq(&callback_lock); out_unlock: percpu_up_write(&cpuset_rwsem); @@ -2912,6 +3040,9 @@ static void cpuset_fork(struct task_struct *task) return; set_cpus_allowed_ptr(task, current->cpus_ptr); +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + set_prefer_cpus_ptr(task, current->prefer_cpus); +#endif task->mems_allowed = current->mems_allowed; } @@ -2945,17 +3076,26 @@ int __init cpuset_init(void) BUG_ON(!alloc_cpumask_var(&top_cpuset.cpus_allowed, GFP_KERNEL)); BUG_ON(!alloc_cpumask_var(&top_cpuset.effective_cpus, GFP_KERNEL)); BUG_ON(!zalloc_cpumask_var(&top_cpuset.subparts_cpus, GFP_KERNEL)); +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + BUG_ON(!alloc_cpumask_var(&top_cpuset.prefer_cpus, GFP_KERNEL)); +#endif cpumask_setall(top_cpuset.cpus_allowed); nodes_setall(top_cpuset.mems_allowed); cpumask_setall(top_cpuset.effective_cpus); nodes_setall(top_cpuset.effective_mems); +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + cpumask_clear(top_cpuset.prefer_cpus); +#endif fmeter_init(&top_cpuset.fmeter); set_bit(CS_SCHED_LOAD_BALANCE, &top_cpuset.flags); top_cpuset.relax_domain_level = -1; BUG_ON(!alloc_cpumask_var(&cpus_attach, GFP_KERNEL)); +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + BUG_ON(!alloc_cpumask_var(&prefer_cpus_attach, GFP_KERNEL)); +#endif return 0; } @@ -2992,6 +3132,9 @@ hotplug_update_tasks_legacy(struct cpuset *cs, struct cpumask *new_cpus, nodemask_t *new_mems, bool cpus_updated, bool mems_updated) { +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + cpumask_t prefer_cpus; +#endif bool is_empty; 
spin_lock_irq(&callback_lock); @@ -3010,6 +3153,13 @@ hotplug_update_tasks_legacy(struct cpuset *cs, if (mems_updated && !nodes_empty(cs->mems_allowed)) update_tasks_nodemask(cs); +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + if (!cpumask_subset(cs->prefer_cpus, cs->cpus_allowed)) { + cpumask_and(&prefer_cpus, cs->prefer_cpus, cs->cpus_allowed); + cpumask_copy(cs->prefer_cpus, &prefer_cpus); + update_tasks_prefer_cpumask(cs); + } +#endif is_empty = cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed); diff --git a/kernel/fork.c b/kernel/fork.c index 908c4e2e7896..6592d68d98ce 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -468,6 +468,9 @@ void free_task(struct task_struct *tsk) arch_release_task_struct(tsk); if (tsk->flags & PF_KTHREAD) free_kthread_struct(tsk); +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + sched_prefer_cpus_free(tsk); +#endif free_task_struct(tsk); } EXPORT_SYMBOL(free_task); @@ -929,6 +932,10 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node) tsk->seccomp.filter = NULL; #endif +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + tsk->prefer_cpus = NULL; +#endif + setup_thread_stack(tsk, orig); clear_user_return_notifier(tsk); clear_tsk_need_resched(tsk); @@ -2047,6 +2054,12 @@ static __latent_entropy struct task_struct *copy_process( rt_mutex_init_task(p); +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + retval = sched_prefer_cpus_fork(p, current->prefer_cpus); + if (retval) + goto bad_fork_free; +#endif + lockdep_assert_irqs_enabled(); #ifdef CONFIG_PROVE_LOCKING DEBUG_LOCKS_WARN_ON(!p->softirqs_enabled); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index b67d65bfddce..ec25e19f0912 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -9763,6 +9763,101 @@ static int __maybe_unused cpu_period_quota_parse(char *buf, return 0; } +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY +int sched_prefer_cpus_fork(struct task_struct *p, struct cpumask *mask) +{ + p->prefer_cpus = kmalloc(sizeof(cpumask_t), GFP_KERNEL); + if (!p->prefer_cpus) + return -ENOMEM; + + if (mask) + cpumask_copy(p->prefer_cpus, mask); + else + cpumask_clear(p->prefer_cpus); + + return 0; +} + +void sched_prefer_cpus_free(struct task_struct *p) +{ + kfree(p->prefer_cpus); +} + +static void do_set_prefer_cpus(struct task_struct *p, + const struct cpumask *new_mask) +{ + struct rq *rq = task_rq(p); + bool queued, running; + + lockdep_assert_held(&p->pi_lock); + + queued = task_on_rq_queued(p); + running = task_current(rq, p); + + if (queued) { + /* + * Because __kthread_bind() calls this on blocked tasks without + * holding rq->lock. + */ + lockdep_assert_held(&rq->__lock); + dequeue_task(rq, p, DEQUEUE_SAVE | DEQUEUE_NOCLOCK); + } + if (running) + put_prev_task(rq, p); + + cpumask_copy(p->prefer_cpus, new_mask); + + if (queued) + enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK); + if (running) + set_next_task(rq, p); +} + +/* + * Change a given task's preferred CPU affinity. Migrating the thread to its + * preferred cpus is prioritized according to the preferred bitmask. + * + * NOTE: the caller must have a valid reference to the task, the + * task must not exit() & deallocate itself prematurely. The + * call is not atomic; no spinlocks may be held. 
+ */ +static int __set_prefer_cpus_ptr(struct task_struct *p, + const struct cpumask *new_mask, bool check) +{ + struct rq_flags rf; + struct rq *rq; + int ret = 0; + + if (unlikely(!p->prefer_cpus)) + return -EINVAL; + + rq = task_rq_lock(p, &rf); + update_rq_clock(rq); + + if (cpumask_equal(p->prefer_cpus, new_mask)) + goto out; + + if (!cpumask_subset(new_mask, p->cpus_ptr)) { + ret = -EINVAL; + goto out; + } + + do_set_prefer_cpus(p, new_mask); +out: + task_rq_unlock(rq, p, &rf); + + return ret; +} + +int set_prefer_cpus_ptr(struct task_struct *p, const struct cpumask *new_mask) +{ + if (p->sched_class != &fair_sched_class) + return 0; + + return __set_prefer_cpus_ptr(p, new_mask, false); +} +#endif + #ifdef CONFIG_CFS_BANDWIDTH static int cpu_max_show(struct seq_file *sf, void *v) { -- Gitee From 2a3bb3c0af290b87555ec338ded2ab27120feb5c Mon Sep 17 00:00:00 2001 From: tanghui Date: Wed, 16 Nov 2022 15:12:18 +0800 Subject: [PATCH 3/6] sched: Adjust wakeup cpu range according to CPU util dynamically hulk inclusion category: feature bugzilla: 186575, https://gitee.com/openeuler/kernel/issues/I526XC -------------------------------- Compare the taskgroup's 'util_avg' on the preferred cpus with the capacity of the preferred cpus, and dynamically adjust the cpu range for the task wakeup process. Signed-off-by: tanghui Signed-off-by: Zheng Zucheng Reviewed-by: Zhang Qiao --- include/linux/sched.h | 3 +- include/linux/sched/sysctl.h | 4 ++ kernel/sched/fair.c | 131 +++++++++++++++++++++++++++++++++++ kernel/sysctl.c | 11 +++ 4 files changed, 148 insertions(+), 1 deletion(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index 685c813b8850..e91f6ea282a3 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1424,10 +1424,11 @@ struct task_struct { KABI_USE(7, void *pf_io_worker) #if defined(CONFIG_QOS_SCHED_DYNAMIC_AFFINITY) && !defined(__GENKSYMS__) KABI_USE(8, cpumask_t *prefer_cpus) + KABI_USE(9, const cpumask_t *select_cpus) #else KABI_RESERVE(8) -#endif KABI_RESERVE(9) +#endif KABI_RESERVE(10) KABI_RESERVE(11) KABI_RESERVE(12) diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h index 0a25112d83b5..0acfc0cb0456 100644 --- a/include/linux/sched/sysctl.h +++ b/include/linux/sched/sysctl.h @@ -31,6 +31,10 @@ extern unsigned int sysctl_sched_min_granularity; extern unsigned int sysctl_sched_wakeup_granularity; extern unsigned int sysctl_sched_child_runs_first; +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY +extern int sysctl_sched_util_low_pct; +#endif + enum sched_tunable_scaling { SCHED_TUNABLESCALING_NONE, SCHED_TUNABLESCALING_LOG, diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index fe08832d38d8..c70a2776a569 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6092,7 +6092,11 @@ find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this return cpumask_first(sched_group_span(group)); /* Traverse only the allowed CPUs */ +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + for_each_cpu_and(i, sched_group_span(group), p->select_cpus) { +#else for_each_cpu_and(i, sched_group_span(group), p->cpus_ptr) { +#endif struct rq *rq = cpu_rq(i); if (!sched_core_cookie_match(rq, p)) @@ -6139,7 +6143,11 @@ static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p { int new_cpu = cpu; +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + if (!cpumask_intersects(sched_domain_span(sd), p->select_cpus)) +#else if (!cpumask_intersects(sched_domain_span(sd), p->cpus_ptr)) +#endif return prev_cpu; /* @@ -6266,7 +6274,11 @@ static int select_idle_core(struct task_struct 
*p, int core, struct cpumask *cpu if (!available_idle_cpu(cpu)) { idle = false; if (*idle_cpu == -1) { +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + if (sched_idle_cpu(cpu) && cpumask_test_cpu(cpu, p->select_cpus)) { +#else if (sched_idle_cpu(cpu) && cpumask_test_cpu(cpu, p->cpus_ptr)) { +#endif *idle_cpu = cpu; break; } @@ -6322,7 +6334,11 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t if (!this_sd) return -1; +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + cpumask_and(cpus, sched_domain_span(sd), p->select_cpus); +#else cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr); +#endif if (sched_feat(SIS_PROP) && !smt) { u64 avg_cost, avg_idle, span_avg; @@ -6460,6 +6476,9 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) lockdep_assert_irqs_disabled(); if ((available_idle_cpu(target) || sched_idle_cpu(target)) && +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + cpumask_test_cpu(target, p->select_cpus) && +#endif asym_fits_capacity(task_util, target)) { SET_STAT(found_idle_cpu_easy); return target; @@ -6470,6 +6489,9 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) */ if (prev != target && cpus_share_cache(prev, target) && (available_idle_cpu(prev) || sched_idle_cpu(prev)) && +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + cpumask_test_cpu(prev, p->select_cpus) && +#endif asym_fits_capacity(task_util, prev)) { SET_STAT(found_idle_cpu_easy); return prev; @@ -6498,7 +6520,11 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) recent_used_cpu != target && cpus_share_cache(recent_used_cpu, target) && (available_idle_cpu(recent_used_cpu) || sched_idle_cpu(recent_used_cpu)) && +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + cpumask_test_cpu(p->recent_used_cpu, p->select_cpus) && +#else cpumask_test_cpu(p->recent_used_cpu, p->cpus_ptr) && +#endif asym_fits_capacity(task_util, recent_used_cpu)) { /* * Replace recent_used_cpu with prev as it is a potential @@ -6921,6 +6947,82 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) return -1; } +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY +/* + * Low utilization threshold for CPU + * + * (default: 85%, units: percentage of CPU utilization) + */ +int sysctl_sched_util_low_pct = 85; + +static inline bool prefer_cpus_valid(struct task_struct *p) +{ + return p->prefer_cpus && + !cpumask_empty(p->prefer_cpus) && + !cpumask_equal(p->prefer_cpus, p->cpus_ptr) && + cpumask_subset(p->prefer_cpus, p->cpus_ptr); +} + +/* + * set_task_select_cpus: select the cpu range for a task + * @p: the task whose available cpu range will be set + * @idlest_cpu: the idlest cpu among the preferred cpus + * + * If the sum of 'util_avg' among 'prefer_cpus' is lower than + * 'sysctl_sched_util_low_pct' percent of the 'prefer_cpus' capacity, select + * the 'prefer_cpus' range for the task; otherwise select 'cpus_allowed'. + * + * The available cpu range is stored in p->select_cpus. The idlest cpu among + * the preferred cpus is stored in @idlest_cpu, which is used as the wakeup + * cpu when the fast path picks a cpu outside p->select_cpus. 
+ */ +static void set_task_select_cpus(struct task_struct *p, int *idlest_cpu, + int sd_flag) +{ + unsigned long util_avg_sum = 0; + unsigned long tg_capacity = 0; + long min_util = INT_MIN; + struct task_group *tg; + long spare; + int cpu; + + p->select_cpus = p->cpus_ptr; + if (!prefer_cpus_valid(p)) + return; + + rcu_read_lock(); + tg = task_group(p); + for_each_cpu(cpu, p->prefer_cpus) { + if (unlikely(!tg->se[cpu])) + continue; + + if (idlest_cpu && available_idle_cpu(cpu)) { + *idlest_cpu = cpu; + } else if (idlest_cpu) { + spare = (long)(capacity_of(cpu) - tg->se[cpu]->avg.util_avg); + if (spare > min_util) { + min_util = spare; + *idlest_cpu = cpu; + } + } + + if (available_idle_cpu(cpu)) { + rcu_read_unlock(); + p->select_cpus = p->prefer_cpus; + return; + } + + util_avg_sum += tg->se[cpu]->avg.util_avg; + tg_capacity += capacity_of(cpu); + } + rcu_read_unlock(); + + if (tg_capacity > cpumask_weight(p->prefer_cpus) && + util_avg_sum * 100 <= tg_capacity * sysctl_sched_util_low_pct) + p->select_cpus = p->prefer_cpus; +} +#endif + /* * select_task_rq_fair: Select target runqueue for the waking task in domains * that have the 'sd_flag' flag set. In practice, this is SD_BALANCE_WAKE, @@ -6946,6 +7048,9 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f cpumask_t *cpus; int ret; #endif +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + int idlest_cpu = 0; +#endif time = schedstat_start_time(); @@ -6953,6 +7058,11 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f * required for stable ->cpus_allowed */ lockdep_assert_held(&p->pi_lock); + +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + set_task_select_cpus(p, &idlest_cpu, sd_flag); +#endif + if (sd_flag & SD_BALANCE_WAKE) { record_wakee(p); @@ -6963,7 +7073,11 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f new_cpu = prev_cpu; } +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + want_affine = !wake_wide(p) && cpumask_test_cpu(cpu, p->select_cpus); +#else want_affine = !wake_wide(p) && cpumask_test_cpu(cpu, p->cpus_ptr); +#endif } rcu_read_lock(); @@ -7000,7 +7114,13 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f */ if (want_affine && (tmp->flags & SD_WAKE_AFFINE) && cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) { +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + new_cpu = cpu; + if (cpu != prev_cpu && + cpumask_test_cpu(prev_cpu, p->select_cpus)) +#else if (cpu != prev_cpu) +#endif new_cpu = wake_affine(tmp, p, cpu, prev_cpu, sync); sd = NULL; /* Prefer wake_affine over balance flags */ @@ -7038,6 +7158,12 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f #endif rcu_read_unlock(); + +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + if (!cpumask_test_cpu(new_cpu, p->select_cpus)) + new_cpu = idlest_cpu; +#endif + schedstat_end_time(cpu_rq(cpu), time); return new_cpu; @@ -9712,8 +9838,13 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu) int local_group; /* Skip over this group if it has no CPUs allowed */ +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + if (!cpumask_intersects(sched_group_span(group), + p->select_cpus)) +#else if (!cpumask_intersects(sched_group_span(group), p->cpus_ptr)) +#endif continue; /* Skip over this group if no cookie matched */ diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 7919119e89b2..60079815aafd 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -2728,6 +2728,17 @@ static struct ctl_table kern_table[] = { .extra1 = 
SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, +#endif +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + { + .procname = "sched_util_low_pct", + .data = &sysctl_sched_util_low_pct, + .maxlen = sizeof(sysctl_sched_util_low_pct), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = &one_hundred, + }, #endif { } }; -- Gitee From bcddff7edab1f07bb2824b9dfc7380db995ddccf Mon Sep 17 00:00:00 2001 From: tanghui Date: Wed, 16 Nov 2022 15:12:19 +0800 Subject: [PATCH 4/6] sched: Adjust cpu allowed in load balance dynamically hulk inclusion category: feature bugzilla: 186575, https://gitee.com/openeuler/kernel/issues/I526XC -------------------------------- Do not allow a task to migrate out of its preferred cpus during load balance. Signed-off-by: tanghui Signed-off-by: Zheng Zucheng Reviewed-by: Zhang Qiao --- kernel/sched/fair.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index c70a2776a569..49e2886c4c6b 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -8512,7 +8512,12 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) if (kthread_is_per_cpu(p)) return 0; +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + set_task_select_cpus(p, NULL, 0); + if (!cpumask_test_cpu(env->dst_cpu, p->select_cpus)) { +#else if (!cpumask_test_cpu(env->dst_cpu, p->cpus_ptr)) { +#endif int cpu; schedstat_inc(p->se.statistics.nr_failed_migrations_affine); @@ -8532,7 +8537,11 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) /* Prevent to re-select dst_cpu via env's CPUs: */ for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) { +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + if (cpumask_test_cpu(cpu, p->select_cpus)) { +#else if (cpumask_test_cpu(cpu, p->cpus_ptr)) { +#endif env->flags |= LBF_DST_PINNED; env->new_dst_cpu = cpu; break; -- Gitee From e287d8eb6889a8c7381bb41f52ad2af5ff256179 Mon Sep 17 00:00:00 2001 From: tanghui Date: Wed, 16 Nov 2022 15:12:20 +0800 Subject: [PATCH 5/6] sched: Add statistics for scheduler dynamic affinity hulk inclusion category: feature bugzilla: 186575, https://gitee.com/openeuler/kernel/issues/I526XC -------------------------------- Add schedstats for dynamic affinity: 'nr_wakeups_preferred_cpus' counts wakeups whose cpu range was narrowed to the preferred cpus, and 'nr_wakeups_force_preferred_cpus' counts wakeups forced back to the preferred cpus. Signed-off-by: tanghui Signed-off-by: Zheng Zucheng Reviewed-by: Zhang Qiao --- include/linux/sched.h | 6 ++++++ kernel/sched/debug.c | 4 ++++ kernel/sched/fair.c | 11 +++++++++-- 3 files changed, 19 insertions(+), 2 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index e91f6ea282a3..6abf68d8f1b8 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -468,9 +468,15 @@ struct sched_statistics { KABI_RESERVE(1) KABI_RESERVE(2) #endif + +#if defined(CONFIG_QOS_SCHED_DYNAMIC_AFFINITY) && !defined(__GENKSYMS__) + u64 nr_wakeups_preferred_cpus; + u64 nr_wakeups_force_preferred_cpus; +#else KABI_RESERVE(3) KABI_RESERVE(4) #endif +#endif }; struct sched_entity { diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index dda6e77accc2..00f01518bbdd 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -977,6 +977,10 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns, P_SCHEDSTAT(se.statistics.nr_qos_smt_expelled); #endif +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + P_SCHEDSTAT(se.statistics.nr_wakeups_preferred_cpus); + P_SCHEDSTAT(se.statistics.nr_wakeups_force_preferred_cpus); +#endif avg_atom = p->se.sum_exec_runtime; if (nr_switches) avg_atom = div64_ul(avg_atom, nr_switches); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 49e2886c4c6b..c2c1f8f6c12d 100644 --- a/kernel/sched/fair.c +++ 
b/kernel/sched/fair.c @@ -7009,6 +7009,8 @@ static void set_task_select_cpus(struct task_struct *p, int *idlest_cpu, if (available_idle_cpu(cpu)) { rcu_read_unlock(); p->select_cpus = p->prefer_cpus; + if (sd_flag & SD_BALANCE_WAKE) + schedstat_inc(p->se.statistics.nr_wakeups_preferred_cpus); return; } @@ -7018,8 +7020,11 @@ static void set_task_select_cpus(struct task_struct *p, int *idlest_cpu, rcu_read_unlock(); if (tg_capacity > cpumask_weight(p->prefer_cpus) && - util_avg_sum * 100 <= tg_capacity * sysctl_sched_util_low_pct) + util_avg_sum * 100 <= tg_capacity * sysctl_sched_util_low_pct) { p->select_cpus = p->prefer_cpus; + if (sd_flag & SD_BALANCE_WAKE) + schedstat_inc(p->se.statistics.nr_wakeups_preferred_cpus); + } } #endif @@ -7160,8 +7165,10 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f rcu_read_unlock(); #ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY - if (!cpumask_test_cpu(new_cpu, p->select_cpus)) + if (!cpumask_test_cpu(new_cpu, p->select_cpus)) { new_cpu = idlest_cpu; + schedstat_inc(p->se.statistics.nr_wakeups_force_preferred_cpus); + } #endif schedstat_end_time(cpu_rq(cpu), time); -- Gitee From 9ef80982c5d09042212210927c43592b60aa83c1 Mon Sep 17 00:00:00 2001 From: tanghui Date: Wed, 16 Nov 2022 15:12:21 +0800 Subject: [PATCH 6/6] config: enable CONFIG_QOS_SCHED_DYNAMIC_AFFINITY by default hulk inclusion category: feature bugzilla: 186575, https://gitee.com/openeuler/kernel/issues/I526XC -------------------------------- Signed-off-by: tanghui Signed-off-by: Zheng Zucheng Reviewed-by: Zhang Qiao --- arch/arm64/configs/openeuler_defconfig | 1 + arch/x86/configs/openeuler_defconfig | 1 + 2 files changed, 2 insertions(+) diff --git a/arch/arm64/configs/openeuler_defconfig b/arch/arm64/configs/openeuler_defconfig index 07c49be426c7..f6cc13c8971d 100644 --- a/arch/arm64/configs/openeuler_defconfig +++ b/arch/arm64/configs/openeuler_defconfig @@ -139,6 +139,7 @@ CONFIG_CGROUP_WRITEBACK=y CONFIG_CGROUP_V1_WRITEBACK=y CONFIG_CGROUP_SCHED=y CONFIG_QOS_SCHED=y +CONFIG_QOS_SCHED_DYNAMIC_AFFINITY=y # CONFIG_QOS_SCHED_SMT_EXPELLER is not set CONFIG_FAIR_GROUP_SCHED=y CONFIG_QOS_SCHED_PRIO_LB=y diff --git a/arch/x86/configs/openeuler_defconfig b/arch/x86/configs/openeuler_defconfig index 4c0ff69b14bc..56f5f9845f9c 100644 --- a/arch/x86/configs/openeuler_defconfig +++ b/arch/x86/configs/openeuler_defconfig @@ -158,6 +158,7 @@ CONFIG_CGROUP_WRITEBACK=y CONFIG_CGROUP_V1_WRITEBACK=y CONFIG_CGROUP_SCHED=y CONFIG_QOS_SCHED=y +CONFIG_QOS_SCHED_DYNAMIC_AFFINITY=y # CONFIG_QOS_SCHED_SMT_EXPELLER is not set CONFIG_FAIR_GROUP_SCHED=y CONFIG_QOS_SCHED_PRIO_LB=y -- Gitee
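For reference, a minimal sketch of how the knobs added by this series fit together once CONFIG_QOS_SCHED_DYNAMIC_AFFINITY=y (illustrative only: the cgroup v1 mount point and the 'demo' group name are assumptions, while 'cpuset.preferred_cpus' and the 'sched_util_low_pct' sysctl are the interfaces introduced above):

  #include <stdio.h>

  static int write_str(const char *path, const char *val)
  {
          FILE *fp = fopen(path, "w");

          if (!fp)
                  return -1;
          if (fputs(val, fp) == EOF) {
                  fclose(fp);
                  return -1;
          }
          return fclose(fp);
  }

  int main(void)
  {
          /* tasks in "demo" may run on cpus 0-7 ... */
          if (write_str("/sys/fs/cgroup/cpuset/demo/cpuset.cpus", "0-7"))
                  return 1;
          /* ... but prefer cpus 0-3 while group utilization is low */
          if (write_str("/sys/fs/cgroup/cpuset/demo/cpuset.preferred_cpus", "0-3"))
                  return 1;
          /* allow bursting beyond the preferred cpus above 85% utilization */
          return write_str("/proc/sys/kernel/sched_util_low_pct", "85") ? 1 : 0;
  }

Tasks attached to the group then wake up on cpus 0-3 while the group's utilization stays below sched_util_low_pct percent of those cpus' capacity, and spill over to cpus 4-7 otherwise.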