diff --git a/arch/arm64/configs/openeuler_defconfig b/arch/arm64/configs/openeuler_defconfig index 33ba397118848b915123f43d26e03439cda4af23..47160d1b2abf5e67fb6aaab777a3af130ec52460 100644 --- a/arch/arm64/configs/openeuler_defconfig +++ b/arch/arm64/configs/openeuler_defconfig @@ -167,6 +167,7 @@ CONFIG_FAIR_GROUP_SCHED=y CONFIG_CFS_BANDWIDTH=y CONFIG_RT_GROUP_SCHED=y CONFIG_SCHED_MM_CID=y +CONFIG_QOS_SCHED_DYNAMIC_AFFINITY=y CONFIG_CGROUP_PIDS=y CONFIG_CGROUP_RDMA=y CONFIG_CGROUP_FREEZER=y diff --git a/arch/x86/configs/openeuler_defconfig b/arch/x86/configs/openeuler_defconfig index 44040b835333c8a85fa7472f393ecc3275fcaea0..a76cc8fa6fcb7dbc04cb17ca8a57dc7021ead26c 100644 --- a/arch/x86/configs/openeuler_defconfig +++ b/arch/x86/configs/openeuler_defconfig @@ -189,6 +189,7 @@ CONFIG_FAIR_GROUP_SCHED=y CONFIG_CFS_BANDWIDTH=y CONFIG_RT_GROUP_SCHED=y CONFIG_SCHED_MM_CID=y +CONFIG_QOS_SCHED_DYNAMIC_AFFINITY=y CONFIG_CGROUP_PIDS=y CONFIG_CGROUP_RDMA=y CONFIG_CGROUP_FREEZER=y diff --git a/fs/proc/base.c b/fs/proc/base.c index ffd54617c35478e92a9f6bef67013e16e6cd3183..243c15919e1839c0ed20641832c87ad9345fee56 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -3165,6 +3165,76 @@ static const struct file_operations proc_setgroups_operations = { }; #endif /* CONFIG_USER_NS */ +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + +static int preferred_cpuset_show(struct seq_file *m, void *v) +{ + struct inode *inode = m->private; + struct task_struct *p; + + p = get_proc_task(inode); + if (!p) + return -ESRCH; + + if (p->prefer_cpus) + seq_printf(m, "%*pbl\n", cpumask_pr_args(p->prefer_cpus)); + else + seq_putc(m, '\n'); + + put_task_struct(p); + + return 0; +} + +static ssize_t preferred_cpuset_write(struct file *file, const char __user *buf, + size_t count, loff_t *offset) +{ + cpumask_var_t new_mask; + int retval; + struct inode *inode = file_inode(file); + struct task_struct *p; + + p = get_proc_task(inode); + if (!p) + return -ESRCH; + + if (!alloc_cpumask_var(&new_mask, 
GFP_KERNEL)) { + retval = -ENOMEM; + goto out_put_task; + } + + retval = cpumask_parselist_user(buf, count, new_mask); + if (retval < 0) + goto out_free_cpumask; + + retval = set_prefer_cpus_ptr(p, new_mask); + if (retval < 0) + goto out_free_cpumask; + + retval = count; + +out_free_cpumask: + free_cpumask_var(new_mask); +out_put_task: + put_task_struct(p); + + return retval; +} + +static int preferred_cpuset_open(struct inode *inode, struct file *filp) +{ + return single_open(filp, preferred_cpuset_show, inode); +} + +static const struct file_operations proc_preferred_cpuset_operations = { + .open = preferred_cpuset_open, + .write = preferred_cpuset_write, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; +#endif + static int proc_pid_personality(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *task) { @@ -3691,6 +3761,9 @@ static const struct pid_entry tid_base_stuff[] = { ONE("ksm_merging_pages", S_IRUSR, proc_pid_ksm_merging_pages), ONE("ksm_stat", S_IRUSR, proc_pid_ksm_stat), #endif +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + REG("preferred_cpuset", 0644, proc_preferred_cpuset_operations), +#endif }; static int proc_tid_base_readdir(struct file *file, struct dir_context *ctx) diff --git a/include/linux/sched.h b/include/linux/sched.h index 3520e3fbaa916670190eea018a4a6a01f78d5010..fe8556ff7fb347330d5bc36d6f9cf0a0111dca2e 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -543,6 +543,11 @@ struct sched_statistics { #ifdef CONFIG_SCHED_CORE u64 core_forceidle_sum; #endif + +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + u64 nr_wakeups_preferred_cpus; + u64 nr_wakeups_force_preferred_cpus; +#endif #endif /* CONFIG_SCHEDSTATS */ } ____cacheline_aligned; @@ -1537,6 +1542,11 @@ struct task_struct { struct user_event_mm *user_event_mm; #endif +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + cpumask_t *prefer_cpus; + const cpumask_t *select_cpus; +#endif + /* * New fields for task_struct should be added 
above here, so that * they are included in the randomized portion of task_struct. @@ -2469,4 +2479,16 @@ static inline int sched_qos_cpu_overload(void) } #endif +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY +int set_prefer_cpus_ptr(struct task_struct *p, + const struct cpumask *new_mask); +int sched_prefer_cpus_fork(struct task_struct *p, struct cpumask *mask); +void sched_prefer_cpus_free(struct task_struct *p); + +extern struct static_key_false __dynamic_affinity_switch; +static inline bool dynamic_affinity_enabled(void) +{ + return static_branch_unlikely(&__dynamic_affinity_switch); +} +#endif #endif diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h index 5a64582b086b2864c454642df8257ecde39d394a..ede157a678f8beda5691ef2866e051726b7bb165 100644 --- a/include/linux/sched/sysctl.h +++ b/include/linux/sched/sysctl.h @@ -29,4 +29,7 @@ extern int sysctl_numa_balancing_mode; #define sysctl_numa_balancing_mode 0 #endif +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY +extern int sysctl_sched_util_low_pct; +#endif #endif /* _LINUX_SCHED_SYSCTL_H */ diff --git a/init/Kconfig b/init/Kconfig index 2ee1384c4f81e09a3f66d3898aa685d91f94f711..9bae5fd6641ee27f35f9d1560f87bbf0173462e7 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1031,6 +1031,16 @@ config RT_GROUP_SCHED endif #CGROUP_SCHED +config QOS_SCHED_DYNAMIC_AFFINITY + bool "qos dynamic affinity" + depends on CPUSETS + default n + help + This feature lets you allocate preferred cpus to a taskgroup. If enabled, + the taskgroup will use only its preferred cpus while its cpu utilization + is below the configured threshold; otherwise the taskgroup falls back to + all of its allowed cpus. 
+ config SCHED_MM_CID def_bool y depends on SMP && RSEQ diff --git a/init/init_task.c b/init/init_task.c index ff6c4b9bfe6b1c01c0d99d605d5f0c22693470d3..ac0c5850f74bb4c3d569c120999c81c9231decff 100644 --- a/init/init_task.c +++ b/init/init_task.c @@ -207,6 +207,9 @@ struct task_struct init_task #ifdef CONFIG_SECURITY .security = NULL, #endif +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + .prefer_cpus = NULL, +#endif #ifdef CONFIG_SECCOMP_FILTER .seccomp = { .filter_count = ATOMIC_INIT(0) }, #endif diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 4749e0c86c62c317a1a3bc6dc992ff97d0f2b4f2..cfdca8aeabda2256ac4d2773e7d36bbbae949688 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -115,6 +115,9 @@ struct cpuset { /* user-configured CPUs and Memory Nodes allow to tasks */ cpumask_var_t cpus_allowed; nodemask_t mems_allowed; +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + cpumask_var_t prefer_cpus; +#endif /* effective CPUs and Memory Nodes allow to tasks */ cpumask_var_t effective_cpus; @@ -212,6 +215,9 @@ static inline bool is_prs_invalid(int prs_state) struct tmpmasks { cpumask_var_t addmask, delmask; /* For partition root */ cpumask_var_t new_cpus; /* For update_cpumasks_hier() */ +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + cpumask_var_t prefer_cpus; +#endif }; static inline struct cpuset *css_cs(struct cgroup_subsys_state *css) @@ -597,15 +603,24 @@ static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q) static inline int alloc_cpumasks(struct cpuset *cs, struct tmpmasks *tmp) { cpumask_var_t *pmask1, *pmask2, *pmask3; +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + cpumask_var_t *pmask4; +#endif if (cs) { pmask1 = &cs->cpus_allowed; pmask2 = &cs->effective_cpus; pmask3 = &cs->subparts_cpus; +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + pmask4 = &cs->prefer_cpus; +#endif } else { pmask1 = &tmp->new_cpus; pmask2 = &tmp->addmask; pmask3 = &tmp->delmask; +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + pmask4 = &tmp->prefer_cpus; 
+#endif } if (!zalloc_cpumask_var(pmask1, GFP_KERNEL)) @@ -616,9 +631,17 @@ static inline int alloc_cpumasks(struct cpuset *cs, struct tmpmasks *tmp) if (!zalloc_cpumask_var(pmask3, GFP_KERNEL)) goto free_two; +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + if (!zalloc_cpumask_var(pmask4, GFP_KERNEL)) + goto free_three; +#endif return 0; +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY +free_three: + free_cpumask_var(*pmask3); +#endif free_two: free_cpumask_var(*pmask2); free_one: @@ -634,11 +657,17 @@ static inline int alloc_cpumasks(struct cpuset *cs, struct tmpmasks *tmp) static inline void free_cpumasks(struct cpuset *cs, struct tmpmasks *tmp) { if (cs) { +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + free_cpumask_var(cs->prefer_cpus); +#endif free_cpumask_var(cs->cpus_allowed); free_cpumask_var(cs->effective_cpus); free_cpumask_var(cs->subparts_cpus); } if (tmp) { +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + free_cpumask_var(tmp->prefer_cpus); +#endif free_cpumask_var(tmp->new_cpus); free_cpumask_var(tmp->addmask); free_cpumask_var(tmp->delmask); @@ -662,6 +691,9 @@ static struct cpuset *alloc_trial_cpuset(struct cpuset *cs) return NULL; } +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + cpumask_copy(trial->prefer_cpus, cs->prefer_cpus); +#endif cpumask_copy(trial->cpus_allowed, cs->cpus_allowed); cpumask_copy(trial->effective_cpus, cs->effective_cpus); return trial; @@ -743,6 +775,12 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial) if (cur == &top_cpuset) goto out; +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + ret = -EINVAL; + if (!cpumask_subset(cur->prefer_cpus, trial->cpus_allowed)) + goto out; +#endif + par = parent_cs(cur); /* @@ -791,6 +829,69 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial) return ret; } +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY +static cpumask_var_t prefer_cpus_attach; + +static void update_tasks_prefer_cpumask(struct cpuset *cs) +{ + struct css_task_iter it; + struct task_struct *task; + + 
css_task_iter_start(&cs->css, 0, &it); + while ((task = css_task_iter_next(&it))) + set_prefer_cpus_ptr(task, cs->prefer_cpus); + css_task_iter_end(&it); +} + +/* + * update_prefer_cpumask - update the prefer_cpus mask of a cpuset and + * all tasks in it + * @cs: the cpuset to consider + * @trialcs: trial cpuset + * @buf: buffer of cpu numbers written to this cpuset + */ +static int update_prefer_cpumask(struct cpuset *cs, struct cpuset *trialcs, + const char *buf) +{ + int retval; + + if (cs == &top_cpuset) + return -EACCES; + + if (!dynamic_affinity_enabled()) + return -EPERM; + + /* + * An empty prefer_cpus is ok which mean that the cpuset tasks disable + * dynamic affinity feature. + * Since cpulist_parse() fails on an empty mask, we special case + * that parsing. + */ + if (!*buf) { + cpumask_clear(trialcs->prefer_cpus); + } else { + retval = cpulist_parse(buf, trialcs->prefer_cpus); + if (retval < 0) + return retval; + } + + /* Nothing to do if the cpus didn't change */ + if (cpumask_equal(cs->prefer_cpus, trialcs->prefer_cpus)) + return 0; + + if (!cpumask_subset(trialcs->prefer_cpus, cs->cpus_allowed)) + return -EINVAL; + + update_tasks_prefer_cpumask(trialcs); + + spin_lock_irq(&callback_lock); + cpumask_copy(cs->prefer_cpus, trialcs->prefer_cpus); + spin_unlock_irq(&callback_lock); + + return 0; +} +#endif + #ifdef CONFIG_SMP /* * Helper routine for generate_sched_domains(). @@ -2655,6 +2756,10 @@ static void cpuset_attach_task(struct cpuset *cs, struct task_struct *task) * fail. 
TODO: have a better way to handle failure here */ WARN_ON_ONCE(set_cpus_allowed_ptr(task, cpus_attach)); +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + cpumask_copy(prefer_cpus_attach, cs->prefer_cpus); + set_prefer_cpus_ptr(task, prefer_cpus_attach); +#endif cpuset_change_task_nodemask(task, &cpuset_attach_nodemask_to); cpuset_update_task_spread_flags(cs, task); @@ -2762,6 +2867,9 @@ typedef enum { FILE_MEMORY_PRESSURE, FILE_SPREAD_PAGE, FILE_SPREAD_SLAB, +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + FILE_DYNAMIC_CPULIST, +#endif } cpuset_filetype_t; static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft, @@ -2892,6 +3000,11 @@ static ssize_t cpuset_write_resmask(struct kernfs_open_file *of, case FILE_MEMLIST: retval = update_nodemask(cs, trialcs, buf); break; +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + case FILE_DYNAMIC_CPULIST: + retval = update_prefer_cpumask(cs, trialcs, buf); + break; +#endif default: retval = -EINVAL; break; @@ -2939,6 +3052,11 @@ static int cpuset_common_seq_show(struct seq_file *sf, void *v) case FILE_SUBPARTS_CPULIST: seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->subparts_cpus)); break; +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + case FILE_DYNAMIC_CPULIST: + seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->prefer_cpus)); + break; +#endif default: ret = -EINVAL; } @@ -3161,7 +3279,15 @@ static struct cftype legacy_files[] = { .write_u64 = cpuset_write_u64, .private = FILE_MEMORY_PRESSURE_ENABLED, }, - +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + { + .name = "preferred_cpus", + .seq_show = cpuset_common_seq_show, + .write = cpuset_write_resmask, + .max_write_len = (100U + 6 * NR_CPUS), + .private = FILE_DYNAMIC_CPULIST, + }, +#endif { } /* terminate */ }; @@ -3327,6 +3453,9 @@ static int cpuset_css_online(struct cgroup_subsys_state *css) cs->effective_mems = parent->mems_allowed; cpumask_copy(cs->cpus_allowed, parent->cpus_allowed); cpumask_copy(cs->effective_cpus, parent->cpus_allowed); +#ifdef 
CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + cpumask_copy(cs->prefer_cpus, parent->prefer_cpus); +#endif spin_unlock_irq(&callback_lock); out_unlock: mutex_unlock(&cpuset_mutex); @@ -3480,6 +3609,9 @@ static void cpuset_fork(struct task_struct *task) return; set_cpus_allowed_ptr(task, current->cpus_ptr); +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + set_prefer_cpus_ptr(task, current->prefer_cpus); +#endif task->mems_allowed = current->mems_allowed; return; } @@ -3526,17 +3658,26 @@ int __init cpuset_init(void) BUG_ON(!alloc_cpumask_var(&top_cpuset.cpus_allowed, GFP_KERNEL)); BUG_ON(!alloc_cpumask_var(&top_cpuset.effective_cpus, GFP_KERNEL)); BUG_ON(!zalloc_cpumask_var(&top_cpuset.subparts_cpus, GFP_KERNEL)); +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + BUG_ON(!alloc_cpumask_var(&top_cpuset.prefer_cpus, GFP_KERNEL)); +#endif cpumask_setall(top_cpuset.cpus_allowed); nodes_setall(top_cpuset.mems_allowed); cpumask_setall(top_cpuset.effective_cpus); nodes_setall(top_cpuset.effective_mems); +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + cpumask_clear(top_cpuset.prefer_cpus); +#endif fmeter_init(&top_cpuset.fmeter); set_bit(CS_SCHED_LOAD_BALANCE, &top_cpuset.flags); top_cpuset.relax_domain_level = -1; BUG_ON(!alloc_cpumask_var(&cpus_attach, GFP_KERNEL)); +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + BUG_ON(!alloc_cpumask_var(&prefer_cpus_attach, GFP_KERNEL)); +#endif return 0; } @@ -3573,6 +3714,9 @@ hotplug_update_tasks_legacy(struct cpuset *cs, struct cpumask *new_cpus, nodemask_t *new_mems, bool cpus_updated, bool mems_updated) { +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + cpumask_t prefer_cpus; +#endif bool is_empty; spin_lock_irq(&callback_lock); @@ -3591,6 +3735,13 @@ hotplug_update_tasks_legacy(struct cpuset *cs, if (mems_updated && !nodes_empty(cs->mems_allowed)) update_tasks_nodemask(cs); +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + if (!cpumask_subset(cs->prefer_cpus, cs->cpus_allowed)) { + cpumask_and(&prefer_cpus, cs->prefer_cpus, cs->cpus_allowed); + 
cpumask_copy(cs->prefer_cpus, &prefer_cpus); + update_tasks_prefer_cpumask(cs); + } +#endif is_empty = cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed); diff --git a/kernel/fork.c b/kernel/fork.c index 391d81cf0943abc5f49f77a8f341062adbb253d3..a1cd8930c3e10b73a6eeb742b0f9399bd581f03e 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -625,6 +625,10 @@ void free_task(struct task_struct *tsk) if (tsk->flags & PF_KTHREAD) free_kthread_struct(tsk); bpf_task_storage_free(tsk); +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + if (dynamic_affinity_enabled()) + sched_prefer_cpus_free(tsk); +#endif free_task_struct(tsk); } EXPORT_SYMBOL(free_task); @@ -1139,6 +1143,10 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node) tsk->seccomp.filter = NULL; #endif +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + tsk->prefer_cpus = NULL; +#endif + setup_thread_stack(tsk, orig); clear_user_return_notifier(tsk); clear_tsk_need_resched(tsk); @@ -2357,6 +2365,14 @@ __latent_entropy struct task_struct *copy_process( rt_mutex_init_task(p); +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + if (dynamic_affinity_enabled()) { + retval = sched_prefer_cpus_fork(p, current->prefer_cpus); + if (retval) + goto bad_fork_free; + } +#endif + lockdep_assert_irqs_enabled(); #ifdef CONFIG_PROVE_LOCKING DEBUG_LOCKS_WARN_ON(!p->softirqs_enabled); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index a1c73dea1f778c4038fef05cab1875335bc3dd17..58c274b655ab5ff4cce31466bb23fd3a3db6d03d 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -11570,6 +11570,104 @@ static int __maybe_unused cpu_period_quota_parse(char *buf, return 0; } +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY +int sched_prefer_cpus_fork(struct task_struct *p, struct cpumask *mask) +{ + p->prefer_cpus = kmalloc(sizeof(cpumask_t), GFP_KERNEL); + if (!p->prefer_cpus) + return -ENOMEM; + + if (mask) + cpumask_copy(p->prefer_cpus, mask); + else + cpumask_clear(p->prefer_cpus); + + return 0; +} + +void 
sched_prefer_cpus_free(struct task_struct *p) +{ + kfree(p->prefer_cpus); +} + +static void do_set_prefer_cpus(struct task_struct *p, + const struct cpumask *new_mask) +{ + struct rq *rq = task_rq(p); + bool queued, running; + + lockdep_assert_held(&p->pi_lock); + + queued = task_on_rq_queued(p); + running = task_current(rq, p); + + if (queued) { + /* + * Because __kthread_bind() calls this on blocked tasks without + * holding rq->lock. + */ + lockdep_assert_held(&rq->__lock); + dequeue_task(rq, p, DEQUEUE_SAVE | DEQUEUE_NOCLOCK); + } + if (running) + put_prev_task(rq, p); + + cpumask_copy(p->prefer_cpus, new_mask); + + if (queued) + enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK); + if (running) + set_next_task(rq, p); +} + +/* + * Change a given task's preferred CPU affinity. Prioritize migrating the + * thread to its preferred cpus according to the preferred bitmask. + * + * NOTE: the caller must have a valid reference to the task, the + * task must not exit() & deallocate itself prematurely. The + * call is not atomic; no spinlocks may be held. 
+ */ +static int __set_prefer_cpus_ptr(struct task_struct *p, + const struct cpumask *new_mask) +{ + struct rq_flags rf; + struct rq *rq; + int ret = 0; + + if (!dynamic_affinity_enabled()) + return -EPERM; + + if (unlikely(!p->prefer_cpus)) + return -EINVAL; + + rq = task_rq_lock(p, &rf); + update_rq_clock(rq); + + if (cpumask_equal(p->prefer_cpus, new_mask)) + goto out; + + if (!cpumask_subset(new_mask, p->cpus_ptr)) { + ret = -EINVAL; + goto out; + } + + do_set_prefer_cpus(p, new_mask); +out: + task_rq_unlock(rq, p, &rf); + + return ret; +} + +int set_prefer_cpus_ptr(struct task_struct *p, const struct cpumask *new_mask) +{ + if (p->sched_class != &fair_sched_class) + return 0; + + return __set_prefer_cpus_ptr(p, new_mask); +} +#endif + #ifdef CONFIG_CFS_BANDWIDTH static int cpu_max_show(struct seq_file *sf, void *v) { diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 4c3d0d9f3db6326703f92aab707771c39921a543..eee2d05dc90afdb76c261042ee953d5d6b894f75 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -1039,6 +1039,12 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns, P_SCHEDSTAT(nr_wakeups_affine_attempts); P_SCHEDSTAT(nr_wakeups_passive); P_SCHEDSTAT(nr_wakeups_idle); +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + if (dynamic_affinity_enabled()) { + P_SCHEDSTAT(nr_wakeups_preferred_cpus); + P_SCHEDSTAT(nr_wakeups_force_preferred_cpus); + } +#endif avg_atom = p->se.sum_exec_runtime; if (nr_switches) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 0de55884f9dadb7fc976824b16c1ddfd054bf668..ead7a02a145ca0da27c028bbcb0bb5d21c8636e0 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -7086,7 +7086,11 @@ find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this return cpumask_first(sched_group_span(group)); /* Traverse only the allowed CPUs */ +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + for_each_cpu_and(i, sched_group_span(group), p->select_cpus) { +#else for_each_cpu_and(i, 
sched_group_span(group), p->cpus_ptr) { +#endif struct rq *rq = cpu_rq(i); if (!sched_core_cookie_match(rq, p)) @@ -7133,7 +7137,11 @@ static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p { int new_cpu = cpu; +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + if (!cpumask_intersects(sched_domain_span(sd), p->select_cpus)) +#else if (!cpumask_intersects(sched_domain_span(sd), p->cpus_ptr)) +#endif return prev_cpu; /* @@ -7257,7 +7265,11 @@ static int select_idle_core(struct task_struct *p, int core, struct cpumask *cpu if (!available_idle_cpu(cpu)) { idle = false; if (*idle_cpu == -1) { +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + if (sched_idle_cpu(cpu) && cpumask_test_cpu(cpu, p->select_cpus)) { +#else if (sched_idle_cpu(cpu) && cpumask_test_cpu(cpu, p->cpus_ptr)) { +#endif *idle_cpu = cpu; break; } @@ -7283,7 +7295,11 @@ static int select_idle_smt(struct task_struct *p, int target) { int cpu; +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + for_each_cpu_and(cpu, cpu_smt_mask(target), p->select_cpus) { +#else for_each_cpu_and(cpu, cpu_smt_mask(target), p->cpus_ptr) { +#endif if (cpu == target) continue; if (available_idle_cpu(cpu) || sched_idle_cpu(cpu)) @@ -7331,7 +7347,11 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool struct sched_domain *this_sd = NULL; u64 time = 0; +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + cpumask_and(cpus, sched_domain_span(sd), p->select_cpus); +#else cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr); +#endif if (sched_feat(SIS_PROP) && !has_idle_core) { u64 avg_cost, avg_idle, span_avg; @@ -7504,6 +7524,9 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) lockdep_assert_irqs_disabled(); if ((available_idle_cpu(target) || sched_idle_cpu(target)) && +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + cpumask_test_cpu(target, p->select_cpus) && +#endif asym_fits_cpu(task_util, util_min, util_max, target)) return target; @@ -7512,6 +7535,9 @@ static int 
select_idle_sibling(struct task_struct *p, int prev, int target) */ if (prev != target && cpus_share_cache(prev, target) && (available_idle_cpu(prev) || sched_idle_cpu(prev)) && +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + cpumask_test_cpu(prev, p->select_cpus) && +#endif asym_fits_cpu(task_util, util_min, util_max, prev)) return prev; @@ -7538,7 +7564,11 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) recent_used_cpu != target && cpus_share_cache(recent_used_cpu, target) && (available_idle_cpu(recent_used_cpu) || sched_idle_cpu(recent_used_cpu)) && +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + cpumask_test_cpu(recent_used_cpu, p->select_cpus) && +#else cpumask_test_cpu(recent_used_cpu, p->cpus_ptr) && +#endif asym_fits_cpu(task_util, util_min, util_max, recent_used_cpu)) { return recent_used_cpu; } @@ -8073,6 +8103,108 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) return target; } +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + +DEFINE_STATIC_KEY_FALSE(__dynamic_affinity_switch); + +static int __init dynamic_affinity_switch_setup(char *__unused) +{ + static_branch_enable(&__dynamic_affinity_switch); + return 1; +} +__setup("dynamic_affinity", dynamic_affinity_switch_setup); + +/* + * Low utilization threshold for CPU + * + * (default: 85%), units: percentage of CPU utilization) + */ +int sysctl_sched_util_low_pct = 85; + +static inline bool prefer_cpus_valid(struct task_struct *p) +{ + if (!dynamic_affinity_enabled()) + return false; + + return p->prefer_cpus && + !cpumask_empty(p->prefer_cpus) && + !cpumask_equal(p->prefer_cpus, p->cpus_ptr) && + cpumask_subset(p->prefer_cpus, p->cpus_ptr); +} + +static inline unsigned long taskgroup_cpu_util(struct task_group *tg, + int cpu) +{ +#ifdef CONFIG_FAIR_GROUP_SCHED + if (tg->se[cpu] && sched_feat(DA_UTIL_TASKGROUP)) + return tg->se[cpu]->avg.util_avg; +#endif + return cpu_util_cfs(cpu); +} + +/* + * set_task_select_cpus: select the cpu range for task + * @p: the 
task whose available cpu range will be set + * @idlest_cpu: the cpu which is the idlest in prefer cpus + * + * If the sum of 'util_avg' among 'prefer_cpus' is lower than the percentage + * 'sysctl_sched_util_low_pct' of 'prefer_cpus' capacity, select + * 'prefer_cpus' range for task, otherwise select 'cpus_ptr' for task. + * + * The available cpu range is set to p->select_cpus. The idlest cpu in the + * preferred cpus is stored in @idlest_cpu, which is used as the wakeup cpu + * when the fast path fails to pick a cpu from p->select_cpus. + */ +static void set_task_select_cpus(struct task_struct *p, int *idlest_cpu, + int sd_flag) +{ + unsigned long util_avg_sum = 0; + unsigned long tg_capacity = 0; + long min_util = INT_MIN; + struct task_group *tg; + long spare; + int cpu; + + p->select_cpus = p->cpus_ptr; + if (!prefer_cpus_valid(p)) + return; + + rcu_read_lock(); + tg = task_group(p); + for_each_cpu(cpu, p->prefer_cpus) { + if (idlest_cpu && (available_idle_cpu(cpu) || sched_idle_cpu(cpu))) { + *idlest_cpu = cpu; + } else if (idlest_cpu) { + spare = (long)(capacity_of(cpu) - + taskgroup_cpu_util(tg, cpu)); + if (spare > min_util) { + min_util = spare; + *idlest_cpu = cpu; + } + } + + if (available_idle_cpu(cpu) || sched_idle_cpu(cpu)) { + rcu_read_unlock(); + p->select_cpus = p->prefer_cpus; + if (sd_flag & SD_BALANCE_WAKE) + schedstat_inc(p->stats.nr_wakeups_preferred_cpus); + return; + } + + util_avg_sum += taskgroup_cpu_util(tg, cpu); + tg_capacity += capacity_of(cpu); + } + rcu_read_unlock(); + + if (tg_capacity > cpumask_weight(p->prefer_cpus) && + util_avg_sum * 100 <= tg_capacity * sysctl_sched_util_low_pct) { + p->select_cpus = p->prefer_cpus; + if (sd_flag & SD_BALANCE_WAKE) + schedstat_inc(p->stats.nr_wakeups_preferred_cpus); + } +} +#endif + /* * select_task_rq_fair: Select target runqueue for the waking task in domains * that have the relevant SD flag set. 
In practice, this is SD_BALANCE_WAKE, @@ -8093,11 +8225,19 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags) int want_affine = 0; /* SD_flags and WF_flags share the first nibble */ int sd_flag = wake_flags & 0xF; +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + int idlest_cpu = -1; +#endif /* * required for stable ->cpus_allowed */ lockdep_assert_held(&p->pi_lock); + +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + set_task_select_cpus(p, &idlest_cpu, sd_flag); +#endif + if (wake_flags & WF_TTWU) { record_wakee(p); @@ -8112,7 +8252,11 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags) new_cpu = prev_cpu; } +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + want_affine = !wake_wide(p) && cpumask_test_cpu(cpu, p->select_cpus); +#else want_affine = !wake_wide(p) && cpumask_test_cpu(cpu, p->cpus_ptr); +#endif } rcu_read_lock(); @@ -8123,7 +8267,13 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags) */ if (want_affine && (tmp->flags & SD_WAKE_AFFINE) && cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) { +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + new_cpu = cpu; + if (cpu != prev_cpu && + cpumask_test_cpu(prev_cpu, p->select_cpus)) +#else if (cpu != prev_cpu) +#endif new_cpu = wake_affine(tmp, p, cpu, prev_cpu, sync); sd = NULL; /* Prefer wake_affine over balance flags */ @@ -8150,6 +8300,12 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags) } rcu_read_unlock(); +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + if (idlest_cpu != -1 && !cpumask_test_cpu(new_cpu, p->select_cpus)) { + new_cpu = idlest_cpu; + schedstat_inc(p->stats.nr_wakeups_force_preferred_cpus); + } +#endif return new_cpu; } @@ -9166,7 +9322,12 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) if (kthread_is_per_cpu(p)) return 0; +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + set_task_select_cpus(p, NULL, 0); + if (!cpumask_test_cpu(env->dst_cpu, p->select_cpus)) { +#else if (!cpumask_test_cpu(env->dst_cpu, 
p->cpus_ptr)) { +#endif int cpu; schedstat_inc(p->stats.nr_failed_migrations_affine); @@ -9189,7 +9350,11 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) /* Prevent to re-select dst_cpu via env's CPUs: */ for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) { +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + if (cpumask_test_cpu(cpu, p->select_cpus)) { +#else if (cpumask_test_cpu(cpu, p->cpus_ptr)) { +#endif env->flags |= LBF_DST_PINNED; env->new_dst_cpu = cpu; break; @@ -10573,8 +10738,13 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu) int local_group; /* Skip over this group if it has no CPUs allowed */ +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + if (!cpumask_intersects(sched_group_span(group), + p->select_cpus)) +#else if (!cpumask_intersects(sched_group_span(group), p->cpus_ptr)) +#endif continue; /* Skip over this group if no cookie matched */ diff --git a/kernel/sched/features.h b/kernel/sched/features.h index f770168230ae4a09dd0f240957c0c7d749001a50..4dd46de2f827d5ea79c633ec7be88a6143fb420d 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h @@ -89,3 +89,10 @@ SCHED_FEAT(UTIL_EST_FASTUP, true) SCHED_FEAT(LATENCY_WARN, false) SCHED_FEAT(HZ_BW, true) + +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY +/* + * Use util_avg of bottom-Level taskgroup + */ +SCHED_FEAT(DA_UTIL_TASKGROUP, true) +#endif diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 354a2d294f526ad6688168443913385eda101fa1..80a4a52542090b9a301509601b8a48b210ce65b4 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -2042,6 +2042,17 @@ static struct ctl_table kern_table[] = { .extra1 = SYSCTL_ONE, .extra2 = SYSCTL_INT_MAX, }, +#endif +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + { + .procname = "sched_util_low_pct", + .data = &sysctl_sched_util_low_pct, + .maxlen = sizeof(sysctl_sched_util_low_pct), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE_HUNDRED, + }, #endif { } };