diff --git a/arch/arm64/configs/openeuler_defconfig b/arch/arm64/configs/openeuler_defconfig
index 3781d138c3e3ec96df2befaf5758c4f6dc6c286b..bb08df0cbe06885d1dfbb838da94591d34a2d401 100644
--- a/arch/arm64/configs/openeuler_defconfig
+++ b/arch/arm64/configs/openeuler_defconfig
@@ -161,6 +161,7 @@ CONFIG_MEMCG_KMEM=y
 CONFIG_BLK_CGROUP=y
 CONFIG_CGROUP_WRITEBACK=y
 CONFIG_CGROUP_SCHED=y
+CONFIG_QOS_SCHED=y
 CONFIG_FAIR_GROUP_SCHED=y
 CONFIG_CFS_BANDWIDTH=y
 CONFIG_RT_GROUP_SCHED=y
diff --git a/arch/x86/configs/openeuler_defconfig b/arch/x86/configs/openeuler_defconfig
index e19bf53c0bd9856032515f84609faad002c6878f..bf732080d4ff2e7cd512459aef8045128621a6d0 100644
--- a/arch/x86/configs/openeuler_defconfig
+++ b/arch/x86/configs/openeuler_defconfig
@@ -183,6 +183,7 @@ CONFIG_MEMCG_KMEM=y
 CONFIG_BLK_CGROUP=y
 CONFIG_CGROUP_WRITEBACK=y
 CONFIG_CGROUP_SCHED=y
+CONFIG_QOS_SCHED=y
 CONFIG_FAIR_GROUP_SCHED=y
 CONFIG_CFS_BANDWIDTH=y
 CONFIG_RT_GROUP_SCHED=y
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index b307013b9c6c9a2c892644f7fba94698bb82133b..7fa51b600ee81dc1f363676b7542097b625d66cc 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -855,4 +855,8 @@ static inline void cgroup_bpf_put(struct cgroup *cgrp) {}
 
 #endif /* CONFIG_CGROUP_BPF */
 
+#ifdef CONFIG_QOS_SCHED
+void cgroup_move_task_to_root(struct task_struct *tsk);
+#endif
+
 #endif /* _LINUX_CGROUP_H */
diff --git a/include/linux/resume_user_mode.h b/include/linux/resume_user_mode.h
index f8f3e958e9cf2fbf0777fdbf5e3fd993c889cee3..255372856812332854aa635956ad7a8ea9446254 100644
--- a/include/linux/resume_user_mode.h
+++ b/include/linux/resume_user_mode.h
@@ -59,6 +59,11 @@ static inline void resume_user_mode_work(struct pt_regs *regs)
 	blkcg_maybe_throttle_current();
 
 	rseq_handle_notify_resume(NULL, regs);
+
+#ifdef CONFIG_QOS_SCHED
+	sched_qos_offline_wait();
+#endif
+
 }
 
 #endif /* LINUX_RESUME_USER_MODE_H */
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 4412f8818386f1164a5051c7c20fd15a20028e08..3520e3fbaa916670190eea018a4a6a01f78d5010 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2458,4 +2458,15 @@ static inline int sched_core_idle_cpu(int cpu) { return idle_cpu(cpu); }
 
 extern void sched_set_stop_task(int cpu, struct task_struct *stop);
 
+#ifdef CONFIG_QOS_SCHED
+void sched_move_offline_task(struct task_struct *p);
+void sched_qos_offline_wait(void);
+int sched_qos_cpu_overload(void);
+#else
+static inline int sched_qos_cpu_overload(void)
+{
+	return 0;
+}
+#endif
+
 #endif
diff --git a/init/Kconfig b/init/Kconfig
index c94c82cc539faae9d9f8eda5ae229c8133772a24..b6952df34ec305e83c48240ed6c98bf667ad63c3 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -985,6 +985,18 @@ menuconfig CGROUP_SCHED
 	  tasks.
 
 if CGROUP_SCHED
+config QOS_SCHED
+	bool "QoS task scheduling"
+	depends on CGROUP_SCHED
+	depends on CFS_BANDWIDTH
+	default n
+	help
+	  This option enables the QoS scheduler, which supports co-locating
+	  online (latency-sensitive) services with offline tasks. Co-location
+	  can effectively improve resource utilization.
+
+	  If in doubt, say N.
+
 config FAIR_GROUP_SCHED
 	bool "Group scheduling for SCHED_OTHER"
 	depends on CGROUP_SCHED
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 518725b57200c2fe6d1d01a2280bd5156073ae0c..534522bd5b73aff412403948972b6364a7939d8d 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -2936,6 +2936,26 @@ void cgroup_procs_write_finish(struct task_struct *task, bool threadgroup_locked
 			ss->post_attach();
 }
 
+#ifdef CONFIG_QOS_SCHED
+void cgroup_move_task_to_root(struct task_struct *tsk)
+{
+	struct cgroup *cpu_cgrp;
+	struct cgroup *cpu_root_cgrp;
+
+	mutex_lock(&cgroup_mutex);
+	percpu_down_write(&cgroup_threadgroup_rwsem);
+
+	spin_lock_irq(&css_set_lock);
+	cpu_cgrp = task_cgroup(tsk, cpu_cgrp_id);
+	cpu_root_cgrp = &cpu_cgrp->root->cgrp;
+	spin_unlock_irq(&css_set_lock);
+
+	(void)cgroup_attach_task(cpu_root_cgrp, tsk, false);
+	percpu_up_write(&cgroup_threadgroup_rwsem);
+	mutex_unlock(&cgroup_mutex);
+}
+#endif
+
 static void cgroup_print_ss_mask(struct seq_file *seq, u16 ss_mask)
 {
 	struct cgroup_subsys *ss;
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index a854b71836dd5b8a811d098a0cc24a2381e1e727..a1c73dea1f778c4038fef05cab1875335bc3dd17 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -7727,6 +7727,18 @@ static int __sched_setscheduler(struct task_struct *p,
 	}
 
 change:
+#ifdef CONFIG_QOS_SCHED
+	/*
+	 * If the scheduling policy of an offline task is set to a policy
+	 * other than SCHED_IDLE, online task preemption and CPU resource
+	 * isolation become ineffective, so return -EINVAL in this case.
+	 */
+	if (unlikely(task_group(p)->qos_level == -1 && !idle_policy(policy))) {
+		retval = -EINVAL;
+		goto unlock;
+	}
+#endif
+
 	if (user) {
 #ifdef CONFIG_RT_GROUP_SCHED
 		/*
@@ -10006,6 +10018,9 @@ void __init sched_init(void)
 		 * We achieve this by letting root_task_group's tasks sit
 		 * directly in rq->cfs (i.e root_task_group->se[] = NULL).
 		 */
+#ifdef CONFIG_QOS_SCHED
+		init_qos_hrtimer(i);
+#endif
 		init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, NULL);
 #endif /* CONFIG_FAIR_GROUP_SCHED */
 
@@ -10345,6 +10360,67 @@ void ia64_set_curr_task(int cpu, struct task_struct *p)
 
 /* task_group_lock serializes the addition/removal of task groups */
 static DEFINE_SPINLOCK(task_group_lock);
+#ifdef CONFIG_QOS_SCHED
+static inline int alloc_qos_sched_group(struct task_group *tg,
+					struct task_group *parent)
+{
+	tg->qos_level = parent->qos_level;
+
+	return 1;
+}
+
+static void sched_change_qos_group(struct task_struct *tsk, struct task_group *tg)
+{
+	struct sched_attr attr = {0};
+
+	/*
+	 * No need to reset the scheduler policy when a task is exiting or
+	 * the task is in an autogroup.
+	 */
+	if (!(tsk->flags & PF_EXITING) &&
+	    !task_group_is_autogroup(tg) &&
+	    (tg->qos_level == -1)) {
+		attr.sched_priority = 0;
+		attr.sched_policy = SCHED_IDLE;
+		attr.sched_nice = PRIO_TO_NICE(tsk->static_prio);
+		__setscheduler_params(tsk, &attr);
+		__setscheduler_prio(tsk, normal_prio(tsk));
+	}
+}
+
+struct offline_args {
+	struct work_struct work;
+	struct task_struct *p;
+};
+
+static void sched_move_work(struct work_struct *work)
+{
+	struct sched_param param = { .sched_priority = 0 };
+	struct offline_args *args = container_of(work, struct offline_args, work);
+
+	cgroup_move_task_to_root(args->p);
+	sched_setscheduler(args->p, SCHED_NORMAL, &param);
+	put_task_struct(args->p);
+	kfree(args);
+}
+
+void sched_move_offline_task(struct task_struct *p)
+{
+	struct offline_args *args;
+
+	if (unlikely(task_group(p)->qos_level != -1))
+		return;
+
+	args = kmalloc(sizeof(struct offline_args), GFP_ATOMIC);
+	if (args) {
+		get_task_struct(p);
+		args->p = p;
+		INIT_WORK(&args->work, sched_move_work);
+		queue_work(system_highpri_wq, &args->work);
+	}
+}
+#endif
+
 static inline void alloc_uclamp_sched_group(struct task_group *tg,
 					    struct task_group *parent)
 {
@@ -10395,6 +10471,11 @@ struct task_group *sched_create_group(struct task_group *parent)
 	if (!alloc_fair_sched_group(tg, parent))
 		goto err;
 
+#ifdef CONFIG_QOS_SCHED
+	if (!alloc_qos_sched_group(tg, parent))
+		goto err;
+#endif
+
 	if (!alloc_rt_sched_group(tg, parent))
 		goto err;
 
@@ -10481,6 +10562,10 @@ static void sched_change_group(struct task_struct *tsk, struct task_group *group
 {
 	tsk->sched_task_group = group;
 
+#ifdef CONFIG_QOS_SCHED
+	sched_change_qos_group(tsk, group);
+#endif
+
 #ifdef CONFIG_FAIR_GROUP_SCHED
 	if (tsk->sched_class->task_change_group)
 		tsk->sched_class->task_change_group(tsk);
@@ -11209,6 +11294,69 @@ static int cpu_idle_write_s64(struct cgroup_subsys_state *css,
 }
 #endif
 
+#ifdef CONFIG_QOS_SCHED
+static int tg_change_scheduler(struct task_group *tg, void *data)
+{
+	int policy;
+	struct css_task_iter it;
+	struct sched_param param;
+	struct task_struct *tsk;
+	s64 qos_level = *(s64 *)data;
+	struct cgroup_subsys_state *css = &tg->css;
+
+	tg->qos_level = qos_level;
+	if (qos_level == -1)
+		policy = SCHED_IDLE;
+	else
+		policy = SCHED_NORMAL;
+
+	param.sched_priority = 0;
+	css_task_iter_start(css, 0, &it);
+	while ((tsk = css_task_iter_next(&it)))
+		sched_setscheduler(tsk, policy, &param);
+	css_task_iter_end(&it);
+
+	return 0;
+}
+
+static int cpu_qos_write(struct cgroup_subsys_state *css,
+			 struct cftype *cftype, s64 qos_level)
+{
+	struct task_group *tg = css_tg(css);
+
+	if (!tg->se[0])
+		return -EINVAL;
+
+	if (qos_level != -1 && qos_level != 0)
+		return -EINVAL;
+
+	if (tg->qos_level == qos_level)
+		goto done;
+
+	if (tg->qos_level == -1 && qos_level == 0)
+		return -EINVAL;
+
+	cpus_read_lock();
+	if (qos_level == -1)
+		cfs_bandwidth_usage_inc();
+	else
+		cfs_bandwidth_usage_dec();
+	cpus_read_unlock();
+
+	rcu_read_lock();
+	walk_tg_tree_from(tg, tg_change_scheduler, tg_nop, (void *)(&qos_level));
+	rcu_read_unlock();
+done:
+	return 0;
+}
+
+static inline s64 cpu_qos_read(struct cgroup_subsys_state *css,
+			       struct cftype *cft)
+{
+	return css_tg(css)->qos_level;
+}
+#endif
+
 static struct cftype cpu_legacy_files[] = {
 #ifdef CONFIG_FAIR_GROUP_SCHED
 	{
@@ -11272,6 +11420,13 @@ static struct cftype cpu_legacy_files[] = {
 		.seq_show = cpu_uclamp_max_show,
 		.write = cpu_uclamp_max_write,
 	},
+#endif
+#ifdef CONFIG_QOS_SCHED
+	{
+		.name = "qos_level",
+		.read_s64 = cpu_qos_read,
+		.write_s64 = cpu_qos_write,
+	},
 #endif
 	{ }	/* Terminate */
 };
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 2430c88e04284f130a4c2b04c544baa026d4e489..8ae0d65713aad4348d4631a290a538d299813fd3 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -57,6 +57,11 @@
 #include "stats.h"
 #include "autogroup.h"
 
+#ifdef CONFIG_QOS_SCHED
+#include
+#include
+#endif
+
 /*
  * The initial- and re-scaling of tunables is configurable
  *
@@ -124,6 +129,24 @@ int __weak arch_asym_cpu_priority(int cpu)
 #define capacity_greater(cap1, cap2) ((cap1) * 1024 > (cap2) * 1078)
 #endif
 
+#ifdef CONFIG_QOS_SCHED
+
+/*
+ * To distinguish QoS throttling from CFS bandwidth throttling, mark
+ * cfs_rq->throttled with QOS_THROTTLED (CFS bandwidth throttling uses 1).
+ */
+#define QOS_THROTTLED	2
+
+static DEFINE_PER_CPU_SHARED_ALIGNED(struct list_head, qos_throttled_cfs_rq);
+static DEFINE_PER_CPU_SHARED_ALIGNED(struct hrtimer, qos_overload_timer);
+static DEFINE_PER_CPU(int, qos_cpu_overload);
+unsigned int sysctl_overload_detect_period = 5000;	/* in ms */
+unsigned int sysctl_offline_wait_interval = 100;	/* in ms */
+static int one_thousand = 1000;
+static int hundred_thousand = 100000;
+static int unthrottle_qos_cfs_rqs(int cpu);
+#endif
+
 #ifdef CONFIG_CFS_BANDWIDTH
 /*
  * Amount of runtime to allocate from global (tg) to local (per-cfs_rq) pool
@@ -172,6 +195,26 @@ static struct ctl_table sched_fair_sysctls[] = {
 		.extra1		= SYSCTL_ZERO,
 	},
 #endif /* CONFIG_NUMA_BALANCING */
+#ifdef CONFIG_QOS_SCHED
+	{
+		.procname	= "qos_overload_detect_period_ms",
+		.data		= &sysctl_overload_detect_period,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= SYSCTL_ONE_HUNDRED,
+		.extra2		= &hundred_thousand,
+	},
+	{
+		.procname	= "qos_offline_wait_interval_ms",
+		.data		= &sysctl_offline_wait_interval,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= SYSCTL_ONE_HUNDRED,
+		.extra2		= &one_thousand,
+	},
+#endif
 	{}
 };
 
@@ -5639,6 +5682,14 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
 
 	se = cfs_rq->tg->se[cpu_of(rq)];
 
+#ifdef CONFIG_QOS_SCHED
+	/*
+	 * If this cfs_rq was throttled by QoS, there is no need to unthrottle it here.
+	 */
+	if (cfs_rq->throttled == QOS_THROTTLED)
+		return;
+#endif
+
 	cfs_rq->throttled = 0;
 
 	update_rq_clock(rq);
@@ -5823,7 +5874,20 @@ static bool distribute_cfs_runtime(struct cfs_bandwidth *cfs_b)
 			goto next;
 #endif
 
-		/* By the above checks, this should never be true */
+		/*
+		 * CPU hotplug callbacks can race against distribute_cfs_runtime()
+		 * when the QOS_SCHED feature is enabled, so there may be
+		 * situations where runtime_remaining > 0.
+		 * QoS scheduling does not care whether the cfs_rq has any time left,
+		 * so do not allocate any more time to the cfs_rq in this scenario.
+		 */
+#ifdef CONFIG_QOS_SCHED
+		if (cfs_rq->throttled == QOS_THROTTLED &&
+		    cfs_rq->runtime_remaining > 0)
+			goto next;
+#endif
+
+		/* By the above check, this should never be true */
 		SCHED_WARN_ON(cfs_rq->runtime_remaining > 0);
 
 		raw_spin_lock(&cfs_b->lock);
@@ -6191,6 +6255,9 @@ static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
 #ifdef CONFIG_SMP
 	INIT_LIST_HEAD(&cfs_rq->throttled_csd_list);
 #endif
+#ifdef CONFIG_QOS_SCHED
+	INIT_LIST_HEAD(&cfs_rq->qos_throttled_list);
+#endif
 }
 
 void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
@@ -6280,6 +6347,9 @@ static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
 	 * the rq clock again in unthrottle_cfs_rq().
 	 */
 	rq_clock_start_loop_update(rq);
+#ifdef CONFIG_QOS_SCHED
+	unthrottle_qos_cfs_rqs(cpu_of(rq));
+#endif
 
 	rcu_read_lock();
 	list_for_each_entry_rcu(tg, &task_groups, list) {
@@ -6305,6 +6375,9 @@ static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
 	rcu_read_unlock();
 
 	rq_clock_stop_loop_update(rq);
+#ifdef CONFIG_QOS_SCHED
+	unthrottle_qos_cfs_rqs(cpu_of(rq));
+#endif
 }
 
 bool cfs_task_bw_constrained(struct task_struct *p)
@@ -8115,6 +8188,278 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
 		resched_curr(rq);
 }
 
+#ifdef CONFIG_QOS_SCHED
+static inline bool is_offline_task(struct task_struct *p)
+{
+	return task_group(p)->qos_level == -1;
+}
+
+static void start_qos_hrtimer(int cpu);
+
+static void throttle_qos_cfs_rq(struct cfs_rq *cfs_rq)
+{
+	struct rq *rq = rq_of(cfs_rq);
+	struct sched_entity *se;
+	long task_delta, idle_task_delta;
+
+	se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
+
+	/* freeze hierarchy runnable averages while throttled */
+	rcu_read_lock();
+	walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq);
+	rcu_read_unlock();
+
+	task_delta = cfs_rq->h_nr_running;
+	idle_task_delta = cfs_rq->idle_h_nr_running;
+	for_each_sched_entity(se) {
+		struct cfs_rq *qcfs_rq = cfs_rq_of(se);
+		/* throttled entity or throttle-on-deactivate */
+		if (!se->on_rq)
+			goto done;
+
+		dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP);
+
+		qcfs_rq->h_nr_running -= task_delta;
+		qcfs_rq->idle_h_nr_running -= idle_task_delta;
+
+		if (qcfs_rq->load.weight) {
+			/* Avoid re-evaluating load for this entity: */
+			se = parent_entity(se);
+			break;
+		}
+	}
+
+	for_each_sched_entity(se) {
+		struct cfs_rq *qcfs_rq = cfs_rq_of(se);
+		/* throttled entity or throttle-on-deactivate */
+		if (!se->on_rq)
+			goto done;
+
+		update_load_avg(qcfs_rq, se, 0);
+		se_update_runnable(se);
+
+		if (cfs_rq_is_idle(group_cfs_rq(se)))
+			idle_task_delta = cfs_rq->h_nr_running;
+
+		qcfs_rq->h_nr_running -= task_delta;
+		qcfs_rq->idle_h_nr_running -= idle_task_delta;
+	}
+
+	/* At this point se is NULL and we are at root level */
+	sub_nr_running(rq, task_delta);
+
+done:
+	if (list_empty(&per_cpu(qos_throttled_cfs_rq, cpu_of(rq))))
+		start_qos_hrtimer(cpu_of(rq));
+
+	cfs_rq->throttled = QOS_THROTTLED;
+
+	list_add(&cfs_rq->qos_throttled_list,
+		 &per_cpu(qos_throttled_cfs_rq, cpu_of(rq)));
+}
+
+static void unthrottle_qos_cfs_rq(struct cfs_rq *cfs_rq)
+{
+	struct rq *rq = rq_of(cfs_rq);
+	struct sched_entity *se;
+	long task_delta, idle_task_delta;
+
+	se = cfs_rq->tg->se[cpu_of(rq)];
+
+	if (cfs_rq->throttled != QOS_THROTTLED)
+		return;
+
+	cfs_rq->throttled = 0;
+
+	update_rq_clock(rq);
+	list_del_init(&cfs_rq->qos_throttled_list);
+
+	/* update hierarchical throttle state */
+	rcu_read_lock();
+	walk_tg_tree_from(cfs_rq->tg, tg_nop, tg_unthrottle_up, (void *)rq);
+	rcu_read_unlock();
+
+	if (!cfs_rq->load.weight) {
+		if (!cfs_rq->on_list)
+			return;
+		/*
+		 * Nothing to run but something to decay (on_list)?
+		 * Complete the branch.
+		 */
+		for_each_sched_entity(se) {
+			if (list_add_leaf_cfs_rq(cfs_rq_of(se)))
+				break;
+		}
+		goto unthrottle_throttle;
+	}
+
+	task_delta = cfs_rq->h_nr_running;
+	idle_task_delta = cfs_rq->idle_h_nr_running;
+	for_each_sched_entity(se) {
+		if (se->on_rq)
+			break;
+
+		cfs_rq = cfs_rq_of(se);
+		enqueue_entity(cfs_rq, se, ENQUEUE_WAKEUP);
+
+		cfs_rq->h_nr_running += task_delta;
+		cfs_rq->idle_h_nr_running += idle_task_delta;
+
+		if (cfs_rq_throttled(cfs_rq))
+			goto unthrottle_throttle;
+	}
+
+	for_each_sched_entity(se) {
+		cfs_rq = cfs_rq_of(se);
+
+		update_load_avg(cfs_rq, se, UPDATE_TG);
+		se_update_runnable(se);
+
+		cfs_rq->h_nr_running += task_delta;
+		cfs_rq->idle_h_nr_running += idle_task_delta;
+
+		/* end evaluation on encountering a throttled cfs_rq */
+		if (cfs_rq_throttled(cfs_rq))
+			goto unthrottle_throttle;
+	}
+
+	add_nr_running(rq, task_delta);
+
+unthrottle_throttle:
+
+	assert_list_leaf_cfs_rq(rq);
+
+	/* Determine whether we need to wake up potentially idle CPU: */
+	if (rq->curr == rq->idle && rq->cfs.nr_running)
+		resched_curr(rq);
+}
+
+static int __unthrottle_qos_cfs_rqs(int cpu)
+{
+	struct cfs_rq *cfs_rq, *tmp_rq;
+	int res = 0;
+
+	list_for_each_entry_safe(cfs_rq, tmp_rq, &per_cpu(qos_throttled_cfs_rq, cpu),
+				 qos_throttled_list) {
+		if (cfs_rq_throttled(cfs_rq)) {
+			unthrottle_qos_cfs_rq(cfs_rq);
+			res++;
+		}
+	}
+
+	return res;
+}
+
+static int unthrottle_qos_cfs_rqs(int cpu)
+{
+	int res;
+
+	res = __unthrottle_qos_cfs_rqs(cpu);
+	if (res)
+		hrtimer_cancel(&(per_cpu(qos_overload_timer, cpu)));
+
+	return res;
+}
+
+static bool check_qos_cfs_rq(struct cfs_rq *cfs_rq)
+{
+	if (unlikely(__this_cpu_read(qos_cpu_overload)))
+		return false;
+
+	if (unlikely(cfs_rq && cfs_rq->tg->qos_level < 0 &&
+		     !sched_idle_cpu(smp_processor_id()) &&
+		     cfs_rq->h_nr_running == cfs_rq->idle_h_nr_running)) {
+
+		if (!rq_of(cfs_rq)->online)
+			return false;
+
+		throttle_qos_cfs_rq(cfs_rq);
+		return true;
+	}
+
+	return false;
+}
+
+static inline void unthrottle_qos_sched_group(struct cfs_rq *cfs_rq)
+{
+	struct rq *rq = rq_of(cfs_rq);
+	struct rq_flags rf;
+
+	rq_lock_irqsave(rq, &rf);
+	if (cfs_rq->tg->qos_level == -1 && cfs_rq_throttled(cfs_rq))
+		unthrottle_qos_cfs_rq(cfs_rq);
+	rq_unlock_irqrestore(rq, &rf);
+}
+
+void sched_qos_offline_wait(void)
+{
+	long qos_level;
+
+	while (unlikely(this_cpu_read(qos_cpu_overload))) {
+		rcu_read_lock();
+		qos_level = task_group(current)->qos_level;
+		rcu_read_unlock();
+		if (qos_level != -1 || fatal_signal_pending(current))
+			break;
+
+		schedule_timeout_killable(msecs_to_jiffies(sysctl_offline_wait_interval));
+	}
+}
+
+int sched_qos_cpu_overload(void)
+{
+	return __this_cpu_read(qos_cpu_overload);
+}
+
+static enum hrtimer_restart qos_overload_timer_handler(struct hrtimer *timer)
+{
+	struct rq_flags rf;
+	struct rq *rq = this_rq();
+
+	rq_lock_irqsave(rq, &rf);
+	if (__unthrottle_qos_cfs_rqs(smp_processor_id()))
+		__this_cpu_write(qos_cpu_overload, 1);
+	rq_unlock_irqrestore(rq, &rf);
+
+	return HRTIMER_NORESTART;
+}
+
+static void start_qos_hrtimer(int cpu)
+{
+	ktime_t time;
+	struct hrtimer *hrtimer = &(per_cpu(qos_overload_timer, cpu));
+
+	time = ktime_add_ms(hrtimer->base->get_time(), (u64)sysctl_overload_detect_period);
+	hrtimer_set_expires(hrtimer, time);
+	hrtimer_start_expires(hrtimer, HRTIMER_MODE_ABS_PINNED);
+}
+
+void init_qos_hrtimer(int cpu)
+{
+	struct hrtimer *hrtimer = &(per_cpu(qos_overload_timer, cpu));
+
+	hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
+	hrtimer->function = qos_overload_timer_handler;
+}
+
+/*
+ * To avoid priority inversion, when this CPU is in the qos_cpu_overload
+ * state, we should let offline tasks run so that they can leave kernel
+ * critical sections, and throttle them again before returning to user mode.
+ */
+static void qos_schedule_throttle(struct task_struct *p)
+{
+	if (unlikely(current->flags & PF_KTHREAD))
+		return;
+
+	if (unlikely(this_cpu_read(qos_cpu_overload))) {
+		if (is_offline_task(p))
+			set_notify_resume(p);
+	}
+}
+
+#endif
+
 #ifdef CONFIG_SMP
 static struct task_struct *pick_task_fair(struct rq *rq)
 {
@@ -8205,6 +8550,16 @@ pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf
 
 		se = pick_next_entity(cfs_rq, curr);
 		cfs_rq = group_cfs_rq(se);
+#ifdef CONFIG_QOS_SCHED
+		if (check_qos_cfs_rq(cfs_rq)) {
+			cfs_rq = &rq->cfs;
+			WARN(cfs_rq->nr_running == 0,
+			     "rq->nr_running=%u, cfs_rq->idle_h_nr_running=%u\n",
+			     rq->nr_running, cfs_rq->idle_h_nr_running);
+			if (unlikely(!cfs_rq->nr_running))
+				return NULL;
+		}
+#endif
 	} while (cfs_rq);
 
 	p = task_of(se);
@@ -8265,6 +8620,10 @@ done: __maybe_unused;
 	update_misfit_status(p, rq);
 	sched_fair_update_stop_tick(rq, p);
 
+#ifdef CONFIG_QOS_SCHED
+	qos_schedule_throttle(p);
+#endif
+
 	return p;
 
 idle:
@@ -8284,6 +8643,14 @@ done: __maybe_unused;
 	if (new_tasks > 0)
 		goto again;
 
+#ifdef CONFIG_QOS_SCHED
+	if (unthrottle_qos_cfs_rqs(cpu_of(rq))) {
+		rq->idle_stamp = 0;
+		goto again;
+	}
+
+	__this_cpu_write(qos_cpu_overload, 0);
+#endif
 	/*
 	 * rq is about to be idle, check if we need to update the
 	 * lost_idle_time of clock_pelt
@@ -12600,6 +12967,10 @@ void free_fair_sched_group(struct task_group *tg)
 	int i;
 
 	for_each_possible_cpu(i) {
+#ifdef CONFIG_QOS_SCHED
+		if (tg->cfs_rq && tg->cfs_rq[i])
+			unthrottle_qos_sched_group(tg->cfs_rq[i]);
+#endif
 		if (tg->cfs_rq)
 			kfree(tg->cfs_rq[i]);
 		if (tg->se)
@@ -12989,6 +13360,11 @@ __init void init_sched_fair_class(void)
 #endif
 	}
 
+#ifdef CONFIG_QOS_SCHED
+	for_each_possible_cpu(i)
+		INIT_LIST_HEAD(&per_cpu(qos_throttled_cfs_rq, i));
+#endif
+
 	open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);
 
 #ifdef CONFIG_NO_HZ_COMMON
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 04846272409cc00f20f24d8cc6456554d67aba0a..3de84e95baf1cbee2a0a79d9bdcdb1f4b960b103 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -403,6 +403,10 @@ struct task_group {
 
 	struct cfs_bandwidth	cfs_bandwidth;
 
+#ifdef CONFIG_QOS_SCHED
+	long			qos_level;
+#endif
+
 #ifdef CONFIG_UCLAMP_TASK_GROUP
 	/* The two decimal precision [%] value requested from user-space */
 	unsigned int		uclamp_pct[UCLAMP_CNT];
@@ -649,6 +653,10 @@ struct cfs_rq {
 #endif
 #endif /* CONFIG_CFS_BANDWIDTH */
 #endif /* CONFIG_FAIR_GROUP_SCHED */
+
+#if defined(CONFIG_QOS_SCHED)
+	struct list_head	qos_throttled_list;
+#endif
 };
 
 static inline int rt_bandwidth_enabled(void)
@@ -1403,6 +1411,10 @@ do {						\
 	flags = _raw_spin_rq_lock_irqsave(rq);	\
 } while (0)
 
+#ifdef CONFIG_QOS_SCHED
+void init_qos_hrtimer(int cpu);
+#endif
+
 #ifdef CONFIG_SCHED_SMT
 extern void __update_idle_core(struct rq *rq);
 
diff --git a/kernel/signal.c b/kernel/signal.c
index 09019017d6690a50e0a6931a20fcfdb6fb15c04b..28cddef3977869bea3b97fb756aaafa594a7f4d1 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1060,6 +1060,9 @@ static void complete_signal(int sig, struct task_struct *p, enum pid_type type)
 			signal->group_stop_count = 0;
 			t = p;
 			do {
+#ifdef CONFIG_QOS_SCHED
+				sched_move_offline_task(t);
+#endif
 				task_clear_jobctl_pending(t, JOBCTL_PENDING_MASK);
 				sigaddset(&t->pending.signal, SIGKILL);
 				signal_wake_up(t, 1);
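
For reviewers who want to exercise the new interface, the sketch below shows one way a co-location agent might mark a cgroup as offline through the new cpu.qos_level file and tune the two sysctls added by this patch. It is only an illustrative sketch, not part of the patch: it assumes the legacy cpu controller is mounted at /sys/fs/cgroup/cpu, that an "offline" group has already been created there, and that the sysctls appear under /proc/sys/kernel/ like the other sched_fair sysctls; the group name and the write_str() helper are hypothetical.

```c
/* Hypothetical user-space sketch: mark a cgroup offline via cpu.qos_level. */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

/* Write a short string to a file and report any error. */
static int write_str(const char *path, const char *val)
{
	int fd = open(path, O_WRONLY);

	if (fd < 0) {
		perror(path);
		return -1;
	}
	if (write(fd, val, strlen(val)) < 0) {
		perror(path);
		close(fd);
		return -1;
	}
	return close(fd);
}

int main(void)
{
	char pid[16];

	/* Mark the pre-created "offline" group as offline (qos_level = -1). */
	write_str("/sys/fs/cgroup/cpu/offline/cpu.qos_level", "-1");

	/* Move the current process into the offline group. */
	snprintf(pid, sizeof(pid), "%d", getpid());
	write_str("/sys/fs/cgroup/cpu/offline/cgroup.procs", pid);

	/* Tune the overload-detection and offline-wait knobs (assumed paths). */
	write_str("/proc/sys/kernel/qos_overload_detect_period_ms", "5000");
	write_str("/proc/sys/kernel/qos_offline_wait_interval_ms", "100");
	return 0;
}
```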
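One behavioural detail of cpu_qos_write() worth calling out is that the transition is one-way: a group can go from online (0) to offline (-1), but writing 0 back to an offline group returns -EINVAL, as does writing qos_level on a group without scheduling entities (the root). A minimal test sketch of that rule, under the same mount-point and group-name assumptions as above:

```c
/* Hypothetical test: the online -> offline transition is one-way. */
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

/* Write val to path; return 0 on success or -errno on failure. */
static int write_level(const char *path, const char *val)
{
	int fd = open(path, O_WRONLY);
	int ret;

	if (fd < 0)
		return -errno;
	ret = write(fd, val, strlen(val)) < 0 ? -errno : 0;
	close(fd);
	return ret;
}

int main(void)
{
	const char *lvl = "/sys/fs/cgroup/cpu/offline/cpu.qos_level";

	if (write_level(lvl, "-1"))		/* 0 -> -1 should succeed */
		perror("mark offline");
	if (write_level(lvl, "0") == -EINVAL)	/* -1 -> 0 is rejected */
		printf("offline group cannot be switched back online\n");
	return 0;
}
```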