diff --git a/include/linux/sched.h b/include/linux/sched.h
index fa83018137ce3bf73565d30bb4b25e5335dacea7..7ae971fee08941e8eacdd8893b756dfb7ed77389 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -522,7 +522,11 @@ struct sched_entity {
 #else
 	KABI_RESERVE(1)
 #endif
+#ifdef CONFIG_SCHED_STEAL
+	KABI_USE(2, int steal_task)
+#else
 	KABI_RESERVE(2)
+#endif
 	KABI_RESERVE(3)
 	KABI_RESERVE(4)
 };
diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h
index 09214349bddfcc98609be2e2ad64cb21a0030865..9f998be56bdd888583c719494d34e06f40dcedc4 100644
--- a/include/linux/sched/sysctl.h
+++ b/include/linux/sched/sysctl.h
@@ -39,6 +39,10 @@ extern int sysctl_sched_util_low_pct;
 extern int sysctl_sched_util_ratio;
 #endif
 
+#ifdef CONFIG_SCHED_STEAL
+extern int sysctl_sched_max_steal_count;
+#endif
+
 #ifdef CONFIG_QOS_SCHED_SMART_GRID
 extern unsigned int sysctl_smart_grid_strategy_ctrl;
 extern int sysctl_affinity_adjust_delay_ms;
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 770d3e7ace4fac29292f7883eb053ea0c789555c..18f0dc8275d39ab6fb05c24e49ea90eeb8aa77d0 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -8275,6 +8275,9 @@ void __init sched_init(void)
 #ifdef CONFIG_QOS_SCHED_SMT_EXPELLER
 		root_task_group.smt_expell = TG_SMT_EXPELL;
 #endif
+#ifdef CONFIG_SCHED_STEAL
+		root_task_group.steal_task = TG_STEAL_NO;
+#endif
 #ifdef CONFIG_RT_GROUP_SCHED
 		root_task_group.rt_se = (struct sched_rt_entity **)ptr;
 		ptr += nr_cpu_ids * sizeof(void **);
@@ -8636,13 +8639,6 @@ static inline int alloc_qos_sched_group(struct task_group *tg,
 #ifdef CONFIG_QOS_SCHED_SMT_EXPELLER
 	tg->smt_expell = parent->smt_expell;
 #endif
-	tg->qos_level_mutex = kzalloc(sizeof(struct mutex), GFP_KERNEL);
-
-	if (!tg->qos_level_mutex)
-		return 0;
-
-	mutex_init(tg->qos_level_mutex);
-
 	return 1;
 }
 
@@ -8719,6 +8715,20 @@ static void sched_free_group(struct task_group *tg)
 	kmem_cache_free(task_group_cache, tg);
 }
 
+#ifdef CONFIG_SCHED_STEAL
+static void sched_change_steal_group(struct task_struct *tsk, struct task_group *tg)
+{
+	struct sched_entity *se = &tsk->se;
+
+	se->steal_task = tg->steal_task;
+}
+
+static inline void tg_init_steal(struct task_group *tg, struct task_group *ptg)
+{
+	tg->steal_task = ptg->steal_task;
+}
+#endif
+
 #ifdef CONFIG_BPF_SCHED
 static inline void tg_init_tag(struct task_group *tg, struct task_group *ptg)
 {
@@ -8746,6 +8756,10 @@ struct task_group *sched_create_group(struct task_group *parent)
 	if (!alloc_rt_sched_group(tg, parent))
 		goto err;
 
+#ifdef CONFIG_SCHED_STEAL
+	tg_init_steal(tg, parent);
+#endif
+
 #ifdef CONFIG_BPF_SCHED
 	tg_init_tag(tg, parent);
 #endif
@@ -8821,6 +8835,10 @@ static void sched_change_group(struct task_struct *tsk, int type)
 	sched_change_qos_group(tsk, tg);
 #endif
 
+#ifdef CONFIG_SCHED_STEAL
+	sched_change_steal_group(tsk, tg);
+#endif
+
 #ifdef CONFIG_BPF_SCHED
 	/*
 	 * This function has cleared and restored the task status,
@@ -9727,7 +9745,6 @@ static int tg_change_scheduler(struct task_group *tg, void *data)
 	s64 qos_level = *(s64 *)data;
 	struct cgroup_subsys_state *css = &tg->css;
 
-	mutex_lock(tg->qos_level_mutex);
 	tg->qos_level = qos_level;
 	if (is_offline_level(qos_level))
 		policy = SCHED_IDLE;
@@ -9745,7 +9762,6 @@ static int tg_change_scheduler(struct task_group *tg, void *data)
 		sched_setscheduler(tsk, policy, &param);
 	}
 	css_task_iter_end(&it);
-	mutex_unlock(tg->qos_level_mutex);
 
 	return 0;
 }
@@ -9796,6 +9812,87 @@ static inline s64 cpu_qos_read(struct cgroup_subsys_state *css,
 }
 #endif
 
+#ifdef CONFIG_SCHED_STEAL
+static DEFINE_MUTEX(steal_mutex);
+
+static inline s64 cpu_steal_task_read(struct cgroup_subsys_state *css,
+				      struct cftype *cft)
+{
+	return css_tg(css)->steal_task;
+}
+
+void sched_setsteal(struct task_struct *tsk, s64 steal_task)
+{
+	struct sched_entity *se = &tsk->se;
+	int queued, running, queue_flags =
+		DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
+	struct rq_flags rf;
+	struct rq *rq;
+
+	if (se->steal_task == steal_task)
+		return;
+
+	rq = task_rq_lock(tsk, &rf);
+
+	running = task_current(rq, tsk);
+	queued = task_on_rq_queued(tsk);
+
+	update_rq_clock(rq);
+	if (queued)
+		dequeue_task(rq, tsk, queue_flags);
+	if (running)
+		put_prev_task(rq, tsk);
+
+	se->steal_task = steal_task;
+
+	if (queued)
+		enqueue_task(rq, tsk, queue_flags);
+	if (running)
+		set_next_task(rq, tsk);
+
+	task_rq_unlock(rq, tsk, &rf);
+}
+
+int tg_change_steal(struct task_group *tg, void *data)
+{
+	struct css_task_iter it;
+	struct task_struct *tsk;
+	s64 steal_task = *(s64 *)data;
+	struct cgroup_subsys_state *css = &tg->css;
+
+	tg->steal_task = steal_task;
+
+	css_task_iter_start(css, 0, &it);
+	while ((tsk = css_task_iter_next(&it)))
+		sched_setsteal(tsk, steal_task);
+	css_task_iter_end(&it);
+
+	return 0;
+}
+
+static int cpu_steal_task_write(struct cgroup_subsys_state *css,
+				struct cftype *cftype, s64 steal_task)
+{
+	struct task_group *tg = css_tg(css);
+
+	if (!group_steal_used())
+		return -EPERM;
+
+	if (steal_task < TG_STEAL_NO || steal_task > TG_STEAL)
+		return -EINVAL;
+
+	mutex_lock(&steal_mutex);
+
+	rcu_read_lock();
+	walk_tg_tree_from(tg, tg_change_steal, tg_nop, (void *)(&steal_task));
+	rcu_read_unlock();
+
+	mutex_unlock(&steal_mutex);
+
+	return 0;
+}
+#endif
+
 #ifdef CONFIG_BPF_SCHED
 void sched_settag(struct task_struct *tsk, s64 tag)
 {
@@ -9966,6 +10063,14 @@ static struct cftype cpu_legacy_files[] = {
 		.write_s64 = cpu_smt_expell_write,
 	},
 #endif
+#ifdef CONFIG_SCHED_STEAL
+	{
+		.name = "steal_task",
+		.flags = CFTYPE_NOT_ON_ROOT,
+		.read_s64 = cpu_steal_task_read,
+		.write_s64 = cpu_steal_task_write,
+	},
+#endif
 #ifdef CONFIG_BPF_SCHED
 	{
 		.name = "tag",
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 5233ba9fdc697d776246bee1b96e81d207e045fd..ff2d6dc59c14c9b4e40e87dcf39c3c06454173d8 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -594,6 +594,11 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
 	SEQ_printf(m, "  .%-30s: %d\n", "nr_spread_over",
 			cfs_rq->nr_spread_over);
 	SEQ_printf(m, "  .%-30s: %d\n", "nr_running", cfs_rq->nr_running);
+#ifdef CONFIG_SCHED_STEAL
+	SEQ_printf(m, "  .%-30s: %d\n", "h_nr_running", cfs_rq->h_nr_running);
+	SEQ_printf(m, "  .%-30s: %ld\n", "steal_h_nr_running",
+			cfs_rq->steal_h_nr_running);
+#endif
 	SEQ_printf(m, "  .%-30s: %ld\n", "load", cfs_rq->load.weight);
 #ifdef CONFIG_SMP
 	SEQ_printf(m, "  .%-30s: %lu\n", "load_avg",
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 0e47766bc5910890410bc4f70c21619680a07863..38841d1d640a744da5258be9ff0bc5c67b6d561c 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4459,6 +4459,14 @@ static inline void rq_idle_stamp_clear(struct rq *rq)
 }
 
 #ifdef CONFIG_SCHED_STEAL
+DEFINE_STATIC_KEY_FALSE(group_steal);
+
+static int __init group_steal_setup(char *__unused)
+{
+	static_branch_enable(&group_steal);
+	return 1;
+}
+__setup("group_steal", group_steal_setup);
 
 static inline bool steal_enabled(void)
 {
@@ -4470,14 +4478,30 @@ static inline bool steal_enabled(void)
 	return sched_feat(STEAL) && allow;
 }
 
+static inline bool group_steal_enabled(int steal_task)
+{
+	return group_steal_used() && is_tg_steal(steal_task);
+}
+
 static void overload_clear(struct rq *rq)
 {
 	struct sparsemask *overload_cpus;
 	unsigned long time;
+	bool need_clear = false;
 
 	if (!steal_enabled())
 		return;
 
+	if (!group_steal_used() && rq->cfs.h_nr_running >= 2)
+		return;
+
+	if (group_steal_used() &&
+	    (rq->cfs.h_nr_running < 2 || rq->cfs.steal_h_nr_running == 0))
+		need_clear = true;
+
+	if (!need_clear)
+		return;
+
 	time = schedstat_start_time();
 	rcu_read_lock();
 	overload_cpus = rcu_dereference(rq->cfs_overload_cpus);
@@ -4495,6 +4519,12 @@ static void overload_set(struct rq *rq)
 	if (!steal_enabled())
 		return;
 
+	if (rq->cfs.h_nr_running < 2)
+		return;
+
+	if (group_steal_used() && rq->cfs.steal_h_nr_running < 1)
+		return;
+
 	time = schedstat_start_time();
 	rcu_read_lock();
 	overload_cpus = rcu_dereference(rq->cfs_overload_cpus);
@@ -5278,13 +5308,15 @@ static int tg_throttle_down(struct task_group *tg, void *data)
 static bool throttle_cfs_rq(struct cfs_rq *cfs_rq)
 {
 	struct rq *rq = rq_of(cfs_rq);
-	unsigned int prev_nr = rq->cfs.h_nr_running;
 	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
 	struct sched_entity *se;
 	long task_delta, idle_task_delta, dequeue = 1;
 #ifdef CONFIG_QOS_SCHED_SMT_EXPELLER
 	long qos_idle_delta;
 #endif
+#ifdef CONFIG_SCHED_STEAL
+	long steal_delta;
+#endif
 
 	raw_spin_lock(&cfs_b->lock);
 	/* This will start the period timer if necessary */
@@ -5319,6 +5351,9 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq)
 #ifdef CONFIG_QOS_SCHED_SMT_EXPELLER
 	qos_idle_delta = cfs_rq->qos_idle_h_nr_running;
 #endif
+#ifdef CONFIG_SCHED_STEAL
+	steal_delta = cfs_rq->steal_h_nr_running;
+#endif
 	for_each_sched_entity(se) {
 		struct cfs_rq *qcfs_rq = cfs_rq_of(se);
@@ -5338,6 +5373,9 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq)
 #ifdef CONFIG_QOS_SCHED_SMT_EXPELLER
 		qcfs_rq->qos_idle_h_nr_running -= qos_idle_delta;
 #endif
+#ifdef CONFIG_SCHED_STEAL
+		qcfs_rq->steal_h_nr_running -= steal_delta;
+#endif
 
 		if (qcfs_rq->load.weight)
 			dequeue = 0;
@@ -5345,8 +5383,9 @@
 
 	if (!se) {
 		sub_nr_running(rq, task_delta);
-		if (prev_nr >= 2 && prev_nr - task_delta < 2)
-			overload_clear(rq);
+#ifdef CONFIG_SCHED_STEAL
+		overload_clear(rq);
+#endif
 	}
 
 	/*
@@ -5361,13 +5400,15 @@
 
 void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
 {
 	struct rq *rq = rq_of(cfs_rq);
-	unsigned int prev_nr = rq->cfs.h_nr_running;
 	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
 	struct sched_entity *se;
 	long task_delta, idle_task_delta;
 #ifdef CONFIG_QOS_SCHED_SMT_EXPELLER
 	long qos_idle_delta;
 #endif
+#ifdef CONFIG_SCHED_STEAL
+	long steal_delta;
+#endif
 
 	se = cfs_rq->tg->se[cpu_of(rq)];
@@ -5399,6 +5440,10 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
 #ifdef CONFIG_QOS_SCHED_SMT_EXPELLER
 	qos_idle_delta = cfs_rq->qos_idle_h_nr_running;
 #endif
+#ifdef CONFIG_SCHED_STEAL
+	steal_delta = cfs_rq->steal_h_nr_running;
+#endif
+
 	for_each_sched_entity(se) {
 		if (se->on_rq)
 			break;
@@ -5410,6 +5455,9 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
 #ifdef CONFIG_QOS_SCHED_SMT_EXPELLER
 		cfs_rq->qos_idle_h_nr_running += qos_idle_delta;
 #endif
+#ifdef CONFIG_SCHED_STEAL
+		cfs_rq->steal_h_nr_running += steal_delta;
+#endif
 
 		/* end evaluation on encountering a throttled cfs_rq */
 		if (cfs_rq_throttled(cfs_rq))
@@ -5427,6 +5475,9 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
 #ifdef CONFIG_QOS_SCHED_SMT_EXPELLER
 		cfs_rq->qos_idle_h_nr_running += qos_idle_delta;
 #endif
+#ifdef CONFIG_SCHED_STEAL
+		cfs_rq->steal_h_nr_running += steal_delta;
+#endif
 
 		/* end evaluation on encountering a throttled cfs_rq */
 		if (cfs_rq_throttled(cfs_rq))
@@ -5442,8 +5493,9 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
 
 	/* At this point se is NULL and we are at root level*/
 	add_nr_running(rq, task_delta);
-	if (prev_nr < 2 && prev_nr + task_delta >= 2)
-		overload_set(rq);
+#ifdef CONFIG_SCHED_STEAL
+	overload_set(rq);
+#endif
 
 unthrottle_throttle:
 	/*
@@ -6576,8 +6628,9 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 	int idle_h_nr_running = task_has_idle_policy(p);
 	int task_new = !(flags & ENQUEUE_WAKEUP);
-	unsigned int prev_nr = rq->cfs.h_nr_running;
-
+#ifdef CONFIG_SCHED_STEAL
+	bool tg_steal_enabled = group_steal_enabled(se->steal_task);
+#endif
 #ifdef CONFIG_QOS_SCHED_SMT_EXPELLER
 	int qos_idle_h_nr_running;
 
@@ -6612,6 +6665,10 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 #ifdef CONFIG_QOS_SCHED_SMT_EXPELLER
 		cfs_rq->qos_idle_h_nr_running += qos_idle_h_nr_running;
 #endif
+#ifdef CONFIG_SCHED_STEAL
+		if (tg_steal_enabled)
+			cfs_rq->steal_h_nr_running++;
+#endif
 
 		/* end evaluation on encountering a throttled cfs_rq */
 		if (cfs_rq_throttled(cfs_rq))
@@ -6632,6 +6689,10 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 #ifdef CONFIG_QOS_SCHED_SMT_EXPELLER
 		cfs_rq->qos_idle_h_nr_running += qos_idle_h_nr_running;
 #endif
+#ifdef CONFIG_SCHED_STEAL
+		if (tg_steal_enabled)
+			cfs_rq->steal_h_nr_running++;
+#endif
 
 		/* end evaluation on encountering a throttled cfs_rq */
 		if (cfs_rq_throttled(cfs_rq))
@@ -6647,8 +6708,9 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 
 	/* At this point se is NULL and we are at root level*/
 	add_nr_running(rq, 1);
-	if (prev_nr == 1)
-		overload_set(rq);
+#ifdef CONFIG_SCHED_STEAL
+	overload_set(rq);
+#endif
 
 	/*
 	 * Since new tasks are assigned an initial util_avg equal to
@@ -6707,9 +6769,10 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 	int task_sleep = flags & DEQUEUE_SLEEP;
 	int idle_h_nr_running = task_has_idle_policy(p);
-	unsigned int prev_nr = rq->cfs.h_nr_running;
 	bool was_sched_idle = sched_idle_rq(rq);
-
+#ifdef CONFIG_SCHED_STEAL
+	bool tg_steal_enabled = group_steal_enabled(se->steal_task);
+#endif
 #ifdef CONFIG_QOS_SCHED_SMT_EXPELLER
 	int qos_idle_h_nr_running = se->qos_idle ? 1 : 0;
@@ -6727,6 +6790,10 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 #ifdef CONFIG_QOS_SCHED_SMT_EXPELLER
 		cfs_rq->qos_idle_h_nr_running -= qos_idle_h_nr_running;
 #endif
+#ifdef CONFIG_SCHED_STEAL
+		if (tg_steal_enabled)
+			cfs_rq->steal_h_nr_running--;
+#endif
 
 		/* end evaluation on encountering a throttled cfs_rq */
 		if (cfs_rq_throttled(cfs_rq))
@@ -6759,6 +6826,10 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 #ifdef CONFIG_QOS_SCHED_SMT_EXPELLER
 		cfs_rq->qos_idle_h_nr_running -= qos_idle_h_nr_running;
 #endif
+#ifdef CONFIG_SCHED_STEAL
+		if (tg_steal_enabled)
+			cfs_rq->steal_h_nr_running--;
+#endif
 
 		/* end evaluation on encountering a throttled cfs_rq */
 		if (cfs_rq_throttled(cfs_rq))
@@ -6768,8 +6839,9 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 
 	/* At this point se is NULL and we are at root level*/
 	sub_nr_running(rq, 1);
-	if (prev_nr == 2)
-		overload_clear(rq);
+#ifdef CONFIG_SCHED_STEAL
+	overload_clear(rq);
+#endif
 
 	/* balance early to pull high priority tasks */
 	if (unlikely(!was_sched_idle && sched_idle_rq(rq)))
@@ -8543,10 +8615,12 @@
 static void throttle_qos_cfs_rq(struct cfs_rq *cfs_rq)
 {
 	struct rq *rq = rq_of(cfs_rq);
 	struct sched_entity *se;
-	unsigned int prev_nr = cfs_rq->h_nr_running;
 	long task_delta, idle_task_delta, dequeue = 1;
 #ifdef CONFIG_QOS_SCHED_SMT_EXPELLER
 	long qos_idle_delta;
+#endif
+#ifdef CONFIG_SCHED_STEAL
+	long steal_delta;
 #endif
 
 	se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
@@ -8560,6 +8634,10 @@ static void throttle_qos_cfs_rq(struct cfs_rq *cfs_rq)
 #ifdef CONFIG_QOS_SCHED_SMT_EXPELLER
 	qos_idle_delta = cfs_rq->qos_idle_h_nr_running;
 #endif
+#ifdef CONFIG_SCHED_STEAL
+	steal_delta = cfs_rq->steal_h_nr_running;
+#endif
+
 	for_each_sched_entity(se) {
 		struct cfs_rq *qcfs_rq = cfs_rq_of(se);
 		/* throttled entity or throttle-on-deactivate */
@@ -8578,6 +8656,9 @@ static void throttle_qos_cfs_rq(struct cfs_rq *cfs_rq)
 #ifdef CONFIG_QOS_SCHED_SMT_EXPELLER
 		qcfs_rq->qos_idle_h_nr_running -= qos_idle_delta;
 #endif
+#ifdef CONFIG_SCHED_STEAL
+		qcfs_rq->steal_h_nr_running -= steal_delta;
+#endif
 
 		if (qcfs_rq->load.weight)
 			dequeue = 0;
@@ -8585,9 +8666,9 @@
 
 	if (!se) {
 		sub_nr_running(rq, task_delta);
-		if (prev_nr >= 2 && prev_nr - task_delta < 2)
-			overload_clear(rq);
-
+#ifdef CONFIG_SCHED_STEAL
+		overload_clear(rq);
+#endif
 	}
 
 	if (!qos_timer_is_activated(cpu_of(rq)))
@@ -8603,11 +8684,13 @@
 static void unthrottle_qos_cfs_rq(struct cfs_rq *cfs_rq)
 {
 	struct rq *rq = rq_of(cfs_rq);
 	struct sched_entity *se;
-	unsigned int prev_nr = cfs_rq->h_nr_running;
 	long task_delta, idle_task_delta;
 #ifdef CONFIG_QOS_SCHED_SMT_EXPELLER
 	long qos_idle_delta;
 #endif
+#ifdef CONFIG_SCHED_STEAL
+	long steal_delta;
+#endif
 
 	se = cfs_rq->tg->se[cpu_of(rq)];
@@ -8632,6 +8715,10 @@ static void unthrottle_qos_cfs_rq(struct cfs_rq *cfs_rq)
 #ifdef CONFIG_QOS_SCHED_SMT_EXPELLER
 	qos_idle_delta = cfs_rq->qos_idle_h_nr_running;
 #endif
+#ifdef CONFIG_SCHED_STEAL
+	steal_delta = cfs_rq->steal_h_nr_running;
+#endif
+
 	for_each_sched_entity(se) {
 		if (se->on_rq)
 			break;
@@ -8644,6 +8731,9 @@ static void unthrottle_qos_cfs_rq(struct cfs_rq *cfs_rq)
 #ifdef CONFIG_QOS_SCHED_SMT_EXPELLER
 		cfs_rq->qos_idle_h_nr_running += qos_idle_delta;
 #endif
+#ifdef CONFIG_SCHED_STEAL
+		cfs_rq->steal_h_nr_running += steal_delta;
+#endif
 
 		if (cfs_rq_throttled(cfs_rq))
 			goto unthrottle_throttle;
@@ -8660,6 +8750,10 @@ static void unthrottle_qos_cfs_rq(struct cfs_rq *cfs_rq)
 #ifdef CONFIG_QOS_SCHED_SMT_EXPELLER
 		cfs_rq->qos_idle_h_nr_running += qos_idle_delta;
 #endif
+#ifdef CONFIG_SCHED_STEAL
+		cfs_rq->steal_h_nr_running += steal_delta;
+#endif
+
 		/* end evaluation on encountering a throttled cfs_rq */
 		if (cfs_rq_throttled(cfs_rq))
 			goto unthrottle_throttle;
@@ -8673,8 +8767,9 @@ static void unthrottle_qos_cfs_rq(struct cfs_rq *cfs_rq)
 	}
 
 	add_nr_running(rq, task_delta);
-	if (prev_nr < 2 && prev_nr + task_delta >= 2)
-		overload_set(rq);
+#ifdef CONFIG_SCHED_STEAL
+	overload_set(rq);
+#endif
 
 unthrottle_throttle:
 	/*
@@ -9842,10 +9937,14 @@ static bool can_migrate_task_llc(struct task_struct *p, struct rq *rq,
 				 struct rq *dst_rq)
 {
 	int dst_cpu = dst_rq->cpu;
+	struct task_group *tg = task_group(p);
 
 	lockdep_assert_rq_held(rq);
 
-	if (throttled_lb_pair(task_group(p), cpu_of(rq), dst_cpu))
+	if (group_steal_used() && !is_tg_steal(tg->steal_task))
+		return false;
+
+	if (throttled_lb_pair(tg, cpu_of(rq), dst_cpu))
 		return false;
 
 	if (!cpumask_test_cpu(dst_cpu, p->cpus_ptr)) {
@@ -13084,6 +13183,7 @@ void trigger_load_balance(struct rq *rq)
 }
 
 #ifdef CONFIG_SCHED_STEAL
+int sysctl_sched_max_steal_count = 32;
 /*
  * Search the runnable tasks in @cfs_rq in order of next to run, and find
  * the first one that can be migrated to @dst_rq.  @cfs_rq is locked on entry.
@@ -13095,14 +13195,20 @@ detach_next_task(struct cfs_rq *cfs_rq, struct rq *dst_rq)
 	int dst_cpu = dst_rq->cpu;
 	struct task_struct *p;
 	struct rq *rq = rq_of(cfs_rq);
+	int count = 1;
 
 	lockdep_assert_rq_held(rq_of(cfs_rq));
 
 	list_for_each_entry_reverse(p, &rq->cfs_tasks, se.group_node) {
+		if (count > sysctl_sched_max_steal_count)
+			break;
+
 		if (can_migrate_task_llc(p, rq, dst_rq)) {
 			detach_task(p, rq, dst_cpu);
 			return p;
 		}
+
+		count++;
 	}
 	return NULL;
 }
@@ -13122,10 +13228,14 @@ static int steal_from(struct rq *dst_rq, struct rq_flags *dst_rf, bool *locked,
 	int stolen = 0;
 	int dst_cpu = dst_rq->cpu;
 	struct rq *src_rq = cpu_rq(src_cpu);
+	bool tg_used = group_steal_used();
 
 	if (dst_cpu == src_cpu || src_rq->cfs.h_nr_running < 2)
 		return 0;
 
+	if (tg_used && src_rq->cfs.steal_h_nr_running < 1)
+		return 0;
+
 	if (*locked) {
 		rq_unpin_lock(dst_rq, dst_rf);
 		raw_spin_rq_unlock(dst_rq);
@@ -13134,7 +13244,8 @@ static int steal_from(struct rq *dst_rq, struct rq_flags *dst_rf, bool *locked,
 
 	rq_lock_irqsave(src_rq, &rf);
 	update_rq_clock(src_rq);
-	if (src_rq->cfs.h_nr_running < 2 || !cpu_active(src_cpu))
+	if (!cpu_active(src_cpu) || src_rq->cfs.h_nr_running < 2 ||
+	    (tg_used && src_rq->cfs.steal_h_nr_running < 1))
 		p = NULL;
 	else
 		p = detach_next_task(&src_rq->cfs, dst_rq);
@@ -13691,9 +13802,6 @@ void free_fair_sched_group(struct task_group *tg)
 		kfree(tg->se[i]);
 	}
 
-#ifdef CONFIG_QOS_SCHED
-	kfree(tg->qos_level_mutex);
-#endif
 	kfree(tg->cfs_rq);
 	kfree(tg->se);
 }
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 14d48f6380fa533411c64753a177066f8fca1d8e..fe6342305b0f3ca5efef5ee19dfcf7fe032e2d83 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -402,7 +402,6 @@ struct cfs_bandwidth {
 #endif
 };
 
-
 #ifdef CONFIG_QOS_SCHED_SMART_GRID
 #define AD_LEVEL_MAX	8
 
@@ -497,11 +496,13 @@ struct task_group {
 #else
 	KABI_RESERVE(2)
 #endif
-#ifdef CONFIG_QOS_SCHED
-	KABI_USE(3, struct mutex *qos_level_mutex)
+
+#ifdef CONFIG_SCHED_STEAL
+	KABI_USE(3, int steal_task)
 #else
 	KABI_RESERVE(3)
 #endif
+
 #if defined(CONFIG_QOS_SCHED_SMART_GRID) && !defined(__GENKSYMS__)
 	KABI_USE(4, struct auto_affinity *auto_affinity)
 #else
@@ -509,6 +510,18 @@ struct task_group {
 #endif
 };
 
+#ifdef CONFIG_SCHED_STEAL
+enum tg_steal_task {
+	TG_STEAL_NO = 0,
+	TG_STEAL = 1,
+};
+
+static inline bool is_tg_steal(int steal_task)
+{
+	return steal_task == TG_STEAL;
+}
+#endif
+
 #ifdef CONFIG_FAIR_GROUP_SCHED
 #define ROOT_TASK_GROUP_LOAD	NICE_0_LOAD
 
@@ -710,12 +723,13 @@ struct cfs_rq {
 	unsigned int		forceidle_seq;
 	KABI_FILL_HOLE(unsigned int kabi_hole)
 	u64			min_vruntime_fi;
-#elif defined CONFIG_QOS_SCHED_SMT_EXPELLER && !defined(__GENKSYMS__)
+#elif (defined(CONFIG_QOS_SCHED_SMT_EXPELLER) || \
+	defined(CONFIG_SCHED_STEAL)) && !defined(__GENKSYMS__)
 	union {
 		unsigned int		qos_idle_h_nr_running; /* qos_level:-1 */
 		unsigned long		qos_idle_h_nr_running_padding;
 	};
-	KABI_FILL_HOLE(unsigned long kabi_hole)
+	unsigned long		steal_h_nr_running;
 #else
 	KABI_RESERVE(3)
 	KABI_RESERVE(4)
@@ -1779,6 +1793,15 @@ extern void set_sched_cluster(void);
 static inline void set_sched_cluster(void) { }
 #endif
 
+#ifdef CONFIG_SCHED_STEAL
+DECLARE_STATIC_KEY_FALSE(group_steal);
+
+static inline bool group_steal_used(void)
+{
+	return static_branch_unlikely(&group_steal);
+}
+#endif
+
 #ifdef CONFIG_NUMA
 #ifdef CONFIG_SCHED_STEAL
 extern struct static_key_true sched_steal_allow;
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index e7413d6dd75b88ee50d0ef655ed15598b582fd86..4bf575e4e7fcc56f090266b81c866e624f60e00f 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -1885,7 +1885,6 @@ static void init_numa_topology_type(void)
 #ifdef CONFIG_SCHED_STEAL
 DEFINE_STATIC_KEY_TRUE(sched_steal_allow);
 static int sched_steal_node_limit;
-#define SCHED_STEAL_NODE_LIMIT_DEFAULT	2
 
 static int __init steal_node_limit_setup(char *buf)
 {
@@ -1900,7 +1899,7 @@ static void check_node_limit(void)
 	int n = num_possible_nodes();
 
 	if (sched_steal_node_limit == 0)
-		sched_steal_node_limit = SCHED_STEAL_NODE_LIMIT_DEFAULT;
+		sched_steal_node_limit = n;
 	if (n > sched_steal_node_limit) {
 		static_branch_disable(&sched_steal_allow);
 		pr_debug("Suppressing sched STEAL. To enable, reboot with sched_steal_node_limit=%d", n);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index bd7b17be9ba44449b80d319651b32d3ad63932b8..0d4d83da4b302ce8c1b81d7215cbf801cfc78f35 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -131,6 +131,9 @@ static int hundred_thousand = 100000;
 #ifdef CONFIG_PERF_EVENTS
 static int six_hundred_forty_kb = 640 * 1024;
 #endif
+#ifdef CONFIG_SCHED_STEAL
+static int max_steal_count = 1024;
+#endif
 
 /* this is needed for the proc_doulongvec_minmax of vm_dirty_bytes */
 static unsigned long dirty_bytes_min = 2 * PAGE_SIZE;
@@ -2827,6 +2830,17 @@ static struct ctl_table kern_table[] = {
 		.extra1		= SYSCTL_ZERO,
 	},
 #endif
+#ifdef CONFIG_SCHED_STEAL
+	{
+		.procname	= "sched_max_steal_count",
+		.data		= &sysctl_sched_max_steal_count,
+		.maxlen		= sizeof(sysctl_sched_max_steal_count),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= SYSCTL_ONE,
+		.extra2		= &max_steal_count,
+	},
+#endif
 #ifdef CONFIG_QOS_SCHED_SMART_GRID
 	{
 		.procname	= "smart_grid_strategy_ctrl",