From 493f855d8a63755dc327faa0204ae4a8b757b340 Mon Sep 17 00:00:00 2001 From: Xia Fukun Date: Tue, 5 Sep 2023 10:58:26 +0000 Subject: [PATCH 1/8] sched: Introduce qos smt expeller for co-location hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I7YRZR CVE: NA -------------------------------- We introduce the qos smt expeller, which lets online tasks expel offline tasks on the smt sibling cpus, and exclusively occupy CPU resources. In this way we are able to improve QOS of online tasks in co-location. Signed-off-by: Guan Jing Reviewed-by: Chen Hui Signed-off-by: Zheng Zengkai Signed-off-by: Xia Fukun --- init/Kconfig | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/init/Kconfig b/init/Kconfig index 87941b608911..f57943f7b6d0 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1001,6 +1001,15 @@ config QOS_SCHED If in doubt, say N. +config QOS_SCHED_SMT_EXPELLER + bool "Qos smt expeller" + depends on SCHED_SMT + depends on QOS_SCHED + default n + help + This feature enables online tasks to expel offline tasks + on the smt sibling cpus, and exclusively occupy CPU resources. + config FAIR_GROUP_SCHED bool "Group scheduling for SCHED_OTHER" depends on CGROUP_SCHED -- Gitee From 37d0f2389d0cbc549bf64348d647d4f24ccb3a3f Mon Sep 17 00:00:00 2001 From: Xia Fukun Date: Tue, 5 Sep 2023 10:58:27 +0000 Subject: [PATCH 2/8] sched: Implement the function of qos smt expeller MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I7YRZR CVE: NA -------------------------------- We implement the function of qos smt expeller by the following two points: a) when online tasks and offline tasks are running on the same physical cpu, online tasks will send ipi to expel offline tasks on the smt sibling cpus. b) when online tasks are running, the smt sibling cpus will not allow offline tasks to be selected. 
Adapted to openEuler-6.4. Signed-off-by: Guan Jing Reviewed-by: Chen Hui Signed-off-by: Zheng Zengkai Signed-off-by: Xia Fukun --- include/linux/sched.h | 7 ++ kernel/sched/fair.c | 189 +++++++++++++++++++++++++++++++++++++++++- kernel/sched/sched.h | 5 ++ 3 files changed, 199 insertions(+), 2 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index 73ddff132493..b97ca0a9fd08 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2012,9 +2012,16 @@ extern char *__get_task_comm(char *to, size_t len, struct task_struct *tsk); __get_task_comm(buf, sizeof(buf), tsk); \ }) +#ifdef CONFIG_QOS_SCHED_SMT_EXPELLER +void qos_smt_check_need_resched(void); +#endif + #ifdef CONFIG_SMP static __always_inline void scheduler_ipi(void) { +#ifdef CONFIG_QOS_SCHED_SMT_EXPELLER + qos_smt_check_need_resched(); +#endif /* * Fold TIF_NEED_RESCHED into the preempt_count; anybody setting * TIF_NEED_RESCHED remotely (for the first time) will also send diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index d9af04551788..3626c6af3556 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -61,6 +61,10 @@ #include #endif +#ifdef CONFIG_QOS_SCHED_SMT_EXPELLER +#include +#endif + /* * Targeted preemption latency for CPU-bound tasks: * @@ -190,6 +194,10 @@ unsigned int sysctl_offline_wait_interval = 100; /* in ms */ static int unthrottle_qos_cfs_rqs(int cpu); #endif +#ifdef CONFIG_QOS_SCHED_SMT_EXPELLER +static DEFINE_PER_CPU(int, qos_smt_status); +#endif + #ifdef CONFIG_CFS_BANDWIDTH /* * Amount of runtime to allocate from global (tg) to local (per-cfs_rq) pool @@ -8523,6 +8531,131 @@ static void qos_schedule_throttle(struct task_struct *p) #endif +#ifdef CONFIG_QOS_SCHED_SMT_EXPELLER +static bool qos_smt_check_siblings_status(int this_cpu) +{ + int cpu; + + if (!sched_smt_active()) + return false; + + for_each_cpu(cpu, cpu_smt_mask(this_cpu)) { + if (cpu == this_cpu) + continue; + + if (per_cpu(qos_smt_status, cpu) == QOS_LEVEL_ONLINE) + return true; 
+ } + + return false; +} + +static bool qos_smt_expelled(int this_cpu) +{ + /* + * The qos_smt_status of siblings cpu is online, and current cpu only has + * offline tasks enqueued, there is not suitable task, + * so pick_next_task_fair return null. + */ + if (qos_smt_check_siblings_status(this_cpu) && sched_idle_cpu(this_cpu)) + return true; + + return false; +} + +static bool qos_smt_update_status(struct task_struct *p) +{ + int status = QOS_LEVEL_OFFLINE; + + if (p != NULL && task_group(p)->qos_level >= QOS_LEVEL_ONLINE) + status = QOS_LEVEL_ONLINE; + + if (__this_cpu_read(qos_smt_status) == status) + return false; + + __this_cpu_write(qos_smt_status, status); + + return true; +} + +static void qos_smt_send_ipi(int this_cpu) +{ + int cpu; + struct rq *rq = NULL; + + if (!sched_smt_active()) + return; + + for_each_cpu(cpu, cpu_smt_mask(this_cpu)) { + if (cpu == this_cpu) + continue; + + rq = cpu_rq(cpu); + + /* + * There are two cases where current don't need to send scheduler_ipi: + * a) The qos_smt_status of siblings cpu is online; + * b) The cfs.h_nr_running of siblings cpu is 0. + */ + if (per_cpu(qos_smt_status, cpu) == QOS_LEVEL_ONLINE || + rq->cfs.h_nr_running == 0) + continue; + + smp_send_reschedule(cpu); + } +} + +static void qos_smt_expel(int this_cpu, struct task_struct *p) +{ + if (qos_smt_update_status(p)) + qos_smt_send_ipi(this_cpu); +} + +static bool _qos_smt_check_need_resched(int this_cpu, struct rq *rq) +{ + int cpu; + + if (!sched_smt_active()) + return false; + + for_each_cpu(cpu, cpu_smt_mask(this_cpu)) { + if (cpu == this_cpu) + continue; + + /* + * There are two cases rely on the set need_resched to drive away + * offline task: + * a) The qos_smt_status of siblings cpu is online, the task of current cpu is offline; + * b) The qos_smt_status of siblings cpu is offline, the task of current cpu is idle, + * and current cpu only has SCHED_IDLE tasks enqueued. 
+ */ + if (per_cpu(qos_smt_status, cpu) == QOS_LEVEL_ONLINE && + task_group(current)->qos_level < QOS_LEVEL_ONLINE) + return true; + + if (per_cpu(qos_smt_status, cpu) == QOS_LEVEL_OFFLINE && + rq->curr == rq->idle && sched_idle_cpu(this_cpu)) + return true; + } + + return false; +} + +void qos_smt_check_need_resched(void) +{ + struct rq *rq = this_rq(); + int this_cpu = rq->cpu; + + if (test_tsk_need_resched(current)) + return; + + if (_qos_smt_check_need_resched(this_cpu, rq)) { + set_tsk_need_resched(current); + set_preempt_need_resched(); + } +} +#endif + #ifdef CONFIG_SMP static struct task_struct *pick_task_fair(struct rq *rq) { @@ -8563,14 +8696,30 @@ pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf struct sched_entity *se; struct task_struct *p; int new_tasks; +#ifdef CONFIG_QOS_SCHED_SMT_EXPELLER + int this_cpu = rq->cpu; +#endif again: +#ifdef CONFIG_QOS_SCHED_SMT_EXPELLER + if (qos_smt_expelled(this_cpu)) { + __this_cpu_write(qos_smt_status, QOS_LEVEL_OFFLINE); + return NULL; + } +#endif + if (!sched_fair_runnable(rq)) goto idle; #ifdef CONFIG_FAIR_GROUP_SCHED - if (!prev || prev->sched_class != &fair_sched_class) - goto simple; + if (!prev || prev->sched_class != &fair_sched_class) { +#ifdef CONFIG_QOS_SCHED + if (cfs_rq->idle_h_nr_running != 0 && rq->online) + goto qos_simple; + else +#endif + goto simple; + } /* * Because of the set_next_buddy() in dequeue_task_fair() it is rather @@ -8654,6 +8803,34 @@ pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf } goto done; + +#ifdef CONFIG_QOS_SCHED +qos_simple: + if (prev) + put_prev_task(rq, prev); + + do { + se = pick_next_entity(cfs_rq, NULL); + if (check_qos_cfs_rq(group_cfs_rq(se))) { + cfs_rq = &rq->cfs; + if (!cfs_rq->nr_running) + goto idle; + continue; + } + + cfs_rq = group_cfs_rq(se); + } while (cfs_rq); + + p = task_of(se); + + while (se) { + set_next_entity(cfs_rq_of(se), se); + se = parent_entity(se); + } + + goto done; +#endif + 
simple: #endif if (prev) @@ -8686,6 +8863,10 @@ done: __maybe_unused; qos_schedule_throttle(p); #endif +#ifdef CONFIG_QOS_SCHED_SMT_EXPELLER + qos_smt_expel(this_cpu, p); +#endif + return p; idle: @@ -8719,6 +8900,10 @@ done: __maybe_unused; */ update_idle_rq_clock_pelt(rq); +#ifdef CONFIG_QOS_SCHED_SMT_EXPELLER + qos_smt_expel(this_cpu, NULL); +#endif + return NULL; } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index f4e65a5e3009..19069d2b050d 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1412,6 +1412,11 @@ do { \ } while (0) #ifdef CONFIG_QOS_SCHED +enum task_qos_level { + QOS_LEVEL_OFFLINE = -1, + QOS_LEVEL_ONLINE = 0, + QOS_LEVEL_MAX +}; void init_qos_hrtimer(int cpu); #endif -- Gitee From 66bf52e07235bb5736c410e89246bb61bae4ed29 Mon Sep 17 00:00:00 2001 From: Xia Fukun Date: Tue, 5 Sep 2023 10:58:28 +0000 Subject: [PATCH 3/8] sched: Add statistics for qos smt expeller hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I7YRZR CVE: NA -------------------------------- We have added two statistics for qos smt expeller: a) nr_qos_smt_send_ipi:the times of ipi which online task expel offline tasks; b) nr_qos_smt_expelled:the statistics that offline task will not be picked times. Adapted to 6.4 kernel. 
Signed-off-by: Guan Jing Reviewed-by: Chen Hui Signed-off-by: Zheng Zengkai Signed-off-by: Xia Fukun --- include/linux/sched.h | 6 ++++++ kernel/sched/debug.c | 6 ++++++ kernel/sched/fair.c | 2 ++ 3 files changed, 14 insertions(+) diff --git a/include/linux/sched.h b/include/linux/sched.h index b97ca0a9fd08..86278255c494 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -549,6 +549,12 @@ struct sched_statistics { u64 nr_wakeups_preferred_cpus; u64 nr_wakeups_force_preferred_cpus; #endif + +#if defined(CONFIG_QOS_SCHED_SMT_EXPELLER) && !defined(__GENKSYMS__) + u64 nr_qos_smt_send_ipi; + u64 nr_qos_smt_expelled; +#endif + #endif /* CONFIG_SCHEDSTATS */ } ____cacheline_aligned; diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 940e191d7722..37e71fdd879d 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -1049,6 +1049,12 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns, } #endif +#ifdef CONFIG_QOS_SCHED_SMT_EXPELLER + P_SCHEDSTAT(nr_qos_smt_send_ipi); + P_SCHEDSTAT(nr_qos_smt_expelled); +#endif + + avg_atom = p->se.sum_exec_runtime; if (nr_switches) avg_atom = div64_ul(avg_atom, nr_switches); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 3626c6af3556..603625e1c974 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -8601,6 +8601,7 @@ static void qos_smt_send_ipi(int this_cpu) rq->cfs.h_nr_running == 0) continue; + schedstat_inc(current->stats.nr_qos_smt_send_ipi); smp_send_reschedule(cpu); } } @@ -8704,6 +8705,7 @@ pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf #ifdef CONFIG_QOS_SCHED_SMT_EXPELLER if (qos_smt_expelled(this_cpu)) { __this_cpu_write(qos_smt_status, QOS_LEVEL_OFFLINE); + schedstat_inc(rq->curr->stats.nr_qos_smt_expelled); return NULL; } #endif -- Gitee From f6f61e7c1adaea99aa6862e2a9357bc7f2138bd5 Mon Sep 17 00:00:00 2001 From: Xia Fukun Date: Tue, 5 Sep 2023 10:58:29 +0000 Subject: [PATCH 4/8] sched: Add tracepoint for qos smt 
expeller hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I7YRZR CVE: NA -------------------------------- There are two caces that we add tracepoint: a) while online task of sibling cpu is running, it is running that offline task of local cpu will be set TIF_NEED_RESCHED; b) while online task of sibling cpu is running, it will expell that next picked offline task of local cpu. Signed-off-by: Guan Jing Reviewed-by: Chen Hui Signed-off-by: Zheng Zengkai Signed-off-by: Xia Fukun --- include/trace/events/sched.h | 55 ++++++++++++++++++++++++++++++++++++ kernel/sched/fair.c | 9 ++++-- 2 files changed, 62 insertions(+), 2 deletions(-) diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h index fbb99a61f714..4bafb70dfafc 100644 --- a/include/trace/events/sched.h +++ b/include/trace/events/sched.h @@ -268,6 +268,61 @@ TRACE_EVENT(sched_switch, __entry->next_comm, __entry->next_pid, __entry->next_prio) ); +#ifdef CONFIG_QOS_SCHED_SMT_EXPELLER +/* + * Tracepoint for a offline task being resched: + */ +TRACE_EVENT(sched_qos_smt_expel, + + TP_PROTO(struct task_struct *sibling_p, int qos_smt_status), + + TP_ARGS(sibling_p, qos_smt_status), + + TP_STRUCT__entry( + __array( char, sibling_comm, TASK_COMM_LEN ) + __field( pid_t, sibling_pid ) + __field( int, sibling_qos_status ) + __field( int, sibling_cpu ) + ), + + TP_fast_assign( + memcpy(__entry->sibling_comm, sibling_p->comm, TASK_COMM_LEN); + __entry->sibling_pid = sibling_p->pid; + __entry->sibling_qos_status = qos_smt_status; + __entry->sibling_cpu = task_cpu(sibling_p); + ), + + TP_printk("sibling_comm=%s sibling_pid=%d sibling_qos_status=%d sibling_cpu=%d", + __entry->sibling_comm, __entry->sibling_pid, __entry->sibling_qos_status, + __entry->sibling_cpu) +); + +/* + * Tracepoint for a offline task being expelled: + */ +TRACE_EVENT(sched_qos_smt_expelled, + + TP_PROTO(struct task_struct *p, int qos_smt_status), + + TP_ARGS(p, qos_smt_status), + + TP_STRUCT__entry( + 
__array( char, comm, TASK_COMM_LEN ) + __field( pid_t, pid ) + __field( int, qos_status ) + ), + + TP_fast_assign( + memcpy(__entry->comm, p->comm, TASK_COMM_LEN); + __entry->pid = p->pid; + __entry->qos_status = qos_smt_status; + ), + + TP_printk("comm=%s pid=%d qos_status=%d", + __entry->comm, __entry->pid, __entry->qos_status) +); +#endif + /* * Tracepoint for a task being migrated: */ diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 603625e1c974..799ffa8eea89 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -8631,12 +8631,16 @@ static bool _qos_smt_check_need_resched(int this_cpu, struct rq *rq) * and current cpu only has SCHED_IDLE tasks enqueued. */ if (per_cpu(qos_smt_status, cpu) == QOS_LEVEL_ONLINE && - task_group(current)->qos_level < QOS_LEVEL_ONLINE) + task_group(current)->qos_level < QOS_LEVEL_ONLINE) { + trace_sched_qos_smt_expel(cpu_curr(cpu), per_cpu(qos_smt_status, cpu)); return true; + } if (per_cpu(qos_smt_status, cpu) == QOS_LEVEL_OFFLINE && - rq->curr == rq->idle && sched_idle_cpu(this_cpu)) + rq->curr == rq->idle && sched_idle_cpu(this_cpu)) { + trace_sched_qos_smt_expel(cpu_curr(cpu), per_cpu(qos_smt_status, cpu)); return true; + } } return false; @@ -8706,6 +8710,7 @@ pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf if (qos_smt_expelled(this_cpu)) { __this_cpu_write(qos_smt_status, QOS_LEVEL_OFFLINE); schedstat_inc(rq->curr->stats.nr_qos_smt_expelled); + trace_sched_qos_smt_expelled(rq->curr, per_cpu(qos_smt_status, this_cpu)); return NULL; } #endif -- Gitee From 8c78eeca4574ce0a2391acff7325454367985e39 Mon Sep 17 00:00:00 2001 From: Xia Fukun Date: Tue, 5 Sep 2023 10:58:30 +0000 Subject: [PATCH 5/8] config: enable CONFIG_QOS_SCHED_SMT_EXPELLER by hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I52611 CVE: NA Signed-off-by: Guan Jing Reviewed-by: Chen Hui Signed-off-by: Zheng Zengkai Signed-off-by: Xia Fukun --- 
arch/arm64/configs/openeuler_defconfig | 1 + arch/x86/configs/openeuler_defconfig | 1 + 2 files changed, 2 insertions(+) diff --git a/arch/arm64/configs/openeuler_defconfig b/arch/arm64/configs/openeuler_defconfig index d187c7c6b84b..c1079475eb47 100644 --- a/arch/arm64/configs/openeuler_defconfig +++ b/arch/arm64/configs/openeuler_defconfig @@ -163,6 +163,7 @@ CONFIG_BLK_CGROUP=y CONFIG_CGROUP_WRITEBACK=y CONFIG_CGROUP_SCHED=y CONFIG_FAIR_GROUP_SCHED=y +CONFIG_QOS_SCHED_SMT_EXPELLER=y CONFIG_CFS_BANDWIDTH=y CONFIG_RT_GROUP_SCHED=y CONFIG_SCHED_MM_CID=y diff --git a/arch/x86/configs/openeuler_defconfig b/arch/x86/configs/openeuler_defconfig index 9c216e1104ef..2ce540ce01ec 100644 --- a/arch/x86/configs/openeuler_defconfig +++ b/arch/x86/configs/openeuler_defconfig @@ -186,6 +186,7 @@ CONFIG_BLK_CGROUP=y CONFIG_CGROUP_WRITEBACK=y CONFIG_CGROUP_SCHED=y CONFIG_FAIR_GROUP_SCHED=y +CONFIG_QOS_SCHED_SMT_EXPELLER=y CONFIG_CFS_BANDWIDTH=y CONFIG_QOS_SCHED=y CONFIG_RT_GROUP_SCHED=y -- Gitee From 064fcd12ef369d0beeb7aa5c09d156819f472c7b Mon Sep 17 00:00:00 2001 From: Xia Fukun Date: Tue, 5 Sep 2023 10:58:32 +0000 Subject: [PATCH 6/8] sched/fair: Start tracking qos_offline tasks count in cfs_rq hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I7YRZR ------------------------------- Track how many tasks are present with qos_offline_policy in each cfs_rq. This will be used by later commits. 
Signed-off-by: Guan Jing Signed-off-by: Xia Fukun --- kernel/sched/fair.c | 82 +++++++++++++++++++++++++++++++++++++++++--- kernel/sched/sched.h | 22 ++++++++++++ 2 files changed, 99 insertions(+), 5 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 799ffa8eea89..cf439444b073 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5461,6 +5461,9 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq) struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); struct sched_entity *se; long task_delta, idle_task_delta, dequeue = 1; +#ifdef CONFIG_QOS_SCHED_SMT_EXPELLER + long qos_idle_delta; +#endif raw_spin_lock(&cfs_b->lock); /* This will start the period timer if necessary */ @@ -5492,6 +5495,10 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq) task_delta = cfs_rq->h_nr_running; idle_task_delta = cfs_rq->idle_h_nr_running; +#ifdef CONFIG_QOS_SCHED_SMT_EXPELLER + qos_idle_delta = cfs_rq->qos_idle_h_nr_running; +#endif + for_each_sched_entity(se) { struct cfs_rq *qcfs_rq = cfs_rq_of(se); /* throttled entity or throttle-on-deactivate */ @@ -5505,6 +5512,9 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq) qcfs_rq->h_nr_running -= task_delta; qcfs_rq->idle_h_nr_running -= idle_task_delta; +#ifdef CONFIG_QOS_SCHED_SMT_EXPELLER + qcfs_rq->qos_idle_h_nr_running -= qos_idle_delta; +#endif if (qcfs_rq->load.weight) { /* Avoid re-evaluating load for this entity: */ @@ -5527,6 +5537,9 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq) qcfs_rq->h_nr_running -= task_delta; qcfs_rq->idle_h_nr_running -= idle_task_delta; +#ifdef CONFIG_QOS_SCHED_SMT_EXPELLER + qcfs_rq->qos_idle_h_nr_running -= qos_idle_delta; +#endif } /* At this point se is NULL and we are at root level*/ @@ -5548,6 +5561,9 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); struct sched_entity *se; long task_delta, idle_task_delta; +#ifdef CONFIG_QOS_SCHED_SMT_EXPELLER + long qos_idle_delta; +#endif se = 
cfs_rq->tg->se[cpu_of(rq)]; @@ -5587,6 +5603,9 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) task_delta = cfs_rq->h_nr_running; idle_task_delta = cfs_rq->idle_h_nr_running; +#ifdef CONFIG_QOS_SCHED_SMT_EXPELLER + qos_idle_delta = cfs_rq->qos_idle_h_nr_running; +#endif for_each_sched_entity(se) { struct cfs_rq *qcfs_rq = cfs_rq_of(se); @@ -5599,6 +5618,9 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) qcfs_rq->h_nr_running += task_delta; qcfs_rq->idle_h_nr_running += idle_task_delta; +#ifdef CONFIG_QOS_SCHED_SMT_EXPELLER + qcfs_rq->qos_idle_h_nr_running += qos_idle_delta; +#endif /* end evaluation on encountering a throttled cfs_rq */ if (cfs_rq_throttled(qcfs_rq)) @@ -5616,6 +5638,9 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) qcfs_rq->h_nr_running += task_delta; qcfs_rq->idle_h_nr_running += idle_task_delta; +#ifdef CONFIG_QOS_SCHED_SMT_EXPELLER + qcfs_rq->qos_idle_h_nr_running += qos_idle_delta; +#endif /* end evaluation on encountering a throttled cfs_rq */ if (cfs_rq_throttled(qcfs_rq)) @@ -6366,6 +6391,9 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) struct cfs_rq *cfs_rq; struct sched_entity *se = &p->se; int idle_h_nr_running = task_has_idle_policy(p); +#ifdef CONFIG_QOS_SCHED_SMT_EXPELLER + int qos_idle_h_nr_running = task_has_qos_idle_policy(p); +#endif int task_new = !(flags & ENQUEUE_WAKEUP); /* @@ -6392,6 +6420,9 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) cfs_rq->h_nr_running++; cfs_rq->idle_h_nr_running += idle_h_nr_running; +#ifdef CONFIG_QOS_SCHED_SMT_EXPELLER + cfs_rq->qos_idle_h_nr_running += qos_idle_h_nr_running; +#endif if (cfs_rq_is_idle(cfs_rq)) idle_h_nr_running = 1; @@ -6412,7 +6443,9 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) cfs_rq->h_nr_running++; cfs_rq->idle_h_nr_running += idle_h_nr_running; - +#ifdef CONFIG_QOS_SCHED_SMT_EXPELLER + cfs_rq->qos_idle_h_nr_running += qos_idle_h_nr_running; +#endif if (cfs_rq_is_idle(cfs_rq)) idle_h_nr_running = 1; @@ 
-6465,6 +6498,9 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) struct sched_entity *se = &p->se; int task_sleep = flags & DEQUEUE_SLEEP; int idle_h_nr_running = task_has_idle_policy(p); +#ifdef CONFIG_QOS_SCHED_SMT_EXPELLER + int qos_idle_h_nr_running = task_has_qos_idle_policy(p); +#endif bool was_sched_idle = sched_idle_rq(rq); util_est_dequeue(&rq->cfs, p); @@ -6475,6 +6511,9 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) cfs_rq->h_nr_running--; cfs_rq->idle_h_nr_running -= idle_h_nr_running; +#ifdef CONFIG_QOS_SCHED_SMT_EXPELLER + cfs_rq->qos_idle_h_nr_running -= qos_idle_h_nr_running; +#endif if (cfs_rq_is_idle(cfs_rq)) idle_h_nr_running = 1; @@ -6507,7 +6546,9 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) cfs_rq->h_nr_running--; cfs_rq->idle_h_nr_running -= idle_h_nr_running; - +#ifdef CONFIG_QOS_SCHED_SMT_EXPELLER + cfs_rq->qos_idle_h_nr_running -= qos_idle_h_nr_running; +#endif if (cfs_rq_is_idle(cfs_rq)) idle_h_nr_running = 1; @@ -8276,7 +8317,9 @@ static void throttle_qos_cfs_rq(struct cfs_rq *cfs_rq) struct rq *rq = rq_of(cfs_rq); struct sched_entity *se; long task_delta, idle_task_delta; - +#ifdef CONFIG_QOS_SCHED_SMT_EXPELLER + long qos_idle_delta; +#endif se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))]; /* freeze hierarchy runnable averages while throttled */ @@ -8286,6 +8329,9 @@ static void throttle_qos_cfs_rq(struct cfs_rq *cfs_rq) task_delta = cfs_rq->h_nr_running; idle_task_delta = cfs_rq->idle_h_nr_running; +#ifdef CONFIG_QOS_SCHED_SMT_EXPELLER + qos_idle_delta = cfs_rq->qos_idle_h_nr_running; +#endif for_each_sched_entity(se) { struct cfs_rq *qcfs_rq = cfs_rq_of(se); /* throttled entity or throttle-on-deactivate */ @@ -8296,6 +8342,9 @@ static void throttle_qos_cfs_rq(struct cfs_rq *cfs_rq) qcfs_rq->h_nr_running -= task_delta; qcfs_rq->idle_h_nr_running -= idle_task_delta; +#ifdef CONFIG_QOS_SCHED_SMT_EXPELLER + qcfs_rq->qos_idle_h_nr_running -= 
qos_idle_delta; +#endif if (qcfs_rq->load.weight) { /* Avoid re-evaluating load for this entity: */ @@ -8318,6 +8367,9 @@ static void throttle_qos_cfs_rq(struct cfs_rq *cfs_rq) qcfs_rq->h_nr_running -= task_delta; qcfs_rq->idle_h_nr_running -= idle_task_delta; +#ifdef CONFIG_QOS_SCHED_SMT_EXPELLER + qcfs_rq->qos_idle_h_nr_running -= qos_idle_delta; +#endif } /* At this point se is NULL and we are at root level*/ @@ -8338,6 +8390,9 @@ static void unthrottle_qos_cfs_rq(struct cfs_rq *cfs_rq) struct rq *rq = rq_of(cfs_rq); struct sched_entity *se; long task_delta, idle_task_delta; +#ifdef CONFIG_QOS_SCHED_SMT_EXPELLER + long qos_idle_delta; +#endif se = cfs_rq->tg->se[cpu_of(rq)]; @@ -8370,6 +8425,9 @@ static void unthrottle_qos_cfs_rq(struct cfs_rq *cfs_rq) task_delta = cfs_rq->h_nr_running; idle_task_delta = cfs_rq->idle_h_nr_running; +#ifdef CONFIG_QOS_SCHED_SMT_EXPELLER + qos_idle_delta = cfs_rq->qos_idle_h_nr_running; +#endif for_each_sched_entity(se) { if (se->on_rq) break; @@ -8379,6 +8437,9 @@ static void unthrottle_qos_cfs_rq(struct cfs_rq *cfs_rq) cfs_rq->h_nr_running += task_delta; cfs_rq->idle_h_nr_running += idle_task_delta; +#ifdef CONFIG_QOS_SCHED_SMT_EXPELLER + cfs_rq->qos_idle_h_nr_running += qos_idle_delta; +#endif if (cfs_rq_throttled(cfs_rq)) goto unthrottle_throttle; @@ -8392,6 +8453,9 @@ static void unthrottle_qos_cfs_rq(struct cfs_rq *cfs_rq) cfs_rq->h_nr_running += task_delta; cfs_rq->idle_h_nr_running += idle_task_delta; +#ifdef CONFIG_QOS_SCHED_SMT_EXPELLER + cfs_rq->qos_idle_h_nr_running += qos_idle_delta; +#endif /* end evaluation on encountering a throttled cfs_rq */ if (cfs_rq_throttled(cfs_rq)) @@ -8550,6 +8614,14 @@ static bool qos_smt_check_siblings_status(int this_cpu) return false; } +static bool qos_sched_idle_cpu(int this_cpu) +{ + struct rq *rq = cpu_rq(this_cpu); + + return unlikely(rq->nr_running == rq->cfs.qos_idle_h_nr_running && + rq->nr_running); +} + static bool qos_smt_expelled(int this_cpu) { /* @@ -8557,7 +8629,7 @@ 
static bool qos_smt_expelled(int this_cpu) * offline tasks enqueued, there is not suitable task, * so pick_next_task_fair return null. */ - if (qos_smt_check_siblings_status(this_cpu) && sched_idle_cpu(this_cpu)) + if (qos_smt_check_siblings_status(this_cpu) && qos_sched_idle_cpu(this_cpu)) return true; return false; @@ -8637,7 +8709,7 @@ static bool _qos_smt_check_need_resched(int this_cpu, struct rq *rq) } if (per_cpu(qos_smt_status, cpu) == QOS_LEVEL_OFFLINE && - rq->curr == rq->idle && sched_idle_cpu(this_cpu)) { + rq->curr == rq->idle && qos_sched_idle_cpu(this_cpu)) { trace_sched_qos_smt_expel(cpu_curr(cpu), per_cpu(qos_smt_status, cpu)); return true; } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 19069d2b050d..0d981063bf48 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -664,6 +664,14 @@ struct cfs_rq { #if defined(CONFIG_QOS_SCHED) struct list_head qos_throttled_list; #endif + +#ifdef CONFIG_QOS_SCHED_SMT_EXPELLER + union { + unsigned int qos_idle_h_nr_running; /* qos_level:-1 */ + unsigned long qos_idle_h_nr_running_padding; + }; +#endif + }; static inline int rt_bandwidth_enabled(void) @@ -3256,6 +3264,20 @@ static inline bool is_per_cpu_kthread(struct task_struct *p) } #endif +#ifdef CONFIG_QOS_SCHED +static inline int qos_idle_policy(int policy) +{ + return policy == QOS_LEVEL_OFFLINE; +} +#endif + +#ifdef CONFIG_QOS_SCHED_SMT_EXPELLER +static inline int task_has_qos_idle_policy(struct task_struct *p) +{ + return qos_idle_policy(task_group(p)->qos_level) && p->policy == SCHED_IDLE; +} +#endif + extern void swake_up_all_locked(struct swait_queue_head *q); extern void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait); -- Gitee From d8e7cc940fff12ba8a931febbb4b6d0eb2b0cc58 Mon Sep 17 00:00:00 2001 From: Xia Fukun Date: Tue, 5 Sep 2023 10:58:33 +0000 Subject: [PATCH 7/8] sched/fair: Introduce QOS_SMT_EXPELL priority reversion mechanism hulk inclusion category: feature bugzilla: 
https://gitee.com/openeuler/kernel/issues/I7YRZR ------------------------------- Here is the typical case where priority inversion will be caused occasionally by SMT expelling: Assuming that there are two SMT cores, cA and cB, online tasks are running on cA while offline tasks run on cB. With SMT expelling, an online task will drive off offline tasks to occupy all SMT cores exclusively, which, in turn, will starve the offline task and prevent it from releasing the related resources that other tasks with higher priority need. Hence, this patch will introduce another mechanism to alleviate this situation. For all offline tasks, one metric profiling the maximum task expelling duration is set up, and the default value is 5 seconds. If such an offline task exists, all offline tasks will be allowed to run into one small sleep(msleep) loop in kernel before they go into usermode; and further, if the two SMT cores (such as cA and cB) are idle or don't get any online tasks to run, these offline tasks will continue to run in usermode for the next schedule. Adapted to 6.4 kernel. 
Signed-off-by: Guan Jing Signed-off-by: Xia Fukun --- kernel/sched/fair.c | 46 +++++++++++++++++++++++++++++++++------------ 1 file changed, 34 insertions(+), 12 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index cf439444b073..4d174f21ceb8 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -192,6 +192,7 @@ static DEFINE_PER_CPU(int, qos_cpu_overload); unsigned int sysctl_overload_detect_period = 5000; /* in ms */ unsigned int sysctl_offline_wait_interval = 100; /* in ms */ static int unthrottle_qos_cfs_rqs(int cpu); +static bool qos_smt_expelled(int this_cpu); #endif #ifdef CONFIG_QOS_SCHED_SMT_EXPELLER @@ -8304,6 +8305,15 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ } #ifdef CONFIG_QOS_SCHED +static inline bool qos_timer_is_activated(int cpu) +{ + return hrtimer_active(per_cpu_ptr(&qos_overload_timer, cpu)); +} + +static inline void cancel_qos_timer(int cpu) +{ + hrtimer_cancel(per_cpu_ptr(&qos_overload_timer, cpu)); +} static inline bool is_offline_task(struct task_struct *p) { @@ -8376,7 +8386,7 @@ static void throttle_qos_cfs_rq(struct cfs_rq *cfs_rq) sub_nr_running(rq, task_delta); done: - if (list_empty(&per_cpu(qos_throttled_cfs_rq, cpu_of(rq)))) + if (!qos_timer_is_activated(cpu_of(rq))) start_qos_hrtimer(cpu_of(rq)); cfs_rq->throttled = QOS_THROTTLED; @@ -8467,10 +8477,6 @@ static void unthrottle_qos_cfs_rq(struct cfs_rq *cfs_rq) unthrottle_throttle: assert_list_leaf_cfs_rq(rq); - - /* Determine whether we need to wake up potentially idle CPU: */ - if (rq->curr == rq->idle && rq->cfs.nr_running) - resched_curr(rq); } static int __unthrottle_qos_cfs_rqs(int cpu) @@ -8492,11 +8498,10 @@ static int __unthrottle_qos_cfs_rqs(int cpu) static int unthrottle_qos_cfs_rqs(int cpu) { int res; - res = __unthrottle_qos_cfs_rqs(cpu); - if (res) - hrtimer_cancel(&(per_cpu(qos_overload_timer, cpu))); + if (qos_timer_is_activated(cpu) && !qos_smt_expelled(cpu)) + cancel_qos_timer(cpu); return res; } 
@@ -8552,8 +8557,13 @@ static enum hrtimer_restart qos_overload_timer_handler(struct hrtimer *timer) struct rq *rq = this_rq(); rq_lock_irqsave(rq, &rf); - if (__unthrottle_qos_cfs_rqs(smp_processor_id())) - __this_cpu_write(qos_cpu_overload, 1); + __unthrottle_qos_cfs_rqs(smp_processor_id()); + __this_cpu_write(qos_cpu_overload, 1); + + /* Determine whether we need to wake up potentially idle CPU. */ + if (rq->curr == rq->idle && rq->cfs.nr_running) + resched_curr(rq); + rq_unlock_irqrestore(rq, &rf); return HRTIMER_NORESTART; @@ -8593,6 +8603,13 @@ static void qos_schedule_throttle(struct task_struct *p) } } +#ifndef CONFIG_QOS_SCHED_SMT_EXPELLER +static bool qos_smt_expelled(int this_cpu) +{ + return false; +} +#endif + #endif #ifdef CONFIG_QOS_SCHED_SMT_EXPELLER @@ -8779,8 +8796,12 @@ pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf again: #ifdef CONFIG_QOS_SCHED_SMT_EXPELLER - if (qos_smt_expelled(this_cpu)) { + if (qos_smt_expelled(this_cpu) && !__this_cpu_read(qos_cpu_overload)) { __this_cpu_write(qos_smt_status, QOS_LEVEL_OFFLINE); + + if (!qos_timer_is_activated(this_cpu)) + start_qos_hrtimer(this_cpu); + schedstat_inc(rq->curr->stats.nr_qos_smt_expelled); trace_sched_qos_smt_expelled(rq->curr, per_cpu(qos_smt_status, this_cpu)); return NULL; @@ -8971,7 +8992,8 @@ done: __maybe_unused; goto again; } - __this_cpu_write(qos_cpu_overload, 0); + if (!qos_smt_expelled(cpu_of(rq))) + __this_cpu_write(qos_cpu_overload, 0); #endif /* * rq is about to be idle, check if we need to update the -- Gitee From 16c0701b6220c27793bc922c80312c9cf23072b3 Mon Sep 17 00:00:00 2001 From: Xia Fukun Date: Tue, 5 Sep 2023 10:58:34 +0000 Subject: [PATCH 8/8] sched/fair: Add cmdline nosmtexpell hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I6SIY2 ------------------------------- Add cmdline nosmtexpell to disable qos_smt_expell when we want to close. 
Signed-off-by: Guan Jing Signed-off-by: Xia Fukun --- kernel/sched/fair.c | 28 +++++++++++++++++++++++++++- 1 file changed, 27 insertions(+), 1 deletion(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 4d174f21ceb8..c5a6f72212d9 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -8613,6 +8613,15 @@ static bool qos_smt_expelled(int this_cpu) #endif #ifdef CONFIG_QOS_SCHED_SMT_EXPELLER +DEFINE_STATIC_KEY_TRUE(qos_smt_expell_switch); + +static int __init qos_sched_smt_noexpell_setup(char *__unused) +{ + static_branch_disable(&qos_smt_expell_switch); + return 1; +} +__setup("nosmtexpell", qos_sched_smt_noexpell_setup); + static bool qos_smt_check_siblings_status(int this_cpu) { int cpu; @@ -8641,6 +8650,9 @@ static bool qos_sched_idle_cpu(int this_cpu) static bool qos_smt_expelled(int this_cpu) { + if (!static_branch_likely(&qos_smt_expell_switch)) + return false; + /* * The qos_smt_status of siblings cpu is online, and current cpu only has * offline tasks enqueued, there is not suitable task, @@ -8697,15 +8709,29 @@ static void qos_smt_send_ipi(int this_cpu) static void qos_smt_expel(int this_cpu, struct task_struct *p) { + if (!static_branch_likely(&qos_smt_expell_switch)) + return; + if (qos_smt_update_status(p)) qos_smt_send_ipi(this_cpu); } +static inline bool qos_smt_enabled(void) +{ + if (!static_branch_likely(&qos_smt_expell_switch)) + return false; + + if (!sched_smt_active()) + return false; + + return true; +} + static bool _qos_smt_check_need_resched(int this_cpu, struct rq *rq) { int cpu; - if (!sched_smt_active()) + if (!qos_smt_enabled()) return false; for_each_cpu(cpu, cpu_smt_mask(this_cpu)) { -- Gitee