From bac93bf18e45157397c6586727a80482e30c49be Mon Sep 17 00:00:00 2001
From: Josh Don
Date: Thu, 19 Aug 2021 18:04:01 -0700
Subject: [PATCH 1/3] sched: Account number of SCHED_IDLE entities on each cfs_rq

ANBZ: #6587

commit a480addecc0d89c200ec0b41da62ae8ceddca8d7 upstream.

Adds cfs_rq->idle_nr_running, which accounts the number of idle entities
directly enqueued on the cfs_rq.

Signed-off-by: Josh Don
Signed-off-by: Peter Zijlstra (Intel)
Reviewed-by: Vincent Guittot
Link: https://lore.kernel.org/r/20210820010403.946838-3-joshdon@google.com
Signed-off-by: Tianchen Ding
---
 kernel/sched/debug.c |  2 ++
 kernel/sched/fair.c  | 25 ++++++++++++++++++++++++-
 kernel/sched/sched.h |  1 +
 3 files changed, 27 insertions(+), 1 deletion(-)

diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 9d30ea76022e..6e0606bdcd53 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -596,6 +596,8 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
 			cfs_rq->nr_spread_over);
 	SEQ_printf(m, "  .%-30s: %d\n", "nr_running", cfs_rq->nr_running);
 	SEQ_printf(m, "  .%-30s: %d\n", "h_nr_running", cfs_rq->h_nr_running);
+	SEQ_printf(m, "  .%-30s: %d\n", "idle_nr_running",
+			cfs_rq->idle_nr_running);
 	SEQ_printf(m, "  .%-30s: %d\n", "idle_h_nr_running",
 			cfs_rq->idle_h_nr_running);
 #ifdef CONFIG_GROUP_IDENTITY
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 719c6e071d25..b0aa0c2f9e46 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4757,6 +4757,8 @@ account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
 	}
 #endif
 	cfs_rq->nr_running++;
+	if (se_is_idle(se))
+		cfs_rq->idle_nr_running++;
 }
 
 static void
@@ -4774,6 +4776,8 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
 	}
 #endif
 	cfs_rq->nr_running--;
+	if (se_is_idle(se))
+		cfs_rq->idle_nr_running--;
 }
 
 /*
@@ -7444,6 +7448,17 @@ static int sched_idle_rq(struct rq *rq)
 			rq->nr_running);
 }
 
+/*
+ * Returns true if cfs_rq only has SCHED_IDLE entities enqueued. Note the use
+ * of idle_nr_running, which does not consider idle descendants of normal
+ * entities.
+ */
+static bool sched_idle_cfs_rq(struct cfs_rq *cfs_rq)
+{
+	return cfs_rq->nr_running &&
+		cfs_rq->nr_running == cfs_rq->idle_nr_running;
+}
+
 #ifdef CONFIG_SMP
 static int sched_idle_cpu(int cpu)
 {
@@ -13624,7 +13639,7 @@ int sched_group_set_idle(struct task_group *tg, long idle)
 	for_each_possible_cpu(i) {
 		struct rq *rq = cpu_rq(i);
 		struct sched_entity *se = tg->se[i];
-		struct cfs_rq *grp_cfs_rq = tg->cfs_rq[i];
+		struct cfs_rq *parent_cfs_rq, *grp_cfs_rq = tg->cfs_rq[i];
 		bool was_idle = cfs_rq_is_idle(grp_cfs_rq);
 		long idle_task_delta;
 		struct rq_flags rf;
@@ -13635,6 +13650,14 @@ int sched_group_set_idle(struct task_group *tg, long idle)
 		if (WARN_ON_ONCE(was_idle == cfs_rq_is_idle(grp_cfs_rq)))
 			goto next_cpu;
 
+		if (se->on_rq) {
+			parent_cfs_rq = cfs_rq_of(se);
+			if (cfs_rq_is_idle(grp_cfs_rq))
+				parent_cfs_rq->idle_nr_running++;
+			else
+				parent_cfs_rq->idle_nr_running--;
+		}
+
 		idle_task_delta = grp_cfs_rq->h_nr_running -
 				  grp_cfs_rq->idle_h_nr_running;
 		if (!cfs_rq_is_idle(grp_cfs_rq))
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 448c3fd61680..e11c212b48c6 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -663,6 +663,7 @@ struct cfs_rq {
 	struct load_weight	load;
 	unsigned int		nr_running;
 	unsigned int		h_nr_running;      /* SCHED_{NORMAL,BATCH,IDLE} */
+	unsigned int		idle_nr_running;   /* SCHED_IDLE */
 	unsigned int		idle_h_nr_running; /* SCHED_IDLE */
 
 	u64			exec_clock;
-- 
Gitee
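The accounting introduced above is compact enough to model outside the kernel.
The standalone C sketch below mirrors the rule the patch adds: idle_nr_running
counts only entities enqueued directly on the cfs_rq, and a cfs_rq counts as
"all idle" only when it is non-empty and every directly enqueued entity is
idle. The toy_* structure and helpers are illustrative stand-ins, not kernel
APIs.

#include <stdbool.h>
#include <stdio.h>

/* Simplified stand-in for the two counters this patch pairs up. */
struct toy_cfs_rq {
	unsigned int nr_running;      /* all directly enqueued entities */
	unsigned int idle_nr_running; /* directly enqueued SCHED_IDLE entities */
};

static void toy_account_enqueue(struct toy_cfs_rq *cfs_rq, bool is_idle)
{
	cfs_rq->nr_running++;
	if (is_idle)
		cfs_rq->idle_nr_running++;
}

static void toy_account_dequeue(struct toy_cfs_rq *cfs_rq, bool is_idle)
{
	cfs_rq->nr_running--;
	if (is_idle)
		cfs_rq->idle_nr_running--;
}

/* Mirrors sched_idle_cfs_rq(): an empty cfs_rq does not count as idle. */
static bool toy_sched_idle_cfs_rq(const struct toy_cfs_rq *cfs_rq)
{
	return cfs_rq->nr_running &&
	       cfs_rq->nr_running == cfs_rq->idle_nr_running;
}

int main(void)
{
	struct toy_cfs_rq rq = { 0, 0 };

	toy_account_enqueue(&rq, true);   /* one SCHED_IDLE entity */
	printf("only idle queued:   %d\n", toy_sched_idle_cfs_rq(&rq)); /* 1 */

	toy_account_enqueue(&rq, false);  /* a normal entity joins */
	printf("normal also queued: %d\n", toy_sched_idle_cfs_rq(&rq)); /* 0 */

	toy_account_dequeue(&rq, false);
	toy_account_dequeue(&rq, true);
	printf("empty cfs_rq:       %d\n", toy_sched_idle_cfs_rq(&rq)); /* 0 */
	return 0;
}

Unlike idle_h_nr_running, which counts SCHED_IDLE tasks anywhere in the
hierarchy below a cfs_rq, the new counter deliberately looks only at direct
children, which is exactly what the sched_idle_cfs_rq() check needs.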
From 6aa027538fd5860c48fb9ea6bf4692c66135a674 Mon Sep 17 00:00:00 2001
From: Josh Don
Date: Thu, 19 Aug 2021 18:04:02 -0700
Subject: [PATCH 2/3] sched: reduce sched slice for SCHED_IDLE entities

ANBZ: #6587

commit 51ce83ed523b00d58f2937ec014b12daaad55185 upstream.

Use a small, non-scaled min granularity for SCHED_IDLE entities, when
competing with normal entities. This reduces the latency of getting a
normal entity back on cpu, at the expense of increased context switch
frequency of SCHED_IDLE entities.

The benefit of this change is to reduce the round-robin latency for
normal entities when competing with a SCHED_IDLE entity.

Example: on a machine with HZ=1000, spawned two threads, one of which is
SCHED_IDLE, and affined to one cpu. Without this patch, the SCHED_IDLE
thread runs for 4ms then waits for 1.4s. With this patch, it runs for
1ms and waits 340ms (as it round-robins with the other thread).

Signed-off-by: Josh Don
Signed-off-by: Peter Zijlstra (Intel)
Reviewed-by: Vincent Guittot
Link: https://lore.kernel.org/r/20210820010403.946838-4-joshdon@google.com
[dtcccc: adapt "idle_min_granularity_ns" to sysctl instead of debugfs
 because kernel 5.10 has not moved them yet. I set the min value of this
 config to ZERO to suppress BE tasks better.]
Signed-off-by: Tianchen Ding
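The core of the change, visible in the sched_slice() hunk further down, is
which floor gets applied to an entity's computed slice when the BASE_SLICE
scheduler feature is enabled: a SCHED_IDLE entity that shares a cfs_rq with
normal entities is clamped to the small, non-scaled idle minimum granularity,
while everything else keeps the regular minimum granularity. The standalone C
sketch below models just that selection; the constants and the slice_floor()
helper are illustrative, not kernel code, and the 3 ms value stands in for a
CPU-count-scaled sysctl_sched_min_granularity on a larger machine.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Illustrative values: the normal floor is scaled up on bigger machines,
 * the idle floor is not (0.75 ms default in this series). */
#define TOY_SCHED_MIN_GRANULARITY	3000000ULL /* 3 ms   */
#define TOY_SCHED_IDLE_MIN_GRANULARITY	 750000ULL /* 0.75 ms */

/* Floor applied to a raw weight-proportional slice (BASE_SLICE branch). */
static uint64_t slice_floor(uint64_t raw_slice, bool se_is_idle,
			    bool cfs_rq_all_idle)
{
	uint64_t min_gran;

	if (se_is_idle && !cfs_rq_all_idle)
		min_gran = TOY_SCHED_IDLE_MIN_GRANULARITY;
	else
		min_gran = TOY_SCHED_MIN_GRANULARITY;

	return raw_slice > min_gran ? raw_slice : min_gran;
}

int main(void)
{
	uint64_t raw = 10000; /* a SCHED_IDLE entity's tiny computed slice */

	/* Competing with normal tasks: only the small idle floor applies. */
	printf("idle vs normal: %llu ns\n",
	       (unsigned long long)slice_floor(raw, true, false));

	/* An all-idle cfs_rq keeps the regular floor for fairness. */
	printf("idle vs idle:   %llu ns\n",
	       (unsigned long long)slice_floor(raw, true, true));
	return 0;
}

The shorter slice for the idle entity is what lets a competing normal entity
get back on the cpu sooner, at the cost of the SCHED_IDLE entity being
preempted more often, as described above.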
---
 include/linux/sched/sysctl.h |  1 +
 kernel/sched/debug.c         |  1 +
 kernel/sched/fair.c          | 29 ++++++++++++++++++++++++-----
 kernel/sysctl.c              |  9 +++++++++
 4 files changed, 35 insertions(+), 5 deletions(-)

diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h
index f2139e3d7e47..4034ca45fcf0 100644
--- a/include/linux/sched/sysctl.h
+++ b/include/linux/sched/sysctl.h
@@ -28,6 +28,7 @@ enum { sysctl_hung_task_timeout_secs = 0 };
 
 extern unsigned int sysctl_sched_latency;
 extern unsigned int sysctl_sched_min_granularity;
+extern unsigned int sysctl_sched_idle_min_granularity;
 extern unsigned int sysctl_sched_wakeup_granularity;
 extern unsigned int sysctl_sched_child_runs_first;
 
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 6e0606bdcd53..5a5330bfb34a 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -820,6 +820,7 @@ static void sched_debug_header(struct seq_file *m)
 	SEQ_printf(m, "  .%-40s: %Ld.%06ld\n", #x, SPLIT_NS(x))
 	PN(sysctl_sched_latency);
 	PN(sysctl_sched_min_granularity);
+	PN(sysctl_sched_idle_min_granularity);
 	PN(sysctl_sched_wakeup_granularity);
 	P(sysctl_sched_child_runs_first);
 	P(sysctl_sched_features);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index b0aa0c2f9e46..8379550fbca9 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -59,6 +59,14 @@ enum sched_tunable_scaling sysctl_sched_tunable_scaling = SCHED_TUNABLESCALING_L
 unsigned int sysctl_sched_min_granularity			= 750000ULL;
 static unsigned int normalized_sysctl_sched_min_granularity	= 750000ULL;
 
+/*
+ * Minimal preemption granularity for CPU-bound SCHED_IDLE tasks.
+ * Applies only when SCHED_IDLE tasks compete with normal tasks.
+ *
+ * (default: 0.75 msec)
+ */
+unsigned int sysctl_sched_idle_min_granularity			= 750000ULL;
+
 /*
  * This value is kept at sysctl_sched_latency/sysctl_sched_min_granularity
  */
@@ -2381,6 +2389,8 @@ static u64 __sched_period(unsigned long nr_running)
 		return sysctl_sched_latency;
 }
 
+static bool sched_idle_cfs_rq(struct cfs_rq *cfs_rq);
+
 /*
  * We calculate the wall-time slice from the period by taking a part
  * proportional to the weight.
@@ -2390,6 +2400,8 @@ static u64 __sched_period(unsigned long nr_running)
 static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
 	unsigned int nr_running = cfs_rq->nr_running;
+	struct sched_entity *init_se = se;
+	unsigned int min_gran;
 	u64 slice;
 
 	if (sched_feat(ALT_PERIOD))
@@ -2400,12 +2412,13 @@ static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
 	for_each_sched_entity(se) {
 		struct load_weight *load;
 		struct load_weight lw;
+		struct cfs_rq *qcfs_rq;
 
-		cfs_rq = cfs_rq_of(se);
-		load = &cfs_rq->load;
+		qcfs_rq = cfs_rq_of(se);
+		load = &qcfs_rq->load;
 
 		if (unlikely(!se->on_rq)) {
-			lw = cfs_rq->load;
+			lw = qcfs_rq->load;
 
 			update_load_add(&lw, se->load.weight);
 			load = &lw;
@@ -2413,8 +2426,14 @@ static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
 		slice = __calc_delta(slice, se->load.weight, load);
 	}
 
-	if (sched_feat(BASE_SLICE))
-		slice = max(slice, (u64)sysctl_sched_min_granularity);
+	if (sched_feat(BASE_SLICE)) {
+		if (se_is_idle(init_se) && !sched_idle_cfs_rq(cfs_rq))
+			min_gran = sysctl_sched_idle_min_granularity;
+		else
+			min_gran = sysctl_sched_min_granularity;
+
+		slice = max_t(u64, slice, min_gran);
+	}
 
 	return slice;
 }
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 1ad829a6beb1..cf8fe3b39b94 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1778,6 +1778,15 @@ static struct ctl_table kern_table[] = {
 		.extra1		= &min_sched_granularity_ns,
 		.extra2		= &max_sched_granularity_ns,
 	},
+	{
+		.procname	= "sched_idle_min_granularity_ns",
+		.data		= &sysctl_sched_idle_min_granularity,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= sched_proc_update_handler,
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= &max_sched_granularity_ns,
+	},
 	{
 		.procname	= "sched_latency_ns",
 		.data		= &sysctl_sched_latency,
-- 
Gitee
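Because the backport registers the knob in kern_table, it shows up as
/proc/sys/kernel/sched_idle_min_granularity_ns on kernels carrying this
series (newer upstream kernels expose the equivalent value under debugfs
instead). A minimal user-space check, assuming such a kernel, could look
like this:

#include <stdio.h>

int main(void)
{
	const char *path = "/proc/sys/kernel/sched_idle_min_granularity_ns";
	unsigned int gran_ns;
	FILE *f = fopen(path, "r");

	if (!f || fscanf(f, "%u", &gran_ns) != 1) {
		perror(path);
		if (f)
			fclose(f);
		return 1;
	}
	fclose(f);

	printf("SCHED_IDLE min granularity: %u ns\n", gran_ns);
	return 0;
}

Since .extra1 is SYSCTL_ZERO rather than min_sched_granularity_ns, an
administrator may write 0 to this file, which is the "suppress BE tasks"
behavior mentioned in the backport note above.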
From 4d3ac5d03b02050afad1e5df79b3c3035cc9537f Mon Sep 17 00:00:00 2001
From: Josh Don
Date: Thu, 19 Aug 2021 18:04:03 -0700
Subject: [PATCH 3/3] sched: adjust sleeper credit for SCHED_IDLE entities

ANBZ: #6587

commit 2cae3948edd488ebdef4deaf1d1043f92f47e665 upstream.

Give reduced sleeper credit to SCHED_IDLE entities. As a result, woken
SCHED_IDLE entities will take longer to preempt normal entities.

The benefit of this change is to make it less likely that a newly woken
SCHED_IDLE entity will preempt a short-running normal entity before it
blocks.

We still give a small sleeper credit to SCHED_IDLE entities, so that
idle<->idle competition retains some fairness.

Example: With HZ=1000, spawned four threads affined to one cpu, one of
which was set to SCHED_IDLE. Without this patch, wakeup latency for the
SCHED_IDLE thread was ~1-2ms, with the patch the wakeup latency was ~5ms.

Signed-off-by: Josh Don
Signed-off-by: Peter Zijlstra (Intel)
Reviewed-by: Vincent Guittot
Reviewed-by: Jiang Biao
Link: https://lore.kernel.org/r/20210820010403.946838-5-joshdon@google.com
Signed-off-by: Tianchen Ding
---
 kernel/sched/fair.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 8379550fbca9..4356310a3f99 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5993,7 +5993,12 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
 
 	/* sleeps up to a single latency don't count. */
 	if (!initial) {
-		unsigned long thresh = sysctl_sched_latency;
+		unsigned long thresh;
+
+		if (se_is_idle(se))
+			thresh = sysctl_sched_min_granularity;
+		else
+			thresh = sysctl_sched_latency;
 
 		/*
 		 * Halve their sleep time's effect, to allow
-- 
Gitee
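The wakeup preemption effect described in this last patch comes from the
sleeper credit that place_entity() grants: a woken entity's vruntime is
backdated by up to a threshold, and the patch shrinks that threshold for
SCHED_IDLE entities from the full scheduling latency to the minimum
granularity. The standalone sketch below models the resulting credit; the
constants are the usual defaults before CPU-count scaling, the halving mirrors
the GENTLE_FAIR_SLEEPERS behavior visible in the hunk's trailing context, and
sleeper_credit() is an illustrative helper, not a kernel function.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define TOY_SCHED_LATENCY		6000000ULL /* 6 ms default    */
#define TOY_SCHED_MIN_GRANULARITY	 750000ULL /* 0.75 ms default */

/* vruntime credit granted to a woken sleeper (threshold halved, as with
 * the GENTLE_FAIR_SLEEPERS feature). */
static uint64_t sleeper_credit(bool se_is_idle)
{
	uint64_t thresh;

	if (se_is_idle)
		thresh = TOY_SCHED_MIN_GRANULARITY;
	else
		thresh = TOY_SCHED_LATENCY;

	return thresh >> 1;
}

int main(void)
{
	printf("normal sleeper credit: %llu ns\n",
	       (unsigned long long)sleeper_credit(false)); /* 3000000 */
	printf("idle sleeper credit:   %llu ns\n",
	       (unsigned long long)sleeper_credit(true));  /* 375000 */
	return 0;
}

A smaller credit means the woken SCHED_IDLE entity is placed with a larger
vruntime relative to currently running tasks, so it is less likely to preempt
a short-running normal entity, while still receiving enough credit to compete
fairly with other idle entities.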