From f7818da0f7268010f8c80e68f202611a56a9067f Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Sun, 28 May 2023 18:59:30 +0800 Subject: [PATCH 1/8] sched/pelt: Continue to relax the sync of util_sum with util_avg mainline inclusion from mainline-v5.17-rc2 commit 7ceb77103001544a43e11d7f3a8a69a2c1f422cf category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I78WM8 Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?h=v6.4-rc3&id=7ceb77103001544a43e11d7f3a8a69a2c1f422cf -------------------------------- Rick reported performance regressions in bugzilla because of cpu frequency being lower than before: https://bugzilla.kernel.org/show_bug.cgi?id=215045 He bisected the problem to: commit 1c35b07e6d39 ("sched/fair: Ensure _sum and _avg values stay consistent") This commit forces util_sum to be synced with the new util_avg after removing the contribution of a task and before the next periodic sync. By doing so util_sum is rounded to its lower bound and might lose up to LOAD_AVG_MAX-1 of accumulated contribution which has not yet been reflected in util_avg. update_tg_cfs_util() is not the only place where we round util_sum and lose some accumulated contributions that are not yet reflected in util_avg. Modify update_tg_cfs_util() and detach_entity_load_avg() to not sync util_sum with the new util_avg. Instead of always setting util_sum to the low bound of util_avg, which can significantly lower the utilization, we propagate the difference. In addition, we also check that the cfs_rq's util_sum always stays above the lower bound for a given util_avg, as it has been observed that a sched_entity's util_sum is sometimes above the cfs_rq's. Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Dietmar Eggemann Tested-by: Sachin Sant Link: https://lkml.kernel.org/r/20220111134659.24961-3-vincent.guittot@linaro.org Signed-off-by: Guan Jing --- kernel/sched/fair.c | 25 ++++++++++++++++++------- 1 file changed, 18 insertions(+), 7 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 32ef2dab6708..4f962da19dcd 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3449,11 +3449,11 @@ void set_task_rq_fair(struct sched_entity *se, static inline void update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq) { - long delta = gcfs_rq->avg.util_avg - se->avg.util_avg; - u32 divider; + long delta_sum, delta_avg = gcfs_rq->avg.util_avg - se->avg.util_avg; + u32 new_sum, divider; /* Nothing to update */ - if (!delta) + if (!delta_avg) return; /* @@ -3462,13 +3462,20 @@ update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq */ divider = get_pelt_divider(&cfs_rq->avg); + /* Set new sched_entity's utilization */ se->avg.util_avg = gcfs_rq->avg.util_avg; - se->avg.util_sum = se->avg.util_avg * divider; + new_sum = se->avg.util_avg * divider; + delta_sum = (long)new_sum - (long)se->avg.util_sum; + se->avg.util_sum = new_sum; /* Update parent cfs_rq utilization */ - add_positive(&cfs_rq->avg.util_avg, delta); - cfs_rq->avg.util_sum = cfs_rq->avg.util_avg * divider; + add_positive(&cfs_rq->avg.util_avg, delta_avg); + add_positive(&cfs_rq->avg.util_sum, delta_sum); + + /* See update_cfs_rq_load_avg() */ + cfs_rq->avg.util_sum = max_t(u32, cfs_rq->avg.util_sum, + cfs_rq->avg.util_avg * PELT_MIN_DIVIDER); } static inline void @@ -3785,7 +3792,11 @@ static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s dequeue_load_avg(cfs_rq, se);
sub_positive(&cfs_rq->avg.util_avg, se->avg.util_avg); - cfs_rq->avg.util_sum = cfs_rq->avg.util_avg * divider; + sub_positive(&cfs_rq->avg.util_sum, se->avg.util_sum); + /* See update_cfs_rq_load_avg() */ + cfs_rq->avg.util_sum = max_t(u32, cfs_rq->avg.util_sum, + cfs_rq->avg.util_avg * PELT_MIN_DIVIDER); + sub_positive(&cfs_rq->avg.runnable_avg, se->avg.runnable_avg); cfs_rq->avg.runnable_sum = cfs_rq->avg.runnable_avg * divider; -- Gitee From 30123568226f87145bfa5cb4f083b01fb77994b3 Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Sun, 28 May 2023 18:59:31 +0800 Subject: [PATCH 2/8] sched/pelt: Relax the sync of runnable_sum with runnable_avg mainline inclusion from mainline-v5.17-rc2 commit 95246d1ec80b8d19d882cd8eb7ad094e63b41bb8 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I78WM8 Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?h=v6.4-rc3&id=95246d1ec80b8d19d882cd8eb7ad094e63b41bb8 -------------------------------- Similarly to util_avg and util_sum, don't sync runnable_sum with the low bound of runnable_avg but only ensure that runnable_sum stays in the correct range. Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Dietmar Eggemann Tested-by: Sachin Sant Link: https://lkml.kernel.org/r/20220111134659.24961-4-vincent.guittot@linaro.org Signed-off-by: Guan Jing --- kernel/sched/fair.c | 33 +++++++++++++++++++-------------- 1 file changed, 19 insertions(+), 14 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 4f962da19dcd..c10694caf56e 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3481,11 +3481,11 @@ update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq static inline void update_tg_cfs_runnable(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq) { - long delta = gcfs_rq->avg.runnable_avg - se->avg.runnable_avg; - u32 divider; + long delta_sum, delta_avg = gcfs_rq->avg.runnable_avg - se->avg.runnable_avg; + u32 new_sum, divider; /* Nothing to update */ - if (!delta) + if (!delta_avg) return; /* @@ -3496,11 +3496,16 @@ update_tg_cfs_runnable(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cf /* Set new sched_entity's runnable */ se->avg.runnable_avg = gcfs_rq->avg.runnable_avg; - se->avg.runnable_sum = se->avg.runnable_avg * divider; + new_sum = se->avg.runnable_avg * divider; + delta_sum = (long)new_sum - (long)se->avg.runnable_sum; + se->avg.runnable_sum = new_sum; /* Update parent cfs_rq runnable */ - add_positive(&cfs_rq->avg.runnable_avg, delta); - cfs_rq->avg.runnable_sum = cfs_rq->avg.runnable_avg * divider; + add_positive(&cfs_rq->avg.runnable_avg, delta_avg); + add_positive(&cfs_rq->avg.runnable_sum, delta_sum); + /* See update_cfs_rq_load_avg() */ + cfs_rq->avg.runnable_sum = max_t(u32, cfs_rq->avg.runnable_sum, + cfs_rq->avg.runnable_avg * PELT_MIN_DIVIDER); } static inline void @@ -3697,7 +3702,10 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) r = removed_runnable; sub_positive(&sa->runnable_avg, r); - sa->runnable_sum = sa->runnable_avg * divider; + sub_positive(&sa->runnable_sum, r * divider); + /* See sa->util_sum above */ + sa->runnable_sum = max_t(u32, sa->runnable_sum, + sa->runnable_avg * PELT_MIN_DIVIDER); /* * removed_runnable is the unweighted version of removed_load so we @@ -3784,12 +3792,6 @@ static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s */ static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { - /* 
- * cfs_rq->avg.period_contrib can be used for both cfs_rq and se. - * See ___update_load_avg() for details. - */ - u32 divider = get_pelt_divider(&cfs_rq->avg); - dequeue_load_avg(cfs_rq, se); sub_positive(&cfs_rq->avg.util_avg, se->avg.util_avg); sub_positive(&cfs_rq->avg.util_sum, se->avg.util_sum); @@ -3798,7 +3800,10 @@ static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s cfs_rq->avg.util_avg * PELT_MIN_DIVIDER); sub_positive(&cfs_rq->avg.runnable_avg, se->avg.runnable_avg); - cfs_rq->avg.runnable_sum = cfs_rq->avg.runnable_avg * divider; + sub_positive(&cfs_rq->avg.runnable_sum, se->avg.runnable_sum); + /* See update_cfs_rq_load_avg() */ + cfs_rq->avg.runnable_sum = max_t(u32, cfs_rq->avg.runnable_sum, + cfs_rq->avg.runnable_avg * PELT_MIN_DIVIDER); add_tg_cfs_propagate(cfs_rq, -se->avg.load_sum); -- Gitee From 928668d209e4e9c62d2dc54ec21482ec822b756a Mon Sep 17 00:00:00 2001 From: Guan Jing Date: Sun, 28 May 2023 18:59:32 +0800 Subject: [PATCH 3/8] sched/pelt: Relax the sync of load_sum with load_avg mainline inclusion from mainline-v5.17-rc2 commit 2d02fa8cc21a93da35cfba462bf8ab87bf2db651 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I78WM8 Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?h=v6.4-rc3&id=2d02fa8cc21a93da35cfba462bf8ab87bf2db651 -------------------------------- Similarly to util_avg and util_sum, don't sync load_sum with the low bound of load_avg but only ensure that load_sum stays in the correct range. Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Dietmar Eggemann Tested-by: Sachin Sant Link: https://lkml.kernel.org/r/20220111134659.24961-5-vincent.guittot@linaro.org Signed-off-by: Guan Jing --- kernel/sched/fair.c | 30 +++++++++++++++++++++--------- 1 file changed, 21 insertions(+), 9 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index c10694caf56e..28308bf66825 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3084,6 +3084,9 @@ dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { sub_positive(&cfs_rq->avg.load_avg, se->avg.load_avg); sub_positive(&cfs_rq->avg.load_sum, se_weight(se) * se->avg.load_sum); + /* See update_cfs_rq_load_avg() */ + cfs_rq->avg.load_sum = max_t(u32, cfs_rq->avg.load_sum, + cfs_rq->avg.load_avg * PELT_MIN_DIVIDER); } #else static inline void @@ -3511,9 +3514,10 @@ update_tg_cfs_runnable(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cf static inline void update_tg_cfs_load(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq) { - long delta, running_sum, runnable_sum = gcfs_rq->prop_runnable_sum; + long delta_avg, running_sum, runnable_sum = gcfs_rq->prop_runnable_sum; unsigned long load_avg; u64 load_sum = 0; + s64 delta_sum; u32 divider; if (!runnable_sum) @@ -3540,7 +3544,7 @@ update_tg_cfs_load(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq * assuming all tasks are equally runnable. 
*/ if (scale_load_down(gcfs_rq->load.weight)) { - load_sum = div_s64(gcfs_rq->avg.load_sum, + load_sum = div_u64(gcfs_rq->avg.load_sum, scale_load_down(gcfs_rq->load.weight)); } @@ -3557,16 +3561,22 @@ update_tg_cfs_load(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq running_sum = se->avg.util_sum >> SCHED_CAPACITY_SHIFT; runnable_sum = max(runnable_sum, running_sum); - load_sum = (s64)se_weight(se) * runnable_sum; - load_avg = div_s64(load_sum, divider); + load_sum = se_weight(se) * runnable_sum; + load_avg = div_u64(load_sum, divider); + + delta_avg = load_avg - se->avg.load_avg; + if (!delta_avg) + return; - delta = load_avg - se->avg.load_avg; + delta_sum = load_sum - (s64)se_weight(se) * se->avg.load_sum; se->avg.load_sum = runnable_sum; se->avg.load_avg = load_avg; - - add_positive(&cfs_rq->avg.load_avg, delta); - cfs_rq->avg.load_sum = cfs_rq->avg.load_avg * divider; + add_positive(&cfs_rq->avg.load_avg, delta_avg); + add_positive(&cfs_rq->avg.load_sum, delta_sum); + /* See update_cfs_rq_load_avg() */ + cfs_rq->avg.load_sum = max_t(u32, cfs_rq->avg.load_sum, + cfs_rq->avg.load_avg * PELT_MIN_DIVIDER); } static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum) @@ -3682,7 +3692,9 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) r = removed_load; sub_positive(&sa->load_avg, r); - sa->load_sum = sa->load_avg * divider; + sub_positive(&sa->load_sum, r * divider); + /* See sa->util_sum below */ + sa->load_sum = max_t(u32, sa->load_sum, sa->load_avg * PELT_MIN_DIVIDER); r = removed_util; sub_positive(&sa->util_avg, r); -- Gitee From 8f6acf75c4f9fee72385e909f8af9f71c0497657 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Sun, 28 May 2023 18:59:33 +0800 Subject: [PATCH 4/8] sched/fair: Improve consistency of allowed NUMA balance calculations mainline inclusion from mainline-v5.18-rc1 commit 2cfb7a1b031b0e816af7a6ee0c6ab83b0acdf05a category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I78WM8 Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?h=v6.4-rc3&id=2cfb7a1b031b0e816af7a6ee0c6ab83b0acdf05a -------------------------------- There are inconsistencies when determining if a NUMA imbalance is allowed that should be corrected. o allow_numa_imbalance changes types and is not always examining the destination group so both the type should be corrected as well as the naming. o find_idlest_group uses the sched_domain's weight instead of the group weight which is different to find_busiest_group o find_busiest_group uses the source group instead of the destination which is different to task_numa_find_cpu o Both find_idlest_group and find_busiest_group should account for the number of running tasks if a move was allowed to be consistent with task_numa_find_cpu Fixes: 7d2b5dd0bcc4 ("sched/numa: Allow a floating imbalance between NUMA nodes") Signed-off-by: Mel Gorman Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Gautham R. Shenoy Link: https://lore.kernel.org/r/20220208094334.16379-2-mgorman@techsingularity.net Signed-off-by: Guan Jing --- kernel/sched/fair.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 28308bf66825..3b0cacfa0372 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -9993,9 +9993,10 @@ static bool update_pick_idlest(struct sched_group *idlest, * This is an approximation as the number of running tasks may not be * related to the number of busy CPUs due to sched_setaffinity. 
*/ -static inline bool allow_numa_imbalance(int dst_running, int dst_weight) +static inline bool +allow_numa_imbalance(unsigned int running, unsigned int weight) { - return (dst_running < (dst_weight >> 2)); + return (running < (weight >> 2)); } /* @@ -10134,12 +10135,13 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu) return idlest; #endif /* - * Otherwise, keep the task on this node to stay close - * its wakeup source and improve locality. If there is - * a real need of migration, periodic load balance will - * take care of it. + * Otherwise, keep the task close to the wakeup source + * and improve locality if the number of running tasks + * would remain below threshold where an imbalance is + * allowed. If there is a real need of migration, + * periodic load balance will take care of it. */ - if (allow_numa_imbalance(local_sgs.sum_nr_running, sd->span_weight)) + if (allow_numa_imbalance(local_sgs.sum_nr_running + 1, local_sgs.group_weight)) return NULL; } @@ -10433,7 +10435,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s /* Consider allowing a small imbalance between NUMA groups */ if (env->sd->flags & SD_NUMA) { env->imbalance = adjust_numa_imbalance(env->imbalance, - busiest->sum_nr_running, busiest->group_weight); + local->sum_nr_running + 1, local->group_weight); } return; -- Gitee From edd5e1effd36972e2b1910661daef5e4249fc3fd Mon Sep 17 00:00:00 2001 From: Guan Jing Date: Sun, 28 May 2023 18:59:34 +0800 Subject: [PATCH 5/8] sched/fair: Adjust the allowed NUMA imbalance when SD_NUMA spans multiple LLCs mainline inclusion from mainline-v5.18-rc1 commit e496132ebedd870b67f1f6d2428f9bb9d7ae27fd category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I78WM8 Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?h=v6.4-rc3&id=e496132ebedd870b67f1f6d2428f9bb9d7ae27fd -------------------------------- Commit 7d2b5dd0bcc4 ("sched/numa: Allow a floating imbalance between NUMA nodes") allowed an imbalance between NUMA nodes such that communicating tasks would not be pulled apart by the load balancer. This works fine when there is a 1:1 relationship between LLC and node but can be suboptimal for multiple LLCs if independent tasks prematurely use CPUs sharing cache. Zen* has multiple LLCs per node with local memory channels and due to the allowed imbalance, it's far harder to tune some workloads to run optimally than it is on hardware that has 1 LLC per node. This patch allows an imbalance to exist up to the point where LLCs should be balanced between nodes. On a Zen3 machine running STREAM parallelised with OMP to have one instance per LLC, the results without binding are 5.17.0-rc0 5.17.0-rc0 vanilla sched-numaimb-v6 MB/sec copy-16 162596.94 ( 0.00%) 580559.74 ( 257.05%) MB/sec scale-16 136901.28 ( 0.00%) 374450.52 ( 173.52%) MB/sec add-16 157300.70 ( 0.00%) 564113.76 ( 258.62%) MB/sec triad-16 151446.88 ( 0.00%) 564304.24 ( 272.61%) STREAM can use directives to force the spread if the OpenMP runtime is new enough, but that doesn't help if an application uses threads and it's not known in advance how many threads will be created. Coremark is a CPU- and cache-intensive benchmark parallelised with threads. When running with 1 thread per core, the vanilla kernel allows threads to contend on cache.
With the patch; 5.17.0-rc0 5.17.0-rc0 vanilla sched-numaimb-v5 Min Score-16 368239.36 ( 0.00%) 389816.06 ( 5.86%) Hmean Score-16 388607.33 ( 0.00%) 427877.08 * 10.11%* Max Score-16 408945.69 ( 0.00%) 481022.17 ( 17.62%) Stddev Score-16 15247.04 ( 0.00%) 24966.82 ( -63.75%) CoeffVar Score-16 3.92 ( 0.00%) 5.82 ( -48.48%) It can also make a big difference for semi-realistic workloads like specjbb which can execute arbitrary numbers of threads without advance knowledge of how they should be placed. Even in cases where the average performance is neutral, the results are more stable. 5.17.0-rc0 5.17.0-rc0 vanilla sched-numaimb-v6 Hmean tput-1 71631.55 ( 0.00%) 73065.57 ( 2.00%) Hmean tput-8 582758.78 ( 0.00%) 556777.23 ( -4.46%) Hmean tput-16 1020372.75 ( 0.00%) 1009995.26 ( -1.02%) Hmean tput-24 1416430.67 ( 0.00%) 1398700.11 ( -1.25%) Hmean tput-32 1687702.72 ( 0.00%) 1671357.04 ( -0.97%) Hmean tput-40 1798094.90 ( 0.00%) 2015616.46 * 12.10%* Hmean tput-48 1972731.77 ( 0.00%) 2333233.72 ( 18.27%) Hmean tput-56 2386872.38 ( 0.00%) 2759483.38 ( 15.61%) Hmean tput-64 2909475.33 ( 0.00%) 2925074.69 ( 0.54%) Hmean tput-72 2585071.36 ( 0.00%) 2962443.97 ( 14.60%) Hmean tput-80 2994387.24 ( 0.00%) 3015980.59 ( 0.72%) Hmean tput-88 3061408.57 ( 0.00%) 3010296.16 ( -1.67%) Hmean tput-96 3052394.82 ( 0.00%) 2784743.41 ( -8.77%) Hmean tput-104 2997814.76 ( 0.00%) 2758184.50 ( -7.99%) Hmean tput-112 2955353.29 ( 0.00%) 2859705.09 ( -3.24%) Hmean tput-120 2889770.71 ( 0.00%) 2764478.46 ( -4.34%) Hmean tput-128 2871713.84 ( 0.00%) 2750136.73 ( -4.23%) Stddev tput-1 5325.93 ( 0.00%) 2002.53 ( 62.40%) Stddev tput-8 6630.54 ( 0.00%) 10905.00 ( -64.47%) Stddev tput-16 25608.58 ( 0.00%) 6851.16 ( 73.25%) Stddev tput-24 12117.69 ( 0.00%) 4227.79 ( 65.11%) Stddev tput-32 27577.16 ( 0.00%) 8761.05 ( 68.23%) Stddev tput-40 59505.86 ( 0.00%) 2048.49 ( 96.56%) Stddev tput-48 168330.30 ( 0.00%) 93058.08 ( 44.72%) Stddev tput-56 219540.39 ( 0.00%) 30687.02 ( 86.02%) Stddev tput-64 121750.35 ( 0.00%) 9617.36 ( 92.10%) Stddev tput-72 223387.05 ( 0.00%) 34081.13 ( 84.74%) Stddev tput-80 128198.46 ( 0.00%) 22565.19 ( 82.40%) Stddev tput-88 136665.36 ( 0.00%) 27905.97 ( 79.58%) Stddev tput-96 111925.81 ( 0.00%) 99615.79 ( 11.00%) Stddev tput-104 146455.96 ( 0.00%) 28861.98 ( 80.29%) Stddev tput-112 88740.49 ( 0.00%) 58288.23 ( 34.32%) Stddev tput-120 186384.86 ( 0.00%) 45812.03 ( 75.42%) Stddev tput-128 78761.09 ( 0.00%) 57418.48 ( 27.10%) Similarly, for embarassingly parallel problems like NPB-ep, there are improvements due to better spreading across LLC when the machine is not fully utilised. vanilla sched-numaimb-v6 Min ep.D 31.79 ( 0.00%) 26.11 ( 17.87%) Amean ep.D 31.86 ( 0.00%) 26.17 * 17.86%* Stddev ep.D 0.07 ( 0.00%) 0.05 ( 24.41%) CoeffVar ep.D 0.22 ( 0.00%) 0.20 ( 7.97%) Max ep.D 31.93 ( 0.00%) 26.21 ( 17.91%) Signed-off-by: Mel Gorman Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Gautham R. 
Shenoy Tested-by: K Prateek Nayak Link: https://lore.kernel.org/r/20220208094334.16379-3-mgorman@techsingularity.net Signed-off-by: Guan Jing --- include/linux/sched/topology.h | 1 + kernel/sched/fair.c | 22 +++++++------- kernel/sched/topology.c | 53 ++++++++++++++++++++++++++++++++++ 3 files changed, 66 insertions(+), 10 deletions(-) diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h index 9680886277c9..9043256393e4 100644 --- a/include/linux/sched/topology.h +++ b/include/linux/sched/topology.h @@ -98,6 +98,7 @@ struct sched_domain { unsigned int busy_factor; /* less balancing by factor if busy */ unsigned int imbalance_pct; /* No balance until over watermark */ unsigned int cache_nice_tries; /* Leave cache hot tasks for # tries */ + unsigned int imb_numa_nr; /* Nr running tasks that allows a NUMA imbalance */ int nohz_idle; /* NOHZ IDLE status */ int flags; /* See SD_* */ diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 3b0cacfa0372..0b168eacd8bf 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1524,6 +1524,7 @@ struct task_numa_env { int src_cpu, src_nid; int dst_cpu, dst_nid; + int imb_numa_nr; struct numa_stats src_stats, dst_stats; @@ -1539,7 +1540,7 @@ static unsigned long cpu_load(struct rq *rq); static unsigned long cpu_runnable(struct rq *rq); static unsigned long cpu_util(int cpu); static inline long adjust_numa_imbalance(int imbalance, - int dst_running, int dst_weight); + int dst_running, int imb_numa_nr); static inline enum numa_type numa_classify(unsigned int imbalance_pct, @@ -1920,7 +1921,7 @@ static void task_numa_find_cpu(struct task_numa_env *env, dst_running = env->dst_stats.nr_running + 1; imbalance = max(0, dst_running - src_running); imbalance = adjust_numa_imbalance(imbalance, dst_running, - env->dst_stats.weight); + env->imb_numa_nr); /* Use idle CPU if there is no imbalance */ if (!imbalance) { @@ -1985,8 +1986,10 @@ static int task_numa_migrate(struct task_struct *p) */ rcu_read_lock(); sd = rcu_dereference(per_cpu(sd_numa, env.src_cpu)); - if (sd) + if (sd) { env.imbalance_pct = 100 + (sd->imbalance_pct - 100) / 2; + env.imb_numa_nr = sd->imb_numa_nr; + } rcu_read_unlock(); /* @@ -9993,10 +9996,9 @@ static bool update_pick_idlest(struct sched_group *idlest, * This is an approximation as the number of running tasks may not be * related to the number of busy CPUs due to sched_setaffinity. */ -static inline bool -allow_numa_imbalance(unsigned int running, unsigned int weight) +static inline bool allow_numa_imbalance(int running, int imb_numa_nr) { - return (running < (weight >> 2)); + return running <= imb_numa_nr; } /* @@ -10141,7 +10143,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu) * allowed. If there is a real need of migration, * periodic load balance will take care of it. 
*/ - if (allow_numa_imbalance(local_sgs.sum_nr_running + 1, local_sgs.group_weight)) + if (allow_numa_imbalance(local_sgs.sum_nr_running + 1, sd->imb_numa_nr)) return NULL; } @@ -10321,9 +10323,9 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd #define NUMA_IMBALANCE_MIN 2 static inline long adjust_numa_imbalance(int imbalance, - int dst_running, int dst_weight) + int dst_running, int imb_numa_nr) { - if (!allow_numa_imbalance(dst_running, dst_weight)) + if (!allow_numa_imbalance(dst_running, imb_numa_nr)) return imbalance; /* @@ -10435,7 +10437,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s /* Consider allowing a small imbalance between NUMA groups */ if (env->sd->flags & SD_NUMA) { env->imbalance = adjust_numa_imbalance(env->imbalance, - local->sum_nr_running + 1, local->group_weight); + local->sum_nr_running + 1, env->sd->imb_numa_nr); } return; diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 3208293d68fa..abdb60ca1c80 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -2343,6 +2343,59 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att } } + /* + * Calculate an allowed NUMA imbalance such that LLCs do not get + * imbalanced. + */ + for_each_cpu(i, cpu_map) { + unsigned int imb = 0; + unsigned int imb_span = 1; + + for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) { + struct sched_domain *child = sd->child; + + if (!(sd->flags & SD_SHARE_PKG_RESOURCES) && child && + (child->flags & SD_SHARE_PKG_RESOURCES)) { + struct sched_domain *top, *top_p; + unsigned int nr_llcs; + + /* + * For a single LLC per node, allow an + * imbalance up to 25% of the node. This is an + * arbitrary cutoff based on SMT-2 to balance + * between memory bandwidth and avoiding + * premature sharing of HT resources and SMT-4 + * or SMT-8 *may* benefit from a different + * cutoff. + * + * For multiple LLCs, allow an imbalance + * until multiple tasks would share an LLC + * on one node while LLCs on another node + * remain idle. + */ + nr_llcs = sd->span_weight / child->span_weight; + if (nr_llcs == 1) + imb = sd->span_weight >> 2; + else + imb = nr_llcs; + sd->imb_numa_nr = imb; + + /* Set span based on the first NUMA domain. */ + top = sd; + top_p = top->parent; + while (top_p && !(top_p->flags & SD_NUMA)) { + top = top->parent; + top_p = top->parent; + } + imb_span = top_p ? 
top_p->span_weight : sd->span_weight; + } else { + int factor = max(1U, (sd->span_weight / imb_span)); + + sd->imb_numa_nr = imb * factor; + } + } + } + /* Calculate CPU capacity for physical packages and nodes */ for (i = nr_cpumask_bits-1; i >= 0; i--) { if (!cpumask_test_cpu(i, cpu_map)) -- Gitee From 00d7e686a2501d837ecd0232a295952308d2acb0 Mon Sep 17 00:00:00 2001 From: Guan Jing Date: Sun, 28 May 2023 18:59:35 +0800 Subject: [PATCH 6/8] sched/fair: Fix KABI breakage in sched_domain hulk inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I78WM8 CVE: NA -------------------------------- Signed-off-by: Guan Jing --- include/linux/sched/topology.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h index 9043256393e4..62a1e22b057f 100644 --- a/include/linux/sched/topology.h +++ b/include/linux/sched/topology.h @@ -98,7 +98,6 @@ struct sched_domain { unsigned int busy_factor; /* less balancing by factor if busy */ unsigned int imbalance_pct; /* No balance until over watermark */ unsigned int cache_nice_tries; /* Leave cache hot tasks for # tries */ - unsigned int imb_numa_nr; /* Nr running tasks that allows a NUMA imbalance */ int nohz_idle; /* NOHZ IDLE status */ int flags; /* See SD_* */ @@ -154,8 +153,12 @@ struct sched_domain { struct rcu_head rcu; /* used during destruction */ }; struct sched_domain_shared *shared; - +#ifndef __GENKSYMS__ + unsigned int imb_numa_nr; /* Nr running tasks that allows a NUMA imbalance */ + KABI_FILL_HOLE(unsigned int kabi_hole) +#else KABI_RESERVE(1) +#endif KABI_RESERVE(2) unsigned int span_weight; -- Gitee From 588d8f44c65b01adf1a0cb270292ba8237955a6c Mon Sep 17 00:00:00 2001 From: Guan Jing Date: Sun, 28 May 2023 18:59:36 +0800 Subject: [PATCH 7/8] sched: Remove the limitation of WF_ON_CPU on wakelist if wakee cpu is idle mainline inclusion from mainline-v6.0-rc1 commit f3dd3f674555bd9455c5ae7fafce0696bd9931b3 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I78WM8 Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?h=v6.4-rc3&id=f3dd3f674555bd9455c5ae7fafce0696bd9931b3 -------------------------------- The wakelist can help avoid cache bouncing and offload overhead from the waker cpu. So far, using the wakelist within the same llc only happens on WF_ON_CPU, and this limitation can be removed to further improve wakeup performance. The commit 518cd6234178 ("sched: Only queue remote wakeups when crossing cache boundaries") disabled queuing tasks on the wakelist when the cpus share an llc. This is because, at that time, the scheduler had to send IPIs to do ttwu_queue_wakelist. Nowadays, ttwu_queue_wakelist also supports TIF_POLLING, so this is no longer a problem when the wakee cpu is idle polling. Benefits: Queuing the task on an idle cpu helps improve performance on the waker cpu and utilization on the wakee cpu, and further improves locality because the wakee cpu can handle its own rq. This patch helps improve response time on our real Java workloads where wakeups happen frequently. Consider the normal condition (CPU0 and CPU1 share the same llc) Before this patch: CPU0 CPU1 select_task_rq() idle rq_lock(CPU1->rq) enqueue_task(CPU1->rq) notify CPU1 (by sending IPI or CPU1 polling) resched() After this patch: CPU0 CPU1 select_task_rq() idle add to wakelist of CPU1 notify CPU1 (by sending IPI or CPU1 polling) rq_lock(CPU1->rq) enqueue_task(CPU1->rq) resched() We see CPU0 can finish its work earlier.
It only needs to put task to wakelist and return. While CPU1 is idle, so let itself handle its own runqueue data. This patch brings no difference about IPI. This patch only takes effect when the wakee cpu is: 1) idle polling 2) idle not polling For 1), there will be no IPI with or without this patch. For 2), there will always be an IPI before or after this patch. Before this patch: waker cpu will enqueue task and check preempt. Since "idle" will be sure to be preempted, waker cpu must send a resched IPI. After this patch: waker cpu will put the task to the wakelist of wakee cpu, and send an IPI. Benchmark: We've tested schbench, unixbench, and hachbench on both x86 and arm64. On x86 (Intel Xeon Platinum 8269CY): schbench -m 2 -t 8 Latency percentiles (usec) before after 50.0000th: 8 6 75.0000th: 10 7 90.0000th: 11 8 95.0000th: 12 8 *99.0000th: 13 10 99.5000th: 15 11 99.9000th: 18 14 Unixbench with full threads (104) before after Dhrystone 2 using register variables 3011862938 3009935994 -0.06% Double-Precision Whetstone 617119.3 617298.5 0.03% Execl Throughput 27667.3 27627.3 -0.14% File Copy 1024 bufsize 2000 maxblocks 785871.4 784906.2 -0.12% File Copy 256 bufsize 500 maxblocks 210113.6 212635.4 1.20% File Copy 4096 bufsize 8000 maxblocks 2328862.2 2320529.1 -0.36% Pipe Throughput 145535622.8 145323033.2 -0.15% Pipe-based Context Switching 3221686.4 3583975.4 11.25% Process Creation 101347.1 103345.4 1.97% Shell Scripts (1 concurrent) 120193.5 123977.8 3.15% Shell Scripts (8 concurrent) 17233.4 17138.4 -0.55% System Call Overhead 5300604.8 5312213.6 0.22% hackbench -g 1 -l 100000 before after Time 3.246 2.251 On arm64 (Ampere Altra): schbench -m 2 -t 8 Latency percentiles (usec) before after 50.0000th: 14 10 75.0000th: 19 14 90.0000th: 22 16 95.0000th: 23 16 *99.0000th: 24 17 99.5000th: 24 17 99.9000th: 28 25 Unixbench with full threads (80) before after Dhrystone 2 using register variables 3536194249 3537019613 0.02% Double-Precision Whetstone 629383.6 629431.6 0.01% Execl Throughput 65920.5 65846.2 -0.11% File Copy 1024 bufsize 2000 maxblocks 1063722.8 1064026.8 0.03% File Copy 256 bufsize 500 maxblocks 322684.5 318724.5 -1.23% File Copy 4096 bufsize 8000 maxblocks 2348285.3 2328804.8 -0.83% Pipe Throughput 133542875.3 131619389.8 -1.44% Pipe-based Context Switching 3215356.1 3576945.1 11.25% Process Creation 108520.5 120184.6 10.75% Shell Scripts (1 concurrent) 122636.3 121888 -0.61% Shell Scripts (8 concurrent) 17462.1 17381.4 -0.46% System Call Overhead 4429998.9 4435006.7 0.11% hackbench -g 1 -l 100000 before after Time 4.217 2.916 Our patch has improvement on schbench, hackbench and Pipe-based Context Switching of unixbench when there exists idle cpus, and no obvious regression on other tests of unixbench. This can help improve rt in scenes where wakeup happens frequently. 
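For illustration, the condition that results from dropping the WF_ON_CPU restriction can be modelled by the simplified sketch below. This is not the code added by this patch (the real logic lives in ttwu_queue_cond() in kernel/sched/core.c); the helper name and parameters are made up for the sketch and only mirror the behaviour described above: a wakee cpu in the same llc is handled through the wakelist whenever it has no runnable tasks, rather than only when the wakee is still descheduling.

    /* Illustrative sketch only -- hypothetical helper, not this patch's code. */
    static bool use_wakelist(int this_cpu, int wakee_cpu,
                             bool shares_llc, unsigned int wakee_nr_running)
    {
            /* Different llc: always defer to the wakee to avoid cache bouncing. */
            if (!shares_llc)
                    return true;

            /* Never queue a remote wakeup to the current cpu itself. */
            if (wakee_cpu == this_cpu)
                    return false;

            /*
             * Same llc: after this patch the wakelist is used whenever the
             * wakee cpu has nothing running (idle or about to become idle),
             * not only for WF_ON_CPU wakeups.
             */
            return wakee_nr_running == 0;
    }

Keeping the enqueue on the wakee side is what lets CPU0 return early in the "After this patch" flow above: the waker only appends to the wakelist and notifies the wakee, which then locks and updates its own runqueue.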
Signed-off-by: Tianchen Ding Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Valentin Schneider Link: https://lore.kernel.org/r/20220608233412.327341-3-dtcccc@linux.alibaba.com Signed-off-by: Guan Jing --- kernel/sched/core.c | 26 ++++++++++++++------------ kernel/sched/sched.h | 1 - 2 files changed, 14 insertions(+), 13 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index d06ac41204c6..e2b6a3dcd2b8 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3026,7 +3026,7 @@ bool cpus_share_lowest_cache(int this_cpu, int that_cpu) return per_cpu(sd_lowest_cache_id, this_cpu) == per_cpu(sd_lowest_cache_id, that_cpu); } -static inline bool ttwu_queue_cond(int cpu, int wake_flags) +static inline bool ttwu_queue_cond(int cpu) { /* * If the CPU does not share cache, then queue the task on the @@ -3035,17 +3035,21 @@ static inline bool ttwu_queue_cond(int cpu, int wake_flags) if (!cpus_share_cache(smp_processor_id(), cpu)) return true; + if (cpu == smp_processor_id()) + return false; + /* - * If the task is descheduling and the only running task on the - * CPU then use the wakelist to offload the task activation to - * the soon-to-be-idle CPU as the current CPU is likely busy. - * nr_running is checked to avoid unnecessary task stacking. + * If the wakee cpu is idle, or the task is descheduling and the + * only running task on the CPU, then use the wakelist to offload + * the task activation to the idle (or soon-to-be-idle) CPU as + * the current CPU is likely busy. nr_running is checked to + * avoid unnecessary task stacking. * * Note that we can only get here with (wakee) p->on_rq=0, * p->on_cpu can be whatever, we've done the dequeue, so * the wakee has been accounted out of ->nr_running. */ - if ((wake_flags & WF_ON_CPU) && !cpu_rq(cpu)->nr_running) + if (!cpu_rq(cpu)->nr_running) return true; return false; @@ -3053,10 +3057,7 @@ static inline bool ttwu_queue_cond(int cpu, int wake_flags) static bool ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags) { - if (sched_feat(TTWU_QUEUE) && ttwu_queue_cond(cpu, wake_flags)) { - if (WARN_ON_ONCE(cpu == smp_processor_id())) - return false; - + if (sched_feat(TTWU_QUEUE) && ttwu_queue_cond(cpu)) { sched_clock_cpu(cpu); /* Sync clocks across CPUs */ __ttwu_queue_wakelist(p, cpu, wake_flags); return true; @@ -3333,7 +3334,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) * scheduling. */ if (smp_load_acquire(&p->on_cpu) && - ttwu_queue_wakelist(p, task_cpu(p), wake_flags | WF_ON_CPU)) + ttwu_queue_wakelist(p, task_cpu(p), wake_flags)) goto unlock; /* @@ -3895,7 +3896,8 @@ static inline void prepare_task(struct task_struct *next) * Claim the task as running, we do this before switching to it * such that any running task will have this set. * - * See the ttwu() WF_ON_CPU case and its ordering comment. + * See the smp_load_acquire(&p->on_cpu) case in ttwu() and + * its ordering comment. 
*/ WRITE_ONCE(next->on_cpu, 1); #endif diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 3bf1e7e3f995..10b0d7e52a98 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -2085,7 +2085,6 @@ static inline int task_on_rq_migrating(struct task_struct *p) #define WF_SYNC 0x01 /* Waker goes to sleep after wakeup */ #define WF_FORK 0x02 /* Child wakeup after fork */ #define WF_MIGRATED 0x04 /* Internal use, task got migrated */ -#define WF_ON_CPU 0x08 /* Wakee is on_cpu */ /* * To aid in avoiding the subversion of "niceness" due to uneven distribution -- Gitee From a6dcd26feb230d65381197361e026f491e28643b Mon Sep 17 00:00:00 2001 From: Tianchen Ding Date: Sun, 28 May 2023 18:59:37 +0800 Subject: [PATCH 8/8] sched: Clear ttwu_pending after enqueue_task() mainline inclusion from mainline-v6.2-rc1 commit d6962c4fe8f96f7d384d6489b6b5ab5bf3e35991 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I78WM8 Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?h=v6.4-rc3&id=d6962c4fe8f96f7d384d6489b6b5ab5bf3e35991 -------------------------------- We found a long tail latency in schbench when m*t is close to nr_cpus. (e.g., "schbench -m 2 -t 16" on a machine with 32 cpus.) This is because when the wakee cpu is idle, rq->ttwu_pending is cleared too early, and idle_cpu() will return true until the wakee task is enqueued. This misleads the waker when selecting an idle cpu, and can wake multiple worker threads on the same wakee cpu. This situation is enlarged by commit f3dd3f674555 ("sched: Remove the limitation of WF_ON_CPU on wakelist if wakee cpu is idle") because it tends to use the wakelist. Here is the result of "schbench -m 2 -t 16" on a VM with 32vcpu (Intel(R) Xeon(R) Platinum 8369B). Latency percentiles (usec): base base+revert_f3dd3f674555 base+this_patch 50.0000th: 9 13 9 75.0000th: 12 19 12 90.0000th: 15 22 15 95.0000th: 18 24 17 *99.0000th: 27 31 24 99.5000th: 3364 33 27 99.9000th: 12560 36 30 We also tested on unixbench and hackbench, and saw no performance change. Signed-off-by: Tianchen Ding Signed-off-by: Peter Zijlstra (Intel) Acked-by: Mel Gorman Link: https://lkml.kernel.org/r/20221104023601.12844-1-dtcccc@linux.alibaba.com --- kernel/sched/core.c | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index e2b6a3dcd2b8..ee6e459611f7 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2933,13 +2933,6 @@ void sched_ttwu_pending(void *arg) if (!llist) return; - /* - * rq::ttwu_pending racy indication of out-standing wakeups. - * Races such that false-negatives are possible, since they - * are shorter lived that false-positives would be. - */ - WRITE_ONCE(rq->ttwu_pending, 0); - rq_lock_irqsave(rq, &rf); update_rq_clock(rq); @@ -2953,6 +2946,17 @@ void sched_ttwu_pending(void *arg) ttwu_do_activate(rq, p, p->sched_remote_wakeup ? WF_MIGRATED : 0, &rf); } + /* + * Must be after enqueueing at least once task such that + * idle_cpu() does not observe a false-negative -- if it does, + * it is possible for select_idle_siblings() to stack a number + * of tasks on this CPU during that window. + * + * It is ok to clear ttwu_pending when another task pending. + * We will receive IPI after local irq enabled and then enqueue it. + * Since now nr_running > 0, idle_cpu() will always get correct result. + */ + WRITE_ONCE(rq->ttwu_pending, 0); rq_unlock_irqrestore(rq, &rf). } -- Gitee
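A closing note on the ordering in the last patch: idle_cpu() treats a non-zero rq->ttwu_pending as "not idle", so the waker-side check effectively behaves like the simplified model below. This is only an illustrative sketch; the function name and parameters are stand-ins, not the actual kernel implementation.

    /* Simplified model of the waker-side idle test -- not the real idle_cpu(). */
    static bool cpu_looks_idle(unsigned int nr_running, unsigned int ttwu_pending)
    {
            /*
             * If ttwu_pending is cleared before the pending tasks are
             * enqueued, both values can read as zero for a short window
             * and several wakers may stack tasks on the same "idle" cpu.
             * Clearing it after enqueue_task() keeps at least one of the
             * two non-zero throughout, which is what the patch relies on.
             */
            return nr_running == 0 && ttwu_pending == 0;
    }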