diff --git a/include/linux/sched/sd_flags.h b/include/linux/sched/sd_flags.h index fad77b5172e2d350681108a5788ae449d3c71d28..a8b28647aafc812d011839852c78ce81aee09ee2 100644 --- a/include/linux/sched/sd_flags.h +++ b/include/linux/sched/sd_flags.h @@ -109,6 +109,13 @@ SD_FLAG(SD_ASYM_CPUCAPACITY_FULL, SDF_SHARED_PARENT | SDF_NEEDS_GROUPS) */ SD_FLAG(SD_SHARE_CPUCAPACITY, SDF_SHARED_CHILD | SDF_NEEDS_GROUPS) +/* + * Domain members share CPU cluster (LLC tags or L2 cache) + * + * NEEDS_GROUPS: Clusters are shared between groups. + */ +SD_FLAG(SD_CLUSTER, SDF_NEEDS_GROUPS) + /* * Domain members share CPU package resources (i.e. caches) * diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h index 9981f661189ea72164a77cc2f9df5638a08d85cf..ead2a8ca1ea359fd6536c12e34cfcf911f013053 100644 --- a/include/linux/sched/topology.h +++ b/include/linux/sched/topology.h @@ -45,7 +45,7 @@ static inline int cpu_smt_flags(void) #ifdef CONFIG_SCHED_CLUSTER static inline int cpu_cluster_flags(void) { - return SD_SHARE_PKG_RESOURCES; + return SD_CLUSTER | SD_SHARE_PKG_RESOURCES; } #endif @@ -182,6 +182,7 @@ cpumask_var_t *alloc_sched_domains(unsigned int ndoms); void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms); bool cpus_share_cache(int this_cpu, int that_cpu); +bool cpus_share_resources(int this_cpu, int that_cpu); typedef const struct cpumask *(*sched_domain_mask_f)(int cpu); typedef int (*sched_domain_flags_f)(void); @@ -235,6 +236,11 @@ static inline bool cpus_share_cache(int this_cpu, int that_cpu) return true; } +static inline bool cpus_share_resources(int this_cpu, int that_cpu) +{ + return true; +} + #endif /* !CONFIG_SMP */ #if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 7a0997e7e1368f171cc40725f9a6202858c7c589..363bf76622883b459ce6318fc6598fef05afc3a6 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3960,6 +3960,18 @@ bool cpus_share_cache(int this_cpu, int that_cpu) return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu); } +/* + * Whether CPUs are share cache resources, which means LLC on non-cluster + * machines and LLC tag or L2 on machines with clusters. + */ +bool cpus_share_resources(int this_cpu, int that_cpu) +{ + if (this_cpu == that_cpu) + return true; + + return per_cpu(sd_share_id, this_cpu) == per_cpu(sd_share_id, that_cpu); +} + static inline bool ttwu_queue_cond(struct task_struct *p, int cpu) { /* diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 318258ea011e989937785112f28e373587166da3..ea5db45c7754b1bae3dbd37c5874854f6c5e53ec 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -7585,6 +7585,30 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool } } + if (static_branch_unlikely(&sched_cluster_active)) { + struct sched_group *sg = sd->groups; + + if (sg->flags & SD_CLUSTER) { + for_each_cpu_wrap(cpu, sched_group_span(sg), target + 1) { + if (!cpumask_test_cpu(cpu, cpus)) + continue; + + if (has_idle_core) { + i = select_idle_core(p, cpu, cpus, &idle_cpu); + if ((unsigned int)i < nr_cpumask_bits) + return i; + } else { + if (--nr <= 0) + return -1; + idle_cpu = __select_idle_cpu(cpu, p); + if ((unsigned int)idle_cpu < nr_cpumask_bits) + return idle_cpu; + } + } + cpumask_andnot(cpus, cpus, sched_group_span(sg)); + } + } + for_each_cpu_wrap(cpu, cpus, target + 1) { if (has_idle_core) { i = select_idle_core(p, cpu, cpus, &idle_cpu); @@ -7592,7 +7616,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool return i; } else { - if (!--nr) + if (--nr <= 0) return -1; idle_cpu = __select_idle_cpu(cpu, p); if ((unsigned int)idle_cpu < nr_cpumask_bits) @@ -7708,7 +7732,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) bool has_idle_core = false; struct sched_domain *sd; unsigned long task_util, util_min, util_max; - int i, recent_used_cpu; + int i, recent_used_cpu, prev_aff = -1; /* * On asymmetric system, update task utilization because we will check @@ -7744,8 +7768,13 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) cpumask_test_cpu(prev, p->select_cpus) && #endif asym_fits_cpu(task_util, util_min, util_max, prev)) { - SET_STAT(found_idle_cpu_easy); - return prev; + if (!static_branch_unlikely(&sched_cluster_active) || + cpus_share_resources(prev, target)) { + SET_STAT(found_idle_cpu_easy); + return prev; + } + + prev_aff = prev; } /* @@ -7778,12 +7807,17 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) cpumask_test_cpu(recent_used_cpu, p->cpus_ptr) && #endif asym_fits_cpu(task_util, util_min, util_max, recent_used_cpu)) { - /* - * Replace recent_used_cpu with prev as it is a potential - * candidate for the next wake: - */ - SET_STAT(found_idle_cpu_easy); - return recent_used_cpu; + if (!static_branch_unlikely(&sched_cluster_active) || + cpus_share_resources(recent_used_cpu, target)) { + /* + * Replace recent_used_cpu with prev as it is a potential + * candidate for the next wake: + */ + SET_STAT(found_idle_cpu_easy); + return recent_used_cpu; + } + } else { + recent_used_cpu = -1; } /* @@ -7830,6 +7864,18 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) } SET_STAT(nofound_idle_cpu); + + /* + * For cluster machines which have lower sharing cache like L2 or + * LLC Tag, we tend to find an idle CPU in the target's cluster + * first. But prev_cpu or recent_used_cpu may also be a good candidate, + * use them if possible when no idle CPU found in select_idle_cpu(). + */ + if ((unsigned int)prev_aff < nr_cpumask_bits) + return prev_aff; + if ((unsigned int)recent_used_cpu < nr_cpumask_bits) + return recent_used_cpu; + return target; } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 4b679122d26f639ba3cc32cec382c84fe069d962..4dc16e0216528c797d61189ad8a8058219976421 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1917,11 +1917,13 @@ static inline struct sched_domain *lowest_flag_domain(int cpu, int flag) DECLARE_PER_CPU(struct sched_domain __rcu *, sd_llc); DECLARE_PER_CPU(int, sd_llc_size); DECLARE_PER_CPU(int, sd_llc_id); +DECLARE_PER_CPU(int, sd_share_id); DECLARE_PER_CPU(struct sched_domain_shared __rcu *, sd_llc_shared); DECLARE_PER_CPU(struct sched_domain __rcu *, sd_numa); DECLARE_PER_CPU(struct sched_domain __rcu *, sd_asym_packing); DECLARE_PER_CPU(struct sched_domain __rcu *, sd_asym_cpucapacity); extern struct static_key_false sched_asym_cpucapacity; +extern struct static_key_false sched_cluster_active; static __always_inline bool sched_asym_cpucap_active(void) { diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 9dd172be1d6b483aed6c28b6a8c93286788bc38f..6e53175382ce4f607dd6474994259802014ef48f 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -684,11 +684,14 @@ static void destroy_sched_domains(struct sched_domain *sd) DEFINE_PER_CPU(struct sched_domain __rcu *, sd_llc); DEFINE_PER_CPU(int, sd_llc_size); DEFINE_PER_CPU(int, sd_llc_id); +DEFINE_PER_CPU(int, sd_share_id); DEFINE_PER_CPU(struct sched_domain_shared __rcu *, sd_llc_shared); DEFINE_PER_CPU(struct sched_domain __rcu *, sd_numa); DEFINE_PER_CPU(struct sched_domain __rcu *, sd_asym_packing); DEFINE_PER_CPU(struct sched_domain __rcu *, sd_asym_cpucapacity); + DEFINE_STATIC_KEY_FALSE(sched_asym_cpucapacity); +DEFINE_STATIC_KEY_FALSE(sched_cluster_active); static void update_top_cache_domain(int cpu) { @@ -719,6 +722,17 @@ static void update_top_cache_domain(int cpu) per_cpu(sd_llc_id, cpu) = id; rcu_assign_pointer(per_cpu(sd_llc_shared, cpu), sds); + sd = lowest_flag_domain(cpu, SD_CLUSTER); + if (sd) + id = cpumask_first(sched_domain_span(sd)); + + /* + * This assignment should be placed after the sd_llc_id as + * we want this id equals to cluster id on cluster machines + * but equals to LLC id on non-Cluster machines. + */ + per_cpu(sd_share_id, cpu) = id; + sd = lowest_flag_domain(cpu, SD_NUMA); rcu_assign_pointer(per_cpu(sd_numa, cpu), sd); @@ -1577,6 +1591,7 @@ static struct cpumask ***sched_domains_numa_masks; */ #define TOPOLOGY_SD_FLAGS \ (SD_SHARE_CPUCAPACITY | \ + SD_CLUSTER | \ SD_SHARE_PKG_RESOURCES | \ SD_NUMA | \ SD_ASYM_PACKING) @@ -2497,6 +2512,7 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att struct rq *rq = NULL; int i, ret = -ENOMEM; bool has_asym = false; + bool has_cluster = false; if (WARN_ON(cpumask_empty(cpu_map))) goto error; @@ -2630,12 +2646,18 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att WRITE_ONCE(d.rd->max_cpu_capacity, rq->cpu_capacity_orig); cpu_attach_domain(sd, d.rd, i); + + if (lowest_flag_domain(i, SD_CLUSTER)) + has_cluster = true; } rcu_read_unlock(); if (has_asym) static_branch_inc_cpuslocked(&sched_asym_cpucapacity); + if (has_cluster) + static_branch_inc_cpuslocked(&sched_cluster_active); + if (rq && sched_debug_verbose) { pr_info("root domain span: %*pbl (max cpu_capacity = %lu)\n", cpumask_pr_args(cpu_map), rq->rd->max_cpu_capacity); @@ -2735,6 +2757,9 @@ static void detach_destroy_domains(const struct cpumask *cpu_map) if (rcu_access_pointer(per_cpu(sd_asym_cpucapacity, cpu))) static_branch_dec_cpuslocked(&sched_asym_cpucapacity); + if (static_branch_unlikely(&sched_cluster_active)) + static_branch_dec_cpuslocked(&sched_cluster_active); + rcu_read_lock(); for_each_cpu(i, cpu_map) cpu_attach_domain(NULL, &def_root_domain, i);