diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h index 62a1e22b057f4f1d64dce55cc0c187c7b217b4c6..c60eea1c805eb9ea1ebba10f4efccc35cc416e1e 100644 --- a/include/linux/sched/topology.h +++ b/include/linux/sched/topology.h @@ -189,7 +189,7 @@ cpumask_var_t *alloc_sched_domains(unsigned int ndoms); void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms); bool cpus_share_cache(int this_cpu, int that_cpu); -bool cpus_share_lowest_cache(int this_cpu, int that_cpu); +bool cpus_share_resources(int this_cpu, int that_cpu); typedef const struct cpumask *(*sched_domain_mask_f)(int cpu); typedef int (*sched_domain_flags_f)(void); @@ -244,7 +244,7 @@ static inline bool cpus_share_cache(int this_cpu, int that_cpu) return true; } -static inline bool cpus_share_lowest_cache(int this_cpu, int that_cpu) +static inline bool cpus_share_resources(int this_cpu, int that_cpu) { return true; } diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 8c8c946ee1de223cb5c84a770abbeeefccd7f9fb..ebd8c3a6a964f71eb483e62b5006daa5c9eb2b55 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3022,15 +3022,15 @@ bool cpus_share_cache(int this_cpu, int that_cpu) } /* - * Whether CPUs are share lowest cache, which means LLC on non-cluster + * Whether CPUs share cache resources, which means LLC on non-cluster * machines and LLC tag or L2 on machines with clusters. 
*/ -bool cpus_share_lowest_cache(int this_cpu, int that_cpu) +bool cpus_share_resources(int this_cpu, int that_cpu) { if (this_cpu == that_cpu) return true; - return per_cpu(sd_lowest_cache_id, this_cpu) == per_cpu(sd_lowest_cache_id, that_cpu); + return per_cpu(sd_share_id, this_cpu) == per_cpu(sd_share_id, that_cpu); } static inline bool ttwu_queue_cond(int cpu) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 6f6ced57cf778fbf2d1cbbe48d50b410ab6908f3..8698094ef8ce4e68dd3115915e328b024f9d8887 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6664,10 +6664,10 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t } if (static_branch_unlikely(&sched_cluster_active)) { - struct sched_domain *sdc = rcu_dereference(per_cpu(sd_cluster, target)); + struct sched_group *sg = sd->groups; - if (sdc) { - for_each_cpu_wrap(cpu, sched_domain_span(sdc), target) { + if (sg->flags & SD_CLUSTER) { + for_each_cpu_wrap(cpu, sched_group_span(sg), target) { if (!cpumask_test_cpu(cpu, cpus)) continue; @@ -6683,7 +6683,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t return idle_cpu; } } - cpumask_andnot(cpus, cpus, sched_domain_span(sdc)); + cpumask_andnot(cpus, cpus, sched_group_span(sg)); } } @@ -6778,7 +6778,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) { struct sched_domain *sd; unsigned long task_util; - int i, recent_used_cpu; + int i, recent_used_cpu, prev_aff = -1; /* * On asymmetric system, update task utilization because we will check @@ -6806,14 +6806,19 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) /* * If the previous CPU is cache affine and idle, don't be stupid: */ - if (prev != target && cpus_share_lowest_cache(prev, target) && + if (prev != target && cpus_share_cache(prev, target) && (available_idle_cpu(prev) || sched_idle_cpu(prev)) && #ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY cpumask_test_cpu(prev, 
p->select_cpus) && #endif asym_fits_capacity(task_util, prev)) { SET_STAT(found_idle_cpu_easy); - return prev; + + if (!static_branch_unlikely(&sched_cluster_active) || + cpus_share_resources(prev, target)) + return prev; + + prev_aff = prev; } /* @@ -6837,7 +6842,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) recent_used_cpu = p->recent_used_cpu; if (recent_used_cpu != prev && recent_used_cpu != target && - cpus_share_lowest_cache(recent_used_cpu, target) && + cpus_share_cache(recent_used_cpu, target) && (available_idle_cpu(recent_used_cpu) || sched_idle_cpu(recent_used_cpu)) && #ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY cpumask_test_cpu(p->recent_used_cpu, p->select_cpus) && @@ -6851,7 +6856,13 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) */ SET_STAT(found_idle_cpu_easy); p->recent_used_cpu = prev; - return recent_used_cpu; + + if (!static_branch_unlikely(&sched_cluster_active) || + cpus_share_resources(recent_used_cpu, target)) + return recent_used_cpu; + + } else { + recent_used_cpu = -1; } /* @@ -6888,6 +6899,18 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) } SET_STAT(nofound_idle_cpu); + + /* + * For cluster machines which have lower sharing cache like L2 or + * LLC Tag, we tend to find an idle CPU in the target's cluster + * first. But prev_cpu or recent_used_cpu may also be a good candidate, + * use them if possible when no idle CPU found in select_idle_cpu(). 
+ */ + if ((unsigned int)prev_aff < nr_cpumask_bits) + return prev_aff; + if ((unsigned int)recent_used_cpu < nr_cpumask_bits) + return recent_used_cpu; + return target; } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 91ae933c20be27131059570f5003a6fcaf458d31..2c82deee946adcf30d49a12c610760bf0635f043 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1844,9 +1844,8 @@ static inline struct sched_domain *lowest_flag_domain(int cpu, int flag) DECLARE_PER_CPU(struct sched_domain __rcu *, sd_llc); DECLARE_PER_CPU(int, sd_llc_size); DECLARE_PER_CPU(int, sd_llc_id); -DECLARE_PER_CPU(int, sd_lowest_cache_id); +DECLARE_PER_CPU(int, sd_share_id); DECLARE_PER_CPU(struct sched_domain_shared __rcu *, sd_llc_shared); -DECLARE_PER_CPU(struct sched_domain __rcu *, sd_cluster); DECLARE_PER_CPU(struct sched_domain __rcu *, sd_numa); DECLARE_PER_CPU(struct sched_domain __rcu *, sd_asym_packing); DECLARE_PER_CPU(struct sched_domain __rcu *, sd_asym_cpucapacity); @@ -1880,7 +1879,7 @@ struct sched_group { struct sched_group_capacity *sgc; int asym_prefer_cpu; /* CPU of highest priority in group */ - KABI_RESERVE(1) + KABI_USE(1, int flags) KABI_RESERVE(2) /* * The CPUs this group covers. 
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 0058681e7f434c3ace3269f30943d059d93724ac..e7413d6dd75b88ee50d0ef655ed15598b582fd86 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -647,8 +647,7 @@ static void destroy_sched_domains(struct sched_domain *sd) DEFINE_PER_CPU(struct sched_domain __rcu *, sd_llc); DEFINE_PER_CPU(int, sd_llc_size); DEFINE_PER_CPU(int, sd_llc_id); -DEFINE_PER_CPU(int, sd_lowest_cache_id); -DEFINE_PER_CPU(struct sched_domain __rcu *, sd_cluster); +DEFINE_PER_CPU(int, sd_share_id); DEFINE_PER_CPU(struct sched_domain_shared __rcu *, sd_llc_shared); DEFINE_PER_CPU(struct sched_domain __rcu *, sd_numa); DEFINE_PER_CPU(struct sched_domain __rcu *, sd_asym_packing); @@ -689,14 +688,13 @@ static void update_top_cache_domain(int cpu) sd = lowest_flag_domain(cpu, SD_CLUSTER); if (sd) id = cpumask_first(sched_domain_span(sd)); - rcu_assign_pointer(per_cpu(sd_cluster, cpu), sd); /* * This assignment should be placed after the sd_llc_id as * we want this id equals to cluster id on cluster machines * but equals to LLC id on non-Cluster machines. */ - per_cpu(sd_lowest_cache_id, cpu) = id; + per_cpu(sd_share_id, cpu) = id; sd = lowest_flag_domain(cpu, SD_NUMA); rcu_assign_pointer(per_cpu(sd_numa, cpu), sd); @@ -727,8 +725,12 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) if (sd_parent_degenerate(tmp, parent)) { tmp->parent = parent->parent; - if (parent->parent) + + if (parent->parent) { parent->parent->child = tmp; + parent->parent->groups->flags = tmp->flags; + } + /* * Transfer SD_PREFER_SIBLING down in case of a * degenerate parent; the spans match for this @@ -745,8 +747,20 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) tmp = sd; sd = sd->parent; destroy_sched_domain(tmp); - if (sd) + if (sd) { + struct sched_group *sg = sd->groups; + + /* + * sched groups hold the flags of the child sched + * domain for convenience. 
Clear such flags since + * the child is being destroyed. + */ + do { + sg->flags = 0; + } while (sg != sd->groups); + sd->child = NULL; + } } for (tmp = sd; tmp; tmp = tmp->parent) @@ -945,10 +959,12 @@ build_group_from_child_sched_domain(struct sched_domain *sd, int cpu) return NULL; sg_span = sched_group_span(sg); - if (sd->child) + if (sd->child) { cpumask_copy(sg_span, sched_domain_span(sd->child)); - else + sg->flags = sd->child->flags; + } else { cpumask_copy(sg_span, sched_domain_span(sd)); + } atomic_inc(&sg->ref); return sg; @@ -1198,6 +1214,7 @@ static struct sched_group *get_group(int cpu, struct sd_data *sdd) if (child) { cpumask_copy(sched_group_span(sg), sched_domain_span(child)); cpumask_copy(group_balance_mask(sg), sched_group_span(sg)); + sg->flags = child->flags; } else { cpumask_set_cpu(cpu, sched_group_span(sg)); cpumask_set_cpu(cpu, group_balance_mask(sg)); @@ -2366,7 +2383,6 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att sd = build_sched_domain(tl, cpu_map, attr, sd, i); has_asym |= sd->flags & SD_ASYM_CPUCAPACITY; - has_cluster |= sd->flags & SD_CLUSTER; if (tl == sched_domain_topology) *per_cpu_ptr(d.sd, i) = sd; @@ -2474,6 +2490,9 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att WRITE_ONCE(d.rd->max_cpu_capacity, rq->cpu_capacity_orig); cpu_attach_domain(sd, d.rd, i); + + if (lowest_flag_domain(i, SD_CLUSTER)) + has_cluster = true; } rcu_read_unlock(); @@ -2583,7 +2602,7 @@ static void detach_destroy_domains(const struct cpumask *cpu_map) if (rcu_access_pointer(per_cpu(sd_asym_cpucapacity, cpu))) static_branch_dec_cpuslocked(&sched_asym_cpucapacity); - if (rcu_access_pointer(per_cpu(sd_cluster, cpu))) + if (static_branch_unlikely(&sched_cluster_active)) static_branch_dec_cpuslocked(&sched_cluster_active); rcu_read_lock();