diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h index 62a1e22b057f4f1d64dce55cc0c187c7b217b4c6..c60eea1c805eb9ea1ebba10f4efccc35cc416e1e 100644 --- a/include/linux/sched/topology.h +++ b/include/linux/sched/topology.h @@ -189,7 +189,7 @@ cpumask_var_t *alloc_sched_domains(unsigned int ndoms); void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms); bool cpus_share_cache(int this_cpu, int that_cpu); -bool cpus_share_lowest_cache(int this_cpu, int that_cpu); +bool cpus_share_resources(int this_cpu, int that_cpu); typedef const struct cpumask *(*sched_domain_mask_f)(int cpu); typedef int (*sched_domain_flags_f)(void); @@ -244,7 +244,7 @@ static inline bool cpus_share_cache(int this_cpu, int that_cpu) return true; } -static inline bool cpus_share_lowest_cache(int this_cpu, int that_cpu) +static inline bool cpus_share_resources(int this_cpu, int that_cpu) { return true; } diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 8c8c946ee1de223cb5c84a770abbeeefccd7f9fb..ebd8c3a6a964f71eb483e62b5006daa5c9eb2b55 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3022,15 +3022,15 @@ bool cpus_share_cache(int this_cpu, int that_cpu) } /* - * Whether CPUs are share lowest cache, which means LLC on non-cluster + * Whether CPUs share cache resources, which means LLC on non-cluster * machines and LLC tag or L2 on machines with clusters. 
*/ -bool cpus_share_lowest_cache(int this_cpu, int that_cpu) +bool cpus_share_resources(int this_cpu, int that_cpu) { if (this_cpu == that_cpu) return true; - return per_cpu(sd_lowest_cache_id, this_cpu) == per_cpu(sd_lowest_cache_id, that_cpu); + return per_cpu(sd_share_id, this_cpu) == per_cpu(sd_share_id, that_cpu); } static inline bool ttwu_queue_cond(int cpu) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 6f6ced57cf778fbf2d1cbbe48d50b410ab6908f3..8698094ef8ce4e68dd3115915e328b024f9d8887 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6664,10 +6664,10 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t } if (static_branch_unlikely(&sched_cluster_active)) { - struct sched_domain *sdc = rcu_dereference(per_cpu(sd_cluster, target)); + struct sched_group *sg = sd->groups; - if (sdc) { - for_each_cpu_wrap(cpu, sched_domain_span(sdc), target) { + if (sg->flags & SD_CLUSTER) { + for_each_cpu_wrap(cpu, sched_group_span(sg), target) { if (!cpumask_test_cpu(cpu, cpus)) continue; @@ -6683,7 +6683,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t return idle_cpu; } } - cpumask_andnot(cpus, cpus, sched_domain_span(sdc)); + cpumask_andnot(cpus, cpus, sched_group_span(sg)); } } @@ -6778,7 +6778,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) { struct sched_domain *sd; unsigned long task_util; - int i, recent_used_cpu; + int i, recent_used_cpu, prev_aff = -1; /* * On asymmetric system, update task utilization because we will check @@ -6806,14 +6806,19 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) /* * If the previous CPU is cache affine and idle, don't be stupid: */ - if (prev != target && cpus_share_lowest_cache(prev, target) && + if (prev != target && cpus_share_cache(prev, target) && (available_idle_cpu(prev) || sched_idle_cpu(prev)) && #ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY cpumask_test_cpu(prev, 
p->select_cpus) && #endif asym_fits_capacity(task_util, prev)) { SET_STAT(found_idle_cpu_easy); - return prev; + + if (!static_branch_unlikely(&sched_cluster_active) || + cpus_share_resources(prev, target)) + return prev; + + prev_aff = prev; } /* @@ -6837,7 +6842,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) recent_used_cpu = p->recent_used_cpu; if (recent_used_cpu != prev && recent_used_cpu != target && - cpus_share_lowest_cache(recent_used_cpu, target) && + cpus_share_cache(recent_used_cpu, target) && (available_idle_cpu(recent_used_cpu) || sched_idle_cpu(recent_used_cpu)) && #ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY cpumask_test_cpu(p->recent_used_cpu, p->select_cpus) && @@ -6851,7 +6856,13 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) */ SET_STAT(found_idle_cpu_easy); p->recent_used_cpu = prev; - return recent_used_cpu; + + if (!static_branch_unlikely(&sched_cluster_active) || + cpus_share_resources(recent_used_cpu, target)) + return recent_used_cpu; + + } else { + recent_used_cpu = -1; } /* @@ -6888,6 +6899,18 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) } SET_STAT(nofound_idle_cpu); + + /* + * For cluster machines which have lower sharing cache like L2 or + * LLC Tag, we tend to find an idle CPU in the target's cluster + * first. But prev_cpu or recent_used_cpu may also be a good candidate, + * use them if possible when no idle CPU found in select_idle_cpu(). 
+ */ + if ((unsigned int)prev_aff < nr_cpumask_bits) + return prev_aff; + if ((unsigned int)recent_used_cpu < nr_cpumask_bits) + return recent_used_cpu; + return target; } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 91ae933c20be27131059570f5003a6fcaf458d31..2c82deee946adcf30d49a12c610760bf0635f043 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1844,9 +1844,8 @@ static inline struct sched_domain *lowest_flag_domain(int cpu, int flag) DECLARE_PER_CPU(struct sched_domain __rcu *, sd_llc); DECLARE_PER_CPU(int, sd_llc_size); DECLARE_PER_CPU(int, sd_llc_id); -DECLARE_PER_CPU(int, sd_lowest_cache_id); +DECLARE_PER_CPU(int, sd_share_id); DECLARE_PER_CPU(struct sched_domain_shared __rcu *, sd_llc_shared); -DECLARE_PER_CPU(struct sched_domain __rcu *, sd_cluster); DECLARE_PER_CPU(struct sched_domain __rcu *, sd_numa); DECLARE_PER_CPU(struct sched_domain __rcu *, sd_asym_packing); DECLARE_PER_CPU(struct sched_domain __rcu *, sd_asym_cpucapacity); @@ -1880,7 +1879,7 @@ struct sched_group { struct sched_group_capacity *sgc; int asym_prefer_cpu; /* CPU of highest priority in group */ - KABI_RESERVE(1) + KABI_USE(1, int flags) KABI_RESERVE(2) /* * The CPUs this group covers. 
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 0058681e7f434c3ace3269f30943d059d93724ac..e7413d6dd75b88ee50d0ef655ed15598b582fd86 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -647,8 +647,7 @@ static void destroy_sched_domains(struct sched_domain *sd) DEFINE_PER_CPU(struct sched_domain __rcu *, sd_llc); DEFINE_PER_CPU(int, sd_llc_size); DEFINE_PER_CPU(int, sd_llc_id); -DEFINE_PER_CPU(int, sd_lowest_cache_id); -DEFINE_PER_CPU(struct sched_domain __rcu *, sd_cluster); +DEFINE_PER_CPU(int, sd_share_id); DEFINE_PER_CPU(struct sched_domain_shared __rcu *, sd_llc_shared); DEFINE_PER_CPU(struct sched_domain __rcu *, sd_numa); DEFINE_PER_CPU(struct sched_domain __rcu *, sd_asym_packing); @@ -689,14 +688,13 @@ static void update_top_cache_domain(int cpu) sd = lowest_flag_domain(cpu, SD_CLUSTER); if (sd) id = cpumask_first(sched_domain_span(sd)); - rcu_assign_pointer(per_cpu(sd_cluster, cpu), sd); /* * This assignment should be placed after the sd_llc_id as * we want this id equals to cluster id on cluster machines * but equals to LLC id on non-Cluster machines. */ - per_cpu(sd_lowest_cache_id, cpu) = id; + per_cpu(sd_share_id, cpu) = id; sd = lowest_flag_domain(cpu, SD_NUMA); rcu_assign_pointer(per_cpu(sd_numa, cpu), sd); @@ -727,8 +725,12 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) if (sd_parent_degenerate(tmp, parent)) { tmp->parent = parent->parent; - if (parent->parent) + + if (parent->parent) { parent->parent->child = tmp; + parent->parent->groups->flags = tmp->flags; + } + /* * Transfer SD_PREFER_SIBLING down in case of a * degenerate parent; the spans match for this @@ -745,8 +747,20 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) tmp = sd; sd = sd->parent; destroy_sched_domain(tmp); - if (sd) + if (sd) { + struct sched_group *sg = sd->groups; + + /* + * sched groups hold the flags of the child sched + * domain for convenience. 
Clear such flags since + * the child is being destroyed. + */ + do { + sg->flags = 0; + } while (sg != sd->groups); + sd->child = NULL; + } } for (tmp = sd; tmp; tmp = tmp->parent) @@ -945,10 +959,12 @@ build_group_from_child_sched_domain(struct sched_domain *sd, int cpu) return NULL; sg_span = sched_group_span(sg); - if (sd->child) + if (sd->child) { cpumask_copy(sg_span, sched_domain_span(sd->child)); - else + sg->flags = sd->child->flags; + } else { cpumask_copy(sg_span, sched_domain_span(sd)); + } atomic_inc(&sg->ref); return sg; @@ -1198,6 +1214,7 @@ static struct sched_group *get_group(int cpu, struct sd_data *sdd) if (child) { cpumask_copy(sched_group_span(sg), sched_domain_span(child)); cpumask_copy(group_balance_mask(sg), sched_group_span(sg)); + sg->flags = child->flags; } else { cpumask_set_cpu(cpu, sched_group_span(sg)); cpumask_set_cpu(cpu, group_balance_mask(sg)); @@ -2366,7 +2383,6 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att sd = build_sched_domain(tl, cpu_map, attr, sd, i); has_asym |= sd->flags & SD_ASYM_CPUCAPACITY; - has_cluster |= sd->flags & SD_CLUSTER; if (tl == sched_domain_topology) *per_cpu_ptr(d.sd, i) = sd; @@ -2474,6 +2490,9 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att WRITE_ONCE(d.rd->max_cpu_capacity, rq->cpu_capacity_orig); cpu_attach_domain(sd, d.rd, i); + + if (lowest_flag_domain(i, SD_CLUSTER)) + has_cluster = true; } rcu_read_unlock(); @@ -2583,7 +2602,7 @@ static void detach_destroy_domains(const struct cpumask *cpu_map) if (rcu_access_pointer(per_cpu(sd_asym_cpucapacity, cpu))) static_branch_dec_cpuslocked(&sched_asym_cpucapacity); - if (rcu_access_pointer(per_cpu(sd_cluster, cpu))) + if (static_branch_unlikely(&sched_cluster_active)) static_branch_dec_cpuslocked(&sched_cluster_active); rcu_read_lock();