From 3b93f5e97295049044ff5ef9e889e7fce603297a Mon Sep 17 00:00:00 2001 From: Tim Chen Date: Fri, 3 Dec 2021 12:32:38 -0800 Subject: [PATCH 1/4] scheduler: Create SDTL_SKIP flag to skip topology level kunpeng inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I9F5WO CVE: NA Reference: https://lore.kernel.org/lkml/cover.1638563225.git.tim.c.chen@linux.intel.com/ ---------------------------------------------------------------------- A system admin may not want to use cluster scheduling. Make changes to allow cluster topology level to be skipped when building sched domains. Create SDTL_SKIP bit on the sched_domain_topology_level flag so we can check if the cluster topology level should be skipped when building sched domains. Signed-off-by: Tim Chen Signed-off-by: Jie Liu --- include/linux/sched/topology.h | 1 + kernel/sched/topology.c | 10 +++++++++- 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h index ead2a8ca1ea3..8e4d9bbdaa40 100644 --- a/include/linux/sched/topology.h +++ b/include/linux/sched/topology.h @@ -188,6 +188,7 @@ typedef const struct cpumask *(*sched_domain_mask_f)(int cpu); typedef int (*sched_domain_flags_f)(void); #define SDTL_OVERLAP 0x01 +#define SDTL_SKIP 0x02 struct sd_data { struct sched_domain *__percpu *sd; diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 5dca59dfea85..86c4d8b2934e 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -1731,8 +1731,16 @@ static struct sched_domain_topology_level *sched_domain_topology = default_topology; static struct sched_domain_topology_level *sched_domain_topology_saved; +static struct sched_domain_topology_level *next_tl(struct sched_domain_topology_level *tl) +{ + ++tl; + while (tl->mask && tl->flags & SDTL_SKIP) + ++tl; + return tl; +} + #define for_each_sd_topology(tl) \ - for (tl = sched_domain_topology; tl->mask; tl++) + for (tl = sched_domain_topology; 
tl->mask; tl = next_tl(tl)) void __init set_sched_topology(struct sched_domain_topology_level *tl) { -- Gitee From c89577a6f0f3fb2dfc7c2015fab51a2c35987559 Mon Sep 17 00:00:00 2001 From: Tim Chen Date: Fri, 3 Dec 2021 12:32:40 -0800 Subject: [PATCH 2/4] scheduler: Add runtime knob sysctl_sched_cluster kunpeng inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I9F5WO CVE: NA Reference: https://lore.kernel.org/lkml/cover.1638563225.git.tim.c.chen@linux.intel.com/ ---------------------------------------------------------------------- Allow run time configuration of the scheduler to use cluster scheduling. Configuration can be changed via the sysctl variable /proc/sys/kernel/sched_cluster. Setting it to 1 enables cluster scheduling and setting it to 0 turns it off. Cluster scheduling should benefit independent tasks by load balancing them between clusters. It reaps the most benefit when the system's CPUs are not fully busy, so we can spread the tasks out between the clusters to reduce contention on cluster resources (e.g. L2 cache). However, if the system is expected to operate close to full utilization, the system admin could turn this feature off so as not to incur extra load balancing overhead between the cluster domains.
Signed-off-by: Tim Chen Signed-off-by: Jie Liu --- arch/x86/kernel/smpboot.c | 8 +++++ drivers/base/arch_topology.c | 13 +++++--- include/linux/sched/sysctl.h | 6 ++++ include/linux/topology.h | 1 + kernel/sched/core.c | 1 + kernel/sched/sched.h | 6 ++++ kernel/sched/topology.c | 64 ++++++++++++++++++++++++++++++++++++ 7 files changed, 95 insertions(+), 4 deletions(-) diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index fac3b8a523ac..ed3e07b878f1 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -60,6 +60,7 @@ #include #include #include +#include #include #include @@ -144,6 +145,13 @@ int arch_update_cpu_topology(void) return retval; } +void arch_rebuild_cpu_topology(void) +{ + x86_topology_update = true; + rebuild_sched_domains(); + x86_topology_update = false; +} + static unsigned int smpboot_warm_reset_vector_count; static inline void smpboot_setup_warm_reset_vector(unsigned long start_eip) diff --git a/drivers/base/arch_topology.c b/drivers/base/arch_topology.c index 2679700d919c..45a1857fa78b 100644 --- a/drivers/base/arch_topology.c +++ b/drivers/base/arch_topology.c @@ -260,16 +260,21 @@ int topology_update_cpu_topology(void) return update_topology; } +void __weak arch_rebuild_cpu_topology(void) +{ + update_topology = 1; + rebuild_sched_domains(); + pr_debug("sched_domain hierarchy rebuilt, flags updated\n"); + update_topology = 0; +} + /* * Updating the sched_domains can't be done directly from cpufreq callbacks * due to locking, so queue the work for later. 
*/ static void update_topology_flags_workfn(struct work_struct *work) { - update_topology = 1; - rebuild_sched_domains(); - pr_debug("sched_domain hierarchy rebuilt, flags updated\n"); - update_topology = 0; + arch_rebuild_cpu_topology(); } static u32 *raw_capacity; diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h index 5a64582b086b..0aa95deb155e 100644 --- a/include/linux/sched/sysctl.h +++ b/include/linux/sched/sysctl.h @@ -29,4 +29,10 @@ extern int sysctl_numa_balancing_mode; #define sysctl_numa_balancing_mode 0 #endif +#ifdef CONFIG_SCHED_CLUSTER +extern unsigned int sysctl_sched_cluster; +int sched_cluster_handler(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos); +#endif + #endif /* _LINUX_SCHED_SYSCTL_H */ diff --git a/include/linux/topology.h b/include/linux/topology.h index 52f5850730b3..a2f15fd0e527 100644 --- a/include/linux/topology.h +++ b/include/linux/topology.h @@ -44,6 +44,7 @@ if (nr_cpus_node(node)) int arch_update_cpu_topology(void); +void arch_rebuild_cpu_topology(void); /* Conform to ACPI 2.0 SLIT distance definitions */ #define LOCAL_DISTANCE 10 diff --git a/kernel/sched/core.c b/kernel/sched/core.c index eff79d6e1e81..1da1ab76a59b 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -9924,6 +9924,7 @@ int sched_cpu_dying(unsigned int cpu) void __init sched_init_smp(void) { sched_init_numa(NUMA_NO_NODE); + set_sched_cluster(); /* * There's no userspace yet to cause hotplug operations; hence all the diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index d12206cb6cb8..322b5985575a 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1880,6 +1880,12 @@ this_rq_lock_irq(struct rq_flags *rf) return rq; } +#ifdef CONFIG_SCHED_CLUSTER +extern void set_sched_cluster(void); +#else +static inline void set_sched_cluster(void) { } +#endif + #ifdef CONFIG_NUMA #ifdef CONFIG_SCHED_STEAL extern struct static_key_true sched_steal_allow; diff --git a/kernel/sched/topology.c 
b/kernel/sched/topology.c index 86c4d8b2934e..9de7fdcea10d 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -1731,6 +1731,70 @@ static struct sched_domain_topology_level *sched_domain_topology = default_topology; static struct sched_domain_topology_level *sched_domain_topology_saved; +#ifdef CONFIG_SCHED_CLUSTER +void set_sched_cluster(void) +{ + struct sched_domain_topology_level *tl; + + for (tl = sched_domain_topology; tl->mask; tl++) { + if (tl->sd_flags && (tl->sd_flags() & SD_CLUSTER)) { + if (!sysctl_sched_cluster) + tl->flags |= SDTL_SKIP; + else + tl->flags &= ~SDTL_SKIP; + break; + } + } +} + +/* set via /proc/sys/kernel/sched_cluster */ +unsigned int __read_mostly sysctl_sched_cluster = 1; + +static DEFINE_MUTEX(sched_cluster_mutex); +int sched_cluster_handler(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + int ret; + unsigned int oldval; + + if (write && !capable(CAP_SYS_ADMIN)) + return -EPERM; + + mutex_lock(&sched_cluster_mutex); + oldval = sysctl_sched_cluster; + ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); + if (!ret && write) { + if (oldval != sysctl_sched_cluster) { + set_sched_cluster(); + arch_rebuild_cpu_topology(); + } + } + mutex_unlock(&sched_cluster_mutex); + + return ret; +} + +static struct ctl_table sched_cluster_sysctls[] = { + { + .procname = "sched_cluster", + .data = &sysctl_sched_cluster, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = sched_cluster_handler, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, + {} +}; + +static int __init sched_cluster_sysctl_init(void) +{ + register_sysctl_init("kernel", sched_cluster_sysctls); + return 0; +} +late_initcall(sched_cluster_sysctl_init); +#endif + static struct sched_domain_topology_level *next_tl(struct sched_domain_topology_level *tl) { ++tl; -- Gitee From fd903c83776c57273dcb4aa42da0e613ef8594a0 Mon Sep 17 00:00:00 2001 From: Tim Chen Date: Fri, 3 Dec 2021 12:32:41 -0800 Subject: 
[PATCH 3/4] scheduler: Add boot time enabling/disabling of cluster scheduling kunpeng inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I9F5WO CVE: NA Reference: https://lore.kernel.org/lkml/cover.1638563225.git.tim.c.chen@linux.intel.com/ ---------------------------------------------------------------------- Add boot time parameter sched_cluster to enable or disable cluster scheduling. Set the boot parameter as follows: sched_cluster=0 disables cluster scheduling sched_cluster=1 enables cluster scheduling Signed-off-by: Tim Chen Signed-off-by: Jie Liu --- Documentation/admin-guide/kernel-parameters.txt | 4 ++++ kernel/sched/topology.c | 16 ++++++++++++++++ 2 files changed, 20 insertions(+) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 27bf689d573b..8900211484ed 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -5768,6 +5768,10 @@ sched_verbose [KNL] Enables verbose scheduler debug messages. + sched_cluster= Enable or disable cluster scheduling. + 0 -- disable. + 1 -- enable. + schedstats= [KNL,X86] Enable or disable scheduled statistics. Allowed values are enable and disable.
This feature incurs a small amount of overhead in the scheduler diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 9de7fdcea10d..742a7d46b5d0 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -1793,6 +1793,22 @@ static int __init sched_cluster_sysctl_init(void) return 0; } late_initcall(sched_cluster_sysctl_init); + +static int __init sched_cluster_option(char *str) +{ + int enable; + + if (get_option(&str, &enable)) { + if (enable != 0 && enable != 1) + return -EINVAL; + + sysctl_sched_cluster = enable; + return 0; + } + + return -EINVAL; +} +early_param("sched_cluster", sched_cluster_option); #endif static struct sched_domain_topology_level *next_tl(struct sched_domain_topology_level *tl) -- Gitee From 35c0a227f4460d1cdd3ca21f4e4af69160f47e17 Mon Sep 17 00:00:00 2001 From: Yicong Yang Date: Mon, 13 Feb 2023 10:48:54 +0800 Subject: [PATCH 4/4] scheduler: Disable cluster scheduling by default kunpeng inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I9F5WO CVE: NA ---------------------------------------------------------------------- Disable cluster scheduling by default since it's not a universal win. Users can choose to enable it through sysctl or at boot time according to their scenario. Signed-off-by: Yicong Yang Signed-off-by: Jie Liu --- kernel/sched/topology.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 742a7d46b5d0..adc5799b9fc1 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -1748,7 +1748,7 @@ void set_sched_cluster(void) } /* set via /proc/sys/kernel/sched_cluster */ -unsigned int __read_mostly sysctl_sched_cluster = 1; +unsigned int __read_mostly sysctl_sched_cluster; static DEFINE_MUTEX(sched_cluster_mutex); int sched_cluster_handler(struct ctl_table *table, int write, -- Gitee