diff --git a/arch/arm64/configs/openeuler_defconfig b/arch/arm64/configs/openeuler_defconfig index 703fe5bc2535dca3d1b4bbb75a4d1e368ff2f34b..b5d7df280924d5c09fbc65ab8008db02b5c3d58f 100644 --- a/arch/arm64/configs/openeuler_defconfig +++ b/arch/arm64/configs/openeuler_defconfig @@ -178,6 +178,7 @@ CONFIG_CFS_BANDWIDTH=y CONFIG_RT_GROUP_SCHED=y CONFIG_SCHED_MM_CID=y CONFIG_QOS_SCHED_DYNAMIC_AFFINITY=y +CONFIG_QOS_SCHED_SMART_GRID=y CONFIG_CGROUP_PIDS=y CONFIG_CGROUP_RDMA=y CONFIG_CGROUP_FREEZER=y diff --git a/arch/x86/configs/openeuler_defconfig b/arch/x86/configs/openeuler_defconfig index c9e816e54003230a5504f29d5427055a9a9fee06..68f1dc7ab2af5e2b4c98c6bfe2afdd900d1f50b5 100644 --- a/arch/x86/configs/openeuler_defconfig +++ b/arch/x86/configs/openeuler_defconfig @@ -199,6 +199,7 @@ CONFIG_CFS_BANDWIDTH=y CONFIG_RT_GROUP_SCHED=y CONFIG_SCHED_MM_CID=y CONFIG_QOS_SCHED_DYNAMIC_AFFINITY=y +# CONFIG_QOS_SCHED_SMART_GRID is not set CONFIG_CGROUP_PIDS=y CONFIG_CGROUP_RDMA=y CONFIG_CGROUP_FREEZER=y diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index 60ed89000e82dc29c08abc63a0bba90be6e6ad79..2322f6647372e569c9d345be2947639aeeb5c1a7 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -2823,6 +2823,247 @@ int cpufreq_boost_enabled(void) } EXPORT_SYMBOL_GPL(cpufreq_boost_enabled); +#ifdef CONFIG_QOS_SCHED_SMART_GRID + +struct smart_grid_zone { + char governor_name[SMART_GRID_ZONE_NR][CPUFREQ_NAME_LEN]; + unsigned int enable; + struct irq_work irq_work; + struct work_struct work; + unsigned int is_init; +}; + +static struct smart_grid_zone sg_zone; +static DEFINE_MUTEX(sg_zone_lock); + +#define SG_WRITE_BUFF_LEN 30 + +void cpufreq_smart_grid_start_sync(void) +{ + /* No need sync when smart grid disabled */ + if (!smart_grid_enabled()) + return; + + if (likely(sg_zone.is_init)) + irq_work_queue(&sg_zone.irq_work); +} + +static ssize_t show_smart_grid_governor(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + int len = 
0; + int gov_index; + + mutex_lock(&sg_zone_lock); + if (!sg_zone.enable) { + mutex_unlock(&sg_zone_lock); + return sprintf(buf, "smart_grid governor disable\n"); + } + + for (gov_index = 0; gov_index < SMART_GRID_ZONE_NR; gov_index++) + len += sprintf(buf + len, "smart_grid-%d: %s\n", gov_index, + sg_zone.governor_name[gov_index]); + + mutex_unlock(&sg_zone_lock); + return len; +} + +static ssize_t store_smart_grid_governor(struct kobject *kobj, struct kobj_attribute *attr, + const char *buf, size_t count) +{ + struct cpufreq_governor *target_gov = NULL; + unsigned int current_level; + char *level_string = NULL; + char buf_string[SG_WRITE_BUFF_LEN]; + char *gov_string = buf_string; + char save_string[CPUFREQ_NAME_LEN]; + int ret; + + mutex_lock(&sg_zone_lock); + if (!sg_zone.enable) { + ret = -EINVAL; + goto fail; + } + + if (strscpy(buf_string, buf, SG_WRITE_BUFF_LEN) <= 0) { + ret = -EINVAL; + goto fail; + } + + level_string = strsep(&gov_string, "-"); /* input format: "<zone>-<governor>" */ + if (level_string == NULL || gov_string == NULL) { /* no '-': strsep() left gov_string NULL */ + ret = -EINVAL; + goto fail; + } + + if (kstrtouint(level_string, 10, &current_level)) { + ret = -EINVAL; + goto fail; + } + + if (current_level >= SMART_GRID_ZONE_NR) { + ret = -EINVAL; + goto fail; + } + + if (sscanf(gov_string, "%15s", save_string) != 1) { + ret = -EINVAL; + goto fail; + } + + target_gov = cpufreq_parse_governor(save_string); + if (target_gov == NULL) { + ret = -EINVAL; + goto fail; + } + module_put(target_gov->owner); + + strscpy(sg_zone.governor_name[current_level], save_string, CPUFREQ_NAME_LEN); + cpufreq_smart_grid_start_sync(); + mutex_unlock(&sg_zone_lock); + return count; + +fail: + mutex_unlock(&sg_zone_lock); + return ret; +} +define_one_global_rw(smart_grid_governor); + +static ssize_t show_smart_grid_governor_enable(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sprintf(buf, "%u\n", sg_zone.enable); +} + +static void smart_grid_irq_work(struct irq_work *irq_work) +{ + struct smart_grid_zone *zone; + + zone = 
container_of(irq_work, struct smart_grid_zone, irq_work); + schedule_work_on(smp_processor_id(), &zone->work); +} + +static void smart_grid_work_handler(struct work_struct *work) +{ + struct smart_grid_zone *zone; + struct cpufreq_governor *target_gov = NULL; + struct cpufreq_policy *policy = NULL; + unsigned int cpu; + int gov_index; + + zone = container_of(work, struct smart_grid_zone, work); + + mutex_lock(&sg_zone_lock); + if (!sg_zone.enable) { + mutex_unlock(&sg_zone_lock); + return; + } + + /* + * A policy may be shared between the hot and warm zones, so walk the + * zones in reverse: the hot zone (index 0) is applied last and wins. + */ + for (gov_index = SMART_GRID_ZONE_NR - 1; gov_index >= 0; gov_index--) { + target_gov = cpufreq_parse_governor(sg_zone.governor_name[gov_index]); + if (target_gov == NULL) + continue; + + for_each_cpu(cpu, sched_grid_zone_cpumask(gov_index)) { + if (cpu_is_offline(cpu)) + continue; + + policy = cpufreq_cpu_acquire(cpu); + if (policy == NULL) + continue; + + if (policy->governor == target_gov) { + cpufreq_cpu_release(policy); + continue; + } + /* Try to switch governor; the result is not checked here */ + store_scaling_governor(policy, sg_zone.governor_name[gov_index], + CPUFREQ_NAME_LEN); + cpufreq_cpu_release(policy); + } + module_put(target_gov->owner); + } + mutex_unlock(&sg_zone_lock); +} + +static void sg_zone_set_enable(void) +{ + int gov_index; + + /* Default governors: "performance" for zone 0, "powersave" for the others */ + for (gov_index = 0; gov_index < SMART_GRID_ZONE_NR; gov_index++) { + if (!gov_index) + strscpy(sg_zone.governor_name[gov_index], "performance", CPUFREQ_NAME_LEN); + else + strscpy(sg_zone.governor_name[gov_index], "powersave", CPUFREQ_NAME_LEN); + } + + sg_zone.enable = 1; + cpufreq_smart_grid_start_sync(); +} + +static void sg_zone_set_disable(void) +{ + sg_zone.enable = 0; +} + +static ssize_t store_smart_grid_governor_enable(struct kobject *kobj, struct kobj_attribute *attr, + const char *buf, size_t count) +{ + unsigned int enable; + + if (kstrtouint(buf, 10, &enable)) + 
return -EINVAL; + + if (enable > 1) + return -EINVAL; + + mutex_lock(&sg_zone_lock); + if (sg_zone.enable == enable) { + mutex_unlock(&sg_zone_lock); + return -EINVAL; + } + + if (enable) + sg_zone_set_enable(); + else + sg_zone_set_disable(); + + mutex_unlock(&sg_zone_lock); + return count; +} +define_one_global_rw(smart_grid_governor_enable); + +static int create_smart_grid_sysfs_file(void) +{ + int ret; + + /* No need init when smart grid disabled */ + if (!smart_grid_enabled()) + return 0; + + ret = sysfs_create_file(cpufreq_global_kobject, &smart_grid_governor.attr); + if (ret) + pr_err("%s: cannot register global smart_grid_governor sysfs file\n", + __func__); + + ret = sysfs_create_file(cpufreq_global_kobject, &smart_grid_governor_enable.attr); + if (ret) + pr_err("%s: cannot register global smart_grid_governor_enable sysfs file\n", + __func__); + + init_irq_work(&sg_zone.irq_work, smart_grid_irq_work); + INIT_WORK(&sg_zone.work, smart_grid_work_handler); + sg_zone.enable = 0; + sg_zone.is_init = 1; + return ret; +} +#endif + /********************************************************************* * REGISTER / UNREGISTER CPUFREQ DRIVER * *********************************************************************/ @@ -2998,6 +3239,9 @@ static int __init cpufreq_core_init(void) if (!strlen(default_governor)) strncpy(default_governor, gov->name, CPUFREQ_NAME_LEN); +#ifdef CONFIG_QOS_SCHED_SMART_GRID + create_smart_grid_sysfs_file(); +#endif return 0; } module_param(off, int, 0444); diff --git a/fs/proc/array.c b/fs/proc/array.c index 2c2efbe685d872201175a02377b818b1dbf31892..0aeaeb9d2b48c70babebdc4f1e59969dcabf0a5c 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -436,6 +436,19 @@ __weak void arch_proc_pid_thread_features(struct seq_file *m, { } +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY +static void task_cpus_preferred(struct seq_file *m, struct task_struct *task) +{ + if (!dynamic_affinity_enabled()) + return; + + seq_printf(m, "Cpus_preferred:\t%*pb\n", + 
cpumask_pr_args(task->prefer_cpus)); + seq_printf(m, "Cpus_preferred_list:\t%*pbl\n", + cpumask_pr_args(task->prefer_cpus)); +} +#endif + int proc_pid_status(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *task) { @@ -461,6 +474,9 @@ int proc_pid_status(struct seq_file *m, struct pid_namespace *ns, cpuset_task_status_allowed(m, task); task_context_switch_counts(m, task); arch_proc_pid_thread_features(m, task); +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + task_cpus_preferred(m, task); +#endif return 0; } diff --git a/fs/proc/base.c b/fs/proc/base.c index e04b0126334f991775223b2b1149f9447ac712f2..0d9fbcd315ec9694fd62b0613c94b34549db4077 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -92,6 +92,10 @@ #include #include #include +#ifdef CONFIG_QOS_SCHED_SMART_GRID +#include +#include +#endif #include #include #include @@ -3313,6 +3317,83 @@ static int proc_stack_depth(struct seq_file *m, struct pid_namespace *ns, } #endif /* CONFIG_STACKLEAK_METRICS */ +#ifdef CONFIG_QOS_SCHED_SMART_GRID +static int smart_grid_level_show(struct seq_file *m, void *v) +{ + struct inode *inode = m->private; + struct task_struct *p; + + if (!smart_grid_enabled()) + return -EPERM; + + p = get_proc_task(inode); + if (!p) + return -ESRCH; + + if (p->grid_qos != NULL) + seq_printf(m, "%d\n", p->grid_qos->stat.class_lvl); + + put_task_struct(p); + + return 0; +} + +static int smart_grid_level_open(struct inode *inode, struct file *filp) +{ + return single_open(filp, smart_grid_level_show, inode); +} + +static ssize_t smart_grid_level_write(struct file *file, const char __user *buf, + size_t count, loff_t *offset) +{ + struct inode *inode = file_inode(file); + struct task_struct *p; + char buffer[TASK_COMM_LEN]; + const size_t maxlen = sizeof(buffer) - 1; + unsigned int level = SCHED_GRID_QOS_TASK_LEVEL_MAX; + int ret = 0; + + if (!smart_grid_enabled()) + return -EPERM; + + memset(buffer, 0, sizeof(buffer)); + if (copy_from_user(buffer, buf, count > maxlen ? 
maxlen : count)) + return -EFAULT; + + p = get_proc_task(inode); + if (!p) + return -ESRCH; + + if (kstrtouint(buffer, 10, &level)) { + put_task_struct(p); + return -EINVAL; + } + + if (level >= SCHED_GRID_QOS_TASK_LEVEL_MAX) { + put_task_struct(p); + return -EINVAL; + } + + if (p->grid_qos != NULL && + p->grid_qos->stat.set_class_lvl != NULL) + ret = p->grid_qos->stat.set_class_lvl(&p->grid_qos->stat, level); + + put_task_struct(p); + + if (ret) + return ret; + return count; +} + +static const struct file_operations proc_pid_sg_level_operations = { + .open = smart_grid_level_open, + .read = seq_read, + .write = smart_grid_level_write, + .llseek = seq_lseek, + .release = single_release, +}; +#endif + /* * Thread groups */ @@ -3336,6 +3417,9 @@ static const struct pid_entry tgid_base_stuff[] = { #ifdef CONFIG_SCHED_DEBUG REG("sched", S_IRUGO|S_IWUSR, proc_pid_sched_operations), #endif +#ifdef CONFIG_QOS_SCHED_SMART_GRID + REG("smart_grid_level", 0644, proc_pid_sg_level_operations), +#endif #ifdef CONFIG_SCHED_AUTOGROUP REG("autogroup", S_IRUGO|S_IWUSR, proc_pid_sched_autogroup_operations), #endif diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index 71d186d6933a5e5f700335e4bd29eb47d59ee1f8..066152b4ed3bcedce92b996349502f4c6b72bb2a 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -20,6 +20,9 @@ #include #include #include +#ifdef CONFIG_QOS_SCHED_SMART_GRID +#include +#endif /********************************************************************* * CPUFREQ INTERFACE * @@ -620,6 +623,14 @@ int cpufreq_register_governor(struct cpufreq_governor *governor); void cpufreq_unregister_governor(struct cpufreq_governor *governor); int cpufreq_start_governor(struct cpufreq_policy *policy); void cpufreq_stop_governor(struct cpufreq_policy *policy); +#ifdef CONFIG_QOS_SCHED_SMART_GRID +/* Implement in cpufreq.c */ +#ifdef CONFIG_CPU_FREQ +void cpufreq_smart_grid_start_sync(void); +#else +static inline void cpufreq_smart_grid_start_sync(void) { 
return; } +#endif +#endif #define cpufreq_governor_init(__governor) \ static int __init __governor##_init(void) \ diff --git a/include/linux/sched.h b/include/linux/sched.h index bd9031f5772c49d1f94ab471e5a6709acbdbe800..dafcae3e0ec9abf651bb1b56bdf3df0b6a81486c 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1555,6 +1555,11 @@ struct task_struct { #ifdef CONFIG_PSI_FINE_GRAINED int memstall_type; #endif + +#if defined(CONFIG_QOS_SCHED_SMART_GRID) && !defined(__GENKSYMS__) + struct sched_grid_qos *grid_qos; +#endif + /* * New fields for task_struct should be added above here, so that * they are included in the randomized portion of task_struct. @@ -2511,4 +2516,31 @@ static inline bool dynamic_affinity_enabled(void) return static_branch_unlikely(&__dynamic_affinity_switch); } #endif + +#ifdef CONFIG_QOS_SCHED_SMART_GRID +extern struct static_key __smart_grid_used; +extern struct static_key_false __smart_grid_switch; + +static inline bool smart_grid_enabled(void) +{ + /* smart grid need dynamic affinity enabled first */ + if (!static_branch_unlikely(&__dynamic_affinity_switch)) + return false; + + return static_branch_unlikely(&__smart_grid_switch); +} + +static inline bool smart_grid_used(void) +{ + if (!smart_grid_enabled()) + return false; + + return static_key_false(&__smart_grid_used); +} +#else +static inline bool smart_grid_used(void) +{ + return false; +} +#endif #endif diff --git a/include/linux/sched/grid_qos.h b/include/linux/sched/grid_qos.h new file mode 100644 index 0000000000000000000000000000000000000000..fe334355bb4923ab941f02ef8ba6bc4eb74d2798 --- /dev/null +++ b/include/linux/sched/grid_qos.h @@ -0,0 +1,135 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_SCHED_GRID_QOS_H +#define _LINUX_SCHED_GRID_QOS_H +#include +#include + +#ifdef CONFIG_QOS_SCHED_SMART_GRID +enum sched_grid_qos_class { + SCHED_GRID_QOS_CLASS_LEVEL_1 = 0, + SCHED_GRID_QOS_CLASS_LEVEL_2 = 1, + SCHED_GRID_QOS_CLASS_LEVEL_3 = 2, + 
SCHED_GRID_QOS_CLASS_LEVEL_4 = 3, + SCHED_GRID_QOS_CLASS_LEVEL_5 = 4, + SCHED_GRID_QOS_CLASS_LEVEL_6 = 5, + SCHED_GRID_QOS_CLASS_LEVEL_7 = 6, + SCHED_GRID_QOS_CLASS_LEVEL_8 = 7, + SCHED_GRID_QOS_CLASS_LEVEL_NR +}; + +/* + * SCHED_GRID_QOS_TASK_LEVEL was defined different QoS level. + * The lower number has the higher priority. (E.g. 0 was the highest) + * The enum sched_grid_qos_class defined the max level, the lowest level. + */ +#define SCHED_GRID_QOS_TASK_LEVEL_HIGHEST SCHED_GRID_QOS_CLASS_LEVEL_1 +#define SCHED_GRID_QOS_TASK_LEVEL_MAX (SCHED_GRID_QOS_CLASS_LEVEL_NR) +#define SCHED_GRID_QOS_TASK_LEVEL_DEFAULT (SCHED_GRID_QOS_CLASS_LEVEL_NR - 1) + +enum { + SCHED_GRID_QOS_IPS_INDEX = 0, + SCHED_GRID_QOS_MEMBOUND_RATIO_INDEX = 1, + SCHED_GRID_QOS_MEMBANDWIDTH_INDEX = 2, + SCHED_GRID_QOS_SAMPLE_NR +}; + +#define SCHED_GRID_QOS_RING_BUFFER_MAXLEN 100 + +struct sched_grid_qos_ring_buffer { + u64 vecs[SCHED_GRID_QOS_RING_BUFFER_MAXLEN]; + unsigned int head; + void (*push)(u64 *data, int stepsize, + struct sched_grid_qos_ring_buffer *ring_buffer); +}; + +struct sched_grid_qos_sample { + const char *name; + int index; + int sample_bypass; + int sample_times; + struct sched_grid_qos_ring_buffer ring_buffer; + u64 pred_target[MAX_NUMNODES]; + void (*cal_target)(int stepsize, + struct sched_grid_qos_ring_buffer *ring_buffer); + + int account_ready; + int (*start)(void *arg); + int (*account)(void *arg); +}; + +struct sched_grid_qos_stat { + enum sched_grid_qos_class class_lvl; + int (*set_class_lvl)(struct sched_grid_qos_stat *qos_stat, int level); + struct sched_grid_qos_sample sample[SCHED_GRID_QOS_SAMPLE_NR]; +}; + +struct sched_grid_qos_power { + int cpufreq_sense_ratio; + int target_cpufreq; + int cstate_sense_ratio; +}; + +struct sched_grid_qos_affinity { + nodemask_t mem_preferred_node_mask; + const struct cpumask *prefer_cpus; +}; + +struct task_struct; +struct sched_grid_qos { + struct sched_grid_qos_stat stat; + struct sched_grid_qos_power power; + struct 
sched_grid_qos_affinity affinity; + + int (*affinity_set)(struct task_struct *p); +}; + +static inline int sched_qos_affinity_set(struct task_struct *p) +{ + return p->grid_qos->affinity_set(p); +} + +int sched_grid_qos_fork(struct task_struct *p, struct task_struct *orig); +void sched_grid_qos_free(struct task_struct *p); + +int sched_grid_preferred_interleave_nid(struct mempolicy *policy); +int sched_grid_preferred_nid(int preferred_nid, nodemask_t *nodemask); + +enum sg_zone_type { + SMART_GRID_ZONE_HOT = 0, + SMART_GRID_ZONE_WARM, + SMART_GRID_ZONE_NR +}; + +struct auto_affinity; +struct sched_grid_zone { + raw_spinlock_t lock; + struct cpumask cpus[SMART_GRID_ZONE_NR]; + struct list_head af_list_head; /* struct auto_affinity list head */ +}; + +int __init sched_grid_zone_init(void); +int sched_grid_zone_update(bool is_locked); +int sched_grid_zone_add_af(struct auto_affinity *af); +int sched_grid_zone_del_af(struct auto_affinity *af); +struct cpumask *sched_grid_zone_cpumask(enum sg_zone_type zone); +struct cpumask *sched_grid_prefer_cpus(struct task_struct *p); +#else +static inline int __init sched_grid_zone_init(void) { return 0; } + +static inline int +sched_grid_preferred_interleave_nid(struct mempolicy *policy) +{ + return NUMA_NO_NODE; +} +static inline int +sched_grid_preferred_nid(int preferred_nid, nodemask_t *nodemask) +{ + return preferred_nid; +} + +static inline int sched_qos_affinity_set(struct task_struct *p) +{ + return 0; +} +#endif +#endif diff --git a/init/Kconfig b/init/Kconfig index f6a08293f75b7f8c891c7061e72ee5303ce1b376..75c4ac6040c301b2f880ff8217a90d5a018cadf2 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1152,6 +1152,19 @@ config UCLAMP_TASK_GROUP If in doubt, say N. +config QOS_SCHED_SMART_GRID + bool "qos smart grid scheduler" + depends on FAIR_GROUP_SCHED && QOS_SCHED_DYNAMIC_AFFINITY + default n + help + This feature is used for power consumption tuning in server scenario. + This can be divided into the following aspects: + 1. 
User interface, manage user needs. + 2. Collect tasks' features to ensure key tasks' QOS. + 3. Weaken the impact of CPU frequency and cpuidle + adjustment on tasks. + 4. Integrate with the EAS (Energy Aware Scheduling) model. + config CGROUP_PIDS bool "PIDs controller" help diff --git a/kernel/fork.c b/kernel/fork.c index a1cd8930c3e10b73a6eeb742b0f9399bd581f03e..fd0405523f07f5ee8d4b5eba6716fad6dcce44f4 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -99,6 +99,9 @@ #include #include #include +#ifdef CONFIG_QOS_SCHED_SMART_GRID +#include +#endif #include #include @@ -628,6 +631,10 @@ void free_task(struct task_struct *tsk) #ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY if (dynamic_affinity_enabled()) sched_prefer_cpus_free(tsk); +#endif +#ifdef CONFIG_QOS_SCHED_SMART_GRID + if (smart_grid_enabled()) + sched_grid_qos_free(tsk); #endif free_task_struct(tsk); } @@ -2389,6 +2396,14 @@ __latent_entropy struct task_struct *copy_process( } current->flags &= ~PF_NPROC_EXCEEDED; +#ifdef CONFIG_QOS_SCHED_SMART_GRID + if (smart_grid_enabled()) { + retval = sched_grid_qos_fork(p, current); + if (retval) + goto bad_fork_cleanup_count; + } +#endif + /* * If multiple threads are within copy_process(), then this check * triggers too late. 
This doesn't hurt, the check is only there diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index 976092b7bd4520ebc1a607734520ac342585a120..cd0be22a94fd0df9d27c569f4f418f1c386387e6 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile @@ -32,3 +32,4 @@ obj-y += core.o obj-y += fair.o obj-y += build_policy.o obj-y += build_utility.o +obj-$(CONFIG_QOS_SCHED_SMART_GRID) += grid/ diff --git a/kernel/sched/core.c b/kernel/sched/core.c index bafb7b440263e38784aea95860fe18ef15baa9ce..44bdd78dc1c7a65c29d95cad1e599ea889a99519 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -96,6 +96,8 @@ #include "../../io_uring/io-wq.h" #include "../smpboot.h" +#include + EXPORT_TRACEPOINT_SYMBOL_GPL(ipi_send_cpu); EXPORT_TRACEPOINT_SYMBOL_GPL(ipi_send_cpumask); @@ -9709,6 +9711,7 @@ int sched_cpu_activate(unsigned int cpu) static_branch_inc_cpuslocked(&sched_smt_present); #endif set_cpu_active(cpu, true); + tg_update_affinity_domains(cpu, 1); if (sched_smp_initialized) { sched_update_numa(cpu, true); @@ -9798,6 +9801,7 @@ int sched_cpu_deactivate(unsigned int cpu) return ret; } sched_domains_numa_masks_clear(cpu); + tg_update_affinity_domains(cpu, 0); return 0; } @@ -9918,6 +9922,9 @@ void __init sched_init_smp(void) init_sched_dl_class(); sched_smp_initialized = true; + + sched_grid_zone_init(); + init_auto_affinity(&root_task_group); } static int __init migration_init(void) @@ -11329,6 +11336,159 @@ static int cpu_idle_write_s64(struct cgroup_subsys_state *css, } #endif +#ifdef CONFIG_QOS_SCHED_SMART_GRID +int tg_set_dynamic_affinity_mode(struct task_group *tg, u64 mode) +{ + struct auto_affinity *auto_affi = tg->auto_affinity; + + if (unlikely(!auto_affi)) + return -EPERM; + + /* auto mode */ + if (mode == 1) + start_auto_affinity(auto_affi); + else if (mode == 0) + stop_auto_affinity(auto_affi); + else + return -EINVAL; + + return 0; +} + +static u64 cpu_affinity_mode_read_u64(struct cgroup_subsys_state *css, + struct cftype *cft) +{ + struct 
task_group *tg = css_tg(css); + + if (!smart_grid_enabled()) + return -EPERM; + + if (unlikely(!tg->auto_affinity)) + return -EPERM; + + return tg->auto_affinity->mode; +} + +static int cpu_affinity_mode_write_u64(struct cgroup_subsys_state *css, + struct cftype *cftype, u64 mode) +{ + if (!smart_grid_enabled()) + return -EPERM; + + return tg_set_dynamic_affinity_mode(css_tg(css), mode); +} + +int tg_set_affinity_period(struct task_group *tg, u64 period_ms) +{ + if (unlikely(!tg->auto_affinity)) + return -EPERM; + + if (!period_ms || period_ms > U64_MAX / NSEC_PER_MSEC) /* reject 0; presumably also guards the ms-to-ns conversion against overflow */ + return -EINVAL; + + raw_spin_lock_irq(&tg->auto_affinity->lock); + tg->auto_affinity->period = ms_to_ktime(period_ms); + raw_spin_unlock_irq(&tg->auto_affinity->lock); + return 0; +} + +u64 tg_get_affinity_period(struct task_group *tg) +{ + if (unlikely(!tg->auto_affinity)) + return -EPERM; /* NOTE(review): negative errno is converted to a large u64 by the return type */ + + return ktime_to_ms(tg->auto_affinity->period); +} + +static int cpu_affinity_period_write_uint(struct cgroup_subsys_state *css, + struct cftype *cftype, u64 period) +{ + if (!smart_grid_enabled()) + return -EPERM; + + return tg_set_affinity_period(css_tg(css), period); +} + +static u64 cpu_affinity_period_read_uint(struct cgroup_subsys_state *css, + struct cftype *cft) +{ + if (!smart_grid_enabled()) + return -EPERM; /* NOTE(review): same u64 conversion of -EPERM as above */ + + return tg_get_affinity_period(css_tg(css)); +} + +static int cpu_affinity_domain_mask_write_u64(struct cgroup_subsys_state *css, + struct cftype *cftype, + u64 mask) +{ + struct task_group *tg = css_tg(css); + struct affinity_domain *ad; + u16 full; + + if (!smart_grid_enabled()) + return -EPERM; + + if (unlikely(!tg->auto_affinity)) + return -EPERM; + + ad = &tg->auto_affinity->ad; + full = (1 << ad->dcount) - 1; /* one bit per domain level, all set */ + if (mask > full) + return -EINVAL; + + raw_spin_lock_irq(&tg->auto_affinity->lock); + ad->domain_mask = mask; + raw_spin_unlock_irq(&tg->auto_affinity->lock); + return 0; +} + +static u64 cpu_affinity_domain_mask_read_u64(struct cgroup_subsys_state *css, + struct cftype *cft) +{ 
+ struct task_group *tg = css_tg(css); + + if (!smart_grid_enabled()) + return -EPERM; + + if (unlikely(!tg->auto_affinity)) + return -EPERM; + + return tg->auto_affinity->ad.domain_mask; +} + +static int cpu_affinity_stat_show(struct seq_file *sf, void *v) +{ + struct task_group *tg = css_tg(seq_css(sf)); + struct auto_affinity *auto_affi = tg->auto_affinity; + struct affinity_domain *ad; + int i; + + /* No stat when smart grid disabled */ + if (!smart_grid_enabled()) + return -EPERM; + + if (unlikely(!auto_affi)) + return -EPERM; + + ad = &auto_affi->ad; + seq_printf(sf, "period_active %d\n", auto_affi->period_active); + seq_printf(sf, "dcount %d\n", ad->dcount); + seq_printf(sf, "domain_mask 0x%x\n", ad->domain_mask); + seq_printf(sf, "curr_level %d\n", ad->curr_level); + seq_printf(sf, "zone hot %*pbl\n", + cpumask_pr_args(sched_grid_zone_cpumask(SMART_GRID_ZONE_HOT))); + seq_printf(sf, "zone warm %*pbl\n", + cpumask_pr_args(sched_grid_zone_cpumask(SMART_GRID_ZONE_WARM))); + for (i = 0; i < ad->dcount; i++) + seq_printf(sf, "sd_level %d, cpu list %*pbl, stay_cnt %llu\n", + i, cpumask_pr_args(ad->domains[i]), + schedstat_val(ad->stay_cnt[i])); + + return 0; +} +#endif /* CONFIG_QOS_SCHED_SMART_GRID */ + #ifdef CONFIG_QOS_SCHED static int tg_change_scheduler(struct task_group *tg, void *data) { @@ -11413,6 +11573,27 @@ static struct cftype cpu_legacy_files[] = { .write_s64 = cpu_idle_write_s64, }, #endif +#ifdef CONFIG_QOS_SCHED_SMART_GRID + { + .name = "dynamic_affinity_mode", + .read_u64 = cpu_affinity_mode_read_u64, + .write_u64 = cpu_affinity_mode_write_u64, + }, + { + .name = "affinity_period_ms", + .read_u64 = cpu_affinity_period_read_uint, + .write_u64 = cpu_affinity_period_write_uint, + }, + { + .name = "affinity_domain_mask", + .read_u64 = cpu_affinity_domain_mask_read_u64, + .write_u64 = cpu_affinity_domain_mask_write_u64, + }, + { + .name = "affinity_stat", + .seq_show = cpu_affinity_stat_show, + }, +#endif #ifdef CONFIG_CFS_BANDWIDTH { .name = 
"cfs_quota_us", diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 9e32aae696ba7ce40e34a1fb3ab893469bdf760e..80c7232b2aaf226d1031e7929de83d0c72130dce 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -69,6 +69,7 @@ #ifdef CONFIG_SCHED_STEAL #include "sparsemask.h" #endif +#include /* * The initial- and re-scaling of tunables is configurable @@ -203,6 +204,11 @@ static unsigned int sysctl_numa_balancing_promote_rate_limit = 65536; int sysctl_sched_util_low_pct = 85; #endif +#ifdef CONFIG_QOS_SCHED_SMART_GRID +extern unsigned int sysctl_smart_grid_strategy_ctrl; +static int sysctl_affinity_adjust_delay_ms = 5000; +#endif + #ifdef CONFIG_SYSCTL static struct ctl_table sched_fair_sysctls[] = { { @@ -282,6 +288,26 @@ static struct ctl_table sched_fair_sysctls[] = { .mode = 0644, .proc_handler = proc_dointvec, }, +#endif +#ifdef CONFIG_QOS_SCHED_SMART_GRID + { + .procname = "smart_grid_strategy_ctrl", + .data = &sysctl_smart_grid_strategy_ctrl, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, + { + .procname = "affinity_adjust_delay_ms", + .data = &sysctl_affinity_adjust_delay_ms, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = &hundred_thousand, + }, #endif {} }; @@ -6772,6 +6798,480 @@ bool cfs_task_bw_constrained(struct task_struct *p) static inline void sched_fair_update_stop_tick(struct rq *rq, struct task_struct *p) {} #endif +#ifdef CONFIG_QOS_SCHED_SMART_GRID + +DEFINE_STATIC_KEY_FALSE(__smart_grid_switch); + +static int __init smart_grid_switch_setup(char *__unused) +{ + static_branch_enable(&__smart_grid_switch); + return 1; +} +__setup("smart_grid", smart_grid_switch_setup); + +#define AUTO_AFFINITY_DEFAULT_PERIOD_MS 2000 +#define IS_DOMAIN_SET(level, mask) ((1 << (level)) & (mask)) + +static DEFINE_MUTEX(smart_grid_used_mutex); + +static unsigned long capacity_of(int 
cpu); +static int sched_idle_cpu(int cpu); +static unsigned long cpu_runnable(struct rq *rq); +static inline bool prefer_cpus_valid(struct task_struct *p); + +struct static_key __smart_grid_used; + +static void smart_grid_usage_inc(void) +{ + static_key_slow_inc(&__smart_grid_used); +} + +static void smart_grid_usage_dec(void) +{ + static_key_slow_dec(&__smart_grid_used); +} + +static inline struct cpumask *task_prefer_cpus(struct task_struct *p) +{ + if (!smart_grid_used()) + return p->prefer_cpus; + + if (task_group(p)->auto_affinity->mode == 0) + return (void *)p->cpus_ptr; + + return sched_grid_prefer_cpus(p); +} + +static inline int dynamic_affinity_mode(struct task_struct *p) +{ + if (!prefer_cpus_valid(p)) + return -1; + + if (smart_grid_used()) + return task_group(p)->auto_affinity->mode == 0 ? -1 : 1; + + return 0; +} + +static void affinity_domain_up(struct task_group *tg) +{ + struct affinity_domain *ad = &tg->auto_affinity->ad; + u16 level = ad->curr_level; + + if (ad->curr_level >= ad->dcount - 1) + return; + + while (level < ad->dcount - 1) { /* keeps level + 1 a valid index into ad->domains[] */ + if (IS_DOMAIN_SET(level + 1, ad->domain_mask) && + cpumask_weight(ad->domains[level + 1]) > 0) { + ad->curr_level = level + 1; + sched_grid_zone_update(false); + return; + } + level++; + } +} + +static void affinity_domain_down(struct task_group *tg) +{ + struct affinity_domain *ad = &tg->auto_affinity->ad; + u16 level = ad->curr_level; + + while (level > 0) { + if (!cpumask_weight(ad->domains[level - 1])) + return; + + if (IS_DOMAIN_SET(level - 1, ad->domain_mask)) { + ad->curr_level = level - 1; + sched_grid_zone_update(false); + return; + } + level--; + } +} + +static enum hrtimer_restart sched_auto_affi_period_timer(struct hrtimer *timer) +{ + struct auto_affinity *auto_affi = + container_of(timer, struct auto_affinity, period_timer); + struct task_group *tg = auto_affi->tg; + struct affinity_domain *ad = &auto_affi->ad; + struct cpumask *span = ad->domains[ad->curr_level]; + unsigned long util_avg_sum = 0; + 
unsigned long tg_capacity = 0; + unsigned long flags; + int cpu; + + for_each_cpu(cpu, span) { + util_avg_sum += cpu_util_cfs(cpu); + tg_capacity += capacity_of(cpu); + } + + raw_spin_lock_irqsave(&auto_affi->lock, flags); + /* May be re-entrant by stop_auto_affinity, so check again. */ + if (auto_affi->period_active == 0) { + raw_spin_unlock_irqrestore(&auto_affi->lock, flags); + return HRTIMER_NORESTART; + } + + if (util_avg_sum * 100 >= tg_capacity * sysctl_sched_util_low_pct) { + affinity_domain_up(tg); + } else if (util_avg_sum * 100 < tg_capacity * + sysctl_sched_util_low_pct / 2) { + affinity_domain_down(tg); + } + + schedstat_inc(ad->stay_cnt[ad->curr_level]); + hrtimer_forward_now(timer, auto_affi->period); + raw_spin_unlock_irqrestore(&auto_affi->lock, flags); + return HRTIMER_RESTART; +} + +static int tg_update_affinity_domain_down(struct task_group *tg, void *data) +{ + struct auto_affinity *auto_affi = tg->auto_affinity; + struct affinity_domain *ad; + int *cpu_state = data; + unsigned long flags; + int i; + + if (!auto_affi) + return 0; + + ad = &tg->auto_affinity->ad; + raw_spin_lock_irqsave(&auto_affi->lock, flags); + + for (i = 0; i < ad->dcount; i++) { + if (!cpumask_test_cpu(cpu_state[0], ad->domains_orig[i])) + continue; + + /* online */ + if (cpu_state[1]) { + cpumask_set_cpu(cpu_state[0], ad->domains[i]); + } else { + cpumask_clear_cpu(cpu_state[0], ad->domains[i]); + if (!cpumask_weight(ad->domains[i])) + affinity_domain_up(tg); + } + + } + sched_grid_zone_update(false); + raw_spin_unlock_irqrestore(&auto_affi->lock, flags); + + return 0; +} + +void tg_update_affinity_domains(int cpu, int online) +{ + int cpu_state[2]; + + /* No need to update when smart grid is disabled */ + if (!smart_grid_enabled()) + return; + + cpu_state[0] = cpu; + cpu_state[1] = online; + + rcu_read_lock(); + walk_tg_tree(tg_update_affinity_domain_down, tg_nop, cpu_state); + rcu_read_unlock(); +} + +void start_auto_affinity(struct auto_affinity *auto_affi) +{ + ktime_t
delay_ms; + + mutex_lock(&smart_grid_used_mutex); + raw_spin_lock_irq(&auto_affi->lock); + if (auto_affi->period_active == 1) { + raw_spin_unlock_irq(&auto_affi->lock); + mutex_unlock(&smart_grid_used_mutex); + return; + } + + auto_affi->period_active = 1; + auto_affi->mode = 1; + delay_ms = ms_to_ktime(sysctl_affinity_adjust_delay_ms); + hrtimer_forward_now(&auto_affi->period_timer, delay_ms); + hrtimer_start_expires(&auto_affi->period_timer, + HRTIMER_MODE_ABS_PINNED); + raw_spin_unlock_irq(&auto_affi->lock); + + smart_grid_usage_inc(); + mutex_unlock(&smart_grid_used_mutex); +} + +void stop_auto_affinity(struct auto_affinity *auto_affi) +{ + struct affinity_domain *ad = &auto_affi->ad; + + mutex_lock(&smart_grid_used_mutex); + raw_spin_lock_irq(&auto_affi->lock); + if (auto_affi->period_active == 0) { + raw_spin_unlock_irq(&auto_affi->lock); + mutex_unlock(&smart_grid_used_mutex); + return; + } + auto_affi->period_active = 0; + auto_affi->mode = 0; + ad->curr_level = ad->dcount > 0 ? 
ad->dcount - 1 : 0; + raw_spin_unlock_irq(&auto_affi->lock); + + smart_grid_usage_dec(); + sched_grid_zone_update(false); + mutex_unlock(&smart_grid_used_mutex); +} + +static struct sched_group *sd_find_idlest_group(struct sched_domain *sd) +{ + struct sched_group *idlest = NULL, *group = sd->groups; + unsigned long min_runnable_load = ULONG_MAX; + unsigned long min_avg_load = ULONG_MAX; + int imbalance_scale = 100 + (sd->imbalance_pct-100)/2; + unsigned long imbalance = scale_load_down(NICE_0_LOAD) * + (sd->imbalance_pct-100) / 100; + + do { + unsigned long load, avg_load, runnable_load; + int i; + + avg_load = 0; + runnable_load = 0; + + for_each_cpu(i, sched_group_span(group)) { + load = cpu_runnable(cpu_rq(i)); + runnable_load += load; + avg_load += cfs_rq_load_avg(&cpu_rq(i)->cfs); + } + + avg_load = (avg_load * SCHED_CAPACITY_SCALE) / + group->sgc->capacity; + runnable_load = (runnable_load * SCHED_CAPACITY_SCALE) / + group->sgc->capacity; + + if (min_runnable_load > (runnable_load + imbalance)) { + min_runnable_load = runnable_load; + min_avg_load = avg_load; + idlest = group; + } else if ((runnable_load < (min_runnable_load + imbalance)) && + (100*min_avg_load > imbalance_scale*avg_load)) { + min_avg_load = avg_load; + idlest = group; + } + } while (group = group->next, group != sd->groups); + + return idlest ? 
idlest : group; +} + +static int group_find_idlest_cpu(struct sched_group *group) +{ + int least_loaded_cpu = cpumask_first(sched_group_span(group)); + unsigned long load, min_load = ULONG_MAX; + unsigned int min_exit_latency = UINT_MAX; + u64 latest_idle_timestamp = 0; + int shallowest_idle_cpu = -1; + int i; + + if (group->group_weight == 1) + return least_loaded_cpu; + + for_each_cpu(i, sched_group_span(group)) { + if (sched_idle_cpu(i)) + return i; + + if (available_idle_cpu(i)) { + struct rq *rq = cpu_rq(i); + struct cpuidle_state *idle = idle_get_state(rq); + + if (idle && idle->exit_latency < min_exit_latency) { + min_exit_latency = idle->exit_latency; + latest_idle_timestamp = rq->idle_stamp; + shallowest_idle_cpu = i; + } else if ((!idle || + idle->exit_latency == min_exit_latency) && + rq->idle_stamp > latest_idle_timestamp) { + latest_idle_timestamp = rq->idle_stamp; + shallowest_idle_cpu = i; + } + } else if (shallowest_idle_cpu == -1) { + load = cpu_runnable(cpu_rq(i)); + if (load < min_load) { + min_load = load; + least_loaded_cpu = i; + } + } + } + + return shallowest_idle_cpu != -1 ? 
shallowest_idle_cpu : + least_loaded_cpu; +} + +void free_affinity_domains(struct affinity_domain *ad) +{ + int i; + + for (i = 0; i < AD_LEVEL_MAX; i++) { + kfree(ad->domains[i]); + kfree(ad->domains_orig[i]); + ad->domains[i] = NULL; + ad->domains_orig[i] = NULL; + } + ad->dcount = 0; +} + +static int init_affinity_domains_orig(struct affinity_domain *ad) +{ + int i, j; + + for (i = 0; i < ad->dcount; i++) { + ad->domains_orig[i] = kmalloc(sizeof(cpumask_t), GFP_KERNEL); + if (!ad->domains_orig[i]) + goto err; + + cpumask_copy(ad->domains_orig[i], ad->domains[i]); + } + + return 0; +err: + for (j = 0; j < i; j++) { + kfree(ad->domains_orig[j]); + ad->domains_orig[j] = NULL; + } + return -ENOMEM; +} + +static int init_affinity_domains(struct affinity_domain *ad) +{ + struct sched_domain *sd = NULL, *tmp; + struct sched_group *idlest = NULL; + int ret = -ENOMEM; + int dcount = 0; + int i = 0; + int cpu; + + for (i = 0; i < AD_LEVEL_MAX; i++) { + ad->domains[i] = kmalloc(sizeof(cpumask_t), GFP_KERNEL); + if (!ad->domains[i]) + goto err; + } + + rcu_read_lock(); + cpu = cpumask_first_and(cpu_active_mask, + housekeeping_cpumask(HK_TYPE_DOMAIN)); + for_each_domain(cpu, tmp) { + sd = tmp; + dcount++; + } + + if (!sd || dcount > AD_LEVEL_MAX) { + rcu_read_unlock(); + ret = -EINVAL; + goto err; + } + + idlest = sd_find_idlest_group(sd); + cpu = group_find_idlest_cpu(idlest); + i = 0; + for_each_domain(cpu, tmp) { + cpumask_copy(ad->domains[i], sched_domain_span(tmp)); + __schedstat_set(ad->stay_cnt[i], 0); + i++; + } + rcu_read_unlock(); + + ad->dcount = dcount; + ad->curr_level = ad->dcount > 0 ? 
ad->dcount - 1 : 0; + ad->domain_mask = (1 << ad->dcount) - 1; + + ret = init_affinity_domains_orig(ad); + if (ret) + goto err; + + return 0; +err: + free_affinity_domains(ad); + return ret; +} + +int init_auto_affinity(struct task_group *tg) +{ + struct auto_affinity *auto_affi; + int ret; + + /* No need init auto affinity when smart grid disabled */ + if (!smart_grid_enabled()) + return 0; + + auto_affi = kzalloc(sizeof(*auto_affi), GFP_KERNEL); + if (!auto_affi) + return -ENOMEM; + + raw_spin_lock_init(&auto_affi->lock); + auto_affi->mode = 0; + auto_affi->period_active = 0; + auto_affi->period = ms_to_ktime(AUTO_AFFINITY_DEFAULT_PERIOD_MS); + hrtimer_init(&auto_affi->period_timer, CLOCK_MONOTONIC, + HRTIMER_MODE_ABS_PINNED); + auto_affi->period_timer.function = sched_auto_affi_period_timer; + + ret = init_affinity_domains(&auto_affi->ad); + if (ret) { + kfree(auto_affi); + if (ret == -EINVAL) { + ret = 0; + pr_warn("init affinity domain fail\n"); + } + return ret; + } + + auto_affi->tg = tg; + tg->auto_affinity = auto_affi; + INIT_LIST_HEAD(&auto_affi->af_list); + sched_grid_zone_add_af(auto_affi); + return 0; +} + +static void destroy_auto_affinity(struct task_group *tg) +{ + struct auto_affinity *auto_affi = tg->auto_affinity; + + if (!smart_grid_enabled()) + return; + + if (unlikely(!auto_affi)) + return; + + if (auto_affi->period_active) + smart_grid_usage_dec(); + + hrtimer_cancel(&auto_affi->period_timer); + sched_grid_zone_del_af(auto_affi); + free_affinity_domains(&auto_affi->ad); + + kfree(tg->auto_affinity); + tg->auto_affinity = NULL; +} +#else +static void destroy_auto_affinity(struct task_group *tg) {} + +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY +static inline bool prefer_cpus_valid(struct task_struct *p); + +static inline struct cpumask *task_prefer_cpus(struct task_struct *p) +{ + return p->prefer_cpus; +} + +static inline int dynamic_affinity_mode(struct task_struct *p) +{ + if (!prefer_cpus_valid(p)) + return -1; + + return 0; +} +#endif /* 
CONFIG_QOS_SCHED_DYNAMIC_AFFINITY */ +#endif /* CONFIG_QOS_SCHED_SMART_GRID */ + /************************************************** * CFS operations on tasks: */ @@ -8362,13 +8862,16 @@ __setup("dynamic_affinity", dynamic_affinity_switch_setup); static inline bool prefer_cpus_valid(struct task_struct *p) { + struct cpumask *prefer_cpus; + if (!dynamic_affinity_enabled()) return false; - return p->prefer_cpus && - !cpumask_empty(p->prefer_cpus) && - !cpumask_equal(p->prefer_cpus, p->cpus_ptr) && - cpumask_subset(p->prefer_cpus, p->cpus_ptr); + prefer_cpus = task_prefer_cpus(p); + + return !cpumask_empty(prefer_cpus) && + !cpumask_equal(prefer_cpus, p->cpus_ptr) && + cpumask_subset(prefer_cpus, p->cpus_ptr); } static inline unsigned long taskgroup_cpu_util(struct task_group *tg, @@ -8402,13 +8905,24 @@ static void set_task_select_cpus(struct task_struct *p, int *idlest_cpu, long min_util = INT_MIN; struct task_group *tg; long spare; - int cpu; + int cpu, mode; p->select_cpus = p->cpus_ptr; - if (!prefer_cpus_valid(p)) + rcu_read_lock(); + mode = dynamic_affinity_mode(p); + if (mode == -1) { + rcu_read_unlock(); + return; + } else if (mode == 1) { + p->select_cpus = task_prefer_cpus(p); + if (idlest_cpu) + *idlest_cpu = cpumask_first(p->select_cpus); + sched_qos_affinity_set(p); + rcu_read_unlock(); return; + } - rcu_read_lock(); + /* manual mode */ tg = task_group(p); for_each_cpu(cpu, p->prefer_cpus) { if (idlest_cpu && (available_idle_cpu(cpu) || sched_idle_cpu(cpu))) { @@ -8472,7 +8986,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags) time = schedstat_start_time(); /* - * required for stable ->cpus_allowed + * required for stable ->cpus_ptr */ lockdep_assert_held(&p->pi_lock); @@ -14046,7 +14560,7 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) { struct sched_entity *se; struct cfs_rq *cfs_rq; - int i; + int i, ret; tg->cfs_rq = kcalloc(nr_cpu_ids, sizeof(cfs_rq), GFP_KERNEL); if (!tg->cfs_rq) @@ 
-14058,6 +14572,9 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) tg->shares = NICE_0_LOAD; init_cfs_bandwidth(tg_cfs_bandwidth(tg), tg_cfs_bandwidth(parent)); + ret = init_auto_affinity(tg); + if (ret) + goto err; for_each_possible_cpu(i) { cfs_rq = kzalloc_node(sizeof(struct cfs_rq), @@ -14080,6 +14597,7 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) err_free_rq: kfree(cfs_rq); err: + destroy_auto_affinity(tg); return 0; } @@ -14108,6 +14626,7 @@ void unregister_fair_sched_group(struct task_group *tg) int cpu; destroy_cfs_bandwidth(tg_cfs_bandwidth(tg)); + destroy_auto_affinity(tg); for_each_possible_cpu(cpu) { if (tg->se[cpu]) diff --git a/kernel/sched/grid/Makefile b/kernel/sched/grid/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..82f2a09c3c309e4185370d7eaf572f8a4ec2ab0f --- /dev/null +++ b/kernel/sched/grid/Makefile @@ -0,0 +1,2 @@ +# SPDX-License-Identifier: GPL-2.0 +obj-$(CONFIG_QOS_SCHED_SMART_GRID) += qos.o power.o stat.o diff --git a/kernel/sched/grid/internal.h b/kernel/sched/grid/internal.h new file mode 100644 index 0000000000000000000000000000000000000000..743f72aaffbfc27a157b232cc00b303c566099d6 --- /dev/null +++ b/kernel/sched/grid/internal.h @@ -0,0 +1,6 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_SCHED_SMART_GRID_INTERNAL_H +#define _LINUX_SCHED_SMART_GRID_INTERNAL_H +void qos_power_init(struct sched_grid_qos_power *power); +void qos_stat_init(struct sched_grid_qos_stat *stat); +#endif diff --git a/kernel/sched/grid/power.c b/kernel/sched/grid/power.c new file mode 100644 index 0000000000000000000000000000000000000000..f916cd3801ad73964f286e9997191848467c98ab --- /dev/null +++ b/kernel/sched/grid/power.c @@ -0,0 +1,27 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * Common code for QOS-aware smart grid Scheduling + * + * Copyright (C) 2023-2024 Huawei Technologies Co., Ltd + * + * Author: Wang Shaobo + * + * This program is free software; 
you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + */ +#include +#include "internal.h" + +void qos_power_init(struct sched_grid_qos_power *power) +{ + power->cpufreq_sense_ratio = 0; + power->target_cpufreq = 0; + power->cstate_sense_ratio = 0; +} diff --git a/kernel/sched/grid/qos.c b/kernel/sched/grid/qos.c new file mode 100644 index 0000000000000000000000000000000000000000..7c4cb867b60b3d72b93a84ad02e3dbbc264bfcfa --- /dev/null +++ b/kernel/sched/grid/qos.c @@ -0,0 +1,279 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * Common code for Smart Grid Scheduling + * + * Copyright (C) 2023-2024 Huawei Technologies Co., Ltd + * + * Author: Wang Shaobo + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + */ +#include +#include +#include +#include +#include +#include +#include +#include "internal.h" +#include <../kernel/sched/sched.h> + +static inline int qos_affinity_set(struct task_struct *p) +{ + int n; + struct sched_grid_qos_affinity *affinity = &p->grid_qos->affinity; + + if (likely(affinity->prefer_cpus == p->select_cpus)) + return 0; + + /* + * We want the memory allocation to be as close to the CPU + * as possible, and adjust after getting memory bandwidth usage. 
+ */ + for (n = 0; n < nr_node_ids; n++) { + if (cpumask_intersects(cpumask_of_node(n), p->select_cpus)) + node_set(n, affinity->mem_preferred_node_mask); + else + node_clear(n, affinity->mem_preferred_node_mask); + } + + affinity->prefer_cpus = p->select_cpus; + return 0; +} + +int sched_grid_qos_fork(struct task_struct *p, struct task_struct *orig) +{ + struct sched_grid_qos *qos; + + qos = kzalloc(sizeof(*qos), GFP_KERNEL); + if (!qos) + return -ENOMEM; + + qos_power_init(&qos->power); + qos_stat_init(&qos->stat); + + nodes_clear(qos->affinity.mem_preferred_node_mask); + if (likely(orig->grid_qos)) + qos->affinity = orig->grid_qos->affinity; + qos->affinity_set = qos_affinity_set; + p->grid_qos = qos; + + return 0; +} + +void sched_grid_qos_free(struct task_struct *p) +{ + kfree(p->grid_qos); + p->grid_qos = NULL; +} + +/* dynamic select a more appropriate preferred interleave nid for process */ +int sched_grid_preferred_interleave_nid(struct mempolicy *policy) +{ +#ifndef CONFIG_NUMA + return NUMA_NO_NODE; +#else + nodemask_t nmask; + unsigned int next; + struct task_struct *me = current; + nodemask_t *preferred_nmask = NULL; + + if (likely(me->grid_qos)) + preferred_nmask = + &me->grid_qos->affinity.mem_preferred_node_mask; + + if (!preferred_nmask || !policy) + return NUMA_NO_NODE; + + if (nodes_equal(policy->nodes, *preferred_nmask)) + return NUMA_NO_NODE; + /* + * We perceive the actual consumption of memory bandwidth + * in each node and post a preferred interleave nid in + * more appropriate range. 
+ */ + nodes_and(nmask, policy->nodes, *preferred_nmask); + if (nodes_empty(nmask)) + return NUMA_NO_NODE; + + next = next_node_in(me->il_prev, nmask); + if (next < MAX_NUMNODES) + me->il_prev = next; + return next; +#endif +} + +/* dynamically select a more appropriate preferred nid for process */ +int sched_grid_preferred_nid(int preferred_nid, nodemask_t *nodemask) +{ + int nd = preferred_nid; + nodemask_t nmask, ndmask; + nodemask_t *preferred_nmask = NULL; + + if (likely(current->grid_qos)) + preferred_nmask = + &current->grid_qos->affinity.mem_preferred_node_mask; + + if (!preferred_nmask) + return preferred_nid; + + /* + * We perceive the actual consumption of memory bandwidth + * in each node and post a preferred nid in more appropriate + * range. + */ + nmask = *preferred_nmask; + if (nodemask) { + if (nodes_equal(*nodemask, nmask)) + return preferred_nid; + + nodes_and(nmask, nmask, *nodemask); + } + + if (node_isset(preferred_nid, nmask)) + return preferred_nid; + + /* + * We prefer the NUMA node we're running on, if there is no limit + * to nodemask, we select preferred nid in preferred range or + * in restricted range if not.
+ */ + init_nodemask_of_node(&ndmask, numa_node_id()); + nodes_and(ndmask, nmask, ndmask); + if (!nodes_empty(ndmask)) + nd = first_node(ndmask); + else if (!nodes_empty(nmask)) + nd = first_node(nmask); + + return nd; +} + +static struct sched_grid_zone sg_zone; + +int __init sched_grid_zone_init(void) +{ + int index; + + for (index = 0; index < SMART_GRID_ZONE_NR; index++) + cpumask_clear(&sg_zone.cpus[index]); + + raw_spin_lock_init(&sg_zone.lock); + INIT_LIST_HEAD(&sg_zone.af_list_head); + return 0; +} + +int sched_grid_zone_update(bool is_locked) +{ + struct list_head *pos; + struct auto_affinity *af_pos; + unsigned long flags; + + if (!is_locked) + raw_spin_lock_irqsave(&sg_zone.lock, flags); + + cpumask_clear(&sg_zone.cpus[SMART_GRID_ZONE_HOT]); + cpumask_clear(&sg_zone.cpus[SMART_GRID_ZONE_WARM]); + + list_for_each(pos, &sg_zone.af_list_head) { + af_pos = list_entry(pos, struct auto_affinity, af_list); + + /* when smart_grid not used we need calculate all task_group */ + /* when smart_grid used we only calculate enabled task_group */ + if (smart_grid_used() && af_pos->mode == 0) + continue; + + cpumask_or(&sg_zone.cpus[SMART_GRID_ZONE_HOT], &sg_zone.cpus[SMART_GRID_ZONE_HOT], + af_pos->ad.domains[af_pos->ad.curr_level]); + /* Update warm zone CPUs to max level first */ + cpumask_or(&sg_zone.cpus[SMART_GRID_ZONE_WARM], &sg_zone.cpus[SMART_GRID_ZONE_WARM], + af_pos->ad.domains[af_pos->ad.dcount - 1]); + } + + /* Then reset warm zone CPUs without hot zone CPUs */ + cpumask_andnot(&sg_zone.cpus[SMART_GRID_ZONE_WARM], &sg_zone.cpus[SMART_GRID_ZONE_WARM], + &sg_zone.cpus[SMART_GRID_ZONE_HOT]); + + if (!is_locked) + raw_spin_unlock_irqrestore(&sg_zone.lock, flags); + + cpufreq_smart_grid_start_sync(); + return 0; +} + +int sched_grid_zone_add_af(struct auto_affinity *af) +{ + unsigned long flags; + + if (af == NULL) + return -1; + + raw_spin_lock_irqsave(&sg_zone.lock, flags); + list_add_tail(&af->af_list, &sg_zone.af_list_head); + sched_grid_zone_update(true); + 
raw_spin_unlock_irqrestore(&sg_zone.lock, flags); + return 0; +} + +int sched_grid_zone_del_af(struct auto_affinity *af) +{ + unsigned long flags; + + if (af == NULL) + return -1; + + raw_spin_lock_irqsave(&sg_zone.lock, flags); + list_del(&af->af_list); + sched_grid_zone_update(true); + raw_spin_unlock_irqrestore(&sg_zone.lock, flags); + return 0; +} + +struct cpumask *sched_grid_zone_cpumask(enum sg_zone_type zone) +{ + if (zone >= SMART_GRID_ZONE_NR) + return NULL; + + return &sg_zone.cpus[zone]; +} + +/* + * The smart_grid strategy is disabled by default (=0). + * For compatibility with the previous version of the code, + * all tasks are treated as the highest qos_level (class_lvl = 0) + * while the smart_grid strategy is disabled. + * Otherwise, when the smart_grid strategy is enabled, we use the task's + * actual class_lvl. + */ +unsigned int sysctl_smart_grid_strategy_ctrl; + +struct cpumask *sched_grid_prefer_cpus(struct task_struct *p) +{ + struct affinity_domain *ad; + enum sg_zone_type current_zone; + + ad = &task_group(p)->auto_affinity->ad; + /* + * When the smart_grid strategy is disabled, + * all tasks are treated as the highest qos_level (class_lvl = 0). + */ + if (sysctl_smart_grid_strategy_ctrl == 0) + return ad->domains[ad->curr_level]; + + /* Only place the highest level task into hot zone */ + current_zone = p->grid_qos->stat.class_lvl == SCHED_GRID_QOS_TASK_LEVEL_HIGHEST ?
+ SMART_GRID_ZONE_HOT : SMART_GRID_ZONE_WARM; + + /* Place the highest level task in current domain level itself */ + if (current_zone == SMART_GRID_ZONE_HOT) + return ad->domains[ad->curr_level]; + + return &sg_zone.cpus[current_zone]; +} diff --git a/kernel/sched/grid/stat.c b/kernel/sched/grid/stat.c new file mode 100644 index 0000000000000000000000000000000000000000..68bbc060b8110e17381ade5b8cf9d2c340360945 --- /dev/null +++ b/kernel/sched/grid/stat.c @@ -0,0 +1,47 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * Common code for QOS-aware smart grid Scheduling + * + * Copyright (C) 2023-2024 Huawei Technologies Co., Ltd + * + * Author: Wang Shaobo + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. 
+ * + */ +#include +#include "internal.h" + +static int qos_stat_set_class_level(struct sched_grid_qos_stat *qos_stat, int level) +{ + if (qos_stat == NULL || level >= SCHED_GRID_QOS_TASK_LEVEL_MAX) + return -EINVAL; + + qos_stat->class_lvl = level; + return 0; +} + +void qos_stat_init(struct sched_grid_qos_stat *stat) +{ + if (stat == NULL) + return; + + stat->sample[SCHED_GRID_QOS_IPS_INDEX].name = "ips"; + stat->sample[SCHED_GRID_QOS_IPS_INDEX].index = SCHED_GRID_QOS_IPS_INDEX; + stat->sample[SCHED_GRID_QOS_MEMBOUND_RATIO_INDEX].name = "membound_ratio"; + stat->sample[SCHED_GRID_QOS_MEMBOUND_RATIO_INDEX].index = + SCHED_GRID_QOS_MEMBOUND_RATIO_INDEX; + stat->sample[SCHED_GRID_QOS_MEMBANDWIDTH_INDEX].name = "memband_width"; + stat->sample[SCHED_GRID_QOS_MEMBANDWIDTH_INDEX].index = + SCHED_GRID_QOS_MEMBANDWIDTH_INDEX; + + stat->set_class_lvl = qos_stat_set_class_level; + stat->class_lvl = SCHED_GRID_QOS_TASK_LEVEL_DEFAULT; +} diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 19fe3c72e3fa72bfbdab19bfe22c4870f047bab5..51be327fcc6ded9add1ad952bf68038565cf528c 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -362,6 +362,35 @@ struct cfs_bandwidth { #endif }; + +#ifdef CONFIG_QOS_SCHED_SMART_GRID +#define AD_LEVEL_MAX 8 + +struct affinity_domain { + int dcount; + int curr_level; + u32 domain_mask; +#ifdef CONFIG_SCHEDSTATS + u64 stay_cnt[AD_LEVEL_MAX]; +#endif + struct cpumask *domains[AD_LEVEL_MAX]; + struct cpumask *domains_orig[AD_LEVEL_MAX]; +}; +#endif + +#ifdef CONFIG_QOS_SCHED_SMART_GRID +struct auto_affinity { + raw_spinlock_t lock; + u64 mode; + ktime_t period; + struct hrtimer period_timer; + int period_active; + struct affinity_domain ad; + struct task_group *tg; + struct list_head af_list; +}; +#endif + /* Task group related information */ struct task_group { struct cgroup_subsys_state css; @@ -418,7 +447,9 @@ struct task_group { /* Effective clamp values used for a task group */ struct uclamp_se uclamp[UCLAMP_CNT]; #endif - +#if 
defined(CONFIG_QOS_SCHED_SMART_GRID) && !defined(__GENKSYMS__) + struct auto_affinity *auto_affinity; +#endif }; #ifdef CONFIG_FAIR_GROUP_SCHED @@ -485,6 +516,21 @@ extern void sched_release_group(struct task_group *tg); extern void sched_move_task(struct task_struct *tsk); +#ifdef CONFIG_QOS_SCHED_SMART_GRID +extern void start_auto_affinity(struct auto_affinity *auto_affi); +extern void stop_auto_affinity(struct auto_affinity *auto_affi); +extern int init_auto_affinity(struct task_group *tg); +extern void tg_update_affinity_domains(int cpu, int online); + +#else +static inline int init_auto_affinity(struct task_group *tg) +{ + return 0; +} + +static inline void tg_update_affinity_domains(int cpu, int online) {} +#endif + #ifdef CONFIG_FAIR_GROUP_SCHED extern int sched_group_set_shares(struct task_group *tg, unsigned long shares); diff --git a/mm/mempolicy.c b/mm/mempolicy.c index b23a239de750b83c191f6888e0a908c78656906d..3c50d2c0cd96f4e1d387db85ef1ac9aa8acf566d 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -79,6 +79,7 @@ #include #include #include +#include #include #include #include @@ -2202,7 +2203,13 @@ struct folio *vma_alloc_folio(gfp_t gfp, int order, struct vm_area_struct *vma, struct page *page; unsigned nid; - nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order); + if (smart_grid_used()) { + nid = sched_grid_preferred_interleave_nid(pol); + nid = (nid == NUMA_NO_NODE) ? + interleave_nid(pol, vma, addr, PAGE_SHIFT + order) : nid; + } else { + nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order); + } mpol_cond_put(pol); gfp |= __GFP_COMP; page = alloc_page_interleave(gfp, order, nid); @@ -2267,6 +2274,8 @@ struct folio *vma_alloc_folio(gfp_t gfp, int order, struct vm_area_struct *vma, nmask = policy_nodemask(gfp, pol); preferred_nid = policy_node(gfp, pol, node); + if (smart_grid_used()) + preferred_nid = sched_grid_preferred_nid(preferred_nid, nmask); folio = __folio_alloc(gfp, order, preferred_nid, nmask); mpol_cond_put(pol); out: