diff --git a/arch/arm64/configs/openeuler_defconfig b/arch/arm64/configs/openeuler_defconfig index ea2452d5576d6b5f87ae28e0c2570178d9aed2af..a88d2da1fd6f99373807a4a1a5848dfd2af739c8 100644 --- a/arch/arm64/configs/openeuler_defconfig +++ b/arch/arm64/configs/openeuler_defconfig @@ -142,6 +142,7 @@ CONFIG_CGROUP_SCHED=y CONFIG_QOS_SCHED=y CONFIG_QOS_SCHED_MULTILEVEL=y CONFIG_QOS_SCHED_DYNAMIC_AFFINITY=y +CONFIG_QOS_SCHED_SMART_GRID=y CONFIG_QOS_SCHED_SMT_EXPELLER=y CONFIG_FAIR_GROUP_SCHED=y CONFIG_QOS_SCHED_PRIO_LB=y diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index 70f7a638b6db9a2548456cc1d8610fe6ad4c6cf0..969569a3facfaff25417fc647bf4282f7078d101 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -2689,6 +2689,237 @@ int cpufreq_boost_enabled(void) } EXPORT_SYMBOL_GPL(cpufreq_boost_enabled); +#ifdef CONFIG_QOS_SCHED_SMART_GRID + +struct smart_grid_zone { + char governor_name[SMART_GRID_ZONE_NR][CPUFREQ_NAME_LEN]; + unsigned int enable; + struct irq_work irq_work; + struct work_struct work; + unsigned int is_init; +}; + +static struct smart_grid_zone sg_zone; +static DEFINE_MUTEX(sg_zone_lock); + +#define SG_WRITE_BUFF_LEN 30 + +void cpufreq_smart_grid_start_sync(void) +{ + if (likely(sg_zone.is_init)) + irq_work_queue(&sg_zone.irq_work); +} + +static ssize_t show_smart_grid_governor(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + int len = 0; + int gov_index; + + mutex_lock(&sg_zone_lock); + if (!sg_zone.enable) { + mutex_unlock(&sg_zone_lock); + return sprintf(buf, "smart_grid governor disable\n"); + } + + for (gov_index = 0; gov_index < SMART_GRID_ZONE_NR; gov_index++) + len += sprintf(buf + len, "smart_grid-%d: %s\n", gov_index, + sg_zone.governor_name[gov_index]); + + mutex_unlock(&sg_zone_lock); + return len; +} + +static ssize_t store_smart_grid_governor(struct kobject *kobj, struct kobj_attribute *attr, + const char *buf, size_t count) +{ + struct cpufreq_governor *target_gov = NULL; + 
unsigned int current_level; + char *level_string = NULL; + char buf_string[SG_WRITE_BUFF_LEN]; + char *gov_string = buf_string; + char save_string[CPUFREQ_NAME_LEN]; + int ret; + + mutex_lock(&sg_zone_lock); + if (!sg_zone.enable) { + ret = -EINVAL; + goto fail; + } + + if (strscpy(buf_string, buf, SG_WRITE_BUFF_LEN) <= 0) { + ret = -EINVAL; + goto fail; + } + + level_string = strsep(&gov_string, "-"); + if (level_string == NULL) { + ret = -EINVAL; + goto fail; + } + + if (kstrtouint(level_string, 10, &current_level)) { + ret = -EINVAL; + goto fail; + } + + if (current_level >= SMART_GRID_ZONE_NR) { + ret = -EINVAL; + goto fail; + } + + if (sscanf(gov_string, "%15s", save_string) != 1) { + ret = -EINVAL; + goto fail; + } + + target_gov = cpufreq_parse_governor(save_string); + if (target_gov == NULL) { + ret = -EINVAL; + goto fail; + } + module_put(target_gov->owner); + + strscpy(sg_zone.governor_name[current_level], save_string, CPUFREQ_NAME_LEN); + cpufreq_smart_grid_start_sync(); + mutex_unlock(&sg_zone_lock); + return count; + +fail: + mutex_unlock(&sg_zone_lock); + return ret; +} +define_one_global_rw(smart_grid_governor); + +static ssize_t show_smart_grid_governor_enable(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sprintf(buf, "%u\n", sg_zone.enable); +} + +static void smart_grid_irq_work(struct irq_work *irq_work) +{ + struct smart_grid_zone *zone; + + zone = container_of(irq_work, struct smart_grid_zone, irq_work); + schedule_work_on(smp_processor_id(), &zone->work); +} + +static void smart_grid_work_handler(struct work_struct *work) +{ + struct smart_grid_zone *zone; + struct cpufreq_governor *target_gov = NULL; + struct cpufreq_policy *policy = NULL; + unsigned int cpu; + int gov_index; + + zone = container_of(work, struct smart_grid_zone, work); + + mutex_lock(&sg_zone_lock); + if (!sg_zone.enable) { + mutex_unlock(&sg_zone_lock); + return; + } + + for (gov_index = 0; gov_index < SMART_GRID_ZONE_NR; gov_index++) { + target_gov = 
cpufreq_parse_governor(sg_zone.governor_name[gov_index]); + if (target_gov == NULL) + continue; + + for_each_cpu(cpu, sched_grid_zone_cpumask(gov_index)) { + if (cpu_is_offline(cpu)) + continue; + + policy = cpufreq_cpu_acquire(cpu); + if (policy == NULL) + continue; + + if (policy->governor == target_gov) { + cpufreq_cpu_release(policy); + continue; + } + /*Try to switch governor */ + store_scaling_governor(policy, sg_zone.governor_name[gov_index], + CPUFREQ_NAME_LEN); + cpufreq_cpu_release(policy); + } + module_put(target_gov->owner); + } + mutex_unlock(&sg_zone_lock); +} + +static void sg_zone_set_enable(void) +{ + int gov_index; + + /* Set default smart_grid governor */ + for (gov_index = 0; gov_index < SMART_GRID_ZONE_NR; gov_index++) { + if (!gov_index) + strscpy(sg_zone.governor_name[gov_index], "performance", CPUFREQ_NAME_LEN); + else + strscpy(sg_zone.governor_name[gov_index], "powersave", CPUFREQ_NAME_LEN); + } + + sg_zone.enable = 1; + cpufreq_smart_grid_start_sync(); +} + +static void sg_zone_set_disable(void) +{ + irq_work_sync(&sg_zone.irq_work); + cancel_work_sync(&sg_zone.work); + sg_zone.enable = 0; +} + +static ssize_t store_smart_grid_governor_enable(struct kobject *kobj, struct kobj_attribute *attr, + const char *buf, size_t count) +{ + unsigned int enable; + + if (kstrtouint(buf, 10, &enable)) + return -EINVAL; + + if (enable > 1) + return -EINVAL; + + mutex_lock(&sg_zone_lock); + if (sg_zone.enable == enable) { + mutex_unlock(&sg_zone_lock); + return -EINVAL; + } + + if (enable) + sg_zone_set_enable(); + else + sg_zone_set_disable(); + + mutex_unlock(&sg_zone_lock); + return count; +} +define_one_global_rw(smart_grid_governor_enable); + +static int create_smart_grid_sysfs_file(void) +{ + int ret; + + ret = sysfs_create_file(cpufreq_global_kobject, &smart_grid_governor.attr); + if (ret) + pr_err("%s: cannot register global smart_grid_governor sysfs file\n", + __func__); + + ret = sysfs_create_file(cpufreq_global_kobject, 
&smart_grid_governor_enable.attr); + if (ret) + pr_err("%s: cannot register global smart_grid_governor_enable sysfs file\n", + __func__); + + init_irq_work(&sg_zone.irq_work, smart_grid_irq_work); + INIT_WORK(&sg_zone.work, smart_grid_work_handler); + sg_zone.enable = 0; + sg_zone.is_init = 1; + return ret; +} +#endif + /********************************************************************* * REGISTER / UNREGISTER CPUFREQ DRIVER * *********************************************************************/ @@ -2861,6 +3092,9 @@ static int __init cpufreq_core_init(void) if (!strlen(default_governor)) strncpy(default_governor, gov->name, CPUFREQ_NAME_LEN); +#ifdef CONFIG_QOS_SCHED_SMART_GRID + create_smart_grid_sysfs_file(); +#endif return 0; } module_param(off, int, 0444); diff --git a/fs/proc/array.c b/fs/proc/array.c index 18a4588c35be632449258c2fc5d0e6f4ccd5effd..989f7602035c7bf3da3ad39e478af404ec0886be 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -389,6 +389,16 @@ static void task_cpus_allowed(struct seq_file *m, struct task_struct *task) cpumask_pr_args(task->cpus_ptr)); } +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY +static void task_cpus_preferred(struct seq_file *m, struct task_struct *task) +{ + seq_printf(m, "Cpus_preferred:\t%*pb\n", + cpumask_pr_args(task->prefer_cpus)); + seq_printf(m, "Cpus_preferred_list:\t%*pbl\n", + cpumask_pr_args(task->prefer_cpus)); +} +#endif + static inline void task_core_dumping(struct seq_file *m, struct mm_struct *mm) { seq_put_decimal_ull(m, "CoreDumping:\t", !!mm->core_state); @@ -427,6 +437,9 @@ int proc_pid_status(struct seq_file *m, struct pid_namespace *ns, task_cpus_allowed(m, task); cpuset_task_status_allowed(m, task); task_context_switch_counts(m, task); +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + task_cpus_preferred(m, task); +#endif return 0; } diff --git a/fs/proc/base.c b/fs/proc/base.c index 2a4cc5c796c74ca5cd826fb1f668d39c4fd02f56..e9a5550fbbe28682621fa25bb8524687d7be1579 100644 --- a/fs/proc/base.c +++ 
b/fs/proc/base.c @@ -93,6 +93,10 @@ #include #include #include +#ifdef CONFIG_QOS_SCHED_SMART_GRID +#include +#include +#endif #include #include #include @@ -3493,6 +3497,75 @@ static int proc_stack_depth(struct seq_file *m, struct pid_namespace *ns, } #endif /* CONFIG_STACKLEAK_METRICS */ +#ifdef CONFIG_QOS_SCHED_SMART_GRID +static int smart_grid_level_show(struct seq_file *m, void *v) +{ + struct inode *inode = m->private; + struct task_struct *p; + + p = get_proc_task(inode); + if (!p) + return -ESRCH; + + if (p->grid_qos != NULL) + seq_printf(m, "%d\n", p->grid_qos->stat.class_lvl); + + put_task_struct(p); + + return 0; +} + +static int smart_grid_level_open(struct inode *inode, struct file *filp) +{ + return single_open(filp, smart_grid_level_show, inode); +} + +static ssize_t smart_grid_level_write(struct file *file, const char __user *buf, + size_t count, loff_t *offset) +{ + struct inode *inode = file_inode(file); + struct task_struct *p; + char buffer[TASK_COMM_LEN]; + const size_t maxlen = sizeof(buffer) - 1; + unsigned long long level = SCHED_GRID_QOS_TASK_LEVEL_MAX; + unsigned int len; + int ret = 0; + + memset(buffer, 0, sizeof(buffer)); + if (copy_from_user(buffer, buf, count > maxlen ? 
maxlen : count)) + return -EFAULT; + + p = get_proc_task(inode); + if (!p) + return -ESRCH; + + len = _parse_integer(buffer, 10, &level); + if (len & KSTRTOX_OVERFLOW || + level >= SCHED_GRID_QOS_TASK_LEVEL_MAX) { + put_task_struct(p); + return -EINVAL; + } + + if (p->grid_qos != NULL && + p->grid_qos->stat.set_class_lvl != NULL) + ret = p->grid_qos->stat.set_class_lvl(&p->grid_qos->stat, level); + + put_task_struct(p); + + if (ret) + return ret; + return count; +} + +static const struct file_operations proc_pid_sg_level_operations = { + .open = smart_grid_level_open, + .read = seq_read, + .write = smart_grid_level_write, + .llseek = seq_lseek, + .release = single_release, +}; +#endif + /* * Thread groups */ @@ -3516,6 +3589,9 @@ static const struct pid_entry tgid_base_stuff[] = { #ifdef CONFIG_SCHED_DEBUG REG("sched", S_IRUGO|S_IWUSR, proc_pid_sched_operations), #endif +#ifdef CONFIG_QOS_SCHED_SMART_GRID + REG("smart_grid_level", 0644, proc_pid_sg_level_operations), +#endif #ifdef CONFIG_SCHED_AUTOGROUP REG("autogroup", S_IRUGO|S_IWUSR, proc_pid_sched_autogroup_operations), #endif diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index acbad3b363226efa0bd6dd45514cc1490edb2813..b715828cf3576dccb87c605cc14af4f7fd139914 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -16,6 +16,9 @@ #include #include #include +#ifdef CONFIG_QOS_SCHED_SMART_GRID +#include +#endif /********************************************************************* * CPUFREQ INTERFACE * @@ -606,6 +609,14 @@ int cpufreq_register_governor(struct cpufreq_governor *governor); void cpufreq_unregister_governor(struct cpufreq_governor *governor); int cpufreq_start_governor(struct cpufreq_policy *policy); void cpufreq_stop_governor(struct cpufreq_policy *policy); +#ifdef CONFIG_QOS_SCHED_SMART_GRID +/* Implement in cpufreq.c */ +#ifdef CONFIG_CPU_FREQ +void cpufreq_smart_grid_start_sync(void); +#else +static inline void cpufreq_smart_grid_start_sync(void) { return; } +#endif 
+#endif #define cpufreq_governor_init(__governor) \ static int __init __governor##_init(void) \ diff --git a/include/linux/sched.h b/include/linux/sched.h index d39427f8044d3a5a4b57f4ee9e7aefbe82b3c4a5..7e5af98957d964008caf580108def488a58978ba 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1449,7 +1449,16 @@ struct task_struct { KABI_RESERVE(10) KABI_RESERVE(11) #endif + +#if !defined(__GENKSYMS__) +#if defined(CONFIG_QOS_SCHED_SMART_GRID) + struct sched_grid_qos *grid_qos; +#else KABI_RESERVE(12) +#endif +#else + KABI_RESERVE(12) +#endif KABI_RESERVE(13) KABI_RESERVE(14) KABI_RESERVE(15) @@ -2255,6 +2264,19 @@ void sched_prefer_cpus_free(struct task_struct *p); void dynamic_affinity_enable(void); #endif +#ifdef CONFIG_QOS_SCHED_SMART_GRID +extern struct static_key __smart_grid_used; +static inline bool smart_grid_used(void) +{ + return static_key_false(&__smart_grid_used); +} +#else +static inline bool smart_grid_used(void) +{ + return false; +} +#endif + #ifdef CONFIG_BPF_SCHED extern void sched_settag(struct task_struct *tsk, s64 tag); diff --git a/include/linux/sched/grid_qos.h b/include/linux/sched/grid_qos.h new file mode 100644 index 0000000000000000000000000000000000000000..fe334355bb4923ab941f02ef8ba6bc4eb74d2798 --- /dev/null +++ b/include/linux/sched/grid_qos.h @@ -0,0 +1,135 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_SCHED_GRID_QOS_H +#define _LINUX_SCHED_GRID_QOS_H +#include +#include + +#ifdef CONFIG_QOS_SCHED_SMART_GRID +enum sched_grid_qos_class { + SCHED_GRID_QOS_CLASS_LEVEL_1 = 0, + SCHED_GRID_QOS_CLASS_LEVEL_2 = 1, + SCHED_GRID_QOS_CLASS_LEVEL_3 = 2, + SCHED_GRID_QOS_CLASS_LEVEL_4 = 3, + SCHED_GRID_QOS_CLASS_LEVEL_5 = 4, + SCHED_GRID_QOS_CLASS_LEVEL_6 = 5, + SCHED_GRID_QOS_CLASS_LEVEL_7 = 6, + SCHED_GRID_QOS_CLASS_LEVEL_8 = 7, + SCHED_GRID_QOS_CLASS_LEVEL_NR +}; + +/* + * SCHED_GRID_QOS_TASK_LEVEL_* define the different QoS levels. + * A lower number has a higher priority (e.g.
0 is the highest). + * The enum sched_grid_qos_class defines the max level, i.e. the lowest priority. + */ +#define SCHED_GRID_QOS_TASK_LEVEL_HIGHEST SCHED_GRID_QOS_CLASS_LEVEL_1 +#define SCHED_GRID_QOS_TASK_LEVEL_MAX (SCHED_GRID_QOS_CLASS_LEVEL_NR) +#define SCHED_GRID_QOS_TASK_LEVEL_DEFAULT (SCHED_GRID_QOS_CLASS_LEVEL_NR - 1) + +enum { + SCHED_GRID_QOS_IPS_INDEX = 0, + SCHED_GRID_QOS_MEMBOUND_RATIO_INDEX = 1, + SCHED_GRID_QOS_MEMBANDWIDTH_INDEX = 2, + SCHED_GRID_QOS_SAMPLE_NR +}; + +#define SCHED_GRID_QOS_RING_BUFFER_MAXLEN 100 + +struct sched_grid_qos_ring_buffer { + u64 vecs[SCHED_GRID_QOS_RING_BUFFER_MAXLEN]; + unsigned int head; + void (*push)(u64 *data, int stepsize, + struct sched_grid_qos_ring_buffer *ring_buffer); +}; + +struct sched_grid_qos_sample { + const char *name; + int index; + int sample_bypass; + int sample_times; + struct sched_grid_qos_ring_buffer ring_buffer; + u64 pred_target[MAX_NUMNODES]; + void (*cal_target)(int stepsize, + struct sched_grid_qos_ring_buffer *ring_buffer); + + int account_ready; + int (*start)(void *arg); + int (*account)(void *arg); +}; + +struct sched_grid_qos_stat { + enum sched_grid_qos_class class_lvl; + int (*set_class_lvl)(struct sched_grid_qos_stat *qos_stat, int level); + struct sched_grid_qos_sample sample[SCHED_GRID_QOS_SAMPLE_NR]; +}; + +struct sched_grid_qos_power { + int cpufreq_sense_ratio; + int target_cpufreq; + int cstate_sense_ratio; +}; + +struct sched_grid_qos_affinity { + nodemask_t mem_preferred_node_mask; + const struct cpumask *prefer_cpus; +}; + +struct task_struct; +struct sched_grid_qos { + struct sched_grid_qos_stat stat; + struct sched_grid_qos_power power; + struct sched_grid_qos_affinity affinity; + + int (*affinity_set)(struct task_struct *p); +}; + +static inline int sched_qos_affinity_set(struct task_struct *p) +{ + return p->grid_qos->affinity_set(p); +} + +int sched_grid_qos_fork(struct task_struct *p, struct task_struct *orig); +void sched_grid_qos_free(struct task_struct *p); + +int 
sched_grid_preferred_interleave_nid(struct mempolicy *policy); +int sched_grid_preferred_nid(int preferred_nid, nodemask_t *nodemask); + +enum sg_zone_type { + SMART_GRID_ZONE_HOT = 0, + SMART_GRID_ZONE_WARM, + SMART_GRID_ZONE_NR +}; + +struct auto_affinity; +struct sched_grid_zone { + raw_spinlock_t lock; + struct cpumask cpus[SMART_GRID_ZONE_NR]; + struct list_head af_list_head; /* struct auto_affinity list head */ +}; + +int __init sched_grid_zone_init(void); +int sched_grid_zone_update(bool is_locked); +int sched_grid_zone_add_af(struct auto_affinity *af); +int sched_grid_zone_del_af(struct auto_affinity *af); +struct cpumask *sched_grid_zone_cpumask(enum sg_zone_type zone); +struct cpumask *sched_grid_prefer_cpus(struct task_struct *p); +#else +static inline int __init sched_grid_zone_init(void) { return 0; } + +static inline int +sched_grid_preferred_interleave_nid(struct mempolicy *policy) +{ + return NUMA_NO_NODE; +} +static inline int +sched_grid_preferred_nid(int preferred_nid, nodemask_t *nodemask) +{ + return preferred_nid; +} + +static inline int sched_qos_affinity_set(struct task_struct *p) +{ + return 0; +} +#endif +#endif diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h index 4d6bbc0934c9ed8534d291d586192aaa6e4ec587..5cd5b3c579d3735bfb8109f57bfb590dc59b3359 100644 --- a/include/linux/sched/sysctl.h +++ b/include/linux/sched/sysctl.h @@ -35,6 +35,11 @@ extern unsigned int sysctl_sched_child_runs_first; extern int sysctl_sched_util_low_pct; #endif +#ifdef CONFIG_QOS_SCHED_SMART_GRID +extern unsigned int sysctl_smart_grid_strategy_ctrl; +extern int sysctl_affinity_adjust_delay_ms; +#endif + enum sched_tunable_scaling { SCHED_TUNABLESCALING_NONE, SCHED_TUNABLESCALING_LOG, diff --git a/init/Kconfig b/init/Kconfig index 83714edd7bf9f2db51b76c36dad71ede11ba6006..bb906380755664fef16db77c7f5bde0e889eae7a 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1078,6 +1078,19 @@ config UCLAMP_TASK_GROUP If in doubt, say N. 
+config QOS_SCHED_SMART_GRID + bool "qos smart grid scheduler" + depends on FAIR_GROUP_SCHED && QOS_SCHED_DYNAMIC_AFFINITY + default n + help + This feature is used for power consumption tuning in server scenarios. + This can be divided into the following aspects: + 1. User interface, manage user needs. + 2. Collect tasks' features to ensure key tasks' QOS. + 3. Weaken the impact of CPU frequency and cpuidle + adjustment on tasks. + 4. Integrate with the EAS (Energy Aware Scheduling) model. + config CGROUP_PIDS bool "PIDs controller" help diff --git a/kernel/fork.c b/kernel/fork.c index a531fd38d111d4df36e44215babe011435199eef..e495ae00b0a95acfe302cf1c03448a1b1b37a2ba 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -97,7 +97,9 @@ #include #include #include - +#ifdef CONFIG_QOS_SCHED_SMART_GRID +#include +#endif #include #include #include @@ -470,6 +472,9 @@ void free_task(struct task_struct *tsk) free_kthread_struct(tsk); #ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY sched_prefer_cpus_free(tsk); +#endif +#ifdef CONFIG_QOS_SCHED_SMART_GRID + sched_grid_qos_free(tsk); #endif free_task_struct(tsk); } @@ -2063,7 +2068,7 @@ static __latent_entropy struct task_struct *copy_process( #ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY retval = sched_prefer_cpus_fork(p, current->prefer_cpus); if (retval) - goto bad_fork_free; + goto bad_fork_cleanup_count; #endif lockdep_assert_irqs_enabled(); @@ -2083,6 +2088,12 @@ static __latent_entropy struct task_struct *copy_process( if (retval < 0) goto bad_fork_free; +#ifdef CONFIG_QOS_SCHED_SMART_GRID + retval = sched_grid_qos_fork(p, current); + if (retval) + goto bad_fork_cleanup_count; +#endif + /* * If multiple threads are within copy_process(), then this check * triggers too late. 
This doesn't hurt, the check is only there diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index 6f3106774d055ba3568c43afc86d0a8a84e8e405..a6fe0ee09917a9a81a9ecaf2e73f16bd616400fd 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile @@ -39,3 +39,4 @@ obj-$(CONFIG_PSI) += psi.o obj-$(CONFIG_SCHED_CORE) += core_sched.o obj-$(CONFIG_BPF_SCHED) += bpf_sched.o obj-$(CONFIG_BPF_SCHED) += bpf_topology.o +obj-$(CONFIG_QOS_SCHED_SMART_GRID) += grid/ diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 8c8c946ee1de223cb5c84a770abbeeefccd7f9fb..4bd0a7b8b22dfefd1ae478d26ac69957db76247e 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -23,7 +23,7 @@ #include "../workqueue_internal.h" #include "../../io_uring/io-wq.h" #include "../smpboot.h" - +#include #include "pelt.h" #include "smp.h" @@ -8009,6 +8009,7 @@ int sched_cpu_activate(unsigned int cpu) static_branch_inc_cpuslocked(&sched_smt_present); #endif set_cpu_active(cpu, true); + tg_update_affinity_domains(cpu, 1); if (sched_smp_initialized) { sched_domains_numa_masks_set(cpu); @@ -8071,6 +8072,7 @@ int sched_cpu_deactivate(unsigned int cpu) return ret; } sched_domains_numa_masks_clear(cpu); + tg_update_affinity_domains(cpu, 0); return 0; } @@ -8140,6 +8142,9 @@ void __init sched_init_smp(void) init_sched_dl_class(); sched_smp_initialized = true; + + sched_grid_zone_init(); + init_auto_affinity(&root_task_group); } static int __init migration_init(void) @@ -9512,6 +9517,138 @@ static inline s64 cpu_smt_expell_read(struct cgroup_subsys_state *css, } #endif +#ifdef CONFIG_QOS_SCHED_SMART_GRID +int tg_set_dynamic_affinity_mode(struct task_group *tg, u64 mode) +{ + struct auto_affinity *auto_affi = tg->auto_affinity; + + if (unlikely(!auto_affi)) + return -EPERM; + + /* auto mode*/ + if (mode == 1) { + start_auto_affinity(auto_affi); + } else if (mode == 0) { + stop_auto_affinity(auto_affi); + } else { + return -EINVAL; + } + + return 0; +} + +static u64 cpu_affinity_mode_read_u64(struct 
cgroup_subsys_state *css, + struct cftype *cft) +{ + struct task_group *tg = css_tg(css); + + if (unlikely(!tg->auto_affinity)) + return -EPERM; + + return tg->auto_affinity->mode; +} + +static int cpu_affinity_mode_write_u64(struct cgroup_subsys_state *css, + struct cftype *cftype, u64 mode) +{ + return tg_set_dynamic_affinity_mode(css_tg(css), mode); +} + +int tg_set_affinity_period(struct task_group *tg, u64 period_ms) +{ + if (unlikely(!tg->auto_affinity)) + return -EPERM; + + if (!period_ms || period_ms > U64_MAX / NSEC_PER_MSEC) + return -EINVAL; + + raw_spin_lock_irq(&tg->auto_affinity->lock); + tg->auto_affinity->period = ms_to_ktime(period_ms); + raw_spin_unlock_irq(&tg->auto_affinity->lock); + return 0; +} + +u64 tg_get_affinity_period(struct task_group *tg) +{ + if (unlikely(!tg->auto_affinity)) + return -EPERM; + + return ktime_to_ms(tg->auto_affinity->period); +} + +static int cpu_affinity_period_write_uint(struct cgroup_subsys_state *css, + struct cftype *cftype, u64 period) +{ + return tg_set_affinity_period(css_tg(css), period); +} + +static u64 cpu_affinity_period_read_uint(struct cgroup_subsys_state *css, + struct cftype *cft) +{ + return tg_get_affinity_period(css_tg(css)); +} + +static int cpu_affinity_domain_mask_write_u64(struct cgroup_subsys_state *css, + struct cftype *cftype, + u64 mask) +{ + struct task_group *tg = css_tg(css); + struct affinity_domain *ad; + u16 full; + + if (unlikely(!tg->auto_affinity)) + return -EPERM; + + ad = &tg->auto_affinity->ad; + full = (1 << ad->dcount) - 1; + if (mask > full) + return -EINVAL; + + raw_spin_lock_irq(&tg->auto_affinity->lock); + ad->domain_mask = mask; + raw_spin_unlock_irq(&tg->auto_affinity->lock); + return 0; +} + +static u64 cpu_affinity_domain_mask_read_u64(struct cgroup_subsys_state *css, + struct cftype *cft) +{ + struct task_group *tg = css_tg(css); + + if (unlikely(!tg->auto_affinity)) + return -EPERM; + + return tg->auto_affinity->ad.domain_mask; +} + +static int 
cpu_affinity_stat_show(struct seq_file *sf, void *v) +{ + struct task_group *tg = css_tg(seq_css(sf)); + struct auto_affinity *auto_affi = tg->auto_affinity; + struct affinity_domain *ad; + int i; + + if (unlikely(!auto_affi)) + return -EPERM; + + ad = &auto_affi->ad; + seq_printf(sf, "period_active %d\n", auto_affi->period_active); + seq_printf(sf, "dcount %d\n", ad->dcount); + seq_printf(sf, "domain_mask 0x%x\n", ad->domain_mask); + seq_printf(sf, "curr_level %d\n", ad->curr_level); + seq_printf(sf, "zone hot %*pbl\n", + cpumask_pr_args(sched_grid_zone_cpumask(SMART_GRID_ZONE_HOT))); + seq_printf(sf, "zone warm %*pbl\n", + cpumask_pr_args(sched_grid_zone_cpumask(SMART_GRID_ZONE_WARM))); + for (i = 0; i < ad->dcount; i++) + seq_printf(sf, "sd_level %d, cpu list %*pbl, stay_cnt %llu\n", + i, cpumask_pr_args(ad->domains[i]), + schedstat_val(ad->stay_cnt[i])); + + return 0; +} +#endif /* CONFIG_QOS_SCHED_SMART_GRID */ + #ifdef CONFIG_QOS_SCHED static int tg_change_scheduler(struct task_group *tg, void *data) { @@ -9673,6 +9810,27 @@ static struct cftype cpu_legacy_files[] = { .write_u64 = cpu_shares_write_u64, }, #endif +#ifdef CONFIG_QOS_SCHED_SMART_GRID + { + .name = "dynamic_affinity_mode", + .read_u64 = cpu_affinity_mode_read_u64, + .write_u64 = cpu_affinity_mode_write_u64, + }, + { + .name = "affinity_period_ms", + .read_u64 = cpu_affinity_period_read_uint, + .write_u64 = cpu_affinity_period_write_uint, + }, + { + .name = "affinity_domain_mask", + .read_u64 = cpu_affinity_domain_mask_read_u64, + .write_u64 = cpu_affinity_domain_mask_write_u64, + }, + { + .name = "affinity_stat", + .seq_show = cpu_affinity_stat_show, + }, +#endif #ifdef CONFIG_CFS_BANDWIDTH { .name = "cfs_quota_us", diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 5e39da0ae0868756819d096717422e67ccda7998..a9712bdeafa34eaa0b16393661ee39e9ababe386 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -12,6 +12,7 @@ #include 
#include +#include #define IOWAIT_BOOST_MIN (SCHED_CAPACITY_SCALE / 8) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 6ab0296e7be2f4c1f5c1177522ae064dd5044c71..f5f3a104504aef84b3e56c2c0abf81115af3b2dc 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -28,6 +28,7 @@ #include #include #endif +#include #include /* @@ -5810,6 +5811,460 @@ static inline void unthrottle_offline_cfs_rqs(struct rq *rq) {} #endif /* CONFIG_CFS_BANDWIDTH */ +#ifdef CONFIG_QOS_SCHED_SMART_GRID +#define AUTO_AFFINITY_DEFAULT_PERIOD_MS 2000 +#define IS_DOMAIN_SET(level, mask) ((1 << (level)) & (mask)) + +static DEFINE_MUTEX(smart_grid_used_mutex); + +static inline unsigned long cpu_util(int cpu); +static unsigned long capacity_of(int cpu); +static int sched_idle_cpu(int cpu); +static unsigned long cpu_runnable(struct rq *rq); +static inline bool prefer_cpus_valid(struct task_struct *p); + +int sysctl_affinity_adjust_delay_ms = 5000; + +struct static_key __smart_grid_used; + +static void smart_grid_usage_inc(void) +{ + static_key_slow_inc(&__smart_grid_used); +} + +static void smart_grid_usage_dec(void) +{ + static_key_slow_dec(&__smart_grid_used); +} + +static inline struct cpumask *task_prefer_cpus(struct task_struct *p) +{ + if (!smart_grid_used()) + return p->prefer_cpus; + + if (task_group(p)->auto_affinity->mode == 0) + return (void *)p->cpus_ptr; + + return sched_grid_prefer_cpus(p); +} + +static inline int dynamic_affinity_mode(struct task_struct *p) +{ + if (!prefer_cpus_valid(p)) + return -1; + + if (smart_grid_used()) + return task_group(p)->auto_affinity->mode == 0 ? 
-1 : 1; + + return 0; +} + +static void affinity_domain_up(struct task_group *tg) +{ + struct affinity_domain *ad = &tg->auto_affinity->ad; + u16 level = ad->curr_level; + + if (ad->curr_level >= ad->dcount - 1) + return; + + while (level < ad->dcount) { + if (IS_DOMAIN_SET(level + 1, ad->domain_mask) && + cpumask_weight(ad->domains[level + 1]) > 0) { + ad->curr_level = level + 1; + sched_grid_zone_update(false); + return; + } + level++; + } +} + +static void affinity_domain_down(struct task_group *tg) +{ + struct affinity_domain *ad = &tg->auto_affinity->ad; + u16 level = ad->curr_level; + + if (ad->curr_level <= 0) + return; + + while (level > 0) { + if (!cpumask_weight(ad->domains[level - 1])) + return; + + if (IS_DOMAIN_SET(level - 1, ad->domain_mask)) { + ad->curr_level = level - 1; + sched_grid_zone_update(false); + return; + } + level--; + } +} + +static enum hrtimer_restart sched_auto_affi_period_timer(struct hrtimer *timer) +{ + struct auto_affinity *auto_affi = + container_of(timer, struct auto_affinity, period_timer); + struct task_group *tg = auto_affi->tg; + struct affinity_domain *ad = &auto_affi->ad; + struct cpumask *span = ad->domains[ad->curr_level]; + unsigned long util_avg_sum = 0; + unsigned long tg_capacity = 0; + unsigned long flags; + int cpu; + + for_each_cpu(cpu, span) { + util_avg_sum += cpu_util(cpu); + tg_capacity += capacity_of(cpu); + } + + raw_spin_lock_irqsave(&auto_affi->lock, flags); + if (util_avg_sum * 100 >= tg_capacity * sysctl_sched_util_low_pct) { + affinity_domain_up(tg); + } else if (util_avg_sum * 100 < tg_capacity * + sysctl_sched_util_low_pct / 2) { + affinity_domain_down(tg); + } + + schedstat_inc(ad->stay_cnt[ad->curr_level]); + + hrtimer_forward_now(timer, auto_affi->period); + raw_spin_unlock_irqrestore(&auto_affi->lock, flags); + return HRTIMER_RESTART; +} + +static int tg_update_affinity_domain_down(struct task_group *tg, void *data) +{ + struct auto_affinity *auto_affi = tg->auto_affinity; + struct 
affinity_domain *ad; + int *cpu_state = data; + unsigned long flags; + int i; + + if (!auto_affi) + return 0; + + ad = &tg->auto_affinity->ad; + raw_spin_lock_irqsave(&auto_affi->lock, flags); + + for (i = 0; i < ad->dcount; i++) { + if (!cpumask_test_cpu(cpu_state[0], ad->domains_orig[i])) + continue; + + /* online */ + if (cpu_state[1]) { + cpumask_set_cpu(cpu_state[0], ad->domains[i]); + } else { + cpumask_clear_cpu(cpu_state[0], ad->domains[i]); + if (!cpumask_weight(ad->domains[i])) + affinity_domain_up(tg); + } + + } + sched_grid_zone_update(false); + raw_spin_unlock_irqrestore(&auto_affi->lock, flags); + + return 0; +} + +void tg_update_affinity_domains(int cpu, int online) +{ + int cpu_state[2]; + + cpu_state[0] = cpu; + cpu_state[1] = online; + + rcu_read_lock(); + walk_tg_tree(tg_update_affinity_domain_down, tg_nop, cpu_state); + rcu_read_unlock(); +} + +void start_auto_affinity(struct auto_affinity *auto_affi) +{ + ktime_t delay_ms; + + mutex_lock(&smart_grid_used_mutex); + raw_spin_lock_irq(&auto_affi->lock); + if (auto_affi->period_active == 1) { + raw_spin_unlock_irq(&auto_affi->lock); + mutex_unlock(&smart_grid_used_mutex); + return; + } + + auto_affi->period_active = 1; + auto_affi->mode = 1; + delay_ms = ms_to_ktime(sysctl_affinity_adjust_delay_ms); + hrtimer_forward_now(&auto_affi->period_timer, delay_ms); + hrtimer_start_expires(&auto_affi->period_timer, + HRTIMER_MODE_ABS_PINNED); + raw_spin_unlock_irq(&auto_affi->lock); + + smart_grid_usage_inc(); + mutex_unlock(&smart_grid_used_mutex); +} + +void stop_auto_affinity(struct auto_affinity *auto_affi) +{ + struct affinity_domain *ad = &auto_affi->ad; + + mutex_lock(&smart_grid_used_mutex); + raw_spin_lock_irq(&auto_affi->lock); + if (auto_affi->period_active == 0) { + raw_spin_unlock_irq(&auto_affi->lock); + mutex_unlock(&smart_grid_used_mutex); + return; + } + + hrtimer_cancel(&auto_affi->period_timer); + auto_affi->period_active = 0; + auto_affi->mode = 0; + ad->curr_level = ad->dcount > 0 ? 
ad->dcount - 1 : 0; + raw_spin_unlock_irq(&auto_affi->lock); + + smart_grid_usage_dec(); + sched_grid_zone_update(false); + mutex_unlock(&smart_grid_used_mutex); +} + +static struct sched_group *sd_find_idlest_group(struct sched_domain *sd) +{ + struct sched_group *idlest = NULL, *group = sd->groups; + unsigned long min_runnable_load = ULONG_MAX; + unsigned long min_avg_load = ULONG_MAX; + int imbalance_scale = 100 + (sd->imbalance_pct-100)/2; + unsigned long imbalance = scale_load_down(NICE_0_LOAD) * + (sd->imbalance_pct-100) / 100; + + do { + unsigned long load, avg_load, runnable_load; + int i; + + avg_load = 0; + runnable_load = 0; + + for_each_cpu(i, sched_group_span(group)) { + load = cpu_runnable(cpu_rq(i)); + runnable_load += load; + avg_load += cfs_rq_load_avg(&cpu_rq(i)->cfs); + } + + avg_load = (avg_load * SCHED_CAPACITY_SCALE) / + group->sgc->capacity; + runnable_load = (runnable_load * SCHED_CAPACITY_SCALE) / + group->sgc->capacity; + + if (min_runnable_load > (runnable_load + imbalance)) { + min_runnable_load = runnable_load; + min_avg_load = avg_load; + idlest = group; + } else if ((runnable_load < (min_runnable_load + imbalance)) && + (100*min_avg_load > imbalance_scale*avg_load)) { + min_avg_load = avg_load; + idlest = group; + } + } while (group = group->next, group != sd->groups); + + return idlest ? 
idlest : group; +} + +static int group_find_idlest_cpu(struct sched_group *group) +{ + int least_loaded_cpu = cpumask_first(sched_group_span(group)); + unsigned long load, min_load = ULONG_MAX; + unsigned int min_exit_latency = UINT_MAX; + u64 latest_idle_timestamp = 0; + int shallowest_idle_cpu = -1; + int i; + + if (group->group_weight == 1) + return least_loaded_cpu; + + for_each_cpu(i, sched_group_span(group)) { + if (sched_idle_cpu(i)) + return i; + + if (available_idle_cpu(i)) { + struct rq *rq = cpu_rq(i); + struct cpuidle_state *idle = idle_get_state(rq); + + if (idle && idle->exit_latency < min_exit_latency) { + min_exit_latency = idle->exit_latency; + latest_idle_timestamp = rq->idle_stamp; + shallowest_idle_cpu = i; + } else if ((!idle || + idle->exit_latency == min_exit_latency) && + rq->idle_stamp > latest_idle_timestamp) { + latest_idle_timestamp = rq->idle_stamp; + shallowest_idle_cpu = i; + } + } else if (shallowest_idle_cpu == -1) { + load = cpu_runnable(cpu_rq(i)); + if (load < min_load) { + min_load = load; + least_loaded_cpu = i; + } + } + } + + return shallowest_idle_cpu != -1 ? 
shallowest_idle_cpu : + least_loaded_cpu; +} + +void free_affinity_domains(struct affinity_domain *ad) +{ + int i; + + for (i = 0; i < AD_LEVEL_MAX; i++) { + kfree(ad->domains[i]); + kfree(ad->domains_orig[i]); + ad->domains[i] = NULL; + ad->domains_orig[i] = NULL; + } + ad->dcount = 0; +} + +static int init_affinity_domains_orig(struct affinity_domain *ad) +{ + int i, j; + + for (i = 0; i < ad->dcount; i++) { + ad->domains_orig[i] = kmalloc(sizeof(cpumask_t), GFP_KERNEL); + if (!ad->domains_orig[i]) + goto err; + + cpumask_copy(ad->domains_orig[i], ad->domains[i]); + } + + return 0; +err: + for (j = 0; j < i; j++) { + kfree(ad->domains_orig[j]); + ad->domains_orig[j] = NULL; + } + return -ENOMEM; +} + +static int init_affinity_domains(struct affinity_domain *ad) +{ + struct sched_domain *sd = NULL, *tmp; + struct sched_group *idlest = NULL; + int ret = -ENOMEM; + int dcount = 0; + int i = 0; + int cpu; + + for (i = 0; i < AD_LEVEL_MAX; i++) { + ad->domains[i] = kmalloc(sizeof(cpumask_t), GFP_KERNEL); + if (!ad->domains[i]) + goto err; + } + + rcu_read_lock(); + cpu = cpumask_first_and(cpu_active_mask, + housekeeping_cpumask(HK_FLAG_DOMAIN)); + for_each_domain(cpu, tmp) { + sd = tmp; + dcount++; + } + + if (!sd || dcount > AD_LEVEL_MAX) { + rcu_read_unlock(); + ret = -EINVAL; + goto err; + } + + idlest = sd_find_idlest_group(sd); + cpu = group_find_idlest_cpu(idlest); + i = 0; + for_each_domain(cpu, tmp) { + cpumask_copy(ad->domains[i], sched_domain_span(tmp)); + __schedstat_set(ad->stay_cnt[i], 0); + i++; + } + rcu_read_unlock(); + + ad->dcount = dcount; + ad->curr_level = ad->dcount > 0 ? 
ad->dcount - 1 : 0; + ad->domain_mask = (1 << ad->dcount) - 1; + + ret = init_affinity_domains_orig(ad); + if (ret) + goto err; + + return 0; +err: + free_affinity_domains(ad); + return ret; +} + +int init_auto_affinity(struct task_group *tg) +{ + struct auto_affinity *auto_affi; + int ret; + + auto_affi = kzalloc(sizeof(*auto_affi), GFP_KERNEL); + if (!auto_affi) + return -ENOMEM; + + raw_spin_lock_init(&auto_affi->lock); + auto_affi->mode = 0; + auto_affi->period_active = 0; + auto_affi->period = ms_to_ktime(AUTO_AFFINITY_DEFAULT_PERIOD_MS); + hrtimer_init(&auto_affi->period_timer, CLOCK_MONOTONIC, + HRTIMER_MODE_ABS_PINNED); + auto_affi->period_timer.function = sched_auto_affi_period_timer; + + ret = init_affinity_domains(&auto_affi->ad); + if (ret) { + kfree(auto_affi); + if (ret == -EINVAL) + ret = 0; + return ret; + } + + auto_affi->tg = tg; + tg->auto_affinity = auto_affi; + INIT_LIST_HEAD(&auto_affi->af_list); + sched_grid_zone_add_af(auto_affi); + return 0; +} + +static void destroy_auto_affinity(struct task_group *tg) +{ + struct auto_affinity *auto_affi = tg->auto_affinity; + + if (unlikely(!auto_affi)) + return; + + if (auto_affi->period_active) + smart_grid_usage_dec(); + + hrtimer_cancel(&auto_affi->period_timer); + sched_grid_zone_del_af(auto_affi); + free_affinity_domains(&auto_affi->ad); + + kfree(tg->auto_affinity); + tg->auto_affinity = NULL; +} +#else +static void destroy_auto_affinity(struct task_group *tg) {} + +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY +static inline bool prefer_cpus_valid(struct task_struct *p); + +static inline struct cpumask *task_prefer_cpus(struct task_struct *p) +{ + return p->prefer_cpus; +} + +static inline int dynamic_affinity_mode(struct task_struct *p) +{ + if (!prefer_cpus_valid(p)) + return -1; + + return 0; +} +#endif +#endif + /************************************************** * CFS operations on tasks: */ @@ -7307,10 +7762,11 @@ int sysctl_sched_util_low_pct = 85; static inline bool prefer_cpus_valid(struct 
task_struct *p) { - return p->prefer_cpus && - !cpumask_empty(p->prefer_cpus) && - !cpumask_equal(p->prefer_cpus, p->cpus_ptr) && - cpumask_subset(p->prefer_cpus, p->cpus_ptr); + struct cpumask *prefer_cpus = task_prefer_cpus(p); + + return !cpumask_empty(prefer_cpus) && + !cpumask_equal(prefer_cpus, p->cpus_ptr) && + cpumask_subset(prefer_cpus, p->cpus_ptr); } static inline unsigned long taskgroup_cpu_util(struct task_group *tg, @@ -7327,7 +7783,7 @@ static inline unsigned long taskgroup_cpu_util(struct task_group *tg, /* * set_task_select_cpus: select the cpu range for task * @p: the task whose available cpu range will to set - * @idlest_cpu: the cpu which is the idlest in prefer cpus + * @idlest_cpu: the cpu which is the idlest in prefer cpus * * If sum of 'util_avg' among 'preferred_cpus' lower than the percentage * 'sysctl_sched_util_low_pct' of 'preferred_cpus' capacity, select @@ -7345,13 +7801,23 @@ static void set_task_select_cpus(struct task_struct *p, int *idlest_cpu, long min_util = INT_MIN; struct task_group *tg; long spare; - int cpu; + int cpu, mode; - p->select_cpus = p->cpus_ptr; - if (!prefer_cpus_valid(p)) + rcu_read_lock(); + mode = dynamic_affinity_mode(p); + if (mode == -1) { + rcu_read_unlock(); + return; + } else if (mode == 1) { + p->select_cpus = task_prefer_cpus(p); + if (idlest_cpu) + *idlest_cpu = cpumask_first(p->select_cpus); + sched_qos_affinity_set(p); + rcu_read_unlock(); return; + } - rcu_read_lock(); + /* manual mode */ tg = task_group(p); for_each_cpu(cpu, p->prefer_cpus) { if (idlest_cpu && (available_idle_cpu(cpu) || sched_idle_cpu(cpu))) { @@ -7419,13 +7885,13 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f time = schedstat_start_time(); /* - * required for stable ->cpus_allowed + * required for stable ->cpus_ptr */ lockdep_assert_held(&p->pi_lock); #ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY p->select_cpus = p->cpus_ptr; - if (dynamic_affinity_used()) + if
(dynamic_affinity_used() || smart_grid_used()) set_task_select_cpus(p, &idlest_cpu, sd_flag); #endif @@ -9016,7 +9482,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) #ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY p->select_cpus = p->cpus_ptr; - if (dynamic_affinity_used()) + if (dynamic_affinity_used() || smart_grid_used()) set_task_select_cpus(p, NULL, 0); if (!cpumask_test_cpu(env->dst_cpu, p->select_cpus)) { #else @@ -12926,6 +13392,7 @@ void free_fair_sched_group(struct task_group *tg) int i; destroy_cfs_bandwidth(tg_cfs_bandwidth(tg)); + destroy_auto_affinity(tg); for_each_possible_cpu(i) { #ifdef CONFIG_QOS_SCHED @@ -12949,7 +13416,7 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) { struct sched_entity *se; struct cfs_rq *cfs_rq; - int i; + int i, ret; tg->cfs_rq = kcalloc(nr_cpu_ids, sizeof(cfs_rq), GFP_KERNEL); if (!tg->cfs_rq) @@ -12961,6 +13428,9 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) tg->shares = NICE_0_LOAD; init_cfs_bandwidth(tg_cfs_bandwidth(tg)); + ret = init_auto_affinity(tg); + if (ret) + goto err; for_each_possible_cpu(i) { cfs_rq = kzalloc_node(sizeof(struct cfs_rq), @@ -12983,6 +13453,7 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) err_free_rq: kfree(cfs_rq); err: + destroy_auto_affinity(tg); return 0; } diff --git a/kernel/sched/grid/Makefile b/kernel/sched/grid/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..82f2a09c3c309e4185370d7eaf572f8a4ec2ab0f --- /dev/null +++ b/kernel/sched/grid/Makefile @@ -0,0 +1,2 @@ +# SPDX-License-Identifier: GPL-2.0 +obj-$(CONFIG_QOS_SCHED_SMART_GRID) += qos.o power.o stat.o diff --git a/kernel/sched/grid/internal.h b/kernel/sched/grid/internal.h new file mode 100644 index 0000000000000000000000000000000000000000..743f72aaffbfc27a157b232cc00b303c566099d6 --- /dev/null +++ b/kernel/sched/grid/internal.h @@ -0,0 +1,6 @@ +/* SPDX-License-Identifier: GPL-2.0 */ 
+#ifndef _LINUX_SCHED_SMART_GRID_INTERNAL_H +#define _LINUX_SCHED_SMART_GRID_INTERNAL_H +void qos_power_init(struct sched_grid_qos_power *power); +void qos_stat_init(struct sched_grid_qos_stat *stat); +#endif diff --git a/kernel/sched/grid/power.c b/kernel/sched/grid/power.c new file mode 100644 index 0000000000000000000000000000000000000000..f916cd3801ad73964f286e9997191848467c98ab --- /dev/null +++ b/kernel/sched/grid/power.c @@ -0,0 +1,27 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * Common code for QOS-aware smart grid Scheduling + * + * Copyright (C) 2023-2024 Huawei Technologies Co., Ltd + * + * Author: Wang Shaobo + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + */ +#include +#include "internal.h" + +void qos_power_init(struct sched_grid_qos_power *power) +{ + power->cpufreq_sense_ratio = 0; + power->target_cpufreq = 0; + power->cstate_sense_ratio = 0; +} diff --git a/kernel/sched/grid/qos.c b/kernel/sched/grid/qos.c new file mode 100644 index 0000000000000000000000000000000000000000..981ee0c828f2b343288f09d2dbb588da480582f3 --- /dev/null +++ b/kernel/sched/grid/qos.c @@ -0,0 +1,268 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * Common code for Smart Grid Scheduling + * + * Copyright (C) 2023-2024 Huawei Technologies Co., Ltd + * + * Author: Wang Shaobo + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. 
+ * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + */ +#include +#include +#include +#include +#include +#include +#include "internal.h" +#include <../kernel/sched/sched.h> + +static inline int qos_affinity_set(struct task_struct *p) +{ + int n; + struct sched_grid_qos_affinity *affinity = &p->grid_qos->affinity; + + if (likely(affinity->prefer_cpus == p->select_cpus)) + return 0; + + /* + * We want the memory allocation to be as close to the CPU + * as possible, and adjust after getting memory bandwidth usage. + */ + for (n = 0; n < nr_node_ids; n++) { + if (cpumask_intersects(cpumask_of_node(n), p->select_cpus)) + node_set(n, affinity->mem_preferred_node_mask); + else + node_clear(n, affinity->mem_preferred_node_mask); + } + + affinity->prefer_cpus = p->select_cpus; + return 0; +} + +int sched_grid_qos_fork(struct task_struct *p, struct task_struct *orig) +{ + struct sched_grid_qos *qos; + + qos = kzalloc(sizeof(*qos), GFP_KERNEL); + if (!qos) + return -ENOMEM; + + qos_power_init(&qos->power); + qos_stat_init(&qos->stat); + + nodes_clear(qos->affinity.mem_preferred_node_mask); + if (likely(orig->grid_qos)) + qos->affinity = orig->grid_qos->affinity; + qos->affinity_set = qos_affinity_set; + p->grid_qos = qos; + + return 0; +} + +void sched_grid_qos_free(struct task_struct *p) +{ + kfree(p->grid_qos); + p->grid_qos = NULL; +} + +/* dynamic select a more appropriate preferred interleave nid for process */ +int sched_grid_preferred_interleave_nid(struct mempolicy *policy) +{ +#ifndef CONFIG_NUMA + return NUMA_NO_NODE; +#else + nodemask_t nmask; + unsigned int next; + struct task_struct *me = current; + nodemask_t *preferred_nmask = NULL; + + if (likely(me->grid_qos)) + preferred_nmask = + &me->grid_qos->affinity.mem_preferred_node_mask; + + if (!preferred_nmask || 
!policy) + return NUMA_NO_NODE; + + if (nodes_equal(policy->v.nodes, *preferred_nmask)) + return NUMA_NO_NODE; + /* + * We perceive the actual consumption of memory bandwidth + * in each node and post a preferred interleave nid in + * more appropriate range. + */ + nodes_and(nmask, policy->v.nodes, *preferred_nmask); + if (nodes_empty(nmask)) + return NUMA_NO_NODE; + + next = next_node_in(me->il_prev, nmask); + if (next < MAX_NUMNODES) + me->il_prev = next; + return next; +#endif +} + +/* dynamically select a more appropriate preferred nid for process */ +int sched_grid_preferred_nid(int preferred_nid, nodemask_t *nodemask) +{ + int nd = preferred_nid; + nodemask_t nmask, ndmask; + nodemask_t *preferred_nmask = NULL; + + if (likely(current->grid_qos)) + preferred_nmask = + &current->grid_qos->affinity.mem_preferred_node_mask; + + if (!preferred_nmask) + return preferred_nid; + + /* + * We perceive the actual consumption of memory bandwidth + * in each node and post a preferred nid in more appropriate + * range. + */ + nmask = *preferred_nmask; + if (nodemask) { + if (nodes_equal(*nodemask, nmask)) + return preferred_nid; + + nodes_and(nmask, nmask, *nodemask); + } + + if (node_isset(preferred_nid, nmask)) + return preferred_nid; + + /* + * We prefer the numa node we're running, if there is no limit + * to nodemask, we select preferred nid in preferred range or + * in restricted range if not.
+ */ + init_nodemask_of_node(&ndmask, numa_node_id()); + nodes_and(ndmask, nmask, ndmask); + if (!nodes_empty(ndmask)) + nd = first_node(ndmask); + else if (!nodes_empty(nmask)) + nd = first_node(nmask); + + return nd; +} + +static struct sched_grid_zone sg_zone; + +int __init sched_grid_zone_init(void) +{ + int index; + + for (index = 0; index < SMART_GRID_ZONE_NR; index++) + cpumask_clear(&sg_zone.cpus[index]); + + raw_spin_lock_init(&sg_zone.lock); + INIT_LIST_HEAD(&sg_zone.af_list_head); + return 0; +} + +int sched_grid_zone_update(bool is_locked) +{ + struct list_head *pos; + struct auto_affinity *af_pos; + unsigned long flags; + + if (!is_locked) + raw_spin_lock_irqsave(&sg_zone.lock, flags); + + cpumask_clear(&sg_zone.cpus[SMART_GRID_ZONE_HOT]); + + list_for_each(pos, &sg_zone.af_list_head) { + af_pos = list_entry(pos, struct auto_affinity, af_list); + + /* when smart_grid not used we need calculate all task_group */ + /* when smart_grid used we only calculate enabled task_group */ + if (smart_grid_used() && af_pos->mode == 0) + continue; + + cpumask_or(&sg_zone.cpus[SMART_GRID_ZONE_HOT], &sg_zone.cpus[SMART_GRID_ZONE_HOT], + af_pos->ad.domains[af_pos->ad.curr_level]); + } + + cpumask_complement(&sg_zone.cpus[SMART_GRID_ZONE_WARM], + &sg_zone.cpus[SMART_GRID_ZONE_HOT]); + + if (!is_locked) + raw_spin_unlock_irqrestore(&sg_zone.lock, flags); + + cpufreq_smart_grid_start_sync(); + return 0; +} + +int sched_grid_zone_add_af(struct auto_affinity *af) +{ + unsigned long flags; + + if (af == NULL) + return -1; + + raw_spin_lock_irqsave(&sg_zone.lock, flags); + list_add_tail(&af->af_list, &sg_zone.af_list_head); + sched_grid_zone_update(true); + raw_spin_unlock_irqrestore(&sg_zone.lock, flags); + return 0; +} + +int sched_grid_zone_del_af(struct auto_affinity *af) +{ + unsigned long flags; + + if (af == NULL) + return -1; + + raw_spin_lock_irqsave(&sg_zone.lock, flags); + list_del(&af->af_list); + sched_grid_zone_update(true); + 
raw_spin_unlock_irqrestore(&sg_zone.lock, flags); + return 0; +} + +struct cpumask *sched_grid_zone_cpumask(enum sg_zone_type zone) +{ + if (zone >= SMART_GRID_ZONE_NR) + return NULL; + + return &sg_zone.cpus[zone]; +} + +/* + * The smart_grid strategy is disabled by default (= 0). + * For compatibility with the previous version of the code, + * all tasks are treated as the highest qos_level (class_lvl = 0) + * while the smart_grid strategy is disabled. + * Otherwise, when the smart_grid strategy is enabled, each task's + * actual class_lvl is used. + */ +unsigned int sysctl_smart_grid_strategy_ctrl; + +struct cpumask *sched_grid_prefer_cpus(struct task_struct *p) +{ + enum sg_zone_type current_zone; + + /* + * when smart_grid strategy was disabled, + * We make all the task to the highest qos_level (class_lvl = 0) + */ + if (sysctl_smart_grid_strategy_ctrl == 0) { + current_zone = SMART_GRID_ZONE_HOT; + } else { + /* Only place highest level task into hot zone */ + current_zone = p->grid_qos->stat.class_lvl == SCHED_GRID_QOS_TASK_LEVEL_HIGHEST ? + SMART_GRID_ZONE_HOT : SMART_GRID_ZONE_WARM; + } + + return &sg_zone.cpus[current_zone]; +} diff --git a/kernel/sched/grid/stat.c b/kernel/sched/grid/stat.c new file mode 100644 index 0000000000000000000000000000000000000000..68bbc060b8110e17381ade5b8cf9d2c340360945 --- /dev/null +++ b/kernel/sched/grid/stat.c @@ -0,0 +1,47 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * Common code for QOS-aware smart grid Scheduling + * + * Copyright (C) 2023-2024 Huawei Technologies Co., Ltd + * + * Author: Wang Shaobo + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE.
See the GNU General Public License for + * more details. + * + */ +#include +#include "internal.h" + +static int qos_stat_set_class_level(struct sched_grid_qos_stat *qos_stat, int level) +{ + if (qos_stat == NULL || level >= SCHED_GRID_QOS_TASK_LEVEL_MAX) + return -EINVAL; + + qos_stat->class_lvl = level; + return 0; +} + +void qos_stat_init(struct sched_grid_qos_stat *stat) +{ + if (stat == NULL) + return; + + stat->sample[SCHED_GRID_QOS_IPS_INDEX].name = "ips"; + stat->sample[SCHED_GRID_QOS_IPS_INDEX].index = SCHED_GRID_QOS_IPS_INDEX; + stat->sample[SCHED_GRID_QOS_MEMBOUND_RATIO_INDEX].name = "membound_ratio"; + stat->sample[SCHED_GRID_QOS_MEMBOUND_RATIO_INDEX].index = + SCHED_GRID_QOS_MEMBOUND_RATIO_INDEX; + stat->sample[SCHED_GRID_QOS_MEMBANDWIDTH_INDEX].name = "memband_width"; + stat->sample[SCHED_GRID_QOS_MEMBANDWIDTH_INDEX].index = + SCHED_GRID_QOS_MEMBANDWIDTH_INDEX; + + stat->set_class_lvl = qos_stat_set_class_level; + stat->class_lvl = SCHED_GRID_QOS_TASK_LEVEL_DEFAULT; +} diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 91ae933c20be27131059570f5003a6fcaf458d31..bdf1a9b1dda4b1799d9ce9c1a15b457a1c61b741 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -400,6 +400,35 @@ struct cfs_bandwidth { #endif }; + +#ifdef CONFIG_QOS_SCHED_SMART_GRID +#define AD_LEVEL_MAX 8 + +struct affinity_domain { + int dcount; + int curr_level; + u32 domain_mask; +#ifdef CONFIG_SCHEDSTATS + u64 stay_cnt[AD_LEVEL_MAX]; +#endif + struct cpumask *domains[AD_LEVEL_MAX]; + struct cpumask *domains_orig[AD_LEVEL_MAX]; +}; +#endif + +struct auto_affinity { +#ifdef CONFIG_QOS_SCHED_SMART_GRID + raw_spinlock_t lock; + u64 mode; + ktime_t period; + struct hrtimer period_timer; + int period_active; + struct affinity_domain ad; + struct task_group *tg; + struct list_head af_list; +#endif +}; + /* Task group related information */ struct task_group { struct cgroup_subsys_state css; @@ -471,7 +500,11 @@ struct task_group { #else KABI_RESERVE(3) #endif +#if 
defined(CONFIG_QOS_SCHED_SMART_GRID) && !defined(__GENKSYMS__) + KABI_USE(4, struct auto_affinity *auto_affinity) +#else KABI_RESERVE(4) +#endif }; #ifdef CONFIG_FAIR_GROUP_SCHED @@ -542,6 +575,21 @@ extern void sched_offline_group(struct task_group *tg); extern void sched_move_task(struct task_struct *tsk); +#ifdef CONFIG_QOS_SCHED_SMART_GRID +extern void start_auto_affinity(struct auto_affinity *auto_affi); +extern void stop_auto_affinity(struct auto_affinity *auto_affi); +extern int init_auto_affinity(struct task_group *tg); +extern void tg_update_affinity_domains(int cpu, int online); + +#else +static inline int init_auto_affinity(struct task_group *tg) +{ + return 0; +} + +static inline void tg_update_affinity_domains(int cpu, int online) {} +#endif + #ifdef CONFIG_FAIR_GROUP_SCHED extern int sched_group_set_shares(struct task_group *tg, unsigned long shares); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index edb80160491fdb5dba27701ae75c275c2586deaf..f3f43b2def7f243653d73bda4d5cec29d0c44bbf 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -125,7 +125,7 @@ static int one_thousand = 1000; #ifdef CONFIG_PRINTK static int ten_thousand = 10000; #endif -#ifdef CONFIG_QOS_SCHED +#if defined(CONFIG_QOS_SCHED) || defined(CONFIG_QOS_SCHED_SMART_GRID) static int hundred_thousand = 100000; #endif #ifdef CONFIG_PERF_EVENTS @@ -2802,6 +2802,26 @@ static struct ctl_table kern_table[] = { .extra1 = SYSCTL_ZERO, .extra2 = &one_hundred, }, +#endif +#ifdef CONFIG_QOS_SCHED_SMART_GRID + { + .procname = "smart_grid_strategy_ctrl", + .data = &sysctl_smart_grid_strategy_ctrl, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, + { + .procname = "affinity_adjust_delay_ms", + .data = &sysctl_affinity_adjust_delay_ms, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = &hundred_thousand, + }, #endif { } }; diff 
--git a/mm/mempolicy.c b/mm/mempolicy.c index f83491d21656d553f19d1fab07127f05f58bc27e..659c6f0d146e777e6278a04941aa48d81a67ae6a 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -79,6 +79,7 @@ #include #include #include +#include #include #include #include @@ -2373,7 +2374,14 @@ struct page *alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma, if (pol->mode == MPOL_INTERLEAVE) { unsigned nid; - nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order); + if (smart_grid_used()) { + nid = sched_grid_preferred_interleave_nid(pol); + nid = (nid == NUMA_NO_NODE) ? + interleave_nid(pol, vma, addr, PAGE_SHIFT + order) : nid; + } else { + nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order); + } + mpol_cond_put(pol); page = alloc_page_interleave(gfp, order, nid); goto out; @@ -2427,6 +2435,8 @@ struct page *alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma, nmask = policy_nodemask(gfp, pol); preferred_nid = policy_node(gfp, pol, node); + if (smart_grid_used()) + preferred_nid = sched_grid_preferred_nid(preferred_nid, nmask); page = __alloc_pages(gfp, order, preferred_nid, nmask); mark_vma_cdm(nmask, page, vma); mpol_cond_put(pol);