diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index eb30ef59aca296a7cb2e581c476f0eddc444173c..4ba485650d0a2d132f1e4ceb22de895aa91266f7 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -80,6 +80,7 @@ config ARM64
 	select ARCH_SUPPORTS_NUMA_BALANCING
 	select ARCH_SUPPORTS_SCHED_KEEP_ON_CORE
 	select ARCH_SUPPORTS_SCHED_PARAL
+	select ARCH_SUPPORTS_SCHED_SOFT_QUOTA
 	select ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
 	select ARCH_WANT_COMPAT_IPC_PARSE_VERSION if COMPAT
 	select ARCH_WANT_DEFAULT_BPF_JIT
diff --git a/arch/arm64/configs/openeuler_defconfig b/arch/arm64/configs/openeuler_defconfig
index be1faf2da0081c68a98b174e774930268ce26833..1e1e70a6736d590b10921b643787f65c736ff09e 100644
--- a/arch/arm64/configs/openeuler_defconfig
+++ b/arch/arm64/configs/openeuler_defconfig
@@ -191,6 +191,7 @@ CONFIG_NET_NS=y
 CONFIG_SCHED_STEAL=y
 CONFIG_SCHED_KEEP_ON_CORE=y
 CONFIG_SCHED_PARAL=y
+CONFIG_SCHED_SOFT_QUOTA=y
 CONFIG_CHECKPOINT_RESTORE=y
 CONFIG_SCHED_AUTOGROUP=y
 # CONFIG_SYSFS_DEPRECATED is not set
diff --git a/arch/arm64/kernel/topology.c b/arch/arm64/kernel/topology.c
index 785de5b9696d742442d3bc95e0ca8f588155c154..b3ae5c6de81e3dd4e034223ff359c6d47f457b1f 100644
--- a/arch/arm64/kernel/topology.c
+++ b/arch/arm64/kernel/topology.c
@@ -363,6 +363,38 @@ void topology_scale_freq_tick(void)
 	this_cpu_write(arch_const_cycles_prev, const_cnt);
 }

+#ifdef CONFIG_SCHED_SOFT_QUOTA
+static DEFINE_PER_CPU(int, sibling_idle) = 1;
+
+int is_sibling_idle(void)
+{
+	return this_cpu_read(sibling_idle);
+}
+
+static void smt_measurement_begin(void)
+{
+	// TODO
+}
+
+static void smt_measurement_done(void)
+{
+	// TODO
+}
+#else
+static inline void smt_measurement_begin(void) { }
+static inline void smt_measurement_done(void) { }
+#endif
+
+void arch_cpu_idle_enter(void)
+{
+	smt_measurement_begin();
+}
+
+void arch_cpu_idle_exit(void)
+{
+	smt_measurement_done();
+}
+
 #ifdef CONFIG_ACPI_CPPC_LIB
 #include <acpi/cppc_acpi.h>
diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h
index 9f998be56bdd888583c719494d34e06f40dcedc4..90021477ea4ca063385a70c15dfc9cacbd5387ae 100644
--- a/include/linux/sched/sysctl.h
+++ b/include/linux/sched/sysctl.h
@@ -48,6 +48,10 @@ extern unsigned int sysctl_smart_grid_strategy_ctrl;
 extern int sysctl_affinity_adjust_delay_ms;
 #endif

+#ifdef CONFIG_SCHED_SOFT_QUOTA
+extern unsigned int sysctl_soft_runtime_ratio;
+#endif
+
 enum sched_tunable_scaling {
 	SCHED_TUNABLESCALING_NONE,
 	SCHED_TUNABLESCALING_LOG,
diff --git a/init/Kconfig b/init/Kconfig
index 5f88cce193e834bc7f47dd581ad480a771b1463b..2ee50c638ca3595ef0a963a58db48144d164de2e 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1411,6 +1411,24 @@ config SCHED_PARAL
 	  3. The existing "qos dynamic affinity" and "qos smart grid"
 	     features must not be used simultaneously.

+#
+# For architectures that want to enable the support for SCHED_SOFT_QUOTA
+#
+config ARCH_SUPPORTS_SCHED_SOFT_QUOTA
+	bool
+
+config SCHED_SOFT_QUOTA
+	bool "More flexible use of CPU quota"
+	depends on ARCH_SUPPORTS_SCHED_SOFT_QUOTA
+	depends on CFS_BANDWIDTH
+	default n
+	help
+	  This option allows task groups to run beyond their CPU quota when
+	  the CPU would otherwise sit idle. Users should have a basic
+	  understanding of CFS_BANDWIDTH before enabling it. It must not be
+	  used where strict enforcement of CPU quota is required, for
+	  example commercial scenarios that bill by the CPU quota consumed.
+
 config CHECKPOINT_RESTORE
 	bool "Checkpoint/restore support"
 	select PROC_CHILDREN
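For reference, here is a minimal userspace sketch (not part of the patch) of how the new knobs could be exercised once CONFIG_SCHED_SOFT_QUOTA is enabled. The cgroup v1 mount point and the group name "demo" are assumptions for illustration; cpu.soft_quota and /proc/sys/kernel/sched_soft_runtime_ratio are the interfaces added by this patch.

/*
 * Usage sketch (assumed paths): enable soft quota for one group and raise
 * the borrowable share from the default 20% to 30% of quota per period.
 */
#include <stdio.h>

static int write_str(const char *path, const char *val)
{
	FILE *f = fopen(path, "w");

	if (!f)
		return -1;
	fprintf(f, "%s\n", val);
	return fclose(f);
}

int main(void)
{
	/* Allow this group to borrow otherwise-idle CPU time. */
	if (write_str("/sys/fs/cgroup/cpu/demo/cpu.soft_quota", "1"))
		return 1;
	/* Cap borrowed time at 30% of the group's quota per period. */
	if (write_str("/proc/sys/kernel/sched_soft_runtime_ratio", "30"))
		return 1;
	return 0;
}

Per the handlers below, writing any value other than 0 or 1 to cpu.soft_quota returns -EINVAL, and the sysctl is clamped to the range 1..100.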
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 457eeebc7b62fafff468797b223fb5b15b445579..72cb2c1adb7ba115d5d83680c7b36f9560d47840 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -9902,6 +9902,30 @@ static int cpu_steal_task_write(struct cgroup_subsys_state *css,
 }
 #endif

+#ifdef CONFIG_SCHED_SOFT_QUOTA
+static int cpu_soft_quota_write(struct cgroup_subsys_state *css,
+				struct cftype *cftype, s64 soft_quota)
+{
+	struct task_group *tg = css_tg(css);
+
+	if (soft_quota != 1 && soft_quota != 0)
+		return -EINVAL;
+
+	if (tg->soft_quota == soft_quota)
+		return 0;
+
+	tg->soft_quota = soft_quota;
+
+	return 0;
+}
+
+static inline s64 cpu_soft_quota_read(struct cgroup_subsys_state *css,
+				      struct cftype *cft)
+{
+	return css_tg(css)->soft_quota;
+}
+#endif
+
 #ifdef CONFIG_BPF_SCHED
 void sched_settag(struct task_struct *tsk, s64 tag)
 {
@@ -10064,6 +10088,14 @@ static struct cftype cpu_legacy_files[] = {
 		.write_s64 = cpu_qos_write,
 	},
 #endif
+#ifdef CONFIG_SCHED_SOFT_QUOTA
+	{
+		.name = "soft_quota",
+		.flags = CFTYPE_NOT_ON_ROOT,
+		.read_s64 = cpu_soft_quota_read,
+		.write_s64 = cpu_soft_quota_write,
+	},
+#endif
 #ifdef CONFIG_QOS_SCHED_SMT_EXPELLER
 	{
 		.name = "smt_expell",
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index b7544a14225c90bebe2f771b35d8ea16e33ff68e..f601a6ea031e0324028eb652e1ac3c7b33c7b211 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -140,6 +140,10 @@ static int unthrottle_qos_cfs_rqs(int cpu);
 static bool qos_smt_expelled(int this_cpu);
 #endif

+#ifdef CONFIG_SCHED_SOFT_QUOTA
+static DEFINE_PER_CPU_SHARED_ALIGNED(struct list_head, soft_quota_throttled_cfs_rq);
+#endif
+
 #ifdef CONFIG_QOS_SCHED_MULTILEVEL
 #define QOS_LEVEL_WEIGHT_OFFLINE_EX 1
 #define QOS_LEVEL_WEIGHT_OFFLINE 10
@@ -439,10 +443,11 @@ static inline struct sched_entity *parent_entity(struct sched_entity *se)
 	return se->parent;
 }

-static void
+static bool
 find_matching_se(struct sched_entity **se, struct sched_entity **pse)
 {
 	int se_depth, pse_depth;
+	bool ret = false;

 	/*
 	 * preemption test can be made between sibling entities who are in the
@@ -456,6 +461,10 @@ find_matching_se(struct sched_entity **se, struct sched_entity **pse)
 	pse_depth = (*pse)->depth;

 	while (se_depth > pse_depth) {
+#ifdef CONFIG_SCHED_SOFT_QUOTA
+		if (!ret && cfs_rq_of(*se)->soft_quota_enable == 1)
+			ret = true;
+#endif
 		se_depth--;
 		*se = parent_entity(*se);
 	}
@@ -466,9 +475,15 @@ find_matching_se(struct sched_entity **se, struct sched_entity **pse)
 	}

 	while (!is_same_group(*se, *pse)) {
+#ifdef CONFIG_SCHED_SOFT_QUOTA
+		if (!ret && cfs_rq_of(*se)->soft_quota_enable == 1)
+			ret = true;
+#endif
 		*se = parent_entity(*se);
 		*pse = parent_entity(*pse);
 	}
+
+	return ret;
 }

 #else	/* !CONFIG_FAIR_GROUP_SCHED */
@@ -503,9 +518,10 @@ static inline struct sched_entity *parent_entity(struct sched_entity *se)
 	return NULL;
 }

-static inline void
+static inline bool
 find_matching_se(struct sched_entity **se, struct sched_entity **pse)
 {
+	return false;
 }

 #endif	/* CONFIG_FAIR_GROUP_SCHED */
@@ -5396,6 +5412,14 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq)
 	 */
 	cfs_rq->throttled = 1;
 	cfs_rq->throttled_clock = rq_clock(rq);
+
+#ifdef CONFIG_SCHED_SOFT_QUOTA
+	if (cfs_rq->tg->soft_quota == 1) {
+		list_add(&cfs_rq->soft_quota_throttled_list,
+			 &per_cpu(soft_quota_throttled_cfs_rq, cpu_of(rq)));
+	}
+#endif
+
 	return true;
 }
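A standalone sketch (not kernel code) of what the find_matching_se() change above records: while the currently running entity and the waking entity are walked up to a common level, the helper latches whether it crossed a cfs_rq that is presently running on borrowed ("soft") quota, so check_preempt_wakeup() can preempt the borrower at once. All types and names below are simplified stand-ins.

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

struct toy_cfs_rq { int soft_quota_enable; };

struct toy_se {
	int depth;
	struct toy_se *parent;
	struct toy_cfs_rq *cfs_rq;	/* queue this entity runs on */
};

/* Mirrors the patched helper: true if the running side crossed a borrowing cfs_rq. */
static bool toy_find_matching_se(struct toy_se **se, struct toy_se **pse)
{
	bool ret = false;

	while ((*se)->depth > (*pse)->depth) {
		if (!ret && (*se)->cfs_rq->soft_quota_enable == 1)
			ret = true;
		*se = (*se)->parent;
	}
	while ((*pse)->depth > (*se)->depth)
		*pse = (*pse)->parent;

	while (*se != *pse && (*se)->parent && (*pse)->parent) {
		if (!ret && (*se)->cfs_rq->soft_quota_enable == 1)
			ret = true;
		*se = (*se)->parent;
		*pse = (*pse)->parent;
	}
	return ret;
}

int main(void)
{
	struct toy_cfs_rq root_rq = { 0 }, borrow_rq = { 1 }, sibling_rq = { 0 };
	struct toy_se root = { 0, NULL, &root_rq };
	struct toy_se curr = { 1, &root, &borrow_rq };	/* running on borrowed time */
	struct toy_se waker = { 1, &root, &sibling_rq };	/* newly woken task */
	struct toy_se *se = &curr, *pse = &waker;

	printf("preempt borrower: %d\n", toy_find_matching_se(&se, &pse));
	return 0;
}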
@@ -5414,6 +5438,10 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)

 	se = cfs_rq->tg->se[cpu_of(rq)];

+#ifdef CONFIG_SCHED_SOFT_QUOTA
+	list_del_init(&cfs_rq->soft_quota_throttled_list);
+#endif
+
 #ifdef CONFIG_QOS_SCHED
 	/*
 	 * if this cfs_rq throttled by qos, not need unthrottle it.
@@ -5531,6 +5559,16 @@ static void distribute_cfs_runtime(struct cfs_bandwidth *cfs_b)
 		struct rq_flags rf;

 		rq_lock_irqsave(rq, &rf);
+
+#ifdef CONFIG_SCHED_SOFT_QUOTA
+		if (cfs_rq->soft_quota_enable == 1) {
+			if (cfs_rq->runtime_remaining > 0)
+				cfs_rq->runtime_remaining = 0;
+
+			cfs_rq->soft_quota_enable = 0;
+		}
+#endif
+
 		if (!cfs_rq_throttled(cfs_rq))
 			goto next;
@@ -5573,6 +5611,17 @@ static void distribute_cfs_runtime(struct cfs_bandwidth *cfs_b)
 	rcu_read_unlock();
 }

+#ifdef CONFIG_SCHED_SOFT_QUOTA
+static inline void init_tg_sum_soft_runtime(struct cfs_bandwidth *cfs_b)
+{
+	unsigned int cpu;
+	struct task_group *tg = container_of(cfs_b, struct task_group, cfs_bandwidth);
+
+	for_each_possible_cpu(cpu)
+		tg->cfs_rq[cpu]->sum_soft_runtime = 0;
+}
+#endif
+
 /*
  * Responsible for refilling a task_group's bandwidth and unthrottling its
  * cfs_rqs as appropriate. If there has been no activity within the last
@@ -5590,6 +5639,10 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun, u
 	throttled = !list_empty(&cfs_b->throttled_cfs_rq);
 	cfs_b->nr_periods += overrun;

+#ifdef CONFIG_SCHED_SOFT_QUOTA
+	init_tg_sum_soft_runtime(cfs_b);
+#endif
+
 	/* Refill extra burst quota even if cfs_b->idle */
 	__refill_cfs_bandwidth_runtime(cfs_b);
@@ -5898,6 +5951,9 @@ static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
 #ifdef CONFIG_QOS_SCHED
 	INIT_LIST_HEAD(&cfs_rq->qos_throttled_list);
 #endif
+#ifdef CONFIG_SCHED_SOFT_QUOTA
+	INIT_LIST_HEAD(&cfs_rq->soft_quota_throttled_list);
+#endif
 }

 void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
@@ -8536,6 +8592,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
 	struct cfs_rq *cfs_rq = task_cfs_rq(curr);
 	int scale = cfs_rq->nr_running >= sched_nr_latency;
 	int next_buddy_marked = 0;
+	bool ret = false;

 	if (unlikely(se == pse))
 		return;
@@ -8590,7 +8647,13 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
 	if (unlikely(p->policy != SCHED_NORMAL) || !sched_feat(WAKEUP_PREEMPTION))
 		return;

-	find_matching_se(&se, &pse);
+	ret = find_matching_se(&se, &pse);
+
+#ifdef CONFIG_SCHED_SOFT_QUOTA
+	if (ret)
+		goto preempt;
+#endif
+
 	update_curr(cfs_rq_of(se));
 	BUG_ON(!pse);
 	if (wakeup_preempt_entity(se, pse) == 1) {
@@ -13823,6 +13886,9 @@ static void task_change_group_fair(struct task_struct *p, int type)
 void free_fair_sched_group(struct task_group *tg)
 {
 	int i;
+#ifdef CONFIG_SCHED_SOFT_QUOTA
+	struct cfs_rq *cfs_rq;
+#endif

 	destroy_cfs_bandwidth(tg_cfs_bandwidth(tg));
 	destroy_auto_affinity(tg);
@@ -13831,6 +13897,12 @@ void free_fair_sched_group(struct task_group *tg)
 #ifdef CONFIG_QOS_SCHED
 		if (tg->cfs_rq && tg->cfs_rq[i])
 			unthrottle_qos_sched_group(tg->cfs_rq[i]);
+#endif
+#ifdef CONFIG_SCHED_SOFT_QUOTA
+		if (tg->cfs_rq && tg->cfs_rq[i]) {
+			cfs_rq = tg->cfs_rq[i];
+			list_del_init(&cfs_rq->soft_quota_throttled_list);
+		}
 #endif
 		if (tg->cfs_rq)
 			kfree(tg->cfs_rq[i]);
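A rough numeric model (not kernel code) of the per-period accounting above: sum_soft_runtime is cleared at every bandwidth refresh, each grant from the idle path adds one slice, and borrowing stops once the total would reach sysctl_soft_runtime_ratio percent of the quota; any leftover borrowed runtime is clawed back in distribute_cfs_runtime(). The quota, slice, and ratio values below are illustrative.

#include <stdio.h>

#define NSEC_PER_USEC 1000ULL

int main(void)
{
	unsigned long long quota = 100000 * NSEC_PER_USEC;	/* 100 ms of quota per period */
	unsigned long long slice = 5000 * NSEC_PER_USEC;	/* one bandwidth slice, 5 ms */
	unsigned int ratio = 20;				/* sched_soft_runtime_ratio default */
	unsigned long long sum_soft_runtime = 0;
	int grants = 0;

	/* Same shape as check_soft_runtime(): stop before borrowing reaches ratio% of quota. */
	while (sum_soft_runtime + slice < ratio * quota / 100) {
		sum_soft_runtime += slice;
		grants++;
	}
	printf("extra slices per period: %d (%.1f ms)\n",
	       grants, sum_soft_runtime / 1e6);
	return 0;
}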
@@ -14209,13 +14281,20 @@ void task_tick_relationship(struct rq *rq, struct task_struct *curr)

 __init void init_sched_fair_class(void)
 {
-#ifdef CONFIG_QOS_SCHED
+#if defined(CONFIG_QOS_SCHED) || defined(CONFIG_SCHED_SOFT_QUOTA)
 	int i;
+#endif
+
+#ifdef CONFIG_QOS_SCHED
 	for_each_possible_cpu(i)
 		INIT_LIST_HEAD(&per_cpu(qos_throttled_cfs_rq, i));
 #endif
+#ifdef CONFIG_SCHED_SOFT_QUOTA
+	for_each_possible_cpu(i)
+		INIT_LIST_HEAD(&per_cpu(soft_quota_throttled_cfs_rq, i));
+#endif
+
 	init_sched_numa_icon();

 #ifdef CONFIG_SMP
@@ -14327,3 +14406,59 @@ int sched_trace_rq_nr_running(struct rq *rq)
 	return rq ? rq->nr_running : -1;
 }
 EXPORT_SYMBOL_GPL(sched_trace_rq_nr_running);
+
+#ifdef CONFIG_SCHED_SOFT_QUOTA
+unsigned int sysctl_soft_runtime_ratio = 20;
+static bool check_soft_runtime(struct task_group *tg, int slice)
+{
+	int cpu;
+	u64 sum_soft_runtime = slice;
+	struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
+
+	if (cfs_b->quota == RUNTIME_INF)
+		return true;
+
+	for_each_possible_cpu(cpu)
+		sum_soft_runtime += tg->cfs_rq[cpu]->sum_soft_runtime;
+
+	return sum_soft_runtime < sysctl_soft_runtime_ratio * cfs_b->quota / 100;
+}
+
+bool unthrottle_cfs_rq_soft_quota(struct rq *rq)
+{
+	int max_cnt = 0;
+	bool ret = false;
+	struct cfs_rq *cfs_rq, *tmp_rq;
+	struct cfs_bandwidth *cfs_b;
+	int slice = sched_cfs_bandwidth_slice();
+
+	list_for_each_entry_safe(cfs_rq, tmp_rq, &per_cpu(soft_quota_throttled_cfs_rq, cpu_of(rq)),
+				 soft_quota_throttled_list) {
+		if (max_cnt++ > 20)
+			break;
+
+		if (cfs_rq->throttled) {
+			cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
+			raw_spin_lock(&cfs_b->lock);
+
+			if (!check_soft_runtime(cfs_rq->tg, slice)) {
+				raw_spin_unlock(&cfs_b->lock);
+				continue;
+			}
+
+			raw_spin_unlock(&cfs_b->lock);
+
+			if (cfs_rq->runtime_remaining + slice > 0) {
+				cfs_rq->runtime_remaining += slice;
+				cfs_rq->sum_soft_runtime += slice;
+				cfs_rq->soft_quota_enable = 1;
+				unthrottle_cfs_rq(cfs_rq);
+				ret = true;
+				break;
+			}
+		}
+	}
+
+	return ret;
+}
+#endif
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index 3c6396d61a041e2749c200e8bd5cee614b09e5b0..4e5f603219b4755733546dbe2d56758352b5e1b6 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -439,10 +439,23 @@ static struct task_struct *pick_task_idle(struct rq *rq)
 }
 #endif

+#ifdef CONFIG_SCHED_SOFT_QUOTA
+int __weak is_sibling_idle(void)
+{
+	return 0;
+}
+#endif
+
 struct task_struct *pick_next_task_idle(struct rq *rq)
 {
 	struct task_struct *next = rq->idle;

+#ifdef CONFIG_SCHED_SOFT_QUOTA
+	if (unthrottle_cfs_rq_soft_quota(rq) && rq->cfs.nr_running &&
+	    is_sibling_idle())
+		return pick_next_task_fair(rq, NULL, NULL);
+#endif
+
 	set_next_task_idle(rq, next, true);

 	return next;
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index fe6342305b0f3ca5efef5ee19dfcf7fe032e2d83..9b2779e8fc912ce903b19e6336ddf6f72b6401dc 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -508,6 +508,9 @@ struct task_group {
 #else
 	KABI_RESERVE(4)
 #endif
+#if defined(CONFIG_SCHED_SOFT_QUOTA)
+	KABI_EXTEND(u64 soft_quota)
+#endif
 };

 #ifdef CONFIG_SCHED_STEAL
@@ -606,6 +609,10 @@ static inline int init_auto_affinity(struct task_group *tg)
 static inline void tg_update_affinity_domains(int cpu, int online) {}
 #endif

+#ifdef CONFIG_SCHED_SOFT_QUOTA
+extern bool unthrottle_cfs_rq_soft_quota(struct rq *rq);
+#endif
+
 #ifdef CONFIG_FAIR_GROUP_SCHED
 extern int sched_group_set_shares(struct task_group *tg, unsigned long shares);

@@ -734,6 +741,11 @@ struct cfs_rq {
 	KABI_RESERVE(3)
 	KABI_RESERVE(4)
 #endif
+#if defined(CONFIG_SCHED_SOFT_QUOTA)
+	KABI_EXTEND(u64 soft_quota_enable)
+	KABI_EXTEND(u64 sum_soft_runtime)
+	KABI_EXTEND(struct list_head soft_quota_throttled_list)
+#endif
 };

 static inline int rt_bandwidth_enabled(void)
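A simplified model (not kernel code) of the idle-path decision wired into pick_next_task_idle() and unthrottle_cfs_rq_soft_quota() above: before the CPU commits to going idle, scan a bounded number of throttled soft-quota groups on this CPU and, if the SMT sibling is idle and the group still has soft budget, grant exactly one slice to one group and run it instead of idling. The structures and the budget_left flag are stand-ins for the real cfs_rq state and check_soft_runtime().

#include <stdbool.h>
#include <stdio.h>

struct toy_cfs_rq {
	bool throttled;
	bool budget_left;		/* stands in for check_soft_runtime() */
	long long runtime_remaining;	/* ns, <= 0 while throttled */
};

/* Returns the index of the group that got a slice, or -1 to really go idle. */
static int toy_unthrottle_soft_quota(struct toy_cfs_rq *q, int nr, bool sibling_idle,
				     long long slice)
{
	int i, scanned = 0;

	if (!sibling_idle)
		return -1;

	for (i = 0; i < nr; i++) {
		if (scanned++ > 20)		/* bounded walk, as in the patch */
			break;
		if (!q[i].throttled || !q[i].budget_left)
			continue;
		if (q[i].runtime_remaining + slice > 0) {
			q[i].runtime_remaining += slice;	/* borrow one slice */
			q[i].throttled = false;
			return i;			/* one group per idle pick */
		}
	}
	return -1;
}

int main(void)
{
	struct toy_cfs_rq qs[] = {
		{ true, false, 0 },		/* over its soft budget: skipped */
		{ true, true, -10000000LL },	/* too far in debt for one slice */
		{ true, true, 0 },		/* gets the 5 ms slice */
	};

	printf("picked group %d\n",
	       toy_unthrottle_soft_quota(qs, 3, true, 5000000LL));
	return 0;
}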
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 0b1c13a053326d42686d02d558eed6b5b669efe3..738d9a4455c18f5f6f1833d16b9e338fca847e81 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -2831,6 +2831,17 @@ static struct ctl_table kern_table[] = {
 		.extra2		= &one_hundred,
 	},
 #endif
+#ifdef CONFIG_SCHED_SOFT_QUOTA
+	{
+		.procname	= "sched_soft_runtime_ratio",
+		.data		= &sysctl_soft_runtime_ratio,
+		.maxlen		= sizeof(sysctl_soft_runtime_ratio),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= SYSCTL_ONE,
+		.extra2		= &one_hundred,
+	},
+#endif
 #ifdef CONFIG_SCHED_STEAL
 	{
 		.procname	= "sched_max_steal_count",