diff --git a/fs/proc/base.c b/fs/proc/base.c index 2b318f2161f04dc9776ed0e8b848c965dd281c2e..835d83c1af86910e493910abd2d30066f1bfa935 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -87,6 +87,7 @@ #include #include #include +#include #include #include #include @@ -1559,6 +1560,70 @@ static const struct file_operations proc_pid_sched_autogroup_operations = { #endif /* CONFIG_SCHED_AUTOGROUP */ +#ifdef CONFIG_SCHED_WALT +static int sched_init_task_load_show(struct seq_file *m, void *v) +{ + struct inode *inode = m->private; + struct task_struct *p; + + p = get_proc_task(inode); + if (!p) + return -ESRCH; + + seq_printf(m, "%d\n", sched_get_init_task_load(p)); + + put_task_struct(p); + + return 0; +} + +static ssize_t +sched_init_task_load_write(struct file *file, const char __user *buf, + size_t count, loff_t *offset) +{ + struct inode *inode = file_inode(file); + struct task_struct *p; + char buffer[PROC_NUMBUF]; + int init_task_load, err; + + memset(buffer, 0, sizeof(buffer)); + if (count > sizeof(buffer) - 1) + count = sizeof(buffer) - 1; + if (copy_from_user(buffer, buf, count)) { + err = -EFAULT; + goto out; + } + + err = kstrtoint(strstrip(buffer), 0, &init_task_load); + if (err) + goto out; + + p = get_proc_task(inode); + if (!p) + return -ESRCH; + + err = sched_set_init_task_load(p, init_task_load); + + put_task_struct(p); + +out: + return err < 0 ? err : count; +} + +static int sched_init_task_load_open(struct inode *inode, struct file *filp) +{ + return single_open(filp, sched_init_task_load_show, inode); +} + +static const struct file_operations proc_pid_sched_init_task_load_operations = { + .open = sched_init_task_load_open, + .read = seq_read, + .write = sched_init_task_load_write, + .llseek = seq_lseek, + .release = single_release, +}; +#endif /* CONFIG_SCHED_WALT */ + #ifdef CONFIG_TIME_NS static int timens_offsets_show(struct seq_file *m, void *v) { @@ -3261,6 +3326,9 @@ static const struct pid_entry tgid_base_stuff[] = { ONE("status", S_IRUGO, proc_pid_status), ONE("personality", S_IRUSR, proc_pid_personality), ONE("limits", S_IRUGO, proc_pid_limits), +#ifdef CONFIG_SCHED_WALT + REG("sched_init_task_load", 00644, proc_pid_sched_init_task_load_operations), +#endif #ifdef CONFIG_SCHED_DEBUG REG("sched", S_IRUGO|S_IWUSR, proc_pid_sched_operations), #endif diff --git a/include/linux/sched.h b/include/linux/sched.h index d3198acc29af318b39ef85a111161ab0c99238b0..89869b515c64c0fb797a3ca8c55ee8e28afa1cd0 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -291,6 +291,14 @@ struct user_event_mm; enum { TASK_COMM_LEN = 16, }; +enum task_event { + PUT_PREV_TASK = 0, + PICK_NEXT_TASK = 1, + TASK_WAKE = 2, + TASK_MIGRATE = 3, + TASK_UPDATE = 4, + IRQ_UPDATE = 5, +}; extern void scheduler_tick(void); @@ -590,6 +598,53 @@ struct sched_entity { #endif }; +#ifdef CONFIG_SCHED_WALT +extern void sched_exit(struct task_struct *p); +extern int sched_set_init_task_load(struct task_struct *p, int init_load_pct); +extern u32 sched_get_init_task_load(struct task_struct *p); +extern void free_task_load_ptrs(struct task_struct *p); +#define RAVG_HIST_SIZE_MAX 5 +struct ravg { + /* + * 'mark_start' marks the beginning of an event (task waking up, task + * starting to execute, task being preempted) within a window + * + * 'sum' represents how runnable a task has been within current + * window. It incorporates both running time and wait time and is + * frequency scaled. + * + * 'sum_history' keeps track of history of 'sum' seen over previous + * RAVG_HIST_SIZE windows. 
Windows where task was entirely sleeping are + * ignored. + * + * 'demand' represents maximum sum seen over previous + * sysctl_sched_ravg_hist_size windows. 'demand' could drive frequency + * demand for tasks. + * + * 'curr_window_cpu' represents task's contribution to cpu busy time on + * various CPUs in the current window + * + * 'prev_window_cpu' represents task's contribution to cpu busy time on + * various CPUs in the previous window + * + * 'curr_window' represents the sum of all entries in curr_window_cpu + * + * 'prev_window' represents the sum of all entries in prev_window_cpu + * + */ + u64 mark_start; + u32 sum, demand; + u32 sum_history[RAVG_HIST_SIZE_MAX]; + u32 *curr_window_cpu, *prev_window_cpu; + u32 curr_window, prev_window; + u16 active_windows; + u16 demand_scaled; +}; +#else +static inline void sched_exit(struct task_struct *p) { } +static inline void free_task_load_ptrs(struct task_struct *p) { } +#endif /* CONFIG_SCHED_WALT */ + struct sched_rt_entity { struct list_head run_list; unsigned long timeout; @@ -800,6 +855,15 @@ struct task_struct { struct sched_entity se; struct sched_rt_entity rt; +#ifdef CONFIG_SCHED_WALT + struct ravg ravg; + /* + * 'init_load_pct' represents the initial task load assigned to children + * of this task + */ + u32 init_load_pct; + u64 last_sleep_ts; +#endif struct sched_dl_entity dl; const struct sched_class *sched_class; diff --git a/include/linux/sched/cpufreq.h b/include/linux/sched/cpufreq.h index bdd31ab93bc51dec77f73a5496160cac0083cfac..ee97c8e891c44f68f80aa379957ec358314d06dd 100644 --- a/include/linux/sched/cpufreq.h +++ b/include/linux/sched/cpufreq.h @@ -9,6 +9,8 @@ */ #define SCHED_CPUFREQ_IOWAIT (1U << 0) +#define SCHED_CPUFREQ_WALT (1U << 1) +#define SCHED_CPUFREQ_CONTINUE (1U << 2) #ifdef CONFIG_CPU_FREQ struct cpufreq_policy; diff --git a/include/linux/sched/stat.h b/include/linux/sched/stat.h index 0108a38bb64d75389ff7f0a7f68843487bc03646..0b2189197aca1bb7b146e017db70d5cc11d135a0 100644 --- a/include/linux/sched/stat.h +++ b/include/linux/sched/stat.h @@ -22,6 +22,14 @@ extern bool single_task_running(void); extern unsigned int nr_iowait(void); extern unsigned int nr_iowait_cpu(int cpu); +#ifdef CONFIG_SCHED_WALT +extern unsigned int sched_get_cpu_util(int cpu); +#else +static inline unsigned int sched_get_cpu_util(int cpu) +{ + return 0; +} +#endif static inline int sched_info_on(void) { return IS_ENABLED(CONFIG_SCHED_INFO); diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h index 980adc56cfe7553c314881cdc7a69dabb413ac69..0e098c5a0ff323eabdd19f47e062d1b0b6fb8621 100644 --- a/include/linux/sched/sysctl.h +++ b/include/linux/sched/sysctl.h @@ -14,7 +14,16 @@ enum { sysctl_hung_task_timeout_secs = 0 }; extern unsigned int sysctl_sched_latency; extern unsigned int sysctl_sched_wakeup_granularity; +#ifdef CONFIG_SCHED_WALT +extern unsigned int sysctl_sched_use_walt_cpu_util; +extern unsigned int sysctl_sched_use_walt_task_util; +extern unsigned int sysctl_sched_walt_init_task_load_pct; +extern unsigned int sysctl_sched_cpu_high_irqload; +extern int +sysctl_sched_walt_init_task_load_pct_sysctl_handler(struct ctl_table *table, + int write, void __user *buffer, size_t *length, loff_t *ppos); +#endif enum sched_tunable_scaling { SCHED_TUNABLESCALING_NONE, SCHED_TUNABLESCALING_LOG, diff --git a/include/trace/events/walt.h b/include/trace/events/walt.h new file mode 100755 index 0000000000000000000000000000000000000000..9af92c8689b986bf51405ba3fa0d0000204a24a4 --- /dev/null +++ b/include/trace/events/walt.h @@ 
-0,0 +1,256 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM walt + +#if !defined(_TRACE_WALT_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_WALT_H + +#include +#include + +struct rq; +extern const char *task_event_names[]; + +#if defined(CREATE_TRACE_POINTS) && defined(CONFIG_SCHED_WALT) +static inline void __window_data(u32 *dst, u32 *src) +{ + if (src) + memcpy(dst, src, nr_cpu_ids * sizeof(u32)); + else + memset(dst, 0, nr_cpu_ids * sizeof(u32)); +} + +struct trace_seq; +const char *__window_print(struct trace_seq *p, const u32 *buf, int buf_len) +{ + int i; + const char *ret = p->buffer + seq_buf_used(&p->seq); + + for (i = 0; i < buf_len; i++) + trace_seq_printf(p, "%u ", buf[i]); + + trace_seq_putc(p, 0); + + return ret; +} + +static inline s64 __rq_update_sum(struct rq *rq, bool curr, bool new) +{ + if (curr) + if (new) + return rq->nt_curr_runnable_sum; + else + return rq->curr_runnable_sum; + else + if (new) + return rq->nt_prev_runnable_sum; + else + return rq->prev_runnable_sum; +} + +#ifdef CONFIG_SCHED_RTG +static inline s64 __grp_update_sum(struct rq *rq, bool curr, bool new) +{ + if (curr) + if (new) + return rq->grp_time.nt_curr_runnable_sum; + else + return rq->grp_time.curr_runnable_sum; + else + if (new) + return rq->grp_time.nt_prev_runnable_sum; + else + return rq->grp_time.prev_runnable_sum; +} + +static inline s64 +__get_update_sum(struct rq *rq, enum migrate_types migrate_type, + bool src, bool new, bool curr) +{ + switch (migrate_type) { + case RQ_TO_GROUP: + if (src) + return __rq_update_sum(rq, curr, new); + else + return __grp_update_sum(rq, curr, new); + case GROUP_TO_RQ: + if (src) + return __grp_update_sum(rq, curr, new); + else + return __rq_update_sum(rq, curr, new); + default: + WARN_ON_ONCE(1); + return -1; + } +} +#endif +#endif + +TRACE_EVENT(sched_update_history, + + TP_PROTO(struct rq *rq, struct task_struct *p, u32 runtime, int samples, + enum task_event evt), + + TP_ARGS(rq, p, runtime, samples, evt), + + TP_STRUCT__entry( + __array(char, comm, TASK_COMM_LEN) + __field(pid_t, pid) + __field(unsigned int, runtime) + __field(int, samples) + __field(enum task_event, evt) + __field(unsigned int, demand) + __array(u32, hist, RAVG_HIST_SIZE_MAX) + __field(int, cpu) + ), + + TP_fast_assign( + memcpy(__entry->comm, p->comm, TASK_COMM_LEN); + __entry->pid = p->pid; + __entry->runtime = runtime; + __entry->samples = samples; + __entry->evt = evt; + __entry->demand = p->ravg.demand; + memcpy(__entry->hist, p->ravg.sum_history, + RAVG_HIST_SIZE_MAX * sizeof(u32)); + __entry->cpu = rq->cpu; + ), + + TP_printk("%d (%s): runtime %u samples %d event %s demand %u (hist: %u %u %u %u %u) cpu %d", + __entry->pid, __entry->comm, + __entry->runtime, __entry->samples, + task_event_names[__entry->evt], __entry->demand, + __entry->hist[0], __entry->hist[1], + __entry->hist[2], __entry->hist[3], + __entry->hist[4], __entry->cpu) +); + +TRACE_EVENT(sched_update_task_ravg, + + TP_PROTO(struct task_struct *p, struct rq *rq, enum task_event evt, + u64 wallclock, u64 irqtime), + + TP_ARGS(p, rq, evt, wallclock, irqtime), + + TP_STRUCT__entry( + __array(char, comm, TASK_COMM_LEN) + __field(pid_t, pid) + __field(pid_t, cur_pid) + __field(unsigned int, cur_freq) + __field(u64, wallclock) + __field(u64, mark_start) + __field(u64, delta_m) + __field(u64, win_start) + __field(u64, delta) + __field(u64, irqtime) + __field(enum task_event, evt) + __field(unsigned int, demand) + __field(unsigned int, sum) + __field(int, cpu) + __field(u64, 
rq_cs) + __field(u64, rq_ps) + __field(u32, curr_window) + __field(u32, prev_window) + __dynamic_array(u32, curr_sum, nr_cpu_ids) + __dynamic_array(u32, prev_sum, nr_cpu_ids) + __field(u64, nt_cs) + __field(u64, nt_ps) + __field(u32, active_windows) + ), + + TP_fast_assign( + __entry->wallclock = wallclock; + __entry->win_start = rq->window_start; + __entry->delta = (wallclock - rq->window_start); + __entry->evt = evt; + __entry->cpu = rq->cpu; + __entry->cur_pid = rq->curr->pid; + __entry->cur_freq = rq->cluster->cur_freq; + memcpy(__entry->comm, p->comm, TASK_COMM_LEN); + __entry->pid = p->pid; + __entry->mark_start = p->ravg.mark_start; + __entry->delta_m = (wallclock - p->ravg.mark_start); + __entry->demand = p->ravg.demand; + __entry->sum = p->ravg.sum; + __entry->irqtime = irqtime; + __entry->rq_cs = rq->curr_runnable_sum; + __entry->rq_ps = rq->prev_runnable_sum; + __entry->curr_window = p->ravg.curr_window; + __entry->prev_window = p->ravg.prev_window; + __window_data(__get_dynamic_array(curr_sum), p->ravg.curr_window_cpu); + __window_data(__get_dynamic_array(prev_sum), p->ravg.prev_window_cpu); + __entry->nt_cs = rq->nt_curr_runnable_sum; + __entry->nt_ps = rq->nt_prev_runnable_sum; + __entry->active_windows = p->ravg.active_windows; + ), + + TP_printk("wc %llu ws %llu delta %llu event %s cpu %d cur_freq %u cur_pid %d task %d (%s) ms %llu delta %llu demand %u sum %u irqtime %llu rq_cs %llu rq_ps %llu cur_window %u (%s) prev_window %u (%s) nt_cs %llu nt_ps %llu active_wins %u", + __entry->wallclock, __entry->win_start, __entry->delta, + task_event_names[__entry->evt], __entry->cpu, + __entry->cur_freq, __entry->cur_pid, + __entry->pid, __entry->comm, __entry->mark_start, + __entry->delta_m, __entry->demand, + __entry->sum, __entry->irqtime, + __entry->rq_cs, __entry->rq_ps, __entry->curr_window, + __window_print(p, __get_dynamic_array(curr_sum), nr_cpu_ids), + __entry->prev_window, + __window_print(p, __get_dynamic_array(prev_sum), nr_cpu_ids), + __entry->nt_cs, __entry->nt_ps, + __entry->active_windows) +); + +extern const char *migrate_type_names[]; + +#ifdef CONFIG_SCHED_RTG +TRACE_EVENT(sched_migration_update_sum, + + TP_PROTO(struct task_struct *p, enum migrate_types migrate_type, struct rq *rq), + + TP_ARGS(p, migrate_type, rq), + + TP_STRUCT__entry( + __field(int, tcpu) + __field(int, pid) + __field(enum migrate_types, migrate_type) + __field(s64, src_cs) + __field(s64, src_ps) + __field(s64, dst_cs) + __field(s64, dst_ps) + __field(s64, src_nt_cs) + __field(s64, src_nt_ps) + __field(s64, dst_nt_cs) + __field(s64, dst_nt_ps) + ), + + TP_fast_assign( + __entry->tcpu = task_cpu(p); + __entry->pid = p->pid; + __entry->migrate_type = migrate_type; + __entry->src_cs = __get_update_sum(rq, migrate_type, + true, false, true); + __entry->src_ps = __get_update_sum(rq, migrate_type, + true, false, false); + __entry->dst_cs = __get_update_sum(rq, migrate_type, + false, false, true); + __entry->dst_ps = __get_update_sum(rq, migrate_type, + false, false, false); + __entry->src_nt_cs = __get_update_sum(rq, migrate_type, + true, true, true); + __entry->src_nt_ps = __get_update_sum(rq, migrate_type, + true, true, false); + __entry->dst_nt_cs = __get_update_sum(rq, migrate_type, + false, true, true); + __entry->dst_nt_ps = __get_update_sum(rq, migrate_type, + false, true, false); + ), + + TP_printk("pid %d task_cpu %d migrate_type %s src_cs %llu src_ps %llu dst_cs %lld dst_ps %lld src_nt_cs %llu src_nt_ps %llu dst_nt_cs %lld dst_nt_ps %lld", + __entry->pid, __entry->tcpu, 
migrate_type_names[__entry->migrate_type], + __entry->src_cs, __entry->src_ps, __entry->dst_cs, __entry->dst_ps, + __entry->src_nt_cs, __entry->src_nt_ps, __entry->dst_nt_cs, __entry->dst_nt_ps) +); +#endif +#endif /* _TRACE_WALT_H */ + +/* This part must be outside protection */ +#include diff --git a/init/Kconfig b/init/Kconfig index 879f6dc1cb274a9f92a03601106cb1d1e1d35765..b12f17a062e8b5a6041b060ad1ce5f22f12d8092 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -565,6 +565,14 @@ config SCHED_THERMAL_PRESSURE This requires the architecture to implement arch_update_thermal_pressure() and arch_scale_thermal_pressure(). +config SCHED_WALT + bool "Support window based load tracking" + depends on SMP + help + This feature will allow the scheduler to maintain a tunable window + based set of metrics for tasks and runqueues. These metrics can be + used to guide task placement as well as task frequency requirements + for cpufreq governors. config BSD_PROCESS_ACCT bool "BSD Process Accounting" diff --git a/kernel/exit.c b/kernel/exit.c index 21a59a6e1f2e8941a314116e06b5337e1ff986dc..20dbf5cc67883140b11e3f719f9074409ad8cb02 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -826,6 +826,7 @@ void __noreturn do_exit(long code) io_uring_files_cancel(); exit_signals(tsk); /* sets PF_EXITING */ + sched_exit(tsk); /* sync mm's RSS info before statistics gathering */ if (tsk->mm) diff --git a/kernel/fork.c b/kernel/fork.c index 486248a10b8b75497afdfe9cb3caca30450d3d7a..ead78df4cf987718cdb4f32cd57f5ade10e1de68 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -2783,6 +2783,7 @@ __latent_entropy struct task_struct *copy_process( perf_event_free_task(p); bad_fork_cleanup_policy: lockdep_free_task(p); + free_task_load_ptrs(p); #ifdef CONFIG_NUMA mpol_put(p->mempolicy); #endif diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index 976092b7bd4520ebc1a607734520ac342585a120..c46379af99d98f11f5fbba1e0686d3a6613fb5f1 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile @@ -32,3 +32,5 @@ obj-y += core.o obj-y += fair.o obj-y += build_policy.o obj-y += build_utility.o +obj-$(CONFIG_SCHED_WALT) += walt.o +obj-$(CONFIG_SCHED_RUNNING_AVG) += sched_avg.o diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 461c49c8db244abb42f7f7b7bd11b423a8cd0087..ecb69c975d4ba3a5c0c7402121e97a423aa58d3f 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -91,6 +91,7 @@ #include "pelt.h" #include "smp.h" #include "stats.h" +#include "walt.h" #include "../workqueue_internal.h" #include "../../io_uring/io-wq.h" @@ -2562,8 +2563,17 @@ static struct rq *move_queued_task(struct rq *rq, struct rq_flags *rf, lockdep_assert_rq_held(rq); deactivate_task(rq, p, DEQUEUE_NOCLOCK); +#ifdef CONFIG_SCHED_WALT + double_lock_balance(rq, cpu_rq(new_cpu)); + if (!(rq->clock_update_flags & RQCF_UPDATED)) + update_rq_clock(rq); +#endif set_task_cpu(p, new_cpu); +#ifdef CONFIG_SCHED_WALT + double_rq_unlock(cpu_rq(new_cpu), rq); +#else rq_unlock(rq, rf); +#endif rq = cpu_rq(new_cpu); @@ -3435,6 +3445,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) rseq_migrate(p); sched_mm_cid_migrate_from(p); perf_event_task_migrate(p); + fixup_busy_time(p, new_cpu); } __set_task_cpu(p, new_cpu); @@ -4206,6 +4217,26 @@ bool ttwu_state_match(struct task_struct *p, unsigned int state, int *success) * accesses to the task state; see try_to_wake_up() and set_current_state(). 
*/ +#ifdef CONFIG_SMP +#ifdef CONFIG_SCHED_WALT +/* utility function to update walt signals at wakeup */ +static inline void walt_try_to_wake_up(struct task_struct *p) +{ + struct rq *rq = cpu_rq(task_cpu(p)); + struct rq_flags rf; + u64 wallclock; + + rq_lock_irqsave(rq, &rf); + wallclock = sched_ktime_clock(); + update_task_ravg(rq->curr, rq, TASK_UPDATE, wallclock, 0); + update_task_ravg(p, rq, TASK_WAKE, wallclock, 0); + rq_unlock_irqrestore(rq, &rf); +} +#else +#define walt_try_to_wake_up(a) {} +#endif +#endif + /** * try_to_wake_up - wake up a thread * @p: the thread to be awakened @@ -4333,6 +4364,7 @@ int try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) */ smp_acquire__after_ctrl_dep(); + walt_try_to_wake_up(p); /* * We're doing the wakeup (@success == 1), they did a dequeue (p->on_rq * == 0), which means we need to do an enqueue, change p->state to @@ -4774,6 +4806,7 @@ late_initcall(sched_core_sysctl_init); */ int sched_fork(unsigned long clone_flags, struct task_struct *p) { + init_new_task_load(p); __sched_fork(clone_flags, p); /* * We mark the process as NEW here. This guarantees that @@ -4926,6 +4959,8 @@ void wake_up_new_task(struct task_struct *p) update_rq_clock(rq); post_init_entity_util_avg(p); + mark_task_starting(p); + activate_task(rq, p, ENQUEUE_NOCLOCK); trace_sched_wakeup_new(p); check_preempt_curr(rq, p, WF_FORK); @@ -5696,6 +5731,7 @@ void scheduler_tick(void) struct rq *rq = cpu_rq(cpu); struct task_struct *curr = rq->curr; struct rq_flags rf; + u64 wallclock; unsigned long thermal_pressure; u64 resched_latency; @@ -5706,6 +5742,9 @@ void scheduler_tick(void) rq_lock(rq, &rf); + set_window_start(rq); + wallclock = sched_ktime_clock(); + update_task_ravg(rq->curr, rq, TASK_UPDATE, wallclock, 0); update_rq_clock(rq); thermal_pressure = arch_scale_thermal_pressure(cpu_of(rq)); update_thermal_load_avg(rq_clock_thermal(rq), rq, thermal_pressure); @@ -6636,6 +6675,7 @@ static void __sched notrace __schedule(unsigned int sched_mode) struct rq_flags rf; struct rq *rq; int cpu; + u64 wallclock; cpu = smp_processor_id(); rq = cpu_rq(cpu); @@ -6715,11 +6755,18 @@ static void __sched notrace __schedule(unsigned int sched_mode) next = pick_next_task(rq, prev, &rf); clear_tsk_need_resched(prev); clear_preempt_need_resched(); + wallclock = sched_ktime_clock(); #ifdef CONFIG_SCHED_DEBUG rq->last_seen_need_resched_ns = 0; #endif if (likely(prev != next)) { +#ifdef CONFIG_SCHED_WALT + if (!prev->on_rq) + prev->last_sleep_ts = wallclock; +#endif + update_task_ravg(prev, rq, PUT_PREV_TASK, wallclock, 0); + update_task_ravg(next, rq, PICK_NEXT_TASK, wallclock, 0); rq->nr_switches++; /* * RCU users of rcu_dereference(rq->curr) may not see @@ -6750,6 +6797,7 @@ static void __sched notrace __schedule(unsigned int sched_mode) /* Also unlocks the rq: */ rq = context_switch(rq, prev, next, &rf); } else { + update_task_ravg(prev, rq, TASK_UPDATE, wallclock, 0); rq_unpin_lock(rq, &rf); __balance_callbacks(rq); raw_spin_rq_unlock_irq(rq); @@ -9827,6 +9875,11 @@ int sched_cpu_deactivate(unsigned int cpu) static void sched_rq_cpu_starting(unsigned int cpu) { struct rq *rq = cpu_rq(cpu); + unsigned long flags; + + raw_spin_lock_irqsave(&rq->__lock, flags); + set_window_start(rq); + raw_spin_unlock_irqrestore(&rq->__lock, flags); rq->calc_load_update = calc_load_update; update_max_interval(); @@ -9931,6 +9984,8 @@ void __init sched_init_smp(void) sched_init_domains(cpu_active_mask); mutex_unlock(&sched_domains_mutex); + update_cluster_topology(); + /* Move init over to a 
non-isolated CPU */ if (set_cpus_allowed_ptr(current, housekeeping_cpumask(HK_TYPE_DOMAIN)) < 0) BUG(); @@ -9991,6 +10046,8 @@ void __init sched_init(void) wait_bit_init(); + init_clusters(); + #ifdef CONFIG_FAIR_GROUP_SCHED ptr += 2 * nr_cpu_ids * sizeof(void **); #endif @@ -10095,6 +10152,7 @@ void __init sched_init(void) rq->wake_stamp = jiffies; rq->wake_avg_idle = rq->avg_idle; rq->max_idle_balance_cost = sysctl_sched_migration_cost; + walt_sched_init_rq(rq); INIT_LIST_HEAD(&rq->cfs_tasks); @@ -10149,6 +10207,7 @@ void __init sched_init(void) * when this runqueue becomes "idle". */ init_idle(current, smp_processor_id()); + init_new_task_load(current); calc_load_update = jiffies + LOAD_FREQ; @@ -12187,3 +12246,48 @@ void sched_mm_cid_fork(struct task_struct *t) t->mm_cid_active = 1; } #endif + +#ifdef CONFIG_SCHED_WALT +/* + * sched_exit() - Set EXITING_TASK_MARKER in task's ravg.demand field + * + * Stop accounting (exiting) task's future cpu usage + * + * We need this so that reset_all_windows_stats() can function correctly. + * reset_all_window_stats() depends on do_each_thread/for_each_thread task + * iterators to reset *all* task's statistics. Exiting tasks however become + * invisible to those iterators. sched_exit() is called on a exiting task prior + * to being removed from task_list, which will let reset_all_window_stats() + * function correctly. + */ +void sched_exit(struct task_struct *p) +{ + struct rq_flags rf; + struct rq *rq; + u64 wallclock; + +#ifdef CONFIG_SCHED_RTG + sched_set_group_id(p, 0); +#endif + + rq = task_rq_lock(p, &rf); + + /* rq->curr == p */ + wallclock = sched_ktime_clock(); + update_task_ravg(rq->curr, rq, TASK_UPDATE, wallclock, 0); + dequeue_task(rq, p, 0); + /* + * task's contribution is already removed from the + * cumulative window demand in dequeue. As the + * task's stats are reset, the next enqueue does + * not change the cumulative window demand. + */ + reset_task_stats(p); + p->ravg.mark_start = wallclock; + p->ravg.sum_history[0] = EXITING_TASK_MARKER; + + enqueue_task(rq, p, 0); + task_rq_unlock(rq, p, &rf); + free_task_load_ptrs(p); +} +#endif /* CONFIG_SCHED_WALT */ diff --git a/kernel/sched/core_ctl.c b/kernel/sched/core_ctl.c new file mode 100755 index 0000000000000000000000000000000000000000..eef1d69211782dd890a4aab03788421adf64e80f --- /dev/null +++ b/kernel/sched/core_ctl.c @@ -0,0 +1,1061 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 2014-2021, The Linux Foundation. All rights reserved. 
+ */ + +#define pr_fmt(fmt) "core_ctl: " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include "sched.h" +#include "walt.h" + +#define MAX_CPUS_PER_CLUSTER 6 +#define MAX_CLUSTERS 3 + +struct cluster_data { + bool inited; + unsigned int min_cpus; + unsigned int max_cpus; + unsigned int offline_delay_ms; + unsigned int busy_up_thres[MAX_CPUS_PER_CLUSTER]; + unsigned int busy_down_thres[MAX_CPUS_PER_CLUSTER]; + unsigned int active_cpus; + unsigned int num_cpus; + unsigned int nr_isolated_cpus; + unsigned int nr_not_preferred_cpus; + cpumask_t cpu_mask; + unsigned int need_cpus; + unsigned int task_thres; + unsigned int max_nr; + unsigned int nr_prev_assist; + unsigned int nr_prev_assist_thresh; + s64 need_ts; + struct list_head lru; + bool pending; + spinlock_t pending_lock; + bool enable; + int nrrun; + struct task_struct *core_ctl_thread; + unsigned int first_cpu; + unsigned int boost; + struct kobject kobj; +}; + +struct cpu_data { + bool is_busy; + unsigned int busy; + unsigned int cpu; + bool not_preferred; + struct cluster_data *cluster; + struct list_head sib; + bool isolated_by_us; +}; + +static DEFINE_PER_CPU(struct cpu_data, cpu_state); +static struct cluster_data cluster_state[MAX_CLUSTERS]; +static unsigned int num_clusters; + +#define for_each_cluster(cluster, idx) \ + for (; (idx) < num_clusters && ((cluster) = &cluster_state[idx]);\ + (idx)++) + +static DEFINE_SPINLOCK(state_lock); +static void apply_need(struct cluster_data *state); +static void wake_up_core_ctl_thread(struct cluster_data *state); +static bool initialized; + +ATOMIC_NOTIFIER_HEAD(core_ctl_notifier); +static unsigned int last_nr_big; + +static unsigned int get_active_cpu_count(const struct cluster_data *cluster); + +/* ========================= sysfs interface =========================== */ + +static ssize_t store_min_cpus(struct cluster_data *state, + const char *buf, size_t count) +{ + unsigned int val; + + if (sscanf(buf, "%u\n", &val) != 1) + return -EINVAL; + + state->min_cpus = min(val, state->max_cpus); + wake_up_core_ctl_thread(state); + + return count; +} + +static ssize_t show_min_cpus(const struct cluster_data *state, char *buf) +{ + return sysfs_emit(buf, "%u\n", state->min_cpus); +} + +static ssize_t store_max_cpus(struct cluster_data *state, + const char *buf, size_t count) +{ + unsigned int val; + + if (sscanf(buf, "%u\n", &val) != 1) + return -EINVAL; + + val = min(val, state->num_cpus); + state->max_cpus = val; + state->min_cpus = min(state->min_cpus, state->max_cpus); + wake_up_core_ctl_thread(state); + + return count; +} + +static ssize_t show_max_cpus(const struct cluster_data *state, char *buf) +{ + return sysfs_emit(buf, "%u\n", state->max_cpus); +} + +static ssize_t store_enable(struct cluster_data *state, + const char *buf, size_t count) +{ + unsigned int val; + bool bval; + + if (sscanf(buf, "%u\n", &val) != 1) + return -EINVAL; + + bval = !!val; + if (bval != state->enable) { + state->enable = bval; + apply_need(state); + } + + return count; +} + +static ssize_t show_enable(const struct cluster_data *state, char *buf) +{ + return sysfs_emit(buf, "%u\n", state->enable); +} + +static ssize_t show_need_cpus(const struct cluster_data *state, char *buf) +{ + return sysfs_emit(buf, "%u\n", state->need_cpus); +} + +static ssize_t show_active_cpus(const struct cluster_data *state, char *buf) +{ + return sysfs_emit(buf, "%u\n", state->active_cpus); +} + +static ssize_t show_global_state(const struct cluster_data *state, 
char *buf) +{ + struct cpu_data *c; + struct cluster_data *cluster; + ssize_t count = 0; + unsigned int cpu; + + spin_lock_irq(&state_lock); + for_each_possible_cpu(cpu) { + c = &per_cpu(cpu_state, cpu); + cluster = c->cluster; + if (!cluster || !cluster->inited) + continue; + + count += sysfs_emit_at(buf, count, + "CPU%u\n", cpu); + count += sysfs_emit_at(buf, count, + "\tCPU: %u\n", c->cpu); + count += sysfs_emit_at(buf, count, + "\tOnline: %u\n", + cpu_online(c->cpu)); + count += sysfs_emit_at(buf, count, + "\tIsolated: %u\n", + cpu_isolated(c->cpu)); + count += sysfs_emit_at(buf, count, + "\tFirst CPU: %u\n", + cluster->first_cpu); + count += sysfs_emit_at(buf, count, + "\tBusy%%: %u\n", c->busy); + count += sysfs_emit_at(buf, count, + "\tIs busy: %u\n", c->is_busy); + count += sysfs_emit_at(buf, count, + "\tNot preferred: %u\n", + c->not_preferred); + count += sysfs_emit_at(buf, count, + "\tNr running: %u\n", cluster->nrrun); + count += sysfs_emit_at(buf, count, + "\tActive CPUs: %u\n", get_active_cpu_count(cluster)); + count += sysfs_emit_at(buf, count, + "\tNeed CPUs: %u\n", cluster->need_cpus); + count += sysfs_emit_at(buf, count, + "\tNr isolated CPUs: %u\n", + cluster->nr_isolated_cpus); + count += sysfs_emit_at(buf, count, + "\tBoost: %u\n", (unsigned int) cluster->boost); + } + spin_unlock_irq(&state_lock); + + return count; +} + +struct core_ctl_attr { + struct attribute attr; + ssize_t (*show)(const struct cluster_data *, char *); + ssize_t (*store)(struct cluster_data *, const char *, size_t count); +}; + +#define core_ctl_attr_ro(_name) \ +static struct core_ctl_attr _name = \ +__ATTR(_name, 0444, show_##_name, NULL) + +#define core_ctl_attr_rw(_name) \ +static struct core_ctl_attr _name = \ +__ATTR(_name, 0644, show_##_name, store_##_name) + +core_ctl_attr_rw(min_cpus); +core_ctl_attr_rw(max_cpus); +core_ctl_attr_ro(need_cpus); +core_ctl_attr_ro(active_cpus); +core_ctl_attr_ro(global_state); +core_ctl_attr_rw(enable); + +static struct attribute *default_attrs[] = { + &min_cpus.attr, + &max_cpus.attr, + &enable.attr, + &need_cpus.attr, + &active_cpus.attr, + &global_state.attr, + NULL +}; + +#define to_cluster_data(k) container_of(k, struct cluster_data, kobj) +#define to_attr(a) container_of(a, struct core_ctl_attr, attr) +static ssize_t show(struct kobject *kobj, struct attribute *attr, char *buf) +{ + struct cluster_data *data = to_cluster_data(kobj); + struct core_ctl_attr *cattr = to_attr(attr); + ssize_t ret = -EIO; + + if (cattr->show) + ret = cattr->show(data, buf); + + return ret; +} + +static ssize_t store(struct kobject *kobj, struct attribute *attr, + const char *buf, size_t count) +{ + struct cluster_data *data = to_cluster_data(kobj); + struct core_ctl_attr *cattr = to_attr(attr); + ssize_t ret = -EIO; + + if (cattr->store) + ret = cattr->store(data, buf, count); + + return ret; +} + +static const struct sysfs_ops sysfs_ops = { + .show = show, + .store = store, +}; + +static struct kobj_type ktype_core_ctl = { + .sysfs_ops = &sysfs_ops, + .default_attrs = default_attrs, +}; + +/* ==================== runqueue based core count =================== */ + +static struct sched_avg_stats nr_stats[NR_CPUS]; + +/* + * nr_need: + * Number of tasks running on this cluster plus + * tasks running on higher capacity clusters. + * To find out CPUs needed from this cluster. 
+ * + * For example: + * On dual cluster system with 4 min capacity + * CPUs and 4 max capacity CPUs, if there are + * 4 small tasks running on min capacity CPUs + * and 2 big tasks running on 2 max capacity + * CPUs, nr_need has to be 6 for min capacity + * cluster and 2 for max capacity cluster. + * This is because, min capacity cluster has to + * account for tasks running on max capacity + * cluster, so that, the min capacity cluster + * can be ready to accommodate tasks running on max + * capacity CPUs if the demand of tasks goes down. + */ +static int compute_cluster_nr_need(int index) +{ + int cpu; + struct cluster_data *cluster; + int nr_need = 0; + + for_each_cluster(cluster, index) { + for_each_cpu(cpu, &cluster->cpu_mask) + nr_need += nr_stats[cpu].nr; + } + + return nr_need; +} + +/* + * prev_misfit_need: + * Tasks running on smaller capacity cluster which + * needs to be migrated to higher capacity cluster. + * To find out how many tasks need higher capacity CPUs. + * + * For example: + * On dual cluster system with 4 min capacity + * CPUs and 4 max capacity CPUs, if there are + * 2 small tasks and 2 big tasks running on + * min capacity CPUs and no tasks running on + * max cpacity, prev_misfit_need of min capacity + * cluster will be 0 and prev_misfit_need of + * max capacity cluster will be 2. + */ +static int compute_prev_cluster_misfit_need(int index) +{ + int cpu; + struct cluster_data *prev_cluster; + int prev_misfit_need = 0; + + /* + * Lowest capacity cluster does not have to + * accommodate any misfit tasks. + */ + if (index == 0) + return 0; + + prev_cluster = &cluster_state[index - 1]; + + for_each_cpu(cpu, &prev_cluster->cpu_mask) + prev_misfit_need += nr_stats[cpu].nr_misfit; + + return prev_misfit_need; +} + +static int compute_cluster_max_nr(int index) +{ + int cpu; + struct cluster_data *cluster = &cluster_state[index]; + int max_nr = 0; + + for_each_cpu(cpu, &cluster->cpu_mask) + max_nr = max(max_nr, nr_stats[cpu].nr_max); + + return max_nr; +} + +static int cluster_real_big_tasks(int index) +{ + int nr_big = 0; + int cpu; + struct cluster_data *cluster = &cluster_state[index]; + + if (index == 0) { + for_each_cpu(cpu, &cluster->cpu_mask) + nr_big += nr_stats[cpu].nr_misfit; + } else { + for_each_cpu(cpu, &cluster->cpu_mask) + nr_big += nr_stats[cpu].nr; + } + + return nr_big; +} + +/* + * prev_nr_need_assist: + * Tasks that are eligible to run on the previous + * cluster but cannot run because of insufficient + * CPUs there. prev_nr_need_assist is indicative + * of number of CPUs in this cluster that should + * assist its previous cluster to makeup for + * insufficient CPUs there. + * + * For example: + * On tri-cluster system with 4 min capacity + * CPUs, 3 intermediate capacity CPUs and 1 + * max capacity CPU, if there are 4 small + * tasks running on min capacity CPUs, 4 big + * tasks running on intermediate capacity CPUs + * and no tasks running on max capacity CPU, + * prev_nr_need_assist for min & max capacity + * clusters will be 0, but, for intermediate + * capacity cluster prev_nr_need_assist will + * be 1 as it has 3 CPUs, but, there are 4 big + * tasks to be served. + */ +static int prev_cluster_nr_need_assist(int index) +{ + int need = 0; + int cpu; + struct cluster_data *prev_cluster; + + if (index == 0) + return 0; + + index--; + prev_cluster = &cluster_state[index]; + + /* + * Next cluster should not assist, while there are isolated cpus + * in this cluster. 
+ */ + if (prev_cluster->nr_isolated_cpus) + return 0; + + for_each_cpu(cpu, &prev_cluster->cpu_mask) + need += nr_stats[cpu].nr; + + need += compute_prev_cluster_misfit_need(index); + + if (need > prev_cluster->active_cpus) + need = need - prev_cluster->active_cpus; + else + need = 0; + + return need; +} + +static void update_running_avg(void) +{ + struct cluster_data *cluster; + unsigned int index = 0; + unsigned long flags; + int big_avg = 0; + + sched_get_nr_running_avg(nr_stats); + + spin_lock_irqsave(&state_lock, flags); + for_each_cluster(cluster, index) { + int nr_need, prev_misfit_need; + + if (!cluster->inited) + continue; + + nr_need = compute_cluster_nr_need(index); + prev_misfit_need = compute_prev_cluster_misfit_need(index); + + + cluster->nrrun = nr_need + prev_misfit_need; + cluster->max_nr = compute_cluster_max_nr(index); + cluster->nr_prev_assist = prev_cluster_nr_need_assist(index); + trace_core_ctl_update_nr_need(cluster->first_cpu, nr_need, + prev_misfit_need, + cluster->nrrun, cluster->max_nr, + cluster->nr_prev_assist); + big_avg += cluster_real_big_tasks(index); + } + spin_unlock_irqrestore(&state_lock, flags); + + last_nr_big = big_avg; +} + +#define MAX_NR_THRESHOLD 4 +/* adjust needed CPUs based on current runqueue information */ +static unsigned int apply_task_need(const struct cluster_data *cluster, + unsigned int new_need) +{ + /* unisolate all cores if there are enough tasks */ + if (cluster->nrrun >= cluster->task_thres) + return cluster->num_cpus; + + /* + * unisolate as many cores as the previous cluster + * needs assistance with. + */ + if (cluster->nr_prev_assist >= cluster->nr_prev_assist_thresh) + new_need = new_need + cluster->nr_prev_assist; + + /* only unisolate more cores if there are tasks to run */ + if (cluster->nrrun > new_need) + new_need = new_need + 1; + + /* + * We don't want tasks to be overcrowded in a cluster. + * If any CPU has more than MAX_NR_THRESHOLD in the last + * window, bring another CPU to help out. + */ + if (cluster->max_nr > MAX_NR_THRESHOLD) + new_need = new_need + 1; + + return new_need; +} + +/* ======================= load based core count ====================== */ + +static unsigned int apply_limits(const struct cluster_data *cluster, + unsigned int need_cpus) +{ + return min(max(cluster->min_cpus, need_cpus), cluster->max_cpus); +} + +static unsigned int get_active_cpu_count(const struct cluster_data *cluster) +{ + return cluster->num_cpus - + sched_isolate_count(&cluster->cpu_mask, true); +} + +static bool is_active(const struct cpu_data *state) +{ + return cpu_online(state->cpu) && !cpu_isolated(state->cpu); +} + +static bool adjustment_possible(const struct cluster_data *cluster, + unsigned int need) +{ + return (need < cluster->active_cpus || (need > cluster->active_cpus && + cluster->nr_isolated_cpus)); +} + +static bool eval_need(struct cluster_data *cluster) +{ + unsigned long flags; + struct cpu_data *c; + unsigned int need_cpus = 0, last_need, thres_idx; + int ret = 0; + bool need_flag = false; + unsigned int new_need; + s64 now, elapsed; + + if (unlikely(!cluster->inited)) + return 0; + + spin_lock_irqsave(&state_lock, flags); + + if (cluster->boost || !cluster->enable) { + need_cpus = cluster->max_cpus; + } else { + cluster->active_cpus = get_active_cpu_count(cluster); + thres_idx = cluster->active_cpus ? 
cluster->active_cpus - 1 : 0; + list_for_each_entry(c, &cluster->lru, sib) { + bool old_is_busy = c->is_busy; + int high_irqload = sched_cpu_high_irqload(c->cpu); + + if (c->busy >= cluster->busy_up_thres[thres_idx] || + high_irqload) + c->is_busy = true; + else if (c->busy < cluster->busy_down_thres[thres_idx]) + c->is_busy = false; + trace_core_ctl_set_busy(c->cpu, c->busy, old_is_busy, + c->is_busy, high_irqload); + need_cpus += c->is_busy; + } + need_cpus = apply_task_need(cluster, need_cpus); + } + new_need = apply_limits(cluster, need_cpus); + need_flag = adjustment_possible(cluster, new_need); + + last_need = cluster->need_cpus; + now = ktime_to_ms(ktime_get()); + + if (new_need > cluster->active_cpus) { + ret = 1; + } else { + /* + * When there is no change in need and there are no more + * active CPUs than currently needed, just update the + * need time stamp and return. + */ + if (new_need == last_need && new_need == cluster->active_cpus) { + cluster->need_ts = now; + spin_unlock_irqrestore(&state_lock, flags); + return 0; + } + + elapsed = now - cluster->need_ts; + ret = elapsed >= cluster->offline_delay_ms; + } + + if (ret) { + cluster->need_ts = now; + cluster->need_cpus = new_need; + } + trace_core_ctl_eval_need(cluster->first_cpu, last_need, new_need, + ret && need_flag); + spin_unlock_irqrestore(&state_lock, flags); + + return ret && need_flag; +} + +static void apply_need(struct cluster_data *cluster) +{ + if (eval_need(cluster)) + wake_up_core_ctl_thread(cluster); +} + +/* ========================= core count enforcement ==================== */ + +static void wake_up_core_ctl_thread(struct cluster_data *cluster) +{ + unsigned long flags; + + spin_lock_irqsave(&cluster->pending_lock, flags); + cluster->pending = true; + spin_unlock_irqrestore(&cluster->pending_lock, flags); + + wake_up_process(cluster->core_ctl_thread); +} + +static u64 core_ctl_check_timestamp; + +int core_ctl_set_boost(bool boost) +{ + unsigned int index = 0; + struct cluster_data *cluster = NULL; + unsigned long flags; + int ret = 0; + bool boost_state_changed = false; + + if (unlikely(!initialized)) + return 0; + + spin_lock_irqsave(&state_lock, flags); + for_each_cluster(cluster, index) { + if (boost) { + boost_state_changed = !cluster->boost; + ++cluster->boost; + } else { + if (!cluster->boost) { + ret = -EINVAL; + break; + } else { + --cluster->boost; + boost_state_changed = !cluster->boost; + } + } + } + spin_unlock_irqrestore(&state_lock, flags); + + if (boost_state_changed) { + index = 0; + for_each_cluster(cluster, index) + apply_need(cluster); + } + + if (cluster) + trace_core_ctl_set_boost(cluster->boost, ret); + + return ret; +} +EXPORT_SYMBOL(core_ctl_set_boost); + +void core_ctl_check(u64 window_start) +{ + int cpu; + struct cpu_data *c; + struct cluster_data *cluster; + unsigned int index = 0; + unsigned long flags; + + if (unlikely(!initialized)) + return; + + if (window_start == core_ctl_check_timestamp) + return; + + core_ctl_check_timestamp = window_start; + + spin_lock_irqsave(&state_lock, flags); + for_each_possible_cpu(cpu) { + + c = &per_cpu(cpu_state, cpu); + cluster = c->cluster; + + if (!cluster || !cluster->inited) + continue; + + c->busy = sched_get_cpu_util(cpu); + } + spin_unlock_irqrestore(&state_lock, flags); + + update_running_avg(); + + for_each_cluster(cluster, index) { + if (eval_need(cluster)) + wake_up_core_ctl_thread(cluster); + } +} + +static void move_cpu_lru(struct cpu_data *cpu_data) +{ + unsigned long flags; + + spin_lock_irqsave(&state_lock, flags); + 
list_del(&cpu_data->sib); + list_add_tail(&cpu_data->sib, &cpu_data->cluster->lru); + spin_unlock_irqrestore(&state_lock, flags); +} + +static void try_to_isolate(struct cluster_data *cluster, unsigned int need) +{ + struct cpu_data *c, *tmp; + unsigned long flags; + unsigned int num_cpus = cluster->num_cpus; + unsigned int nr_isolated = 0; + bool first_pass = cluster->nr_not_preferred_cpus; + + /* + * Protect against entry being removed (and added at tail) by other + * thread (hotplug). + */ + spin_lock_irqsave(&state_lock, flags); + list_for_each_entry_safe(c, tmp, &cluster->lru, sib) { + if (!num_cpus--) + break; + + if (!is_active(c)) + continue; + if (cluster->active_cpus == need) + break; + /* Don't isolate busy CPUs. */ + if (c->is_busy) + continue; + + /* + * We isolate only the not_preferred CPUs. If none + * of the CPUs are selected as not_preferred, then + * all CPUs are eligible for isolation. + */ + if (cluster->nr_not_preferred_cpus && !c->not_preferred) + continue; + + spin_unlock_irqrestore(&state_lock, flags); + + pr_debug("Trying to isolate CPU%u\n", c->cpu); + if (!sched_isolate_cpu(c->cpu)) { + c->isolated_by_us = true; + move_cpu_lru(c); + nr_isolated++; + } else { + pr_debug("Unable to isolate CPU%u\n", c->cpu); + } + cluster->active_cpus = get_active_cpu_count(cluster); + spin_lock_irqsave(&state_lock, flags); + } + cluster->nr_isolated_cpus += nr_isolated; + spin_unlock_irqrestore(&state_lock, flags); + +again: + /* + * If the number of active CPUs is within the limits, then + * don't force isolation of any busy CPUs. + */ + if (cluster->active_cpus <= cluster->max_cpus) + return; + + nr_isolated = 0; + num_cpus = cluster->num_cpus; + spin_lock_irqsave(&state_lock, flags); + list_for_each_entry_safe(c, tmp, &cluster->lru, sib) { + if (!num_cpus--) + break; + + if (!is_active(c)) + continue; + if (cluster->active_cpus <= cluster->max_cpus) + break; + + if (first_pass && !c->not_preferred) + continue; + + spin_unlock_irqrestore(&state_lock, flags); + + pr_debug("Trying to isolate CPU%u\n", c->cpu); + if (!sched_isolate_cpu(c->cpu)) { + c->isolated_by_us = true; + move_cpu_lru(c); + nr_isolated++; + } else { + pr_debug("Unable to isolate CPU%u\n", c->cpu); + } + cluster->active_cpus = get_active_cpu_count(cluster); + spin_lock_irqsave(&state_lock, flags); + } + cluster->nr_isolated_cpus += nr_isolated; + spin_unlock_irqrestore(&state_lock, flags); + + if (first_pass && cluster->active_cpus > cluster->max_cpus) { + first_pass = false; + goto again; + } +} + +static void __try_to_unisolate(struct cluster_data *cluster, + unsigned int need, bool force) +{ + struct cpu_data *c, *tmp; + unsigned long flags; + unsigned int num_cpus = cluster->num_cpus; + unsigned int nr_unisolated = 0; + + /* + * Protect against entry being removed (and added at tail) by other + * thread (hotplug). 
+ */ + spin_lock_irqsave(&state_lock, flags); + list_for_each_entry_safe(c, tmp, &cluster->lru, sib) { + if (!num_cpus--) + break; + + if (!c->isolated_by_us) + continue; + if ((cpu_online(c->cpu) && !cpu_isolated(c->cpu)) || + (!force && c->not_preferred)) + continue; + if (cluster->active_cpus == need) + break; + + spin_unlock_irqrestore(&state_lock, flags); + + pr_debug("Trying to unisolate CPU%u\n", c->cpu); + if (!sched_unisolate_cpu(c->cpu)) { + c->isolated_by_us = false; + move_cpu_lru(c); + nr_unisolated++; + } else { + pr_debug("Unable to unisolate CPU%u\n", c->cpu); + } + cluster->active_cpus = get_active_cpu_count(cluster); + spin_lock_irqsave(&state_lock, flags); + } + cluster->nr_isolated_cpus -= nr_unisolated; + spin_unlock_irqrestore(&state_lock, flags); +} + +static void try_to_unisolate(struct cluster_data *cluster, unsigned int need) +{ + bool force_use_non_preferred = false; + + __try_to_unisolate(cluster, need, force_use_non_preferred); + + if (cluster->active_cpus == need) + return; + + force_use_non_preferred = true; + __try_to_unisolate(cluster, need, force_use_non_preferred); +} + +static void __ref do_core_ctl(struct cluster_data *cluster) +{ + unsigned int need; + + need = apply_limits(cluster, cluster->need_cpus); + + if (adjustment_possible(cluster, need)) { + pr_debug("Trying to adjust group %u from %u to %u\n", + cluster->first_cpu, cluster->active_cpus, need); + + if (cluster->active_cpus > need) + try_to_isolate(cluster, need); + else if (cluster->active_cpus < need) + try_to_unisolate(cluster, need); + } +} + +static int __ref try_core_ctl(void *data) +{ + struct cluster_data *cluster = data; + unsigned long flags; + + while (1) { + set_current_state(TASK_INTERRUPTIBLE); + spin_lock_irqsave(&cluster->pending_lock, flags); + if (!cluster->pending) { + spin_unlock_irqrestore(&cluster->pending_lock, flags); + schedule(); + if (kthread_should_stop()) + break; + spin_lock_irqsave(&cluster->pending_lock, flags); + } + set_current_state(TASK_RUNNING); + cluster->pending = false; + spin_unlock_irqrestore(&cluster->pending_lock, flags); + + do_core_ctl(cluster); + } + + return 0; +} + +static int isolation_cpuhp_state(unsigned int cpu, bool online) +{ + struct cpu_data *state = &per_cpu(cpu_state, cpu); + struct cluster_data *cluster = state->cluster; + unsigned int need; + bool do_wakeup = false, unisolated = false; + unsigned long flags; + + if (unlikely(!cluster || !cluster->inited)) + return 0; + + if (online) { + cluster->active_cpus = get_active_cpu_count(cluster); + + /* + * Moving to the end of the list should only happen in + * CPU_ONLINE and not on CPU_UP_PREPARE to prevent an + * infinite list traversal when thermal (or other entities) + * reject trying to online CPUs. + */ + move_cpu_lru(state); + } else { + /* + * We don't want to have a CPU both offline and isolated. + * So unisolate a CPU that went down if it was isolated by us. + */ + if (state->isolated_by_us) { + sched_unisolate_cpu_unlocked(cpu); + state->isolated_by_us = false; + unisolated = true; + } + + /* Move a CPU to the end of the LRU when it goes offline. 
*/ + move_cpu_lru(state); + + state->busy = 0; + cluster->active_cpus = get_active_cpu_count(cluster); + } + + need = apply_limits(cluster, cluster->need_cpus); + spin_lock_irqsave(&state_lock, flags); + if (unisolated) + cluster->nr_isolated_cpus--; + do_wakeup = adjustment_possible(cluster, need); + spin_unlock_irqrestore(&state_lock, flags); + if (do_wakeup) + wake_up_core_ctl_thread(cluster); + + return 0; +} + +static int core_ctl_isolation_online_cpu(unsigned int cpu) +{ + return isolation_cpuhp_state(cpu, true); +} + +static int core_ctl_isolation_dead_cpu(unsigned int cpu) +{ + return isolation_cpuhp_state(cpu, false); +} + +/* ============================ init code ============================== */ + +static struct cluster_data *find_cluster_by_first_cpu(unsigned int first_cpu) +{ + unsigned int i; + + for (i = 0; i < num_clusters; ++i) { + if (cluster_state[i].first_cpu == first_cpu) + return &cluster_state[i]; + } + + return NULL; +} + +static int cluster_init(const struct cpumask *mask) +{ + struct device *dev; + unsigned int first_cpu = cpumask_first(mask); + struct cluster_data *cluster; + struct cpu_data *state; + unsigned int cpu; + struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; + + if (find_cluster_by_first_cpu(first_cpu)) + return 0; + + dev = get_cpu_device(first_cpu); + if (!dev) + return -ENODEV; + + pr_info("Creating CPU group %d\n", first_cpu); + + if (num_clusters == MAX_CLUSTERS) { + pr_err("Unsupported number of clusters. Only %u supported\n", + MAX_CLUSTERS); + return -EINVAL; + } + cluster = &cluster_state[num_clusters]; + ++num_clusters; + + cpumask_copy(&cluster->cpu_mask, mask); + cluster->num_cpus = cpumask_weight(mask); + if (cluster->num_cpus > MAX_CPUS_PER_CLUSTER) { + pr_err("HW configuration not supported\n"); + return -EINVAL; + } + cluster->first_cpu = first_cpu; + cluster->min_cpus = 1; + cluster->max_cpus = cluster->num_cpus; + cluster->need_cpus = cluster->num_cpus; + cluster->offline_delay_ms = 100; + cluster->task_thres = UINT_MAX; + cluster->nr_prev_assist_thresh = UINT_MAX; + cluster->nrrun = cluster->num_cpus; + cluster->enable = true; + cluster->nr_not_preferred_cpus = 0; + INIT_LIST_HEAD(&cluster->lru); + spin_lock_init(&cluster->pending_lock); + + for_each_cpu(cpu, mask) { + pr_info("Init CPU%u state\n", cpu); + + state = &per_cpu(cpu_state, cpu); + state->cluster = cluster; + state->cpu = cpu; + list_add_tail(&state->sib, &cluster->lru); + } + cluster->active_cpus = get_active_cpu_count(cluster); + + cluster->core_ctl_thread = kthread_run(try_core_ctl, (void *) cluster, + "core_ctl/%d", first_cpu); + if (IS_ERR(cluster->core_ctl_thread)) + return PTR_ERR(cluster->core_ctl_thread); + + sched_setscheduler_nocheck(cluster->core_ctl_thread, SCHED_FIFO, + ¶m); + + cluster->inited = true; + + kobject_init(&cluster->kobj, &ktype_core_ctl); + return kobject_add(&cluster->kobj, &dev->kobj, "core_ctl"); +} + +static int __init core_ctl_init(void) +{ + struct sched_cluster *cluster; + int ret; + + cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, + "core_ctl/isolation:online", + core_ctl_isolation_online_cpu, NULL); + + cpuhp_setup_state_nocalls(CPUHP_CORE_CTL_ISOLATION_DEAD, + "core_ctl/isolation:dead", + NULL, core_ctl_isolation_dead_cpu); + + for_each_sched_cluster(cluster) { + ret = cluster_init(&cluster->cpus); + if (ret) + pr_warn("unable to create core ctl group: %d\n", ret); + } + + initialized = true; + return 0; +} + +late_initcall(core_ctl_init); diff --git a/kernel/sched/core_ctl.h b/kernel/sched/core_ctl.h new file mode 
100755 index 0000000000000000000000000000000000000000..0be55ac6a526ba8c9ad246e1b674e4f26c17728a --- /dev/null +++ b/kernel/sched/core_ctl.h @@ -0,0 +1,19 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (c) 2016, 2019-2020, The Linux Foundation. All rights reserved. + */ + +#ifndef __CORE_CTL_H +#define __CORE_CTL_H + +#ifdef CONFIG_SCHED_CORE_CTRL +void core_ctl_check(u64 wallclock); +int core_ctl_set_boost(bool boost); +#else +static inline void core_ctl_check(u64 wallclock) {} +static inline int core_ctl_set_boost(bool boost) +{ + return 0; +} +#endif +#endif diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 458d359f5991ca7977fb655da4cbb1f71b53bebc..035ed07966e0c09c1d694599181b8becdb1a1602 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -159,8 +159,12 @@ static void sugov_get_util(struct sugov_cpu *sg_cpu) struct rq *rq = cpu_rq(sg_cpu->cpu); sg_cpu->bw_dl = cpu_bw_dl(rq); +#ifdef CONFIG_SCHED_WALT + cpu_util_freq_walt(sg_cpu->cpu); +#else sg_cpu->util = effective_cpu_util(sg_cpu->cpu, util, FREQUENCY_UTIL, NULL); +#endif } /** @@ -448,7 +452,12 @@ sugov_update_shared(struct update_util_data *hook, u64 time, unsigned int flags) ignore_dl_rate_limit(sg_cpu); +#ifdef CONFIG_SCHED_WALT + if ((sugov_should_update_freq(sg_policy, time)) + && !(flags & SCHED_CPUFREQ_CONTINUE)) { +#else if (sugov_should_update_freq(sg_policy, time)) { +#endif next_f = sugov_next_freq_shared(sg_cpu, time); if (!sugov_update_next_freq(sg_policy, time, next_f)) diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index af7952f12e6cf19ac9b953fbbf54df823372f497..46b5a71f49b271a92d92674d33e769b1a061e2c4 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -6,6 +6,7 @@ #ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE #include #endif +#include "walt.h" #ifdef CONFIG_IRQ_TIME_ACCOUNTING @@ -56,11 +57,18 @@ void irqtime_account_irq(struct task_struct *curr, unsigned int offset) unsigned int pc; s64 delta; int cpu; +#ifdef CONFIG_SCHED_WALT + u64 wallclock; + bool account = true; +#endif if (!sched_clock_irqtime) return; cpu = smp_processor_id(); +#ifdef CONFIG_SCHED_WALT + wallclock = sched_clock_cpu(cpu); +#endif delta = sched_clock_cpu(cpu) - irqtime->irq_start_time; irqtime->irq_start_time += delta; pc = irq_count() - offset; @@ -75,6 +83,13 @@ void irqtime_account_irq(struct task_struct *curr, unsigned int offset) irqtime_account_delta(irqtime, delta, CPUTIME_IRQ); else if ((pc & SOFTIRQ_OFFSET) && curr != this_cpu_ksoftirqd()) irqtime_account_delta(irqtime, delta, CPUTIME_SOFTIRQ); +#ifdef CONFIG_SCHED_WALT + else + account = false; + + if (account) + sched_account_irqtime(cpu, curr, delta, wallclock); +#endif } static u64 irqtime_tick_accounted(u64 maxtime) diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index d78f2e8769fb4ca428ddc6b407fcdef92dd13eb8..f1a58834527c44af0b81d89d2d792f2b7b5014ca 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -17,6 +17,7 @@ */ #include +#include "walt.h" /* * Default limits for DL period; on the top end we guard against small util @@ -1502,6 +1503,7 @@ void inc_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) WARN_ON(!dl_prio(prio)); dl_rq->dl_nr_running++; add_nr_running(rq_of_dl_rq(dl_rq), 1); + walt_inc_cumulative_runnable_avg(rq_of_dl_rq(dl_rq), dl_task_of(dl_se)); inc_dl_deadline(dl_rq, deadline); inc_dl_migration(dl_se, dl_rq); @@ -1516,6 +1518,7 @@ void dec_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) 
WARN_ON(!dl_rq->dl_nr_running); dl_rq->dl_nr_running--; sub_nr_running(rq_of_dl_rq(dl_rq), 1); + walt_dec_cumulative_runnable_avg(rq_of_dl_rq(dl_rq), dl_task_of(dl_se)); dec_dl_deadline(dl_rq, dl_se->deadline); dec_dl_migration(dl_se, dl_rq); @@ -2752,6 +2755,9 @@ DEFINE_SCHED_CLASS(dl) = { #ifdef CONFIG_SCHED_CORE .task_is_throttled = task_is_throttled_dl, #endif +#ifdef CONFIG_SCHED_WALT + .fixup_walt_sched_stats = fixup_walt_sched_stats_common, +#endif }; /* Used for dl_bw check and update, used under sched_rt_handler()::mutex */ diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 83932b92dbb392d4e211b6af24292e84f3e1aa89..4a60da9af1ac59648a4e58ac6cd30c22c0df3fca 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -793,6 +793,17 @@ do { \ SEQ_printf(m, " .%-30s: %ld\n", "curr->pid", (long)(task_pid_nr(rq->curr))); PN(clock); PN(clock_task); +#ifdef CONFIG_SCHED_WALT + P(cluster->load_scale_factor); + P(cluster->capacity); + P(cluster->max_possible_capacity); + P(cluster->efficiency); + P(cluster->cur_freq); + P(cluster->max_freq); + P(cluster->exec_scale_factor); + SEQ_printf(m, " .%-30s: %llu\n", "walt_stats.cumulative_runnable_avg", + rq->walt_stats.cumulative_runnable_avg_scaled); +#endif #undef P #undef PN @@ -867,6 +878,12 @@ static void sched_debug_header(struct seq_file *m) PN(sysctl_sched_base_slice); P(sysctl_sched_child_runs_first); P(sysctl_sched_features); +#ifdef CONFIG_SCHED_WALT + P(sched_init_task_load_windows); + P(min_capacity); + P(max_capacity); + P(sched_ravg_window); +#endif #undef PN #undef P @@ -1040,6 +1057,9 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns, P_SCHEDSTAT(nr_wakeups_affine_attempts); P_SCHEDSTAT(nr_wakeups_passive); P_SCHEDSTAT(nr_wakeups_idle); +#ifdef CONFIG_SCHED_WALT + P(ravg.demand); +#endif avg_atom = p->se.sum_exec_runtime; if (nr_switches) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 566a4708cfd2751d3740e4710040b4fb19adc178..f722e118ea2bfc39878cd0aab0ece77b00d21f94 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -56,6 +56,34 @@ #include "sched.h" #include "stats.h" #include "autogroup.h" +#include "walt.h" + +#ifdef CONFIG_SCHED_WALT +static void walt_fixup_sched_stats_fair(struct rq *rq, struct task_struct *p, + u16 updated_demand_scaled); +#endif + +#if defined(CONFIG_SCHED_WALT) && defined(CONFIG_CFS_BANDWIDTH) +static void walt_init_cfs_rq_stats(struct cfs_rq *cfs_rq); +static void walt_inc_cfs_rq_stats(struct cfs_rq *cfs_rq, + struct task_struct *p); +static void walt_dec_cfs_rq_stats(struct cfs_rq *cfs_rq, + struct task_struct *p); +static void walt_inc_throttled_cfs_rq_stats(struct walt_sched_stats *stats, + struct cfs_rq *cfs_rq); +static void walt_dec_throttled_cfs_rq_stats(struct walt_sched_stats *stats, + struct cfs_rq *cfs_rq); +#else +static inline void walt_init_cfs_rq_stats(struct cfs_rq *cfs_rq) {} +static inline void +walt_inc_cfs_rq_stats(struct cfs_rq *cfs_rq, struct task_struct *p) {} +static inline void +walt_dec_cfs_rq_stats(struct cfs_rq *cfs_rq, struct task_struct *p) {} + +#define walt_inc_throttled_cfs_rq_stats(...) +#define walt_dec_throttled_cfs_rq_stats(...) 
+ +#endif /* * Targeted preemption latency for CPU-bound tasks: @@ -4753,6 +4781,10 @@ static int newidle_balance(struct rq *this_rq, struct rq_flags *rf); static inline unsigned long task_util(struct task_struct *p) { +#ifdef CONFIG_SCHED_WALT + if (likely(!walt_disabled && sysctl_sched_use_walt_task_util)) + return p->ravg.demand_scaled; +#endif return READ_ONCE(p->se.avg.util_avg); } @@ -4765,6 +4797,10 @@ static inline unsigned long _task_util_est(struct task_struct *p) static inline unsigned long task_util_est(struct task_struct *p) { +#ifdef CONFIG_SCHED_WALT + if (likely(!walt_disabled && sysctl_sched_use_walt_task_util)) + return p->ravg.demand_scaled; +#endif return max(task_util(p), _task_util_est(p)); } @@ -5758,10 +5794,12 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq) qcfs_rq->h_nr_running -= task_delta; qcfs_rq->idle_h_nr_running -= idle_task_delta; + walt_dec_throttled_cfs_rq_stats(&qcfs_rq->walt_stats, cfs_rq); } /* At this point se is NULL and we are at root level*/ sub_nr_running(rq, task_delta); + walt_dec_throttled_cfs_rq_stats(&rq->walt_stats, cfs_rq); done: /* @@ -5781,6 +5819,7 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); struct sched_entity *se; long task_delta, idle_task_delta; + struct cfs_rq *tcfs_rq __maybe_unused = cfs_rq; se = cfs_rq->tg->se[cpu_of(rq)]; @@ -5827,6 +5866,7 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) qcfs_rq->h_nr_running += task_delta; qcfs_rq->idle_h_nr_running += idle_task_delta; + walt_inc_throttled_cfs_rq_stats(&cfs_rq->walt_stats, tcfs_rq); /* end evaluation on encountering a throttled cfs_rq */ if (cfs_rq_throttled(qcfs_rq)) @@ -5844,6 +5884,7 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) qcfs_rq->h_nr_running += task_delta; qcfs_rq->idle_h_nr_running += idle_task_delta; + walt_inc_throttled_cfs_rq_stats(&cfs_rq->walt_stats, tcfs_rq); /* end evaluation on encountering a throttled cfs_rq */ if (cfs_rq_throttled(qcfs_rq)) @@ -5852,6 +5893,7 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) /* At this point se is NULL and we are at root level*/ add_nr_running(rq, task_delta); + walt_inc_throttled_cfs_rq_stats(&rq->walt_stats, tcfs_rq); unthrottle_throttle: assert_list_leaf_cfs_rq(rq); @@ -6336,6 +6378,7 @@ static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) #ifdef CONFIG_SMP INIT_LIST_HEAD(&cfs_rq->throttled_csd_list); #endif + walt_init_cfs_rq_stats(cfs_rq); } void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b) @@ -6698,7 +6741,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) cfs_rq->h_nr_running++; cfs_rq->idle_h_nr_running += idle_h_nr_running; - + walt_inc_cfs_rq_stats(cfs_rq, p); if (cfs_rq_is_idle(cfs_rq)) idle_h_nr_running = 1; @@ -6718,7 +6761,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) cfs_rq->h_nr_running++; cfs_rq->idle_h_nr_running += idle_h_nr_running; - + walt_inc_cfs_rq_stats(cfs_rq, p); if (cfs_rq_is_idle(cfs_rq)) idle_h_nr_running = 1; @@ -6729,7 +6772,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) /* At this point se is NULL and we are at root level*/ add_nr_running(rq, 1); - + inc_rq_walt_stats(rq, p); /* * Since new tasks are assigned an initial util_avg equal to * half of the spare capacity of their CPU, tiny tasks have the @@ -6781,7 +6824,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) cfs_rq->h_nr_running--; cfs_rq->idle_h_nr_running -= idle_h_nr_running; - + walt_dec_cfs_rq_stats(cfs_rq, p); if (cfs_rq_is_idle(cfs_rq)) 
idle_h_nr_running = 1; @@ -6813,7 +6856,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) cfs_rq->h_nr_running--; cfs_rq->idle_h_nr_running -= idle_h_nr_running; - + walt_dec_cfs_rq_stats(cfs_rq, p); if (cfs_rq_is_idle(cfs_rq)) idle_h_nr_running = 1; @@ -6825,6 +6868,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) /* At this point se is NULL and we are at root level*/ sub_nr_running(rq, 1); + dec_rq_walt_stats(rq, p); /* balance early to pull high priority tasks */ if (unlikely(!was_sched_idle && sched_idle_rq(rq))) @@ -7641,6 +7685,16 @@ cpu_util(int cpu, struct task_struct *p, int dst_cpu, int boost) unsigned long util = READ_ONCE(cfs_rq->avg.util_avg); unsigned long runnable; +#ifdef CONFIG_SCHED_WALT + if (likely(!walt_disabled && sysctl_sched_use_walt_cpu_util)) { + u64 walt_cpu_util = + cpu_rq(cpu)->walt_stats.cumulative_runnable_avg_scaled; + + return min_t(unsigned long, walt_cpu_util, + capacity_orig_of(cpu)); + } +#endif + if (boost) { runnable = READ_ONCE(cfs_rq->avg.runnable_avg); util = max(util, runnable); @@ -7724,11 +7778,29 @@ unsigned long cpu_util_cfs_boost(int cpu) */ static unsigned long cpu_util_without(int cpu, struct task_struct *p) { + unsigned int util; +#ifdef CONFIG_SCHED_WALT + /* + * WALT does not decay idle tasks in the same manner + * as PELT, so it makes little sense to subtract task + * utilization from cpu utilization. Instead just use + * cpu_util for this case. + */ + if (likely(!walt_disabled && sysctl_sched_use_walt_cpu_util) && + p->__state == TASK_WAKING) + return cpu_util_cfs(cpu); +#endif /* Task has no contribution or is new */ if (cpu != task_cpu(p) || !READ_ONCE(p->se.avg.last_update_time)) p = NULL; return cpu_util(cpu, p, -1, 0); +#ifdef CONFIG_SCHED_WALT + if (likely(!walt_disabled && sysctl_sched_use_walt_cpu_util)) { + util = max_t(long, cpu_util_cfs(cpu) - task_util(p), 0); + return min_t(unsigned long, util, capacity_orig_of(cpu)); + } +#endif } /* @@ -7804,6 +7876,18 @@ static inline void eenv_pd_busy_time(struct energy_env *eenv, eenv->pd_busy_time = min(eenv->pd_cap, busy_time); } +/* + * Returns the current capacity of cpu after applying both + * cpu and freq scaling. + */ +unsigned long capacity_curr_of(int cpu) +{ + unsigned long max_cap = cpu_rq(cpu)->cpu_capacity_orig; + unsigned long scale_freq = arch_scale_freq_capacity(cpu); + + return cap_scale(max_cap, scale_freq); +} + /* * Compute the maximum utilization for compute_energy() when the task @p * is placed on the cpu @dst_cpu. 
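The capacity_curr_of() helper added above simply rescales a CPU's original capacity by its current frequency scale. A minimal standalone sketch of that arithmetic, assuming the kernel's usual cap_scale() definition of (cap * scale) >> SCHED_CAPACITY_SHIFT and purely illustrative capacity/frequency values:

#include <stdio.h>

#define SCHED_CAPACITY_SHIFT	10
#define cap_scale(cap, scale)	(((cap) * (scale)) >> SCHED_CAPACITY_SHIFT)

int main(void)
{
	unsigned long max_cap = 1024;	/* illustrative cpu_capacity_orig of a big core */
	unsigned long scale_freq = 512;	/* illustrative arch_scale_freq_capacity(): ~50% of fmax */

	/* 1024 * 512 >> 10 = 512: current capacity after frequency scaling */
	printf("capacity_curr = %lu\n", cap_scale(max_cap, scale_freq));
	return 0;
}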
@@ -9044,7 +9128,15 @@ static void detach_task(struct task_struct *p, struct lb_env *env) lockdep_assert_rq_held(env->src_rq); deactivate_task(env->src_rq, p, DEQUEUE_NOCLOCK); +#ifdef CONFIG_SCHED_WALT + double_lock_balance(env->src_rq, env->dst_rq); + if (!(env->src_rq->clock_update_flags & RQCF_UPDATED)) + update_rq_clock(env->src_rq); +#endif set_task_cpu(p, env->dst_cpu); +#ifdef CONFIG_SCHED_WALT + double_unlock_balance(env->src_rq, env->dst_rq); +#endif } /* @@ -13212,6 +13304,10 @@ DEFINE_SCHED_CLASS(fair) = { #ifdef CONFIG_UCLAMP_TASK .uclamp_enabled = 1, #endif +#ifdef CONFIG_SCHED_WALT + .fixup_walt_sched_stats = walt_fixup_sched_stats_fair, +#endif + }; #ifdef CONFIG_SCHED_DEBUG @@ -13277,3 +13373,91 @@ __init void init_sched_fair_class(void) #endif /* SMP */ } + +/* WALT sched implementation begins here */ +#ifdef CONFIG_SCHED_WALT + +#ifdef CONFIG_CFS_BANDWIDTH + +static void walt_init_cfs_rq_stats(struct cfs_rq *cfs_rq) +{ + cfs_rq->walt_stats.cumulative_runnable_avg_scaled = 0; +} + +static void walt_inc_cfs_rq_stats(struct cfs_rq *cfs_rq, struct task_struct *p) +{ + fixup_cumulative_runnable_avg(&cfs_rq->walt_stats, + p->ravg.demand_scaled); +} + +static void walt_dec_cfs_rq_stats(struct cfs_rq *cfs_rq, struct task_struct *p) +{ + fixup_cumulative_runnable_avg(&cfs_rq->walt_stats, + -(s64)p->ravg.demand_scaled); +} + +static void walt_inc_throttled_cfs_rq_stats(struct walt_sched_stats *stats, + struct cfs_rq *tcfs_rq) +{ + struct rq *rq = rq_of(tcfs_rq); + + fixup_cumulative_runnable_avg(stats, + tcfs_rq->walt_stats.cumulative_runnable_avg_scaled); + + if (stats == &rq->walt_stats) + walt_fixup_cum_window_demand(rq, + tcfs_rq->walt_stats.cumulative_runnable_avg_scaled); + +} + +static void walt_dec_throttled_cfs_rq_stats(struct walt_sched_stats *stats, + struct cfs_rq *tcfs_rq) +{ + struct rq *rq = rq_of(tcfs_rq); + + fixup_cumulative_runnable_avg(stats, + -tcfs_rq->walt_stats.cumulative_runnable_avg_scaled); + + /* + * We remove the throttled cfs_rq's tasks's contribution from the + * cumulative window demand so that the same can be added + * unconditionally when the cfs_rq is unthrottled. 
+ */ + if (stats == &rq->walt_stats) + walt_fixup_cum_window_demand(rq, + -tcfs_rq->walt_stats.cumulative_runnable_avg_scaled); +} + +static void walt_fixup_sched_stats_fair(struct rq *rq, struct task_struct *p, + u16 updated_demand_scaled) +{ + struct cfs_rq *cfs_rq; + struct sched_entity *se = &p->se; + s64 task_load_delta = (s64)updated_demand_scaled - + p->ravg.demand_scaled; + + for_each_sched_entity(se) { + cfs_rq = cfs_rq_of(se); + + fixup_cumulative_runnable_avg(&cfs_rq->walt_stats, + task_load_delta); + if (cfs_rq_throttled(cfs_rq)) + break; + } + + /* Fix up rq->walt_stats only if we didn't find any throttled cfs_rq */ + if (!se) { + fixup_cumulative_runnable_avg(&rq->walt_stats, + task_load_delta); + walt_fixup_cum_window_demand(rq, task_load_delta); + } +} + +#else /* CONFIG_CFS_BANDWIDTH */ +static void walt_fixup_sched_stats_fair(struct rq *rq, struct task_struct *p, + u16 updated_demand_scaled) +{ + fixup_walt_sched_stats_common(rq, p, updated_demand_scaled); +} +#endif /* CONFIG_CFS_BANDWIDTH */ +#endif /* CONFIG_SCHED_WALT */ diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 4ac36eb4cdee582410b267ffd9617f51d22ec5fc..758bb5d522d11aa721cba9d7149df2ff3bfab039 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -3,7 +3,9 @@ * Real-Time Scheduling Class (mapped to the SCHED_FIFO and SCHED_RR * policies) */ +#include "sched.h" +#include "walt.h" int sched_rr_timeslice = RR_TIMESLICE; /* More than 4 hours if BW_SHIFT equals 20. */ static const u64 max_rt_runtime = MAX_BW; @@ -1547,6 +1549,7 @@ enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags) update_stats_wait_start_rt(rt_rq_of_se(rt_se), rt_se); enqueue_rt_entity(rt_se, flags); + walt_inc_cumulative_runnable_avg(rq, p); if (!task_current(rq, p) && p->nr_cpus_allowed > 1) enqueue_pushable_task(rq, p); @@ -1558,6 +1561,7 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags) update_curr_rt(rq); dequeue_rt_entity(rt_se, flags); + walt_dec_cumulative_runnable_avg(rq, p); dequeue_pushable_task(rq, p); } @@ -2744,6 +2748,9 @@ DEFINE_SCHED_CLASS(rt) = { #ifdef CONFIG_UCLAMP_TASK .uclamp_enabled = 1, #endif +#ifdef CONFIG_SCHED_WALT + .fixup_walt_sched_stats = fixup_walt_sched_stats_common, +#endif }; #ifdef CONFIG_RT_GROUP_SCHED diff --git a/kernel/sched/rtg/Kconfig b/kernel/sched/rtg/Kconfig new file mode 100755 index 0000000000000000000000000000000000000000..1cb0c4298b097e03c6860e37eea3bde664fea260 --- /dev/null +++ b/kernel/sched/rtg/Kconfig @@ -0,0 +1,40 @@ +menu "Related Thread Group" + +config SCHED_RTG + bool "Related Thread Group" + depends on SCHED_WALT + default n + help + Set related threads into a group. + +config SCHED_RTG_DEBUG + bool "Related Thread Group DebugFS" + depends on SCHED_RTG + default n + help + If set, debug node will show rtg threads + +config SCHED_RTG_CGROUP + bool "enable DEFAULT_CGROUP_COLOC RTG" + depends on SCHED_RTG + default n + help + If set, support for adding the tasks which belong to + co-located cgroup to DEFAULT_CGROUP_COLOC RTG. + +config SCHED_RTG_FRAME + bool "Frame-based Related Thread Group" + depends on SCHED_RTG + default n + help + Support frame-based related thread group scheduling. + If set, you can set the task to RTG and kernel will + statistic the load per frame. + +config SCHED_RTG_RT_THREAD_LIMIT + bool "Limit the number of RT threads in groups" + depends on SCHED_RTG_FRAME + default n + help + If set, limit the number of RT threads in frame RTG. 
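+
+# The options above form a dependency chain:
+# SCHED_WALT -> SCHED_RTG -> SCHED_RTG_FRAME -> SCHED_RTG_RT_THREAD_LIMIT,
+# with SCHED_RTG_DEBUG and SCHED_RTG_CGROUP hanging off SCHED_RTG.
+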
+endmenu diff --git a/kernel/sched/rtg/Makefile b/kernel/sched/rtg/Makefile new file mode 100755 index 0000000000000000000000000000000000000000..4d55523d1f32b8acb0404b943de3cb407d7b3832 --- /dev/null +++ b/kernel/sched/rtg/Makefile @@ -0,0 +1,3 @@ +# SPDX-License-Identifier: GPL-2.0 +obj-$(CONFIG_SCHED_RTG) += rtg.o +obj-$(CONFIG_SCHED_RTG_FRAME) += frame_rtg.o rtg_ctrl.o diff --git a/kernel/sched/rtg/frame_rtg.c b/kernel/sched/rtg/frame_rtg.c new file mode 100755 index 0000000000000000000000000000000000000000..79db645228c42352e8e49847f09d79b67fdd6bfd --- /dev/null +++ b/kernel/sched/rtg/frame_rtg.c @@ -0,0 +1,1229 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Frame-based load tracking for rt_frame and RTG + * + * Copyright (c) 2022-2023 Huawei Technologies Co., Ltd. + */ + +#include "frame_rtg.h" +#include "rtg.h" + +#include +#include +#include <../kernel/sched/sched.h> +#include + +static struct multi_frame_id_manager g_id_manager = { + .id_map = {0}, + .offset = 0, + .lock = __RW_LOCK_UNLOCKED(g_id_manager.lock) +}; + +static struct frame_info g_multi_frame_info[MULTI_FRAME_NUM]; + +static bool is_rtg_rt_task(struct task_struct *task) +{ + bool ret = false; + + if (!task) + return ret; + + ret = ((task->prio < MAX_RT_PRIO) && + (task->rtg_depth == STATIC_RTG_DEPTH)); + + return ret; +} + +#ifdef CONFIG_SCHED_RTG_RT_THREAD_LIMIT +static atomic_t g_rtg_rt_thread_num = ATOMIC_INIT(0); + +static unsigned int _get_rtg_rt_thread_num(struct related_thread_group *grp) +{ + unsigned int rtg_rt_thread_num = 0; + struct task_struct *p = NULL; + + if (list_empty(&grp->tasks)) + goto out; + + list_for_each_entry(p, &grp->tasks, grp_list) { + if (is_rtg_rt_task(p)) + ++rtg_rt_thread_num; + } + +out: + return rtg_rt_thread_num; +} + +static unsigned int get_rtg_rt_thread_num(void) +{ + struct related_thread_group *grp = NULL; + unsigned int total_rtg_rt_thread_num = 0; + unsigned long flag; + unsigned int i; + + for (i = MULTI_FRAME_ID; i < MULTI_FRAME_ID + MULTI_FRAME_NUM; i++) { + grp = lookup_related_thread_group(i); + if (grp == NULL) + continue; + raw_spin_lock_irqsave(&grp->lock, flag); + total_rtg_rt_thread_num += _get_rtg_rt_thread_num(grp); + raw_spin_unlock_irqrestore(&grp->lock, flag); + } + + return total_rtg_rt_thread_num; +} + +static void inc_rtg_rt_thread_num(void) +{ + atomic_inc(&g_rtg_rt_thread_num); +} + +static void dec_rtg_rt_thread_num(void) +{ + atomic_dec_if_positive(&g_rtg_rt_thread_num); +} + +static int test_and_read_rtg_rt_thread_num(void) +{ + if (atomic_read(&g_rtg_rt_thread_num) >= RTG_MAX_RT_THREAD_NUM) + atomic_set(&g_rtg_rt_thread_num, get_rtg_rt_thread_num()); + + return atomic_read(&g_rtg_rt_thread_num); +} + +int read_rtg_rt_thread_num(void) +{ + return atomic_read(&g_rtg_rt_thread_num); +} +#else +static inline void inc_rtg_rt_thread_num(void) { } +static inline void dec_rtg_rt_thread_num(void) { } +static inline int test_and_read_rtg_rt_thread_num(void) +{ + return 0; +} +#endif + +bool is_frame_rtg(int id) +{ + return (id >= MULTI_FRAME_ID) && + (id < (MULTI_FRAME_ID + MULTI_FRAME_NUM)); +} + +static struct related_thread_group *frame_rtg(int id) +{ + if (!is_frame_rtg(id)) + return NULL; + + return lookup_related_thread_group(id); +} + +struct frame_info *rtg_frame_info(int id) +{ + if (!is_frame_rtg(id)) + return NULL; + + return rtg_active_multi_frame_info(id); +} + +static int alloc_rtg_id(void) +{ + unsigned int id_offset; + int id; + + write_lock(&g_id_manager.lock); + id_offset = find_next_zero_bit(g_id_manager.id_map, MULTI_FRAME_NUM, + 
g_id_manager.offset); + if (id_offset >= MULTI_FRAME_NUM) { + id_offset = find_first_zero_bit(g_id_manager.id_map, + MULTI_FRAME_NUM); + if (id_offset >= MULTI_FRAME_NUM) { + write_unlock(&g_id_manager.lock); + return -EINVAL; + } + } + + set_bit(id_offset, g_id_manager.id_map); + g_id_manager.offset = id_offset; + id = id_offset + MULTI_FRAME_ID; + write_unlock(&g_id_manager.lock); + pr_debug("[FRAME_RTG] %s id_offset=%u, id=%d\n", __func__, id_offset, id); + + return id; +} + +static void free_rtg_id(int id) +{ + unsigned int id_offset = id - MULTI_FRAME_ID; + + if (id_offset >= MULTI_FRAME_NUM) { + pr_err("[FRAME_RTG] %s id_offset is invalid, id=%d, id_offset=%u.\n", + __func__, id, id_offset); + return; + } + + pr_debug("[FRAME_RTG] %s id=%d id_offset=%u\n", __func__, id, id_offset); + write_lock(&g_id_manager.lock); + clear_bit(id_offset, g_id_manager.id_map); + write_unlock(&g_id_manager.lock); +} + +int set_frame_rate(struct frame_info *frame_info, int rate) +{ + int id; + + if ((rate < MIN_FRAME_RATE) || (rate > MAX_FRAME_RATE)) { + pr_err("[FRAME_RTG]: %s invalid QOS(rate) value\n", + __func__); + return -EINVAL; + } + + if (!frame_info || !frame_info->rtg) + return -EINVAL; + + frame_info->frame_rate = (unsigned int)rate; + frame_info->frame_time = div_u64(NSEC_PER_SEC, rate); + frame_info->max_vload_time = + div_u64(frame_info->frame_time, NSEC_PER_MSEC) + + frame_info->vload_margin; + id = frame_info->rtg->id; + trace_rtg_frame_sched(id, "FRAME_QOS", rate); + trace_rtg_frame_sched(id, "FRAME_MAX_TIME", frame_info->max_vload_time); + + return 0; +} + +int alloc_multi_frame_info(void) +{ + struct frame_info *frame_info = NULL; + int id; + int i; + + id = alloc_rtg_id(); + if (id < 0) + return id; + + frame_info = rtg_frame_info(id); + if (!frame_info) { + free_rtg_id(id); + return -EINVAL; + } + + set_frame_rate(frame_info, DEFAULT_FRAME_RATE); + atomic_set(&frame_info->curr_rt_thread_num, 0); + atomic_set(&frame_info->max_rt_thread_num, DEFAULT_MAX_RT_THREAD); + for (i = 0; i < MAX_TID_NUM; i++) + atomic_set(&frame_info->thread_prio[i], 0); + + return id; +} + +void release_multi_frame_info(int id) +{ + if ((id < MULTI_FRAME_ID) || (id >= MULTI_FRAME_ID + MULTI_FRAME_NUM)) { + pr_err("[FRAME_RTG] %s frame(id=%d) not found.\n", __func__, id); + return; + } + + read_lock(&g_id_manager.lock); + if (!test_bit(id - MULTI_FRAME_ID, g_id_manager.id_map)) { + read_unlock(&g_id_manager.lock); + return; + } + read_unlock(&g_id_manager.lock); + + pr_debug("[FRAME_RTG] %s release frame(id=%d).\n", __func__, id); + free_rtg_id(id); +} + +void clear_multi_frame_info(void) +{ + write_lock(&g_id_manager.lock); + bitmap_zero(g_id_manager.id_map, MULTI_FRAME_NUM); + g_id_manager.offset = 0; + write_unlock(&g_id_manager.lock); +} + +struct frame_info *rtg_active_multi_frame_info(int id) +{ + struct frame_info *frame_info = NULL; + + if ((id < MULTI_FRAME_ID) || (id >= MULTI_FRAME_ID + MULTI_FRAME_NUM)) + return NULL; + + read_lock(&g_id_manager.lock); + if (test_bit(id - MULTI_FRAME_ID, g_id_manager.id_map)) + frame_info = &g_multi_frame_info[id - MULTI_FRAME_ID]; + read_unlock(&g_id_manager.lock); + if (!frame_info) + pr_debug("[FRAME_RTG] %s frame %d has been released\n", + __func__, id); + + return frame_info; +} + +struct frame_info *rtg_multi_frame_info(int id) +{ + if ((id < MULTI_FRAME_ID) || (id >= MULTI_FRAME_ID + MULTI_FRAME_NUM)) + return NULL; + + return &g_multi_frame_info[id - MULTI_FRAME_ID]; +} + +static void do_update_frame_task_prio(struct frame_info *frame_info, + struct 
task_struct *task, int prio) +{ + int policy = SCHED_NORMAL; + struct sched_param sp = {0}; + bool is_rt_task = (prio != NOT_RT_PRIO); + bool need_dec_flag = false; + bool need_inc_flag = false; + int err; + + trace_rtg_frame_sched(frame_info->rtg->id, "rtg_rt_thread_num", + read_rtg_rt_thread_num()); + /* change policy to RT */ + if (is_rt_task && (atomic_read(&frame_info->curr_rt_thread_num) < + atomic_read(&frame_info->max_rt_thread_num))) { + /* change policy from CFS to RT */ + if (!is_rtg_rt_task(task)) { + if (test_and_read_rtg_rt_thread_num() >= RTG_MAX_RT_THREAD_NUM) + goto out; + need_inc_flag = true; + } + /* change RT priority */ + policy = SCHED_FIFO | SCHED_RESET_ON_FORK; + sp.sched_priority = MAX_USER_RT_PRIO - 1 - prio; + atomic_inc(&frame_info->curr_rt_thread_num); + } else { + /* change policy from RT to CFS */ + if (!is_rt_task && is_rtg_rt_task(task)) + need_dec_flag = true; + } +out: + trace_rtg_frame_sched(frame_info->rtg->id, "rtg_rt_thread_num", + read_rtg_rt_thread_num()); + trace_rtg_frame_sched(frame_info->rtg->id, "curr_rt_thread_num", + atomic_read(&frame_info->curr_rt_thread_num)); + err = sched_setscheduler_nocheck(task, policy, &sp); + if (err == 0) { + if (need_dec_flag) + dec_rtg_rt_thread_num(); + else if (need_inc_flag) + inc_rtg_rt_thread_num(); + } +} + +int list_rtg_group(struct rtg_info *rs_data) +{ + int i; + int num = 0; + + read_lock(&g_id_manager.lock); + for (i = MULTI_FRAME_ID; i < MULTI_FRAME_ID + MULTI_FRAME_NUM; i++) { + if (test_bit(i - MULTI_FRAME_ID, g_id_manager.id_map)) { + rs_data->rtgs[num] = i; + num++; + } + } + read_unlock(&g_id_manager.lock); + rs_data->rtg_num = num; + + return num; +} + +int search_rtg(int pid) +{ + struct rtg_info grp_info; + struct frame_info *frame_info = NULL; + int i = 0; + int j = 0; + + grp_info.rtg_num = 0; + read_lock(&g_id_manager.lock); + for (i = MULTI_FRAME_ID; i < MULTI_FRAME_ID + MULTI_FRAME_NUM; i++) { + if (test_bit(i - MULTI_FRAME_ID, g_id_manager.id_map)) { + grp_info.rtgs[grp_info.rtg_num] = i; + grp_info.rtg_num++; + } + } + read_unlock(&g_id_manager.lock); + for (i = 0; i < grp_info.rtg_num; i++) { + frame_info = lookup_frame_info_by_grp_id(grp_info.rtgs[i]); + if (!frame_info) { + pr_err("[FRAME_RTG] unexpected grp %d find error.", i); + return -EINVAL; + } + + for (j = 0; j < frame_info->thread_num; j++) { + if (frame_info->thread[j] && frame_info->thread[j]->pid == pid) + return grp_info.rtgs[i]; + } + } + + return 0; +} + +static void update_frame_task_prio(struct frame_info *frame_info, int prio) +{ + int i; + struct task_struct *thread = NULL; + + /* reset curr_rt_thread_num */ + atomic_set(&frame_info->curr_rt_thread_num, 0); + + for (i = 0; i < MAX_TID_NUM; i++) { + thread = frame_info->thread[i]; + if (thread) + do_update_frame_task_prio(frame_info, thread, prio); + } +} + +void set_frame_prio(struct frame_info *frame_info, int prio) +{ + if (!frame_info) + return; + + mutex_lock(&frame_info->lock); + if (frame_info->prio == prio) + goto out; + + update_frame_task_prio(frame_info, prio); + frame_info->prio = prio; +out: + mutex_unlock(&frame_info->lock); +} + +static int do_set_rtg_sched(struct task_struct *task, bool is_rtg, + int grp_id, int prio) +{ + int err; + int policy = SCHED_NORMAL; + int grpid = DEFAULT_RTG_GRP_ID; + bool is_rt_task = (prio != NOT_RT_PRIO); + struct sched_param sp = {0}; + + if (is_rtg) { + if (is_rt_task) { + if (test_and_read_rtg_rt_thread_num() >= RTG_MAX_RT_THREAD_NUM) + // rtg_rt_thread_num is inavailable, set policy to CFS + goto skip_setpolicy; + 
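+			/*
+			 * Frame priorities map onto SCHED_FIFO in reverse:
+			 * prio 0 becomes the highest user RT priority
+			 * (MAX_USER_RT_PRIO - 1), and larger prio values map to
+			 * progressively lower RT priorities.
+			 */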
policy = SCHED_FIFO | SCHED_RESET_ON_FORK; + sp.sched_priority = MAX_USER_RT_PRIO - 1 - prio; + } +skip_setpolicy: + grpid = grp_id; + } + err = sched_setscheduler_nocheck(task, policy, &sp); + if (err < 0) { + pr_err("[FRAME_RTG]: %s task:%d setscheduler err:%d\n", + __func__, task->pid, err); + return err; + } + err = sched_set_group_id(task, grpid); + if (err < 0) { + pr_err("[FRAME_RTG]: %s task:%d set_group_id err:%d\n", + __func__, task->pid, err); + if (is_rtg) { + policy = SCHED_NORMAL; + sp.sched_priority = 0; + sched_setscheduler_nocheck(task, policy, &sp); + } + } + if (err == 0) { + if (is_rtg) { + if (policy != SCHED_NORMAL) + inc_rtg_rt_thread_num(); + } else { + dec_rtg_rt_thread_num(); + } + } + + return err; +} + +static int set_rtg_sched(struct task_struct *task, bool is_rtg, + int grp_id, int prio) +{ + int err = -1; + bool is_rt_task = (prio != NOT_RT_PRIO); + + if (!task) + return err; + + if (is_rt_task && is_rtg && ((prio < 0) || + (prio > MAX_USER_RT_PRIO - 1))) + return err; + /* + * original logic deny the non-cfs task st rt. + * add !fair_policy(task->policy) if needed + * + * if CONFIG_HW_FUTEX_PI is set, task->prio and task->sched_class + * may be modified by rtmutex. So we use task->policy instead. + */ + if (is_rtg && task->flags & PF_EXITING) + return err; + + if (in_interrupt()) { + pr_err("[FRAME_RTG]: %s is in interrupt\n", __func__); + return err; + } + + return do_set_rtg_sched(task, is_rtg, grp_id, prio); +} + +static bool set_frame_rtg_thread(int grp_id, struct task_struct *task, + bool is_rtg, int prio) +{ + int depth; + + if (!task) + return false; + depth = task->rtg_depth; + if (is_rtg) + task->rtg_depth = STATIC_RTG_DEPTH; + else + task->rtg_depth = 0; + + if (set_rtg_sched(task, is_rtg, grp_id, prio) < 0) { + task->rtg_depth = depth; + return false; + } + + return true; +} + +struct task_struct *update_frame_thread(struct frame_info *frame_info, + int old_prio, int prio, int pid, + struct task_struct *old_task) +{ + struct task_struct *task = NULL; + bool is_rt_task = (prio != NOT_RT_PRIO); + int new_prio = prio; + bool update_ret = false; + + if (pid > 0) { + if (old_task && (pid == old_task->pid) && (old_prio == new_prio)) { + if (is_rt_task && atomic_read(&frame_info->curr_rt_thread_num) < + atomic_read(&frame_info->max_rt_thread_num) && + (atomic_read(&frame_info->frame_sched_state) == 1)) + atomic_inc(&frame_info->curr_rt_thread_num); + trace_rtg_frame_sched(frame_info->rtg->id, "curr_rt_thread_num", + atomic_read(&frame_info->curr_rt_thread_num)); + return old_task; + } + rcu_read_lock(); + task = find_task_by_vpid(pid); + if (task) + get_task_struct(task); + rcu_read_unlock(); + } + trace_rtg_frame_sched(frame_info->rtg->id, "FRAME_SCHED_ENABLE", + atomic_read(&frame_info->frame_sched_state)); + if (atomic_read(&frame_info->frame_sched_state) == 1) { + if (task && is_rt_task) { + if (atomic_read(&frame_info->curr_rt_thread_num) < + atomic_read(&frame_info->max_rt_thread_num)) + atomic_inc(&frame_info->curr_rt_thread_num); + else + new_prio = NOT_RT_PRIO; + } + trace_rtg_frame_sched(frame_info->rtg->id, "curr_rt_thread_num", + atomic_read(&frame_info->curr_rt_thread_num)); + trace_rtg_frame_sched(frame_info->rtg->id, "rtg_rt_thread_num", + read_rtg_rt_thread_num()); + + set_frame_rtg_thread(frame_info->rtg->id, old_task, false, NOT_RT_PRIO); + update_ret = set_frame_rtg_thread(frame_info->rtg->id, task, true, new_prio); + } + if (old_task) + put_task_struct(old_task); + if (!update_ret) + return NULL; + + return task; +} + +void 
update_frame_thread_info(struct frame_info *frame_info, + struct frame_thread_info *frame_thread_info) +{ + int i; + int old_prio; + int prio; + int thread_num; + int real_thread; + + if (!frame_info || !frame_thread_info || + frame_thread_info->thread_num < 0) + return; + + prio = frame_thread_info->prio; + thread_num = frame_thread_info->thread_num; + if (thread_num > MAX_TID_NUM) + thread_num = MAX_TID_NUM; + + // reset curr_rt_thread_num + atomic_set(&frame_info->curr_rt_thread_num, 0); + mutex_lock(&frame_info->lock); + old_prio = frame_info->prio; + real_thread = 0; + for (i = 0; i < thread_num; i++) { + atomic_set(&frame_info->thread_prio[i], 0); + frame_info->thread[i] = update_frame_thread(frame_info, old_prio, prio, + frame_thread_info->thread[i], + frame_info->thread[i]); + if (frame_info->thread[i] && (frame_thread_info->thread[i] > 0)) + real_thread++; + } + frame_info->prio = prio; + frame_info->thread_num = real_thread; + mutex_unlock(&frame_info->lock); +} + +static void do_set_frame_sched_state(struct frame_info *frame_info, + struct task_struct *task, + bool enable, int prio) +{ + int new_prio = prio; + bool is_rt_task = (prio != NOT_RT_PRIO); + + if (enable && is_rt_task) { + if (atomic_read(&frame_info->curr_rt_thread_num) < + atomic_read(&frame_info->max_rt_thread_num)) + atomic_inc(&frame_info->curr_rt_thread_num); + else + new_prio = NOT_RT_PRIO; + } + trace_rtg_frame_sched(frame_info->rtg->id, "curr_rt_thread_num", + atomic_read(&frame_info->curr_rt_thread_num)); + trace_rtg_frame_sched(frame_info->rtg->id, "rtg_rt_thread_num", + read_rtg_rt_thread_num()); + set_frame_rtg_thread(frame_info->rtg->id, task, enable, new_prio); +} + +void set_frame_sched_state(struct frame_info *frame_info, bool enable) +{ + atomic_t *frame_sched_state = NULL; + int prio; + int i; + + if (!frame_info || !frame_info->rtg) + return; + + frame_sched_state = &(frame_info->frame_sched_state); + if (enable) { + if (atomic_read(frame_sched_state) == 1) + return; + atomic_set(frame_sched_state, 1); + trace_rtg_frame_sched(frame_info->rtg->id, "FRAME_SCHED_ENABLE", 1); + + frame_info->prev_fake_load_util = 0; + frame_info->prev_frame_load_util = 0; + frame_info->frame_vload = 0; + frame_info_rtg_load(frame_info)->curr_window_load = 0; + } else { + if (atomic_read(frame_sched_state) == 0) + return; + atomic_set(frame_sched_state, 0); + trace_rtg_frame_sched(frame_info->rtg->id, "FRAME_SCHED_ENABLE", 0); + + (void)sched_set_group_normalized_util(frame_info->rtg->id, + 0, RTG_FREQ_NORMAL_UPDATE); + trace_rtg_frame_sched(frame_info->rtg->id, "preferred_cluster", + INVALID_PREFERRED_CLUSTER); + frame_info->status = FRAME_END; + } + + /* reset curr_rt_thread_num */ + atomic_set(&frame_info->curr_rt_thread_num, 0); + mutex_lock(&frame_info->lock); + for (i = 0; i < MAX_TID_NUM; i++) { + if (frame_info->thread[i]) { + prio = atomic_read(&frame_info->thread_prio[i]); + do_set_frame_sched_state(frame_info, frame_info->thread[i], + enable, prio); + } + } + mutex_unlock(&frame_info->lock); + + trace_rtg_frame_sched(frame_info->rtg->id, "FRAME_STATUS", + frame_info->status); + trace_rtg_frame_sched(frame_info->rtg->id, "frame_status", + frame_info->status); +} + +static inline bool check_frame_util_invalid(const struct frame_info *frame_info, + u64 timeline) +{ + return ((frame_info_rtg(frame_info)->util_invalid_interval <= timeline) && + (frame_info_rtg_load(frame_info)->curr_window_exec * FRAME_UTIL_INVALID_FACTOR + <= timeline)); +} + +static u64 calc_prev_fake_load_util(const struct frame_info 
*frame_info) +{ + u64 prev_frame_load = frame_info->prev_frame_load; + u64 prev_frame_time = max_t(unsigned long, frame_info->prev_frame_time, + frame_info->frame_time); + u64 frame_util = 0; + + if (prev_frame_time > 0) + frame_util = div_u64((prev_frame_load << SCHED_CAPACITY_SHIFT), + prev_frame_time); + frame_util = clamp_t(unsigned long, frame_util, + frame_info->prev_min_util, + frame_info->prev_max_util); + + return frame_util; +} + +static u64 calc_prev_frame_load_util(const struct frame_info *frame_info) +{ + u64 prev_frame_load = frame_info->prev_frame_load; + u64 frame_time = frame_info->frame_time; + u64 frame_util = 0; + + if (prev_frame_load >= frame_time) + frame_util = FRAME_MAX_LOAD; + else + frame_util = div_u64((prev_frame_load << SCHED_CAPACITY_SHIFT), + frame_info->frame_time); + frame_util = clamp_t(unsigned long, frame_util, + frame_info->prev_min_util, + frame_info->prev_max_util); + + return frame_util; +} + +/* last frame load tracking */ +static void update_frame_prev_load(struct frame_info *frame_info, bool fake) +{ + /* last frame load tracking */ + frame_info->prev_frame_exec = + frame_info_rtg_load(frame_info)->prev_window_exec; + frame_info->prev_frame_time = + frame_info_rtg(frame_info)->prev_window_time; + frame_info->prev_frame_load = + frame_info_rtg_load(frame_info)->prev_window_load; + + if (fake) + frame_info->prev_fake_load_util = + calc_prev_fake_load_util(frame_info); + else + frame_info->prev_frame_load_util = + calc_prev_frame_load_util(frame_info); +} + +static void do_frame_end(struct frame_info *frame_info, bool fake) +{ + unsigned long prev_util; + int id = frame_info->rtg->id; + + frame_info->status = FRAME_END; + trace_rtg_frame_sched(id, "frame_status", frame_info->status); + + /* last frame load tracking */ + update_frame_prev_load(frame_info, fake); + + /* reset frame_info */ + frame_info->frame_vload = 0; + + /* reset frame_min_util */ + frame_info->frame_min_util = 0; + + if (fake) + prev_util = frame_info->prev_fake_load_util; + else + prev_util = frame_info->prev_frame_load_util; + + frame_info->frame_util = clamp_t(unsigned long, prev_util, + frame_info->frame_min_util, + frame_info->frame_max_util); + + trace_rtg_frame_sched(id, "frame_last_task_time", + frame_info->prev_frame_exec); + trace_rtg_frame_sched(id, "frame_last_time", frame_info->prev_frame_time); + trace_rtg_frame_sched(id, "frame_last_load", frame_info->prev_frame_load); + trace_rtg_frame_sched(id, "frame_last_load_util", + frame_info->prev_frame_load_util); + trace_rtg_frame_sched(id, "frame_util", frame_info->frame_util); + trace_rtg_frame_sched(id, "frame_vload", frame_info->frame_vload); +} + +/* + * frame_load : calculate frame load using exec util + */ +static inline u64 calc_frame_exec(const struct frame_info *frame_info) +{ + if (frame_info->frame_time > 0) + return div_u64((frame_info_rtg_load(frame_info)->curr_window_exec << + SCHED_CAPACITY_SHIFT), frame_info->frame_time); + else + return 0; +} + +/* + * real_util: + * max(last_util, virtual_util, boost_util, phase_util, frame_min_util) + */ +static u64 calc_frame_util(const struct frame_info *frame_info, bool fake) +{ + unsigned long load_util; + + if (fake) + load_util = frame_info->prev_fake_load_util; + else + load_util = frame_info->prev_frame_load_util; + + load_util = max_t(unsigned long, load_util, frame_info->frame_vload); + load_util = clamp_t(unsigned long, load_util, + frame_info->frame_min_util, + frame_info->frame_max_util); + + return load_util; +} + +/* + * frame_vload [0~1024] + * vtime: 
now - timestamp + * max_time: frame_info->frame_time + vload_margin + * load = F(vtime) + * = vtime ^ 2 - vtime * max_time + FRAME_MAX_VLOAD * vtime / max_time; + * = vtime * (vtime + FRAME_MAX_VLOAD / max_time - max_time); + * [0, 0] -=> [max_time, FRAME_MAX_VLOAD] + * + */ +static u64 calc_frame_vload(const struct frame_info *frame_info, u64 timeline) +{ + u64 vload; + int vtime = div_u64(timeline, NSEC_PER_MSEC); + int max_time = frame_info->max_vload_time; + int factor; + + if ((max_time <= 0) || (vtime > max_time)) + return FRAME_MAX_VLOAD; + + factor = vtime + FRAME_MAX_VLOAD / max_time; + /* margin maybe negative */ + if ((vtime <= 0) || (factor <= max_time)) + return 0; + + vload = (u64)vtime * (u64)(factor - max_time); + + return vload; +} + +static int update_frame_info_tick_inner(int id, struct frame_info *frame_info, + u64 timeline) +{ + switch (frame_info->status) { + case FRAME_INVALID: + case FRAME_END: + if (timeline >= frame_info->frame_time) { + /* + * fake FRAME_END here to rollover frame_window. + */ + sched_set_group_window_rollover(id); + do_frame_end(frame_info, true); + } else { + frame_info->frame_vload = calc_frame_exec(frame_info); + frame_info->frame_util = + calc_frame_util(frame_info, true); + } + + /* when not in boost, start tick timer */ + break; + case FRAME_START: + /* check frame_util invalid */ + if (!check_frame_util_invalid(frame_info, timeline)) { + /* frame_vload statistic */ + frame_info->frame_vload = calc_frame_vload(frame_info, timeline); + /* frame_util statistic */ + frame_info->frame_util = + calc_frame_util(frame_info, false); + } else { + frame_info->status = FRAME_INVALID; + trace_rtg_frame_sched(id, "FRAME_STATUS", + frame_info->status); + trace_rtg_frame_sched(id, "frame_status", + frame_info->status); + + /* + * trigger FRAME_END to rollover frame_window, + * we treat FRAME_INVALID as FRAME_END. 
+ */ + sched_set_group_window_rollover(id); + do_frame_end(frame_info, false); + } + break; + default: + return -EINVAL; + } + + return 0; +} + +static inline struct frame_info *rtg_frame_info_inner( + const struct related_thread_group *grp) +{ + return (struct frame_info *)grp->private_data; +} + +static inline void frame_boost(struct frame_info *frame_info) +{ + if (frame_info->frame_util < frame_info->frame_boost_min_util) + frame_info->frame_util = frame_info->frame_boost_min_util; +} + +/* + * update CPUFREQ and PLACEMENT when frame task running (in tick) and migration + */ +static void update_frame_info_tick(struct related_thread_group *grp) +{ + u64 window_start; + u64 wallclock; + u64 timeline; + struct frame_info *frame_info = NULL; + int id = grp->id; + + rcu_read_lock(); + frame_info = rtg_frame_info_inner(grp); + window_start = grp->window_start; + rcu_read_unlock(); + if (unlikely(!frame_info)) + return; + + if (atomic_read(&frame_info->frame_sched_state) == 0) + return; + trace_rtg_frame_sched(id, "frame_status", frame_info->status); + + wallclock = ktime_get_ns(); + timeline = wallclock - window_start; + + trace_rtg_frame_sched(id, "update_curr_pid", current->pid); + trace_rtg_frame_sched(id, "frame_timeline", div_u64(timeline, NSEC_PER_MSEC)); + + if (update_frame_info_tick_inner(grp->id, frame_info, timeline) == -EINVAL) + return; + + frame_boost(frame_info); + trace_rtg_frame_sched(id, "frame_vload", frame_info->frame_vload); + trace_rtg_frame_sched(id, "frame_util", frame_info->frame_util); + + sched_set_group_normalized_util(grp->id, + frame_info->frame_util, RTG_FREQ_NORMAL_UPDATE); + + if (grp->preferred_cluster) + trace_rtg_frame_sched(id, "preferred_cluster", + grp->preferred_cluster->id); +} + +const struct rtg_class frame_rtg_class = { + .sched_update_rtg_tick = update_frame_info_tick, +}; + +int set_frame_margin(struct frame_info *frame_info, int margin) +{ + int id; + + if ((margin < MIN_VLOAD_MARGIN) || (margin > MAX_VLOAD_MARGIN)) { + pr_err("[FRAME_RTG]: %s invalid MARGIN value\n", + __func__); + return -EINVAL; + } + + if (!frame_info || !frame_info->rtg) + return -EINVAL; + + frame_info->vload_margin = margin; + frame_info->max_vload_time = + div_u64(frame_info->frame_time, NSEC_PER_MSEC) + + frame_info->vload_margin; + id = frame_info->rtg->id; + trace_rtg_frame_sched(id, "FRAME_MARGIN", -margin); + trace_rtg_frame_sched(id, "FRAME_MAX_TIME", frame_info->max_vload_time); + + return 0; +} + +static void set_frame_start(struct frame_info *frame_info) +{ + int id = frame_info->rtg->id; + + if (likely(frame_info->status == FRAME_START)) { + /* + * START -=> START -=> ...... 
+ * FRMAE_START is + * the end of last frame + * the start of the current frame + */ + update_frame_prev_load(frame_info, false); + } else if ((frame_info->status == FRAME_END) || + (frame_info->status == FRAME_INVALID)) { + /* START -=> END -=> [START] + * FRAME_START is + * only the start of current frame + * we shoudn't tracking the last rtg-window + * [FRAME_END, FRAME_START] + * it's not an available frame window + */ + update_frame_prev_load(frame_info, true); + frame_info->status = FRAME_START; + } + trace_rtg_frame_sched(id, "FRAME_STATUS", frame_info->status); + trace_rtg_frame_sched(id, "frame_last_task_time", + frame_info->prev_frame_exec); + trace_rtg_frame_sched(id, "frame_last_time", frame_info->prev_frame_time); + trace_rtg_frame_sched(id, "frame_last_load", frame_info->prev_frame_load); + trace_rtg_frame_sched(id, "frame_last_load_util", + frame_info->prev_frame_load_util); + + /* new_frame_start */ + if (!frame_info->margin_imme) { + frame_info->frame_vload = 0; + frame_info->frame_util = clamp_t(unsigned long, + frame_info->prev_frame_load_util, + frame_info->frame_min_util, + frame_info->frame_max_util); + } else { + frame_info->frame_vload = calc_frame_vload(frame_info, 0); + frame_info->frame_util = calc_frame_util(frame_info, false); + } + + trace_rtg_frame_sched(id, "frame_vload", frame_info->frame_vload); +} + +static void set_frame_end(struct frame_info *frame_info) +{ + trace_rtg_frame_sched(frame_info->rtg->id, "FRAME_STATUS", FRAME_END); + do_frame_end(frame_info, false); +} + +static int update_frame_timestamp(unsigned long status, + struct frame_info *frame_info, struct related_thread_group *grp) +{ + int id = frame_info->rtg->id; + + /* SCHED_FRAME timestamp */ + switch (status) { + case FRAME_START: + /* collect frame_info when frame_end timestamp coming */ + set_frame_start(frame_info); + break; + case FRAME_END: + /* FRAME_END should only set and update freq once */ + if (unlikely(frame_info->status == FRAME_END)) + return 0; + set_frame_end(frame_info); + break; + default: + pr_err("[FRAME_RTG]: %s invalid timestamp(status)\n", + __func__); + return -EINVAL; + } + + frame_boost(frame_info); + trace_rtg_frame_sched(id, "frame_util", frame_info->frame_util); + + /* update cpufreq force when frame_stop */ + sched_set_group_normalized_util(grp->id, + frame_info->frame_util, RTG_FREQ_FORCE_UPDATE); + if (grp->preferred_cluster) + trace_rtg_frame_sched(id, "preferred_cluster", + grp->preferred_cluster->id); + + return 0; +} + +static int set_frame_status(struct frame_info *frame_info, unsigned long status) +{ + struct related_thread_group *grp = NULL; + int id; + + if (!frame_info) + return -EINVAL; + + grp = frame_info->rtg; + if (unlikely(!grp)) + return -EINVAL; + + if (atomic_read(&frame_info->frame_sched_state) == 0) + return -EINVAL; + + if (!(status & FRAME_SETTIME) || + (status == (unsigned long)FRAME_SETTIME_PARAM)) { + pr_err("[FRAME_RTG]: %s invalid timetsamp(status)\n", + __func__); + return -EINVAL; + } + + if (status & FRAME_TIMESTAMP_SKIP_START) { + frame_info->timestamp_skipped = true; + status &= ~FRAME_TIMESTAMP_SKIP_START; + } else if (status & FRAME_TIMESTAMP_SKIP_END) { + frame_info->timestamp_skipped = false; + status &= ~FRAME_TIMESTAMP_SKIP_END; + } else if (frame_info->timestamp_skipped) { + /* + * skip the following timestamp until + * FRAME_TIMESTAMP_SKIPPED reset + */ + return 0; + } + id = grp->id; + trace_rtg_frame_sched(id, "FRAME_TIMESTAMP_SKIPPED", + frame_info->timestamp_skipped); + trace_rtg_frame_sched(id, "FRAME_MAX_UTIL", 
frame_info->frame_max_util); + + if (status & FRAME_USE_MARGIN_IMME) { + frame_info->margin_imme = true; + status &= ~FRAME_USE_MARGIN_IMME; + } else { + frame_info->margin_imme = false; + } + trace_rtg_frame_sched(id, "FRAME_MARGIN_IMME", frame_info->margin_imme); + trace_rtg_frame_sched(id, "FRAME_TIMESTAMP", status); + + return update_frame_timestamp(status, frame_info, grp); +} + +int set_frame_timestamp(struct frame_info *frame_info, unsigned long timestamp) +{ + int ret; + + if (!frame_info || !frame_info->rtg) + return -EINVAL; + + if (atomic_read(&frame_info->frame_sched_state) == 0) + return -EINVAL; + + ret = sched_set_group_window_rollover(frame_info->rtg->id); + if (!ret) + ret = set_frame_status(frame_info, timestamp); + + return ret; +} + +int set_frame_min_util(struct frame_info *frame_info, int min_util, bool is_boost) +{ + int id; + + if (unlikely((min_util < 0) || (min_util > SCHED_CAPACITY_SCALE))) { + pr_err("[FRAME_RTG]: %s invalid min_util value\n", + __func__); + return -EINVAL; + } + + if (!frame_info || !frame_info->rtg) + return -EINVAL; + + id = frame_info->rtg->id; + if (is_boost) { + frame_info->frame_boost_min_util = min_util; + trace_rtg_frame_sched(id, "FRAME_BOOST_MIN_UTIL", min_util); + } else { + frame_info->frame_min_util = min_util; + + frame_info->frame_util = calc_frame_util(frame_info, false); + trace_rtg_frame_sched(id, "frame_util", frame_info->frame_util); + sched_set_group_normalized_util(id, + frame_info->frame_util, RTG_FREQ_FORCE_UPDATE); + } + + return 0; +} + +int set_frame_max_util(struct frame_info *frame_info, int max_util) +{ + int id; + + if ((max_util < 0) || (max_util > SCHED_CAPACITY_SCALE)) { + pr_err("[FRAME_RTG]: %s invalid max_util value\n", + __func__); + return -EINVAL; + } + + if (!frame_info || !frame_info->rtg) + return -EINVAL; + + frame_info->frame_max_util = max_util; + id = frame_info->rtg->id; + trace_rtg_frame_sched(id, "FRAME_MAX_UTIL", frame_info->frame_max_util); + + return 0; +} + +struct frame_info *lookup_frame_info_by_grp_id(int grp_id) +{ + if (grp_id >= (MULTI_FRAME_ID + MULTI_FRAME_NUM) || (grp_id <= 0)) + return NULL; + if (grp_id >= MULTI_FRAME_ID) { + read_lock(&g_id_manager.lock); + if (!test_bit(grp_id - MULTI_FRAME_ID, g_id_manager.id_map)) { + read_unlock(&g_id_manager.lock); + return NULL; + } + read_unlock(&g_id_manager.lock); + return rtg_frame_info(grp_id); + } else + return rtg_frame_info(grp_id); +} + +static int _init_frame_info(struct frame_info *frame_info, int id) +{ + struct related_thread_group *grp = NULL; + unsigned long flags; + + memset(frame_info, 0, sizeof(struct frame_info)); + mutex_init(&frame_info->lock); + + mutex_lock(&frame_info->lock); + frame_info->frame_rate = DEFAULT_FRAME_RATE; + frame_info->frame_time = div_u64(NSEC_PER_SEC, frame_info->frame_rate); + frame_info->thread_num = 0; + frame_info->prio = NOT_RT_PRIO; + atomic_set(&(frame_info->curr_rt_thread_num), 0); + atomic_set(&(frame_info->frame_sched_state), 0); + frame_info->vload_margin = DEFAULT_VLOAD_MARGIN; + frame_info->max_vload_time = + div_u64(frame_info->frame_time, NSEC_PER_MSEC) + + frame_info->vload_margin; + frame_info->frame_min_util = FRAME_DEFAULT_MIN_UTIL; + frame_info->frame_max_util = FRAME_DEFAULT_MAX_UTIL; + frame_info->prev_min_util = FRAME_DEFAULT_MIN_PREV_UTIL; + frame_info->prev_max_util = FRAME_DEFAULT_MAX_PREV_UTIL; + frame_info->margin_imme = false; + frame_info->timestamp_skipped = false; + frame_info->status = FRAME_END; + + grp = frame_rtg(id); + if (unlikely(!grp)) { + 
mutex_unlock(&frame_info->lock); + return -EINVAL; + } + + raw_spin_lock_irqsave(&grp->lock, flags); + grp->private_data = frame_info; + grp->rtg_class = &frame_rtg_class; + raw_spin_unlock_irqrestore(&grp->lock, flags); + + frame_info->rtg = grp; + mutex_unlock(&frame_info->lock); + + return 0; +} + +static int __init init_frame_info(void) +{ + int ret = 0; + int id; + + for (id = MULTI_FRAME_ID; id < (MULTI_FRAME_ID + MULTI_FRAME_NUM); id++) { + if (ret != 0) + break; + ret = _init_frame_info(rtg_multi_frame_info(id), id); + } + + return ret; +} +late_initcall(init_frame_info); diff --git a/kernel/sched/rtg/frame_rtg.h b/kernel/sched/rtg/frame_rtg.h new file mode 100755 index 0000000000000000000000000000000000000000..01f23d27413a9fe856c9508df7f63ba946190c8f --- /dev/null +++ b/kernel/sched/rtg/frame_rtg.h @@ -0,0 +1,115 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Frame declaration + * + * Copyright (c) 2022-2023 Huawei Technologies Co., Ltd. + */ + +#ifndef __FRAME_RTG_H +#define __FRAME_RTG_H + +#include +#include +#include +#include + +#define MULTI_FRAME_ID (DEFAULT_CGROUP_COLOC_ID + 1) +#define MULTI_FRAME_NUM (MAX_NUM_CGROUP_COLOC_ID - DEFAULT_CGROUP_COLOC_ID - 1) + +#define NOT_RT_PRIO (-1) +#define STATIC_RTG_DEPTH (-1) + +#define FRAME_START (1 << 0) +#define FRAME_END (1 << 1) +#define FRAME_INVALID (1 << 2) +#define FRAME_USE_MARGIN_IMME (1 << 4) +#define FRAME_TIMESTAMP_SKIP_START (1 << 5) +#define FRAME_TIMESTAMP_SKIP_END (1 << 6) +#define FRAME_SETTIME (FRAME_START | FRAME_END | \ + FRAME_USE_MARGIN_IMME) +#define FRAME_SETTIME_PARAM (-1) + +#define DEFAULT_FRAME_RATE 60 +#define MIN_FRAME_RATE 1 +#define MAX_FRAME_RATE 120 + +/* MARGIN value : [-100, 100] */ +#define DEFAULT_VLOAD_MARGIN 16 +#define MIN_VLOAD_MARGIN (-100) +#define MAX_VLOAD_MARGIN 0xffff + +#define FRAME_MAX_VLOAD SCHED_CAPACITY_SCALE +#define FRAME_MAX_LOAD SCHED_CAPACITY_SCALE +#define FRAME_UTIL_INVALID_FACTOR 4 +#define FRAME_DEFAULT_MIN_UTIL 0 +#define FRAME_DEFAULT_MAX_UTIL SCHED_CAPACITY_SCALE +#define FRAME_DEFAULT_MIN_PREV_UTIL 0 +#define FRAME_DEFAULT_MAX_PREV_UTIL SCHED_CAPACITY_SCALE + +#define DEFAULT_MAX_RT_THREAD 5 +/* + * RTG_MAX_RT_THREAD_NUM should be CONFIG_NR_CPUS in previous version + * fit for FFRT here + */ +#define RTG_MAX_RT_THREAD_NUM 20 +#define INVALID_PREFERRED_CLUSTER 10 + +enum rtg_type { + VIP = 0, + TOP_TASK_KEY, + NORMAL_TASK, + RTG_TYPE_MAX, +}; + +struct frame_thread_info { + int prio; + int thread[MAX_TID_NUM]; + int thread_num; +}; + +struct multi_frame_id_manager { + DECLARE_BITMAP(id_map, MULTI_FRAME_NUM); + unsigned int offset; + rwlock_t lock; +}; + +struct rtg_info { + int rtg_num; + int rtgs[MULTI_FRAME_NUM]; +}; + +bool is_frame_rtg(int id); +int set_frame_rate(struct frame_info *frame_info, int rate); +int alloc_multi_frame_info(void); +struct frame_info *rtg_active_multi_frame_info(int id); +struct frame_info *rtg_multi_frame_info(int id); +void release_multi_frame_info(int id); +void clear_multi_frame_info(void); +void set_frame_prio(struct frame_info *frame_info, int prio); +struct task_struct *update_frame_thread(struct frame_info *frame_info, + int old_prio, int prio, int pid, + struct task_struct *old_task); +void update_frame_thread_info(struct frame_info *frame_info, + struct frame_thread_info *frame_thread_info); +#ifdef CONFIG_SCHED_RTG_RT_THREAD_LIMIT +int read_rtg_rt_thread_num(void); +#else +static inline int read_rtg_rt_thread_num(void) +{ + return 0; +} +#endif +static inline +struct group_ravg *frame_info_rtg_load(const struct frame_info 
*frame_info) +{ + return &frame_info_rtg(frame_info)->ravg; +} +void set_frame_sched_state(struct frame_info *frame_info, bool enable); +int set_frame_margin(struct frame_info *frame_info, int margin); +int set_frame_timestamp(struct frame_info *frame_info, unsigned long timestamp); +int set_frame_max_util(struct frame_info *frame_info, int max_util); +int set_frame_min_util(struct frame_info *frame_info, int min_util, bool is_boost); +struct frame_info *lookup_frame_info_by_grp_id(int grp_id); +int list_rtg_group(struct rtg_info *rs_data); +int search_rtg(int pid); +#endif diff --git a/kernel/sched/rtg/rtg.c b/kernel/sched/rtg/rtg.c new file mode 100755 index 0000000000000000000000000000000000000000..168c6c3378b34e58689f9962009225fc8aa7d2d8 --- /dev/null +++ b/kernel/sched/rtg/rtg.c @@ -0,0 +1,1253 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * related thread group sched + * + */ +#include +#include +#include +#define CREATE_TRACE_POINTS +#include +#undef CREATE_TRACE_POINTS + +#include "../sched.h" +#include "rtg.h" +#include "../walt.h" + +#ifdef CONFIG_SCHED_RTG_FRAME +#include "frame_rtg.h" +#endif + +#define ADD_TASK 0 +#define REM_TASK 1 + +#define DEFAULT_GROUP_RATE 60 /* 60FPS */ +#define DEFAULT_UTIL_INVALID_INTERVAL (~0U) /* ns */ +#define DEFAULT_UTIL_UPDATE_TIMEOUT 20000000 /* ns */ +#define DEFAULT_FREQ_UPDATE_INTERVAL 8000000 /* ns */ + +struct related_thread_group *related_thread_groups[MAX_NUM_CGROUP_COLOC_ID]; +static DEFINE_RWLOCK(related_thread_group_lock); +static LIST_HEAD(active_related_thread_groups); + +#define for_each_related_thread_group(grp) \ + list_for_each_entry(grp, &active_related_thread_groups, list) + +void init_task_rtg(struct task_struct *p) +{ + rcu_assign_pointer(p->grp, NULL); + INIT_LIST_HEAD(&p->grp_list); +} + +struct related_thread_group *task_related_thread_group(struct task_struct *p) +{ + return rcu_dereference(p->grp); +} + +struct related_thread_group * +lookup_related_thread_group(unsigned int group_id) +{ + return related_thread_groups[group_id]; +} + +int alloc_related_thread_groups(void) +{ + int i, ret; + struct related_thread_group *grp = NULL; + + /* groupd_id = 0 is invalid as it's special id to remove group. 
*/ + for (i = 1; i < MAX_NUM_CGROUP_COLOC_ID; i++) { + grp = kzalloc(sizeof(*grp), GFP_NOWAIT); + if (!grp) { + ret = -ENOMEM; + goto err; + } + + grp->id = i; + INIT_LIST_HEAD(&grp->tasks); + INIT_LIST_HEAD(&grp->list); + grp->window_size = NSEC_PER_SEC / DEFAULT_GROUP_RATE; + grp->util_invalid_interval = DEFAULT_UTIL_INVALID_INTERVAL; + grp->util_update_timeout = DEFAULT_UTIL_UPDATE_TIMEOUT; + grp->max_boost = 0; + grp->freq_update_interval = DEFAULT_FREQ_UPDATE_INTERVAL; + raw_spin_lock_init(&grp->lock); + + related_thread_groups[i] = grp; + } + + return 0; + +err: + for (i = 1; i < MAX_NUM_CGROUP_COLOC_ID; i++) { + grp = lookup_related_thread_group(i); + if (grp) { + kfree(grp); + related_thread_groups[i] = NULL; + } else { + break; + } + } + + return ret; +} + +/* + * Task's cpu usage is accounted in: + * rq->curr/prev_runnable_sum, when its ->grp is NULL + * grp->cpu_time[cpu]->curr/prev_runnable_sum, when its ->grp is !NULL + * + * Transfer task's cpu usage between those counters when transitioning between + * groups + */ +static void transfer_busy_time(struct rq *rq, struct related_thread_group *grp, + struct task_struct *p, int event) +{ + u64 wallclock; + struct group_cpu_time *cpu_time; + u64 *src_curr_runnable_sum, *dst_curr_runnable_sum; + u64 *src_prev_runnable_sum, *dst_prev_runnable_sum; + u64 *src_nt_curr_runnable_sum, *dst_nt_curr_runnable_sum; + u64 *src_nt_prev_runnable_sum, *dst_nt_prev_runnable_sum; + int migrate_type; + int cpu = cpu_of(rq); + bool new_task; + int i; + + wallclock = sched_ktime_clock(); + + update_task_ravg(rq->curr, rq, TASK_UPDATE, wallclock, 0); + update_task_ravg(p, rq, TASK_UPDATE, wallclock, 0); + new_task = is_new_task(p); + + cpu_time = &rq->grp_time; + if (event == ADD_TASK) { + migrate_type = RQ_TO_GROUP; + + src_curr_runnable_sum = &rq->curr_runnable_sum; + dst_curr_runnable_sum = &cpu_time->curr_runnable_sum; + src_prev_runnable_sum = &rq->prev_runnable_sum; + dst_prev_runnable_sum = &cpu_time->prev_runnable_sum; + + src_nt_curr_runnable_sum = &rq->nt_curr_runnable_sum; + dst_nt_curr_runnable_sum = &cpu_time->nt_curr_runnable_sum; + src_nt_prev_runnable_sum = &rq->nt_prev_runnable_sum; + dst_nt_prev_runnable_sum = &cpu_time->nt_prev_runnable_sum; + + *src_curr_runnable_sum -= p->ravg.curr_window_cpu[cpu]; + *src_prev_runnable_sum -= p->ravg.prev_window_cpu[cpu]; + if (new_task) { + *src_nt_curr_runnable_sum -= + p->ravg.curr_window_cpu[cpu]; + *src_nt_prev_runnable_sum -= + p->ravg.prev_window_cpu[cpu]; + } + + update_cluster_load_subtractions(p, cpu, + rq->window_start, new_task); + + } else { + migrate_type = GROUP_TO_RQ; + + src_curr_runnable_sum = &cpu_time->curr_runnable_sum; + dst_curr_runnable_sum = &rq->curr_runnable_sum; + src_prev_runnable_sum = &cpu_time->prev_runnable_sum; + dst_prev_runnable_sum = &rq->prev_runnable_sum; + + src_nt_curr_runnable_sum = &cpu_time->nt_curr_runnable_sum; + dst_nt_curr_runnable_sum = &rq->nt_curr_runnable_sum; + src_nt_prev_runnable_sum = &cpu_time->nt_prev_runnable_sum; + dst_nt_prev_runnable_sum = &rq->nt_prev_runnable_sum; + + *src_curr_runnable_sum -= p->ravg.curr_window; + *src_prev_runnable_sum -= p->ravg.prev_window; + if (new_task) { + *src_nt_curr_runnable_sum -= p->ravg.curr_window; + *src_nt_prev_runnable_sum -= p->ravg.prev_window; + } + + /* + * Need to reset curr/prev windows for all CPUs, not just the + * ones in the same cluster. Since inter cluster migrations + * did not result in the appropriate book keeping, the values + * per CPU would be inaccurate. 
+ */ + for_each_possible_cpu(i) { + p->ravg.curr_window_cpu[i] = 0; + p->ravg.prev_window_cpu[i] = 0; + } + } + + *dst_curr_runnable_sum += p->ravg.curr_window; + *dst_prev_runnable_sum += p->ravg.prev_window; + if (new_task) { + *dst_nt_curr_runnable_sum += p->ravg.curr_window; + *dst_nt_prev_runnable_sum += p->ravg.prev_window; + } + + /* + * When a task enter or exits a group, it's curr and prev windows are + * moved to a single CPU. This behavior might be sub-optimal in the + * exit case, however, it saves us the overhead of handling inter + * cluster migration fixups while the task is part of a related group. + */ + p->ravg.curr_window_cpu[cpu] = p->ravg.curr_window; + p->ravg.prev_window_cpu[cpu] = p->ravg.prev_window; + + trace_sched_migration_update_sum(p, migrate_type, rq); +} + +static void _set_preferred_cluster(struct related_thread_group *grp, + int sched_cluster_id); +static void remove_task_from_group(struct task_struct *p) +{ + struct related_thread_group *grp = p->grp; + struct rq *rq = NULL; + bool empty_group = true; + struct rq_flags flag; + unsigned long irqflag; + + rq = __task_rq_lock(p, &flag); + transfer_busy_time(rq, p->grp, p, REM_TASK); + + raw_spin_lock_irqsave(&grp->lock, irqflag); + list_del_init(&p->grp_list); + rcu_assign_pointer(p->grp, NULL); + + if (p->on_cpu) + grp->nr_running--; + + if ((int)grp->nr_running < 0) { + WARN_ON(1); + grp->nr_running = 0; + } + + if (!list_empty(&grp->tasks)) { + empty_group = false; + } else { +#ifdef CONFIG_UCLAMP_TASK + grp->max_boost = 0; +#endif + _set_preferred_cluster(grp, -1); + grp->ravg.normalized_util = 0; + } + + raw_spin_unlock_irqrestore(&grp->lock, irqflag); + __task_rq_unlock(rq, &flag); + + /* Reserved groups cannot be destroyed */ + if (empty_group && grp->id != DEFAULT_CGROUP_COLOC_ID) { + /* + * We test whether grp->list is attached with list_empty() + * hence re-init the list after deletion. + */ + write_lock(&related_thread_group_lock); + list_del_init(&grp->list); + write_unlock(&related_thread_group_lock); + } +} + +static int +add_task_to_group(struct task_struct *p, struct related_thread_group *grp) +{ + struct rq *rq = NULL; + struct rq_flags flag; + unsigned long irqflag; +#ifdef CONFIG_UCLAMP_TASK + int boost; +#endif + + /* + * Change p->grp under rq->lock. Will prevent races with read-side + * reference of p->grp in various hot-paths + */ + rq = __task_rq_lock(p, &flag); + transfer_busy_time(rq, grp, p, ADD_TASK); + + raw_spin_lock_irqsave(&grp->lock, irqflag); + list_add(&p->grp_list, &grp->tasks); + rcu_assign_pointer(p->grp, grp); + if (p->on_cpu) { + grp->nr_running++; + if (grp->nr_running == 1) + grp->mark_start = max(grp->mark_start, + sched_ktime_clock()); + } + +#ifdef CONFIG_UCLAMP_TASK + boost = (int)uclamp_eff_value(p, UCLAMP_MIN); + if (boost > grp->max_boost) + grp->max_boost = boost; +#endif + raw_spin_unlock_irqrestore(&grp->lock, irqflag); + __task_rq_unlock(rq, &flag); + + return 0; +} + +static int __sched_set_group_id(struct task_struct *p, unsigned int group_id) +{ + int rc = 0; + unsigned long flags; + struct related_thread_group *grp = NULL; + struct related_thread_group *old_grp = NULL; + + if (group_id >= MAX_NUM_CGROUP_COLOC_ID) + return -EINVAL; + + raw_spin_lock_irqsave(&p->pi_lock, flags); + old_grp = p->grp; + if ((current != p && (p->flags & PF_EXITING)) || + (!old_grp && !group_id)) + goto done; + + /* + * If the system has CONFIG_SCHED_RTG_CGROUP, only tasks in DEFAULT group + * can be directly switched to other groups. 
+ * + * In other cases, Switching from one group to another directly is not permitted. + */ + if (old_grp && group_id) { +#ifdef CONFIG_SCHED_RTG_CGROUP + if (old_grp->id == DEFAULT_CGROUP_COLOC_ID) { + remove_task_from_group(p); + } else { +#endif + rc = -EINVAL; + goto done; +#ifdef CONFIG_SCHED_RTG_CGROUP + } +#endif + } + + if (!group_id) { + remove_task_from_group(p); + goto done; + } + + grp = lookup_related_thread_group(group_id); + write_lock(&related_thread_group_lock); + if (list_empty(&grp->list)) + list_add(&grp->list, &active_related_thread_groups); + write_unlock(&related_thread_group_lock); + + rc = add_task_to_group(p, grp); +done: + raw_spin_unlock_irqrestore(&p->pi_lock, flags); + + return rc; +} + +/* group_id == 0: remove task from rtg */ +int sched_set_group_id(struct task_struct *p, unsigned int group_id) +{ + /* DEFAULT_CGROUP_COLOC_ID is a reserved id */ + if (group_id == DEFAULT_CGROUP_COLOC_ID) + return -EINVAL; + + return __sched_set_group_id(p, group_id); +} + +unsigned int sched_get_group_id(struct task_struct *p) +{ + unsigned int group_id; + struct related_thread_group *grp = NULL; + + rcu_read_lock(); + grp = task_related_thread_group(p); + group_id = grp ? grp->id : 0; + rcu_read_unlock(); + + return group_id; +} + +void update_group_nr_running(struct task_struct *p, int event, u64 wallclock) +{ + struct related_thread_group *grp; + bool need_update = false; + + rcu_read_lock(); + grp = task_related_thread_group(p); + if (!grp) { + rcu_read_unlock(); + return; + } + + raw_spin_lock(&grp->lock); + + if (event == PICK_NEXT_TASK) + grp->nr_running++; + else if (event == PUT_PREV_TASK) + grp->nr_running--; + + if ((int)grp->nr_running < 0) { + WARN_ON(1); + grp->nr_running = 0; + } + + /* update preferred cluster if no update long */ + if (wallclock - grp->last_util_update_time > grp->util_update_timeout) + need_update = true; + + raw_spin_unlock(&grp->lock); + + rcu_read_unlock(); + + if (need_update && grp->rtg_class && grp->rtg_class->sched_update_rtg_tick && + grp->id != DEFAULT_CGROUP_COLOC_ID) + grp->rtg_class->sched_update_rtg_tick(grp); +} + +int sched_set_group_window_size(unsigned int grp_id, unsigned int window_size) +{ + struct related_thread_group *grp = NULL; + unsigned long flag; + + if (!window_size) + return -EINVAL; + + grp = lookup_related_thread_group(grp_id); + if (!grp) { + pr_err("set window size for group %d fail\n", grp_id); + return -ENODEV; + } + + raw_spin_lock_irqsave(&grp->lock, flag); + grp->window_size = window_size; + raw_spin_unlock_irqrestore(&grp->lock, flag); + + return 0; +} + +void group_time_rollover(struct group_ravg *ravg) +{ + ravg->prev_window_load = ravg->curr_window_load; + ravg->curr_window_load = 0; + ravg->prev_window_exec = ravg->curr_window_exec; + ravg->curr_window_exec = 0; +} + +int sched_set_group_window_rollover(unsigned int grp_id) +{ + struct related_thread_group *grp = NULL; + u64 wallclock; + unsigned long flag; +#ifdef CONFIG_UCLAMP_TASK + struct task_struct *p = NULL; + int boost; +#endif + + grp = lookup_related_thread_group(grp_id); + if (!grp) { + pr_err("set window start for group %d fail\n", grp_id); + return -ENODEV; + } + + raw_spin_lock_irqsave(&grp->lock, flag); + + wallclock = sched_ktime_clock(); + grp->prev_window_time = wallclock - grp->window_start; + grp->window_start = wallclock; + grp->max_boost = 0; + +#ifdef CONFIG_UCLAMP_TASK + list_for_each_entry(p, &grp->tasks, grp_list) { + boost = (int)uclamp_eff_value(p, UCLAMP_MIN); + if (boost > 0) + grp->max_boost = boost; + } +#endif + + 
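+	/* age the group window: curr load/exec roll over into prev and restart */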
group_time_rollover(&grp->ravg); + raw_spin_unlock_irqrestore(&grp->lock, flag); + + return 0; +} + +static void add_to_group_time(struct related_thread_group *grp, struct rq *rq, u64 wallclock) +{ + u64 delta_exec, delta_load; + u64 mark_start = grp->mark_start; + u64 window_start = grp->window_start; + + if (unlikely(wallclock <= mark_start)) + return; + + /* per group load tracking in RTG */ + if (likely(mark_start >= window_start)) { + /* + * ws ms wc + * | | | + * V V V + * |---------------| + */ + delta_exec = wallclock - mark_start; + grp->ravg.curr_window_exec += delta_exec; + + delta_load = scale_exec_time(delta_exec, rq); + grp->ravg.curr_window_load += delta_load; + } else { + /* + * ms ws wc + * | | | + * V V V + * -----|---------- + */ + /* prev window statistic */ + delta_exec = window_start - mark_start; + grp->ravg.prev_window_exec += delta_exec; + + delta_load = scale_exec_time(delta_exec, rq); + grp->ravg.prev_window_load += delta_load; + + /* curr window statistic */ + delta_exec = wallclock - window_start; + grp->ravg.curr_window_exec += delta_exec; + + delta_load = scale_exec_time(delta_exec, rq); + grp->ravg.curr_window_load += delta_load; + } +} + +static inline void add_to_group_demand(struct related_thread_group *grp, + struct rq *rq, u64 wallclock) +{ + if (unlikely(wallclock <= grp->window_start)) + return; + + add_to_group_time(grp, rq, wallclock); +} + +static int account_busy_for_group_demand(struct task_struct *p, int event) +{ + /* + *No need to bother updating task demand for exiting tasks + * or the idle task. + */ + if (exiting_task(p) || is_idle_task(p)) + return 0; + + if (event == TASK_WAKE || event == TASK_MIGRATE) + return 0; + + return 1; +} + +void update_group_demand(struct task_struct *p, struct rq *rq, + int event, u64 wallclock) +{ + struct related_thread_group *grp; + + if (!account_busy_for_group_demand(p, event)) + return; + + rcu_read_lock(); + grp = task_related_thread_group(p); + if (!grp) { + rcu_read_unlock(); + return; + } + + raw_spin_lock(&grp->lock); + + if (grp->nr_running == 1) + grp->mark_start = max(grp->mark_start, p->ravg.mark_start); + + add_to_group_demand(grp, rq, wallclock); + + grp->mark_start = wallclock; + + raw_spin_unlock(&grp->lock); + + rcu_read_unlock(); +} + +void sched_update_rtg_tick(struct task_struct *p) +{ + struct related_thread_group *grp = NULL; + + rcu_read_lock(); + grp = task_related_thread_group(p); + if (!grp || list_empty(&grp->tasks)) { + rcu_read_unlock(); + return; + } + + if (grp->rtg_class && grp->rtg_class->sched_update_rtg_tick) + grp->rtg_class->sched_update_rtg_tick(grp); + + rcu_read_unlock(); +} + +int preferred_cluster(struct sched_cluster *cluster, struct task_struct *p) +{ + struct related_thread_group *grp = NULL; + int rc = 1; + + rcu_read_lock(); + + grp = task_related_thread_group(p); + if (grp != NULL) + rc = (grp->preferred_cluster == cluster); + + rcu_read_unlock(); + return rc; +} + +unsigned int get_cluster_grp_running(int cluster_id) +{ + struct related_thread_group *grp = NULL; + unsigned int total_grp_running = 0; + unsigned long flag, rtg_flag; + unsigned int i; + + read_lock_irqsave(&related_thread_group_lock, rtg_flag); + + /* grp_id 0 is used for exited tasks */ + for (i = 1; i < MAX_NUM_CGROUP_COLOC_ID; i++) { + grp = lookup_related_thread_group(i); + if (!grp) + continue; + + raw_spin_lock_irqsave(&grp->lock, flag); + if (grp->preferred_cluster != NULL && + grp->preferred_cluster->id == cluster_id) + total_grp_running += grp->nr_running; + 
raw_spin_unlock_irqrestore(&grp->lock, flag); + } + read_unlock_irqrestore(&related_thread_group_lock, rtg_flag); + + return total_grp_running; +} + +static void _set_preferred_cluster(struct related_thread_group *grp, + int sched_cluster_id) +{ + struct sched_cluster *cluster = NULL; + struct sched_cluster *cluster_found = NULL; + + if (sched_cluster_id == -1) { + grp->preferred_cluster = NULL; + return; + } + + for_each_sched_cluster_reverse(cluster) { + if (cluster->id == sched_cluster_id) { + cluster_found = cluster; + break; + } + } + + if (cluster_found != NULL) + grp->preferred_cluster = cluster_found; + else + pr_err("cannot found sched_cluster_id=%d\n", sched_cluster_id); +} + +/* + * sched_cluster_id == -1: grp will set to NULL + */ +static void set_preferred_cluster(struct related_thread_group *grp, + int sched_cluster_id) +{ + unsigned long flag; + + raw_spin_lock_irqsave(&grp->lock, flag); + _set_preferred_cluster(grp, sched_cluster_id); + raw_spin_unlock_irqrestore(&grp->lock, flag); +} + +int sched_set_group_preferred_cluster(unsigned int grp_id, int sched_cluster_id) +{ + struct related_thread_group *grp = NULL; + + /* DEFAULT_CGROUP_COLOC_ID is a reserved id */ + if (grp_id == DEFAULT_CGROUP_COLOC_ID || + grp_id >= MAX_NUM_CGROUP_COLOC_ID) + return -EINVAL; + + grp = lookup_related_thread_group(grp_id); + if (!grp) { + pr_err("set preferred cluster for group %d fail\n", grp_id); + return -ENODEV; + } + set_preferred_cluster(grp, sched_cluster_id); + + return 0; +} + +struct cpumask *find_rtg_target(struct task_struct *p) +{ + struct related_thread_group *grp = NULL; + struct sched_cluster *preferred_cluster = NULL; + struct cpumask *rtg_target = NULL; + + rcu_read_lock(); + grp = task_related_thread_group(p); + rcu_read_unlock(); + + if (!grp) + return NULL; + + preferred_cluster = grp->preferred_cluster; + if (!preferred_cluster) + return NULL; + + rtg_target = &preferred_cluster->cpus; + if (!task_fits_max(p, cpumask_first(rtg_target))) + return NULL; + + return rtg_target; +} + +int find_rtg_cpu(struct task_struct *p) +{ + int i; + cpumask_t search_cpus = CPU_MASK_NONE; + int max_spare_cap_cpu = -1; + unsigned long max_spare_cap = 0; + int idle_backup_cpu = -1; + struct cpumask *preferred_cpus = find_rtg_target(p); + + if (!preferred_cpus) + return -1; + + cpumask_and(&search_cpus, p->cpus_ptr, cpu_online_mask); +#ifdef CONFIG_CPU_ISOLATION_OPT + cpumask_andnot(&search_cpus, &search_cpus, cpu_isolated_mask); +#endif + + /* search the perferred idle cpu */ + for_each_cpu_and(i, &search_cpus, preferred_cpus) { + if (is_reserved(i)) + continue; + + if (idle_cpu(i) || (i == task_cpu(p) && p->state == TASK_RUNNING)) { + trace_find_rtg_cpu(p, preferred_cpus, "prefer_idle", i); + return i; + } + } + + for_each_cpu(i, &search_cpus) { + unsigned long spare_cap; + + if (sched_cpu_high_irqload(i)) + continue; + + if (is_reserved(i)) + continue; + + /* take the Active LB CPU as idle_backup_cpu */ + if (idle_cpu(i) || (i == task_cpu(p) && p->state == TASK_RUNNING)) { + /* find the idle_backup_cpu with max capacity */ + if (idle_backup_cpu == -1 || + capacity_orig_of(i) > capacity_orig_of(idle_backup_cpu)) + idle_backup_cpu = i; + + continue; + } + + spare_cap = capacity_spare_without(i, p); + if (spare_cap > max_spare_cap) { + max_spare_cap = spare_cap; + max_spare_cap_cpu = i; + } + } + + if (idle_backup_cpu != -1) { + trace_find_rtg_cpu(p, preferred_cpus, "idle_backup", idle_backup_cpu); + return idle_backup_cpu; + } + + trace_find_rtg_cpu(p, preferred_cpus, "max_spare", 
max_spare_cap_cpu); + + return max_spare_cap_cpu; +} + +int sched_set_group_util_invalid_interval(unsigned int grp_id, + unsigned int interval) +{ + struct related_thread_group *grp = NULL; + unsigned long flag; + + if (interval == 0) + return -EINVAL; + + /* DEFAULT_CGROUP_COLOC_ID is a reserved id */ + if (grp_id == DEFAULT_CGROUP_COLOC_ID || + grp_id >= MAX_NUM_CGROUP_COLOC_ID) + return -EINVAL; + + grp = lookup_related_thread_group(grp_id); + if (!grp) { + pr_err("set invalid interval for group %d fail\n", grp_id); + return -ENODEV; + } + + raw_spin_lock_irqsave(&grp->lock, flag); + if ((signed int)interval < 0) + grp->util_invalid_interval = DEFAULT_UTIL_INVALID_INTERVAL; + else + grp->util_invalid_interval = interval * NSEC_PER_MSEC; + + raw_spin_unlock_irqrestore(&grp->lock, flag); + + return 0; +} + +static inline bool +group_should_invalid_util(struct related_thread_group *grp, u64 now) +{ + if (grp->util_invalid_interval == DEFAULT_UTIL_INVALID_INTERVAL) + return false; + + return (now - grp->last_freq_update_time >= grp->util_invalid_interval); +} + +static inline bool valid_normalized_util(struct related_thread_group *grp) +{ + struct task_struct *p = NULL; + cpumask_t rtg_cpus = CPU_MASK_NONE; + bool valid = false; + + if (grp->nr_running != 0) { + list_for_each_entry(p, &grp->tasks, grp_list) { + get_task_struct(p); + if (p->state == TASK_RUNNING) + cpumask_set_cpu(task_cpu(p), &rtg_cpus); + trace_sched_rtg_task_each(grp->id, grp->nr_running, p); + put_task_struct(p); + } + + valid = cpumask_intersects(&rtg_cpus, + &grp->preferred_cluster->cpus); + } + trace_sched_rtg_valid_normalized_util(grp->id, grp->nr_running, &rtg_cpus, valid); + + return valid; +} + +void sched_get_max_group_util(const struct cpumask *query_cpus, + unsigned long *util, unsigned int *freq) +{ + struct related_thread_group *grp = NULL; + unsigned long max_grp_util = 0; + unsigned int max_grp_freq = 0; + u64 now = ktime_get_ns(); + unsigned long rtg_flag; + unsigned long flag; + + /* + * sum the prev_runnable_sum for each rtg, + * return the max rtg->load + */ + read_lock_irqsave(&related_thread_group_lock, rtg_flag); + if (list_empty(&active_related_thread_groups)) + goto unlock; + + for_each_related_thread_group(grp) { + raw_spin_lock_irqsave(&grp->lock, flag); + if (!list_empty(&grp->tasks) && + grp->preferred_cluster != NULL && + cpumask_intersects(query_cpus, + &grp->preferred_cluster->cpus) && + !group_should_invalid_util(grp, now)) { + + if (grp->ravg.normalized_util > max_grp_util) + max_grp_util = grp->ravg.normalized_util; + } + raw_spin_unlock_irqrestore(&grp->lock, flag); + } + +unlock: + read_unlock_irqrestore(&related_thread_group_lock, rtg_flag); + + *freq = max_grp_freq; + *util = max_grp_util; +} + +static struct sched_cluster *best_cluster(struct related_thread_group *grp) +{ + struct sched_cluster *cluster = NULL; + struct sched_cluster *max_cluster = NULL; + int cpu; + unsigned long util = grp->ravg.normalized_util; + unsigned long boosted_grp_util = util + grp->max_boost; + unsigned long max_cap = 0; + unsigned long cap = 0; + + /* find new cluster */ + for_each_sched_cluster(cluster) { + cpu = cpumask_first(&cluster->cpus); + cap = capacity_orig_of(cpu); + if (cap > max_cap) { + max_cap = cap; + max_cluster = cluster; + } + + if (boosted_grp_util <= cap) + return cluster; + } + + return max_cluster; +} + +static bool group_should_update_freq(struct related_thread_group *grp, + int cpu, unsigned int flags, u64 now) +{ + if (!grp) + return true; + + if (flags & RTG_FREQ_FORCE_UPDATE) { 
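+		/* forced updates bypass the freq_update_interval rate limit below */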
+ return true; + } else if (flags & RTG_FREQ_NORMAL_UPDATE) { + if (now - grp->last_freq_update_time >= + grp->freq_update_interval) + return true; + } + + return false; +} + +int sched_set_group_normalized_util(unsigned int grp_id, unsigned long util, + unsigned int flag) +{ + struct related_thread_group *grp = NULL; + bool need_update_prev_freq = false; + bool need_update_next_freq = false; + u64 now; + unsigned long flags; + struct sched_cluster *preferred_cluster = NULL; + int prev_cpu; + int next_cpu; + + grp = lookup_related_thread_group(grp_id); + if (!grp) { + pr_err("set normalized util for group %d fail\n", grp_id); + return -ENODEV; + } + + raw_spin_lock_irqsave(&grp->lock, flags); + + if (list_empty(&grp->tasks)) { + raw_spin_unlock_irqrestore(&grp->lock, flags); + return 0; + } + + grp->ravg.normalized_util = util; + + preferred_cluster = best_cluster(grp); + + /* update prev_cluster force when preferred_cluster changed */ + if (!grp->preferred_cluster) { + grp->preferred_cluster = preferred_cluster; + } else if (grp->preferred_cluster != preferred_cluster) { + prev_cpu = cpumask_first(&grp->preferred_cluster->cpus); + grp->preferred_cluster = preferred_cluster; + + need_update_prev_freq = true; + } + + if (grp->preferred_cluster != NULL) + next_cpu = cpumask_first(&grp->preferred_cluster->cpus); + else + next_cpu = 0; + + now = ktime_get_ns(); + grp->last_util_update_time = now; + need_update_next_freq = + group_should_update_freq(grp, next_cpu, flag, now); + if (need_update_next_freq) + grp->last_freq_update_time = now; + + raw_spin_unlock_irqrestore(&grp->lock, flags); + + if (need_update_prev_freq) + cpufreq_update_util(cpu_rq(prev_cpu), + SCHED_CPUFREQ_FORCE_UPDATE | SCHED_CPUFREQ_WALT); + + if (need_update_next_freq) + cpufreq_update_util(cpu_rq(next_cpu), + SCHED_CPUFREQ_FORCE_UPDATE | SCHED_CPUFREQ_WALT); + + return 0; +} + +int sched_set_group_freq_update_interval(unsigned int grp_id, unsigned int interval) +{ + struct related_thread_group *grp = NULL; + unsigned long flag; + + if ((signed int)interval <= 0) + return -EINVAL; + + /* DEFAULT_CGROUP_COLOC_ID is a reserved id */ + if (grp_id == DEFAULT_CGROUP_COLOC_ID || + grp_id >= MAX_NUM_CGROUP_COLOC_ID) + return -EINVAL; + + grp = lookup_related_thread_group(grp_id); + if (!grp) { + pr_err("set update interval for group %d fail\n", grp_id); + return -ENODEV; + } + + raw_spin_lock_irqsave(&grp->lock, flag); + grp->freq_update_interval = interval * NSEC_PER_MSEC; + raw_spin_unlock_irqrestore(&grp->lock, flag); + + return 0; +} + +#ifdef CONFIG_SCHED_RTG_CGROUP +#ifdef CONFIG_UCLAMP_TASK_GROUP +static inline bool uclamp_task_colocated(struct task_struct *p) +{ + struct cgroup_subsys_state *css; + struct task_group *tg; + bool colocate; + + rcu_read_lock(); + css = task_css(p, cpu_cgrp_id); + if (!css) { + rcu_read_unlock(); + return false; + } + tg = container_of(css, struct task_group, css); + colocate = tg->colocate; + rcu_read_unlock(); + + return colocate; +} +#else +static inline bool uclamp_task_colocated(struct task_struct *p) +{ + return false; +} +#endif /* CONFIG_UCLAMP_TASK_GROUP */ + +void add_new_task_to_grp(struct task_struct *new) +{ + struct related_thread_group *grp = NULL; + unsigned long flag; + + /* + * If the task does not belong to colocated schedtune + * cgroup, nothing to do. We are checking this without + * lock. Even if there is a race, it will be added + * to the co-located cgroup via cgroup attach. 
+	 */
+	if (!uclamp_task_colocated(new))
+		return;
+
+	grp = lookup_related_thread_group(DEFAULT_CGROUP_COLOC_ID);
+	write_lock_irqsave(&related_thread_group_lock, flag);
+
+	/*
+	 * It's possible that someone already added the new task to the
+	 * group, or it might have been taken out of the colocated schedtune
+	 * cgroup. Check these conditions under the lock.
+	 */
+	if (!uclamp_task_colocated(new) || new->grp) {
+		write_unlock_irqrestore(&related_thread_group_lock, flag);
+		return;
+	}
+
+	raw_spin_lock(&grp->lock);
+
+	rcu_assign_pointer(new->grp, grp);
+	list_add(&new->grp_list, &grp->tasks);
+
+	raw_spin_unlock(&grp->lock);
+	write_unlock_irqrestore(&related_thread_group_lock, flag);
+}
+
+
+/*
+ * We create a default colocation group at boot. There is no need to
+ * synchronize tasks between cgroups at creation time because the
+ * correct cgroup hierarchy is not available at boot. Therefore cgroup
+ * colocation is turned off by default even though the colocation group
+ * itself has been allocated. Furthermore, this colocation group cannot
+ * be destroyed once it has been created. All of this is done as part
+ * of runtime optimizations.
+ *
+ * The job of synchronizing tasks to the colocation group is done when
+ * the colocation flag in the cgroup is turned on.
+ */
+static int __init create_default_coloc_group(void)
+{
+	struct related_thread_group *grp = NULL;
+	unsigned long flags;
+
+	grp = lookup_related_thread_group(DEFAULT_CGROUP_COLOC_ID);
+	write_lock_irqsave(&related_thread_group_lock, flags);
+	list_add(&grp->list, &active_related_thread_groups);
+	write_unlock_irqrestore(&related_thread_group_lock, flags);
+
+	return 0;
+}
+late_initcall(create_default_coloc_group);
+
+int sync_cgroup_colocation(struct task_struct *p, bool insert)
+{
+	unsigned int grp_id = insert ? DEFAULT_CGROUP_COLOC_ID : 0;
+	unsigned int old_grp_id;
+
+	if (p) {
+		old_grp_id = sched_get_group_id(p);
+		/*
+		 * If the task is already in a group which is not
+		 * DEFAULT_CGROUP_COLOC_ID, we should not change the group id
+		 * during the switch to background.
+		 */
+		if ((old_grp_id != DEFAULT_CGROUP_COLOC_ID) && (grp_id == 0))
+			return 0;
+	}
+
+	return __sched_set_group_id(p, grp_id);
+}
+#endif /* CONFIG_SCHED_RTG_CGROUP */
+
+#ifdef CONFIG_SCHED_RTG_DEBUG
+#define seq_printf_rtg(m, x...) \
+do { \
+	if (m) \
+		seq_printf(m, x); \
+	else \
+		printk(x); \
+} while (0)
+
+static void print_rtg_info(struct seq_file *file,
+	const struct related_thread_group *grp)
+{
+	seq_printf_rtg(file, "RTG_ID : %d\n", grp->id);
+	seq_printf_rtg(file, "RTG_INTERVAL : UPDATE:%lums#INVALID:%lums\n",
+		grp->freq_update_interval / NSEC_PER_MSEC,
+		grp->util_invalid_interval / NSEC_PER_MSEC);
+	seq_printf_rtg(file, "RTG_CLUSTER : %d\n",
+		grp->preferred_cluster ?
grp->preferred_cluster->id : -1); +#ifdef CONFIG_SCHED_RTG_RT_THREAD_LIMIT + seq_printf_rtg(file, "RTG_RT_THREAD_NUM : %d/%d\n", + read_rtg_rt_thread_num(), RTG_MAX_RT_THREAD_NUM); +#endif +} + +static char rtg_task_state_to_char(const struct task_struct *tsk) +{ + static const char state_char[] = "RSDTtXZPI"; + unsigned int tsk_state = READ_ONCE(tsk->state); + unsigned int state = (tsk_state | tsk->exit_state) & TASK_REPORT; + + BUILD_BUG_ON_NOT_POWER_OF_2(TASK_REPORT_MAX); + BUILD_BUG_ON(1 + ilog2(TASK_REPORT_MAX) != sizeof(state_char) - 1); + + if (tsk_state == TASK_IDLE) + state = TASK_REPORT_IDLE; + return state_char[fls(state)]; +} + +static inline void print_rtg_task_header(struct seq_file *file, + const char *header, int run, int nr) +{ + seq_printf_rtg(file, + "%s : %d/%d\n" + "STATE COMM PID PRIO CPU\n" + "---------------------------------------------------------\n", + header, run, nr); +} + +static inline void print_rtg_task(struct seq_file *file, + const struct task_struct *tsk) +{ + seq_printf_rtg(file, "%5c %15s %5d %5d %5d(%*pbl)\n", + rtg_task_state_to_char(tsk), tsk->comm, tsk->pid, + tsk->prio, task_cpu(tsk), cpumask_pr_args(tsk->cpus_ptr)); +} + +static void print_rtg_threads(struct seq_file *file, + const struct related_thread_group *grp) +{ + struct task_struct *tsk = NULL; + int nr_thread = 0; + + list_for_each_entry(tsk, &grp->tasks, grp_list) + nr_thread++; + + if (!nr_thread) + return; + + print_rtg_task_header(file, "RTG_THREADS", + grp->nr_running, nr_thread); + list_for_each_entry(tsk, &grp->tasks, grp_list) { + if (unlikely(!tsk)) + continue; + get_task_struct(tsk); + print_rtg_task(file, tsk); + put_task_struct(tsk); + } + seq_printf_rtg(file, "---------------------------------------------------------\n"); +} + +static int sched_rtg_debug_show(struct seq_file *file, void *param) +{ + struct related_thread_group *grp = NULL; + unsigned long flags; + bool have_task = false; + + for_each_related_thread_group(grp) { + if (unlikely(!grp)) { + seq_printf_rtg(file, "RTG none\n"); + return 0; + } + + raw_spin_lock_irqsave(&grp->lock, flags); + if (list_empty(&grp->tasks)) { + raw_spin_unlock_irqrestore(&grp->lock, flags); + continue; + } + + if (!have_task) + have_task = true; + + seq_printf_rtg(file, "\n\n"); + print_rtg_info(file, grp); + print_rtg_threads(file, grp); + raw_spin_unlock_irqrestore(&grp->lock, flags); + } + + if (!have_task) + seq_printf_rtg(file, "RTG tasklist empty\n"); + + return 0; +} + +static int sched_rtg_debug_release(struct inode *inode, struct file *file) +{ + seq_release(inode, file); + return 0; +} + +static int sched_rtg_debug_open(struct inode *inode, struct file *filp) +{ + return single_open(filp, sched_rtg_debug_show, NULL); +} + +static const struct proc_ops sched_rtg_debug_fops = { + .proc_open = sched_rtg_debug_open, + .proc_read = seq_read, + .proc_lseek = seq_lseek, + .proc_release = sched_rtg_debug_release, +}; + +static int __init init_sched_rtg_debug_procfs(void) +{ + struct proc_dir_entry *pe = NULL; + + pe = proc_create("sched_rtg_debug", + 0400, NULL, &sched_rtg_debug_fops); + if (unlikely(!pe)) + return -ENOMEM; + return 0; +} +late_initcall(init_sched_rtg_debug_procfs); +#endif diff --git a/kernel/sched/rtg/rtg.h b/kernel/sched/rtg/rtg.h new file mode 100755 index 0000000000000000000000000000000000000000..4f0cedc332f094391a7d3690acde8fb6e268bbbd --- /dev/null +++ b/kernel/sched/rtg/rtg.h @@ -0,0 +1,64 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * related thread group sched header + */ +#ifndef __RTG_H +#define 
__RTG_H + +#include +#include + +#define for_each_sched_cluster_reverse(cluster) \ + list_for_each_entry_reverse(cluster, &cluster_head, list) + +#ifdef CONFIG_SCHED_RTG +void init_task_rtg(struct task_struct *p); +int alloc_related_thread_groups(void); +struct related_thread_group *lookup_related_thread_group(unsigned int group_id); +struct related_thread_group *task_related_thread_group(struct task_struct *p); +void update_group_nr_running(struct task_struct *p, int event, u64 wallclock); +struct rq; +void update_group_demand(struct task_struct *p, struct rq *rq, + int event, u64 wallclock); +int sched_set_group_window_size(unsigned int grp_id, unsigned int window_size); +int sched_set_group_window_rollover(unsigned int grp_id); +struct group_cpu_time *group_update_cpu_time(struct rq *rq, + struct related_thread_group *grp); +void sched_update_rtg_tick(struct task_struct *p); +int preferred_cluster(struct sched_cluster *cluster, struct task_struct *p); +int sched_set_group_preferred_cluster(unsigned int grp_id, int sched_cluster_id); +struct cpumask *find_rtg_target(struct task_struct *p); +int find_rtg_cpu(struct task_struct *p); +int sched_set_group_util_invalid_interval(unsigned int grp_id, + unsigned int interval); +int sched_set_group_normalized_util(unsigned int grp_id, unsigned long util, + unsigned int flag); +void sched_get_max_group_util(const struct cpumask *query_cpus, + unsigned long *util, unsigned int *freq); +int sched_set_group_freq_update_interval(unsigned int grp_id, + unsigned int interval); +#ifdef CONFIG_SCHED_RTG_CGROUP +int sync_cgroup_colocation(struct task_struct *p, bool insert); +void add_new_task_to_grp(struct task_struct *new); +#else +static inline void add_new_task_to_grp(struct task_struct *new) {} +#endif /* CONFIG_SCHED_RTG_CGROUP */ +#else +static inline int alloc_related_thread_groups(void) { return 0; } +static inline int sched_set_group_preferred_cluster(unsigned int grp_id, + int sched_cluster_id) +{ + return 0; +} +static inline int sched_set_group_normalized_util(unsigned int grp_id, unsigned long util, + unsigned int flag) +{ + return 0; +} +static inline void sched_get_max_group_util(const struct cpumask *query_cpus, + unsigned long *util, unsigned int *freq) +{ +} +static inline void add_new_task_to_grp(struct task_struct *new) {} +#endif /* CONFIG_SCHED_RTG */ +#endif diff --git a/kernel/sched/rtg/rtg_ctrl.c b/kernel/sched/rtg/rtg_ctrl.c new file mode 100755 index 0000000000000000000000000000000000000000..164f1b2373b9a2afbf190fe445bf500ab0e0a35b --- /dev/null +++ b/kernel/sched/rtg/rtg_ctrl.c @@ -0,0 +1,934 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * rtg control entry + * + * Copyright (c) 2022-2023 Huawei Technologies Co., Ltd. 
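+ *
+ * Rough userspace usage, assuming the matching uapi ioctl definitions
+ * (macro and variable names below are illustrative only):
+ *
+ *   int fd = open("/dev/sched_rtg_ctrl", O_RDWR);
+ *   char cfg[] = "sched_cycle:4;frame_max_util:750;invalid_interval:50";
+ *   struct rtg_enable_data en = { .enable = 1, .len = strlen(cfg), .data = cfg };
+ *   ioctl(fd, CMD_ID_SET_ENABLE, &en);       // parsed by set_enable_config()
+ *   ioctl(fd, CMD_ID_SET_RTG, &grp_data);    // create/populate a frame RTG
+ *   ioctl(fd, CMD_ID_FRAME_FREQ, &state);    // per-frame begin/end hints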
+ */ + +#include "rtg.h" +#include "rtg_ctrl.h" + +#include +#include + +#ifdef CONFIG_AUTHORITY_CTRL +#include +#endif + +#include +#include +#include + +atomic_t g_rtg_enable = ATOMIC_INIT(0); +static atomic_t g_rt_frame_num = ATOMIC_INIT(0); +static int g_frame_max_util = DEFAULT_MAX_UTIL; +static int g_max_rt_frames = DEFAULT_MAX_RT_FRAME; +typedef long (*rtg_ctrl_func)(int abi, void __user *arg); + +static long ctrl_set_enable(int abi, void __user *uarg); +static long ctrl_set_rtg(int abi, void __user *uarg); +static long ctrl_set_rtg_attr(int abi, void __user *uarg); +static long ctrl_begin_frame(int abi, void __user *uarg); +static long ctrl_end_frame(int abi, void __user *uarg); +static long ctrl_end_scene(int abi, void __user *uarg); +static long ctrl_set_min_util(int abi, void __user *uarg); +static long ctrl_set_margin(int abi, void __user *uarg); +static long ctrl_search_rtg(int abi, void __user *uarg); +static long ctrl_get_enable(int abi, void __user *uarg); + +static rtg_ctrl_func g_func_array[RTG_CTRL_MAX_NR] = { + NULL, /* reserved */ + ctrl_set_enable, // 1 + ctrl_set_rtg, + NULL, + ctrl_set_rtg_attr, + ctrl_begin_frame, // 5 + ctrl_end_frame, + ctrl_end_scene, + ctrl_set_min_util, + ctrl_set_margin, + NULL, + NULL, + ctrl_search_rtg, + ctrl_get_enable +}; + +static int init_proc_state(const int *config, int len); +static void deinit_proc_state(void); + +static int set_enable_config(char *config_str) +{ + char *p = NULL; + char *tmp = NULL; + int value; + int config[RTG_CONFIG_NUM]; + int i; + int ret = 0; + + for (i = 0; i < RTG_CONFIG_NUM; i++) + config[i] = INVALID_VALUE; + /* eg: key1:value1;key2:value2;key3:value3 */ + for (p = strsep(&config_str, ";"); p != NULL; + p = strsep(&config_str, ";")) { + tmp = strsep(&p, ":"); + if ((tmp == NULL) || (p == NULL)) + continue; + if (kstrtoint((const char *)p, DECIMAL, &value)) + return -INVALID_ARG; + + if (!strcmp(tmp, "sched_cycle")) + config[RTG_FREQ_CYCLE] = value; + else if (!strcmp(tmp, "frame_max_util")) + config[RTG_FRAME_MAX_UTIL] = value; + else if (!strcmp(tmp, "invalid_interval")) + config[RTG_INVALID_INTERVAL] = value; + else + continue; + } + + for (i = 0; i < RTG_CONFIG_NUM; i++) + pr_info("[SCHED_RTG] config[%d] = %d\n", i, config[i]); + + ret = init_proc_state(config, RTG_CONFIG_NUM); + + return ret; +} + +static void rtg_enable(int abi, const struct rtg_enable_data *data) +{ + char temp[MAX_DATA_LEN]; + int ret = -1; + + if (atomic_read(&g_rtg_enable) == 1) { + pr_info("[SCHED_RTG] already enabled!\n"); + return; + } + + if ((data->len <= 0) || (data->len >= MAX_DATA_LEN)) { + pr_err("[SCHED_RTG] %s data len invalid\n", __func__); + return; + } + +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wpointer-to-int-cast" + switch (abi) { + case IOCTL_ABI_ARM32: + ret = copy_from_user(&temp, + (void __user *)compat_ptr((compat_uptr_t)data->data), data->len); + break; + case IOCTL_ABI_AARCH64: + ret = copy_from_user(&temp, (void __user *)data->data, data->len); + break; + default: + pr_err("[SCHED_RTG] abi format error\n"); + break; + } + if (ret) { + pr_err("[SCHED_RTG] %s copy user data failed\n", __func__); + return; + } +#pragma GCC diagnostic pop + +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wincompatible-pointer-types" + temp[data->len] = '\0'; + + if (set_enable_config(&temp) != SUCC) { + pr_err("[SCHED_RTG] %s failed!\n", __func__); + return; + } +#pragma GCC diagnostic pop + + atomic_set(&g_rtg_enable, 1); + pr_info("[SCHED_RTG] enabled!\n"); +} + +static void 
rtg_disable(void) +{ + if (atomic_read(&g_rtg_enable) == 0) { + pr_info("[SCHED_RTG] already disabled!\n"); + return; + } + pr_info("[SCHED_RTG] disabled!\n"); + atomic_set(&g_rtg_enable, 0); + deinit_proc_state(); +} + +static inline bool is_rt_type(int type) +{ + return (type >= VIP && type < NORMAL_TASK); +} + +static int do_update_rt_frame_num(struct frame_info *frame_info, int new_type) +{ + int old_type; + int ret = SUCC; + + mutex_lock(&frame_info->lock); + old_type = frame_info->prio - DEFAULT_RT_PRIO; + if (is_rt_type(new_type) == is_rt_type(old_type)) + goto out; + + if (is_rt_type(old_type)) { + if (atomic_read(&g_rt_frame_num) > 0) + atomic_dec(&g_rt_frame_num); + } else if (is_rt_type(new_type)) { + if (atomic_read(&g_rt_frame_num) < g_max_rt_frames) { + atomic_inc(&g_rt_frame_num); + } else { + pr_err("[SCHED_RTG]: %s g_max_rt_frames is %d\n", + __func__, g_max_rt_frames); + ret = -INVALID_ARG; + } + } +out: + mutex_unlock(&frame_info->lock); + + return ret; +} + +static int update_rt_frame_num(struct frame_info *frame_info, int new_type, int cmd) +{ + int ret = SUCC; + + switch (cmd) { + case UPDATE_RTG_FRAME: + ret = do_update_rt_frame_num(frame_info, new_type); + break; + case ADD_RTG_FRAME: + if (is_rt_type(new_type)) { + if (atomic_read(&g_rt_frame_num) >= g_max_rt_frames) { + pr_err("[SCHED_RTG] g_max_rt_frames is %d!\n", g_max_rt_frames); + ret = -INVALID_ARG; + } else { + atomic_inc(&g_rt_frame_num); + } + } + break; + case CLEAR_RTG_FRAME: + if ((atomic_read(&g_rt_frame_num) > 0) && is_rt_type(new_type)) + atomic_dec(&g_rt_frame_num); + break; + default: + return -INVALID_ARG; + } + trace_rtg_frame_sched(frame_info->rtg->id, "g_rt_frame_num", atomic_read(&g_rt_frame_num)); + trace_rtg_frame_sched(frame_info->rtg->id, "g_max_rt_frames", g_max_rt_frames); + + return ret; +} + +static long ctrl_set_enable(int abi, void __user *uarg) +{ + struct rtg_enable_data rs_enable; + + if (copy_from_user(&rs_enable, uarg, sizeof(rs_enable))) { + pr_err("[SCHED_RTG] CMD_ID_SET_ENABLE copy data failed\n"); + return -INVALID_ARG; + } + if (rs_enable.enable == 1) + rtg_enable(abi, &rs_enable); + else + rtg_disable(); + + return SUCC; +} + +static long ctrl_get_enable(int abi, void __user *uarg) +{ + return atomic_read(&g_rtg_enable); +} + +static inline bool is_valid_type(int type) +{ + return (type >= VIP && type < RTG_TYPE_MAX); +} + +static int parse_rtg_attr(const struct rtg_str_data *rs_data) +{ + char *p = NULL; + char *tmp = NULL; + char *data = NULL; + int value; + struct frame_info *frame_info = NULL; + int rate = -1; + int type = -1; + int ret; + + if (rs_data == NULL) { + pr_err("[SCHED_RTG] rtg attr: rs_data is null!\n"); + return -INVALID_ARG; + } + + data = rs_data->data; + if ((data == NULL) || (rs_data->len <= 0) || + (rs_data->len > MAX_DATA_LEN)) { + pr_err("[SCHED_RTG] rtg attr: rs_data len err!\n"); + return -INVALID_ARG; + } + + // eg: rtgId:xx;rate:xx;type:xx; + for (p = strsep(&data, ";"); p != NULL; p = strsep(&data, ";")) { + tmp = strsep(&p, ":"); + if ((tmp == NULL) || (p == NULL)) + continue; + if (kstrtoint((const char *)p, DECIMAL, &value)) { + pr_err("[SCHED_RTG] rtg attr: rs_data format err!\n"); + return -INVALID_ARG; + } + if (!strcmp(tmp, "rtgId")) { + frame_info = rtg_frame_info(value); + } else if (!strcmp(tmp, "rate")) { + rate = value; + } else if (!strcmp(tmp, "type")) { + if (is_valid_type(value)) { + type = value; + } else { + pr_err("[SCHED_RTG] invalid type : %d\n", value); + return -INVALID_ARG; + } + } else { + pr_err("[SCHED_RTG] parse 
rtg attr failed!\n"); + return -INVALID_ARG; + } + } + + if (!frame_info) { + pr_err("[SCHED_RTG] rtg attr: invalid args!\n"); + return -INVALID_ARG; + } + + ret = set_frame_rate(frame_info, rate); + if (ret) + return ret; + + if (is_valid_type(type)) { + if (update_rt_frame_num(frame_info, type, UPDATE_RTG_FRAME)) { + pr_err("[SCHED_RTG] set rtg attr failed!\n"); + return -INVALID_ARG; + } + + set_frame_prio(frame_info, (type == NORMAL_TASK ? + NOT_RT_PRIO : (type + DEFAULT_RT_PRIO))); + } + + return SUCC; +} + +static long ctrl_set_rtg_attr(int abi, void __user *uarg) +{ + struct rtg_str_data rs; + char temp[MAX_DATA_LEN]; + int ret; + + if (uarg == NULL) + return -INVALID_ARG; + + if (copy_from_user(&rs, uarg, sizeof(rs))) { + pr_err("[SCHED_RTG] CMD_ID_SET_RTG_ATTR copy data failed\n"); + return -INVALID_ARG; + } + if ((rs.len <= 0) || (rs.len >= MAX_DATA_LEN)) { + pr_err("[SCHED_RTG] CMD_ID_SET_RTG_ATTR data len invalid\n"); + return -INVALID_ARG; + } + +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wpointer-to-int-cast" + switch (abi) { + case IOCTL_ABI_ARM32: + ret = copy_from_user(&temp, + (void __user *)compat_ptr((compat_uptr_t)rs.data), rs.len); + break; + case IOCTL_ABI_AARCH64: + ret = copy_from_user(&temp, (void __user *)rs.data, rs.len); + break; + default: + pr_err("[SCHED_RTG] abi format error\n"); + return -INVALID_ARG; + } +#pragma GCC diagnostic pop + + if (ret) { + pr_err("[SCHED_RTG] CMD_ID_SET_RTG_ATTR copy rs.data failed with ret %d\n", ret); + return -INVALID_ARG; + } + +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wincompatible-pointer-types" + temp[rs.len] = '\0'; + rs.data = &temp; +#pragma GCC diagnostic pop + + return parse_rtg_attr(&rs); +} + +static void start_frame_freq(struct frame_info *frame_info) +{ + if (!frame_info) + return; + + if (atomic_read(&frame_info->start_frame_freq) == 0) { + atomic_set(&frame_info->start_frame_freq, 1); + set_frame_sched_state(frame_info, true); + } +} + +static int set_frame(struct frame_info *frame_info, int margin) +{ + int ret; + if (!frame_info) + return -INVALID_RTG_ID; + + atomic_set(&frame_info->frame_state, FRAME_DRAWING); + ret = set_frame_margin(frame_info, margin); + if (ret) + goto out; + + ret = set_frame_timestamp(frame_info, FRAME_START); + if (ret) + goto out; + +out: + return ret; +} + +static int reset_frame(struct frame_info *frame_info) +{ + if (!frame_info) + return -INVALID_RTG_ID; + + if (atomic_read(&frame_info->frame_state) == FRAME_END_STATE) { + pr_debug("[SCHED_RTG]: Frame state is already reset\n"); + return -INVALID_PROC_STATE; + } + + atomic_set(&frame_info->frame_state, FRAME_END_STATE); + return set_frame_timestamp(frame_info, FRAME_END); +} + +int update_frame_state(int grp_id, int margin, bool in_frame) +{ + int ret; + struct frame_info *frame_info = NULL; + + frame_info = lookup_frame_info_by_grp_id(grp_id); + if (!frame_info || !frame_info->rtg) + return -INVALID_RTG_ID; + + if (in_frame) { + start_frame_freq(frame_info); + ret = set_frame(frame_info, margin); + trace_rtg_frame_sched(grp_id, "margin", margin); + } else { + ret = reset_frame(frame_info); + } + + return ret; +} + +static inline int curr_grp_id() +{ + return sched_get_group_id(current); +} + +static long ctrl_frame_state(void __user *uarg, bool is_enter) +{ + struct proc_state_data state_data; + + if (uarg == NULL) + return -INVALID_ARG; + + if (copy_from_user(&state_data, uarg, sizeof(state_data))) { + pr_err("[SCHED_RTG] CMD_ID_FRAME_FREQ copy data failed\n"); + return -INVALID_ARG; + } + 
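+	/* apply to the RTG frame of the calling thread's current group */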
+ return update_frame_state(curr_grp_id(), state_data.state_param, is_enter); +} + +static long ctrl_begin_frame(int abi, void __user *uarg) +{ + return ctrl_frame_state(uarg, true); +} + +static long ctrl_end_frame(int abi, void __user *uarg) +{ + return ctrl_frame_state(uarg, false); +} + +static int stop_frame_freq(int gid) +{ + struct frame_info *frame_info = NULL; + + frame_info = lookup_frame_info_by_grp_id(gid); + if (!frame_info) + return -INVALID_RTG_ID; + + atomic_set(&frame_info->start_frame_freq, 0); + set_frame_sched_state(frame_info, false); + + return 0; +} + +static long ctrl_end_scene(int abi, void __user *uarg) +{ + int rtg_id; + + if (uarg == NULL) + return -INVALID_ARG; + + if (copy_from_user(&rtg_id, uarg, sizeof(int))) { + pr_err("[SCHED_RTG] CMD_ID_END_SCENE copy data failed\n"); + return -INVALID_ARG; + } + + return stop_frame_freq(rtg_id); +} + +static int set_min_util(int gid, int min_util) +{ + struct frame_info *frame_info = NULL; + + frame_info = lookup_frame_info_by_grp_id(gid); + if (!frame_info) + return -FRAME_ERR_PID; + + return set_frame_min_util(frame_info, min_util, false); +} + +static long ctrl_set_min_util(int abi, void __user *uarg) +{ + struct proc_state_data state_data; + + if (uarg == NULL) + return -INVALID_ARG; + + if (copy_from_user(&state_data, uarg, sizeof(state_data))) { + pr_err("[SCHED_RTG] CMD_ID_SET_MIN_UTIL copy data failed\n"); + return -INVALID_ARG; + } + + return set_min_util(curr_grp_id(), state_data.state_param); +} + +static int set_margin(int grp_id, int margin) +{ + struct frame_info *frame_info = NULL; + + frame_info = lookup_frame_info_by_grp_id(grp_id); + if (!frame_info) + return -FRAME_ERR_PID; + + set_frame_margin(frame_info, margin); + + return SUCC; +} + +static long ctrl_set_margin(int abi, void __user *uarg) +{ + struct proc_state_data state_data; + + if (uarg == NULL) + return -INVALID_ARG; + + if (copy_from_user(&state_data, uarg, sizeof(state_data))) { + pr_err("[SCHED_RTG] CMD_ID_SET_MARGIN copy data failed\n"); + return -INVALID_ARG; + } + + return set_margin(curr_grp_id(), state_data.state_param); +} + +static void clear_rtg_frame_thread(struct frame_info *frame_info, bool reset) +{ + struct frame_thread_info frame_thread_info; + int i; + + if (!reset && frame_info) + frame_thread_info.prio = frame_info->prio; + else + frame_thread_info.prio = NOT_RT_PRIO; + for (i = 0; i < MAX_TID_NUM; i++) + frame_thread_info.thread[i] = -1; + frame_thread_info.thread_num = MAX_TID_NUM; + update_frame_thread_info(frame_info, &frame_thread_info); + if (reset) { + atomic_set(&frame_info->max_rt_thread_num, DEFAULT_MAX_RT_THREAD); + atomic_set(&frame_info->frame_sched_state, 0); + trace_rtg_frame_sched(frame_info->rtg->id, "FRAME_SCHED_ENABLE", 0); + } +} + +static void copy_proc_from_rsdata(struct rtg_proc_data *proc_info, + const struct rtg_grp_data *rs_data) +{ + memset(proc_info, 0, sizeof(struct rtg_proc_data)); + proc_info->type = VIP; + proc_info->rtcnt = DEFAULT_MAX_RT_THREAD; + if ((rs_data->grp_type > 0) && (rs_data->grp_type < RTG_TYPE_MAX)) + proc_info->type = rs_data->grp_type; + if ((rs_data->rt_cnt > 0) && (rs_data->rt_cnt < DEFAULT_MAX_RT_THREAD)) + proc_info->rtcnt = rs_data->rt_cnt; +} + +static void init_frame_thread_info(struct frame_thread_info *frame_thread_info, + const struct rtg_proc_data *proc_info) +{ + int i; + int type = proc_info->type; + + frame_thread_info->prio = (type == NORMAL_TASK ? 
NOT_RT_PRIO : (type + DEFAULT_RT_PRIO)); + for (i = 0; i < MAX_TID_NUM; i++) + frame_thread_info->thread[i] = proc_info->thread[i]; + frame_thread_info->thread_num = MAX_TID_NUM; +} + +static int parse_create_rtg_grp(const struct rtg_grp_data *rs_data) +{ + struct rtg_proc_data proc_info; + struct frame_info *frame_info; + struct frame_thread_info frame_thread_info; + + copy_proc_from_rsdata(&proc_info, rs_data); + proc_info.rtgid = alloc_multi_frame_info(); + frame_info = rtg_frame_info(proc_info.rtgid); + if (!frame_info) { + pr_err("[SCHED_RTG] no free multi frame.\n"); + return -NO_FREE_MULTI_FRAME; + } + atomic_set(&frame_info->max_rt_thread_num, proc_info.rtcnt); + if (update_rt_frame_num(frame_info, rs_data->grp_type, ADD_RTG_FRAME)) { + release_multi_frame_info(proc_info.rtgid); + return -NO_RT_FRAME; + } + init_frame_thread_info(&frame_thread_info, &proc_info); + update_frame_thread_info(frame_info, &frame_thread_info); + atomic_set(&frame_info->frame_sched_state, 1); + pr_info("[SCHED_RTG] %s rtgid=%d, type=%d, prio=%d, threadnum=%d, rtnum=%d\n", + __func__, proc_info.rtgid, rs_data->grp_type, + frame_thread_info.prio, frame_thread_info.thread_num, proc_info.rtcnt); + + return proc_info.rtgid; +} + +static int parse_add_rtg_thread(const struct rtg_grp_data *rs_data) +{ + struct rtg_proc_data proc_info; + struct frame_info *frame_info; + int add_index; + int add_num; + int prio; + int fail_num = 0; + int i; + + if ((rs_data->grp_id <= 0) || (rs_data->grp_id >= MAX_NUM_CGROUP_COLOC_ID)) + return -INVALID_ARG; + copy_proc_from_rsdata(&proc_info, rs_data); + frame_info = lookup_frame_info_by_grp_id(rs_data->grp_id); + if (!frame_info) { + pr_err("[SCHED_RTG] grp not created yet.\n"); + return -INVALID_ARG; + } + mutex_lock(&frame_info->lock); + add_num = rs_data->tid_num; + if ((frame_info->thread_num < 0) || (add_num < 0)) { + mutex_unlock(&frame_info->lock); + pr_err("[SCHED_RTG] Unexception err: frame_info num < 0.\n"); + return -INVALID_RTG_ID; + } + if (frame_info->thread_num + add_num > MAX_TID_NUM) { + mutex_unlock(&frame_info->lock); + return -INVALID_RTG_ID; + } + add_index = frame_info->thread_num; + prio = (proc_info.type == NORMAL_TASK) ? 
NOT_RT_PRIO : frame_info->prio; + for (i = 0; i < add_num; i++) { + frame_info->thread[add_index] = update_frame_thread(frame_info, prio, prio, + rs_data->tids[i], + frame_info->thread[add_index]); + if (frame_info->thread[add_index]) { + atomic_set(&frame_info->thread_prio[add_index], prio); + frame_info->thread_num++; + add_index = frame_info->thread_num; + } else { + fail_num++; + } + } + mutex_unlock(&frame_info->lock); + + return fail_num; +} + +static int parse_remove_thread(const struct rtg_grp_data *rs_data) +{ + pr_err("[SCHED_RTG] frame rtg not support remove single yet.\n"); + + return -INVALID_ARG; +} + +static int do_clear_or_destroy_grp(const struct rtg_grp_data *rs_data, bool destroy) +{ + struct frame_info *frame_info; + int type; + int id = rs_data->grp_id; + + if (!is_frame_rtg(id)) { + pr_err("[SCHED_RTG] Failed to destroy rtg group %d!\n", id); + return -INVALID_ARG; + } + + frame_info = rtg_frame_info(id); + if (!frame_info) { + pr_err("[SCHED_RTG] Failed to destroy rtg group %d: grp not exist.\n", id); + return -INVALID_ARG; + } + + type = frame_info->prio - DEFAULT_RT_PRIO; + if (destroy) { + clear_rtg_frame_thread(frame_info, true); + release_multi_frame_info(id); + update_rt_frame_num(frame_info, type, CLEAR_RTG_FRAME); + } else { + clear_rtg_frame_thread(frame_info, false); + } + pr_info("[SCHED_RTG] %s clear frame(id=%d)\n", __func__, id); + + return SUCC; +} + +static int parse_destroy_grp(const struct rtg_grp_data *rs_data) +{ + return do_clear_or_destroy_grp(rs_data, true); +} + +long ctrl_set_rtg(int abi, void __user *uarg) +{ + struct rtg_grp_data rs_data; + long ret; + + if (copy_from_user(&rs_data, uarg, sizeof(rs_data))) { + pr_err("[SCHED_RTG] CMD_ID_SET_RTG copy data failed\n"); + return -INVALID_ARG; + } + + switch (rs_data.rtg_cmd) { + case CMD_CREATE_RTG_GRP: + ret = parse_create_rtg_grp(&rs_data); + break; + case CMD_ADD_RTG_THREAD: + ret = parse_add_rtg_thread(&rs_data); + break; + case CMD_REMOVE_RTG_THREAD: + ret = parse_remove_thread(&rs_data); + break; + case CMD_CLEAR_RTG_GRP: + ret = -INVALID_ARG; + break; + case CMD_DESTROY_RTG_GRP: + ret = parse_destroy_grp(&rs_data); + break; + default: + return -INVALID_ARG; + } + + return ret; +} + +static long ctrl_search_rtg(int abi, void __user *uarg) +{ + struct proc_state_data search_data; + + if (copy_from_user(&search_data, uarg, sizeof(search_data))) { + pr_err("[SCHED_RTG] CMD_ID_SEARCH_RTG copy data failed\n"); + return -INVALID_ARG; + } + + return search_rtg(search_data.state_param); +} + +static long do_proc_rtg_ioctl(int abi, struct file *file, unsigned int cmd, unsigned long arg) +{ + void __user *uarg = (void __user *)(uintptr_t)arg; + unsigned int func_id = _IOC_NR(cmd); +#ifdef CONFIG_RTG_AUTHORITY + bool authorized = true; +#endif + + if (uarg == NULL) { + pr_err("[SCHED_RTG] %s: invalid user uarg\n", __func__); + return -EINVAL; + } + + if (_IOC_TYPE(cmd) != RTG_SCHED_IPC_MAGIC) { + pr_err("[SCHED_RTG] %s: RTG_SCHED_IPC_MAGIC fail, TYPE=%d\n", + __func__, _IOC_TYPE(cmd)); + return -INVALID_MAGIC; + } + + if (!atomic_read(&g_rtg_enable) && (func_id != SET_ENABLE) && (func_id != GET_ENABLE)) { + pr_err("[SCHED_RTG] CMD_ID %x error: Rtg not enabled yet.\n", cmd); + return -RTG_DISABLED; + } + + if (func_id >= RTG_CTRL_MAX_NR) { + pr_err("[SCHED_RTG] %s: RTG_MAX_NR fail, _IOC_NR(cmd)=%d, MAX_NR=%d\n", + __func__, _IOC_NR(cmd), RTG_CTRL_MAX_NR); + return -INVALID_CMD; + } + +#ifdef CONFIG_RTG_AUTHORITY + authorized = check_authorized(func_id, RTG_AUTH_FLAG); + if (!authorized) { + 
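+		/* reject callers whose uid lacks RTG authority for this command */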
pr_err("[SCHED_RTG] %s: uid not authorized.\n", __func__); + return -INVALID_CMD; + } +#endif + if (g_func_array[func_id] != NULL) + return (*g_func_array[func_id])(abi, uarg); + + return -EINVAL; +} + +static void reset_frame_info(struct frame_info *frame_info) +{ + int i; + clear_rtg_frame_thread(frame_info, true); + atomic_set(&frame_info->frame_state, -1); + atomic_set(&frame_info->curr_rt_thread_num, 0); + atomic_set(&frame_info->max_rt_thread_num, DEFAULT_MAX_RT_THREAD); + for (i = 0; i < MAX_TID_NUM; i++) + atomic_set(&frame_info->thread_prio[i], 0); +} + +static int do_init_proc_state(int rtgid, const int *config, int len) +{ + struct related_thread_group *grp = NULL; + struct frame_info *frame_info = NULL; + + grp = lookup_related_thread_group(rtgid); + if (unlikely(!grp)) + return -EINVAL; + + frame_info = (struct frame_info *)grp->private_data; + if (!frame_info) + return -EINVAL; + + reset_frame_info(frame_info); + + if ((config[RTG_FREQ_CYCLE] >= MIN_FREQ_CYCLE) && + (config[RTG_FREQ_CYCLE] <= MAX_FREQ_CYCLE)) + sched_set_group_freq_update_interval(rtgid, + (unsigned int)config[RTG_FREQ_CYCLE]); + else + sched_set_group_freq_update_interval(rtgid, + DEFAULT_FREQ_CYCLE); + + if (config[RTG_INVALID_INTERVAL] != INVALID_VALUE) + sched_set_group_util_invalid_interval(rtgid, + config[RTG_INVALID_INTERVAL]); + else + sched_set_group_util_invalid_interval(rtgid, + DEFAULT_INVALID_INTERVAL); + + set_frame_max_util(frame_info, g_frame_max_util); + + return SUCC; +} + +static int init_proc_state(const int *config, int len) +{ + int ret; + int id; + + if ((config == NULL) || (len != RTG_CONFIG_NUM)) + return -INVALID_ARG; + + if ((config[RTG_FRAME_MAX_UTIL] > 0) && + (config[RTG_FRAME_MAX_UTIL] < DEFAULT_MAX_UTIL)) + g_frame_max_util = config[RTG_FRAME_MAX_UTIL]; + + for (id = MULTI_FRAME_ID; id < (MULTI_FRAME_ID + MULTI_FRAME_NUM); id++) { + ret = do_init_proc_state(id, config, len); + if (ret) { + pr_err("[SCHED_RTG] init proc state for FRAME_ID=%d failed, ret=%d\n", + id, ret); + return ret; + } + } + atomic_set(&g_rt_frame_num, 0); + + return SUCC; +} + +static void deinit_proc_state(void) +{ + int id; + struct frame_info *frame_info = NULL; + struct related_thread_group *grp = NULL; + + for (id = MULTI_FRAME_ID; id < (MULTI_FRAME_ID + MULTI_FRAME_NUM); id++) { + grp = lookup_related_thread_group(id); + if (unlikely(!grp)) + return; + + frame_info = (struct frame_info *)grp->private_data; + if (frame_info) + reset_frame_info(frame_info); + } + clear_multi_frame_info(); + atomic_set(&g_rt_frame_num, 0); +} + +int proc_rtg_open(struct inode *inode, struct file *filp) +{ + return SUCC; +} + +static int proc_rtg_release(struct inode *inode, struct file *filp) +{ + return SUCC; +} + +long proc_rtg_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +{ + return do_proc_rtg_ioctl(IOCTL_ABI_AARCH64, file, cmd, arg); +} + +#ifdef CONFIG_COMPAT +long proc_rtg_compat_ioctl(struct file *file, + unsigned int cmd, unsigned long arg) +{ + return do_proc_rtg_ioctl(IOCTL_ABI_ARM32, file, cmd, + (unsigned long)(compat_ptr((compat_uptr_t)arg))); +} +#endif + +static const struct file_operations rtg_ctrl_fops = { + .open = proc_rtg_open, + .release = proc_rtg_release, + .unlocked_ioctl = proc_rtg_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = proc_rtg_compat_ioctl, +#endif +}; + +static struct miscdevice rtg_ctrl_device = { + .minor = MISC_DYNAMIC_MINOR, + .name = "sched_rtg_ctrl", + .fops = &rtg_ctrl_fops, + .mode = 0666, +}; + +static int __init rtg_ctrl_dev_init(void) +{ + return 
misc_register(&rtg_ctrl_device); +} + +static void __exit rtg_ctrl_dev_exit(void) +{ + misc_deregister(&rtg_ctrl_device); +} + +module_init(rtg_ctrl_dev_init); +module_exit(rtg_ctrl_dev_exit); diff --git a/kernel/sched/rtg/rtg_ctrl.h b/kernel/sched/rtg/rtg_ctrl.h new file mode 100755 index 0000000000000000000000000000000000000000..6fe3d9d399d9ec344c18f07c165613b231152b05 --- /dev/null +++ b/kernel/sched/rtg/rtg_ctrl.h @@ -0,0 +1,89 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * rtg control interface + * + * Copyright (c) 2022-2023 Huawei Technologies Co., Ltd. + */ + +#ifndef __RTG_CTL_H +#define __RTG_CTL_H + +#include +#include +#include + +#include "frame_rtg.h" + +/* set rtg */ +#define INVALID_VALUE 0xffff +#define DEFAULT_RT_PRIO 97 + +#define MAX_DATA_LEN 256 +#define DECIMAL 10 +#define DEFAULT_MAX_UTIL 1024 +#define MAX_SUBPROCESS_NUM 8 + +#define RTG_ID_INVALID (-1) +/* fit for FFRT, original DEFAULT_MAX_RT_FRAME is 3 */ +#define DEFAULT_MAX_RT_FRAME 10 +#define MAX_RT_THREAD (MAX_TID_NUM + 2) +#define INIT_VALUE (-1) +#define UPDATE_RTG_FRAME (1 << 0) +#define ADD_RTG_FRAME (1 << 1) +#define CLEAR_RTG_FRAME (1 << 2) + +#define DEFAULT_FREQ_CYCLE 4 +#define MIN_FREQ_CYCLE 1 +#define MAX_FREQ_CYCLE 16 +#define DEFAULT_INVALID_INTERVAL 50 + +/* proc_state */ +enum proc_state { + STATE_MIN = 0, + FRAME_DRAWING, + FRAME_RME_MAX = 19, + /* rme end */ + FRAME_END_STATE = FRAME_RME_MAX + 1, + + FRAME_CLICK = 100, + STATE_MAX, +}; + +enum rtg_config { + RTG_FREQ_CYCLE, + RTG_FRAME_MAX_UTIL, + RTG_INVALID_INTERVAL, + RTG_CONFIG_NUM, +}; + +enum rtg_err_no { + SUCC = 0, + RTG_DISABLED = 1, + INVALID_ARG, + INVALID_MAGIC, + INVALID_CMD, + FRAME_ERR_PID = 100, + NO_FREE_MULTI_FRAME, + NOT_MULTI_FRAME, + INVALID_RTG_ID, + NO_RT_FRAME, + INVALID_PROC_STATE, +}; + +struct rtg_grp_data { + int rtg_cmd; + int grp_id; + int grp_type; + int rt_cnt; + int tid_num; + int tids[MAX_TID_NUM]; +}; + +struct rtg_proc_data { + int rtgid; + int type; + int thread[MAX_TID_NUM]; + int rtcnt; +}; + +#endif diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index d6afa45f008a3ba4460c2dfbe2a9c9cc829ba5fa..afef39e60e9e0ea3abe0bb916e12dc825a073ccf 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -101,6 +101,45 @@ struct rq; struct cpuidle_state; +#ifdef CONFIG_SCHED_WALT +extern unsigned int sched_ravg_window; +extern unsigned int walt_cpu_util_freq_divisor; + +struct walt_sched_stats { + u64 cumulative_runnable_avg_scaled; +}; + +struct load_subtractions { + u64 window_start; + u64 subs; + u64 new_subs; +}; + +#define NUM_TRACKED_WINDOWS 2 + +struct sched_cluster { + raw_spinlock_t load_lock; + struct list_head list; + struct cpumask cpus; + int id; + int max_power_cost; + int min_power_cost; + int max_possible_capacity; + int capacity; + int efficiency; /* Differentiate cpus with different IPC capability */ + int load_scale_factor; + unsigned int exec_scale_factor; + /* + * max_freq = user maximum + * max_possible_freq = maximum supported by hardware + */ + unsigned int cur_freq, max_freq, min_freq; + unsigned int max_possible_freq; + bool freq_init_done; +}; + +extern unsigned int sched_disable_window_stats; +#endif /* CONFIG_SCHED_WALT */ /* task_struct::on_rq states: */ #define TASK_ON_RQ_QUEUED 1 #define TASK_ON_RQ_MIGRATING 2 @@ -657,6 +696,9 @@ struct cfs_rq { struct list_head leaf_cfs_rq_list; struct task_group *tg; /* group that "owns" this runqueue */ +#ifdef CONFIG_SCHED_WALT + struct walt_sched_stats walt_stats; +#endif /* Locally cached copy of our task_group's idle value */ 
int idle; @@ -679,6 +721,9 @@ struct cfs_rq { #ifdef CONFIG_SMP struct list_head throttled_csd_list; #endif +#ifdef CONFIG_SCHED_WALT + u64 cumulative_runnable_avg; +#endif #endif /* CONFIG_CFS_BANDWIDTH */ #endif /* CONFIG_FAIR_GROUP_SCHED */ }; @@ -1116,6 +1161,27 @@ struct rq { /* This is used to determine avg_idle's max value */ u64 max_idle_balance_cost; +#ifdef CONFIG_SCHED_WALT + struct sched_cluster *cluster; + struct cpumask freq_domain_cpumask; + struct walt_sched_stats walt_stats; + + u64 window_start; + unsigned long walt_flags; + + u64 cur_irqload; + u64 avg_irqload; + u64 irqload_ts; + u64 curr_runnable_sum; + u64 prev_runnable_sum; + u64 nt_curr_runnable_sum; + u64 nt_prev_runnable_sum; + u64 cum_window_demand_scaled; + struct load_subtractions load_subs[NUM_TRACKED_WINDOWS]; +#ifdef CONFIG_SCHED_RTG + struct group_cpu_time grp_time; +#endif +#endif /* CONFIG_SCHED_WALT */ #ifdef CONFIG_HOTPLUG_CPU struct rcuwait hotplug_wait; @@ -2321,6 +2387,10 @@ struct sched_class { #ifdef CONFIG_FAIR_GROUP_SCHED void (*task_change_group)(struct task_struct *p); #endif +#ifdef CONFIG_SCHED_WALT + void (*fixup_walt_sched_stats)(struct rq *rq, struct task_struct *p, + u16 updated_demand_scaled); +#endif #ifdef CONFIG_SCHED_CORE int (*task_is_throttled)(struct task_struct *p, int cpu); @@ -2623,6 +2693,15 @@ static inline int hrtick_enabled(struct rq *rq) #endif /* CONFIG_SCHED_HRTICK */ +#ifdef CONFIG_SCHED_WALT +u64 sched_ktime_clock(void); +#else +static inline u64 sched_ktime_clock(void) +{ + return sched_clock(); +} +#endif + #ifndef arch_scale_freq_tick static __always_inline void arch_scale_freq_tick(void) @@ -2703,6 +2782,11 @@ static inline bool rq_order_less(struct rq *rq1, struct rq *rq2) extern void double_rq_lock(struct rq *rq1, struct rq *rq2); +#ifdef CONFIG_SCHED_WALT +extern unsigned int sysctl_sched_use_walt_cpu_util; +extern unsigned int walt_disabled; +#endif + #ifdef CONFIG_PREEMPTION /* @@ -2993,11 +3077,20 @@ DECLARE_PER_CPU(struct update_util_data __rcu *, cpufreq_update_util_data); static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) { struct update_util_data *data; + u64 clock; + +#ifdef CONFIG_SCHED_WALT + if (!(flags & SCHED_CPUFREQ_WALT)) + return; + clock = sched_ktime_clock(); +#else + clock = rq_clock(rq); +#endif data = rcu_dereference_sched(*per_cpu_ptr(&cpufreq_update_util_data, cpu_of(rq))); if (data) - data->func(data, rq_clock(rq), flags); + data->func(data, clock, flags); } #else static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) {} @@ -3563,4 +3656,249 @@ static inline void init_sched_mm_cid(struct task_struct *t) { } extern u64 avg_vruntime(struct cfs_rq *cfs_rq); extern int entity_eligible(struct cfs_rq *cfs_rq, struct sched_entity *se); +#ifdef CONFIG_SCHED_WALT +static inline int cluster_first_cpu(struct sched_cluster *cluster) +{ + return cpumask_first(&cluster->cpus); +} + +extern struct list_head cluster_head; +extern struct sched_cluster *sched_cluster[NR_CPUS]; +unsigned long capacity_curr_of(int cpu); +unsigned long cpu_util_cfs(int cpu); + +#define for_each_sched_cluster(cluster) \ + list_for_each_entry_rcu(cluster, &cluster_head, list) + +extern struct mutex policy_mutex; +extern unsigned int sched_disable_window_stats; +extern unsigned int max_possible_freq; +extern unsigned int min_max_freq; +extern unsigned int max_possible_efficiency; +extern unsigned int min_possible_efficiency; +extern unsigned int max_capacity; +extern unsigned int min_capacity; +extern unsigned int max_load_scale_factor; 
+extern unsigned int max_possible_capacity; +extern unsigned int min_max_possible_capacity; +extern unsigned int max_power_cost; +extern unsigned int __read_mostly sched_init_task_load_windows; +extern unsigned int sysctl_sched_restrict_cluster_spill; +extern unsigned int sched_pred_alert_load; +extern struct sched_cluster init_cluster; + +static inline void walt_fixup_cum_window_demand(struct rq *rq, s64 scaled_delta) +{ + rq->cum_window_demand_scaled += scaled_delta; + if (unlikely((s64)rq->cum_window_demand_scaled < 0)) + rq->cum_window_demand_scaled = 0; +} + +/* Is frequency of two cpus synchronized with each other? */ +static inline int same_freq_domain(int src_cpu, int dst_cpu) +{ + struct rq *rq = cpu_rq(src_cpu); + + if (src_cpu == dst_cpu) + return 1; + + return cpumask_test_cpu(dst_cpu, &rq->freq_domain_cpumask); +} + +extern void reset_task_stats(struct task_struct *p); + +#define CPU_RESERVED 1 +static inline int is_reserved(int cpu) +{ + struct rq *rq = cpu_rq(cpu); + + return test_bit(CPU_RESERVED, &rq->walt_flags); +} + +static inline int mark_reserved(int cpu) +{ + struct rq *rq = cpu_rq(cpu); + + return test_and_set_bit(CPU_RESERVED, &rq->walt_flags); +} + +static inline void clear_reserved(int cpu) +{ + struct rq *rq = cpu_rq(cpu); + + clear_bit(CPU_RESERVED, &rq->walt_flags); +} + +static inline int cpu_capacity(int cpu) +{ + return cpu_rq(cpu)->cluster->capacity; +} + +static inline int cpu_max_possible_capacity(int cpu) +{ + return cpu_rq(cpu)->cluster->max_possible_capacity; +} + +static inline int cpu_load_scale_factor(int cpu) +{ + return cpu_rq(cpu)->cluster->load_scale_factor; +} + +static inline unsigned int cluster_max_freq(struct sched_cluster *cluster) +{ + /* + * Governor and thermal driver don't know the other party's mitigation + * voting. So struct cluster saves both and return min() for current + * cluster fmax. + */ + return cluster->max_freq; +} + +/* Keep track of max/min capacity possible across CPUs "currently" */ +static inline void __update_min_max_capacity(void) +{ + int i; + int max_cap = 0, min_cap = INT_MAX; + + for_each_possible_cpu(i) { + if (!cpu_active(i)) + continue; + + max_cap = max(max_cap, cpu_capacity(i)); + min_cap = min(min_cap, cpu_capacity(i)); + } + + max_capacity = max_cap; + min_capacity = min_cap; +} + +/* + * Return load_scale_factor of a cpu in reference to "most" efficient cpu, so + * that "most" efficient cpu gets a load_scale_factor of 1 + */ +static inline unsigned long +load_scale_cpu_efficiency(struct sched_cluster *cluster) +{ + return DIV_ROUND_UP(1024 * max_possible_efficiency, + cluster->efficiency); +} + +/* + * Return load_scale_factor of a cpu in reference to cpu with best max_freq + * (max_possible_freq), so that one with best max_freq gets a load_scale_factor + * of 1. + */ +static inline unsigned long load_scale_cpu_freq(struct sched_cluster *cluster) +{ + return DIV_ROUND_UP(1024 * max_possible_freq, + cluster_max_freq(cluster)); +} + +static inline int compute_load_scale_factor(struct sched_cluster *cluster) +{ + int load_scale = 1024; + + /* + * load_scale_factor accounts for the fact that task load + * is in reference to "best" performing cpu. Task's load will need to be + * scaled (up) by a factor to determine suitability to be placed on a + * (little) cpu. 
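+	 * For example, a cluster with half the peak efficiency and half the
+	 * peak max_freq ends up with a load_scale_factor of 4096 (i.e. 4x).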
+ */ + load_scale *= load_scale_cpu_efficiency(cluster); + load_scale >>= 10; + + load_scale *= load_scale_cpu_freq(cluster); + load_scale >>= 10; + + return load_scale; +} + +static inline bool is_max_capacity_cpu(int cpu) +{ + return cpu_max_possible_capacity(cpu) == max_possible_capacity; +} + +static inline bool is_min_capacity_cpu(int cpu) +{ + return cpu_max_possible_capacity(cpu) == min_max_possible_capacity; +} + +/* + * Return 'capacity' of a cpu in reference to "least" efficient cpu, such that + * least efficient cpu gets capacity of 1024 + */ +static unsigned long +capacity_scale_cpu_efficiency(struct sched_cluster *cluster) +{ + return (1024 * cluster->efficiency) / min_possible_efficiency; +} + +/* + * Return 'capacity' of a cpu in reference to cpu with lowest max_freq + * (min_max_freq), such that one with lowest max_freq gets capacity of 1024. + */ +static unsigned long capacity_scale_cpu_freq(struct sched_cluster *cluster) +{ + return (1024 * cluster_max_freq(cluster)) / min_max_freq; +} + +static inline int compute_capacity(struct sched_cluster *cluster) +{ + int capacity = 1024; + + capacity *= capacity_scale_cpu_efficiency(cluster); + capacity >>= 10; + + capacity *= capacity_scale_cpu_freq(cluster); + capacity >>= 10; + + return capacity; +} + +static inline unsigned int power_cost(int cpu, u64 demand) +{ + return cpu_max_possible_capacity(cpu); +} + +static inline unsigned long cpu_util_freq_walt(int cpu) +{ + u64 util; + struct rq *rq = cpu_rq(cpu); + unsigned long capacity = capacity_orig_of(cpu); + + if (unlikely(walt_disabled || !sysctl_sched_use_walt_cpu_util)) + return cpu_util_cfs(cpu); + + util = rq->prev_runnable_sum << SCHED_CAPACITY_SHIFT; + util = div_u64(util, sched_ravg_window); + + return (util >= capacity) ? capacity : util; +} + +static inline bool hmp_capable(void) +{ + return max_possible_capacity != min_max_possible_capacity; +} +#else /* CONFIG_SCHED_WALT */ +static inline void walt_fixup_cum_window_demand(struct rq *rq, + s64 scaled_delta) { } + +static inline int same_freq_domain(int src_cpu, int dst_cpu) +{ + return 1; +} + +static inline int is_reserved(int cpu) +{ + return 0; +} + +static inline void clear_reserved(int cpu) { } + +static inline bool hmp_capable(void) +{ + return false; +} +#endif /* CONFIG_SCHED_WALT */ + #endif /* _KERNEL_SCHED_SCHED_H */ diff --git a/kernel/sched/sched_avg.c b/kernel/sched/sched_avg.c new file mode 100755 index 0000000000000000000000000000000000000000..d74579a1553db6f6545b0e02bcc7b21d9e5da312 --- /dev/null +++ b/kernel/sched/sched_avg.c @@ -0,0 +1,186 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 2012, 2015-2021, The Linux Foundation. All rights reserved. + */ +/* + * Scheduler hook for average runqueue determination + */ +#include +#include +#include +#include +#include + +#include "sched.h" +#include "walt.h" +#include + +static DEFINE_PER_CPU(u64, nr_prod_sum); +static DEFINE_PER_CPU(u64, last_time); +static DEFINE_PER_CPU(u64, nr_big_prod_sum); +static DEFINE_PER_CPU(u64, nr); +static DEFINE_PER_CPU(u64, nr_max); + +static DEFINE_PER_CPU(unsigned long, iowait_prod_sum); +static DEFINE_PER_CPU(spinlock_t, nr_lock) = __SPIN_LOCK_UNLOCKED(nr_lock); +static s64 last_get_time; + +static DEFINE_PER_CPU(atomic64_t, last_busy_time) = ATOMIC64_INIT(0); + +#define NR_THRESHOLD_PCT 15 + +/** + * sched_get_nr_running_avg + * @return: Average nr_running, iowait and nr_big_tasks value since last poll. + * Returns the avg * 100 to return up to two decimal points + * of accuracy. 
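+ * (In this version the 100x scaling is internal only; the values stored
+ * in @stats are rounded back to whole task counts.)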
+ * + * Obtains the average nr_running value since the last poll. + * This function may not be called concurrently with itself + */ +void sched_get_nr_running_avg(struct sched_avg_stats *stats) +{ + int cpu; + u64 curr_time = sched_clock(); + u64 period = curr_time - last_get_time; + u64 tmp_nr, tmp_misfit; + + if (!period) + return; + + /* read and reset nr_running counts */ + for_each_possible_cpu(cpu) { + unsigned long flags; + u64 diff; + + spin_lock_irqsave(&per_cpu(nr_lock, cpu), flags); + curr_time = sched_clock(); + diff = curr_time - per_cpu(last_time, cpu); + BUG_ON((s64)diff < 0); + + tmp_nr = per_cpu(nr_prod_sum, cpu); + tmp_nr += per_cpu(nr, cpu) * diff; + tmp_nr = div64_u64((tmp_nr * 100), period); + + tmp_misfit = per_cpu(nr_big_prod_sum, cpu); + tmp_misfit = div64_u64((tmp_misfit * 100), period); + + /* + * NR_THRESHOLD_PCT is to make sure that the task ran + * at least 85% in the last window to compensate any + * over estimating being done. + */ + stats[cpu].nr = (int)div64_u64((tmp_nr + NR_THRESHOLD_PCT), + 100); + stats[cpu].nr_misfit = (int)div64_u64((tmp_misfit + + NR_THRESHOLD_PCT), 100); + stats[cpu].nr_max = per_cpu(nr_max, cpu); + + trace_sched_get_nr_running_avg(cpu, stats[cpu].nr, + stats[cpu].nr_misfit, stats[cpu].nr_max); + + per_cpu(last_time, cpu) = curr_time; + per_cpu(nr_prod_sum, cpu) = 0; + per_cpu(nr_big_prod_sum, cpu) = 0; + per_cpu(iowait_prod_sum, cpu) = 0; + per_cpu(nr_max, cpu) = per_cpu(nr, cpu); + + spin_unlock_irqrestore(&per_cpu(nr_lock, cpu), flags); + } + + last_get_time = curr_time; + +} +EXPORT_SYMBOL(sched_get_nr_running_avg); + +#define BUSY_NR_RUN 3 +#define BUSY_LOAD_FACTOR 10 +static inline void update_last_busy_time(int cpu, bool dequeue, + unsigned long prev_nr_run, u64 curr_time) +{ + bool nr_run_trigger = false, load_trigger = false; + + if (!hmp_capable() || is_min_capacity_cpu(cpu)) + return; + + if (prev_nr_run >= BUSY_NR_RUN && per_cpu(nr, cpu) < BUSY_NR_RUN) + nr_run_trigger = true; + + if (dequeue && (cpu_util(cpu) * BUSY_LOAD_FACTOR) > + capacity_orig_of(cpu)) + load_trigger = true; + + if (nr_run_trigger || load_trigger) + atomic64_set(&per_cpu(last_busy_time, cpu), curr_time); +} + +/** + * sched_update_nr_prod + * @cpu: The core id of the nr running driver. + * @delta: Adjust nr by 'delta' amount + * @inc: Whether we are increasing or decreasing the count + * @return: N/A + * + * Update average with latest nr_running value for CPU + */ +void sched_update_nr_prod(int cpu, long delta, bool inc) +{ + u64 diff; + u64 curr_time; + unsigned long flags, nr_running; + + spin_lock_irqsave(&per_cpu(nr_lock, cpu), flags); + nr_running = per_cpu(nr, cpu); + curr_time = sched_clock(); + diff = curr_time - per_cpu(last_time, cpu); + BUG_ON((s64)diff < 0); + per_cpu(last_time, cpu) = curr_time; + per_cpu(nr, cpu) = nr_running + (inc ? delta : -delta); + + BUG_ON((s64)per_cpu(nr, cpu) < 0); + + if (per_cpu(nr, cpu) > per_cpu(nr_max, cpu)) + per_cpu(nr_max, cpu) = per_cpu(nr, cpu); + + update_last_busy_time(cpu, !inc, nr_running, curr_time); + + per_cpu(nr_prod_sum, cpu) += nr_running * diff; + per_cpu(iowait_prod_sum, cpu) += nr_iowait_cpu(cpu) * diff; + spin_unlock_irqrestore(&per_cpu(nr_lock, cpu), flags); +} +EXPORT_SYMBOL(sched_update_nr_prod); + +/* + * Returns the CPU utilization % in the last window. 
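+ * Busy time comes from prev_runnable_sum when WALT is in use and from the
+ * PELT cfs util_avg otherwise, clamped to capacity_orig_of(cpu).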
+ * + */ +unsigned int sched_get_cpu_util(int cpu) +{ + struct rq *rq = cpu_rq(cpu); + u64 util; + unsigned long capacity, flags; + unsigned int busy; + + raw_spin_lock_irqsave(&rq->lock, flags); + + util = rq->cfs.avg.util_avg; + capacity = capacity_orig_of(cpu); + +#ifdef CONFIG_SCHED_WALT + if (!walt_disabled && sysctl_sched_use_walt_cpu_util) { + util = rq->prev_runnable_sum; + util = div64_u64(util, + sched_ravg_window >> SCHED_CAPACITY_SHIFT); + } +#endif + raw_spin_unlock_irqrestore(&rq->lock, flags); + + util = (util >= capacity) ? capacity : util; + busy = div64_ul((util * 100), capacity); + return busy; +} + +u64 sched_get_cpu_last_busy_time(int cpu) +{ + return atomic64_read(&per_cpu(last_busy_time, cpu)); +} diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c index 85590599b4d60545b7774222971f8f7baa678dac..84f00a3de3d0232360c3e95f25328cc053b0e289 100644 --- a/kernel/sched/stop_task.c +++ b/kernel/sched/stop_task.c @@ -7,6 +7,7 @@ * * See kernel/stop_machine.c */ +#include "walt.h" #ifdef CONFIG_SMP static int @@ -55,12 +56,14 @@ static void enqueue_task_stop(struct rq *rq, struct task_struct *p, int flags) { add_nr_running(rq, 1); + walt_inc_cumulative_runnable_avg(rq, p); } static void dequeue_task_stop(struct rq *rq, struct task_struct *p, int flags) { sub_nr_running(rq, 1); + walt_dec_cumulative_runnable_avg(rq, p); } static void yield_task_stop(struct rq *rq) @@ -138,4 +141,7 @@ DEFINE_SCHED_CLASS(stop) = { .prio_changed = prio_changed_stop, .switched_to = switched_to_stop, .update_curr = update_curr_stop, +#ifdef CONFIG_SCHED_WALT + .fixup_walt_sched_stats = fixup_walt_sched_stats_common, +#endif }; diff --git a/kernel/sched/walt.c b/kernel/sched/walt.c new file mode 100755 index 0000000000000000000000000000000000000000..4391bf669ad49de314d601479a315991adbc8add --- /dev/null +++ b/kernel/sched/walt.c @@ -0,0 +1,1862 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * walt.c + * + * Window Assistant Load Tracking + * + * This software is licensed under the terms of the GNU General Public + * License version 2, as published by the Free Software Foundation, and + * may be copied, distributed, and modified under those terms. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + */ + +#include +#include +#include +#include +#include +#include +#include +#include "sched.h" +#include "walt.h" +#include "core_ctl.h" +#include "rtg/rtg.h" +#define CREATE_TRACE_POINTS +#include +#undef CREATE_TRACE_POINTS + +const char *task_event_names[] = {"PUT_PREV_TASK", "PICK_NEXT_TASK", + "TASK_WAKE", "TASK_MIGRATE", "TASK_UPDATE", + "IRQ_UPDATE"}; +const char *migrate_type_names[] = {"GROUP_TO_RQ", "RQ_TO_GROUP", + "RQ_TO_RQ", "GROUP_TO_GROUP"}; + +#define SCHED_FREQ_ACCOUNT_WAIT_TIME 0 +#define SCHED_ACCOUNT_WAIT_TIME 1 + +static ktime_t ktime_last; +static bool sched_ktime_suspended; +DEFINE_MUTEX(cluster_lock); +static atomic64_t walt_irq_work_lastq_ws; +u64 walt_load_reported_window; + +static struct irq_work walt_cpufreq_irq_work; +static struct irq_work walt_migration_irq_work; + +u64 sched_ktime_clock(void) +{ + if (unlikely(sched_ktime_suspended)) + return ktime_to_ns(ktime_last); + return ktime_get_ns(); +} + +static void sched_resume(void) +{ + sched_ktime_suspended = false; +} + +static int sched_suspend(void) +{ + ktime_last = ktime_get(); + sched_ktime_suspended = true; + return 0; +} + +static struct syscore_ops sched_syscore_ops = { + .resume = sched_resume, + .suspend = sched_suspend +}; + +static int __init sched_init_ops(void) +{ + register_syscore_ops(&sched_syscore_ops); + return 0; +} +late_initcall(sched_init_ops); + +static void acquire_rq_locks_irqsave(const cpumask_t *cpus, + unsigned long *flags) +{ + int cpu; + int level = 0; + + local_irq_save(*flags); + for_each_cpu(cpu, cpus) { + if (level == 0) + raw_spin_lock(&cpu_rq(cpu)->__lock); + else + raw_spin_lock_nested(&cpu_rq(cpu)->__lock, level); + level++; + } +} + +static void release_rq_locks_irqrestore(const cpumask_t *cpus, + unsigned long *flags) +{ + int cpu; + + for_each_cpu(cpu, cpus) + raw_spin_unlock(&cpu_rq(cpu)->__lock); + local_irq_restore(*flags); +} + +#ifdef CONFIG_HZ_300 +/* + * Tick interval becomes to 3333333 due to + * rounding error when HZ=300. + */ +#define MIN_SCHED_RAVG_WINDOW (3333333 * 6) +#else +/* Min window size (in ns) = 20ms */ +#define MIN_SCHED_RAVG_WINDOW 20000000 +#endif + +/* Max window size (in ns) = 1s */ +#define MAX_SCHED_RAVG_WINDOW 1000000000 + +/* 1 -> use PELT based load stats, 0 -> use window-based load stats */ +unsigned int __read_mostly walt_disabled; + +__read_mostly unsigned int sysctl_sched_cpu_high_irqload = (10 * NSEC_PER_MSEC); + +/* + * sched_window_stats_policy and sched_ravg_hist_size have a 'sysctl' copy + * associated with them. This is required for atomic update of those variables + * when being modifed via sysctl interface. + * + * IMPORTANT: Initialize both copies to same value!! + */ + +__read_mostly unsigned int sched_ravg_hist_size = 5; +__read_mostly unsigned int sysctl_sched_ravg_hist_size = 5; + +__read_mostly unsigned int sched_window_stats_policy = WINDOW_STATS_MAX_RECENT_AVG; +__read_mostly unsigned int sysctl_sched_window_stats_policy = WINDOW_STATS_MAX_RECENT_AVG; + +static __read_mostly unsigned int sched_io_is_busy = 1; + +unsigned int sysctl_sched_use_walt_cpu_util = 1; +unsigned int sysctl_sched_use_walt_task_util = 1; +unsigned int sysctl_sched_walt_init_task_load_pct = 15; +__read_mostly unsigned int sysctl_sched_walt_cpu_high_irqload = (10 * NSEC_PER_MSEC); + +/* Window size (in ns) */ +__read_mostly unsigned int sched_ravg_window = MIN_SCHED_RAVG_WINDOW; + +/* + * A after-boot constant divisor for cpu_util_freq_walt() to apply the load + * boost. 
+ */ +__read_mostly unsigned int walt_cpu_util_freq_divisor; + +/* Initial task load. Newly created tasks are assigned this load. */ +unsigned int __read_mostly sched_init_task_load_windows; +unsigned int __read_mostly sched_init_task_load_windows_scaled; +unsigned int __read_mostly sysctl_sched_init_task_load_pct = 15; + +/* + * Maximum possible frequency across all cpus. Task demand and cpu + * capacity (cpu_power) metrics are scaled in reference to it. + */ +unsigned int max_possible_freq = 1; + +/* + * Minimum possible max_freq across all cpus. This will be same as + * max_possible_freq on homogeneous systems and could be different from + * max_possible_freq on heterogenous systems. min_max_freq is used to derive + */ +unsigned int min_max_freq = 1; + +unsigned int max_capacity = 1024; /* max(rq->capacity) */ +unsigned int min_capacity = 1024; /* min(rq->capacity) */ +unsigned int max_possible_capacity = 1024; /* max(rq->max_possible_capacity) */ +unsigned int +min_max_possible_capacity = 1024; /* min(rq->max_possible_capacity) */ + +/* Temporarily disable window-stats activity on all cpus */ +unsigned int __read_mostly sched_disable_window_stats; + +/* + * This governs what load needs to be used when reporting CPU busy time + * to the cpufreq governor. + */ +__read_mostly unsigned int sysctl_sched_freq_reporting_policy; + +static int __init set_sched_ravg_window(char *str) +{ + unsigned int window_size; + + get_option(&str, &window_size); + + if (window_size < MIN_SCHED_RAVG_WINDOW || + window_size > MAX_SCHED_RAVG_WINDOW) { + WARN_ON(1); + return -EINVAL; + } + + sched_ravg_window = window_size; + return 0; +} +early_param("sched_ravg_window", set_sched_ravg_window); + +__read_mostly unsigned int walt_scale_demand_divisor; +#define scale_demand(d) ((d)/walt_scale_demand_divisor) + +void inc_rq_walt_stats(struct rq *rq, struct task_struct *p) +{ + walt_inc_cumulative_runnable_avg(rq, p); +} + +void dec_rq_walt_stats(struct rq *rq, struct task_struct *p) +{ + walt_dec_cumulative_runnable_avg(rq, p); +} + +void fixup_walt_sched_stats_common(struct rq *rq, struct task_struct *p, + u16 updated_demand_scaled) +{ + s64 task_load_delta = (s64)updated_demand_scaled - + p->ravg.demand_scaled; + + fixup_cumulative_runnable_avg(&rq->walt_stats, task_load_delta); + + walt_fixup_cum_window_demand(rq, task_load_delta); +} + +static u64 +update_window_start(struct rq *rq, u64 wallclock, int event) +{ + s64 delta; + int nr_windows; + u64 old_window_start = rq->window_start; + + delta = wallclock - rq->window_start; + BUG_ON(delta < 0); + if (delta < sched_ravg_window) + return old_window_start; + + nr_windows = div64_u64(delta, sched_ravg_window); + rq->window_start += (u64)nr_windows * (u64)sched_ravg_window; + + rq->cum_window_demand_scaled = + rq->walt_stats.cumulative_runnable_avg_scaled; + + return old_window_start; +} + +void sched_account_irqtime(int cpu, struct task_struct *curr, + u64 delta, u64 wallclock) +{ + struct rq *rq = cpu_rq(cpu); + unsigned long flags, nr_windows; + u64 cur_jiffies_ts; + + raw_spin_lock_irqsave(&rq->__lock, flags); + + /* + * cputime (wallclock) uses sched_clock so use the same here for + * consistency. + */ + delta += sched_clock() - wallclock; + cur_jiffies_ts = get_jiffies_64(); + + if (is_idle_task(curr)) + update_task_ravg(curr, rq, IRQ_UPDATE, sched_ktime_clock(), + delta); + + nr_windows = cur_jiffies_ts - rq->irqload_ts; + + if (nr_windows) { + if (nr_windows < 10) { + /* Decay CPU's irqload by 3/4 for each window. 
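+			 * (the arithmetic below applies a single 3/4 scaling;
+			 * the else branch clears the history once ten or more
+			 * have elapsed)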
*/ + rq->avg_irqload *= (3 * nr_windows); + rq->avg_irqload = div64_u64(rq->avg_irqload, + 4 * nr_windows); + } else { + rq->avg_irqload = 0; + } + rq->avg_irqload += rq->cur_irqload; + rq->cur_irqload = 0; + } + + rq->cur_irqload += delta; + rq->irqload_ts = cur_jiffies_ts; + raw_spin_unlock_irqrestore(&rq->__lock, flags); +} + +static int +account_busy_for_task_demand(struct rq *rq, struct task_struct *p, int event) +{ + /* + * No need to bother updating task demand for exiting tasks + * or the idle task. + */ + if (exiting_task(p) || is_idle_task(p)) + return 0; + + /* + * When a task is waking up it is completing a segment of non-busy + * time. Likewise, if wait time is not treated as busy time, then + * when a task begins to run or is migrated, it is not running and + * is completing a segment of non-busy time. + */ + if (event == TASK_WAKE || (!SCHED_ACCOUNT_WAIT_TIME && + (event == PICK_NEXT_TASK || event == TASK_MIGRATE))) + return 0; + + /* + * The idle exit time is not accounted for the first task _picked_ up to + * run on the idle CPU. + */ + if (event == PICK_NEXT_TASK && rq->curr == rq->idle) + return 0; + + /* + * TASK_UPDATE can be called on sleeping task, when its moved between + * related groups + */ + if (event == TASK_UPDATE) { + if (rq->curr == p) + return 1; + + return p->on_rq ? SCHED_ACCOUNT_WAIT_TIME : 0; + } + + return 1; +} + +/* + * In this function we match the accumulated subtractions with the current + * and previous windows we are operating with. Ignore any entries where + * the window start in the load_subtraction struct does not match either + * the curent or the previous window. This could happen whenever CPUs + * become idle or busy with interrupts disabled for an extended period. + */ +static inline void account_load_subtractions(struct rq *rq) +{ + u64 ws = rq->window_start; + u64 prev_ws = ws - sched_ravg_window; + struct load_subtractions *ls = rq->load_subs; + int i; + + for (i = 0; i < NUM_TRACKED_WINDOWS; i++) { + if (ls[i].window_start == ws) { + rq->curr_runnable_sum -= ls[i].subs; + rq->nt_curr_runnable_sum -= ls[i].new_subs; + } else if (ls[i].window_start == prev_ws) { + rq->prev_runnable_sum -= ls[i].subs; + rq->nt_prev_runnable_sum -= ls[i].new_subs; + } + + ls[i].subs = 0; + ls[i].new_subs = 0; + } + + BUG_ON((s64)rq->prev_runnable_sum < 0); + BUG_ON((s64)rq->curr_runnable_sum < 0); + BUG_ON((s64)rq->nt_prev_runnable_sum < 0); + BUG_ON((s64)rq->nt_curr_runnable_sum < 0); +} + +static inline void create_subtraction_entry(struct rq *rq, u64 ws, int index) +{ + rq->load_subs[index].window_start = ws; + rq->load_subs[index].subs = 0; + rq->load_subs[index].new_subs = 0; +} + +static bool get_subtraction_index(struct rq *rq, u64 ws) +{ + int i; + u64 oldest = ULLONG_MAX; + int oldest_index = 0; + + for (i = 0; i < NUM_TRACKED_WINDOWS; i++) { + u64 entry_ws = rq->load_subs[i].window_start; + + if (ws == entry_ws) + return i; + + if (entry_ws < oldest) { + oldest = entry_ws; + oldest_index = i; + } + } + + create_subtraction_entry(rq, ws, oldest_index); + return oldest_index; +} + +static void update_rq_load_subtractions(int index, struct rq *rq, + u32 sub_load, bool new_task) +{ + rq->load_subs[index].subs += sub_load; + if (new_task) + rq->load_subs[index].new_subs += sub_load; +} + +void update_cluster_load_subtractions(struct task_struct *p, + int cpu, u64 ws, bool new_task) +{ + struct sched_cluster *cluster = cpu_cluster(cpu); + struct cpumask cluster_cpus = cluster->cpus; + u64 prev_ws = ws - sched_ravg_window; + int i; + + 
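+	/*
+	 * @cpu is excluded: its counters are adjusted directly by the caller.
+	 * Queue subtractions only for the sibling CPUs of the cluster.
+	 */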
cpumask_clear_cpu(cpu, &cluster_cpus); + raw_spin_lock(&cluster->load_lock); + + for_each_cpu(i, &cluster_cpus) { + struct rq *rq = cpu_rq(i); + int index; + + if (p->ravg.curr_window_cpu[i]) { + index = get_subtraction_index(rq, ws); + update_rq_load_subtractions(index, rq, + p->ravg.curr_window_cpu[i], new_task); + p->ravg.curr_window_cpu[i] = 0; + } + + if (p->ravg.prev_window_cpu[i]) { + index = get_subtraction_index(rq, prev_ws); + update_rq_load_subtractions(index, rq, + p->ravg.prev_window_cpu[i], new_task); + p->ravg.prev_window_cpu[i] = 0; + } + } + + raw_spin_unlock(&cluster->load_lock); +} + +static inline void inter_cluster_migration_fixup + (struct task_struct *p, int new_cpu, int task_cpu, bool new_task) +{ + struct rq *dest_rq = cpu_rq(new_cpu); + struct rq *src_rq = cpu_rq(task_cpu); + + if (same_freq_domain(new_cpu, task_cpu)) + return; + + p->ravg.curr_window_cpu[new_cpu] = p->ravg.curr_window; + p->ravg.prev_window_cpu[new_cpu] = p->ravg.prev_window; + + dest_rq->curr_runnable_sum += p->ravg.curr_window; + dest_rq->prev_runnable_sum += p->ravg.prev_window; + + src_rq->curr_runnable_sum -= p->ravg.curr_window_cpu[task_cpu]; + src_rq->prev_runnable_sum -= p->ravg.prev_window_cpu[task_cpu]; + + if (new_task) { + dest_rq->nt_curr_runnable_sum += p->ravg.curr_window; + dest_rq->nt_prev_runnable_sum += p->ravg.prev_window; + + src_rq->nt_curr_runnable_sum -= + p->ravg.curr_window_cpu[task_cpu]; + src_rq->nt_prev_runnable_sum -= + p->ravg.prev_window_cpu[task_cpu]; + } + + p->ravg.curr_window_cpu[task_cpu] = 0; + p->ravg.prev_window_cpu[task_cpu] = 0; + + update_cluster_load_subtractions(p, task_cpu, + src_rq->window_start, new_task); + + BUG_ON((s64)src_rq->prev_runnable_sum < 0); + BUG_ON((s64)src_rq->curr_runnable_sum < 0); + BUG_ON((s64)src_rq->nt_prev_runnable_sum < 0); + BUG_ON((s64)src_rq->nt_curr_runnable_sum < 0); +} + +void fixup_busy_time(struct task_struct *p, int new_cpu) +{ + struct rq *src_rq = task_rq(p); + struct rq *dest_rq = cpu_rq(new_cpu); + u64 wallclock; + bool new_task; +#ifdef CONFIG_SCHED_RTG + u64 *src_curr_runnable_sum, *dst_curr_runnable_sum; + u64 *src_prev_runnable_sum, *dst_prev_runnable_sum; + u64 *src_nt_curr_runnable_sum, *dst_nt_curr_runnable_sum; + u64 *src_nt_prev_runnable_sum, *dst_nt_prev_runnable_sum; + struct related_thread_group *grp; +#endif + + if (!p->on_rq && p->__state != TASK_WAKING) + return; + + if (exiting_task(p)) + return; + + if (p->__state == TASK_WAKING) + double_rq_lock(src_rq, dest_rq); + + if (sched_disable_window_stats) + goto done; + + wallclock = sched_ktime_clock(); + + update_task_ravg(task_rq(p)->curr, task_rq(p), + TASK_UPDATE, + wallclock, 0); + update_task_ravg(dest_rq->curr, dest_rq, + TASK_UPDATE, wallclock, 0); + + update_task_ravg(p, task_rq(p), TASK_MIGRATE, + wallclock, 0); + + /* + * When a task is migrating during the wakeup, adjust + * the task's contribution towards cumulative window + * demand. + */ + if (p->__state == TASK_WAKING && p->last_sleep_ts >= + src_rq->window_start) { + walt_fixup_cum_window_demand(src_rq, + -(s64)p->ravg.demand_scaled); + walt_fixup_cum_window_demand(dest_rq, p->ravg.demand_scaled); + } + + new_task = is_new_task(p); +#ifdef CONFIG_SCHED_RTG + /* Protected by rq_lock */ + grp = task_related_thread_group(p); + + /* + * For frequency aggregation, we continue to do migration fixups + * even for intra cluster migrations. This is because, the aggregated + * load has to reported on a single CPU regardless. 
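+	 * (the task's current and previous windows are simply moved between
+	 * the source and destination rq's grp_time below)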
+ */ + if (grp) { + struct group_cpu_time *cpu_time; + + cpu_time = &src_rq->grp_time; + src_curr_runnable_sum = &cpu_time->curr_runnable_sum; + src_prev_runnable_sum = &cpu_time->prev_runnable_sum; + src_nt_curr_runnable_sum = &cpu_time->nt_curr_runnable_sum; + src_nt_prev_runnable_sum = &cpu_time->nt_prev_runnable_sum; + + cpu_time = &dest_rq->grp_time; + dst_curr_runnable_sum = &cpu_time->curr_runnable_sum; + dst_prev_runnable_sum = &cpu_time->prev_runnable_sum; + dst_nt_curr_runnable_sum = &cpu_time->nt_curr_runnable_sum; + dst_nt_prev_runnable_sum = &cpu_time->nt_prev_runnable_sum; + + if (p->ravg.curr_window) { + *src_curr_runnable_sum -= p->ravg.curr_window; + *dst_curr_runnable_sum += p->ravg.curr_window; + if (new_task) { + *src_nt_curr_runnable_sum -= + p->ravg.curr_window; + *dst_nt_curr_runnable_sum += + p->ravg.curr_window; + } + } + + if (p->ravg.prev_window) { + *src_prev_runnable_sum -= p->ravg.prev_window; + *dst_prev_runnable_sum += p->ravg.prev_window; + if (new_task) { + *src_nt_prev_runnable_sum -= + p->ravg.prev_window; + *dst_nt_prev_runnable_sum += + p->ravg.prev_window; + } + } + } else { +#endif + inter_cluster_migration_fixup(p, new_cpu, + task_cpu(p), new_task); +#ifdef CONFIG_SCHED_RTG + } +#endif + + if (!same_freq_domain(new_cpu, task_cpu(p))) + irq_work_queue(&walt_migration_irq_work); + +done: + if (p->__state == TASK_WAKING) + double_rq_unlock(src_rq, dest_rq); +} + +void set_window_start(struct rq *rq) +{ + static int sync_cpu_available; + + if (likely(rq->window_start)) + return; + + if (!sync_cpu_available) { + rq->window_start = 1; + sync_cpu_available = 1; + atomic64_set(&walt_irq_work_lastq_ws, rq->window_start); + walt_load_reported_window = + atomic64_read(&walt_irq_work_lastq_ws); + + } else { + struct rq *sync_rq = cpu_rq(cpumask_any(cpu_online_mask)); + + raw_spin_unlock(&rq->__lock); + double_rq_lock(rq, sync_rq); + rq->window_start = sync_rq->window_start; + rq->curr_runnable_sum = rq->prev_runnable_sum = 0; + rq->nt_curr_runnable_sum = rq->nt_prev_runnable_sum = 0; + raw_spin_unlock(&sync_rq->__lock); + } + + rq->curr->ravg.mark_start = rq->window_start; +} + +/* + * Called when new window is starting for a task, to record cpu usage over + * recently concluded window(s). Normally 'samples' should be 1. It can be > 1 + * when, say, a real-time task runs without preemption for several windows at a + * stretch. 
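+ * In that case the same full-window runtime is recorded 'samples' times
+ * (capped at the history size).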
+ */ +static void update_history(struct rq *rq, struct task_struct *p, + u32 runtime, int samples, int event) +{ + u32 *hist = &p->ravg.sum_history[0]; + int ridx, widx; + u32 max = 0, avg, demand; + u64 sum = 0; + u16 demand_scaled; + + /* Ignore windows where task had no activity */ + if (!runtime || is_idle_task(p) || exiting_task(p) || !samples) + goto done; + + /* Push new 'runtime' value onto stack */ + widx = sched_ravg_hist_size - 1; + ridx = widx - samples; + for (; ridx >= 0; --widx, --ridx) { + hist[widx] = hist[ridx]; + sum += hist[widx]; + if (hist[widx] > max) + max = hist[widx]; + } + + for (widx = 0; widx < samples && widx < sched_ravg_hist_size; widx++) { + hist[widx] = runtime; + sum += hist[widx]; + if (hist[widx] > max) + max = hist[widx]; + } + + p->ravg.sum = 0; + + if (sched_window_stats_policy == WINDOW_STATS_RECENT) { + demand = runtime; + } else if (sched_window_stats_policy == WINDOW_STATS_MAX) { + demand = max; + } else { + avg = div64_u64(sum, sched_ravg_hist_size); + if (sched_window_stats_policy == WINDOW_STATS_AVG) + demand = avg; + else + demand = max(avg, runtime); + } + demand_scaled = scale_demand(demand); + + /* + * A throttled deadline sched class task gets dequeued without + * changing p->on_rq. Since the dequeue decrements walt stats + * avoid decrementing it here again. + * + * When window is rolled over, the cumulative window demand + * is reset to the cumulative runnable average (contribution from + * the tasks on the runqueue). If the current task is dequeued + * already, it's demand is not included in the cumulative runnable + * average. So add the task demand separately to cumulative window + * demand. + */ + if (!task_has_dl_policy(p) || !p->dl.dl_throttled) { + if (task_on_rq_queued(p) + && p->sched_class->fixup_walt_sched_stats) + p->sched_class->fixup_walt_sched_stats(rq, p, + demand_scaled); + else if (rq->curr == p) + walt_fixup_cum_window_demand(rq, demand_scaled); + } + + p->ravg.demand = demand; + p->ravg.demand_scaled = demand_scaled; + +done: + trace_sched_update_history(rq, p, runtime, samples, event); +} + +#define DIV64_U64_ROUNDUP(X, Y) div64_u64((X) + (Y - 1), Y) + +static u64 add_to_task_demand(struct rq *rq, struct task_struct *p, u64 delta) +{ + delta = scale_exec_time(delta, rq); + p->ravg.sum += delta; + if (unlikely(p->ravg.sum > sched_ravg_window)) + p->ravg.sum = sched_ravg_window; + + return delta; +} + +/* + * Account cpu demand of task and/or update task's cpu demand history + * + * ms = p->ravg.mark_start; + * wc = wallclock + * ws = rq->window_start + * + * Three possibilities: + * + * a) Task event is contained within one window. + * window_start < mark_start < wallclock + * + * ws ms wc + * | | | + * V V V + * |---------------| + * + * In this case, p->ravg.sum is updated *iff* event is appropriate + * (ex: event == PUT_PREV_TASK) + * + * b) Task event spans two windows. + * mark_start < window_start < wallclock + * + * ms ws wc + * | | | + * V V V + * -----|------------------- + * + * In this case, p->ravg.sum is updated with (ws - ms) *iff* event + * is appropriate, then a new window sample is recorded followed + * by p->ravg.sum being set to (wc - ws) *iff* event is appropriate. + * + * c) Task event spans more than two windows. 
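+ *    (at least one full window lies between mark_start and window_start)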
+ * + * ms ws_tmp ws wc + * | | | | + * V V V V + * ---|-------|-------|-------|-------|------ + * | | + * |<------ nr_full_windows ------>| + * + * In this case, p->ravg.sum is updated with (ws_tmp - ms) first *iff* + * event is appropriate, window sample of p->ravg.sum is recorded, + * 'nr_full_window' samples of window_size is also recorded *iff* + * event is appropriate and finally p->ravg.sum is set to (wc - ws) + * *iff* event is appropriate. + * + * IMPORTANT : Leave p->ravg.mark_start unchanged, as update_cpu_busy_time() + * depends on it! + */ +static u64 update_task_demand(struct task_struct *p, struct rq *rq, + int event, u64 wallclock) +{ + u64 mark_start = p->ravg.mark_start; + u64 delta, window_start = rq->window_start; + int new_window, nr_full_windows; + u32 window_size = sched_ravg_window; + u64 runtime; + +#ifdef CONFIG_SCHED_RTG + update_group_demand(p, rq, event, wallclock); +#endif + + new_window = mark_start < window_start; + if (!account_busy_for_task_demand(rq, p, event)) { + if (new_window) + /* + * If the time accounted isn't being accounted as + * busy time, and a new window started, only the + * previous window need be closed out with the + * pre-existing demand. Multiple windows may have + * elapsed, but since empty windows are dropped, + * it is not necessary to account those. + */ + update_history(rq, p, p->ravg.sum, 1, event); + return 0; + } + + if (!new_window) { + /* + * The simple case - busy time contained within the existing + * window. + */ + return add_to_task_demand(rq, p, wallclock - mark_start); + } + + /* + * Busy time spans at least two windows. Temporarily rewind + * window_start to first window boundary after mark_start. + */ + delta = window_start - mark_start; + nr_full_windows = div64_u64(delta, window_size); + window_start -= (u64)nr_full_windows * (u64)window_size; + + /* Process (window_start - mark_start) first */ + runtime = add_to_task_demand(rq, p, window_start - mark_start); + + /* Push new sample(s) into task's demand history */ + update_history(rq, p, p->ravg.sum, 1, event); + if (nr_full_windows) { + u64 scaled_window = scale_exec_time(window_size, rq); + + update_history(rq, p, scaled_window, nr_full_windows, event); + runtime += nr_full_windows * scaled_window; + } + + /* + * Roll window_start back to current to process any remainder + * in current window. 
+ */ + window_start += (u64)nr_full_windows * (u64)window_size; + + /* Process (wallclock - window_start) next */ + mark_start = window_start; + runtime += add_to_task_demand(rq, p, wallclock - mark_start); + + return runtime; +} + +static u32 empty_windows[NR_CPUS]; + +static void rollover_task_window(struct task_struct *p, bool full_window) +{ + u32 *curr_cpu_windows = empty_windows; + u32 curr_window; + int i; + + /* Rollover the sum */ + curr_window = 0; + + if (!full_window) { + curr_window = p->ravg.curr_window; + curr_cpu_windows = p->ravg.curr_window_cpu; + } + + p->ravg.prev_window = curr_window; + p->ravg.curr_window = 0; + + /* Roll over individual CPU contributions */ + for (i = 0; i < nr_cpu_ids; i++) { + p->ravg.prev_window_cpu[i] = curr_cpu_windows[i]; + p->ravg.curr_window_cpu[i] = 0; + } +} + +static void rollover_cpu_window(struct rq *rq, bool full_window) +{ + u64 curr_sum = rq->curr_runnable_sum; + u64 nt_curr_sum = rq->nt_curr_runnable_sum; + + if (unlikely(full_window)) { + curr_sum = 0; + nt_curr_sum = 0; + } + + rq->prev_runnable_sum = curr_sum; + rq->nt_prev_runnable_sum = nt_curr_sum; + + rq->curr_runnable_sum = 0; + rq->nt_curr_runnable_sum = 0; +} + +static inline int cpu_is_waiting_on_io(struct rq *rq) +{ + if (!sched_io_is_busy) + return 0; + + return atomic_read(&rq->nr_iowait); +} + +static int account_busy_for_cpu_time(struct rq *rq, struct task_struct *p, + u64 irqtime, int event) +{ + if (is_idle_task(p)) { + /* TASK_WAKE && TASK_MIGRATE is not possible on idle task! */ + if (event == PICK_NEXT_TASK) + return 0; + + /* PUT_PREV_TASK, TASK_UPDATE && IRQ_UPDATE are left */ + return irqtime || cpu_is_waiting_on_io(rq); + } + + if (event == TASK_WAKE) + return 0; + + if (event == PUT_PREV_TASK || event == IRQ_UPDATE) + return 1; + + /* + * TASK_UPDATE can be called on sleeping task, when its moved between + * related groups + */ + if (event == TASK_UPDATE) { + if (rq->curr == p) + return 1; + + return p->on_rq ? SCHED_FREQ_ACCOUNT_WAIT_TIME : 0; + } + + /* TASK_MIGRATE, PICK_NEXT_TASK left */ + return SCHED_FREQ_ACCOUNT_WAIT_TIME; +} + +/* + * Account cpu activity in its busy time counters (rq->curr/prev_runnable_sum) + */ +static void update_cpu_busy_time(struct task_struct *p, struct rq *rq, + int event, u64 wallclock, u64 irqtime) +{ + int new_window, full_window = 0; + int p_is_curr_task = (p == rq->curr); + u64 mark_start = p->ravg.mark_start; + u64 window_start = rq->window_start; + u32 window_size = sched_ravg_window; + u64 delta; + u64 *curr_runnable_sum = &rq->curr_runnable_sum; + u64 *prev_runnable_sum = &rq->prev_runnable_sum; + u64 *nt_curr_runnable_sum = &rq->nt_curr_runnable_sum; + u64 *nt_prev_runnable_sum = &rq->nt_prev_runnable_sum; + bool new_task; + int cpu = rq->cpu; +#ifdef CONFIG_SCHED_RTG + struct group_cpu_time *cpu_time; + struct related_thread_group *grp; +#endif + + new_window = mark_start < window_start; + if (new_window) { + full_window = (window_start - mark_start) >= window_size; + if (p->ravg.active_windows < USHRT_MAX) + p->ravg.active_windows++; + } + + new_task = is_new_task(p); + + /* + * Handle per-task window rollover. We don't care about the idle + * task or exiting tasks. 
+ */ + if (!is_idle_task(p) && !exiting_task(p)) { + if (new_window) + rollover_task_window(p, full_window); + } + + if (p_is_curr_task && new_window) + rollover_cpu_window(rq, full_window); + + if (!account_busy_for_cpu_time(rq, p, irqtime, event)) + goto done; + +#ifdef CONFIG_SCHED_RTG + grp = task_related_thread_group(p); + if (grp) { + cpu_time = &rq->grp_time; + + curr_runnable_sum = &cpu_time->curr_runnable_sum; + prev_runnable_sum = &cpu_time->prev_runnable_sum; + + nt_curr_runnable_sum = &cpu_time->nt_curr_runnable_sum; + nt_prev_runnable_sum = &cpu_time->nt_prev_runnable_sum; + } +#endif + + if (!new_window) { + /* + * account_busy_for_cpu_time() = 1 so busy time needs + * to be accounted to the current window. No rollover + * since we didn't start a new window. An example of this is + * when a task starts execution and then sleeps within the + * same window. + */ + + if (!irqtime || !is_idle_task(p) || cpu_is_waiting_on_io(rq)) + delta = wallclock - mark_start; + else + delta = irqtime; + delta = scale_exec_time(delta, rq); + *curr_runnable_sum += delta; + if (new_task) + *nt_curr_runnable_sum += delta; + + if (!is_idle_task(p) && !exiting_task(p)) { + p->ravg.curr_window += delta; + p->ravg.curr_window_cpu[cpu] += delta; + } + + goto done; + } + + if (!p_is_curr_task) { + /* + * account_busy_for_cpu_time() = 1 so busy time needs + * to be accounted to the current window. A new window + * has also started, but p is not the current task, so the + * window is not rolled over - just split up and account + * as necessary into curr and prev. The window is only + * rolled over when a new window is processed for the current + * task. + * + * Irqtime can't be accounted by a task that isn't the + * currently running task. + */ + + if (!full_window) { + /* + * A full window hasn't elapsed, account partial + * contribution to previous completed window. + */ + delta = scale_exec_time(window_start - mark_start, rq); + if (!exiting_task(p)) { + p->ravg.prev_window += delta; + p->ravg.prev_window_cpu[cpu] += delta; + } + } else { + /* + * Since at least one full window has elapsed, + * the contribution to the previous window is the + * full window (window_size). + */ + delta = scale_exec_time(window_size, rq); + if (!exiting_task(p)) { + p->ravg.prev_window = delta; + p->ravg.prev_window_cpu[cpu] = delta; + } + } + + *prev_runnable_sum += delta; + if (new_task) + *nt_prev_runnable_sum += delta; + + /* Account piece of busy time in the current window. */ + delta = scale_exec_time(wallclock - window_start, rq); + *curr_runnable_sum += delta; + if (new_task) + *nt_curr_runnable_sum += delta; + + if (!exiting_task(p)) { + p->ravg.curr_window = delta; + p->ravg.curr_window_cpu[cpu] = delta; + } + + goto done; + } + + if (!irqtime || !is_idle_task(p) || cpu_is_waiting_on_io(rq)) { + /* + * account_busy_for_cpu_time() = 1 so busy time needs + * to be accounted to the current window. A new window + * has started and p is the current task so rollover is + * needed. If any of these three above conditions are true + * then this busy time can't be accounted as irqtime. + * + * Busy time for the idle task or exiting tasks need not + * be accounted. + * + * An example of this would be a task that starts execution + * and then sleeps once a new window has begun. + */ + + if (!full_window) { + /* + * A full window hasn't elapsed, account partial + * contribution to previous completed window. 
+ */ + delta = scale_exec_time(window_start - mark_start, rq); + if (!is_idle_task(p) && !exiting_task(p)) { + p->ravg.prev_window += delta; + p->ravg.prev_window_cpu[cpu] += delta; + } + } else { + /* + * Since at least one full window has elapsed, + * the contribution to the previous window is the + * full window (window_size). + */ + delta = scale_exec_time(window_size, rq); + if (!is_idle_task(p) && !exiting_task(p)) { + p->ravg.prev_window = delta; + p->ravg.prev_window_cpu[cpu] = delta; + } + } + + /* + * Rollover is done here by overwriting the values in + * prev_runnable_sum and curr_runnable_sum. + */ + *prev_runnable_sum += delta; + if (new_task) + *nt_prev_runnable_sum += delta; + + /* Account piece of busy time in the current window. */ + delta = scale_exec_time(wallclock - window_start, rq); + *curr_runnable_sum += delta; + if (new_task) + *nt_curr_runnable_sum += delta; + + if (!is_idle_task(p) && !exiting_task(p)) { + p->ravg.curr_window = delta; + p->ravg.curr_window_cpu[cpu] = delta; + } + + goto done; + } + + if (irqtime) { + /* + * account_busy_for_cpu_time() = 1 so busy time needs + * to be accounted to the current window. A new window + * has started and p is the current task so rollover is + * needed. The current task must be the idle task because + * irqtime is not accounted for any other task. + * + * Irqtime will be accounted each time we process IRQ activity + * after a period of idleness, so we know the IRQ busy time + * started at wallclock - irqtime. + */ + + BUG_ON(!is_idle_task(p)); + mark_start = wallclock - irqtime; + + /* + * Roll window over. If IRQ busy time was just in the current + * window then that is all that need be accounted. + */ + if (mark_start > window_start) { + *curr_runnable_sum = scale_exec_time(irqtime, rq); + return; + } + + /* + * The IRQ busy time spanned multiple windows. Process the + * window then that is all that need be accounted. + */ + delta = window_start - mark_start; + if (delta > window_size) + delta = window_size; + delta = scale_exec_time(delta, rq); + *prev_runnable_sum += delta; + + /* Process the remaining IRQ busy time in the current window. 
*/ + delta = wallclock - window_start; + rq->curr_runnable_sum = scale_exec_time(delta, rq); + + return; + } + +done: + return; +} + +static inline void run_walt_irq_work(u64 old_window_start, struct rq *rq) +{ + u64 result; + + if (old_window_start == rq->window_start) + return; + + result = atomic64_cmpxchg(&walt_irq_work_lastq_ws, old_window_start, + rq->window_start); + if (result == old_window_start) + irq_work_queue(&walt_cpufreq_irq_work); +} + +/* Reflect task activity on its demand and cpu's busy time statistics */ +void update_task_ravg(struct task_struct *p, struct rq *rq, int event, + u64 wallclock, u64 irqtime) +{ + u64 old_window_start; + + if (!rq->window_start || sched_disable_window_stats || + p->ravg.mark_start == wallclock) + return; + + lockdep_assert_held(&rq->__lock); + + old_window_start = update_window_start(rq, wallclock, event); + +#ifdef CONFIG_SCHED_RTG + update_group_nr_running(p, event, wallclock); +#endif + if (!p->ravg.mark_start) + goto done; + + update_task_demand(p, rq, event, wallclock); + update_cpu_busy_time(p, rq, event, wallclock, irqtime); + + if (exiting_task(p)) + goto done; + + trace_sched_update_task_ravg(p, rq, event, wallclock, irqtime); +done: + p->ravg.mark_start = wallclock; + + run_walt_irq_work(old_window_start, rq); +} + +int sysctl_sched_walt_init_task_load_pct_sysctl_handler(struct ctl_table *table, + int write, void __user *buffer, size_t *length, loff_t *ppos) +{ + int rc; + + rc = proc_dointvec(table, write, buffer, length, ppos); + if (rc) + return rc; + + sysctl_sched_init_task_load_pct = sysctl_sched_walt_init_task_load_pct; + + return 0; +} + +u32 sched_get_init_task_load(struct task_struct *p) +{ + return p->init_load_pct; +} + +int sched_set_init_task_load(struct task_struct *p, int init_load_pct) +{ + if (init_load_pct < 0 || init_load_pct > 100) + return -EINVAL; + + p->init_load_pct = init_load_pct; + + return 0; +} + +void init_new_task_load(struct task_struct *p) +{ + int i; + u32 init_load_windows = sched_init_task_load_windows; + u32 init_load_windows_scaled = sched_init_task_load_windows_scaled; + u32 init_load_pct = current->init_load_pct; + +#ifdef CONFIG_SCHED_RTG + init_task_rtg(p); +#endif + + p->last_sleep_ts = 0; + p->init_load_pct = 0; + memset(&p->ravg, 0, sizeof(struct ravg)); + + p->ravg.curr_window_cpu = kcalloc(nr_cpu_ids, sizeof(u32), + GFP_KERNEL | __GFP_NOFAIL); + p->ravg.prev_window_cpu = kcalloc(nr_cpu_ids, sizeof(u32), + GFP_KERNEL | __GFP_NOFAIL); + + if (init_load_pct) { + init_load_windows = div64_u64((u64)init_load_pct * + (u64)sched_ravg_window, 100); + init_load_windows_scaled = scale_demand(init_load_windows); + } + + p->ravg.demand = init_load_windows; + p->ravg.demand_scaled = init_load_windows_scaled; + for (i = 0; i < RAVG_HIST_SIZE_MAX; ++i) + p->ravg.sum_history[i] = init_load_windows; +} + +void free_task_load_ptrs(struct task_struct *p) +{ + kfree(p->ravg.curr_window_cpu); + kfree(p->ravg.prev_window_cpu); + + /* + * update_task_ravg() can be called for exiting tasks. While the + * function itself ensures correct behavior, the corresponding + * trace event requires that these pointers be NULL. 
+ */ + p->ravg.curr_window_cpu = NULL; + p->ravg.prev_window_cpu = NULL; +} + +void reset_task_stats(struct task_struct *p) +{ + u32 sum = 0; + u32 *curr_window_ptr = NULL; + u32 *prev_window_ptr = NULL; + + if (exiting_task(p)) { + sum = EXITING_TASK_MARKER; + } else { + curr_window_ptr = p->ravg.curr_window_cpu; + prev_window_ptr = p->ravg.prev_window_cpu; + memset(curr_window_ptr, 0, sizeof(u32) * nr_cpu_ids); + memset(prev_window_ptr, 0, sizeof(u32) * nr_cpu_ids); + } + + memset(&p->ravg, 0, sizeof(struct ravg)); + + p->ravg.curr_window_cpu = curr_window_ptr; + p->ravg.prev_window_cpu = prev_window_ptr; + + /* Retain EXITING_TASK marker */ + p->ravg.sum_history[0] = sum; +} + +void mark_task_starting(struct task_struct *p) +{ + u64 wallclock; + struct rq *rq = task_rq(p); + + if (!rq->window_start || sched_disable_window_stats) { + reset_task_stats(p); + return; + } + + wallclock = sched_ktime_clock(); + p->ravg.mark_start = wallclock; +} + +unsigned int max_possible_efficiency = 1; +unsigned int min_possible_efficiency = UINT_MAX; +unsigned int max_power_cost = 1; + +static cpumask_t all_cluster_cpus = CPU_MASK_NONE; +DECLARE_BITMAP(all_cluster_ids, NR_CPUS); +struct sched_cluster *sched_cluster[NR_CPUS]; +int num_clusters; + +struct list_head cluster_head; + +static void +insert_cluster(struct sched_cluster *cluster, struct list_head *head) +{ + struct sched_cluster *tmp; + struct list_head *iter = head; + + list_for_each_entry(tmp, head, list) { + if (cluster->max_power_cost < tmp->max_power_cost) + break; + iter = &tmp->list; + } + + list_add(&cluster->list, iter); +} + +static struct sched_cluster *alloc_new_cluster(const struct cpumask *cpus) +{ + struct sched_cluster *cluster = NULL; + + cluster = kzalloc(sizeof(struct sched_cluster), GFP_ATOMIC); + if (!cluster) { + pr_warn("Cluster allocation failed. 
Possible bad scheduling\n"); + return NULL; + } + + INIT_LIST_HEAD(&cluster->list); + cluster->max_power_cost = 1; + cluster->min_power_cost = 1; + cluster->capacity = 1024; + cluster->max_possible_capacity = 1024; + cluster->efficiency = 1; + cluster->load_scale_factor = 1024; + cluster->cur_freq = 1; + cluster->max_freq = 1; + cluster->min_freq = 1; + cluster->max_possible_freq = 1; + cluster->freq_init_done = false; + + raw_spin_lock_init(&cluster->load_lock); + cluster->cpus = *cpus; + cluster->efficiency = topology_get_cpu_scale(cpumask_first(cpus)); + + if (cluster->efficiency > max_possible_efficiency) + max_possible_efficiency = cluster->efficiency; + if (cluster->efficiency < min_possible_efficiency) + min_possible_efficiency = cluster->efficiency; + + return cluster; +} + +static void add_cluster(const struct cpumask *cpus, struct list_head *head) +{ + struct sched_cluster *cluster = alloc_new_cluster(cpus); + int i; + + if (!cluster) + return; + + for_each_cpu(i, cpus) + cpu_rq(i)->cluster = cluster; + + insert_cluster(cluster, head); + set_bit(num_clusters, all_cluster_ids); + num_clusters++; +} + +static int compute_max_possible_capacity(struct sched_cluster *cluster) +{ + int capacity = 1024; + + capacity *= capacity_scale_cpu_efficiency(cluster); + capacity >>= 10; + + capacity *= (1024 * cluster->max_possible_freq) / min_max_freq; + capacity >>= 10; + + return capacity; +} + +void walt_update_min_max_capacity(void) +{ + unsigned long flags; + + acquire_rq_locks_irqsave(cpu_possible_mask, &flags); + __update_min_max_capacity(); + release_rq_locks_irqrestore(cpu_possible_mask, &flags); +} + +static int +compare_clusters(void *priv, const struct list_head *a, const struct list_head *b) +{ + struct sched_cluster *cluster1, *cluster2; + int ret; + + cluster1 = container_of(a, struct sched_cluster, list); + cluster2 = container_of(b, struct sched_cluster, list); + + /* + * Don't assume higher capacity means higher power. If the + * power cost is same, sort the higher capacity cluster before + * the lower capacity cluster to start placing the tasks + * on the higher capacity cluster. + */ + ret = cluster1->max_power_cost > cluster2->max_power_cost || + (cluster1->max_power_cost == cluster2->max_power_cost && + cluster1->max_possible_capacity < + cluster2->max_possible_capacity); + + return ret; +} + +void sort_clusters(void) +{ + struct sched_cluster *cluster; + struct list_head new_head; + unsigned int tmp_max = 1; + + INIT_LIST_HEAD(&new_head); + + for_each_sched_cluster(cluster) { + cluster->max_power_cost = power_cost(cluster_first_cpu(cluster), + max_task_load()); + cluster->min_power_cost = power_cost(cluster_first_cpu(cluster), + 0); + + if (cluster->max_power_cost > tmp_max) + tmp_max = cluster->max_power_cost; + } + max_power_cost = tmp_max; + + move_list(&new_head, &cluster_head, true); + + list_sort(NULL, &new_head, compare_clusters); + assign_cluster_ids(&new_head); + + /* + * Ensure cluster ids are visible to all CPUs before making + * cluster_head visible. 
+ */ + move_list(&cluster_head, &new_head, false); +} + +static void update_all_clusters_stats(void) +{ + struct sched_cluster *cluster; + u64 highest_mpc = 0, lowest_mpc = U64_MAX; + unsigned long flags; + + acquire_rq_locks_irqsave(cpu_possible_mask, &flags); + + for_each_sched_cluster(cluster) { + u64 mpc; + + cluster->capacity = compute_capacity(cluster); + mpc = cluster->max_possible_capacity = + compute_max_possible_capacity(cluster); + cluster->load_scale_factor = compute_load_scale_factor(cluster); + + cluster->exec_scale_factor = + DIV_ROUND_UP(cluster->efficiency * 1024, + max_possible_efficiency); + + if (mpc > highest_mpc) + highest_mpc = mpc; + + if (mpc < lowest_mpc) + lowest_mpc = mpc; + } + + max_possible_capacity = highest_mpc; + min_max_possible_capacity = lowest_mpc; + + __update_min_max_capacity(); + release_rq_locks_irqrestore(cpu_possible_mask, &flags); +} + +void update_cluster_topology(void) +{ + struct cpumask cpus = *cpu_possible_mask; + const struct cpumask *cluster_cpus; + struct list_head new_head; + int i; + + INIT_LIST_HEAD(&new_head); + + for_each_cpu(i, &cpus) { + cluster_cpus = cpu_coregroup_mask(i); + cpumask_or(&all_cluster_cpus, &all_cluster_cpus, cluster_cpus); + cpumask_andnot(&cpus, &cpus, cluster_cpus); + add_cluster(cluster_cpus, &new_head); + } + + assign_cluster_ids(&new_head); + + /* + * Ensure cluster ids are visible to all CPUs before making + * cluster_head visible. + */ + move_list(&cluster_head, &new_head, false); + update_all_clusters_stats(); +} + +struct sched_cluster init_cluster = { + .list = LIST_HEAD_INIT(init_cluster.list), + .id = 0, + .max_power_cost = 1, + .min_power_cost = 1, + .capacity = 1024, + .max_possible_capacity = 1024, + .efficiency = 1, + .load_scale_factor = 1024, + .cur_freq = 1, + .max_freq = 1, + .min_freq = 1, + .max_possible_freq = 1, + .exec_scale_factor = 1024, +}; + +void init_clusters(void) +{ + bitmap_clear(all_cluster_ids, 0, NR_CPUS); + init_cluster.cpus = *cpu_possible_mask; + raw_spin_lock_init(&init_cluster.load_lock); + INIT_LIST_HEAD(&cluster_head); +} + +static unsigned long cpu_max_table_freq[NR_CPUS]; + +void update_cpu_cluster_capacity(const cpumask_t *cpus) +{ + int i; + struct sched_cluster *cluster; + struct cpumask cpumask; + unsigned long flags; + + cpumask_copy(&cpumask, cpus); + acquire_rq_locks_irqsave(cpu_possible_mask, &flags); + + for_each_cpu(i, &cpumask) { + cluster = cpu_rq(i)->cluster; + cpumask_andnot(&cpumask, &cpumask, &cluster->cpus); + + cluster->capacity = compute_capacity(cluster); + cluster->load_scale_factor = compute_load_scale_factor(cluster); + } + + __update_min_max_capacity(); + + release_rq_locks_irqrestore(cpu_possible_mask, &flags); +} + +static int cpufreq_notifier_policy(struct notifier_block *nb, + unsigned long val, void *data) +{ + struct cpufreq_policy *policy = (struct cpufreq_policy *)data; + struct sched_cluster *cluster = NULL; + struct cpumask policy_cluster = *policy->related_cpus; + unsigned int orig_max_freq = 0; + int i, j, update_capacity = 0; + + if (val != CPUFREQ_CREATE_POLICY) + return 0; + + walt_update_min_max_capacity(); + + max_possible_freq = max(max_possible_freq, policy->cpuinfo.max_freq); + if (min_max_freq == 1) + min_max_freq = UINT_MAX; + min_max_freq = min(min_max_freq, policy->cpuinfo.max_freq); + BUG_ON(!min_max_freq); + BUG_ON(!policy->max); + + for_each_cpu(i, &policy_cluster) + cpu_max_table_freq[i] = policy->cpuinfo.max_freq; + + for_each_cpu(i, &policy_cluster) { + cluster = cpu_rq(i)->cluster; + cpumask_andnot(&policy_cluster, 
&policy_cluster, + &cluster->cpus); + + orig_max_freq = cluster->max_freq; + cluster->min_freq = policy->min; + cluster->max_freq = policy->max; + cluster->cur_freq = policy->cur; + + if (!cluster->freq_init_done) { + mutex_lock(&cluster_lock); + for_each_cpu(j, &cluster->cpus) + cpumask_copy(&cpu_rq(j)->freq_domain_cpumask, + policy->related_cpus); + cluster->max_possible_freq = policy->cpuinfo.max_freq; + cluster->max_possible_capacity = + compute_max_possible_capacity(cluster); + cluster->freq_init_done = true; + + sort_clusters(); + update_all_clusters_stats(); + mutex_unlock(&cluster_lock); + continue; + } + + update_capacity += (orig_max_freq != cluster->max_freq); + } + + if (update_capacity) + update_cpu_cluster_capacity(policy->related_cpus); + + return 0; +} + +static struct notifier_block notifier_policy_block = { + .notifier_call = cpufreq_notifier_policy +}; + +static int cpufreq_notifier_trans(struct notifier_block *nb, + unsigned long val, void *data) +{ + struct cpufreq_freqs *freq = (struct cpufreq_freqs *)data; + unsigned int cpu = freq->policy->cpu, new_freq = freq->new; + unsigned long flags; + struct sched_cluster *cluster; + struct cpumask policy_cpus = cpu_rq(cpu)->freq_domain_cpumask; + int i, j; + + if (val != CPUFREQ_POSTCHANGE) + return NOTIFY_DONE; + + if (cpu_cur_freq(cpu) == new_freq) + return NOTIFY_OK; + + for_each_cpu(i, &policy_cpus) { + cluster = cpu_rq(i)->cluster; + + for_each_cpu(j, &cluster->cpus) { + struct rq *rq = cpu_rq(j); + + raw_spin_lock_irqsave(&rq->__lock, flags); + update_task_ravg(rq->curr, rq, TASK_UPDATE, + sched_ktime_clock(), 0); + raw_spin_unlock_irqrestore(&rq->__lock, flags); + } + + cluster->cur_freq = new_freq; + cpumask_andnot(&policy_cpus, &policy_cpus, &cluster->cpus); + } + + return NOTIFY_OK; +} + +static struct notifier_block notifier_trans_block = { + .notifier_call = cpufreq_notifier_trans +}; + +static int register_walt_callback(void) +{ + int ret; + + ret = cpufreq_register_notifier(¬ifier_policy_block, + CPUFREQ_POLICY_NOTIFIER); + if (!ret) + ret = cpufreq_register_notifier(¬ifier_trans_block, + CPUFREQ_TRANSITION_NOTIFIER); + + return ret; +} +/* + * cpufreq callbacks can be registered at core_initcall or later time. + * Any registration done prior to that is "forgotten" by cpufreq. See + * initialization of variable init_cpufreq_transition_notifier_list_called + * for further information. + */ +core_initcall(register_walt_callback); + +/* + * Runs in hard-irq context. This should ideally run just after the latest + * window roll-over. + */ +void walt_irq_work(struct irq_work *irq_work) +{ + struct sched_cluster *cluster; + struct rq *rq; + int cpu; + u64 wc; + bool is_migration = false; + int level = 0; + + /* Am I the window rollover work or the migration work? 
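+	 * Both refresh the per-rq window stats and notify cpufreq; only the
+	 * rollover work goes on to run core_ctl_check().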
*/ + if (irq_work == &walt_migration_irq_work) + is_migration = true; + + for_each_cpu(cpu, cpu_possible_mask) { + if (level == 0) + raw_spin_lock(&cpu_rq(cpu)->__lock); + else + raw_spin_lock_nested(&cpu_rq(cpu)->__lock, level); + level++; + } + + wc = sched_ktime_clock(); + walt_load_reported_window = atomic64_read(&walt_irq_work_lastq_ws); + for_each_sched_cluster(cluster) { + raw_spin_lock(&cluster->load_lock); + + for_each_cpu(cpu, &cluster->cpus) { + rq = cpu_rq(cpu); + if (rq->curr) { + update_task_ravg(rq->curr, rq, + TASK_UPDATE, wc, 0); + account_load_subtractions(rq); + } + } + + raw_spin_unlock(&cluster->load_lock); + } + + for_each_sched_cluster(cluster) { + cpumask_t cluster_online_cpus; + unsigned int num_cpus, i = 1; + + cpumask_and(&cluster_online_cpus, &cluster->cpus, + cpu_online_mask); + num_cpus = cpumask_weight(&cluster_online_cpus); + for_each_cpu(cpu, &cluster_online_cpus) { + int flag = SCHED_CPUFREQ_WALT; + + rq = cpu_rq(cpu); + + if (i == num_cpus) + cpufreq_update_util(cpu_rq(cpu), flag); + else + cpufreq_update_util(cpu_rq(cpu), flag | + SCHED_CPUFREQ_CONTINUE); + i++; + } + } + + for_each_cpu(cpu, cpu_possible_mask) + raw_spin_unlock(&cpu_rq(cpu)->__lock); + + if (!is_migration) + core_ctl_check(this_rq()->window_start); +} + +static void walt_init_once(void) +{ + init_irq_work(&walt_migration_irq_work, walt_irq_work); + init_irq_work(&walt_cpufreq_irq_work, walt_irq_work); + + walt_cpu_util_freq_divisor = + (sched_ravg_window >> SCHED_CAPACITY_SHIFT) * 100; + walt_scale_demand_divisor = sched_ravg_window >> SCHED_CAPACITY_SHIFT; + + sched_init_task_load_windows = + div64_u64((u64)sysctl_sched_init_task_load_pct * + (u64)sched_ravg_window, 100); + sched_init_task_load_windows_scaled = + scale_demand(sched_init_task_load_windows); +} + +void walt_sched_init_rq(struct rq *rq) +{ + static bool init; + int j; + + if (!init) { + walt_init_once(); + init = true; + } + + cpumask_set_cpu(cpu_of(rq), &rq->freq_domain_cpumask); + + rq->walt_stats.cumulative_runnable_avg_scaled = 0; + rq->window_start = 0; + rq->walt_flags = 0; + rq->cur_irqload = 0; + rq->avg_irqload = 0; + rq->irqload_ts = 0; + + /* + * All cpus part of same cluster by default. 
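
walt_init_once() above precomputes the divisors that convert window sums, which are tracked in nanoseconds, into utilization on the 0..1024 capacity scale, and it seeds the initial demand given to new tasks. The standalone fragment below works through that arithmetic for an assumed 20 ms window and an assumed sysctl_sched_init_task_load_pct of 15; the numbers, the program itself, and the assumption that scaled demand is obtained by dividing by walt_scale_demand_divisor are illustrative rather than taken verbatim from the patch.

#include <stdint.h>
#include <stdio.h>

#define SCHED_CAPACITY_SHIFT 10	/* full capacity == 1024 */

int main(void)
{
	uint64_t sched_ravg_window = 20000000ULL; /* 20 ms in ns (assumed) */
	uint64_t init_task_load_pct = 15;         /* assumed sysctl value */

	/* mirrors walt_init_once(): ns of demand per unit of 0..1024 util */
	uint64_t scale_demand_divisor = sched_ravg_window >> SCHED_CAPACITY_SHIFT;
	uint64_t util_freq_divisor = scale_demand_divisor * 100;

	/* initial demand seeded for a new task, raw and scaled */
	uint64_t init_windows = init_task_load_pct * sched_ravg_window / 100;
	uint64_t init_windows_scaled = init_windows / scale_demand_divisor;

	printf("scale_demand_divisor = %llu ns per util unit\n",
	       (unsigned long long)scale_demand_divisor);
	printf("util_freq_divisor    = %llu\n",
	       (unsigned long long)util_freq_divisor);
	printf("initial demand       = %llu ns (~%llu scaled)\n",
	       (unsigned long long)init_windows,
	       (unsigned long long)init_windows_scaled);
	return 0;
}

With those assumed values a brand-new task starts out looking roughly 15 percent busy (about 153 on the 1024 scale) until it accumulates real window history.
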
This avoids the
+	 * need to check for rq->cluster being non-NULL in hot-paths
+	 * like select_best_cpu()
+	 */
+	rq->cluster = &init_cluster;
+	rq->curr_runnable_sum = rq->prev_runnable_sum = 0;
+	rq->nt_curr_runnable_sum = rq->nt_prev_runnable_sum = 0;
+	rq->cum_window_demand_scaled = 0;
+
+	for (j = 0; j < NUM_TRACKED_WINDOWS; j++)
+		memset(&rq->load_subs[j], 0, sizeof(struct load_subtractions));
+}
+
+#define min_cap_cluster() \
+	list_first_entry(&cluster_head, struct sched_cluster, list)
+#define max_cap_cluster() \
+	list_last_entry(&cluster_head, struct sched_cluster, list)
+static int sched_cluster_debug_show(struct seq_file *file, void *param)
+{
+	struct sched_cluster *cluster = NULL;
+
+	seq_printf(file, "min_id:%d, max_id:%d\n",
+		min_cap_cluster()->id,
+		max_cap_cluster()->id);
+
+	for_each_sched_cluster(cluster) {
+		seq_printf(file, "id:%d, cpumask:%d(%*pbl)\n",
+			cluster->id,
+			cpumask_first(&cluster->cpus),
+			cpumask_pr_args(&cluster->cpus));
+	}
+
+	return 0;
+}
+
+static int sched_cluster_debug_open(struct inode *inode, struct file *filp)
+{
+	return single_open(filp, sched_cluster_debug_show, NULL);
+}
+
+static const struct proc_ops sched_cluster_fops = {
+	.proc_open = sched_cluster_debug_open,
+	.proc_read = seq_read,
+	.proc_lseek = seq_lseek,
+	.proc_release = seq_release,
+};
+
+static int __init init_sched_cluster_debug_procfs(void)
+{
+	struct proc_dir_entry *pe = NULL;
+
+	pe = proc_create("sched_cluster",
+		0444, NULL, &sched_cluster_fops);
+	if (!pe)
+		return -ENOMEM;
+	return 0;
+}
+late_initcall(init_sched_cluster_debug_procfs);
diff --git a/kernel/sched/walt.h b/kernel/sched/walt.h
new file mode 100755
index 0000000000000000000000000000000000000000..c5d6e241034da47010aa756adc7ec67e7c2cd0f4
--- /dev/null
+++ b/kernel/sched/walt.h
@@ -0,0 +1,255 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * walt.h
+ *
+ * Header file for Window Assisted Load Tracking (WALT)
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
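
For reference, the read-only /proc/sched_cluster file created by init_sched_cluster_debug_procfs() above can be dumped from userspace as sketched below; the expected layout follows the two seq_printf() formats in sched_cluster_debug_show(), and the sample output in the comment assumes a two-cluster 4+4 system purely for illustration.

#include <stdio.h>
#include <stdlib.h>

/*
 * Example output on an assumed 4+4 big.LITTLE system:
 *   min_id:0, max_id:1
 *   id:0, cpumask:0(0-3)
 *   id:1, cpumask:4(4-7)
 */
int main(void)
{
	char line[256];
	FILE *fp = fopen("/proc/sched_cluster", "r");

	if (!fp) {
		perror("/proc/sched_cluster (needs CONFIG_SCHED_WALT)");
		return EXIT_FAILURE;
	}

	while (fgets(line, sizeof(line), fp))
		fputs(line, stdout);

	fclose(fp);
	return EXIT_SUCCESS;
}
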
+ * + */ + +#ifndef __WALT_H +#define __WALT_H + +#ifdef CONFIG_SCHED_WALT + +#include + +#define WINDOW_STATS_RECENT 0 +#define WINDOW_STATS_MAX 1 +#define WINDOW_STATS_MAX_RECENT_AVG 2 +#define WINDOW_STATS_AVG 3 +#define WINDOW_STATS_INVALID_POLICY 4 + +#define EXITING_TASK_MARKER 0xdeaddead + +#define SCHED_NEW_TASK_WINDOWS 5 + +extern unsigned int sched_ravg_window; +extern unsigned int sysctl_sched_walt_init_task_load_pct; +unsigned long capacity_curr_of(int cpu); + +static inline int exiting_task(struct task_struct *p) +{ + return (p->ravg.sum_history[0] == EXITING_TASK_MARKER); +} + +static inline struct sched_cluster *cpu_cluster(int cpu) +{ + return cpu_rq(cpu)->cluster; +} + +static inline int same_cluster(int src_cpu, int dst_cpu) +{ + return cpu_rq(src_cpu)->cluster == cpu_rq(dst_cpu)->cluster; +} + +static inline u64 scale_exec_time(u64 delta, struct rq *rq) +{ + unsigned long capcurr = capacity_curr_of(cpu_of(rq)); + + delta = (delta * capcurr) >> SCHED_CAPACITY_SHIFT; + + return delta; +} + +static inline bool is_new_task(struct task_struct *p) +{ + return p->ravg.active_windows < SCHED_NEW_TASK_WINDOWS; +} + +static inline unsigned int max_task_load(void) +{ + return sched_ravg_window; +} + +static inline void +move_list(struct list_head *dst, struct list_head *src, bool sync_rcu) +{ + struct list_head *first, *last; + + first = src->next; + last = src->prev; + + if (sync_rcu) { + INIT_LIST_HEAD_RCU(src); + synchronize_rcu(); + } + + first->prev = dst; + dst->prev = last; + last->next = dst; + + /* Ensure list sanity before making the head visible to all CPUs. */ + smp_mb(); + dst->next = first; +} + +extern void reset_task_stats(struct task_struct *p); +extern void update_cluster_topology(void); +extern void init_clusters(void); +extern void update_task_ravg(struct task_struct *p, struct rq *rq, int event, + u64 wallclock, u64 irqtime); + +static inline void +fixup_cumulative_runnable_avg(struct walt_sched_stats *stats, + s64 demand_scaled_delta) +{ + if (sched_disable_window_stats) + return; + + stats->cumulative_runnable_avg_scaled += demand_scaled_delta; + BUG_ON((s64)stats->cumulative_runnable_avg_scaled < 0); +} + +static inline void +walt_inc_cumulative_runnable_avg(struct rq *rq, struct task_struct *p) +{ + if (sched_disable_window_stats) + return; + + fixup_cumulative_runnable_avg(&rq->walt_stats, p->ravg.demand_scaled); + + /* + * Add a task's contribution to the cumulative window demand when + * + * (1) task is enqueued with on_rq = 1 i.e migration, + * prio/cgroup/class change. + * (2) task is waking for the first time in this window. + */ + if (p->on_rq || (p->last_sleep_ts < rq->window_start)) + walt_fixup_cum_window_demand(rq, p->ravg.demand_scaled); +} + +static inline void +walt_dec_cumulative_runnable_avg(struct rq *rq, struct task_struct *p) +{ + if (sched_disable_window_stats) + return; + + fixup_cumulative_runnable_avg(&rq->walt_stats, + -(s64)p->ravg.demand_scaled); + + /* + * on_rq will be 1 for sleeping tasks. So check if the task + * is migrating or dequeuing in RUNNING state to change the + * prio/cgroup/class. 
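
scale_exec_time() above is what makes the window sums frequency- and capacity-invariant: raw runtime is multiplied by capacity_curr_of(cpu) and shifted down by SCHED_CAPACITY_SHIFT, so time spent on a down-clocked or smaller CPU contributes proportionally less demand. The standalone sketch below reproduces only that arithmetic; the capacity values used are assumptions for illustration.

#include <stdint.h>
#include <stdio.h>

#define SCHED_CAPACITY_SHIFT 10	/* full capacity == 1024 */

/* Same operation as the kernel helper: delta * capcurr / 1024. */
static uint64_t scale_exec_time_demo(uint64_t delta_ns, unsigned long capcurr)
{
	return (delta_ns * capcurr) >> SCHED_CAPACITY_SHIFT;
}

int main(void)
{
	uint64_t delta = 4000000ULL;	/* 4 ms of raw runtime */

	/* assumed current capacities: big core at fmax, then two slower cases */
	printf("capcurr=1024 -> %llu ns of demand\n",
	       (unsigned long long)scale_exec_time_demo(delta, 1024));
	printf("capcurr=512  -> %llu ns of demand\n",
	       (unsigned long long)scale_exec_time_demo(delta, 512));
	printf("capcurr=256  -> %llu ns of demand\n",
	       (unsigned long long)scale_exec_time_demo(delta, 256));
	return 0;
}

Four milliseconds at half capacity therefore counts as two milliseconds of demand, which is what allows per-window demands from different CPUs and operating points to be compared and summed.
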
+ */ + if (task_on_rq_migrating(p) || p->__state == TASK_RUNNING) + walt_fixup_cum_window_demand(rq, -(s64)p->ravg.demand_scaled); +} +extern void fixup_walt_sched_stats_common(struct rq *rq, struct task_struct *p, + u16 updated_demand_scaled); +extern void inc_rq_walt_stats(struct rq *rq, struct task_struct *p); +extern void dec_rq_walt_stats(struct rq *rq, struct task_struct *p); +extern void fixup_busy_time(struct task_struct *p, int new_cpu); +extern void init_new_task_load(struct task_struct *p); +extern void mark_task_starting(struct task_struct *p); +extern void set_window_start(struct rq *rq); +void account_irqtime(int cpu, struct task_struct *curr, u64 delta, u64 wallclock); + +void walt_irq_work(struct irq_work *irq_work); + +void walt_sched_init_rq(struct rq *rq); + +extern void sched_account_irqtime(int cpu, struct task_struct *curr, + u64 delta, u64 wallclock); + +#define SCHED_HIGH_IRQ_TIMEOUT 3 +static inline u64 sched_irqload(int cpu) +{ + struct rq *rq = cpu_rq(cpu); + s64 delta; + + delta = get_jiffies_64() - rq->irqload_ts; + /* + * Current context can be preempted by irq and rq->irqload_ts can be + * updated by irq context so that delta can be negative. + * But this is okay and we can safely return as this means there + * was recent irq occurrence. + */ + + if (delta < SCHED_HIGH_IRQ_TIMEOUT) + return rq->avg_irqload; + else + return 0; +} + +static inline int sched_cpu_high_irqload(int cpu) +{ + return sched_irqload(cpu) >= sysctl_sched_cpu_high_irqload; +} + +extern int +sysctl_sched_walt_init_task_load_pct_sysctl_handler(struct ctl_table *table, + int write, void __user *buffer, size_t *length, loff_t *ppos); + +static inline unsigned int cpu_cur_freq(int cpu) +{ + return cpu_rq(cpu)->cluster->cur_freq; +} + +static inline void assign_cluster_ids(struct list_head *head) +{ + struct sched_cluster *cluster; + int pos = 0; + + list_for_each_entry(cluster, head, list) { + cluster->id = pos; + sched_cluster[pos++] = cluster; + } +} + +extern void update_cluster_load_subtractions(struct task_struct *p, + int cpu, u64 ws, bool new_task); +#else /* CONFIG_SCHED_WALT */ +static inline void walt_sched_init_rq(struct rq *rq) { } + +static inline void update_task_ravg(struct task_struct *p, struct rq *rq, + int event, u64 wallclock, u64 irqtime) { } + +static inline void walt_inc_cumulative_runnable_avg(struct rq *rq, + struct task_struct *p) { } + +static inline void walt_dec_cumulative_runnable_avg(struct rq *rq, + struct task_struct *p) { } + +static inline void +inc_rq_walt_stats(struct rq *rq, struct task_struct *p) { } + +static inline void +dec_rq_walt_stats(struct rq *rq, struct task_struct *p) { } + +static inline void fixup_busy_time(struct task_struct *p, int new_cpu) { } +static inline void init_new_task_load(struct task_struct *p) { } +static inline void mark_task_starting(struct task_struct *p) { } +static inline void set_window_start(struct rq *rq) { } +static inline void update_cluster_topology(void) { } +static inline void init_clusters(void) { } + +static inline void +fixup_walt_sched_stats_common(struct rq *rq, struct task_struct *p, + u16 updated_demand_scaled) { } + +static inline void sched_account_irqtime(int cpu, struct task_struct *curr, + u64 delta, u64 wallclock) { } + +static inline u64 sched_irqload(int cpu) +{ + return 0; +} +static inline int sched_cpu_high_irqload(int cpu) +{ + return 0; +} +static inline int same_cluster(int src_cpu, int dst_cpu) { return 1; } +#endif /* CONFIG_SCHED_WALT */ + +#endif /* __WALT_H */ diff --git a/kernel/sysctl.c 
b/kernel/sysctl.c index 354a2d294f526ad6688168443913385eda101fa1..b0151dbbd162a758d07a016373bbf9c274831a24 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1623,6 +1623,40 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, +#ifdef CONFIG_SCHED_WALT + { + .procname = "sched_use_walt_cpu_util", + .data = &sysctl_sched_use_walt_cpu_util, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, + { + .procname = "sched_use_walt_task_util", + .data = &sysctl_sched_use_walt_task_util, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, + { + .procname = "sched_walt_init_task_load_pct", + .data = &sysctl_sched_walt_init_task_load_pct, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = sysctl_sched_walt_init_task_load_pct_sysctl_handler, + }, + { + .procname = "sched_cpu_high_irqload", + .data = &sysctl_sched_cpu_high_irqload, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#endif #ifdef CONFIG_PROC_SYSCTL { .procname = "tainted",
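
The four kern_table entries above appear as files under /proc/sys/kernel/ once the kernel is built with CONFIG_SCHED_WALT. The userspace sketch below just prints their current values; writing works through the same paths (for example, writing 20 to sched_walt_init_task_load_pct, subject to whatever validation the custom handler applies), and none of this code is part of the patch itself.

#include <stdio.h>

/* Print one WALT sysctl; the /proc/sys/kernel/ prefix follows from kern_table. */
static void show(const char *name)
{
	char path[128], buf[64];
	FILE *fp;

	snprintf(path, sizeof(path), "/proc/sys/kernel/%s", name);
	fp = fopen(path, "r");
	if (!fp) {
		printf("%-32s <not available>\n", name);
		return;
	}
	if (fgets(buf, sizeof(buf), fp))
		printf("%-32s %s", name, buf);	/* buf keeps its newline */
	fclose(fp);
}

int main(void)
{
	show("sched_use_walt_cpu_util");
	show("sched_use_walt_task_util");
	show("sched_walt_init_task_load_pct");
	show("sched_cpu_high_irqload");
	return 0;
}
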