diff --git a/arch/arm64/configs/openeuler_defconfig b/arch/arm64/configs/openeuler_defconfig
index 35dd6141f057ad6e162b8bdce2a533b8ba0ae2ca..7330eac7b52cc444354c823654cb48ceae86db31 100644
--- a/arch/arm64/configs/openeuler_defconfig
+++ b/arch/arm64/configs/openeuler_defconfig
@@ -88,6 +88,7 @@ CONFIG_BPF_JIT_DEFAULT_ON=y
 # CONFIG_BPF_UNPRIV_DEFAULT_OFF is not set
 # CONFIG_BPF_PRELOAD is not set
 # CONFIG_BPF_LSM is not set
+CONFIG_BPF_SCHED=y
 # end of BPF subsystem
 
 CONFIG_PREEMPT_NONE_BUILD=y
diff --git a/arch/x86/configs/openeuler_defconfig b/arch/x86/configs/openeuler_defconfig
index dfefc129ed0eb7a77a3dabecd2112128185d9333..a4bd07549bf12eef8149df917b8e05ecde529288 100644
--- a/arch/x86/configs/openeuler_defconfig
+++ b/arch/x86/configs/openeuler_defconfig
@@ -106,6 +106,7 @@ CONFIG_BPF_JIT_DEFAULT_ON=y
 # CONFIG_BPF_UNPRIV_DEFAULT_OFF is not set
 # CONFIG_BPF_PRELOAD is not set
 # CONFIG_BPF_LSM is not set
+CONFIG_BPF_SCHED=y
 # end of BPF subsystem
 
 CONFIG_PREEMPT_BUILD=y
diff --git a/fs/proc/base.c b/fs/proc/base.c
index e04b0126334f991775223b2b1149f9447ac712f2..6f88566a4d79fcf2c4d8f8a764fae486adc6474b 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -3672,6 +3672,64 @@ static const struct inode_operations proc_tid_comm_inode_operations = {
 	.permission = proc_tid_comm_permission,
 };
 
+#ifdef CONFIG_BPF_SCHED
+static ssize_t pid_tag_write(struct file *file, const char __user *buf,
+			     size_t count, loff_t *offset)
+{
+	struct inode *inode = file_inode(file);
+	struct task_struct *tsk;
+	int err = 0;
+	long tag = 0;
+
+	tsk = get_proc_task(inode);
+	if (!tsk)
+		return -ESRCH;
+
+	if (unlikely(tsk->pid == 1)) {
+		err = -EPERM;
+		goto out;
+	}
+
+	err = kstrtol_from_user(buf, count, 0, &tag);
+	if (err)
+		goto out;
+
+	sched_settag(tsk, tag);
+
+out:
+	put_task_struct(tsk);
+	return err < 0 ? err : count;
+}
+
+static int pid_tag_show(struct seq_file *m, void *v)
+{
+	struct inode *inode = m->private;
+	struct task_struct *tsk;
+
+	tsk = get_proc_task(inode);
+	if (!tsk)
+		return -ESRCH;
+
+	seq_printf(m, "%ld\n", tsk->tag);
+	put_task_struct(tsk);
+
+	return 0;
+}
+
+static int pid_tag_open(struct inode *inode, struct file *filp)
+{
+	return single_open(filp, pid_tag_show, inode);
+}
+
+static const struct file_operations proc_pid_tag_operations = {
+	.open		= pid_tag_open,
+	.read		= seq_read,
+	.write		= pid_tag_write,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
+#endif
+
 /*
  * Tasks
  */
@@ -3781,6 +3839,9 @@ static const struct pid_entry tid_base_stuff[] = {
 #ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
 	REG("preferred_cpuset", 0644, proc_preferred_cpuset_operations),
 #endif
+#ifdef CONFIG_BPF_SCHED
+	REG("tag", 0644, proc_pid_tag_operations),
+#endif
 };
 
 static int proc_tid_base_readdir(struct file *file, struct dir_context *ctx)
diff --git a/include/linux/bpf_sched.h b/include/linux/bpf_sched.h
new file mode 100644
index 0000000000000000000000000000000000000000..9cd2493d2787ce9ea689d9c5d50d539453eb76f0
--- /dev/null
+++ b/include/linux/bpf_sched.h
@@ -0,0 +1,50 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_BPF_SCHED_H
+#define _LINUX_BPF_SCHED_H
+
+#include <linux/bpf.h>
+
+#ifdef CONFIG_BPF_SCHED
+
+#include <linux/jump_label.h>
+
+#define BPF_SCHED_HOOK(RET, DEFAULT, NAME, ...)	\
+	RET bpf_sched_##NAME(__VA_ARGS__);
+#include <linux/sched_hook_defs.h>
+#undef BPF_SCHED_HOOK
+
+int bpf_sched_verify_prog(struct bpf_verifier_log *vlog,
+			  const struct bpf_prog *prog);
+
+DECLARE_STATIC_KEY_FALSE(bpf_sched_enabled_key);
+
+static inline bool bpf_sched_enabled(void)
+{
+	return static_branch_unlikely(&bpf_sched_enabled_key);
+}
+
+static inline void bpf_sched_inc(void)
+{
+	static_branch_inc(&bpf_sched_enabled_key);
+}
+
+static inline void bpf_sched_dec(void)
+{
+	static_branch_dec(&bpf_sched_enabled_key);
+}
+
+#else /* !CONFIG_BPF_SCHED */
+
+static inline int bpf_sched_verify_prog(struct bpf_verifier_log *vlog,
+					const struct bpf_prog *prog)
+{
+	return -EOPNOTSUPP;
+}
+
+static inline bool bpf_sched_enabled(void)
+{
+	return false;
+}
+
+#endif /* CONFIG_BPF_SCHED */
+#endif /* _LINUX_BPF_SCHED_H */
diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h
index fc0d6f32c68760b37872634fa3a0c0a0870c6066..dd79463eea4e31989cb816009f5d8ca039c75827 100644
--- a/include/linux/bpf_types.h
+++ b/include/linux/bpf_types.h
@@ -83,6 +83,10 @@ BPF_PROG_TYPE(BPF_PROG_TYPE_SYSCALL, bpf_syscall,
 BPF_PROG_TYPE(BPF_PROG_TYPE_NETFILTER, netfilter,
 	      struct bpf_nf_ctx, struct bpf_nf_ctx)
 #endif
+#ifdef CONFIG_BPF_SCHED
+BPF_PROG_TYPE(BPF_PROG_TYPE_SCHED, bpf_sched,
+	      void *, void *)
+#endif /* CONFIG_BPF_SCHED */
 
 BPF_MAP_TYPE(BPF_MAP_TYPE_ARRAY, array_map_ops)
 BPF_MAP_TYPE(BPF_MAP_TYPE_PERCPU_ARRAY, percpu_array_map_ops)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index bd9031f5772c49d1f94ab471e5a6709acbdbe800..61136dc7c9b4596ac5e9e746d1b8c02855671186 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1547,6 +1547,10 @@ struct task_struct {
 	struct user_event_mm		*user_event_mm;
 #endif
 
+#ifdef CONFIG_BPF_SCHED
+	long				tag;
+#endif
+
 #ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
 	cpumask_t			*prefer_cpus;
 	const cpumask_t			*select_cpus;
@@ -2471,7 +2475,29 @@ void rseq_syscall(struct pt_regs *regs);
 static inline void rseq_syscall(struct pt_regs *regs)
 {
 }
+#endif
+
+#ifdef CONFIG_BPF_SCHED
+extern void sched_settag(struct task_struct *tsk, s64 tag);
+struct sched_migrate_ctx {
+	struct task_struct	*task;
+	struct cpumask		*select_idle_mask;
+	int			prev_cpu;
+	int			curr_cpu;
+	int			is_sync;
+	int			want_affine;
+	int			wake_flags;
+	int			sd_flag;
+	int			new_cpu;
+};
+
+struct sched_migrate_node {
+	int			src_cpu;
+	int			src_node;
+	int			dst_cpu;
+	int			dst_node;
+};
 
 #endif
 
 #ifdef CONFIG_SCHED_CORE
diff --git a/include/linux/sched_hook_defs.h b/include/linux/sched_hook_defs.h
new file mode 100644
index 0000000000000000000000000000000000000000..c43297cc60498abd04f43433533c7ba1a61f4465
--- /dev/null
+++ b/include/linux/sched_hook_defs.h
@@ -0,0 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+BPF_SCHED_HOOK(int, -1, cfs_select_rq, struct sched_migrate_ctx *ctx)
+BPF_SCHED_HOOK(int, -1, cfs_can_migrate_task, struct task_struct *p,
+	       struct sched_migrate_node *migrate_node)
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 4924f0cde1bcacd9ee3419aec56dad680094637d..9dd0b85549b6947e68ce1b4002f341ba73ce75c0 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -988,6 +988,7 @@ enum bpf_prog_type {
 	BPF_PROG_TYPE_SK_LOOKUP,
 	BPF_PROG_TYPE_SYSCALL, /* a program that can execute syscalls */
 	BPF_PROG_TYPE_NETFILTER,
+	BPF_PROG_TYPE_SCHED,
 };
 
 enum bpf_attach_type {
@@ -1040,6 +1041,7 @@ enum bpf_attach_type {
 	BPF_TCX_INGRESS,
 	BPF_TCX_EGRESS,
 	BPF_TRACE_UPROBE_MULTI,
+	BPF_SCHED,
 	__MAX_BPF_ATTACH_TYPE
 };
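For reference, the declaration form of BPF_SCHED_HOOK above turns the two entries in sched_hook_defs.h into ordinary prototypes. A sketch of the preprocessor output (nothing beyond what the macro already generates):

/* Expansion of BPF_SCHED_HOOK(RET, DEFAULT, NAME, ...) over
 * <linux/sched_hook_defs.h>: each hook is a plain function that
 * fair.c can call and an fentry-style BPF program can attach to.
 */
int bpf_sched_cfs_select_rq(struct sched_migrate_ctx *ctx);
int bpf_sched_cfs_can_migrate_task(struct task_struct *p,
				   struct sched_migrate_node *migrate_node);

Each hook returns an int whose DEFAULT value (-1 for both) means "no opinion", so the call sites in fair.c fall back to stock CFS behaviour unless an attached program says otherwise.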
diff --git a/init/init_task.c b/init/init_task.c
index ac0c5850f74bb4c3d569c120999c81c9231decff..2101c6e3432d23d2db222aee7f510d5c9a820ae1 100644
--- a/init/init_task.c
+++ b/init/init_task.c
@@ -213,6 +213,9 @@ struct task_struct init_task
 #ifdef CONFIG_SECCOMP_FILTER
 	.seccomp	= { .filter_count = ATOMIC_INIT(0) },
 #endif
+#ifdef CONFIG_BPF_SCHED
+	.tag		= 0,
+#endif
 };
 EXPORT_SYMBOL(init_task);
 
diff --git a/kernel/bpf/Kconfig b/kernel/bpf/Kconfig
index 6a906ff930065268dbdf4e5c4ea0bb3f851bcb2f..19f6ab882ab123c07612db3901da9590fcc320cc 100644
--- a/kernel/bpf/Kconfig
+++ b/kernel/bpf/Kconfig
@@ -100,4 +100,17 @@ config BPF_LSM
 
 	  If you are unsure how to answer this question, answer N.
 
+config BPF_SCHED
+	bool "Sched Instrumentation with BPF"
+	depends on BPF_EVENTS
+	depends on BPF_SYSCALL
+	depends on BPF_JIT
+	help
+	  Enables instrumentation of the sched hooks with eBPF programs for
+	  implementing dynamic scheduling policies. When CONFIG_BPF_SCHED
+	  is enabled, privileged BPF programs can be attached to these
+	  hooks to extend scheduling behaviour.
+
+	  If you are unsure how to answer this question, answer N.
+
 endmenu # "BPF subsystem"
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 8090d7fb11ef686019b4734bbc7151057ee78efa..b8aa9e66e3561c77646070ceb6b11c3c890a9cc5 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -5982,6 +5982,9 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type,
 			return true;
 		t = btf_type_by_id(btf, t->type);
 		break;
+#ifdef CONFIG_BPF_SCHED
+	case BPF_SCHED:
+#endif
 	case BPF_MODIFY_RETURN:
 		/* For now the BPF_MODIFY_RETURN can only be attached to
 		 * functions that return an int.
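The per-task tag plumbed through fs/proc/base.c and init/init_task.c above is exposed as /proc/<pid>/tag. A minimal userspace sketch of that interface (the tag value 2 is arbitrary; kstrtol_from_user() accepts any base-0 literal, and tagging PID 1 is rejected with EPERM):

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	char buf[32];
	ssize_t n;
	int fd = open("/proc/self/tag", O_RDWR);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	if (write(fd, "2", 1) != 1)		/* sched_settag(current, 2) */
		perror("write");

	lseek(fd, 0, SEEK_SET);
	n = read(fd, buf, sizeof(buf) - 1);	/* pid_tag_show() output */
	if (n > 0) {
		buf[n] = '\0';
		printf("tag: %s", buf);
	}
	close(fd);
	return 0;
}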
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index d77b2f8b93641b4445645ec2d9397849fe6a5af3..d47ae625eb9128806b31747f8ed88dea996e2a99 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -36,6 +36,7 @@
 #include <linux/rcupdate_trace.h>
 #include <linux/memcontrol.h>
 #include <linux/trace_events.h>
+#include <linux/bpf_sched.h>
 
 #include <net/netfilter/nf_bpf_link.h>
 
@@ -2412,6 +2413,9 @@ bpf_prog_load_check_attach(enum bpf_prog_type prog_type,
 	case BPF_PROG_TYPE_LSM:
 	case BPF_PROG_TYPE_STRUCT_OPS:
 	case BPF_PROG_TYPE_EXT:
+#ifdef CONFIG_BPF_SCHED
+	case BPF_PROG_TYPE_SCHED:
+#endif
 		break;
 	default:
 		return -EINVAL;
@@ -2539,6 +2543,9 @@ static bool is_perfmon_prog_type(enum bpf_prog_type prog_type)
 	case BPF_PROG_TYPE_LSM:
 	case BPF_PROG_TYPE_STRUCT_OPS: /* has access to struct sock */
 	case BPF_PROG_TYPE_EXT: /* extends any prog */
+#ifdef CONFIG_BPF_SCHED
+	case BPF_PROG_TYPE_SCHED:
+#endif
 		return true;
 	default:
 		return false;
@@ -3025,6 +3032,11 @@ static void bpf_tracing_link_release(struct bpf_link *link)
 	struct bpf_tracing_link *tr_link =
 		container_of(link, struct bpf_tracing_link, link.link);
 
+#ifdef CONFIG_BPF_SCHED
+	if (link->prog->type == BPF_PROG_TYPE_SCHED)
+		bpf_sched_dec();
+#endif
+
 	WARN_ON_ONCE(bpf_trampoline_unlink_prog(&tr_link->link,
 						tr_link->trampoline));
 
@@ -3115,6 +3127,14 @@ static int bpf_tracing_prog_attach(struct bpf_prog *prog,
 			goto out_put_prog;
 		}
 		break;
+#ifdef CONFIG_BPF_SCHED
+	case BPF_PROG_TYPE_SCHED:
+		if (prog->expected_attach_type != BPF_SCHED) {
+			err = -EINVAL;
+			goto out_put_prog;
+		}
+		break;
+#endif
 	default:
 		err = -EINVAL;
 		goto out_put_prog;
@@ -3234,6 +3254,11 @@ static int bpf_tracing_prog_attach(struct bpf_prog *prog,
 		goto out_unlock;
 	}
 
+#ifdef CONFIG_BPF_SCHED
+	if (prog->type == BPF_PROG_TYPE_SCHED)
+		bpf_sched_inc();
+#endif
+
 	link->tgt_prog = tgt_prog;
 	link->trampoline = tr;
 
@@ -3582,6 +3607,9 @@ static int bpf_raw_tp_link_attach(struct bpf_prog *prog,
 	case BPF_PROG_TYPE_TRACING:
 	case BPF_PROG_TYPE_EXT:
 	case BPF_PROG_TYPE_LSM:
+#ifdef CONFIG_BPF_SCHED
+	case BPF_PROG_TYPE_SCHED:
+#endif
 		if (user_tp_name)
 			/* The attach point for this category of programs
 			 * should be specified via btf_id during program load.
@@ -3717,6 +3745,10 @@ attach_type_to_prog_type(enum bpf_attach_type attach_type)
 	case BPF_TCX_INGRESS:
 	case BPF_TCX_EGRESS:
 		return BPF_PROG_TYPE_SCHED_CLS;
+#ifdef CONFIG_BPF_SCHED
+	case BPF_SCHED:
+		return BPF_PROG_TYPE_SCHED;
+#endif
 	default:
 		return BPF_PROG_TYPE_UNSPEC;
 	}
@@ -3744,6 +3776,12 @@ static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog,
 			-EINVAL : 0;
 	case BPF_PROG_TYPE_EXT:
 		return 0;
+#ifdef CONFIG_BPF_SCHED
+	case BPF_PROG_TYPE_SCHED:
+		if (!capable(CAP_SYS_ADMIN))
+			return -EPERM;
+		return 0;
+#endif
 	case BPF_PROG_TYPE_NETFILTER:
 		if (attach_type != BPF_NETFILTER)
 			return -EINVAL;
@@ -4922,6 +4960,9 @@ static int link_create(union bpf_attr *attr, bpfptr_t uattr)
 		ret = cgroup_bpf_link_attach(attr, prog);
 		break;
 	case BPF_PROG_TYPE_EXT:
+#ifdef CONFIG_BPF_SCHED
+	case BPF_PROG_TYPE_SCHED:
+#endif
 		ret = bpf_tracing_prog_attach(prog, attr->link_create.target_fd,
 					      attr->link_create.target_btf_id,
diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c
index e97aeda3a86b55522a73e53279543cb5b4df6919..e5b97eb226e847af1faa2618f163cc8ec47ab4f0 100644
--- a/kernel/bpf/trampoline.c
+++ b/kernel/bpf/trampoline.c
@@ -493,6 +493,9 @@ static enum bpf_tramp_prog_type bpf_attach_type_to_tramp(struct bpf_prog *prog)
 	switch (prog->expected_attach_type) {
 	case BPF_TRACE_FENTRY:
 		return BPF_TRAMP_FENTRY;
+#ifdef CONFIG_BPF_SCHED
+	case BPF_SCHED:
+#endif
 	case BPF_MODIFY_RETURN:
 		return BPF_TRAMP_MODIFY_RETURN;
 	case BPF_TRACE_FEXIT:
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 824531d4c262a3409dffd3138e86776e8a770ebf..d3090e8778e10ceea50d54ef203d822bb2c9d4f5 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -28,6 +28,10 @@
 #include <linux/cpumask.h>
 #include <linux/bpf_mem_alloc.h>
 
+#ifdef CONFIG_BPF_SCHED
+#include <linux/bpf_sched.h>
+#endif
+
 #include "disasm.h"
 
 static const struct bpf_verifier_ops * const bpf_verifier_ops[] = {
@@ -19453,6 +19457,9 @@ int bpf_check_attach_target(struct bpf_verifier_log *log,
 	case BPF_LSM_CGROUP:
 	case BPF_TRACE_FENTRY:
 	case BPF_TRACE_FEXIT:
+#ifdef CONFIG_BPF_SCHED
+	case BPF_SCHED:
+#endif
 		if (!btf_type_is_func(t)) {
 			bpf_log(log, "attach_btf_id %u is not a function\n",
 				btf_id);
@@ -19627,10 +19634,18 @@ static int check_attach_btf_id(struct bpf_verifier_env *env)
 	if (prog->type == BPF_PROG_TYPE_STRUCT_OPS)
 		return check_struct_ops_btf_id(env);
 
+#ifdef CONFIG_BPF_SCHED
+	if (prog->type != BPF_PROG_TYPE_TRACING &&
+	    prog->type != BPF_PROG_TYPE_LSM &&
+	    prog->type != BPF_PROG_TYPE_EXT &&
+	    prog->type != BPF_PROG_TYPE_SCHED)
+		return 0;
+#else
 	if (prog->type != BPF_PROG_TYPE_TRACING &&
 	    prog->type != BPF_PROG_TYPE_LSM &&
 	    prog->type != BPF_PROG_TYPE_EXT)
 		return 0;
+#endif
 
 	ret = bpf_check_attach_target(&env->log, prog, tgt_prog, btf_id, &tgt_info);
 	if (ret)
@@ -19673,6 +19688,14 @@ static int check_attach_btf_id(struct bpf_verifier_env *env)
 		return -EINVAL;
 	}
 
+#ifdef CONFIG_BPF_SCHED
+	if (prog->type == BPF_PROG_TYPE_SCHED) {
+		ret = bpf_sched_verify_prog(&env->log, prog);
+		if (ret < 0)
+			return ret;
+	}
+#endif
+
 	key = bpf_trampoline_compute_key(tgt_prog, prog->aux->attach_btf, btf_id);
 	tr = bpf_trampoline_get(key, &tgt_info);
 	if (!tr)
diff --git a/kernel/sched/bpf_sched.c b/kernel/sched/bpf_sched.c
new file mode 100644
index 0000000000000000000000000000000000000000..e2525bd60abf30d9f89c51a547b00bc9125e20bd
--- /dev/null
+++ b/kernel/sched/bpf_sched.c
@@ -0,0 +1,64 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/bpf.h>
+#include <linux/cgroup.h>
+#include <linux/bpf_verifier.h>
+#include <linux/bpf_sched.h>
+#include <linux/btf_ids.h>
+#include "sched.h"
+
+DEFINE_STATIC_KEY_FALSE(bpf_sched_enabled_key);
+
+/*
+ * For every hook declare a nop function where a BPF program can be attached.
+ */
+#define BPF_SCHED_HOOK(RET, DEFAULT, NAME, ...)	\
+noinline RET bpf_sched_##NAME(__VA_ARGS__)	\
+{						\
+	return DEFAULT;				\
+}
+
+#include <linux/sched_hook_defs.h>
+#undef BPF_SCHED_HOOK
+
+#define BPF_SCHED_HOOK(RET, DEFAULT, NAME, ...) BTF_ID(func, bpf_sched_##NAME)
+BTF_SET_START(bpf_sched_hooks)
+#include <linux/sched_hook_defs.h>
+#undef BPF_SCHED_HOOK
+BTF_SET_END(bpf_sched_hooks)
+
+int bpf_sched_verify_prog(struct bpf_verifier_log *vlog,
+			  const struct bpf_prog *prog)
+{
+	if (!prog->gpl_compatible) {
+		bpf_log(vlog,
+			"sched programs must have a GPL compatible license\n");
+		return -EINVAL;
+	}
+
+	if (!btf_id_set_contains(&bpf_sched_hooks, prog->aux->attach_btf_id)) {
+		bpf_log(vlog, "attach_btf_id %u points to wrong type name %s\n",
+			prog->aux->attach_btf_id, prog->aux->attach_func_name);
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static const struct bpf_func_proto *
+bpf_sched_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
+{
+	switch (func_id) {
+	case BPF_FUNC_trace_printk:
+		return bpf_get_trace_printk_proto();
+	default:
+		return bpf_base_func_proto(func_id);
+	}
+}
+
+const struct bpf_prog_ops bpf_sched_prog_ops = {
+};
+
+const struct bpf_verifier_ops bpf_sched_verifier_ops = {
+	.get_func_proto = bpf_sched_func_proto,
+	.is_valid_access = btf_ctx_access,
+};
diff --git a/kernel/sched/build_utility.c b/kernel/sched/build_utility.c
index 99bdd96f454f4eba861b11b0aae6991d348dce0e..d44c584d9bc74a03b11de3cf1bd5f0689b4f6137 100644
--- a/kernel/sched/build_utility.c
+++ b/kernel/sched/build_utility.c
@@ -108,3 +108,7 @@
 #ifdef CONFIG_SCHED_AUTOGROUP
 # include "autogroup.c"
 #endif
+
+#ifdef CONFIG_BPF_SCHED
+# include "bpf_sched.c"
+#endif
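With the nop stubs and the verifier/attach plumbing in place, a scheduling policy is just an fentry-style BPF program attached to one of the hooks above. A minimal BPF-side sketch for cfs_select_rq; note the SEC() string assumes a libbpf that has been taught the new program type (plain upstream libbpf has no "sched/" section definition, so treat it as hypothetical):

// SPDX-License-Identifier: GPL-2.0
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

/*
 * Keep wakeups where they are: returning a CPU >= 0 short-circuits
 * select_task_rq_fair() (subject to is_cpu_allowed()), while any
 * negative value falls back to the stock wakeup path.
 */
SEC("sched/cfs_select_rq")
int BPF_PROG(sticky_select_rq, struct sched_migrate_ctx *ctx)
{
	if (ctx->is_sync)
		return ctx->curr_cpu;	/* sync wakeup: follow the waker */
	return ctx->prev_cpu;
}

char _license[] SEC("license") = "GPL";

Attachment itself goes through the ordinary tracing path (bpf_tracing_prog_attach() above), so once the program type and attach_btf_id are fixed at load time, a loader can reuse bpf_program__attach_trace() exactly as it would for fentry/fexit programs.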
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index bafb7b440263e38784aea95860fe18ef15baa9ce..f49884275d022019bd0d71897fbe395531d159bb 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2469,7 +2469,11 @@ static inline bool rq_has_pinned_tasks(struct rq *rq)
  * Per-CPU kthreads are allowed to run on !active && online CPUs, see
  * __set_cpus_allowed_ptr() and select_fallback_rq().
  */
+#ifdef CONFIG_BPF_SCHED
+inline bool is_cpu_allowed(struct task_struct *p, int cpu)
+#else
 static inline bool is_cpu_allowed(struct task_struct *p, int cpu)
+#endif
 {
 	/* When not in the task's cpumask, no point in looking further. */
 	if (!cpumask_test_cpu(cpu, p->cpus_ptr))
@@ -4542,6 +4546,9 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
 	p->migration_pending = NULL;
 #endif
 	init_sched_mm_cid(p);
+#ifdef CONFIG_BPF_SCHED
+	p->tag = 0;
+#endif
 }
 
 DEFINE_STATIC_KEY_FALSE(sched_numa_balancing);
@@ -9953,6 +9960,10 @@ LIST_HEAD(task_groups);
 static struct kmem_cache *task_group_cache __read_mostly;
 #endif
 
+#ifdef CONFIG_BPF_SCHED
+DECLARE_PER_CPU(cpumask_var_t, select_idle_mask);
+#endif
+
 void __init sched_init(void)
 {
 	unsigned long ptr = 0;
@@ -10008,6 +10019,13 @@ void __init sched_init(void)
 			global_rt_period(), global_rt_runtime());
 #endif /* CONFIG_RT_GROUP_SCHED */
 
+#if defined(CONFIG_CPUMASK_OFFSTACK) && defined(CONFIG_BPF_SCHED)
+	for_each_possible_cpu(i) {
+		per_cpu(select_idle_mask, i) = (cpumask_var_t)kzalloc_node(
+			cpumask_size(), GFP_KERNEL, cpu_to_node(i));
+	}
+#endif
+
 #ifdef CONFIG_CGROUP_SCHED
 	task_group_cache = KMEM_CACHE(task_group, 0);
 
@@ -10494,6 +10512,13 @@ static void sched_unregister_group(struct task_group *tg)
 	call_rcu(&tg->rcu, sched_free_group_rcu);
 }
 
+#ifdef CONFIG_BPF_SCHED
+static inline void tg_init_tag(struct task_group *tg, struct task_group *ptg)
+{
+	tg->tag = ptg->tag;
+}
+#endif
+
 /* allocate runqueue etc for a new task group */
 struct task_group *sched_create_group(struct task_group *parent)
 {
@@ -10514,6 +10539,10 @@ struct task_group *sched_create_group(struct task_group *parent)
 	if (!alloc_rt_sched_group(tg, parent))
 		goto err;
 
+#ifdef CONFIG_BPF_SCHED
+	tg_init_tag(tg, parent);
+#endif
+
 	alloc_uclamp_sched_group(tg, parent);
 
 	return tg;
@@ -10601,6 +10630,14 @@ static void sched_change_group(struct task_struct *tsk, struct task_group *group
 	sched_change_qos_group(tsk, group);
 #endif
 
+#ifdef CONFIG_BPF_SCHED
+	/*
+	 * This function has cleared and restored the task status,
+	 * so we do not need to dequeue and enqueue the task again.
+	 */
+	tsk->tag = group->tag;
+#endif
+
 #ifdef CONFIG_FAIR_GROUP_SCHED
 	if (tsk->sched_class->task_change_group)
 		tsk->sched_class->task_change_group(tsk);
@@ -11400,6 +11437,80 @@ static inline s64 cpu_qos_read(struct cgroup_subsys_state *css,
 }
 #endif
 
+#ifdef CONFIG_BPF_SCHED
+void sched_settag(struct task_struct *tsk, s64 tag)
+{
+	int queued, running, queue_flags =
+		DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
+	struct rq_flags rf;
+	struct rq *rq;
+
+	if (tsk->tag == tag)
+		return;
+
+	rq = task_rq_lock(tsk, &rf);
+
+	running = task_current(rq, tsk);
+	queued = task_on_rq_queued(tsk);
+
+	update_rq_clock(rq);
+	if (queued)
+		dequeue_task(rq, tsk, queue_flags);
+	if (running)
+		put_prev_task(rq, tsk);
+
+	tsk->tag = tag;
+
+	if (queued)
+		enqueue_task(rq, tsk, queue_flags);
+	if (running)
+		set_next_task(rq, tsk);
+
+	task_rq_unlock(rq, tsk, &rf);
+}
+
+int tg_change_tag(struct task_group *tg, void *data)
+{
+	struct css_task_iter it;
+	struct task_struct *tsk;
+	s64 tag = *(s64 *)data;
+	struct cgroup_subsys_state *css = &tg->css;
+
+	tg->tag = tag;
+
+	css_task_iter_start(css, 0, &it);
+	while ((tsk = css_task_iter_next(&it)))
+		sched_settag(tsk, tag);
+	css_task_iter_end(&it);
+
+	return 0;
+}
+
+static int cpu_tag_write(struct cgroup_subsys_state *css,
+			 struct cftype *cftype, s64 tag)
+{
+	struct task_group *tg = css_tg(css);
+
+	if (tg == &root_task_group)
+		return -EINVAL;
+
+	if (tg->tag == tag)
+		return 0;
+
+	rcu_read_lock();
+	walk_tg_tree_from(tg, tg_change_tag, tg_nop, (void *)(&tag));
+	rcu_read_unlock();
+
+	return 0;
+}
+
+static inline s64 cpu_tag_read(struct cgroup_subsys_state *css,
+			       struct cftype *cft)
+{
+	return css_tg(css)->tag;
+}
+#endif
+
 static struct cftype cpu_legacy_files[] = {
 #ifdef CONFIG_FAIR_GROUP_SCHED
 	{
@@ -11470,6 +11581,13 @@ static struct cftype cpu_legacy_files[] = {
 		.read_s64 = cpu_qos_read,
 		.write_s64 = cpu_qos_write,
 	},
+#endif
+#ifdef CONFIG_BPF_SCHED
+	{
+		.name = "tag",
+		.read_s64 = cpu_tag_read,
+		.write_s64 = cpu_tag_write,
+	},
 #endif
 	{ }	/* Terminate */
 };
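The cpu.tag file registered above is the cgroup-level counterpart of /proc/<pid>/tag: a single write re-tags every task in the subtree via walk_tg_tree_from()/tg_change_tag(). A sketch, assuming the legacy cpu controller is mounted at /sys/fs/cgroup/cpu and a group named "latency" already exists (both paths are illustrative):

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/sys/fs/cgroup/cpu/latency/cpu.tag", O_WRONLY);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	/* write_s64 interface: decimal text, same values as /proc/<pid>/tag */
	if (write(fd, "2", 1) != 1)
		perror("write");
	close(fd);
	return 0;
}

Note that cpu_tag_write() refuses the root task group, and groups created later inherit their parent's tag through tg_init_tag() in sched_create_group().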
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 9e32aae696ba7ce40e34a1fb3ab893469bdf760e..b0f65c68f6bb6c55122cce48a5b504dee420879a 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -52,6 +52,7 @@
 #include <linux/rbtree_augmented.h>
 #include <asm/switch_to.h>
+#include <linux/bpf_sched.h>
 
 #include "sched.h"
 #include "stats.h"
@@ -99,6 +100,10 @@ unsigned int sysctl_sched_child_runs_first __read_mostly;
 
 const_debug unsigned int sysctl_sched_migration_cost	= 500000UL;
 
+#ifdef CONFIG_BPF_SCHED
+DEFINE_PER_CPU(cpumask_var_t, select_idle_mask);
+#endif
+
 int sched_thermal_decay_shift;
 static int __init setup_sched_thermal_decay_shift(char *str)
 {
@@ -8468,6 +8473,10 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags)
 #ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
 	int idlest_cpu = -1;
 #endif
+#ifdef CONFIG_BPF_SCHED
+	struct sched_migrate_ctx ctx;
+	int ret;
+#endif
 
 	time = schedstat_start_time();
 
@@ -8502,6 +8511,25 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags)
 	}
 
 	rcu_read_lock();
+#ifdef CONFIG_BPF_SCHED
+	if (bpf_sched_enabled()) {
+		ctx.task = p;
+		ctx.prev_cpu = prev_cpu;
+		ctx.curr_cpu = cpu;
+		ctx.is_sync = sync;
+		ctx.wake_flags = wake_flags;
+		ctx.want_affine = want_affine;
+		ctx.sd_flag = sd_flag;
+		ctx.select_idle_mask = this_cpu_cpumask_var_ptr(select_idle_mask);
+
+		ret = bpf_sched_cfs_select_rq(&ctx);
+		if (ret >= 0 && is_cpu_allowed(p, ret)) {
+			rcu_read_unlock();
+			return ret;
+		}
+	}
+#endif
+
 	for_each_domain(cpu, tmp) {
 		/*
 		 * If both 'cpu' and 'prev_cpu' are part of this domain,
@@ -9879,9 +9907,26 @@ static
 int can_migrate_task(struct task_struct *p, struct lb_env *env)
 {
 	int tsk_cache_hot;
+#ifdef CONFIG_BPF_SCHED
+	struct sched_migrate_node migrate_node;
+	int ret;
+#endif
 
 	lockdep_assert_rq_held(env->src_rq);
 
+#ifdef CONFIG_BPF_SCHED
+	if (bpf_sched_enabled()) {
+		migrate_node.src_cpu = env->src_cpu;
+		migrate_node.src_node = cpu_to_node(env->src_cpu);
+		migrate_node.dst_cpu = env->dst_cpu;
+		migrate_node.dst_node = cpu_to_node(env->dst_cpu);
+
+		ret = bpf_sched_cfs_can_migrate_task(p, &migrate_node);
+		if (!ret)
+			return ret;
+	}
+#endif
+
 	/*
 	 * We do not migrate tasks that are:
 	 * 1) throttled_lb_pair, or
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 19fe3c72e3fa72bfbdab19bfe22c4870f047bab5..b7ba6933745375a1f9bf5748eb06507552f8b667 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -419,6 +419,9 @@ struct task_group {
 	struct uclamp_se	uclamp[UCLAMP_CNT];
 #endif
 
+#ifdef CONFIG_BPF_SCHED
+	long			tag;
+#endif
 };
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
@@ -453,6 +456,9 @@ static inline int walk_tg_tree(tg_visitor down, tg_visitor up, void *data)
 }
 
 extern int tg_nop(struct task_group *tg, void *data);
+#ifdef CONFIG_BPF_SCHED
+extern int tg_change_tag(struct task_group *tg, void *data);
+#endif
 
 extern void free_fair_sched_group(struct task_group *tg);
 extern int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent);
@@ -3615,4 +3621,8 @@ static inline void init_sched_mm_cid(struct task_struct *t) { }
 extern u64 avg_vruntime(struct cfs_rq *cfs_rq);
 extern int entity_eligible(struct cfs_rq *cfs_rq, struct sched_entity *se);
 
+#ifdef CONFIG_BPF_SCHED
+extern bool is_cpu_allowed(struct task_struct *p, int cpu);
+#endif
+
 #endif /* _KERNEL_SCHED_SCHED_H */
diff --git a/scripts/bpf_doc.py b/scripts/bpf_doc.py
index 61b7dddedc461e2ece91a7b25bcf14987fc98886..a8a547fdb321e724f22e748ffba05481a1205be0 100755
--- a/scripts/bpf_doc.py
+++ b/scripts/bpf_doc.py
@@ -700,6 +700,7 @@ class PrinterHelpers(Printer):
             'struct bpf_dynptr',
             'struct iphdr',
             'struct ipv6hdr',
+            'struct sched_migrate_ctx',
     ]
     known_types = {
             '...',
@@ -755,6 +756,7 @@ class PrinterHelpers(Printer):
             'const struct bpf_dynptr',
             'struct iphdr',
             'struct ipv6hdr',
+            'struct sched_migrate_ctx',
     }
     mapped_types = {
             'u8': '__u8',
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 4924f0cde1bcacd9ee3419aec56dad680094637d..9dd0b85549b6947e68ce1b4002f341ba73ce75c0 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -988,6 +988,7 @@ enum bpf_prog_type {
 	BPF_PROG_TYPE_SK_LOOKUP,
 	BPF_PROG_TYPE_SYSCALL, /* a program that can execute syscalls */
 	BPF_PROG_TYPE_NETFILTER,
+	BPF_PROG_TYPE_SCHED,
 };
 
 enum bpf_attach_type {
@@ -1040,6 +1041,7 @@ enum bpf_attach_type {
 	BPF_TCX_INGRESS,
 	BPF_TCX_EGRESS,
 	BPF_TRACE_UPROBE_MULTI,
+	BPF_SCHED,
 	__MAX_BPF_ATTACH_TYPE
 };
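Finally, the load-balance hook: can_migrate_task() treats a 0 return from bpf_sched_cfs_can_migrate_task() as "may not migrate", while the default -1 (or any other non-zero value) falls through to the normal throttling/affinity/cache-hotness checks. A sketch that pins tagged tasks to their NUMA node; the tag value and the SEC() string are illustrative, as before:

// SPDX-License-Identifier: GPL-2.0
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

#define TAG_PINNED	2	/* must match the tag set via proc/cgroup */

SEC("sched/cfs_can_migrate_task")
int BPF_PROG(numa_pin, struct task_struct *p,
	     struct sched_migrate_node *migrate_node)
{
	/* p->tag is readable here because tracing-style programs get
	 * BTF-checked access to their arguments via btf_ctx_access().
	 */
	if (p->tag == TAG_PINNED &&
	    migrate_node->src_node != migrate_node->dst_node)
		return 0;	/* veto cross-node migration */
	return -1;		/* no opinion: run the default checks */
}

char _license[] SEC("license") = "GPL";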