diff --git a/fs/proc/base.c b/fs/proc/base.c
index ffd54617c35478e92a9f6bef67013e16e6cd3183..db6bdb6f7de2abc816df8392d4d3a2ee2e7b2355 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -3588,6 +3588,64 @@ static const struct inode_operations proc_tid_comm_inode_operations = {
 	.permission = proc_tid_comm_permission,
 };
 
+#ifdef CONFIG_BPF_SCHED
+static ssize_t pid_tag_write(struct file *file, const char __user *buf,
+			     size_t count, loff_t *offset)
+{
+	struct inode *inode = file_inode(file);
+	struct task_struct *tsk;
+	int err = 0;
+	long tag = 0;
+
+	tsk = get_proc_task(inode);
+	if (!tsk)
+		return -ESRCH;
+
+	if (unlikely(tsk->pid == 1)) {
+		err = -EPERM;
+		goto out;
+	}
+
+	err = kstrtol_from_user(buf, count, 0, &tag);
+	if (err)
+		goto out;
+
+	sched_settag(tsk, tag);
+
+out:
+	put_task_struct(tsk);
+	return err < 0 ? err : count;
+}
+
+static int pid_tag_show(struct seq_file *m, void *v)
+{
+	struct inode *inode = m->private;
+	struct task_struct *tsk;
+
+	tsk = get_proc_task(inode);
+	if (!tsk)
+		return -ESRCH;
+
+	seq_printf(m, "%ld\n", tsk->tag);
+	put_task_struct(tsk);
+
+	return 0;
+}
+
+static int pid_tag_open(struct inode *inode, struct file *filp)
+{
+	return single_open(filp, pid_tag_show, inode);
+}
+
+static const struct file_operations proc_pid_tag_operations = {
+	.open		= pid_tag_open,
+	.read		= seq_read,
+	.write		= pid_tag_write,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
+#endif
+
 /*
  * Tasks
  */
@@ -3691,6 +3749,9 @@ static const struct pid_entry tid_base_stuff[] = {
 	ONE("ksm_merging_pages",  S_IRUSR, proc_pid_ksm_merging_pages),
 	ONE("ksm_stat",  S_IRUSR, proc_pid_ksm_stat),
 #endif
+#ifdef CONFIG_BPF_SCHED
+	REG("tag", 0644, proc_pid_tag_operations),
+#endif
 };
 
 static int proc_tid_base_readdir(struct file *file, struct dir_context *ctx)
diff --git a/include/linux/bpf_sched.h b/include/linux/bpf_sched.h
new file mode 100644
index 0000000000000000000000000000000000000000..9cd2493d2787ce9ea689d9c5d50d539453eb76f0
--- /dev/null
+++ b/include/linux/bpf_sched.h
@@ -0,0 +1,50 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_BPF_SCHED_H
+#define _LINUX_BPF_SCHED_H
+
+#include <linux/bpf.h>
+
+#ifdef CONFIG_BPF_SCHED
+
+#include <linux/jump_label.h>
+
+#define BPF_SCHED_HOOK(RET, DEFAULT, NAME, ...) \
+	RET bpf_sched_##NAME(__VA_ARGS__);
+#include <linux/sched_hook_defs.h>
+#undef BPF_SCHED_HOOK
+
+int bpf_sched_verify_prog(struct bpf_verifier_log *vlog,
+			  const struct bpf_prog *prog);
+
+DECLARE_STATIC_KEY_FALSE(bpf_sched_enabled_key);
+
+static inline bool bpf_sched_enabled(void)
+{
+	return static_branch_unlikely(&bpf_sched_enabled_key);
+}
+
+static inline void bpf_sched_inc(void)
+{
+	static_branch_inc(&bpf_sched_enabled_key);
+}
+
+static inline void bpf_sched_dec(void)
+{
+	static_branch_dec(&bpf_sched_enabled_key);
+}
+
+#else /* !CONFIG_BPF_SCHED */
+
+static inline int bpf_sched_verify_prog(struct bpf_verifier_log *vlog,
+					const struct bpf_prog *prog)
+{
+	return -EOPNOTSUPP;
+}
+
+static inline bool bpf_sched_enabled(void)
+{
+	return false;
+}
+
+#endif /* CONFIG_BPF_SCHED */
+#endif /* _LINUX_BPF_SCHED_H */
diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h
index fc0d6f32c68760b37872634fa3a0c0a0870c6066..dd79463eea4e31989cb816009f5d8ca039c75827 100644
--- a/include/linux/bpf_types.h
+++ b/include/linux/bpf_types.h
@@ -83,6 +83,10 @@ BPF_PROG_TYPE(BPF_PROG_TYPE_SYSCALL, bpf_syscall,
 BPF_PROG_TYPE(BPF_PROG_TYPE_NETFILTER, netfilter,
 	      struct bpf_nf_ctx, struct bpf_nf_ctx)
 #endif
+#ifdef CONFIG_BPF_SCHED
+BPF_PROG_TYPE(BPF_PROG_TYPE_SCHED, bpf_sched,
+	      void *, void *)
+#endif /* CONFIG_BPF_SCHED */
 
 BPF_MAP_TYPE(BPF_MAP_TYPE_ARRAY, array_map_ops)
 BPF_MAP_TYPE(BPF_MAP_TYPE_PERCPU_ARRAY, percpu_array_map_ops)
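Since the double expansion of sched_hook_defs.h is the heart of the new header, it is worth spelling out what the preprocessor generates. For the first hook declared later in this patch (cfs_select_rq), the two expansions — the prototype in bpf_sched.h above and the attachable nop body in kernel/sched/bpf_sched.c below — come out as follows (an illustration only, not additional patch content):

/* From <linux/bpf_sched.h>: a plain prototype per hook. */
int bpf_sched_cfs_select_rq(struct sched_migrate_ctx *ctx);

/* From kernel/sched/bpf_sched.c: a noinline stub returning the hook's
 * default value (-1, "not handled").  BPF_PROG_TYPE_SCHED programs
 * attach to this symbol through the fmod_ret-style trampoline and
 * replace the return value; noinline keeps the attach point from
 * being optimized away. */
noinline int bpf_sched_cfs_select_rq(struct sched_migrate_ctx *ctx)
{
	return -1;
}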
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 3520e3fbaa916670190eea018a4a6a01f78d5010..3e3a5a5ffa3931fdba201ee35bd5e062933f4978 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1537,6 +1537,10 @@ struct task_struct {
 	struct user_event_mm		*user_event_mm;
 #endif
 
+#ifdef CONFIG_BPF_SCHED
+	long				tag;
+#endif
+
 	/*
 	 * New fields for task_struct should be added above here, so that
 	 * they are included in the randomized portion of task_struct.
@@ -2469,4 +2473,38 @@ static inline int sched_qos_cpu_overload(void)
 }
 #endif
 
+#ifdef CONFIG_BPF_SCHED
+extern void sched_settag(struct task_struct *tsk, s64 tag);
+
+struct bpf_sched_cpu_stats {
+	/* nr_running */
+	unsigned int		nr_running;
+	unsigned int		cfs_nr_running;
+	unsigned int		cfs_h_nr_running;
+	unsigned int		cfs_idle_h_nr_running;
+	unsigned int		rt_nr_running;
+	unsigned int		rr_nr_running;
+};
+
+struct sched_migrate_ctx {
+	struct task_struct	*task;
+	struct cpumask		*select_idle_mask;
+	int			prev_cpu;
+	int			curr_cpu;
+	int			is_sync;
+	int			want_affine;
+	int			wake_flags;
+	int			sd_flag;
+	int			new_cpu;
+};
+
+struct sched_migrate_node {
+	int			src_cpu;
+	int			src_node;
+	int			dst_cpu;
+	int			dst_node;
+};
+
+#endif
+
 #endif
diff --git a/include/linux/sched_hook_defs.h b/include/linux/sched_hook_defs.h
new file mode 100644
index 0000000000000000000000000000000000000000..3d9fa7aed10aa9f9ed59228fb940f18f55984b90
--- /dev/null
+++ b/include/linux/sched_hook_defs.h
@@ -0,0 +1,3 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+BPF_SCHED_HOOK(int, -1, cfs_select_rq, struct sched_migrate_ctx *ctx)
+BPF_SCHED_HOOK(int, -1, cfs_can_migrate_task, struct task_struct *p, struct sched_migrate_node *migrate_node)
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 4924f0cde1bcacd9ee3419aec56dad680094637d..87914a0fc2e396d8a49faaaa7ab097d5cd046d2e 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -988,6 +988,7 @@ enum bpf_prog_type {
 	BPF_PROG_TYPE_SK_LOOKUP,
 	BPF_PROG_TYPE_SYSCALL, /* a program that can execute syscalls */
 	BPF_PROG_TYPE_NETFILTER,
+	BPF_PROG_TYPE_SCHED,
 };
 
 enum bpf_attach_type {
@@ -1040,6 +1041,7 @@ enum bpf_attach_type {
 	BPF_TCX_INGRESS,
 	BPF_TCX_EGRESS,
 	BPF_TRACE_UPROBE_MULTI,
+	BPF_SCHED,
 	__MAX_BPF_ATTACH_TYPE
 };
 
@@ -5642,6 +5644,12 @@ union bpf_attr {
 *		0 on success.
 *
 *		**-ENOENT** if the bpf_local_storage cannot be found.
+ *
+ * int bpf_sched_cpu_stats_of(int *cpu, struct bpf_sched_cpu_stats *ctx, int len)
+ *	Description
+ *		Get multiple types of *cpu* statistics and store in *ctx*.
+ *	Return
+ *		0 on success, or a negative error in case of failure.
 */
 #define ___BPF_FUNC_MAPPER(FN, ctx...)			\
 	FN(unspec, 0, ##ctx)				\
@@ -5856,6 +5864,7 @@ union bpf_attr {
 	FN(user_ringbuf_drain, 209, ##ctx)	\
 	FN(cgrp_storage_get, 210, ##ctx)	\
 	FN(cgrp_storage_delete, 211, ##ctx)	\
+	FN(sched_cpu_stats_of, 212, ##ctx)	\
 	/* */
 
 /* backwards-compatibility macros for users of __BPF_FUNC_MAPPER that don't
diff --git a/init/init_task.c b/init/init_task.c
index ff6c4b9bfe6b1c01c0d99d605d5f0c22693470d3..d377a089f002e101622c04ee028fa6e23fea5b95 100644
--- a/init/init_task.c
+++ b/init/init_task.c
@@ -210,6 +210,9 @@ struct task_struct init_task
 #ifdef CONFIG_SECCOMP_FILTER
 	.seccomp	= { .filter_count = ATOMIC_INIT(0) },
 #endif
+#ifdef CONFIG_BPF_SCHED
+	.tag		= 0,
+#endif
 };
 EXPORT_SYMBOL(init_task);
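With the hooks, context structs, and the sched_cpu_stats_of helper now visible, the new program type can be made concrete. Below is a minimal sketch of a scheduler BPF program for the cfs_select_rq hook. It is illustrative only: it assumes a vmlinux.h generated from a kernel built with CONFIG_BPF_SCHED (so that sched_migrate_ctx and bpf_sched_cpu_stats are in BTF), the helper is declared by hand against its id (212) in case bpf_helper_defs.h has not been regenerated, and the file name select_rq.bpf.c is made up for this walkthrough.

// SPDX-License-Identifier: GPL-2.0
/* select_rq.bpf.c - illustrative only, not part of the patch */
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

/* Helper id 212 as added by this patch, declared by hand. */
static int (*bpf_sched_cpu_stats_of)(int *cpu,
				     struct bpf_sched_cpu_stats *ctx,
				     int len) = (void *)212;

SEC("sched/cfs_select_rq")
int BPF_PROG(cfs_select_rq, struct sched_migrate_ctx *c)
{
	struct bpf_sched_cpu_stats stats = {};
	int prev = c->prev_cpu;

	if (bpf_sched_cpu_stats_of(&prev, &stats, sizeof(stats)) < 0)
		return -1;	/* error: fall back to the default path */

	/* Keep the waking task on its previous CPU while that CPU is idle
	 * or only lightly loaded; otherwise return -1 so CFS runs its
	 * normal wakeup path (see the fair.c hunk later in this patch). */
	if (stats.nr_running <= 1)
		return prev;

	return -1;
}

char LICENSE[] SEC("license") = "GPL";

The GPL license string is not optional: bpf_sched_verify_prog(), added below, rejects non-GPL-compatible programs at load time.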
diff --git a/kernel/bpf/Kconfig b/kernel/bpf/Kconfig
index 6a906ff930065268dbdf4e5c4ea0bb3f851bcb2f..bf20943ff18caf851152fa1e410126389d15a846 100644
--- a/kernel/bpf/Kconfig
+++ b/kernel/bpf/Kconfig
@@ -100,4 +100,14 @@ config BPF_LSM
 
 	  If you are unsure how to answer this question, answer N.
 
+config BPF_SCHED
+	bool "Sched Instrumentation with BPF"
+	depends on BPF_EVENTS
+	depends on BPF_SYSCALL
+	help
+	  Enables instrumentation of the sched hooks with eBPF programs for
+	  implementing dynamic scheduling policies.
+
+	  If you are unsure how to answer this question, answer N.
+
 endmenu # "BPF subsystem"
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 8090d7fb11ef686019b4734bbc7151057ee78efa..133805b4bc71ebc0a2a783a3e186adbb490fc563 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -5982,6 +5982,7 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type,
 			return true;
 		t = btf_type_by_id(btf, t->type);
 		break;
+	case BPF_SCHED:
 	case BPF_MODIFY_RETURN:
 		/* For now the BPF_MODIFY_RETURN can only be attached to
 		 * functions that return an int.
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index d77b2f8b93641b4445645ec2d9397849fe6a5af3..aea7fe557d63562a5d41a15fafad94f7e0ee7bc4 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -36,6 +36,7 @@
 #include
 #include
 #include
+#include <linux/bpf_sched.h>
 
 #include
 
@@ -2412,6 +2413,7 @@ bpf_prog_load_check_attach(enum bpf_prog_type prog_type,
 	case BPF_PROG_TYPE_LSM:
 	case BPF_PROG_TYPE_STRUCT_OPS:
 	case BPF_PROG_TYPE_EXT:
+	case BPF_PROG_TYPE_SCHED:
 		break;
 	default:
 		return -EINVAL;
@@ -2539,6 +2541,7 @@ static bool is_perfmon_prog_type(enum bpf_prog_type prog_type)
 	case BPF_PROG_TYPE_LSM:
 	case BPF_PROG_TYPE_STRUCT_OPS: /* has access to struct sock */
 	case BPF_PROG_TYPE_EXT: /* extends any prog */
+	case BPF_PROG_TYPE_SCHED:
 		return true;
 	default:
 		return false;
@@ -3025,6 +3028,11 @@ static void bpf_tracing_link_release(struct bpf_link *link)
 	struct bpf_tracing_link *tr_link =
 		container_of(link, struct bpf_tracing_link, link.link);
 
+#ifdef CONFIG_BPF_SCHED
+	if (link->prog->type == BPF_PROG_TYPE_SCHED)
+		bpf_sched_dec();
+#endif
+
 	WARN_ON_ONCE(bpf_trampoline_unlink_prog(&tr_link->link,
 						tr_link->trampoline));
 
@@ -3115,6 +3123,12 @@ static int bpf_tracing_prog_attach(struct bpf_prog *prog,
 			goto out_put_prog;
 		}
 		break;
+	case BPF_PROG_TYPE_SCHED:
+		if (prog->expected_attach_type != BPF_SCHED) {
+			err = -EINVAL;
+			goto out_put_prog;
+		}
+		break;
 	default:
 		err = -EINVAL;
 		goto out_put_prog;
@@ -3234,6 +3248,11 @@ static int bpf_tracing_prog_attach(struct bpf_prog *prog,
 		goto out_unlock;
 	}
 
+#ifdef CONFIG_BPF_SCHED
+	if (prog->type == BPF_PROG_TYPE_SCHED)
+		bpf_sched_inc();
+#endif
+
 	link->tgt_prog = tgt_prog;
 	link->trampoline = tr;
 
@@ -3582,6 +3601,7 @@ static int bpf_raw_tp_link_attach(struct bpf_prog *prog,
 	case BPF_PROG_TYPE_TRACING:
 	case BPF_PROG_TYPE_EXT:
 	case BPF_PROG_TYPE_LSM:
+	case BPF_PROG_TYPE_SCHED:
 		if (user_tp_name)
 			/* The attach point for this category of programs
 			 * should be specified via btf_id during program load.
@@ -3717,6 +3737,8 @@ attach_type_to_prog_type(enum bpf_attach_type attach_type)
 	case BPF_TCX_INGRESS:
 	case BPF_TCX_EGRESS:
 		return BPF_PROG_TYPE_SCHED_CLS;
+	case BPF_SCHED:
+		return BPF_PROG_TYPE_SCHED;
 	default:
 		return BPF_PROG_TYPE_UNSPEC;
 	}
@@ -3744,6 +3766,10 @@ static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog,
 		       -EINVAL : 0;
 	case BPF_PROG_TYPE_EXT:
 		return 0;
+	case BPF_PROG_TYPE_SCHED:
+		if (!capable(CAP_SYS_ADMIN))
+			return -EPERM;
+		return 0;
 	case BPF_PROG_TYPE_NETFILTER:
 		if (attach_type != BPF_NETFILTER)
 			return -EINVAL;
@@ -4922,6 +4948,7 @@ static int link_create(union bpf_attr *attr, bpfptr_t uattr)
 		ret = cgroup_bpf_link_attach(attr, prog);
 		break;
 	case BPF_PROG_TYPE_EXT:
+	case BPF_PROG_TYPE_SCHED:
 		ret = bpf_tracing_prog_attach(prog,
 					      attr->link_create.target_fd,
 					      attr->link_create.target_btf_id,
diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c
index e97aeda3a86b55522a73e53279543cb5b4df6919..2aa01c26f6a4e021ed8f035632c688baeb40d3d5 100644
--- a/kernel/bpf/trampoline.c
+++ b/kernel/bpf/trampoline.c
@@ -493,6 +493,7 @@ static enum bpf_tramp_prog_type bpf_attach_type_to_tramp(struct bpf_prog *prog)
 	switch (prog->expected_attach_type) {
 	case BPF_TRACE_FENTRY:
 		return BPF_TRAMP_FENTRY;
+	case BPF_SCHED:
 	case BPF_MODIFY_RETURN:
 		return BPF_TRAMP_MODIFY_RETURN;
 	case BPF_TRACE_FEXIT:
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 824531d4c262a3409dffd3138e86776e8a770ebf..c8da0be7d576c7fac80165af2f4b1ebd1e1fbd92 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -27,6 +27,7 @@
 #include
 #include
 #include
+#include <linux/bpf_sched.h>
 
 #include "disasm.h"
 
@@ -19453,6 +19454,7 @@ int bpf_check_attach_target(struct bpf_verifier_log *log,
 	case BPF_LSM_CGROUP:
 	case BPF_TRACE_FENTRY:
 	case BPF_TRACE_FEXIT:
+	case BPF_SCHED:
 		if (!btf_type_is_func(t)) {
 			bpf_log(log, "attach_btf_id %u is not a function\n",
 				btf_id);
@@ -19629,7 +19631,8 @@ static int check_attach_btf_id(struct bpf_verifier_env *env)
 
 	if (prog->type != BPF_PROG_TYPE_TRACING &&
 	    prog->type != BPF_PROG_TYPE_LSM &&
-	    prog->type != BPF_PROG_TYPE_EXT)
+	    prog->type != BPF_PROG_TYPE_EXT &&
+	    prog->type != BPF_PROG_TYPE_SCHED)
 		return 0;
 
 	ret = bpf_check_attach_target(&env->log, prog, tgt_prog, btf_id, &tgt_info);
@@ -19673,6 +19676,12 @@ static int check_attach_btf_id(struct bpf_verifier_env *env)
 			return -EINVAL;
 		}
 
+	if (prog->type == BPF_PROG_TYPE_SCHED) {
+		ret = bpf_sched_verify_prog(&env->log, prog);
+		if (ret < 0)
+			return ret;
+	}
+
 	key = bpf_trampoline_compute_key(tgt_prog, prog->aux->attach_btf, btf_id);
 	tr = bpf_trampoline_get(key, &tgt_info);
 	if (!tr)
diff --git a/kernel/sched/bpf_sched.c b/kernel/sched/bpf_sched.c
new file mode 100644
index 0000000000000000000000000000000000000000..1ddff44b6a93e4ae98ea947cc659a1363e09b3b0
--- /dev/null
+++ b/kernel/sched/bpf_sched.c
@@ -0,0 +1,102 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/bpf.h>
+#include <linux/cgroup.h>
+#include <linux/bpf_verifier.h>
+#include <linux/bpf_sched.h>
+#include <linux/btf_ids.h>
+#include "sched.h"
+
+DEFINE_STATIC_KEY_FALSE(bpf_sched_enabled_key);
+
+/*
+ * For every hook declare a nop function where a BPF program can be attached.
+ */
+#define BPF_SCHED_HOOK(RET, DEFAULT, NAME, ...)	\
+noinline RET bpf_sched_##NAME(__VA_ARGS__)	\
+{						\
+	return DEFAULT;				\
+}
+
+#include <linux/sched_hook_defs.h>
+#undef BPF_SCHED_HOOK
+
+#define BPF_SCHED_HOOK(RET, DEFAULT, NAME, ...) \
+	BTF_ID(func, bpf_sched_##NAME)
+BTF_SET_START(bpf_sched_hooks)
+#include <linux/sched_hook_defs.h>
+#undef BPF_SCHED_HOOK
+BTF_SET_END(bpf_sched_hooks)
+
+int bpf_sched_verify_prog(struct bpf_verifier_log *vlog,
+			  const struct bpf_prog *prog)
+{
+	if (!prog->gpl_compatible) {
+		bpf_log(vlog,
+			"sched programs must have a GPL compatible license\n");
+		return -EINVAL;
+	}
+
+	if (!btf_id_set_contains(&bpf_sched_hooks, prog->aux->attach_btf_id)) {
+		bpf_log(vlog, "attach_btf_id %u points to wrong type name %s\n",
+			prog->aux->attach_btf_id, prog->aux->attach_func_name);
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+BPF_CALL_3(bpf_sched_cpu_stats_of, int *, cpuid,
+	   struct bpf_sched_cpu_stats *, ctx,
+	   int, len)
+{
+	struct rq *rq;
+	int cpu = *cpuid;
+
+	if ((unsigned int)cpu >= nr_cpu_ids) {
+		memset(ctx, 0, len);
+		return -EINVAL;
+	}
+
+	rq = cpu_rq(cpu);
+	memset(ctx, 0, len);
+
+	SCHED_WARN_ON(!rcu_read_lock_held());
+	/* nr_running */
+	ctx->nr_running = rq->nr_running;
+	ctx->cfs_nr_running = rq->cfs.nr_running;
+	ctx->cfs_h_nr_running = rq->cfs.h_nr_running;
+	ctx->cfs_idle_h_nr_running = rq->cfs.idle_h_nr_running;
+	ctx->rt_nr_running = rq->rt.rt_nr_running;
+	ctx->rr_nr_running = rq->rt.rr_nr_running;
+
+	return 0;
+}
+
+static const struct bpf_func_proto bpf_sched_cpu_stats_of_proto = {
+	.func		= bpf_sched_cpu_stats_of,
+	.gpl_only	= false,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_INT,
+	.arg2_type	= ARG_PTR_TO_UNINIT_MEM,
+	.arg3_type	= ARG_CONST_SIZE,
+};
+
+static const struct bpf_func_proto *
+bpf_sched_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
+{
+	switch (func_id) {
+	case BPF_FUNC_trace_printk:
+		return bpf_get_trace_printk_proto();
+	case BPF_FUNC_sched_cpu_stats_of:
+		return &bpf_sched_cpu_stats_of_proto;
+	default:
+		return bpf_base_func_proto(func_id);
+	}
+}
+
+const struct bpf_prog_ops bpf_sched_prog_ops = {
+};
+
+const struct bpf_verifier_ops bpf_sched_verifier_ops = {
+	.get_func_proto = bpf_sched_func_proto,
+	.is_valid_access = btf_ctx_access,
+};
diff --git a/kernel/sched/build_utility.c b/kernel/sched/build_utility.c
index 99bdd96f454f4eba861b11b0aae6991d348dce0e..d44c584d9bc74a03b11de3cf1bd5f0689b4f6137 100644
--- a/kernel/sched/build_utility.c
+++ b/kernel/sched/build_utility.c
@@ -108,3 +108,7 @@
 #ifdef CONFIG_SCHED_AUTOGROUP
 # include "autogroup.c"
 #endif
+
+#ifdef CONFIG_BPF_SCHED
+# include "bpf_sched.c"
+#endif
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index a1c73dea1f778c4038fef05cab1875335bc3dd17..0898d8c5d9c3498c816e0614019cfd418b41fa60 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2468,7 +2468,11 @@ static inline bool rq_has_pinned_tasks(struct rq *rq)
  * Per-CPU kthreads are allowed to run on !active && online CPUs, see
  * __set_cpus_allowed_ptr() and select_fallback_rq().
  */
+#ifdef CONFIG_BPF_SCHED
+inline bool is_cpu_allowed(struct task_struct *p, int cpu)
+#else
 static inline bool is_cpu_allowed(struct task_struct *p, int cpu)
+#endif
 {
 	/* When not in the task's cpumask, no point in looking further. */
 	if (!cpumask_test_cpu(cpu, p->cpus_ptr))
@@ -4541,6 +4545,9 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
 	p->migration_pending = NULL;
 #endif
 	init_sched_mm_cid(p);
+#ifdef CONFIG_BPF_SCHED
+	p->tag = 0;
+#endif
 }
 
 DEFINE_STATIC_KEY_FALSE(sched_numa_balancing);
@@ -9921,6 +9928,10 @@ LIST_HEAD(task_groups);
 static struct kmem_cache *task_group_cache __read_mostly;
 #endif
 
+#ifdef CONFIG_BPF_SCHED
+DECLARE_PER_CPU(cpumask_var_t, select_idle_mask);
+#endif
+
 void __init sched_init(void)
 {
 	unsigned long ptr = 0;
@@ -9976,6 +9987,13 @@ void __init sched_init(void)
 			global_rt_period(), global_rt_runtime());
 #endif /* CONFIG_RT_GROUP_SCHED */
 
+#if defined(CONFIG_CPUMASK_OFFSTACK) && defined(CONFIG_BPF_SCHED)
+	for_each_possible_cpu(i) {
+		per_cpu(select_idle_mask, i) = (cpumask_var_t)kzalloc_node(
+			cpumask_size(), GFP_KERNEL, cpu_to_node(i));
+	}
+#endif
+
 #ifdef CONFIG_CGROUP_SCHED
 	task_group_cache = KMEM_CACHE(task_group, 0);
 
@@ -10459,6 +10477,13 @@ static void sched_unregister_group(struct task_group *tg)
 	call_rcu(&tg->rcu, sched_free_group_rcu);
 }
 
+#ifdef CONFIG_BPF_SCHED
+static inline void tg_init_tag(struct task_group *tg, struct task_group *ptg)
+{
+	tg->tag = ptg->tag;
+}
+#endif
+
 /* allocate runqueue etc for a new task group */
 struct task_group *sched_create_group(struct task_group *parent)
 {
@@ -10479,6 +10504,10 @@ struct task_group *sched_create_group(struct task_group *parent)
 	if (!alloc_rt_sched_group(tg, parent))
 		goto err;
 
+#ifdef CONFIG_BPF_SCHED
+	tg_init_tag(tg, parent);
+#endif
+
 	alloc_uclamp_sched_group(tg, parent);
 
 	return tg;
@@ -10566,6 +10595,14 @@ static void sched_change_group(struct task_struct *tsk, struct task_group *group
 	sched_change_qos_group(tsk, group);
 #endif
 
+#ifdef CONFIG_BPF_SCHED
+	/*
+	 * sched_move_task() has already dequeued/put the task and will
+	 * re-enqueue it after this call, so no dequeue/enqueue is needed here.
+	 */
+	tsk->tag = group->tag;
+#endif
+
 #ifdef CONFIG_FAIR_GROUP_SCHED
 	if (tsk->sched_class->task_change_group)
 		tsk->sched_class->task_change_group(tsk);
@@ -11349,12 +11386,86 @@ static int cpu_qos_write(struct cgroup_subsys_state *css,
 done:
 	return 0;
 }
+#endif
 
 static inline s64 cpu_qos_read(struct cgroup_subsys_state *css,
 			       struct cftype *cft)
 {
 	return css_tg(css)->qos_level;
 }
+
+#ifdef CONFIG_BPF_SCHED
+void sched_settag(struct task_struct *tsk, s64 tag)
+{
+	int queued, running, queue_flags =
+		DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
+	struct rq_flags rf;
+	struct rq *rq;
+
+	if (tsk->tag == tag)
+		return;
+
+	rq = task_rq_lock(tsk, &rf);
+
+	running = task_current(rq, tsk);
+	queued = task_on_rq_queued(tsk);
+
+	update_rq_clock(rq);
+	if (queued)
+		dequeue_task(rq, tsk, queue_flags);
+	if (running)
+		put_prev_task(rq, tsk);
+
+	tsk->tag = tag;
+
+	if (queued)
+		enqueue_task(rq, tsk, queue_flags);
+	if (running)
+		set_next_task(rq, tsk);
+
+	task_rq_unlock(rq, tsk, &rf);
+}
+
+int tg_change_tag(struct task_group *tg, void *data)
+{
+	struct css_task_iter it;
+	struct task_struct *tsk;
+	s64 tag = *(s64 *)data;
+	struct cgroup_subsys_state *css = &tg->css;
+
+	tg->tag = tag;
+
+	css_task_iter_start(css, 0, &it);
+	while ((tsk = css_task_iter_next(&it)))
+		sched_settag(tsk, tag);
+	css_task_iter_end(&it);
+
+	return 0;
+}
+
+static int cpu_tag_write(struct cgroup_subsys_state *css,
+			 struct cftype *cftype, s64 tag)
+{
+	struct task_group *tg = css_tg(css);
+
+	if (tg == &root_task_group)
+		return -EINVAL;
+
+	if (tg->tag == tag)
+		return 0;
+
+	rcu_read_lock();
+	walk_tg_tree_from(tg, tg_change_tag, tg_nop, (void *)(&tag));
+	rcu_read_unlock();
+
+	return 0;
+}
+
+static inline s64 cpu_tag_read(struct cgroup_subsys_state *css,
+			       struct cftype *cft)
+{
+	return css_tg(css)->tag;
+}
 #endif
 
 static struct cftype cpu_legacy_files[] = {
@@ -11427,6 +11538,13 @@ static struct cftype cpu_legacy_files[] = {
 		.read_s64 = cpu_qos_read,
 		.write_s64 = cpu_qos_write,
 	},
+#endif
+#ifdef CONFIG_BPF_SCHED
+	{
+		.name = "tag",
+		.read_s64 = cpu_tag_read,
+		.write_s64 = cpu_tag_write,
+	},
 #endif
 	{ }	/* Terminate */
 };
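Before moving on to the scheduler hooks themselves, a quick userspace sanity check of the two tag interfaces added above. The snippet is illustrative only and not part of the patch; note that the per-task file is registered in tid_base_stuff, so it lives under /proc/<pid>/task/<tid>/ rather than directly in /proc/<pid>/.

/* tag_demo.c - illustrative only */
#define _GNU_SOURCE
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	char path[64];
	long tag = 0;
	FILE *f;

	/* "tag" is registered in tid_base_stuff, i.e. per thread. */
	snprintf(path, sizeof(path), "/proc/%d/task/%d/tag",
		 getpid(), (int)gettid());

	f = fopen(path, "w");
	if (!f) {
		perror(path);
		return 1;
	}
	fprintf(f, "42\n");	/* ends up in sched_settag() */
	fclose(f);

	f = fopen(path, "r");
	if (!f) {
		perror(path);
		return 1;
	}
	if (fscanf(f, "%ld", &tag) == 1)
		printf("task tag is now %ld\n", tag);
	fclose(f);
	return 0;
}

The cgroup side is analogous: writing an s64 to the cpu.tag file of a CPU-controller cgroup propagates the value to every descendant group and task via walk_tg_tree_from()/tg_change_tag().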
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 0de55884f9dadb7fc976824b16c1ddfd054bf668..2d2db34a69340da3e4a89d6b20edf2f54399a3cf 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -52,6 +52,7 @@
 #include
 #include
+#include <linux/bpf_sched.h>
 
 #include "sched.h"
 #include "stats.h"
@@ -91,6 +92,10 @@ unsigned int sysctl_sched_child_runs_first __read_mostly;
 
 const_debug unsigned int sysctl_sched_migration_cost	= 500000UL;
 
+#ifdef CONFIG_BPF_SCHED
+DEFINE_PER_CPU(cpumask_var_t, select_idle_mask);
+#endif
+
 int sched_thermal_decay_shift;
 static int __init setup_sched_thermal_decay_shift(char *str)
 {
@@ -8094,6 +8099,11 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags)
 	/* SD_flags and WF_flags share the first nibble */
 	int sd_flag = wake_flags & 0xF;
 
+#ifdef CONFIG_BPF_SCHED
+	struct sched_migrate_ctx ctx;
+	int ret;
+#endif
+
 	/*
 	 * required for stable ->cpus_allowed
 	 */
@@ -8116,6 +8126,25 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags)
 	}
 
 	rcu_read_lock();
+#ifdef CONFIG_BPF_SCHED
+	if (bpf_sched_enabled()) {
+		ctx.task = p;
+		ctx.prev_cpu = prev_cpu;
+		ctx.curr_cpu = cpu;
+		ctx.is_sync = sync;
+		ctx.wake_flags = wake_flags;
+		ctx.want_affine = want_affine;
+		ctx.sd_flag = sd_flag;
+		ctx.select_idle_mask = this_cpu_cpumask_var_ptr(select_idle_mask);
+
+		ret = bpf_sched_cfs_select_rq(&ctx);
+		if (ret >= 0 && is_cpu_allowed(p, ret)) {
+			rcu_read_unlock();
+			return ret;
+		}
+	}
+#endif
+
 	for_each_domain(cpu, tmp) {
 		/*
 		 * If both 'cpu' and 'prev_cpu' are part of this domain,
@@ -9149,9 +9178,26 @@ static
 int can_migrate_task(struct task_struct *p, struct lb_env *env)
 {
 	int tsk_cache_hot;
+#ifdef CONFIG_BPF_SCHED
+	struct sched_migrate_node migrate_node;
+	int ret;
+#endif
 
 	lockdep_assert_rq_held(env->src_rq);
 
+#ifdef CONFIG_BPF_SCHED
+	if (bpf_sched_enabled()) {
+		migrate_node.src_cpu = env->src_cpu;
+		migrate_node.src_node = cpu_to_node(env->src_cpu);
+		migrate_node.dst_cpu = env->dst_cpu;
+		migrate_node.dst_node = cpu_to_node(env->dst_cpu);
+
+		ret = bpf_sched_cfs_can_migrate_task(p, &migrate_node);
+		if (!ret)
+			return ret;
+	}
+#endif
+
 	/*
 	 * We do not migrate tasks that are:
 	 * 1) throttled_lb_pair, or
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 3de84e95baf1cbee2a0a79d9bdcdb1f4b960b103..53cc14e49acda7d1215500c979987aea833b3712 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -416,6 +416,9 @@ struct task_group {
 	struct uclamp_se	uclamp[UCLAMP_CNT];
 #endif
 
+#ifdef CONFIG_BPF_SCHED
+	long			tag;
+#endif
 };
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
@@ -450,6 +453,9 @@ static inline int walk_tg_tree(tg_visitor down, tg_visitor up, void *data)
 }
 
 extern int tg_nop(struct task_group *tg, void *data);
+#ifdef CONFIG_BPF_SCHED
+extern int tg_change_tag(struct task_group *tg, void *data);
+#endif
 
 extern void free_fair_sched_group(struct task_group *tg);
 extern int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent);
@@ -3540,4 +3546,8 @@ static inline void init_sched_mm_cid(struct task_struct *t) { }
 extern u64 avg_vruntime(struct cfs_rq *cfs_rq);
 extern int entity_eligible(struct cfs_rq *cfs_rq, struct sched_entity *se);
 
+#ifdef CONFIG_BPF_SCHED
+inline bool is_cpu_allowed(struct task_struct *p, int cpu);
+#endif
+
 #endif /* _KERNEL_SCHED_SCHED_H */
diff --git a/scripts/bpf_doc.py b/scripts/bpf_doc.py
index 61b7dddedc461e2ece91a7b25bcf14987fc98886..359373bc8dab0cb7a86ce1066117aa9845e56909 100755
--- a/scripts/bpf_doc.py
+++ b/scripts/bpf_doc.py
@@ -700,6 +700,8 @@ class PrinterHelpers(Printer):
             'struct bpf_dynptr',
             'struct iphdr',
             'struct ipv6hdr',
+            'struct bpf_sched_cpu_stats',
+            'struct sched_migrate_ctx',
     ]
     known_types = {
             '...',
@@ -755,6 +757,8 @@ class PrinterHelpers(Printer):
             'const struct bpf_dynptr',
             'struct iphdr',
             'struct ipv6hdr',
+            'struct bpf_sched_cpu_stats',
+            'struct sched_migrate_ctx',
     }
     mapped_types = {
             'u8': '__u8',
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 4924f0cde1bcacd9ee3419aec56dad680094637d..87914a0fc2e396d8a49faaaa7ab097d5cd046d2e 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -988,6 +988,7 @@ enum bpf_prog_type {
 	BPF_PROG_TYPE_SK_LOOKUP,
 	BPF_PROG_TYPE_SYSCALL, /* a program that can execute syscalls */
 	BPF_PROG_TYPE_NETFILTER,
+	BPF_PROG_TYPE_SCHED,
 };
 
 enum bpf_attach_type {
@@ -1040,6 +1041,7 @@ enum bpf_attach_type {
 	BPF_TCX_INGRESS,
 	BPF_TCX_EGRESS,
 	BPF_TRACE_UPROBE_MULTI,
+	BPF_SCHED,
 	__MAX_BPF_ATTACH_TYPE
 };
 
@@ -5642,6 +5644,12 @@ union bpf_attr {
 *		0 on success.
 *
 *		**-ENOENT** if the bpf_local_storage cannot be found.
+ *
+ * int bpf_sched_cpu_stats_of(int *cpu, struct bpf_sched_cpu_stats *ctx, int len)
+ *	Description
+ *		Get multiple types of *cpu* statistics and store in *ctx*.
+ *	Return
+ *		0 on success, or a negative error in case of failure.
 */
 #define ___BPF_FUNC_MAPPER(FN, ctx...)			\
 	FN(unspec, 0, ##ctx)				\
@@ -5856,6 +5864,7 @@ union bpf_attr {
 	FN(user_ringbuf_drain, 209, ##ctx)	\
 	FN(cgrp_storage_get, 210, ##ctx)	\
 	FN(cgrp_storage_delete, 211, ##ctx)	\
+	FN(sched_cpu_stats_of, 212, ##ctx)	\
 	/* */
 
 /* backwards-compatibility macros for users of __BPF_FUNC_MAPPER that don't
diff --git a/tools/lib/bpf/bpf.c b/tools/lib/bpf/bpf.c
index b0f1913763a33250987aaf4112682df4fdc6895e..ddbb16be651fa6b19df926693ae02a2940999fe4 100644
--- a/tools/lib/bpf/bpf.c
+++ b/tools/lib/bpf/bpf.c
@@ -781,6 +781,7 @@ int bpf_link_create(int prog_fd, int target_fd,
 	case BPF_TRACE_FENTRY:
 	case BPF_TRACE_FEXIT:
 	case BPF_MODIFY_RETURN:
+	case BPF_SCHED:
 	case BPF_LSM_MAC:
 		attr.link_create.tracing.cookie = OPTS_GET(opts, tracing.cookie, 0);
 		if (!OPTS_ZEROED(opts, tracing))
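The second hook can be driven the same way. Below is a minimal sketch for cfs_can_migrate_task, again illustrative only: as the fair.c hunk above shows, returning 0 vetoes the migration, while any other value (including the hook's default -1) lets can_migrate_task() continue with its usual checks.

// SPDX-License-Identifier: GPL-2.0
/* can_migrate.bpf.c - illustrative only, not part of the patch */
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

SEC("sched/cfs_can_migrate_task")
int BPF_PROG(cfs_can_migrate_task, struct task_struct *p,
	     struct sched_migrate_node *node)
{
	/* Keep tagged tasks on their current NUMA node: veto any
	 * load-balancer migration that would cross node boundaries. */
	if (p->tag && node->src_node != node->dst_node)
		return 0;

	return -1;	/* defer to the kernel's default checks */
}

char LICENSE[] SEC("license") = "GPL";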
diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
index 96ff1aa4bf6a0ebb49abf2d4f886c66f0d522da3..41697c6274b916e666d570cb09b8e0a046dbb39b 100644
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -121,6 +121,7 @@ static const char * const attach_type_name[] = {
 	[BPF_TCX_INGRESS]		= "tcx_ingress",
 	[BPF_TCX_EGRESS]		= "tcx_egress",
 	[BPF_TRACE_UPROBE_MULTI]	= "trace_uprobe_multi",
+	[BPF_SCHED]			= "sched",
 };
 
 static const char * const link_type_name[] = {
@@ -209,6 +210,7 @@ static const char * const prog_type_name[] = {
 	[BPF_PROG_TYPE_SK_LOOKUP]		= "sk_lookup",
 	[BPF_PROG_TYPE_SYSCALL]			= "syscall",
 	[BPF_PROG_TYPE_NETFILTER]		= "netfilter",
+	[BPF_PROG_TYPE_SCHED]			= "sched",
 };
 
 static int __base_pr(enum libbpf_print_level level, const char *format,
@@ -3029,7 +3031,8 @@ static int bpf_object_fixup_btf(struct bpf_object *obj)
 static bool prog_needs_vmlinux_btf(struct bpf_program *prog)
 {
 	if (prog->type == BPF_PROG_TYPE_STRUCT_OPS ||
-	    prog->type == BPF_PROG_TYPE_LSM)
+	    prog->type == BPF_PROG_TYPE_LSM ||
+	    prog->type == BPF_PROG_TYPE_SCHED)
 		return true;
 
 	/* BPF_PROG_TYPE_TRACING programs which do not attach to other programs
@@ -8764,6 +8767,7 @@ static int attach_kprobe_multi(const struct bpf_program *prog, long cookie, stru
 static int attach_uprobe_multi(const struct bpf_program *prog, long cookie, struct bpf_link **link);
 static int attach_lsm(const struct bpf_program *prog, long cookie, struct bpf_link **link);
 static int attach_iter(const struct bpf_program *prog, long cookie, struct bpf_link **link);
+static int attach_sched(const struct bpf_program *prog, long cookie, struct bpf_link **link);
 
 static const struct bpf_sec_def section_defs[] = {
 	SEC_DEF("socket",		SOCKET_FILTER, 0, SEC_NONE),
@@ -8858,6 +8862,7 @@ static const struct bpf_sec_def section_defs[] = {
 	SEC_DEF("struct_ops.s+",	STRUCT_OPS, 0, SEC_SLEEPABLE),
 	SEC_DEF("sk_lookup",		SK_LOOKUP, BPF_SK_LOOKUP, SEC_ATTACHABLE),
 	SEC_DEF("netfilter",		NETFILTER, BPF_NETFILTER, SEC_NONE),
+	SEC_DEF("sched/",		SCHED, BPF_SCHED, SEC_ATTACH_BTF, attach_sched),
 };
 
 int libbpf_register_prog_handler(const char *sec,
@@ -9237,6 +9242,7 @@ static int bpf_object__collect_st_ops_relos(struct bpf_object *obj,
 #define BTF_TRACE_PREFIX "btf_trace_"
 #define BTF_LSM_PREFIX "bpf_lsm_"
 #define BTF_ITER_PREFIX "bpf_iter_"
+#define BTF_SCHED_PREFIX "bpf_sched_"
 #define BTF_MAX_NAME_SIZE 128
 
 void btf_get_kernel_prefix_kind(enum bpf_attach_type attach_type,
@@ -9256,6 +9262,10 @@ void btf_get_kernel_prefix_kind(enum bpf_attach_type attach_type,
 		*prefix = BTF_ITER_PREFIX;
 		*kind = BTF_KIND_FUNC;
 		break;
+	case BPF_SCHED:
+		*prefix = BTF_SCHED_PREFIX;
+		*kind = BTF_KIND_FUNC;
+		break;
 	default:
 		*prefix = "";
 		*kind = BTF_KIND_FUNC;
@@ -12113,6 +12123,17 @@ struct bpf_link *bpf_program__attach_netfilter(const struct bpf_program *prog,
 	return link;
 }
 
+struct bpf_link *bpf_program__attach_sched(const struct bpf_program *prog)
+{
+	return bpf_program__attach_btf_id(prog, NULL);
+}
+
+static int attach_sched(const struct bpf_program *prog, long cookie, struct bpf_link **link)
+{
+	*link = bpf_program__attach_sched(prog);
+	return libbpf_get_error(*link);
+}
+
 struct bpf_link *bpf_program__attach(const struct bpf_program *prog)
 {
 	struct bpf_link *link = NULL;
diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h
index 0e52621cba435ae31feaa500300eb77a74e08f2a..aabdd973c1a589b5c2f89d90a939deeeec30cef6 100644
--- a/tools/lib/bpf/libbpf.h
+++ b/tools/lib/bpf/libbpf.h
@@ -769,6 +769,8 @@ bpf_program__attach_xdp(const struct bpf_program *prog, int ifindex);
 LIBBPF_API struct bpf_link *
 bpf_program__attach_freplace(const struct bpf_program *prog, int target_fd,
 			     const char *attach_func_name);
+LIBBPF_API struct bpf_link *
+bpf_program__attach_sched(const struct bpf_program *prog);
 
 struct bpf_netfilter_opts {
 	/* size of this struct, for forward/backward compatibility */
diff --git a/tools/lib/bpf/libbpf.map b/tools/lib/bpf/libbpf.map
index 57712321490f8e0d7569b0733d987b678827c6da..228ab00a5e6909d86e5071462c2ff4770420b33b 100644
--- a/tools/lib/bpf/libbpf.map
+++ b/tools/lib/bpf/libbpf.map
@@ -236,6 +236,7 @@ LIBBPF_0.2.0 {
 		perf_buffer__buffer_fd;
 		perf_buffer__epoll_fd;
 		perf_buffer__consume_buffer;
+		bpf_program__attach_sched;
 } LIBBPF_0.1.0;
 
 LIBBPF_0.3.0 {
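Finally, loading and attaching from userspace. With the SEC("sched/") handler registered above, a plain bpf_program__attach() would also work; the sketch below uses the new bpf_program__attach_sched() entry point explicitly. Illustrative only; the object and program names match the earlier select_rq example.

/* load_sched.c - illustrative only */
#include <errno.h>
#include <stdio.h>
#include <unistd.h>
#include <bpf/libbpf.h>

int main(void)
{
	struct bpf_object *obj;
	struct bpf_program *prog;
	struct bpf_link *link;
	int err;

	obj = bpf_object__open_file("select_rq.bpf.o", NULL);
	if (!obj)
		return 1;

	err = bpf_object__load(obj);
	if (err)
		goto out;

	prog = bpf_object__find_program_by_name(obj, "cfs_select_rq");
	if (!prog) {
		err = -ENOENT;
		goto out;
	}

	link = bpf_program__attach_sched(prog);
	err = libbpf_get_error(link);
	if (err) {
		fprintf(stderr, "attach failed: %d\n", err);
		goto out;
	}

	pause();	/* the policy stays active while the link exists */

	bpf_link__destroy(link);
out:
	bpf_object__close(obj);
	return err ? 1 : 0;
}

Attaching requires CAP_SYS_ADMIN (enforced in bpf_prog_attach_check_attach_type() above), and detaching — destroying the link — flips the bpf_sched_enabled_key static branch back off, so the hooks in fair.c revert to their nop defaults.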