diff --git a/arch/x86/configs/openeuler_defconfig b/arch/x86/configs/openeuler_defconfig index 300444226d6ab7efdad014e46cbf1d4dd9c9e6eb..81909a59d98d955164b3966ed7f3fd513de7940a 100644 --- a/arch/x86/configs/openeuler_defconfig +++ b/arch/x86/configs/openeuler_defconfig @@ -235,6 +235,7 @@ CONFIG_KALLSYMS_ALL=y CONFIG_KALLSYMS_ABSOLUTE_PERCPU=y CONFIG_KALLSYMS_BASE_RELATIVE=y # CONFIG_BPF_LSM is not set +CONFIG_BPF_SCHED=y CONFIG_BPF_SYSCALL=y CONFIG_ARCH_WANT_DEFAULT_BPF_JIT=y CONFIG_BPF_JIT_ALWAYS_ON=y diff --git a/fs/proc/base.c b/fs/proc/base.c index fe2633176ed5395eca696700e704ad321079ee1f..98bfd18e61bc359f682f048e5f26f9221864a190 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -3646,6 +3646,68 @@ static const struct inode_operations proc_tid_comm_inode_operations = { .permission = proc_tid_comm_permission, }; +#ifdef CONFIG_BPF_SCHED +static ssize_t pid_tag_write(struct file *file, const char __user *buf, + size_t count, loff_t *offset) +{ + struct inode *inode = file_inode(file); + struct task_struct *tsk; + char buffer[PROC_NUMBUF]; + int err = 0, tag = 0; + + tsk = get_proc_task(inode); + if (!tsk) + return -ESRCH; + + memset(buffer, 0, sizeof(buffer)); + if (count > sizeof(buffer) - 1) + count = sizeof(buffer) - 1; + + if (copy_from_user(buffer, buf, count)) { + err = -EFAULT; + goto out; + } + + err = kstrtoint(strstrip(buffer), 0, &tag); + if (err) + goto out; + + sched_settag(tsk, tag); + +out: + put_task_struct(tsk); + return err < 0 ? err : count; +} + +static int pid_tag_show(struct seq_file *m, void *v) +{ + struct inode *inode = m->private; + struct task_struct *tsk; + + tsk = get_proc_task(inode); + if (!tsk) + return -ESRCH; + + seq_printf(m, "%ld\n", tsk->tag); + put_task_struct(tsk); + + return 0; +} + +static int pid_tag_open(struct inode *inode, struct file *flip) +{ + return single_open(flip, pid_tag_show, inode); +} + +static const struct file_operations proc_pid_tag_operations = { + .open = pid_tag_open, + .read = seq_read, + .write = pid_tag_write, + .llseek = seq_lseek, + .release = single_release, +}; +#endif + /* * Tasks */ @@ -3755,6 +3817,9 @@ static const struct pid_entry tid_base_stuff[] = { #ifdef CONFIG_ASCEND_SHARE_POOL ONE("sp_group", 0444, proc_sp_group_state), #endif +#ifdef CONFIG_BPF_SCHED + REG("tag", 0644, proc_pid_tag_operations), +#endif }; static int proc_tid_base_readdir(struct file *file, struct dir_context *ctx) diff --git a/include/linux/bpf_sched.h b/include/linux/bpf_sched.h new file mode 100644 index 0000000000000000000000000000000000000000..9cd2493d2787ce9ea689d9c5d50d539453eb76f0 --- /dev/null +++ b/include/linux/bpf_sched.h @@ -0,0 +1,50 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_BPF_SCHED_H +#define _LINUX_BPF_SCHED_H + +#include + +#ifdef CONFIG_BPF_SCHED + +#include + +#define BPF_SCHED_HOOK(RET, DEFAULT, NAME, ...) 
\ + RET bpf_sched_##NAME(__VA_ARGS__); +#include +#undef BPF_SCHED_HOOK + +int bpf_sched_verify_prog(struct bpf_verifier_log *vlog, + const struct bpf_prog *prog); + +DECLARE_STATIC_KEY_FALSE(bpf_sched_enabled_key); + +static inline bool bpf_sched_enabled(void) +{ + return static_branch_unlikely(&bpf_sched_enabled_key); +} + +static inline void bpf_sched_inc(void) +{ + static_branch_inc(&bpf_sched_enabled_key); +} + +static inline void bpf_sched_dec(void) +{ + static_branch_dec(&bpf_sched_enabled_key); +} + +#else /* !CONFIG_BPF_SCHED */ + +static inline int bpf_sched_verify_prog(struct bpf_verifier_log *vlog, + const struct bpf_prog *prog) +{ + return -EOPNOTSUPP; +} + +static inline bool bpf_sched_enabled(void) +{ + return false; +} + +#endif /* CONFIG_BPF_SCHED */ +#endif /* _LINUX_BPF_SCHED_H */ diff --git a/include/linux/bpf_topology.h b/include/linux/bpf_topology.h new file mode 100644 index 0000000000000000000000000000000000000000..0c7ee492edde392c39d280ca8f8ff3cb4949e461 --- /dev/null +++ b/include/linux/bpf_topology.h @@ -0,0 +1,46 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +/* + * Copyright (C) 2022. Huawei Technologies Co., Ltd. All rights reserved. + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 and + * only version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#ifndef _LINUX_BPF_TOPOLOGY_H +#define _LINUX_BPF_TOPOLOGY_H + +#include + +struct bpf_cpu_topology { + int cpu; + int core_id; + int cluster_id; + int die_id; + int physical_package_id; + int numa_node; + struct cpumask thread_siblings; + struct cpumask core_siblings; + struct cpumask cluster_cpus; + struct cpumask die_cpus; + struct cpumask package_cpus; + struct cpumask node_cpu_lists; +}; + +struct bpf_cpumask_info { + unsigned int nums_possible_cpus; + unsigned int nums_active_cpus; + unsigned int nums_isolate_cpus; + unsigned int nr_cpu_ids; + unsigned int bpf_nr_cpumask_bits; + struct cpumask cpu_possible_cpumask; + struct cpumask cpu_active_cpumask; + struct cpumask cpu_isolate_cpumask; +}; + +#endif /* _LINUX_BPF_TOPOLOGY_H */ diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h index a8137bb6dd3c223e42426c3b6d6442af0d63c762..5732b485c53991f6c92322cd0a788e83d08b2349 100644 --- a/include/linux/bpf_types.h +++ b/include/linux/bpf_types.h @@ -77,6 +77,10 @@ BPF_PROG_TYPE(BPF_PROG_TYPE_LSM, lsm, void *, void *) #endif /* CONFIG_BPF_LSM */ #endif +#ifdef CONFIG_BPF_SCHED +BPF_PROG_TYPE(BPF_PROG_TYPE_SCHED, bpf_sched, + void *, void *) +#endif /* CONFIG_BPF_SCHED */ BPF_MAP_TYPE(BPF_MAP_TYPE_ARRAY, array_map_ops) BPF_MAP_TYPE(BPF_MAP_TYPE_PERCPU_ARRAY, percpu_array_map_ops) diff --git a/include/linux/sched.h b/include/linux/sched.h index 888e5cc2fb6a2651b574fe4f9df18f2ad7d10366..60749a11253a2eec0350026c32d2806989cec7ea 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1416,7 +1416,12 @@ struct task_struct { KABI_RESERVE(4) KABI_RESERVE(5) #endif +#ifdef CONFIG_BPF_SCHED + /* Used to pad the tag of a task */ + KABI_USE(6, long tag) +#else KABI_RESERVE(6) +#endif KABI_RESERVE(7) KABI_RESERVE(8) KABI_RESERVE(9) @@ -2201,4 +2206,96 @@ static inline int sched_qos_cpu_overload(void) return 0; } #endif + +#ifdef CONFIG_BPF_SCHED +extern void 
sched_settag(struct task_struct *tsk, s64 tag); + +struct bpf_sched_cpu_stats { + /* load/util */ + unsigned long cfs_load_avg; + unsigned long cfs_runnable_avg; + unsigned long cfs_util_avg; + unsigned long rt_load_avg; + unsigned long rt_runnable_avg; + unsigned long rt_util_avg; + unsigned long irq_load_avg; + unsigned long irq_runnable_avg; + unsigned long irq_util_avg; + + /* nr_running */ + unsigned int nr_running; + unsigned int cfs_nr_running; + unsigned int cfs_h_nr_running; + unsigned int cfs_idle_h_nr_running; + unsigned int rt_nr_running; + unsigned int rr_nr_running; + + /* idle statistics */ + int available_idle; + unsigned int exit_latency; + unsigned long idle_stamp; + unsigned long avg_idle; + + /* capacity */ + unsigned long capacity; + unsigned long capacity_orig; + + KABI_RESERVE(1) + KABI_RESERVE(2) + KABI_RESERVE(3) + KABI_RESERVE(4) +}; + +struct cpumask_op_args { + unsigned int op_type; + void *arg1; + void *arg2; + void *arg3; + void *arg4; +}; + +enum cpumask_op_type { + CPUMASK_EMPTY, + CPUMASK_AND, + CPUMASK_ANDNOT, + CPUMASK_SUBSET, + CPUMASK_EQUAL, + CPUMASK_TEST_CPU, + CPUMASK_COPY, + CPUMASK_WEIGHT, + CPUMASK_NEXT, + CPUMASK_NEXT_WRAP, + CPUMASK_NEXT_AND, + CPUMASK_CPULIST_PARSE +}; + +struct sched_migrate_ctx { + struct task_struct *task; + struct cpumask *select_idle_mask; + int prev_cpu; + int curr_cpu; + int is_sync; + int want_affine; + int wake_flags; + int sd_flag; + int new_cpu; + + KABI_RESERVE(1) + KABI_RESERVE(2) + KABI_RESERVE(3) + KABI_RESERVE(4) +}; + +struct sched_affine_ctx { + struct task_struct *task; + int prev_cpu; + int curr_cpu; + int is_sync; + + KABI_RESERVE(1) + KABI_RESERVE(2) + KABI_RESERVE(3) + KABI_RESERVE(4) +}; +#endif #endif diff --git a/include/linux/sched_hook_defs.h b/include/linux/sched_hook_defs.h new file mode 100644 index 0000000000000000000000000000000000000000..818b1244a018f55d774b8929bf4de9c7b0b80224 --- /dev/null +++ b/include/linux/sched_hook_defs.h @@ -0,0 +1,12 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +BPF_SCHED_HOOK(int, 0, cfs_check_preempt_tick, struct sched_entity *curr, unsigned long delta_exec) +BPF_SCHED_HOOK(int, 0, cfs_check_preempt_wakeup, struct task_struct *curr, struct task_struct *p) +BPF_SCHED_HOOK(int, 0, cfs_wakeup_preempt_entity, struct sched_entity *curr, + struct sched_entity *se) +BPF_SCHED_HOOK(int, 0, cfs_tag_pick_next_entity, struct sched_entity *curr, + struct sched_entity *next) +BPF_SCHED_HOOK(void, (void) 0, cfs_enqueue_task, struct rq *rq, struct task_struct *p) +BPF_SCHED_HOOK(void, (void) 0, cfs_dequeue_task, struct rq *rq, struct task_struct *p) +BPF_SCHED_HOOK(int, -1, cfs_select_rq, struct sched_migrate_ctx *ctx) +BPF_SCHED_HOOK(int, -1, cfs_wake_affine, struct sched_affine_ctx *ctx) +BPF_SCHED_HOOK(int, -1, cfs_select_rq_exit, struct sched_migrate_ctx *ctx) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index bd566bfc843f78806c6022e98f67a56fb5e0a41c..374250bcf9d16fb3fecc04b7559b434995ad0513 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -10,6 +10,7 @@ #include #include +#include /* Extended instruction set based on top of classic BPF */ @@ -199,6 +200,7 @@ enum bpf_prog_type { BPF_PROG_TYPE_EXT, BPF_PROG_TYPE_LSM, BPF_PROG_TYPE_SK_LOOKUP, + KABI_EXTEND_ENUM(BPF_PROG_TYPE_SCHED) }; enum bpf_attach_type { @@ -240,6 +242,7 @@ enum bpf_attach_type { BPF_XDP_CPUMAP, BPF_SK_LOOKUP, BPF_XDP, + KABI_BROKEN_INSERT_ENUM(BPF_SCHED) __MAX_BPF_ATTACH_TYPE }; @@ -3755,6 +3758,117 @@ union bpf_attr { * Get Ipv4 origdst or replysrc. Works with IPv4. 
 *	Return
 *		0 on success, or a negative error in case of failure.
+ *
+ * long bpf_sched_tg_tag_of(struct task_group *tg)
+ *	Description
+ *		Return the task group tag of *tg* if CONFIG_CGROUP_SCHED is
+ *		enabled. The bpf prog obtains the tags to detect different
+ *		workloads.
+ *	Return
+ *		The task group tag (0 is the default tag) if
+ *		CONFIG_CGROUP_SCHED is enabled, or a negative error in case
+ *		of failure.
+ *
+ * long bpf_sched_task_tag_of(struct task_struct *tsk)
+ *	Description
+ *		Return the task tag of *tsk*. The bpf prog obtains the tags
+ *		to detect different workloads.
+ *	Return
+ *		The task tag (0 is the default tag), or a negative error in
+ *		case of failure.
+ *
+ * int bpf_sched_set_tg_tag(struct task_group *tg, s64 tag)
+ *	Description
+ *		Set *tag* on *tg* and all of its descendants.
+ *	Return
+ *		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_sched_set_task_tag(struct task_struct *tsk, s64 tag)
+ *	Description
+ *		Set *tag* on *tsk*.
+ *	Return
+ *		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_sched_cpu_stats_of(int cpu, struct bpf_sched_cpu_stats *ctx, int len)
+ *	Description
+ *		Get multiple types of *cpu* statistics and store them in *ctx*.
+ *	Return
+ *		0 on success, or a negative error in case of failure.
+ *
+ * long bpf_init_cpu_topology(struct bpf_map *map)
+ *	Description
+ *		Initialize the cpu topology map used by bpf progs.
+ *	Return
+ *		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_get_cpumask_info(struct bpf_map *map, struct bpf_cpumask_info *cpus)
+ *	Description
+ *		Get the system cpumasks and counts, returned in *cpus*.
+ *	Return
+ *		0 on success, or a negative error in case of failure.
+ *
+ * long bpf_sched_entity_is_task(struct sched_entity *se)
+ *	Description
+ *		Check whether the sched entity *se* is a task.
+ *	Return
+ *		1 if true, 0 otherwise.
+ *
+ * struct task_struct *bpf_sched_entity_to_task(struct sched_entity *se)
+ *	Description
+ *		Return the task_struct of *se* if *se* is a task.
+ *	Return
+ *		The task_struct if *se* is a task, NULL otherwise.
+ *
+ * struct task_group *bpf_sched_entity_to_tg(struct sched_entity *se)
+ *	Description
+ *		Return the task group of *se* if *se* represents a task group.
+ *	Return
+ *		The task_group if *se* is a task group, NULL otherwise.
+ *
+ * int bpf_cpumask_op(struct cpumask_op_args *op, int len)
+ *	Description
+ *		A family of cpumask operations. The operation performed is
+ *		selected by *op*->op_type; the caller must also fill the
+ *		other *op* fields required by that operation. *op*->op_type
+ *		is one of the following:
+ *
+ *		**CPUMASK_EMPTY**
+ *			*(op->arg1) == 0 returned.
+ * **CPUMASK_AND** + * *(op->arg1) = *(op->arg2) & *(op->arg3) + * **CPUMASK_ANDNOT** + * *(op->arg1) = *(op->arg2) & ~*(op->arg3) + * **CPUMASK_SUBSET** + * *(op->arg1) & ~*(op->arg2) == 0 returned + * **CPUMASK_EQUAL** + * *(op->arg1) == *(op->arg2) returned + * **CPUMASK_TEST_CPU** + * test for a cpu *(int)(op->arg1) in *(op->arg2) + * returns 1 if *op*->arg1 is set in *op*->arg2, else returns 0 + * **CPUMASK_COPY** + * *(op->arg1) = *(op->arg2), return 0 always + * **CPUMASK_WEIGHT** + * count of bits in *(op->arg1) + * **CPUMASK_NEXT** + * get the next cpu in *(struct cpumask *)(op->arg2) + * *(int *)(op->arg1): the cpu prior to the place to search + * **CPUMASK_NEXT_WRAP** + * helper to implement for_each_cpu_wrap + * @op->arg1: the cpu prior to the place to search + * @op->arg2: the cpumask pointer + * @op->arg3: the start point of the iteration + * @op->arg4: assume @op->arg1 crossing @op->arg3 terminates the iteration + * returns >= nr_cpu_ids on completion + * **CPUMASK_NEXT_AND** + * get the next cpu in *(op->arg1) & *(op->arg2) + * **CPUMASK_CPULIST_PARSE** + * extract a cpumask from a user string of ranges. + * (char *)op->arg1 -> (struct cpumask *)(op->arg2) + * 0 on success, or a negative error in case of failure. + * Return + * View above. + * + * int bpf_cpus_share_cache(int src_cpu, int dst_cpu) + * Description + * check src_cpu whether share cache with dst_cpu. + * Return + * yes 1, no 0. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3915,6 +4029,18 @@ union bpf_attr { FN(redirect_peer), \ FN(get_sockops_uid_gid), \ FN(sk_original_addr), \ + FN(sched_tg_tag_of), \ + FN(sched_task_tag_of), \ + FN(sched_set_tg_tag), \ + FN(sched_set_task_tag), \ + FN(sched_cpu_stats_of), \ + FN(init_cpu_topology), \ + FN(get_cpumask_info), \ + FN(sched_entity_is_task), \ + FN(sched_entity_to_task), \ + FN(sched_entity_to_tg), \ + FN(cpumask_op), \ + FN(cpus_share_cache), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper diff --git a/init/Kconfig b/init/Kconfig index beb4a6d1cbcb896bcf6fda7492b6ce831cdd522b..3e9c0b2e09849f4204073942faf6644455519c04 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1755,6 +1755,16 @@ config BPF_LSM If you are unsure how to answer this question, answer N. +config BPF_SCHED + bool "SCHED Instrumentation with BPF" + depends on BPF_EVENTS + depends on BPF_SYSCALL + help + Enables instrumentation of the sched hooks with eBPF programs for + implementing dynamic scheduling policies. + + If you are unsure how to answer this question, answer N. + config BPF_SYSCALL bool "Enable bpf() system call" select BPF diff --git a/init/init_task.c b/init/init_task.c index 891007de2eef4f5a1fa3f00d2d84fc97c989cd49..49418809de269f59944026ac70aad654da40af2e 100644 --- a/init/init_task.c +++ b/init/init_task.c @@ -216,6 +216,9 @@ struct task_struct init_task #endif #ifdef CONFIG_SECCOMP_FILTER .seccomp = { .filter_count = ATOMIC_INIT(0) }, +#endif +#ifdef CONFIG_BPF_SCHED + .tag = 0, #endif ._resvd = &init_task_struct_resvd, }; diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index fba28f17e61aa7b353cb8d42b3fef9eae740c0d3..9a0a9895ec62ad071a319eeb0e72aa16433e4dd8 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -4479,6 +4479,7 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, return true; t = btf_type_by_id(btf, t->type); break; + case BPF_SCHED: case BPF_MODIFY_RETURN: /* For now the BPF_MODIFY_RETURN can only be attached to * functions that return an int. 
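
As a rough illustration of how the tag helpers documented above are meant to be consumed, here is a minimal sketch (not part of this patch) of a sched BPF program that drives the cfs_check_preempt_wakeup hook from task tags. The section name follows the SEC("sched/") convention added to libbpf later in this diff, TAG_LATENCY_SENSITIVE is an arbitrary example value, and the helper declarations are assumed to come from a bpf_helper_defs.h regenerated from the documentation above together with a vmlinux.h built from a CONFIG_BPF_SCHED=y kernel.

```c
// SPDX-License-Identifier: GPL-2.0
/* Example only (not part of this patch): tag-driven wakeup preemption. */
#include "vmlinux.h"		/* assumed: BTF dump of a CONFIG_BPF_SCHED kernel */
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

#define TAG_LATENCY_SENSITIVE	1	/* example tag value set via cpu.tag / proc tag */

SEC("sched/cfs_check_preempt_wakeup")
int BPF_PROG(tag_preempt_wakeup, struct task_struct *curr, struct task_struct *p)
{
	long curr_tag = bpf_sched_task_tag_of(curr);
	long wakee_tag = bpf_sched_task_tag_of(p);

	if (wakee_tag == TAG_LATENCY_SENSITIVE && curr_tag != TAG_LATENCY_SENSITIVE)
		return 1;	/* > 0: preempt the current task */
	if (curr_tag == TAG_LATENCY_SENSITIVE && wakee_tag != TAG_LATENCY_SENSITIVE)
		return -1;	/* < 0: suppress preemption */
	return 0;		/* 0: fall back to the default CFS decision */
}

char _license[] SEC("license") = "GPL";
```

The return-value convention (negative suppresses preemption, positive forces it, zero defers to CFS) mirrors the call site added in kernel/sched/fair.c further down, and the program only loads if its attach point is one of the bpf_sched_ stubs registered in kernel/sched/bpf_sched.c.
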
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 4bb5921a7d2177b3c11883a15bee72f13207f0fe..5fccf33196b5dff4c249e4fc608128f076348342 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -658,6 +658,10 @@ const struct bpf_func_proto bpf_probe_read_user_proto __weak; const struct bpf_func_proto bpf_probe_read_user_str_proto __weak; const struct bpf_func_proto bpf_probe_read_kernel_proto __weak; const struct bpf_func_proto bpf_probe_read_kernel_str_proto __weak; +const struct bpf_func_proto bpf_sched_tg_tag_of_proto __weak; +const struct bpf_func_proto bpf_sched_task_tag_of_proto __weak; +const struct bpf_func_proto bpf_sched_set_tg_tag_proto __weak; +const struct bpf_func_proto bpf_sched_set_task_tag_proto __weak; const struct bpf_func_proto * bpf_base_func_proto(enum bpf_func_id func_id) @@ -697,6 +701,10 @@ bpf_base_func_proto(enum bpf_func_id func_id) return &bpf_ringbuf_discard_proto; case BPF_FUNC_ringbuf_query: return &bpf_ringbuf_query_proto; + case BPF_FUNC_sched_tg_tag_of: + return &bpf_sched_tg_tag_of_proto; + case BPF_FUNC_sched_task_tag_of: + return &bpf_sched_task_tag_of_proto; default: break; } @@ -715,6 +723,10 @@ bpf_base_func_proto(enum bpf_func_id func_id) return &bpf_per_cpu_ptr_proto; case BPF_FUNC_this_cpu_ptr: return &bpf_this_cpu_ptr_proto; + case BPF_FUNC_sched_set_tg_tag: + return &bpf_sched_set_tg_tag_proto; + case BPF_FUNC_sched_set_task_tag: + return &bpf_sched_set_task_tag_proto; default: break; } diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 419dbc3d060ee1fea6835a59291bf36b9c570d7e..2f4091da923fb1e00038f201bb18e192850d0b17 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -31,6 +31,7 @@ #include #include #include +#include #define IS_FD_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || \ (map)->map_type == BPF_MAP_TYPE_CGROUP_ARRAY || \ @@ -1997,6 +1998,7 @@ bpf_prog_load_check_attach(enum bpf_prog_type prog_type, case BPF_PROG_TYPE_LSM: case BPF_PROG_TYPE_STRUCT_OPS: case BPF_PROG_TYPE_EXT: + case BPF_PROG_TYPE_SCHED: break; default: return -EINVAL; @@ -2108,6 +2110,7 @@ static bool is_perfmon_prog_type(enum bpf_prog_type prog_type) case BPF_PROG_TYPE_LSM: case BPF_PROG_TYPE_STRUCT_OPS: /* has access to struct sock */ case BPF_PROG_TYPE_EXT: /* extends any prog */ + case BPF_PROG_TYPE_SCHED: return true; default: return false; @@ -2529,6 +2532,11 @@ static void bpf_tracing_link_release(struct bpf_link *link) struct bpf_tracing_link *tr_link = container_of(link, struct bpf_tracing_link, link); +#ifdef CONFIG_BPF_SCHED + if (link->prog->type == BPF_PROG_TYPE_SCHED) + bpf_sched_dec(); +#endif + WARN_ON_ONCE(bpf_trampoline_unlink_prog(link->prog, tr_link->trampoline)); @@ -2608,6 +2616,12 @@ static int bpf_tracing_prog_attach(struct bpf_prog *prog, goto out_put_prog; } break; + case BPF_PROG_TYPE_SCHED: + if (prog->expected_attach_type != BPF_SCHED) { + err = -EINVAL; + goto out_put_prog; + } + break; default: err = -EINVAL; goto out_put_prog; @@ -2710,6 +2724,11 @@ static int bpf_tracing_prog_attach(struct bpf_prog *prog, goto out_unlock; } +#ifdef CONFIG_BPF_SCHED + if (prog->type == BPF_PROG_TYPE_SCHED) + bpf_sched_inc(); +#endif + link->tgt_prog = tgt_prog; link->trampoline = tr; @@ -2838,6 +2857,7 @@ static int bpf_raw_tracepoint_open(const union bpf_attr *attr) case BPF_PROG_TYPE_TRACING: case BPF_PROG_TYPE_EXT: case BPF_PROG_TYPE_LSM: + case BPF_PROG_TYPE_SCHED: if (attr->raw_tracepoint.name) { /* The attach point for this category of programs * should be specified via btf_id during program 
load. diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c index 87becf77cc759f1155cf36fe6ebad225866519ed..81d008130cb1a1ca215cd1a4308fe1fd0a4803d6 100644 --- a/kernel/bpf/trampoline.c +++ b/kernel/bpf/trampoline.c @@ -357,6 +357,7 @@ static enum bpf_tramp_prog_type bpf_attach_type_to_tramp(struct bpf_prog *prog) switch (prog->expected_attach_type) { case BPF_TRACE_FENTRY: return BPF_TRAMP_FENTRY; + case BPF_SCHED: case BPF_MODIFY_RETURN: return BPF_TRAMP_MODIFY_RETURN; case BPF_TRACE_FEXIT: diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index fcc8133b3708c87f64bac1fe5dc1d137b1460cdd..c91533a54121781f808abd0b76110a905ed31760 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -22,6 +22,7 @@ #include #include #include +#include #include "disasm.h" @@ -5018,10 +5019,10 @@ static bool check_btf_id_ok(const struct bpf_func_proto *fn) int i; for (i = 0; i < ARRAY_SIZE(fn->arg_type); i++) { - if (fn->arg_type[i] == ARG_PTR_TO_BTF_ID && !fn->arg_btf_id[i]) + if (base_type(fn->arg_type[i]) == ARG_PTR_TO_BTF_ID && !fn->arg_btf_id[i]) return false; - if (fn->arg_type[i] != ARG_PTR_TO_BTF_ID && fn->arg_btf_id[i]) + if (base_type(fn->arg_type[i]) != ARG_PTR_TO_BTF_ID && fn->arg_btf_id[i]) return false; } @@ -12155,6 +12156,7 @@ int bpf_check_attach_target(struct bpf_verifier_log *log, case BPF_LSM_MAC: case BPF_TRACE_FENTRY: case BPF_TRACE_FEXIT: + case BPF_SCHED: if (!btf_type_is_func(t)) { bpf_log(log, "attach_btf_id %u is not a function\n", btf_id); @@ -12260,7 +12262,8 @@ static int check_attach_btf_id(struct bpf_verifier_env *env) if (prog->type != BPF_PROG_TYPE_TRACING && prog->type != BPF_PROG_TYPE_LSM && - prog->type != BPF_PROG_TYPE_EXT) + prog->type != BPF_PROG_TYPE_EXT && + prog->type != BPF_PROG_TYPE_SCHED) return 0; ret = bpf_check_attach_target(&env->log, prog, tgt_prog, btf_id, &tgt_info); @@ -12300,6 +12303,12 @@ static int check_attach_btf_id(struct bpf_verifier_env *env) return ret; } + if (prog->type == BPF_PROG_TYPE_SCHED) { + ret = bpf_sched_verify_prog(&env->log, prog); + if (ret < 0) + return ret; + } + key = bpf_trampoline_compute_key(tgt_prog, btf_id); tr = bpf_trampoline_get(key, &tgt_info); if (!tr) diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index 978fcfca5871d76524efb5268288f59dd5e20f71..6f3106774d055ba3568c43afc86d0a8a84e8e405 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile @@ -37,3 +37,5 @@ obj-$(CONFIG_MEMBARRIER) += membarrier.o obj-$(CONFIG_CPU_ISOLATION) += isolation.o obj-$(CONFIG_PSI) += psi.o obj-$(CONFIG_SCHED_CORE) += core_sched.o +obj-$(CONFIG_BPF_SCHED) += bpf_sched.o +obj-$(CONFIG_BPF_SCHED) += bpf_topology.o diff --git a/kernel/sched/bpf_sched.c b/kernel/sched/bpf_sched.c new file mode 100644 index 0000000000000000000000000000000000000000..220ba83fc5f4a0fae183db3d32b320ad9f30ab66 --- /dev/null +++ b/kernel/sched/bpf_sched.c @@ -0,0 +1,394 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include +#include +#include +#include "sched.h" + +DEFINE_STATIC_KEY_FALSE(bpf_sched_enabled_key); + +/* + * For every hook declare a nop function where a BPF program can be attached. + */ +#define BPF_SCHED_HOOK(RET, DEFAULT, NAME, ...) \ +noinline RET bpf_sched_##NAME(__VA_ARGS__) \ +{ \ + return DEFAULT; \ +} + +#include +#undef BPF_SCHED_HOOK + +#define BPF_SCHED_HOOK(RET, DEFAULT, NAME, ...) 
BTF_ID(func, bpf_sched_##NAME) +BTF_SET_START(bpf_sched_hooks) +#include +#undef BPF_SCHED_HOOK +BTF_SET_END(bpf_sched_hooks) + +const struct bpf_func_proto bpf_init_cpu_topology_proto __weak; +const struct bpf_func_proto bpf_get_cpumask_info_proto __weak; + +int bpf_sched_verify_prog(struct bpf_verifier_log *vlog, + const struct bpf_prog *prog) +{ + if (!prog->gpl_compatible) { + bpf_log(vlog, + "sched programs must have a GPL compatible license\n"); + return -EINVAL; + } + + if (!btf_id_set_contains(&bpf_sched_hooks, prog->aux->attach_btf_id)) { + bpf_log(vlog, "attach_btf_id %u points to wrong type name %s\n", + prog->aux->attach_btf_id, prog->aux->attach_func_name); + return -EINVAL; + } + + return 0; +} + +BPF_CALL_3(bpf_sched_cpu_stats_of, int, cpu, + struct bpf_sched_cpu_stats *, ctx, + int, len) +{ + struct cpuidle_state *idle; + struct rq *rq; + + if (len != sizeof(*ctx)) + return -EINVAL; + + if ((unsigned int)cpu >= nr_cpu_ids) + return -EINVAL; + + rq = cpu_rq(cpu); + memset(ctx, 0, sizeof(struct bpf_sched_cpu_stats)); + + /* load/util */ +#ifdef CONFIG_SMP + SCHED_WARN_ON(!rcu_read_lock_held()); + ctx->cfs_load_avg = rq->cfs.avg.load_avg; + ctx->cfs_runnable_avg = rq->cfs.avg.runnable_avg; + ctx->cfs_util_avg = rq->cfs.avg.util_avg; + ctx->rt_load_avg = rq->avg_rt.load_avg; + ctx->rt_runnable_avg = rq->avg_rt.runnable_avg; + ctx->rt_util_avg = rq->avg_rt.util_avg; +#ifdef CONFIG_HAVE_SCHED_AVG_IRQ + ctx->irq_load_avg = rq->avg_irq.load_avg; + ctx->irq_runnable_avg = rq->avg_irq.runnable_avg; + ctx->irq_util_avg = rq->avg_irq.util_avg; +#endif +#endif + + /* nr_running */ + ctx->nr_running = rq->nr_running; + ctx->cfs_nr_running = rq->cfs.nr_running; + ctx->cfs_h_nr_running = rq->cfs.h_nr_running; + ctx->cfs_idle_h_nr_running = rq->cfs.idle_h_nr_running; + ctx->rt_nr_running = rq->rt.rt_nr_running; + ctx->rr_nr_running = rq->rt.rr_nr_running; + + /* idle statistics */ + ctx->available_idle = available_idle_cpu(cpu); + idle = idle_get_state(rq); + if (idle) + ctx->exit_latency = idle->exit_latency; +#ifdef CONFIG_SMP + ctx->idle_stamp = rq->idle_stamp; + ctx->avg_idle = rq->avg_idle; +#endif + + /* capacity */ +#ifdef CONFIG_SMP + ctx->capacity = rq->cpu_capacity; + ctx->capacity_orig = rq->cpu_capacity_orig; +#endif + + return 0; +} + +static const struct bpf_func_proto bpf_sched_cpu_stats_of_proto = { + .func = bpf_sched_cpu_stats_of, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_ANYTHING, + .arg2_type = ARG_PTR_TO_UNINIT_MEM, + .arg3_type = ARG_CONST_SIZE, +}; + +BTF_ID_LIST_SINGLE(btf_sched_entity_ids, struct, sched_entity) +BTF_ID_LIST_SINGLE(btf_sched_task_ids, struct, task_struct) +BTF_ID_LIST_SINGLE(btf_sched_tg_ids, struct, task_group) + +BPF_CALL_1(bpf_sched_entity_is_task, struct sched_entity *, se) +{ + return entity_is_task(se) ? 
1 : 0; +} + +static const struct bpf_func_proto bpf_sched_entity_is_task_proto = { + .func = bpf_sched_entity_is_task, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_BTF_ID, + .arg1_btf_id = &btf_sched_entity_ids[0], +}; + +BPF_CALL_1(bpf_sched_entity_to_task, struct sched_entity *, se) +{ + if (entity_is_task(se)) { + struct task_struct *tsk = task_of(se); + + return (unsigned long)tsk; + } + + return (unsigned long)NULL; +} + +static const struct bpf_func_proto bpf_sched_entity_to_task_proto = { + .func = bpf_sched_entity_to_task, + .gpl_only = false, + .ret_type = RET_PTR_TO_BTF_ID_OR_NULL, + .ret_btf_id = &btf_sched_task_ids[0], + .arg1_type = ARG_PTR_TO_BTF_ID, + .arg1_btf_id = &btf_sched_entity_ids[0], +}; + +BPF_CALL_1(bpf_sched_entity_to_tg, struct sched_entity *, se) +{ +#if CONFIG_FAIR_GROUP_SCHED + if (!entity_is_task(se)) { + struct task_group *tg = group_cfs_rq(se)->tg; + + return (unsigned long)tg; + } +#endif + + return (unsigned long)NULL; +} + +static const struct bpf_func_proto bpf_sched_entity_to_tg_proto = { + .func = bpf_sched_entity_to_tg, + .gpl_only = false, + .ret_type = RET_PTR_TO_BTF_ID_OR_NULL, + .ret_btf_id = &btf_sched_tg_ids[0], + .arg1_type = ARG_PTR_TO_BTF_ID, + .arg1_btf_id = &btf_sched_entity_ids[0], +}; + +BPF_CALL_2(bpf_cpumask_op, struct cpumask_op_args *, op, int, len) +{ + int ret; + + if (len != sizeof(*op) || !op->arg1) + return -EINVAL; + + switch (op->op_type) { + case CPUMASK_EMPTY: + return cpumask_empty((const struct cpumask *)op->arg1); + case CPUMASK_AND: + if (!op->arg2 || !op->arg3) + return -EINVAL; + return cpumask_and((struct cpumask *)op->arg1, + (const struct cpumask *)op->arg2, + (const struct cpumask *)op->arg3); + case CPUMASK_ANDNOT: + if (!op->arg2 || !op->arg3) + return -EINVAL; + cpumask_andnot((struct cpumask *)op->arg1, + (const struct cpumask *)op->arg2, + (const struct cpumask *)op->arg3); + break; + case CPUMASK_SUBSET: + if (!op->arg2) + return -EINVAL; + return cpumask_subset((const struct cpumask *)op->arg1, + (const struct cpumask *)op->arg2); + case CPUMASK_EQUAL: + if (!op->arg2) + return -EINVAL; + return cpumask_equal((const struct cpumask *)op->arg1, + (const struct cpumask *)op->arg2); + case CPUMASK_TEST_CPU: + if (!op->arg2) + return -EINVAL; + return cpumask_test_cpu(*(int *)op->arg1, op->arg2); + case CPUMASK_COPY: + if (!op->arg2) + return -EINVAL; + cpumask_copy((struct cpumask *)op->arg1, + (const struct cpumask *)op->arg2); + break; + case CPUMASK_WEIGHT: + return cpumask_weight((const struct cpumask *)op->arg1); + case CPUMASK_NEXT: + if (!op->arg2) + return -EINVAL; + return cpumask_next(*(int *)op->arg1, + (const struct cpumask *)op->arg2); + case CPUMASK_NEXT_WRAP: + if (!op->arg2 || !op->arg3 || !op->arg4) + return -EINVAL; + return cpumask_next_wrap(*(int *)op->arg1, + (const struct cpumask *)op->arg2, + *(int *)op->arg3, *(int *)op->arg4); + case CPUMASK_NEXT_AND: + if (!op->arg2 || !op->arg3) + return -EINVAL; + return cpumask_next_and(*(int *)op->arg1, + (const struct cpumask *)op->arg2, + (const struct cpumask *)op->arg3); + case CPUMASK_CPULIST_PARSE: + if (!op->arg2) + return -EINVAL; + + op->arg1 = (void *)strstrip((void *)op->arg1); + ret = cpulist_parse((void *)op->arg1, + (struct cpumask *)op->arg2); + return ret; + default: + return -EINVAL; + } + + return 0; +} + +static const struct bpf_func_proto bpf_cpumask_op_proto = { + .func = bpf_cpumask_op, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_MEM, + .arg2_type = ARG_CONST_SIZE, +}; 
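
To make the intended use of bpf_sched_cpu_stats_of() concrete, the following hedged sketch (again, not part of the patch) shows a cfs_wake_affine program that keeps the wakee on its previous CPU while that CPU is still idle. It assumes vmlinux.h from a CONFIG_BPF_SCHED=y kernel and direct BTF-typed access to the context fields; the return convention (a valid CPU number is used directly, anything else falls back to the kernel's wake_affine()) follows the call site added in kernel/sched/fair.c below.

```c
// SPDX-License-Identifier: GPL-2.0
/* Example only: prefer the previous CPU on wakeup while it is still idle. */
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

SEC("sched/cfs_wake_affine")
int BPF_PROG(prefer_idle_prev_cpu, struct sched_affine_ctx *ctx)
{
	struct bpf_sched_cpu_stats stats = {};
	int prev_cpu = ctx->prev_cpu;

	if (bpf_sched_cpu_stats_of(prev_cpu, &stats, sizeof(stats)))
		return -1;		/* helper failed: keep the default policy */

	if (stats.available_idle)
		return prev_cpu;	/* valid CPU number: wake up here */

	return -1;			/* out of range: let wake_affine() decide */
}

char _license[] SEC("license") = "GPL";
```
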
+ +BPF_CALL_2(bpf_cpus_share_cache, int, src_cpu, int, dst_cpu) +{ + if ((unsigned int)src_cpu >= nr_cpu_ids || + (unsigned int)dst_cpu >= nr_cpu_ids) + return 0; + + return cpus_share_cache(src_cpu, dst_cpu); +} + +static const struct bpf_func_proto bpf_cpus_share_cache_proto = { + .func = bpf_cpus_share_cache, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_ANYTHING, + .arg2_type = ARG_ANYTHING, +}; + +static const struct bpf_func_proto * +bpf_sched_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) +{ + switch (func_id) { + case BPF_FUNC_trace_printk: + return bpf_get_trace_printk_proto(); + case BPF_FUNC_sched_cpu_stats_of: + return &bpf_sched_cpu_stats_of_proto; + case BPF_FUNC_init_cpu_topology: + return &bpf_init_cpu_topology_proto; + case BPF_FUNC_get_cpumask_info: + return &bpf_get_cpumask_info_proto; + case BPF_FUNC_sched_entity_is_task: + return &bpf_sched_entity_is_task_proto; + case BPF_FUNC_sched_entity_to_task: + return &bpf_sched_entity_to_task_proto; + case BPF_FUNC_sched_entity_to_tg: + return &bpf_sched_entity_to_tg_proto; + case BPF_FUNC_cpumask_op: + return &bpf_cpumask_op_proto; + case BPF_FUNC_cpus_share_cache: + return &bpf_cpus_share_cache_proto; + default: + return bpf_base_func_proto(func_id); + } +} + +const struct bpf_prog_ops bpf_sched_prog_ops = { +}; + +const struct bpf_verifier_ops bpf_sched_verifier_ops = { + .get_func_proto = bpf_sched_func_proto, + .is_valid_access = btf_ctx_access, +}; + +BPF_CALL_1(bpf_sched_tg_tag_of, struct task_group *, tg) +{ + int ret = 0; + +#ifdef CONFIG_CGROUP_SCHED + if (tg == NULL) + return -EINVAL; + ret = tg->tag; +#endif + + return ret; +} + +const struct bpf_func_proto bpf_sched_tg_tag_of_proto = { + .func = bpf_sched_tg_tag_of, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = PTR_MAYBE_NULL | ARG_PTR_TO_BTF_ID, + .arg1_btf_id = &btf_sched_tg_ids[0], +}; + +BPF_CALL_1(bpf_sched_task_tag_of, struct task_struct *, tsk) +{ + if (tsk == NULL) + return -EINVAL; + return tsk->tag; +} + +const struct bpf_func_proto bpf_sched_task_tag_of_proto = { + .func = bpf_sched_task_tag_of, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = PTR_MAYBE_NULL | ARG_PTR_TO_BTF_ID, + .arg1_btf_id = &btf_sched_task_ids[0], +}; + +BPF_CALL_2(bpf_sched_set_tg_tag, struct task_group *, tg, s64, tag) +{ +#if CONFIG_CGROUP_SCHED + if (tg == NULL || tg == &root_task_group) + return -EINVAL; + + if (tg->tag == tag) + return 0; + + rcu_read_lock(); + walk_tg_tree_from(tg, tg_change_tag, tg_nop, (void *)(&tag)); + rcu_read_unlock(); + + return 0; +#endif + return -EPERM; +} + +const struct bpf_func_proto bpf_sched_set_tg_tag_proto = { + .func = bpf_sched_set_tg_tag, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = PTR_MAYBE_NULL | ARG_PTR_TO_BTF_ID, + .arg1_btf_id = &btf_sched_tg_ids[0], + .arg2_type = ARG_ANYTHING, +}; + +BPF_CALL_2(bpf_sched_set_task_tag, struct task_struct *, tsk, s64, tag) +{ + if (tsk == NULL) + return -EINVAL; + + sched_settag(tsk, tag); + return 0; +} + +const struct bpf_func_proto bpf_sched_set_task_tag_proto = { + .func = bpf_sched_set_task_tag, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = PTR_MAYBE_NULL | ARG_PTR_TO_BTF_ID, + .arg1_btf_id = &btf_sched_task_ids[0], + .arg2_type = ARG_ANYTHING, +}; diff --git a/kernel/sched/bpf_topology.c b/kernel/sched/bpf_topology.c new file mode 100644 index 0000000000000000000000000000000000000000..a843aebd0c14f1e81f82bb25f5e0a5c082a07961 --- /dev/null +++ b/kernel/sched/bpf_topology.c @@ -0,0 +1,89 @@ 
+// SPDX-License-Identifier: GPL-2.0 + +/* + * Copyright (C) 2022. Huawei Technologies Co., Ltd. All rights reserved. + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 and + * only version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#include +#include +#include +#include +#include +#include +#include + +static void bpf_update_cpu_topology(struct bpf_cpu_topology *cpu_topology, int cpu) +{ + cpu_topology->cpu = cpu; + cpu_topology->core_id = topology_core_id(cpu); + cpu_topology->cluster_id = topology_cluster_id(cpu); + cpu_topology->die_id = topology_die_id(cpu); + cpu_topology->physical_package_id = topology_physical_package_id(cpu); + cpu_topology->numa_node = cpu_to_node(cpu); + cpumask_copy(&cpu_topology->thread_siblings, topology_sibling_cpumask(cpu)); + cpumask_copy(&cpu_topology->core_siblings, topology_core_cpumask(cpu)); + cpumask_copy(&cpu_topology->cluster_cpus, topology_cluster_cpumask(cpu)); + cpumask_copy(&cpu_topology->die_cpus, topology_die_cpumask(cpu)); + cpumask_copy(&cpu_topology->package_cpus, topology_core_cpumask(cpu)); + cpumask_copy(&cpu_topology->node_cpu_lists, cpumask_of_node(cpu_to_node(cpu))); +} + +BPF_CALL_1(bpf_init_cpu_topology, struct bpf_map *, map) +{ + const struct cpumask *cpu_map = cpu_active_mask; + struct bpf_cpu_topology *topo; + int i = -1; + + WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_bh_held()); + for_each_cpu(i, cpu_map) { + topo = map->ops->map_lookup_elem(map, &i); + if (!topo) + return -ENOMEM; + + bpf_update_cpu_topology(topo, i); + } + + return 0; +} + +BTF_ID_LIST_SINGLE(bpf_cpu_topology_ids, struct, bpf_cpu_topology) + +const struct bpf_func_proto bpf_init_cpu_topology_proto = { + .func = bpf_init_cpu_topology, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_CONST_MAP_PTR, +}; + +BPF_CALL_2(bpf_get_cpumask_info, struct bpf_map *, map, struct bpf_cpumask_info *, cpus) +{ + if (!cpus) + return -EINVAL; + + cpumask_copy(&cpus->cpu_possible_cpumask, cpu_possible_mask); + cpumask_copy(&cpus->cpu_active_cpumask, cpu_active_mask); + cpumask_copy(&cpus->cpu_isolate_cpumask, housekeeping_cpumask(HK_FLAG_DOMAIN)); + cpus->nums_possible_cpus = num_possible_cpus(); + cpus->nums_active_cpus = num_active_cpus(); + cpus->nums_isolate_cpus = cpumask_weight(&cpus->cpu_isolate_cpumask); + cpus->nr_cpu_ids = nr_cpu_ids; + cpus->bpf_nr_cpumask_bits = nr_cpumask_bits; + + return 0; +} + +const struct bpf_func_proto bpf_get_cpumask_info_proto = { + .func = bpf_get_cpumask_info, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_CONST_MAP_PTR, + .arg2_type = ARG_PTR_TO_MAP_VALUE_OR_NULL, +}; diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 61b89cf6997679abbeeebff0adc64b92c250476b..181453cf3e7a02bf2394c06b818af2034b876fea 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3477,6 +3477,9 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) #ifdef CONFIG_SMP p->wake_entry.u_flags = CSD_TYPE_TTWU; #endif +#ifdef CONFIG_BPF_SCHED + p->tag = 0; +#endif } DEFINE_STATIC_KEY_FALSE(sched_numa_balancing); @@ -8600,6 +8603,13 @@ static void sched_free_group(struct task_group *tg) kmem_cache_free(task_group_cache, tg); } +#ifdef 
CONFIG_BPF_SCHED +static inline void tg_init_tag(struct task_group *tg, struct task_group *ptg) +{ + tg->tag = ptg->tag; +} +#endif + /* allocate runqueue etc for a new task group */ struct task_group *sched_create_group(struct task_group *parent) { @@ -8620,6 +8630,10 @@ struct task_group *sched_create_group(struct task_group *parent) if (!alloc_rt_sched_group(tg, parent)) goto err; +#ifdef CONFIG_BPF_SCHED + tg_init_tag(tg, parent); +#endif + alloc_uclamp_sched_group(tg, parent); return tg; @@ -8691,6 +8705,14 @@ static void sched_change_group(struct task_struct *tsk, int type) sched_change_qos_group(tsk, tg); #endif +#ifdef CONFIG_BPF_SCHED + /* + * This function has cleared and restored the task status, + * so we do not need to dequeue and enqueue the task again. + */ + tsk->tag = tg->tag; +#endif + #ifdef CONFIG_FAIR_GROUP_SCHED if (tsk->sched_class->task_change_group) tsk->sched_class->task_change_group(tsk, type); @@ -9463,6 +9485,80 @@ static inline s64 cpu_qos_read(struct cgroup_subsys_state *css, } #endif +#ifdef CONFIG_BPF_SCHED +void sched_settag(struct task_struct *tsk, s64 tag) +{ + int queued, running, queue_flags = + DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK; + struct rq_flags rf; + struct rq *rq; + + if (tsk->tag == tag) + return; + + rq = task_rq_lock(tsk, &rf); + + running = task_current(rq, tsk); + queued = task_on_rq_queued(tsk); + + update_rq_clock(rq); + if (queued) + dequeue_task(rq, tsk, queue_flags); + if (running) + put_prev_task(rq, tsk); + + tsk->tag = tag; + + if (queued) + enqueue_task(rq, tsk, queue_flags); + if (running) + set_next_task(rq, tsk); + + task_rq_unlock(rq, tsk, &rf); +} + +int tg_change_tag(struct task_group *tg, void *data) +{ + struct css_task_iter it; + struct task_struct *tsk; + s64 tag = *(s64 *)data; + struct cgroup_subsys_state *css = &tg->css; + + tg->tag = tag; + + css_task_iter_start(css, 0, &it); + while ((tsk = css_task_iter_next(&it))) + sched_settag(tsk, tag); + css_task_iter_end(&it); + + return 0; +} + +static int cpu_tag_write(struct cgroup_subsys_state *css, + struct cftype *cftype, s64 tag) +{ + struct task_group *tg = css_tg(css); + + if (tg == &root_task_group) + return -EINVAL; + + if (tg->tag == tag) + return 0; + + rcu_read_lock(); + walk_tg_tree_from(tg, tg_change_tag, tg_nop, (void *)(&tag)); + rcu_read_unlock(); + + return 0; +} + +static inline s64 cpu_tag_read(struct cgroup_subsys_state *css, + struct cftype *cft) +{ + return css_tg(css)->tag; +} +#endif + static struct cftype cpu_legacy_files[] = { #ifdef CONFIG_FAIR_GROUP_SCHED { @@ -9524,6 +9620,13 @@ static struct cftype cpu_legacy_files[] = { .read_s64 = cpu_qos_read, .write_s64 = cpu_qos_write, }, +#endif +#ifdef CONFIG_BPF_SCHED + { + .name = "tag", + .read_s64 = cpu_tag_read, + .write_s64 = cpu_tag_write, + }, #endif { } /* Terminate */ }; diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index fe2f527c71edd776f7b934785cb22fbf2bdedea4..43881fe8f9593c770b172fee5588f8509cdae872 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -28,6 +28,7 @@ #include #include #endif +#include /* * Targeted preemption latency for CPU-bound tasks: @@ -508,6 +509,15 @@ static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime) static inline bool entity_before(struct sched_entity *a, struct sched_entity *b) { +#ifdef CONFIG_BPF_SCHED + if (bpf_sched_enabled()) { + int ret = bpf_sched_cfs_tag_pick_next_entity(a, b); + + if (ret == 1) + return 1; + } +#endif + return (s64)(a->vruntime - b->vruntime) < 0; } @@ -4434,6 +4444,21 @@ check_preempt_tick(struct 
cfs_rq *cfs_rq, struct sched_entity *curr) ideal_runtime = sched_slice(cfs_rq, curr); delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime; + +#ifdef CONFIG_BPF_SCHED + if (bpf_sched_enabled()) { + int ret = bpf_sched_cfs_check_preempt_tick(curr, delta_exec); + + if (ret < 0) + return; + else if (ret > 0) { + resched_curr(rq_of(cfs_rq)); + clear_buddies(cfs_rq, curr); + return; + } + } +#endif + if (delta_exec > ideal_runtime) { resched_curr(rq_of(cfs_rq)); /* @@ -5678,6 +5703,11 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) assert_list_leaf_cfs_rq(rq); hrtick_update(rq); + +#ifdef CONFIG_BPF_SCHED + if (bpf_sched_enabled()) + bpf_sched_cfs_enqueue_task(rq, p); +#endif } static void set_next_buddy(struct sched_entity *se); @@ -5752,6 +5782,11 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) dequeue_throttle: util_est_update(&rq->cfs, p, task_sleep); hrtick_update(rq); + +#ifdef CONFIG_BPF_SCHED + if (bpf_sched_enabled()) + bpf_sched_cfs_dequeue_task(rq, p); +#endif } #ifdef CONFIG_SMP @@ -5967,6 +6002,22 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, { int target = nr_cpumask_bits; +#ifdef CONFIG_BPF_SCHED + if (bpf_sched_enabled()) { + struct sched_affine_ctx ctx; + int ret; + + ctx.task = p; + ctx.prev_cpu = prev_cpu; + ctx.curr_cpu = this_cpu; + ctx.is_sync = sync; + + ret = bpf_sched_cfs_wake_affine(&ctx); + if (ret >= 0 && ret < nr_cpumask_bits) + return ret; + } +#endif + if (sched_feat(WA_IDLE)) target = wake_affine_idle(this_cpu, prev_cpu, sync); @@ -6839,6 +6890,12 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f int new_cpu = prev_cpu; int want_affine = 0; int sync = (wake_flags & WF_SYNC) && !(current->flags & PF_EXITING); +#ifdef CONFIG_BPF_SCHED + struct sched_migrate_ctx ctx; + cpumask_t *cpus_prev = NULL; + cpumask_t *cpus; + int ret; +#endif time = schedstat_start_time(); @@ -6860,6 +6917,32 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f } rcu_read_lock(); +#ifdef CONFIG_BPF_SCHED + if (bpf_sched_enabled()) { + ctx.task = p; + ctx.prev_cpu = prev_cpu; + ctx.curr_cpu = cpu; + ctx.is_sync = sync; + ctx.wake_flags = wake_flags; + ctx.want_affine = want_affine; + ctx.sd_flag = sd_flag; + ctx.select_idle_mask = this_cpu_cpumask_var_ptr(select_idle_mask); + + ret = bpf_sched_cfs_select_rq(&ctx); + if (ret >= 0) { + rcu_read_unlock(); + return ret; + } else if (ret != -1) { + cpus = this_cpu_cpumask_var_ptr(select_idle_mask); + if (cpumask_subset(cpus, p->cpus_ptr) && + !cpumask_empty(cpus)) { + cpus_prev = (void *)p->cpus_ptr; + p->cpus_ptr = cpus; + } + } + } +#endif + for_each_domain(cpu, tmp) { /* * If both 'cpu' and 'prev_cpu' are part of this domain, @@ -6891,6 +6974,19 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f if (want_affine) current->recent_used_cpu = cpu; } + +#ifdef CONFIG_BPF_SCHED + if (bpf_sched_enabled()) { + ctx.new_cpu = new_cpu; + ret = bpf_sched_cfs_select_rq_exit(&ctx); + if (ret >= 0) + new_cpu = ret; + + if (cpus_prev) + p->cpus_ptr = cpus_prev; + } +#endif + rcu_read_unlock(); schedstat_end_time(cpu_rq(cpu), time); @@ -7015,6 +7111,15 @@ wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se) { s64 gran, vdiff = curr->vruntime - se->vruntime; +#ifdef CONFIG_BPF_SCHED + if (bpf_sched_enabled()) { + int ret = bpf_sched_cfs_wakeup_preempt_entity(curr, se); + + if (ret) + return ret; + } +#endif + if (vdiff <= 0) return -1; @@ -7101,6 
+7206,17 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ likely(!task_has_idle_policy(p))) goto preempt; +#ifdef CONFIG_BPF_SCHED + if (bpf_sched_enabled()) { + int ret = bpf_sched_cfs_check_preempt_wakeup(current, p); + + if (ret < 0) + return; + else if (ret > 0) + goto preempt; + } +#endif + /* * Batch and idle tasks do not preempt non-idle tasks (their preemption * is driven by the tick): diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index ad00ba5f6d82eeaaa3c567a146365dd90be4cf3d..0ab8e2532f2d663e2974521a1c555b9f14a15446 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -454,7 +454,12 @@ struct task_group { struct uclamp_se uclamp[UCLAMP_CNT]; #endif +#ifdef CONFIG_BPF_SCHED + /* Used to pad the tag of a group */ + KABI_USE(1, long tag) +#else KABI_RESERVE(1) +#endif KABI_RESERVE(2) KABI_RESERVE(3) KABI_RESERVE(4) @@ -492,6 +497,9 @@ static inline int walk_tg_tree(tg_visitor down, tg_visitor up, void *data) } extern int tg_nop(struct task_group *tg, void *data); +#ifdef CONFIG_BPF_SCHED +extern int tg_change_tag(struct task_group *tg, void *data); +#endif extern void free_fair_sched_group(struct task_group *tg); extern int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent); diff --git a/scripts/bpf_helpers_doc.py b/scripts/bpf_helpers_doc.py index 31484377b8b11a69c269285f757dd5094b1e46ed..fc51d6f0d447e5d81b1a9c42fca9a11c186a8fca 100755 --- a/scripts/bpf_helpers_doc.py +++ b/scripts/bpf_helpers_doc.py @@ -435,6 +435,15 @@ class PrinterHelpers(Printer): 'struct xdp_md', 'struct path', 'struct btf_ptr', + 'struct task_group', + 'struct bpf_sched_cpu_stats', + 'struct bpf_cpu_topology', + 'struct bpf_cpumask_info', + 'struct sched_entity', + 'struct cpumask', + 'struct cpumask_op_args', + 'struct sched_migrate_ctx', + 'struct sched_affine_ctx', ] known_types = { '...', @@ -478,6 +487,15 @@ class PrinterHelpers(Printer): 'struct task_struct', 'struct path', 'struct btf_ptr', + 'struct task_group', + 'struct bpf_sched_cpu_stats', + 'struct bpf_cpu_topology', + 'struct bpf_cpumask_info', + 'struct sched_entity', + 'struct cpumask', + 'struct cpumask_op_args', + 'struct sched_migrate_ctx', + 'struct sched_affine_ctx', } mapped_types = { 'u8': '__u8', diff --git a/tools/bpf/bpftool/common.c b/tools/bpf/bpftool/common.c index 6ebf2b215ef49698fc4601339099260380e1b126..81def197e774485c7ea7e1f72ff6ca7da560f779 100644 --- a/tools/bpf/bpftool/common.c +++ b/tools/bpf/bpftool/common.c @@ -66,6 +66,7 @@ const char * const attach_type_name[__MAX_BPF_ATTACH_TYPE] = { [BPF_MODIFY_RETURN] = "mod_ret", [BPF_LSM_MAC] = "lsm_mac", [BPF_SK_LOOKUP] = "sk_lookup", + [BPF_SCHED] = "sched", }; void p_err(const char *fmt, ...) 
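
For completeness, here is a small userspace sketch (not part of the patch) of how the tags consumed by the hooks above would be configured, using the cgroup cpu.tag file added in kernel/sched/core.c and the per-thread /proc/<pid>/task/<tid>/tag file added in fs/proc/base.c. The cgroup mount point, cgroup name, and PID are placeholders.

```c
// SPDX-License-Identifier: GPL-2.0
/* Example only: set scheduler tags from userspace. */
#include <stdio.h>

static int write_tag(const char *path, long tag)
{
	FILE *f = fopen(path, "w");

	if (!f) {
		perror(path);
		return -1;
	}
	fprintf(f, "%ld\n", tag);	/* both files accept a decimal tag value */
	return fclose(f);
}

int main(void)
{
	/* Tag a whole cpu cgroup; walk_tg_tree_from() applies it to the subtree. */
	write_tag("/sys/fs/cgroup/cpu/online/cpu.tag", 1);

	/* Tag a single thread; sched_settag() requeues it with the new tag. */
	write_tag("/proc/1234/task/1234/tag", 1);

	return 0;
}
```
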
diff --git a/tools/bpf/bpftool/prog.c b/tools/bpf/bpftool/prog.c index 592536904dde2d59a152c838a39273111df77495..4e1d8e57d951a31b55624445561d8f9f0e636777 100644 --- a/tools/bpf/bpftool/prog.c +++ b/tools/bpf/bpftool/prog.c @@ -64,6 +64,7 @@ const char * const prog_type_name[] = { [BPF_PROG_TYPE_EXT] = "ext", [BPF_PROG_TYPE_LSM] = "lsm", [BPF_PROG_TYPE_SK_LOOKUP] = "sk_lookup", + [BPF_PROG_TYPE_SCHED] = "sched", }; const size_t prog_type_name_size = ARRAY_SIZE(prog_type_name); diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 4af3661414fa9ba2e6f13af389ecc167fea04da1..abf8023d606b631966d3a0f1d848c735fad73e1a 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -199,6 +199,9 @@ enum bpf_prog_type { BPF_PROG_TYPE_EXT, BPF_PROG_TYPE_LSM, BPF_PROG_TYPE_SK_LOOKUP, +#ifndef __GENKSYMS__ + BPF_PROG_TYPE_SCHED, +#endif }; enum bpf_attach_type { @@ -240,6 +243,9 @@ enum bpf_attach_type { BPF_XDP_CPUMAP, BPF_SK_LOOKUP, BPF_XDP, +#ifndef __GENKSYMS__ + BPF_SCHED, +#endif __MAX_BPF_ATTACH_TYPE }; @@ -3755,6 +3761,117 @@ union bpf_attr { * Get Ipv4 origdst or replysrc. Works with IPv4. * Return * 0 on success, or a negative error in case of failure. + * + * long bpf_sched_tg_tag_of(struct task_group *tg) + * Description + * Return task group tag of *tg* if CONFIG_CGROUP_SCHED enabled. + * The bpf prog obtains the tags to detect different workloads. + * Return + * Task group tag, if CONFIG_CGROUP_SCHED enabled, 0 as default tag, or + * a negative error in case of failure. + * + * long bpf_sched_task_tag_of(struct task_struct *tsk) + * Description + * Return task tag of *tsk*.The bpf prog obtains the tags to detect + * different workloads. + * Return + * Task tag, if used, 0 as default tag, or a negative error in case of failure. + * + * int bpf_sched_set_tg_tag(struct task_group *tg, s64 tag) + * Description + * Set tag to *tg* and its descendants. + * Return + * 0 on success, or a negative error in case of failure. + * + * int bpf_sched_set_task_tag(struct task_struct *tsk, s64 tag) + * Description + * Set tag to *tsk*. + * Return + * 0 on success, or a negative error in case of failure. + * + * int bpf_sched_cpu_stats_of(int cpu, struct bpf_sched_cpu_stats *ctx, int len) + * Description + * Get multiple types of *cpu* statistics and store in *ctx*. + * Return + * 0 on success, or a negative error in case of failure. + * + * long bpf_init_cpu_topology(struct bpf_map *map) + * Description + * Initializing the cpu topology which used for bpf prog. + * Return + * 0 on success, or a negative error in case of failure. + * + * int bpf_get_cpumask_info(struct bpf_map *map, struct bpf_cpumask_info *cpus) + * Description + * Get system cpus returned in *cpus*. + * Return + * 0 on success, or a negative error in case of failure. + * + * long bpf_sched_entity_is_task(struct sched_entity *se) + * Description + * Checks whether the sched entity is a task. + * Return + * 1 if true, 0 otherwise. + * + * struct task_struct *bpf_sched_entity_to_task(struct sched_entity *se) + * Description + * Return task struct of *se* if se is a task. + * Return + * Task struct if se is a task, NULL otherwise. + * + * struct task_group *bpf_sched_entity_to_tg(struct sched_entity *se) + * Description + * Return task group of *se* if se is a task group. + * Return + * Task struct if se is a task group, NULL otherwise. + * + * int bpf_cpumask_op(struct cpumask_op_args *op, int len) + * Description + * A series of cpumask-related operations. 
Perform different + * operations base on *op*->type. User also need fill other + * *op* field base on *op*->type. *op*->type is one of them + * + * **CPUMASK_EMPTY** + * *(op->arg1) == 0 returned. + * **CPUMASK_AND** + * *(op->arg1) = *(op->arg2) & *(op->arg3) + * **CPUMASK_ANDNOT** + * *(op->arg1) = *(op->arg2) & ~*(op->arg3) + * **CPUMASK_SUBSET** + * *(op->arg1) & ~*(op->arg2) == 0 returned + * **CPUMASK_EQUAL** + * *(op->arg1) == *(op->arg2) returned + * **CPUMASK_TEST_CPU** + * test for a cpu *(int)(op->arg1) in *(op->arg2) + * returns 1 if *op*->arg1 is set in *op*->arg2, else returns 0 + * **CPUMASK_COPY** + * *(op->arg1) = *(op->arg2), return 0 always + * **CPUMASK_WEIGHT** + * count of bits in *(op->arg1) + * **CPUMASK_NEXT** + * get the next cpu in *(struct cpumask *)(op->arg2) + * *(int *)(op->arg1): the cpu prior to the place to search + * **CPUMASK_NEXT_WRAP** + * helper to implement for_each_cpu_wrap + * @op->arg1: the cpu prior to the place to search + * @op->arg2: the cpumask pointer + * @op->arg3: the start point of the iteration + * @op->arg4: assume @op->arg1 crossing @op->arg3 terminates the iteration + * returns >= nr_cpu_ids on completion + * **CPUMASK_NEXT_AND** + * get the next cpu in *(op->arg1) & *(op->arg2) + * **CPUMASK_CPULIST_PARSE** + * extract a cpumask from a user string of ranges. + * (char *)op->arg1 -> (struct cpumask *)(op->arg2) + * 0 on success, or a negative error in case of failure. + * Return + * View above. + * + * int bpf_cpus_share_cache(int src_cpu, int dst_cpu) + * Description + * check src_cpu whether share cache with dst_cpu. + * Return + * true yes, false no. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3915,6 +4032,18 @@ union bpf_attr { FN(redirect_peer), \ FN(get_sockops_uid_gid), \ FN(sk_original_addr), \ + FN(sched_tg_tag_of), \ + FN(sched_task_tag_of), \ + FN(sched_set_tg_tag), \ + FN(sched_set_task_tag), \ + FN(sched_cpu_stats_of), \ + FN(init_cpu_topology), \ + FN(get_cpumask_info), \ + FN(sched_entity_is_task), \ + FN(sched_entity_to_task), \ + FN(sched_entity_to_tg), \ + FN(cpumask_op), \ + FN(cpus_share_cache), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper diff --git a/tools/lib/bpf/bpf.c b/tools/lib/bpf/bpf.c index d27e34133973b52e1152c955fbaac831feeeecc0..13e08c0d8b1a3084bb64f4c71c1b78e0d7aadbeb 100644 --- a/tools/lib/bpf/bpf.c +++ b/tools/lib/bpf/bpf.c @@ -236,7 +236,8 @@ int bpf_load_program_xattr(const struct bpf_load_program_attr *load_attr, attr.prog_type == BPF_PROG_TYPE_LSM) { attr.attach_btf_id = load_attr->attach_btf_id; } else if (attr.prog_type == BPF_PROG_TYPE_TRACING || - attr.prog_type == BPF_PROG_TYPE_EXT) { + attr.prog_type == BPF_PROG_TYPE_EXT || + attr.prog_type == BPF_PROG_TYPE_SCHED) { attr.attach_btf_id = load_attr->attach_btf_id; attr.attach_prog_fd = load_attr->attach_prog_fd; } else { diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 2531a2a0ff6cf01d0c1f95a9052343c16def86df..9dd29b39010c9cec80ea972275e92b9a7d2b041f 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -2504,7 +2504,8 @@ static int bpf_object__finalize_btf(struct bpf_object *obj) static inline bool libbpf_prog_needs_vmlinux_btf(struct bpf_program *prog) { if (prog->type == BPF_PROG_TYPE_STRUCT_OPS || - prog->type == BPF_PROG_TYPE_LSM) + prog->type == BPF_PROG_TYPE_LSM || + prog->type == BPF_PROG_TYPE_SCHED) return true; /* BPF_PROG_TYPE_TRACING programs which do not attach to other programs @@ -6722,7 +6723,8 @@ load_program(struct bpf_program *prog, struct 
bpf_insn *insns, int insns_cnt, prog->type == BPF_PROG_TYPE_LSM) { load_attr.attach_btf_id = prog->attach_btf_id; } else if (prog->type == BPF_PROG_TYPE_TRACING || - prog->type == BPF_PROG_TYPE_EXT) { + prog->type == BPF_PROG_TYPE_EXT || + prog->type == BPF_PROG_TYPE_SCHED) { load_attr.attach_prog_fd = prog->attach_prog_fd; load_attr.attach_btf_id = prog->attach_btf_id; } else { @@ -6829,7 +6831,8 @@ int bpf_program__load(struct bpf_program *prog, char *license, __u32 kern_ver) if ((prog->type == BPF_PROG_TYPE_TRACING || prog->type == BPF_PROG_TYPE_LSM || - prog->type == BPF_PROG_TYPE_EXT) && !prog->attach_btf_id) { + prog->type == BPF_PROG_TYPE_EXT || + prog->type == BPF_PROG_TYPE_SCHED) && !prog->attach_btf_id) { btf_id = libbpf_find_attach_btf_id(prog); if (btf_id <= 0) return btf_id; @@ -8254,6 +8257,7 @@ BPF_PROG_TYPE_FNS(tracing, BPF_PROG_TYPE_TRACING); BPF_PROG_TYPE_FNS(struct_ops, BPF_PROG_TYPE_STRUCT_OPS); BPF_PROG_TYPE_FNS(extension, BPF_PROG_TYPE_EXT); BPF_PROG_TYPE_FNS(sk_lookup, BPF_PROG_TYPE_SK_LOOKUP); +BPF_PROG_TYPE_FNS(sched, BPF_PROG_TYPE_SCHED); enum bpf_attach_type bpf_program__get_expected_attach_type(struct bpf_program *prog) @@ -8318,6 +8322,8 @@ static struct bpf_link *attach_lsm(const struct bpf_sec_def *sec, struct bpf_program *prog); static struct bpf_link *attach_iter(const struct bpf_sec_def *sec, struct bpf_program *prog); +static struct bpf_link *attach_sched(const struct bpf_sec_def *sec, + struct bpf_program *prog); static const struct bpf_sec_def section_defs[] = { BPF_PROG_SEC("socket", BPF_PROG_TYPE_SOCKET_FILTER), @@ -8386,6 +8392,10 @@ static const struct bpf_sec_def section_defs[] = { .expected_attach_type = BPF_TRACE_ITER, .is_attach_btf = true, .attach_fn = attach_iter), + SEC_DEF("sched/", SCHED, + .is_attach_btf = true, + .expected_attach_type = BPF_SCHED, + .attach_fn = attach_sched), BPF_EAPROG_SEC("xdp_devmap/", BPF_PROG_TYPE_XDP, BPF_XDP_DEVMAP), BPF_EAPROG_SEC("xdp_cpumap/", BPF_PROG_TYPE_XDP, @@ -8469,7 +8479,7 @@ static const struct bpf_sec_def section_defs[] = { #undef BPF_APROG_COMPAT #undef SEC_DEF -#define MAX_TYPE_NAME_SIZE 32 +#define MAX_TYPE_NAME_SIZE 31 static const struct bpf_sec_def *find_sec_def(const char *sec_name) { @@ -8673,6 +8683,7 @@ static int bpf_object__collect_st_ops_relos(struct bpf_object *obj, #define BTF_TRACE_PREFIX "btf_trace_" #define BTF_LSM_PREFIX "bpf_lsm_" #define BTF_ITER_PREFIX "bpf_iter_" +#define BTF_SCHED_PREFIX "bpf_sched_" #define BTF_MAX_NAME_SIZE 128 static int find_btf_by_prefix_kind(const struct btf *btf, const char *prefix, @@ -8706,6 +8717,9 @@ static inline int __find_vmlinux_btf_id(struct btf *btf, const char *name, else if (attach_type == BPF_TRACE_ITER) err = find_btf_by_prefix_kind(btf, BTF_ITER_PREFIX, name, BTF_KIND_FUNC); + else if (attach_type == BPF_SCHED) + err = find_btf_by_prefix_kind(btf, BTF_SCHED_PREFIX, name, + BTF_KIND_FUNC); else err = btf__find_by_name_kind(btf, name, BTF_KIND_FUNC); @@ -9685,6 +9699,11 @@ struct bpf_link *bpf_program__attach_trace(struct bpf_program *prog) return bpf_program__attach_btf_id(prog); } +struct bpf_link *bpf_program__attach_sched(struct bpf_program *prog) +{ + return bpf_program__attach_btf_id(prog); +} + struct bpf_link *bpf_program__attach_lsm(struct bpf_program *prog) { return bpf_program__attach_btf_id(prog); @@ -9696,6 +9715,12 @@ static struct bpf_link *attach_trace(const struct bpf_sec_def *sec, return bpf_program__attach_trace(prog); } +static struct bpf_link *attach_sched(const struct bpf_sec_def *sec, + struct bpf_program *prog) +{ + 
return bpf_program__attach_sched(prog); +} + static struct bpf_link *attach_lsm(const struct bpf_sec_def *sec, struct bpf_program *prog) { diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h index 57d10b779dea019e05a218a8f0e965b84988088c..a011179f705d0559b6b4a618c009269e03923ddc 100644 --- a/tools/lib/bpf/libbpf.h +++ b/tools/lib/bpf/libbpf.h @@ -264,6 +264,8 @@ bpf_program__attach_xdp(struct bpf_program *prog, int ifindex); LIBBPF_API struct bpf_link * bpf_program__attach_freplace(struct bpf_program *prog, int target_fd, const char *attach_func_name); +LIBBPF_API struct bpf_link * +bpf_program__attach_sched(struct bpf_program *prog); struct bpf_map; @@ -360,6 +362,7 @@ LIBBPF_API int bpf_program__set_tracing(struct bpf_program *prog); LIBBPF_API int bpf_program__set_struct_ops(struct bpf_program *prog); LIBBPF_API int bpf_program__set_extension(struct bpf_program *prog); LIBBPF_API int bpf_program__set_sk_lookup(struct bpf_program *prog); +LIBBPF_API int bpf_program__set_sched(struct bpf_program *prog); LIBBPF_API enum bpf_prog_type bpf_program__get_type(struct bpf_program *prog); LIBBPF_API void bpf_program__set_type(struct bpf_program *prog, @@ -388,6 +391,7 @@ LIBBPF_API bool bpf_program__is_tracing(const struct bpf_program *prog); LIBBPF_API bool bpf_program__is_struct_ops(const struct bpf_program *prog); LIBBPF_API bool bpf_program__is_extension(const struct bpf_program *prog); LIBBPF_API bool bpf_program__is_sk_lookup(const struct bpf_program *prog); +LIBBPF_API bool bpf_program__is_sched(const struct bpf_program *prog); /* * No need for __attribute__((packed)), all members of 'bpf_map_def' diff --git a/tools/lib/bpf/libbpf.map b/tools/lib/bpf/libbpf.map index 4ebfadf45b47d27a3e66bb23aac6d90767df67ed..16393cea53d016c5ee7d20c342b991d1898050b6 100644 --- a/tools/lib/bpf/libbpf.map +++ b/tools/lib/bpf/libbpf.map @@ -336,4 +336,7 @@ LIBBPF_0.2.0 { perf_buffer__epoll_fd; perf_buffer__consume_buffer; xsk_socket__create_shared; + bpf_program__attach_sched; + bpf_program__is_sched; + bpf_program__set_sched; } LIBBPF_0.1.0; diff --git a/tools/lib/bpf/libbpf_sched.h b/tools/lib/bpf/libbpf_sched.h new file mode 100644 index 0000000000000000000000000000000000000000..b2008ff6bb253305627c64c2db007ca2625e6d9d --- /dev/null +++ b/tools/lib/bpf/libbpf_sched.h @@ -0,0 +1,510 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +/* + * Copyright (C) 2022. Huawei Technologies Co., Ltd. All rights reserved. + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 and + * only version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + */ +#ifndef __LIBBPF_LIBSCHED_H +#define __LIBBPF_LIBSCHED_H + +#include +#include +#include +#include +#include + +/* set bigger value may lead verifier failed */ +#define BPF_SCHED_LOOP_MAX 1024 +#define INVALID_PTR ((void *)(0UL)) +#define getVal(P) \ + ({ \ + typeof(P) val = 0; \ + bpf_probe_read_kernel(&val, sizeof(val), &(P)); \ + val; \ + }) + +static __always_inline long libbpf_cpumask_next(int n, struct cpumask *mask); +static __always_inline long libbpf_cpumask_next_wrap(int n, + struct cpumask *mask, + int start, int wrap); +static __always_inline long libbpf_cpumask_next_and(int n, + struct cpumask *mask1, + struct cpumask *mask2); +static __always_inline int libbpf_nr_cpus_ids(void); +static __always_inline int libbpf_nr_cpumask_bits(void); + +#if NR_CPUS == 1 + +#define libbpf_for_each_cpu(cpu, mask) \ + for ((cpu) = 0; (cpu) < 1; (cpu)++, (void)mask) +#define libbpf_for_each_cpu_wrap(cpu, mask, start) \ + for ((cpu) = 0; (cpu) < 1; (cpu)++, (void)mask, (void)(start)) +#define libbpf_for_each_cpu_and(cpu, mask1, mask2) \ + for ((cpu) = 0; (cpu) < 1; (cpu)++, (void)mask1, (void)mask2) + +#else + +#define libbpf_for_each_cpu(cpu, mask) \ + for (int __i = 0, (cpu) = -1; \ + (cpu) = libbpf_cpumask_next((cpu), (mask)), \ + (cpu) < libbpf_nr_cpus_ids() && __i < NR_CPUS; __i++) + +#define libbpf_for_each_cpu_wrap(cpu, mask, start) \ + for (int __i = 0, (cpu) = libbpf_cpumask_next_wrap((start) - 1,\ + (mask), (start), false); \ + (cpu) < libbpf_nr_cpumask_bits() && __i < NR_CPUS; \ + (cpu) = libbpf_cpumask_next_wrap((cpu), (mask), (start),\ + true), __i++) + +#define libbpf_for_each_cpu_and(cpu, mask1, mask2) \ + for (int __i = 0, (cpu) = -1; \ + (cpu) = libbpf_cpumask_next_and((cpu), (mask1), (mask2)),\ + (cpu) < libbpf_nr_cpus_ids() && __i < NR_CPUS; __i++) + +#endif + +struct { + __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); + __type(key, u32); + __type(value, struct bpf_cpumask_info); + __uint(max_entries, 1); +} map_cpumask_info SEC(".maps"); + +static __always_inline long libbpf_cpumask_copy(struct cpumask *dst, + struct cpumask *src) +{ + struct cpumask_op_args op; + + op.op_type = CPUMASK_COPY; + op.arg1 = dst; + op.arg2 = src; + op.arg3 = INVALID_PTR; + op.arg4 = INVALID_PTR; + return bpf_cpumask_op(&op, sizeof(op)); +} + +static __always_inline long libbpf_cpumask_empty(struct cpumask *mask) +{ + struct cpumask_op_args op; + + op.op_type = CPUMASK_EMPTY; + op.arg1 = mask; + op.arg2 = INVALID_PTR; + op.arg3 = INVALID_PTR; + op.arg4 = INVALID_PTR; + return bpf_cpumask_op(&op, sizeof(op)); +} + +static __always_inline long libbpf_cpumask_and(struct cpumask *dst, + struct cpumask *src1, + struct cpumask *src2) +{ + struct cpumask_op_args op; + + op.op_type = CPUMASK_AND; + op.arg1 = dst; + op.arg2 = src1; + op.arg3 = src2; + op.arg4 = INVALID_PTR; + return bpf_cpumask_op(&op, sizeof(op)); +} + +static __always_inline long libbpf_cpumask_andnot(struct cpumask *dst, + struct cpumask *src1, + struct cpumask *src2) +{ + struct cpumask_op_args op; + + op.op_type = CPUMASK_ANDNOT; + op.arg1 = dst; + op.arg2 = src1; + op.arg3 = src2; + op.arg4 = INVALID_PTR; + return bpf_cpumask_op(&op, sizeof(op)); +} + +static __always_inline long libbpf_cpumask_subset(struct cpumask *src1, + struct cpumask *src2) +{ + struct cpumask_op_args op; + + op.op_type = CPUMASK_SUBSET; + op.arg1 = src1; + op.arg2 = src2; + op.arg3 = INVALID_PTR; + op.arg4 = INVALID_PTR; + return bpf_cpumask_op(&op, sizeof(op)); +} + +static __always_inline long libbpf_cpumask_equal(struct cpumask *src1, + struct cpumask 
*src2) +{ + struct cpumask_op_args op; + + op.op_type = CPUMASK_EQUAL; + op.arg1 = src1; + op.arg2 = src2; + op.arg3 = INVALID_PTR; + op.arg4 = INVALID_PTR; + return bpf_cpumask_op(&op, sizeof(op)); +} + +static __always_inline long libbpf_cpumask_weight(struct cpumask *src1) +{ + struct cpumask_op_args op; + + op.op_type = CPUMASK_WEIGHT; + op.arg1 = src1; + op.arg2 = INVALID_PTR; + op.arg3 = INVALID_PTR; + op.arg4 = INVALID_PTR; + return bpf_cpumask_op(&op, sizeof(op)); +} + +static __always_inline long libbpf_cpumask_test_cpu(int cpu, + struct cpumask *mask) +{ + struct cpumask_op_args op; + + op.op_type = CPUMASK_TEST_CPU; + op.arg1 = &cpu; + op.arg2 = mask; + op.arg3 = INVALID_PTR; + op.arg4 = INVALID_PTR; + return bpf_cpumask_op(&op, sizeof(op)); +} + +static __always_inline long libbpf_cpumask_next(int n, struct cpumask *mask) +{ + struct cpumask_op_args op; + + op.op_type = CPUMASK_NEXT; + op.arg1 = &n; + op.arg2 = mask; + op.arg3 = INVALID_PTR; + op.arg4 = INVALID_PTR; + return bpf_cpumask_op(&op, sizeof(op)); +} + +static __always_inline long libbpf_cpumask_next_wrap(int n, + struct cpumask *mask, + int start, int wrap) +{ + struct cpumask_op_args op; + + op.op_type = CPUMASK_NEXT_WRAP; + op.arg1 = &n; + op.arg2 = mask; + op.arg3 = &start; + op.arg4 = &wrap; + return bpf_cpumask_op(&op, sizeof(op)); +} + +static __always_inline long libbpf_cpumask_next_and(int n, + struct cpumask *mask1, + struct cpumask *mask2) +{ + struct cpumask_op_args op; + + op.op_type = CPUMASK_NEXT_AND; + op.arg1 = &n; + op.arg2 = mask1; + op.arg3 = mask2; + op.arg4 = INVALID_PTR; + return bpf_cpumask_op(&op, sizeof(op)); +} + +static __always_inline long libbpf_cpumask_cpulist_parse(char *src1, + struct cpumask *src2) +{ + struct cpumask_op_args op; + + op.op_type = CPUMASK_CPULIST_PARSE; + op.arg1 = src1; + op.arg2 = src2; + op.arg3 = INVALID_PTR; + op.arg4 = INVALID_PTR; + return bpf_cpumask_op(&op, sizeof(op)); +} + +static __always_inline int libbpf_num_active_cpus(void) +{ + struct bpf_cpumask_info *cpus; + int key = 0; + + cpus = bpf_map_lookup_elem(&map_cpumask_info, &key); + if (!cpus) + return -1; + + bpf_get_cpumask_info(&map_cpumask_info, cpus); + return getVal(cpus->nums_active_cpus); +} + +static __always_inline int libbpf_num_possible_cpus(void) +{ + struct bpf_cpumask_info *cpus; + int key = 0; + + cpus = bpf_map_lookup_elem(&map_cpumask_info, &key); + if (!cpus) + return -1; + + bpf_get_cpumask_info(&map_cpumask_info, cpus); + return getVal(cpus->nums_possible_cpus); +} + +static __always_inline void libbpf_possible_cpus_mask(struct cpumask *mask) +{ + struct bpf_cpumask_info *cpus; + int key = 0; + + cpus = bpf_map_lookup_elem(&map_cpumask_info, &key); + if (!cpus) + return; + + bpf_get_cpumask_info(&map_cpumask_info, cpus); + libbpf_cpumask_copy(mask, &cpus->cpu_possible_cpumask); +} + +static __always_inline void libbpf_active_cpus_mask(struct cpumask *mask) +{ + struct bpf_cpumask_info *cpus; + int key = 0; + + cpus = bpf_map_lookup_elem(&map_cpumask_info, &key); + if (!cpus) + return; + + bpf_get_cpumask_info(&map_cpumask_info, cpus); + libbpf_cpumask_copy(mask, &cpus->cpu_active_cpumask); +} + +static __always_inline void libbpf_isolate_cpus_mask(struct cpumask *mask) +{ + struct bpf_cpumask_info *cpus; + int key = 0; + + cpus = bpf_map_lookup_elem(&map_cpumask_info, &key); + if (!cpus) + return; + + bpf_get_cpumask_info(&map_cpumask_info, cpus); + libbpf_cpumask_copy(mask, &cpus->cpu_isolate_cpumask); +} + +static __always_inline int libbpf_nr_cpus_ids(void) +{ + struct 
bpf_cpumask_info *cpus; + int key = 0; + + cpus = bpf_map_lookup_elem(&map_cpumask_info, &key); + if (!cpus) + return -1; + + bpf_get_cpumask_info(&map_cpumask_info, cpus); + return getVal(cpus->nr_cpu_ids); +} + +static __always_inline int libbpf_nr_cpumask_bits(void) +{ + struct bpf_cpumask_info *cpus; + int key = 0; + + cpus = bpf_map_lookup_elem(&map_cpumask_info, &key); + if (!cpus) + return -1; + + bpf_get_cpumask_info(&map_cpumask_info, cpus); + return getVal(cpus->bpf_nr_cpumask_bits); +} + +static __always_inline unsigned long libbpf_cfs_load_avg_of(int cpu) +{ + struct bpf_sched_cpu_stats load; + + bpf_sched_cpu_stats_of(cpu, &load, sizeof(load)); + return getVal(load.cfs_load_avg); +} + +static __always_inline unsigned long libbpf_cfs_runnable_avg_of(int cpu) +{ + struct bpf_sched_cpu_stats load; + + bpf_sched_cpu_stats_of(cpu, &load, sizeof(load)); + return getVal(load.cfs_runnable_avg); +} + +static __always_inline unsigned long libbpf_cfs_util_avg_of(int cpu) +{ + struct bpf_sched_cpu_stats load; + + bpf_sched_cpu_stats_of(cpu, &load, sizeof(load)); + return getVal(load.cfs_util_avg); +} + +static __always_inline unsigned long libbpf_rt_load_avg_of(int cpu) +{ + struct bpf_sched_cpu_stats load; + + bpf_sched_cpu_stats_of(cpu, &load, sizeof(load)); + return load.rt_load_avg; +} + +static __always_inline unsigned long libbpf_rt_runnable_avg_of(int cpu) +{ + struct bpf_sched_cpu_stats load; + + bpf_sched_cpu_stats_of(cpu, &load, sizeof(load)); + return load.rt_runnable_avg; +} + +static __always_inline unsigned long libbpf_rt_util_avg_of(int cpu) +{ + struct bpf_sched_cpu_stats load; + + bpf_sched_cpu_stats_of(cpu, &load, sizeof(load)); + return load.rt_util_avg; +} + +static __always_inline unsigned long libbpf_irq_load_avg_of(int cpu) +{ + struct bpf_sched_cpu_stats load; + + bpf_sched_cpu_stats_of(cpu, &load, sizeof(load)); + return load.irq_load_avg; +} + +static __always_inline unsigned long libbpf_irq_util_avg_of(int cpu) +{ + struct bpf_sched_cpu_stats load; + + bpf_sched_cpu_stats_of(cpu, &load, sizeof(load)); + return load.irq_util_avg; +} + +static __always_inline unsigned int libbpf_nr_running_of(int cpu) +{ + struct bpf_sched_cpu_stats running; + + bpf_sched_cpu_stats_of(cpu, &running, sizeof(running)); + return getVal(running.nr_running); +} + +static __always_inline unsigned int libbpf_cfs_nr_running_of(int cpu) +{ + struct bpf_sched_cpu_stats running; + + bpf_sched_cpu_stats_of(cpu, &running, sizeof(running)); + return getVal(running.cfs_nr_running); +} + +static __always_inline unsigned int libbpf_cfs_h_nr_running_of(int cpu) +{ + struct bpf_sched_cpu_stats running; + + bpf_sched_cpu_stats_of(cpu, &running, sizeof(running)); + return getVal(running.cfs_h_nr_running); +} + +static __always_inline unsigned int libbpf_cfs_idle_h_nr_running_of(int cpu) +{ + struct bpf_sched_cpu_stats running; + + bpf_sched_cpu_stats_of(cpu, &running, sizeof(running)); + return running.cfs_idle_h_nr_running; +} + +static __always_inline unsigned int libbpf_rt_nr_running_of(int cpu) +{ + struct bpf_sched_cpu_stats running; + + bpf_sched_cpu_stats_of(cpu, &running, sizeof(running)); + return getVal(running.rt_nr_running); +} + +static __always_inline unsigned int libbpf_rr_nr_running_of(int cpu) +{ + struct bpf_sched_cpu_stats running; + + bpf_sched_cpu_stats_of(cpu, &running, sizeof(running)); + return running.rr_nr_running; +} + +static __always_inline unsigned int libbpf_exit_latency_of(int cpu) +{ + struct bpf_sched_cpu_stats stat; + + bpf_sched_cpu_stats_of(cpu, &stat, 
sizeof(stat)); + return stat.exit_latency; +} + +static __always_inline unsigned long libbpf_idle_stamp_of(int cpu) +{ + struct bpf_sched_cpu_stats stat; + + bpf_sched_cpu_stats_of(cpu, &stat, sizeof(stat)); + return stat.idle_stamp; +} + +static __always_inline unsigned long libbpf_avg_idle_of(int cpu) +{ + struct bpf_sched_cpu_stats stat; + + bpf_sched_cpu_stats_of(cpu, &stat, sizeof(stat)); + return stat.avg_idle; +} + +static __always_inline unsigned long libbpf_available_idle_cpu(int cpu) +{ + struct bpf_sched_cpu_stats stat; + + bpf_sched_cpu_stats_of(cpu, &stat, sizeof(stat)); + return getVal(stat.available_idle); +} + +static __always_inline unsigned long libbpf_capacity_of(int cpu) +{ + struct bpf_sched_cpu_stats cap; + + bpf_sched_cpu_stats_of(cpu, &cap, sizeof(cap)); + return getVal(cap.capacity); +} + +static __always_inline unsigned long libbpf_capacity_orig_of(int cpu) +{ + struct bpf_sched_cpu_stats cap; + + bpf_sched_cpu_stats_of(cpu, &cap, sizeof(cap)); + return cap.capacity_orig; +} + +static __always_inline int libbpf_cpus_share_cache(int src_cpu, int dst_cpu) +{ + return bpf_cpus_share_cache(src_cpu, dst_cpu); +} + +static __always_inline int libbpf_sched_se_tag_of(struct sched_entity *se) +{ + int se_tag = 0; + + if (bpf_sched_entity_is_task(se)) { + struct task_struct *task = bpf_sched_entity_to_task(se); + + se_tag = bpf_sched_task_tag_of(task); + } else { + struct task_group *tg = bpf_sched_entity_to_tg(se); + + se_tag = bpf_sched_tg_tag_of(tg); + } + + return se_tag; +} +#endif
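
Usage sketch (illustrative only, not part of the patch): a minimal user-space loader built on the libbpf additions above. The object file name "sched_prog.bpf.o" is an assumed placeholder, and the programs inside it are assumed to sit in "sched/<hook>" ELF sections so that the new section definition types them as BPF_PROG_TYPE_SCHED; the real hook names come from the kernel's BPF_SCHED_HOOK() list, which is not part of this file. Apart from that, the sketch only uses APIs introduced here (bpf_program__is_sched(), bpf_program__attach_sched()) and long-standing libbpf calls.

#include <stdio.h>
#include <unistd.h>
#include <bpf/libbpf.h>

int main(void)
{
	struct bpf_object *obj;
	struct bpf_program *prog;
	struct bpf_link *link;
	int err;

	/* "sched_prog.bpf.o" is a placeholder object built from BPF
	 * programs placed in "sched/<hook>" sections.
	 */
	obj = bpf_object__open_file("sched_prog.bpf.o", NULL);
	if (libbpf_get_error(obj)) {
		fprintf(stderr, "failed to open BPF object\n");
		return 1;
	}

	/* Programs in a "sched/" section are auto-typed as
	 * BPF_PROG_TYPE_SCHED by the new SEC_DEF entry, so no explicit
	 * bpf_program__set_sched() call is needed here.
	 */
	err = bpf_object__load(obj);
	if (err) {
		fprintf(stderr, "failed to load BPF object: %d\n", err);
		goto out;
	}

	bpf_object__for_each_program(prog, obj) {
		if (!bpf_program__is_sched(prog))
			continue;

		link = bpf_program__attach_sched(prog);
		if (libbpf_get_error(link)) {
			fprintf(stderr, "failed to attach sched program\n");
			goto out;
		}
	}

	pause();	/* keep the links alive while the hooks run */
out:
	bpf_object__close(obj);
	return 0;
}

On the BPF side, such a program would include libbpf_sched.h to reuse the wrappers defined above (for example libbpf_nr_running_of(), libbpf_available_idle_cpu(), or the libbpf_for_each_cpu iterators); the hook signatures themselves are generated from the kernel side of this series and are not repeated here.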