From b349957869de670b1163922a4d41f49b9d97fe93 Mon Sep 17 00:00:00 2001
From: Roman Gushchin
Date: Fri, 25 Nov 2022 11:56:09 +0800
Subject: [PATCH 01/23] bpf: sched: basic infrastructure for scheduler bpf

maillist inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I5F6X6
CVE: NA

Reference: https://lore.kernel.org/all/20210916162451.709260-1-guro@fb.com/

-------------------

This commit introduces basic definitions and infrastructure for
scheduler bpf programs. It defines the BPF_PROG_TYPE_SCHED program
type and the BPF_SCHED attachment type.

The implementation is inspired by lsm bpf programs and is based on
kretprobes. This will allow adding new hooks with minimal changes to
the kernel code and without any changes to libbpf/bpftool. It's very
convenient, as I anticipate a large number of private patches being
used for a long time before (or if at all) reaching upstream.

Sched programs are expected to return an int, whose meaning is
defined by the calling context.

This patch doesn't add any real scheduler hooks (only a stub); they
will be added by the following patches in the series.

Scheduler bpf programs are for now very restricted in what they can
do: only the bpf_printk() helper is available. The scheduler context
can impose significant restrictions on what's safe and what's not,
so let's extend their abilities on a case-by-case basis when a need
arises.

Signed-off-by: Roman Gushchin
Signed-off-by: Chen Hui
Signed-off-by: Ren Zhijie
Signed-off-by: Hui Tang
---
 include/linux/bpf_sched.h       | 26 ++++++++++++++
 include/linux/bpf_types.h       |  4 +++
 include/linux/sched_hook_defs.h |  2 ++
 include/uapi/linux/bpf.h        |  2 ++
 init/Kconfig                    | 10 ++++++
 kernel/bpf/btf.c                |  1 +
 kernel/bpf/syscall.c            |  9 +++++
 kernel/bpf/trampoline.c         |  1 +
 kernel/bpf/verifier.c           | 11 +++++-
 kernel/sched/Makefile           |  1 +
 kernel/sched/bpf_sched.c        | 62 +++++++++++++++++++++++++++++++++
 tools/include/uapi/linux/bpf.h  |  2 ++
 12 files changed, 130 insertions(+), 1 deletion(-)
 create mode 100644 include/linux/bpf_sched.h
 create mode 100644 include/linux/sched_hook_defs.h
 create mode 100644 kernel/sched/bpf_sched.c

diff --git a/include/linux/bpf_sched.h b/include/linux/bpf_sched.h
new file mode 100644
index 000000000000..874393e6a6aa
--- /dev/null
+++ b/include/linux/bpf_sched.h
@@ -0,0 +1,26 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_BPF_SCHED_H
+#define _LINUX_BPF_SCHED_H
+
+#include
+
+#ifdef CONFIG_BPF_SCHED
+
+#define BPF_SCHED_HOOK(RET, DEFAULT, NAME, ...)
\ + RET bpf_sched_##NAME(__VA_ARGS__); +#include +#undef BPF_SCHED_HOOK + +int bpf_sched_verify_prog(struct bpf_verifier_log *vlog, + const struct bpf_prog *prog); + +#else /* !CONFIG_BPF_SCHED */ + +static inline int bpf_sched_verify_prog(struct bpf_verifier_log *vlog, + const struct bpf_prog *prog) +{ + return -EOPNOTSUPP; +} + +#endif /* CONFIG_BPF_SCHED */ +#endif /* _LINUX_BPF_SCHED_H */ diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h index a8137bb6dd3c..5732b485c539 100644 --- a/include/linux/bpf_types.h +++ b/include/linux/bpf_types.h @@ -77,6 +77,10 @@ BPF_PROG_TYPE(BPF_PROG_TYPE_LSM, lsm, void *, void *) #endif /* CONFIG_BPF_LSM */ #endif +#ifdef CONFIG_BPF_SCHED +BPF_PROG_TYPE(BPF_PROG_TYPE_SCHED, bpf_sched, + void *, void *) +#endif /* CONFIG_BPF_SCHED */ BPF_MAP_TYPE(BPF_MAP_TYPE_ARRAY, array_map_ops) BPF_MAP_TYPE(BPF_MAP_TYPE_PERCPU_ARRAY, percpu_array_map_ops) diff --git a/include/linux/sched_hook_defs.h b/include/linux/sched_hook_defs.h new file mode 100644 index 000000000000..14344004e335 --- /dev/null +++ b/include/linux/sched_hook_defs.h @@ -0,0 +1,2 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +BPF_SCHED_HOOK(int, 0, dummy, void) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index bd566bfc843f..894cc46f3a54 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -199,6 +199,7 @@ enum bpf_prog_type { BPF_PROG_TYPE_EXT, BPF_PROG_TYPE_LSM, BPF_PROG_TYPE_SK_LOOKUP, + BPF_PROG_TYPE_SCHED, }; enum bpf_attach_type { @@ -240,6 +241,7 @@ enum bpf_attach_type { BPF_XDP_CPUMAP, BPF_SK_LOOKUP, BPF_XDP, + BPF_SCHED, __MAX_BPF_ATTACH_TYPE }; diff --git a/init/Kconfig b/init/Kconfig index beb4a6d1cbcb..3e9c0b2e0984 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1755,6 +1755,16 @@ config BPF_LSM If you are unsure how to answer this question, answer N. +config BPF_SCHED + bool "SCHED Instrumentation with BPF" + depends on BPF_EVENTS + depends on BPF_SYSCALL + help + Enables instrumentation of the sched hooks with eBPF programs for + implementing dynamic scheduling policies. + + If you are unsure how to answer this question, answer N. + config BPF_SYSCALL bool "Enable bpf() system call" select BPF diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index fba28f17e61a..9a0a9895ec62 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -4479,6 +4479,7 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, return true; t = btf_type_by_id(btf, t->type); break; + case BPF_SCHED: case BPF_MODIFY_RETURN: /* For now the BPF_MODIFY_RETURN can only be attached to * functions that return an int. 
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 419dbc3d060e..ff65862ae5ce 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1997,6 +1997,7 @@ bpf_prog_load_check_attach(enum bpf_prog_type prog_type, case BPF_PROG_TYPE_LSM: case BPF_PROG_TYPE_STRUCT_OPS: case BPF_PROG_TYPE_EXT: + case BPF_PROG_TYPE_SCHED: break; default: return -EINVAL; @@ -2108,6 +2109,7 @@ static bool is_perfmon_prog_type(enum bpf_prog_type prog_type) case BPF_PROG_TYPE_LSM: case BPF_PROG_TYPE_STRUCT_OPS: /* has access to struct sock */ case BPF_PROG_TYPE_EXT: /* extends any prog */ + case BPF_PROG_TYPE_SCHED: return true; default: return false; @@ -2608,6 +2610,12 @@ static int bpf_tracing_prog_attach(struct bpf_prog *prog, goto out_put_prog; } break; + case BPF_PROG_TYPE_SCHED: + if (prog->expected_attach_type != BPF_SCHED) { + err = -EINVAL; + goto out_put_prog; + } + break; default: err = -EINVAL; goto out_put_prog; @@ -2838,6 +2846,7 @@ static int bpf_raw_tracepoint_open(const union bpf_attr *attr) case BPF_PROG_TYPE_TRACING: case BPF_PROG_TYPE_EXT: case BPF_PROG_TYPE_LSM: + case BPF_PROG_TYPE_SCHED: if (attr->raw_tracepoint.name) { /* The attach point for this category of programs * should be specified via btf_id during program load. diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c index 87becf77cc75..81d008130cb1 100644 --- a/kernel/bpf/trampoline.c +++ b/kernel/bpf/trampoline.c @@ -357,6 +357,7 @@ static enum bpf_tramp_prog_type bpf_attach_type_to_tramp(struct bpf_prog *prog) switch (prog->expected_attach_type) { case BPF_TRACE_FENTRY: return BPF_TRAMP_FENTRY; + case BPF_SCHED: case BPF_MODIFY_RETURN: return BPF_TRAMP_MODIFY_RETURN; case BPF_TRACE_FEXIT: diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index fcc8133b3708..082d7062a4aa 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -22,6 +22,7 @@ #include #include #include +#include #include "disasm.h" @@ -12155,6 +12156,7 @@ int bpf_check_attach_target(struct bpf_verifier_log *log, case BPF_LSM_MAC: case BPF_TRACE_FENTRY: case BPF_TRACE_FEXIT: + case BPF_SCHED: if (!btf_type_is_func(t)) { bpf_log(log, "attach_btf_id %u is not a function\n", btf_id); @@ -12260,7 +12262,8 @@ static int check_attach_btf_id(struct bpf_verifier_env *env) if (prog->type != BPF_PROG_TYPE_TRACING && prog->type != BPF_PROG_TYPE_LSM && - prog->type != BPF_PROG_TYPE_EXT) + prog->type != BPF_PROG_TYPE_EXT && + prog->type != BPF_PROG_TYPE_SCHED) return 0; ret = bpf_check_attach_target(&env->log, prog, tgt_prog, btf_id, &tgt_info); @@ -12300,6 +12303,12 @@ static int check_attach_btf_id(struct bpf_verifier_env *env) return ret; } + if (prog->type == BPF_PROG_TYPE_SCHED) { + ret = bpf_sched_verify_prog(&env->log, prog); + if (ret < 0) + return ret; + } + key = bpf_trampoline_compute_key(tgt_prog, btf_id); tr = bpf_trampoline_get(key, &tgt_info); if (!tr) diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index 978fcfca5871..a67401cc4d5d 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile @@ -37,3 +37,4 @@ obj-$(CONFIG_MEMBARRIER) += membarrier.o obj-$(CONFIG_CPU_ISOLATION) += isolation.o obj-$(CONFIG_PSI) += psi.o obj-$(CONFIG_SCHED_CORE) += core_sched.o +obj-$(CONFIG_BPF_SCHED) += bpf_sched.o diff --git a/kernel/sched/bpf_sched.c b/kernel/sched/bpf_sched.c new file mode 100644 index 000000000000..2360404d4a07 --- /dev/null +++ b/kernel/sched/bpf_sched.c @@ -0,0 +1,62 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include +#include +#include "sched.h" + +/* + * For every hook 
declare a nop function where a BPF program can be attached. + */ +#define BPF_SCHED_HOOK(RET, DEFAULT, NAME, ...) \ +noinline RET bpf_sched_##NAME(__VA_ARGS__) \ +{ \ + return DEFAULT; \ +} + +#include +#undef BPF_SCHED_HOOK + +#define BPF_SCHED_HOOK(RET, DEFAULT, NAME, ...) BTF_ID(func, bpf_sched_##NAME) +BTF_SET_START(bpf_sched_hooks) +#include +#undef BPF_SCHED_HOOK +BTF_SET_END(bpf_sched_hooks) + +int bpf_sched_verify_prog(struct bpf_verifier_log *vlog, + const struct bpf_prog *prog) +{ + if (!prog->gpl_compatible) { + bpf_log(vlog, + "sched programs must have a GPL compatible license\n"); + return -EINVAL; + } + + if (!btf_id_set_contains(&bpf_sched_hooks, prog->aux->attach_btf_id)) { + bpf_log(vlog, "attach_btf_id %u points to wrong type name %s\n", + prog->aux->attach_btf_id, prog->aux->attach_func_name); + return -EINVAL; + } + + return 0; +} + +static const struct bpf_func_proto * +bpf_sched_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) +{ + switch (func_id) { + case BPF_FUNC_trace_printk: + return bpf_get_trace_printk_proto(); + default: + return bpf_base_func_proto(func_id); + } +} + +const struct bpf_prog_ops bpf_sched_prog_ops = { +}; + +const struct bpf_verifier_ops bpf_sched_verifier_ops = { + .get_func_proto = bpf_sched_func_proto, + .is_valid_access = btf_ctx_access, +}; diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 4af3661414fa..c45d289ba04f 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -199,6 +199,7 @@ enum bpf_prog_type { BPF_PROG_TYPE_EXT, BPF_PROG_TYPE_LSM, BPF_PROG_TYPE_SK_LOOKUP, + BPF_PROG_TYPE_SCHED, }; enum bpf_attach_type { @@ -240,6 +241,7 @@ enum bpf_attach_type { BPF_XDP_CPUMAP, BPF_SK_LOOKUP, BPF_XDP, + BPF_SCHED, __MAX_BPF_ATTACH_TYPE }; -- Gitee From d2d3b8e656fe494a16cd4b7f869e89b88862a3c4 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Fri, 25 Nov 2022 11:56:10 +0800 Subject: [PATCH 02/23] bpf: sched: introduce bpf_sched_enable() maillist inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I5F6X6 CVE: NA Reference: https://lore.kernel.org/all/20210916162451.709260-1-guro@fb.com/ ------------------- Introduce a dedicated static key and the bpf_sched_enabled() wrapper to guard all invocations of bpf programs in the scheduler code. It will help to avoid any potential performance regression in a case when no scheduler bpf programs are attached. Signed-off-by: Roman Gushchin Signed-off-by: Chen Hui Signed-off-by: Ren Zhijie Signed-off-by: Hui Tang --- include/linux/bpf_sched.h | 24 ++++++++++++++++++++++++ kernel/bpf/syscall.c | 11 +++++++++++ kernel/sched/bpf_sched.c | 2 ++ 3 files changed, 37 insertions(+) diff --git a/include/linux/bpf_sched.h b/include/linux/bpf_sched.h index 874393e6a6aa..9cd2493d2787 100644 --- a/include/linux/bpf_sched.h +++ b/include/linux/bpf_sched.h @@ -6,6 +6,8 @@ #ifdef CONFIG_BPF_SCHED +#include + #define BPF_SCHED_HOOK(RET, DEFAULT, NAME, ...) 
\ RET bpf_sched_##NAME(__VA_ARGS__); #include @@ -14,6 +16,23 @@ int bpf_sched_verify_prog(struct bpf_verifier_log *vlog, const struct bpf_prog *prog); +DECLARE_STATIC_KEY_FALSE(bpf_sched_enabled_key); + +static inline bool bpf_sched_enabled(void) +{ + return static_branch_unlikely(&bpf_sched_enabled_key); +} + +static inline void bpf_sched_inc(void) +{ + static_branch_inc(&bpf_sched_enabled_key); +} + +static inline void bpf_sched_dec(void) +{ + static_branch_dec(&bpf_sched_enabled_key); +} + #else /* !CONFIG_BPF_SCHED */ static inline int bpf_sched_verify_prog(struct bpf_verifier_log *vlog, @@ -22,5 +41,10 @@ static inline int bpf_sched_verify_prog(struct bpf_verifier_log *vlog, return -EOPNOTSUPP; } +static inline bool bpf_sched_enabled(void) +{ + return false; +} + #endif /* CONFIG_BPF_SCHED */ #endif /* _LINUX_BPF_SCHED_H */ diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index ff65862ae5ce..2f4091da923f 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -31,6 +31,7 @@ #include #include #include +#include #define IS_FD_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || \ (map)->map_type == BPF_MAP_TYPE_CGROUP_ARRAY || \ @@ -2531,6 +2532,11 @@ static void bpf_tracing_link_release(struct bpf_link *link) struct bpf_tracing_link *tr_link = container_of(link, struct bpf_tracing_link, link); +#ifdef CONFIG_BPF_SCHED + if (link->prog->type == BPF_PROG_TYPE_SCHED) + bpf_sched_dec(); +#endif + WARN_ON_ONCE(bpf_trampoline_unlink_prog(link->prog, tr_link->trampoline)); @@ -2718,6 +2724,11 @@ static int bpf_tracing_prog_attach(struct bpf_prog *prog, goto out_unlock; } +#ifdef CONFIG_BPF_SCHED + if (prog->type == BPF_PROG_TYPE_SCHED) + bpf_sched_inc(); +#endif + link->tgt_prog = tgt_prog; link->trampoline = tr; diff --git a/kernel/sched/bpf_sched.c b/kernel/sched/bpf_sched.c index 2360404d4a07..e2525bd60abf 100644 --- a/kernel/sched/bpf_sched.c +++ b/kernel/sched/bpf_sched.c @@ -6,6 +6,8 @@ #include #include "sched.h" +DEFINE_STATIC_KEY_FALSE(bpf_sched_enabled_key); + /* * For every hook declare a nop function where a BPF program can be attached. */ -- Gitee From e63dc65700ef603c66a1c241899e3c1314e073e9 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Fri, 25 Nov 2022 11:56:11 +0800 Subject: [PATCH 03/23] libbpf: add support for scheduler bpf programs maillist inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I5F6X6 CVE: NA Reference: https://lore.kernel.org/all/20210916162451.709260-1-guro@fb.com/ ------------------- This patch adds a support for loading and attaching scheduler bpf programs. 
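As a rough usage sketch (not part of this patch): a scheduler bpf
program is placed in a "sched/" ELF section and attached with the new
bpf_program__attach_sched() API. The object and program names below,
and the use of the "dummy" stub hook from patch 01, are illustrative
assumptions.

  /* sched_prog.bpf.c -- minimal scheduler bpf program (sketch).
   * The section name "sched/dummy" resolves to the bpf_sched_dummy()
   * stub through the BTF_SCHED_PREFIX ("bpf_sched_") lookup added by
   * this patch. Sched programs must carry a GPL-compatible license,
   * which the verifier enforces. */
  #include "vmlinux.h"
  #include <bpf/bpf_helpers.h>
  #include <bpf/bpf_tracing.h>

  char _license[] SEC("license") = "GPL";

  SEC("sched/dummy")
  int BPF_PROG(sched_dummy)
  {
  	return 0;
  }

  /* loader.c -- open, load and attach the program (sketch). */
  #include <unistd.h>
  #include <bpf/libbpf.h>

  int main(void)
  {
  	struct bpf_object *obj = bpf_object__open_file("sched_prog.bpf.o", NULL);
  	struct bpf_program *prog;
  	struct bpf_link *link;

  	if (libbpf_get_error(obj) || bpf_object__load(obj))
  		return 1;
  	prog = bpf_object__find_program_by_title(obj, "sched/dummy");
  	if (!prog)
  		return 1;
  	link = bpf_program__attach_sched(prog);
  	if (libbpf_get_error(link))
  		return 1;
  	pause();	/* stay attached until interrupted */
  	bpf_link__destroy(link);
  	bpf_object__close(obj);
  	return 0;
  }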
Signed-off-by: Roman Gushchin Signed-off-by: Chen Hui Signed-off-by: Ren Zhijie Signed-off-by: Hui Tang --- tools/lib/bpf/libbpf.c | 30 +++++++++++++++++++++++++++--- tools/lib/bpf/libbpf.h | 4 ++++ tools/lib/bpf/libbpf.map | 3 +++ 3 files changed, 34 insertions(+), 3 deletions(-) diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 2531a2a0ff6c..c24f9c073362 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -2504,7 +2504,8 @@ static int bpf_object__finalize_btf(struct bpf_object *obj) static inline bool libbpf_prog_needs_vmlinux_btf(struct bpf_program *prog) { if (prog->type == BPF_PROG_TYPE_STRUCT_OPS || - prog->type == BPF_PROG_TYPE_LSM) + prog->type == BPF_PROG_TYPE_LSM || + prog->type == BPF_PROG_TYPE_SCHED) return true; /* BPF_PROG_TYPE_TRACING programs which do not attach to other programs @@ -6829,7 +6830,8 @@ int bpf_program__load(struct bpf_program *prog, char *license, __u32 kern_ver) if ((prog->type == BPF_PROG_TYPE_TRACING || prog->type == BPF_PROG_TYPE_LSM || - prog->type == BPF_PROG_TYPE_EXT) && !prog->attach_btf_id) { + prog->type == BPF_PROG_TYPE_EXT || + prog->type == BPF_PROG_TYPE_SCHED) && !prog->attach_btf_id) { btf_id = libbpf_find_attach_btf_id(prog); if (btf_id <= 0) return btf_id; @@ -8254,6 +8256,7 @@ BPF_PROG_TYPE_FNS(tracing, BPF_PROG_TYPE_TRACING); BPF_PROG_TYPE_FNS(struct_ops, BPF_PROG_TYPE_STRUCT_OPS); BPF_PROG_TYPE_FNS(extension, BPF_PROG_TYPE_EXT); BPF_PROG_TYPE_FNS(sk_lookup, BPF_PROG_TYPE_SK_LOOKUP); +BPF_PROG_TYPE_FNS(sched, BPF_PROG_TYPE_SCHED); enum bpf_attach_type bpf_program__get_expected_attach_type(struct bpf_program *prog) @@ -8318,6 +8321,8 @@ static struct bpf_link *attach_lsm(const struct bpf_sec_def *sec, struct bpf_program *prog); static struct bpf_link *attach_iter(const struct bpf_sec_def *sec, struct bpf_program *prog); +static struct bpf_link *attach_sched(const struct bpf_sec_def *sec, + struct bpf_program *prog); static const struct bpf_sec_def section_defs[] = { BPF_PROG_SEC("socket", BPF_PROG_TYPE_SOCKET_FILTER), @@ -8386,6 +8391,10 @@ static const struct bpf_sec_def section_defs[] = { .expected_attach_type = BPF_TRACE_ITER, .is_attach_btf = true, .attach_fn = attach_iter), + SEC_DEF("sched/", SCHED, + .is_attach_btf = true, + .expected_attach_type = BPF_SCHED, + .attach_fn = attach_sched), BPF_EAPROG_SEC("xdp_devmap/", BPF_PROG_TYPE_XDP, BPF_XDP_DEVMAP), BPF_EAPROG_SEC("xdp_cpumap/", BPF_PROG_TYPE_XDP, @@ -8469,7 +8478,7 @@ static const struct bpf_sec_def section_defs[] = { #undef BPF_APROG_COMPAT #undef SEC_DEF -#define MAX_TYPE_NAME_SIZE 32 +#define MAX_TYPE_NAME_SIZE 31 static const struct bpf_sec_def *find_sec_def(const char *sec_name) { @@ -8673,6 +8682,7 @@ static int bpf_object__collect_st_ops_relos(struct bpf_object *obj, #define BTF_TRACE_PREFIX "btf_trace_" #define BTF_LSM_PREFIX "bpf_lsm_" #define BTF_ITER_PREFIX "bpf_iter_" +#define BTF_SCHED_PREFIX "bpf_sched_" #define BTF_MAX_NAME_SIZE 128 static int find_btf_by_prefix_kind(const struct btf *btf, const char *prefix, @@ -8706,6 +8716,9 @@ static inline int __find_vmlinux_btf_id(struct btf *btf, const char *name, else if (attach_type == BPF_TRACE_ITER) err = find_btf_by_prefix_kind(btf, BTF_ITER_PREFIX, name, BTF_KIND_FUNC); + else if (attach_type == BPF_SCHED) + err = find_btf_by_prefix_kind(btf, BTF_SCHED_PREFIX, name, + BTF_KIND_FUNC); else err = btf__find_by_name_kind(btf, name, BTF_KIND_FUNC); @@ -9685,6 +9698,11 @@ struct bpf_link *bpf_program__attach_trace(struct bpf_program *prog) return bpf_program__attach_btf_id(prog); } +struct 
bpf_link *bpf_program__attach_sched(struct bpf_program *prog) +{ + return bpf_program__attach_btf_id(prog); +} + struct bpf_link *bpf_program__attach_lsm(struct bpf_program *prog) { return bpf_program__attach_btf_id(prog); @@ -9696,6 +9714,12 @@ static struct bpf_link *attach_trace(const struct bpf_sec_def *sec, return bpf_program__attach_trace(prog); } +static struct bpf_link *attach_sched(const struct bpf_sec_def *sec, + struct bpf_program *prog) +{ + return bpf_program__attach_sched(prog); +} + static struct bpf_link *attach_lsm(const struct bpf_sec_def *sec, struct bpf_program *prog) { diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h index 57d10b779dea..a011179f705d 100644 --- a/tools/lib/bpf/libbpf.h +++ b/tools/lib/bpf/libbpf.h @@ -264,6 +264,8 @@ bpf_program__attach_xdp(struct bpf_program *prog, int ifindex); LIBBPF_API struct bpf_link * bpf_program__attach_freplace(struct bpf_program *prog, int target_fd, const char *attach_func_name); +LIBBPF_API struct bpf_link * +bpf_program__attach_sched(struct bpf_program *prog); struct bpf_map; @@ -360,6 +362,7 @@ LIBBPF_API int bpf_program__set_tracing(struct bpf_program *prog); LIBBPF_API int bpf_program__set_struct_ops(struct bpf_program *prog); LIBBPF_API int bpf_program__set_extension(struct bpf_program *prog); LIBBPF_API int bpf_program__set_sk_lookup(struct bpf_program *prog); +LIBBPF_API int bpf_program__set_sched(struct bpf_program *prog); LIBBPF_API enum bpf_prog_type bpf_program__get_type(struct bpf_program *prog); LIBBPF_API void bpf_program__set_type(struct bpf_program *prog, @@ -388,6 +391,7 @@ LIBBPF_API bool bpf_program__is_tracing(const struct bpf_program *prog); LIBBPF_API bool bpf_program__is_struct_ops(const struct bpf_program *prog); LIBBPF_API bool bpf_program__is_extension(const struct bpf_program *prog); LIBBPF_API bool bpf_program__is_sk_lookup(const struct bpf_program *prog); +LIBBPF_API bool bpf_program__is_sched(const struct bpf_program *prog); /* * No need for __attribute__((packed)), all members of 'bpf_map_def' diff --git a/tools/lib/bpf/libbpf.map b/tools/lib/bpf/libbpf.map index 4ebfadf45b47..16393cea53d0 100644 --- a/tools/lib/bpf/libbpf.map +++ b/tools/lib/bpf/libbpf.map @@ -336,4 +336,7 @@ LIBBPF_0.2.0 { perf_buffer__epoll_fd; perf_buffer__consume_buffer; xsk_socket__create_shared; + bpf_program__attach_sched; + bpf_program__is_sched; + bpf_program__set_sched; } LIBBPF_0.1.0; -- Gitee From e0a2b74fdec3bdaaf9d3ddf5673c1e5276f845a7 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Fri, 25 Nov 2022 11:56:12 +0800 Subject: [PATCH 04/23] bpftool: recognize scheduler programs maillist inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I5F6X6 CVE: NA Reference: https://lore.kernel.org/all/20210916162451.709260-1-guro@fb.com/ ------------------- Teach bpftool to recognize scheduler bpf programs. Signed-off-by: Roman Gushchin Signed-off-by: Chen Hui Signed-off-by: Ren Zhijie Signed-off-by: Hui Tang --- tools/bpf/bpftool/common.c | 1 + tools/bpf/bpftool/prog.c | 1 + tools/lib/bpf/bpf.c | 3 ++- tools/lib/bpf/libbpf.c | 3 ++- 4 files changed, 6 insertions(+), 2 deletions(-) diff --git a/tools/bpf/bpftool/common.c b/tools/bpf/bpftool/common.c index 6ebf2b215ef4..81def197e774 100644 --- a/tools/bpf/bpftool/common.c +++ b/tools/bpf/bpftool/common.c @@ -66,6 +66,7 @@ const char * const attach_type_name[__MAX_BPF_ATTACH_TYPE] = { [BPF_MODIFY_RETURN] = "mod_ret", [BPF_LSM_MAC] = "lsm_mac", [BPF_SK_LOOKUP] = "sk_lookup", + [BPF_SCHED] = "sched", }; void p_err(const char *fmt, ...) 
diff --git a/tools/bpf/bpftool/prog.c b/tools/bpf/bpftool/prog.c
index 592536904dde..4e1d8e57d951 100644
--- a/tools/bpf/bpftool/prog.c
+++ b/tools/bpf/bpftool/prog.c
@@ -64,6 +64,7 @@ const char * const prog_type_name[] = {
 	[BPF_PROG_TYPE_EXT]			= "ext",
 	[BPF_PROG_TYPE_LSM]			= "lsm",
 	[BPF_PROG_TYPE_SK_LOOKUP]		= "sk_lookup",
+	[BPF_PROG_TYPE_SCHED]			= "sched",
 };

 const size_t prog_type_name_size = ARRAY_SIZE(prog_type_name);
diff --git a/tools/lib/bpf/bpf.c b/tools/lib/bpf/bpf.c
index d27e34133973..13e08c0d8b1a 100644
--- a/tools/lib/bpf/bpf.c
+++ b/tools/lib/bpf/bpf.c
@@ -236,7 +236,8 @@ int bpf_load_program_xattr(const struct bpf_load_program_attr *load_attr,
 	    attr.prog_type == BPF_PROG_TYPE_LSM) {
 		attr.attach_btf_id = load_attr->attach_btf_id;
 	} else if (attr.prog_type == BPF_PROG_TYPE_TRACING ||
-		   attr.prog_type == BPF_PROG_TYPE_EXT) {
+		   attr.prog_type == BPF_PROG_TYPE_EXT ||
+		   attr.prog_type == BPF_PROG_TYPE_SCHED) {
 		attr.attach_btf_id = load_attr->attach_btf_id;
 		attr.attach_prog_fd = load_attr->attach_prog_fd;
 	} else {
diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
index c24f9c073362..9dd29b39010c 100644
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -6723,7 +6723,8 @@ load_program(struct bpf_program *prog, struct bpf_insn *insns, int insns_cnt,
 	    prog->type == BPF_PROG_TYPE_LSM) {
 		load_attr.attach_btf_id = prog->attach_btf_id;
 	} else if (prog->type == BPF_PROG_TYPE_TRACING ||
-		   prog->type == BPF_PROG_TYPE_EXT) {
+		   prog->type == BPF_PROG_TYPE_EXT ||
+		   prog->type == BPF_PROG_TYPE_SCHED) {
 		load_attr.attach_prog_fd = prog->attach_prog_fd;
 		load_attr.attach_btf_id = prog->attach_btf_id;
 	} else {
--
Gitee

From 224980fd5760c75abb4b3e791fa3020a90f1cb4a Mon Sep 17 00:00:00 2001
From: Chen Hui
Date: Fri, 25 Nov 2022 11:56:13 +0800
Subject: [PATCH 05/23] sched: programmable: Add a tag for the task

hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I5KUFB
CVE: NA

--------------------------------

Add a tag for the task, which is useful to identify special tasks.
Users can use the file system interface to mark different tags for
specific workloads. Kernel subsystems can use the set_* helpers to
mark tasks as well. The bpf prog obtains the tags to detect different
workloads.
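As a rough illustration of the intended flow (the interfaces and
helpers referenced below are only added by later patches in this
series, the tag value is invented, and the hook signature is a
hypothetical stand-in -- the only hook so far, "dummy", takes no
arguments):

  /* Tag a task from userspace (patch 08 adds this proc file):
   *   echo 1 > /proc/<pid>/task/<pid>/tag
   * A scheduler bpf program can then branch on the tag using the
   * bpf_sched_task_tag_of() helper from patch 10. */
  #define TAG_ONLINE	1

  SEC("sched/dummy")
  int BPF_PROG(tag_aware_policy, struct task_struct *p)
  {
  	/* Favor tasks tagged as "online"; purely illustrative. */
  	return bpf_sched_task_tag_of(p) == TAG_ONLINE ? 1 : 0;
  }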
Signed-off-by: Chen Hui
Signed-off-by: Ren Zhijie
Signed-off-by: Hui Tang
---
 include/linux/sched.h | 5 +++++
 init/init_task.c      | 3 +++
 kernel/sched/core.c   | 3 +++
 3 files changed, 11 insertions(+)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 888e5cc2fb6a..fbd0f947643c 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1416,7 +1416,12 @@ struct task_struct {
 	KABI_RESERVE(4)
 	KABI_RESERVE(5)
 #endif
+#ifdef CONFIG_BPF_SCHED
+	/* Used to pad the tag of a task */
+	KABI_USE(6, long tag)
+#else
 	KABI_RESERVE(6)
+#endif
 	KABI_RESERVE(7)
 	KABI_RESERVE(8)
 	KABI_RESERVE(9)
diff --git a/init/init_task.c b/init/init_task.c
index 891007de2eef..49418809de26 100644
--- a/init/init_task.c
+++ b/init/init_task.c
@@ -216,6 +216,9 @@ struct task_struct init_task
 #endif
 #ifdef CONFIG_SECCOMP_FILTER
 	.seccomp	= { .filter_count = ATOMIC_INIT(0) },
+#endif
+#ifdef CONFIG_BPF_SCHED
+	.tag		= 0,
 #endif
 	._resvd = &init_task_struct_resvd,
 };
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 61b89cf69976..2c30aceed1e9 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3477,6 +3477,9 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
 #ifdef CONFIG_SMP
 	p->wake_entry.u_flags = CSD_TYPE_TTWU;
 #endif
+#ifdef CONFIG_BPF_SCHED
+	p->tag = 0;
+#endif
 }

 DEFINE_STATIC_KEY_FALSE(sched_numa_balancing);
--
Gitee

From 51f84117e775e3ebe41e680952c2803115a79412 Mon Sep 17 00:00:00 2001
From: Chen Hui
Date: Fri, 25 Nov 2022 12:03:57 +0800
Subject: [PATCH 06/23] sched: programmable: Add a tag for the task group

hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I5KUFB
CVE: NA

--------------------------------

Add a tag for the task group, to support the tag-based scheduling
mechanism. The tag is used to identify a special task or a type of
special tasks. There are many such tasks in the real world, for
example foreground and background tasks, or online and offline
tasks, etc. With the tag we can identify these special tasks and
execute specific policies for them.

Signed-off-by: Chen Hui
Signed-off-by: Ren Zhijie
Signed-off-by: Hui Tang
---
 kernel/sched/core.c  | 19 +++++++++++++++++++
 kernel/sched/sched.h |  5 +++++
 2 files changed, 24 insertions(+)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 2c30aceed1e9..3e22147307e4 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -8603,6 +8603,13 @@ static void sched_free_group(struct task_group *tg)
 	kmem_cache_free(task_group_cache, tg);
 }

+#ifdef CONFIG_BPF_SCHED
+static inline void tg_init_tag(struct task_group *tg, struct task_group *ptg)
+{
+	tg->tag = ptg->tag;
+}
+#endif
+
 /* allocate runqueue etc for a new task group */
 struct task_group *sched_create_group(struct task_group *parent)
 {
@@ -8623,6 +8630,10 @@ struct task_group *sched_create_group(struct task_group *parent)
 	if (!alloc_rt_sched_group(tg, parent))
 		goto err;

+#ifdef CONFIG_BPF_SCHED
+	tg_init_tag(tg, parent);
+#endif
+
 	alloc_uclamp_sched_group(tg, parent);

 	return tg;
@@ -8694,6 +8705,14 @@ static void sched_change_group(struct task_struct *tsk, int type)
 	sched_change_qos_group(tsk, tg);
 #endif

+#ifdef CONFIG_BPF_SCHED
+	/*
+	 * This function has cleared and restored the task status,
+	 * so we do not need to dequeue and enqueue the task again.
+ */ + tsk->tag = tg->tag; +#endif + #ifdef CONFIG_FAIR_GROUP_SCHED if (tsk->sched_class->task_change_group) tsk->sched_class->task_change_group(tsk, type); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index ad00ba5f6d82..49a10a4bd54c 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -454,7 +454,12 @@ struct task_group { struct uclamp_se uclamp[UCLAMP_CNT]; #endif +#ifdef CONFIG_BPF_SCHED + /* Used to pad the tag of a group */ + KABI_USE(1, long tag) +#else KABI_RESERVE(1) +#endif KABI_RESERVE(2) KABI_RESERVE(3) KABI_RESERVE(4) -- Gitee From 7a4a3fd97b1d4af74650a52e18a8eb1c222e97bc Mon Sep 17 00:00:00 2001 From: Chen Hui Date: Fri, 25 Nov 2022 12:03:58 +0800 Subject: [PATCH 07/23] sched: programmable: Add user interface of task group tag hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I5KUFB CVE: NA -------------------------------- Add user interface of task group tag, bridges the information gap between user-mode and kernel-mode. Signed-off-by: Chen Hui Signed-off-by: Ren Zhijie Signed-off-by: Hui Tang --- include/linux/sched.h | 4 +++ kernel/sched/core.c | 81 +++++++++++++++++++++++++++++++++++++++++++ kernel/sched/sched.h | 3 ++ 3 files changed, 88 insertions(+) diff --git a/include/linux/sched.h b/include/linux/sched.h index fbd0f947643c..5d7d037bb797 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2206,4 +2206,8 @@ static inline int sched_qos_cpu_overload(void) return 0; } #endif + +#ifdef CONFIG_BPF_SCHED +extern void sched_settag(struct task_struct *tsk, s64 tag); +#endif #endif diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 3e22147307e4..181453cf3e7a 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -9485,6 +9485,80 @@ static inline s64 cpu_qos_read(struct cgroup_subsys_state *css, } #endif +#ifdef CONFIG_BPF_SCHED +void sched_settag(struct task_struct *tsk, s64 tag) +{ + int queued, running, queue_flags = + DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK; + struct rq_flags rf; + struct rq *rq; + + if (tsk->tag == tag) + return; + + rq = task_rq_lock(tsk, &rf); + + running = task_current(rq, tsk); + queued = task_on_rq_queued(tsk); + + update_rq_clock(rq); + if (queued) + dequeue_task(rq, tsk, queue_flags); + if (running) + put_prev_task(rq, tsk); + + tsk->tag = tag; + + if (queued) + enqueue_task(rq, tsk, queue_flags); + if (running) + set_next_task(rq, tsk); + + task_rq_unlock(rq, tsk, &rf); +} + +int tg_change_tag(struct task_group *tg, void *data) +{ + struct css_task_iter it; + struct task_struct *tsk; + s64 tag = *(s64 *)data; + struct cgroup_subsys_state *css = &tg->css; + + tg->tag = tag; + + css_task_iter_start(css, 0, &it); + while ((tsk = css_task_iter_next(&it))) + sched_settag(tsk, tag); + css_task_iter_end(&it); + + return 0; +} + +static int cpu_tag_write(struct cgroup_subsys_state *css, + struct cftype *cftype, s64 tag) +{ + struct task_group *tg = css_tg(css); + + if (tg == &root_task_group) + return -EINVAL; + + if (tg->tag == tag) + return 0; + + rcu_read_lock(); + walk_tg_tree_from(tg, tg_change_tag, tg_nop, (void *)(&tag)); + rcu_read_unlock(); + + return 0; +} + +static inline s64 cpu_tag_read(struct cgroup_subsys_state *css, + struct cftype *cft) +{ + return css_tg(css)->tag; +} +#endif + static struct cftype cpu_legacy_files[] = { #ifdef CONFIG_FAIR_GROUP_SCHED { @@ -9546,6 +9620,13 @@ static struct cftype cpu_legacy_files[] = { .read_s64 = cpu_qos_read, .write_s64 = cpu_qos_write, }, +#endif +#ifdef CONFIG_BPF_SCHED + { + .name = "tag", + .read_s64 = 
cpu_tag_read, + .write_s64 = cpu_tag_write, + }, #endif { } /* Terminate */ }; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 49a10a4bd54c..0ab8e2532f2d 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -497,6 +497,9 @@ static inline int walk_tg_tree(tg_visitor down, tg_visitor up, void *data) } extern int tg_nop(struct task_group *tg, void *data); +#ifdef CONFIG_BPF_SCHED +extern int tg_change_tag(struct task_group *tg, void *data); +#endif extern void free_fair_sched_group(struct task_group *tg); extern int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent); -- Gitee From d49f78d572a2d6153bc6c20815241efd788e1f10 Mon Sep 17 00:00:00 2001 From: Chen Hui Date: Fri, 25 Nov 2022 11:56:16 +0800 Subject: [PATCH 08/23] sched: programmable: Add user interface of task tag hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I5KUFB CVE: NA -------------------------------- Add user interface of task tag, bridges the information gap between user-mode and kernel mode. To do: /proc/${pid}/task/${pid}/tag Signed-off-by: Chen Hui Signed-off-by: Ren Zhijie Signed-off-by: Hui Tang --- fs/proc/base.c | 65 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 65 insertions(+) diff --git a/fs/proc/base.c b/fs/proc/base.c index fe2633176ed5..98bfd18e61bc 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -3646,6 +3646,68 @@ static const struct inode_operations proc_tid_comm_inode_operations = { .permission = proc_tid_comm_permission, }; +#ifdef CONFIG_BPF_SCHED +static ssize_t pid_tag_write(struct file *file, const char __user *buf, + size_t count, loff_t *offset) +{ + struct inode *inode = file_inode(file); + struct task_struct *tsk; + char buffer[PROC_NUMBUF]; + int err = 0, tag = 0; + + tsk = get_proc_task(inode); + if (!tsk) + return -ESRCH; + + memset(buffer, 0, sizeof(buffer)); + if (count > sizeof(buffer) - 1) + count = sizeof(buffer) - 1; + + if (copy_from_user(buffer, buf, count)) { + err = -EFAULT; + goto out; + } + + err = kstrtoint(strstrip(buffer), 0, &tag); + if (err) + goto out; + + sched_settag(tsk, tag); + +out: + put_task_struct(tsk); + return err < 0 ? err : count; +} + +static int pid_tag_show(struct seq_file *m, void *v) +{ + struct inode *inode = m->private; + struct task_struct *tsk; + + tsk = get_proc_task(inode); + if (!tsk) + return -ESRCH; + + seq_printf(m, "%ld\n", tsk->tag); + put_task_struct(tsk); + + return 0; +} + +static int pid_tag_open(struct inode *inode, struct file *flip) +{ + return single_open(flip, pid_tag_show, inode); +} + +static const struct file_operations proc_pid_tag_operations = { + .open = pid_tag_open, + .read = seq_read, + .write = pid_tag_write, + .llseek = seq_lseek, + .release = single_release, +}; +#endif + /* * Tasks */ @@ -3755,6 +3817,9 @@ static const struct pid_entry tid_base_stuff[] = { #ifdef CONFIG_ASCEND_SHARE_POOL ONE("sp_group", 0444, proc_sp_group_state), #endif +#ifdef CONFIG_BPF_SCHED + REG("tag", 0644, proc_pid_tag_operations), +#endif }; static int proc_tid_base_readdir(struct file *file, struct dir_context *ctx) -- Gitee From cb9a695e495cb31cd1d8c4bd9b5e547fa4b003e7 Mon Sep 17 00:00:00 2001 From: Ren Zhijie Date: Fri, 25 Nov 2022 11:56:17 +0800 Subject: [PATCH 09/23] sched: programmable: add bpf_sched_tg_tag_of helper function hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I5KUFB CVE: NA -------------------------------- This helper function read the task group tag for a task. 
The bpf prog obtains the tags to detect different workloads. Signed-off-by: Ren Zhijie Signed-off-by: Chen Hui Signed-off-by: Hui Tang --- include/uapi/linux/bpf.h | 9 +++++++++ kernel/bpf/helpers.c | 3 +++ kernel/bpf/verifier.c | 4 ++-- kernel/sched/bpf_sched.c | 23 +++++++++++++++++++++++ scripts/bpf_helpers_doc.py | 2 ++ tools/include/uapi/linux/bpf.h | 9 +++++++++ 6 files changed, 48 insertions(+), 2 deletions(-) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 894cc46f3a54..58caaa161b5a 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -3757,6 +3757,14 @@ union bpf_attr { * Get Ipv4 origdst or replysrc. Works with IPv4. * Return * 0 on success, or a negative error in case of failure. + * + * long bpf_sched_tg_tag_of(struct task_group *tg) + * Description + * Return task group tag of *tg* if CONFIG_CGROUP_SCHED enabled. + * The bpf prog obtains the tags to detect different workloads. + * Return + * Task group tag, if CONFIG_CGROUP_SCHED enabled, 0 as default tag, or + * a negative error in case of failure. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3917,6 +3925,7 @@ union bpf_attr { FN(redirect_peer), \ FN(get_sockops_uid_gid), \ FN(sk_original_addr), \ + FN(sched_tg_tag_of), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 4bb5921a7d21..0b3bd94ec195 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -658,6 +658,7 @@ const struct bpf_func_proto bpf_probe_read_user_proto __weak; const struct bpf_func_proto bpf_probe_read_user_str_proto __weak; const struct bpf_func_proto bpf_probe_read_kernel_proto __weak; const struct bpf_func_proto bpf_probe_read_kernel_str_proto __weak; +const struct bpf_func_proto bpf_sched_tg_tag_of_proto __weak; const struct bpf_func_proto * bpf_base_func_proto(enum bpf_func_id func_id) @@ -697,6 +698,8 @@ bpf_base_func_proto(enum bpf_func_id func_id) return &bpf_ringbuf_discard_proto; case BPF_FUNC_ringbuf_query: return &bpf_ringbuf_query_proto; + case BPF_FUNC_sched_tg_tag_of: + return &bpf_sched_tg_tag_of_proto; default: break; } diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 082d7062a4aa..c91533a54121 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5019,10 +5019,10 @@ static bool check_btf_id_ok(const struct bpf_func_proto *fn) int i; for (i = 0; i < ARRAY_SIZE(fn->arg_type); i++) { - if (fn->arg_type[i] == ARG_PTR_TO_BTF_ID && !fn->arg_btf_id[i]) + if (base_type(fn->arg_type[i]) == ARG_PTR_TO_BTF_ID && !fn->arg_btf_id[i]) return false; - if (fn->arg_type[i] != ARG_PTR_TO_BTF_ID && fn->arg_btf_id[i]) + if (base_type(fn->arg_type[i]) != ARG_PTR_TO_BTF_ID && fn->arg_btf_id[i]) return false; } diff --git a/kernel/sched/bpf_sched.c b/kernel/sched/bpf_sched.c index e2525bd60abf..975c4f984856 100644 --- a/kernel/sched/bpf_sched.c +++ b/kernel/sched/bpf_sched.c @@ -62,3 +62,26 @@ const struct bpf_verifier_ops bpf_sched_verifier_ops = { .get_func_proto = bpf_sched_func_proto, .is_valid_access = btf_ctx_access, }; + +BPF_CALL_1(bpf_sched_tg_tag_of, struct task_group *, tg) +{ + int ret = 0; + +#ifdef CONFIG_CGROUP_SCHED + if (tg == NULL) + return -EINVAL; + ret = tg->tag; +#endif + + return ret; +} + +BTF_ID_LIST_SINGLE(btf_sched_tg_ids, struct, task_group) + +const struct bpf_func_proto bpf_sched_tg_tag_of_proto = { + .func = bpf_sched_tg_tag_of, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = PTR_MAYBE_NULL | ARG_PTR_TO_BTF_ID, + .arg1_btf_id = 
&btf_sched_tg_ids[0], +}; diff --git a/scripts/bpf_helpers_doc.py b/scripts/bpf_helpers_doc.py index 31484377b8b1..40b845cd5126 100755 --- a/scripts/bpf_helpers_doc.py +++ b/scripts/bpf_helpers_doc.py @@ -435,6 +435,7 @@ class PrinterHelpers(Printer): 'struct xdp_md', 'struct path', 'struct btf_ptr', + 'struct task_group', ] known_types = { '...', @@ -478,6 +479,7 @@ class PrinterHelpers(Printer): 'struct task_struct', 'struct path', 'struct btf_ptr', + 'struct task_group', } mapped_types = { 'u8': '__u8', diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index c45d289ba04f..49fea9655920 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -3757,6 +3757,14 @@ union bpf_attr { * Get Ipv4 origdst or replysrc. Works with IPv4. * Return * 0 on success, or a negative error in case of failure. + * + * long bpf_sched_tg_tag_of(struct task_group *tg) + * Description + * Return task group tag of *tg* if CONFIG_CGROUP_SCHED enabled. + * The bpf prog obtains the tags to detect different workloads. + * Return + * Task group tag, if CONFIG_CGROUP_SCHED enabled, 0 as default tag, or + * a negative error in case of failure. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3917,6 +3925,7 @@ union bpf_attr { FN(redirect_peer), \ FN(get_sockops_uid_gid), \ FN(sk_original_addr), \ + FN(sched_tg_tag_of), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper -- Gitee From 846c800429433bb100fd8ef359a706acfe530f06 Mon Sep 17 00:00:00 2001 From: Chen Hui Date: Fri, 25 Nov 2022 12:13:02 +0800 Subject: [PATCH 10/23] sched: programmable: add bpf_sched_task_tag_of helper function hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I5KUFB CVE: NA -------------------------------- This helper function read the tag of the struct task. The bpf prog obtains the tags to detect different workloads. Signed-off-by: Chen Hui Signed-off-by: Ren Zhijie Signed-off-by: Hui Tang --- include/uapi/linux/bpf.h | 8 ++++++++ kernel/bpf/helpers.c | 3 +++ kernel/sched/bpf_sched.c | 17 +++++++++++++++++ tools/include/uapi/linux/bpf.h | 8 ++++++++ 4 files changed, 36 insertions(+) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 58caaa161b5a..d536d0373063 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -3765,6 +3765,13 @@ union bpf_attr { * Return * Task group tag, if CONFIG_CGROUP_SCHED enabled, 0 as default tag, or * a negative error in case of failure. + * + * long bpf_sched_task_tag_of(struct task_struct *tsk) + * Description + * Return task tag of *tsk*.The bpf prog obtains the tags to detect + * different workloads. + * Return + * Task tag, if used, 0 as default tag, or a negative error in case of failure. 
 */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -3926,6 +3933,7 @@ union bpf_attr {
 	FN(redirect_peer),		\
 	FN(get_sockops_uid_gid),	\
 	FN(sk_original_addr),		\
 	FN(sched_tg_tag_of),		\
+	FN(sched_task_tag_of),		\
 	/* */

 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 0b3bd94ec195..ef2d8cb87807 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -659,6 +659,7 @@ const struct bpf_func_proto bpf_probe_read_user_str_proto __weak;
 const struct bpf_func_proto bpf_probe_read_kernel_proto __weak;
 const struct bpf_func_proto bpf_probe_read_kernel_str_proto __weak;
 const struct bpf_func_proto bpf_sched_tg_tag_of_proto __weak;
+const struct bpf_func_proto bpf_sched_task_tag_of_proto __weak;

 const struct bpf_func_proto *
 bpf_base_func_proto(enum bpf_func_id func_id)
@@ -700,6 +701,8 @@ bpf_base_func_proto(enum bpf_func_id func_id)
 		return &bpf_ringbuf_query_proto;
 	case BPF_FUNC_sched_tg_tag_of:
 		return &bpf_sched_tg_tag_of_proto;
+	case BPF_FUNC_sched_task_tag_of:
+		return &bpf_sched_task_tag_of_proto;
 	default:
 		break;
 	}
diff --git a/kernel/sched/bpf_sched.c b/kernel/sched/bpf_sched.c
index 975c4f984856..ad42e521cee3 100644
--- a/kernel/sched/bpf_sched.c
+++ b/kernel/sched/bpf_sched.c
@@ -85,3 +85,20 @@ const struct bpf_func_proto bpf_sched_tg_tag_of_proto = {
 	.arg1_type	= PTR_MAYBE_NULL | ARG_PTR_TO_BTF_ID,
 	.arg1_btf_id	= &btf_sched_tg_ids[0],
 };
+
+BPF_CALL_1(bpf_sched_task_tag_of, struct task_struct *, tsk)
+{
+	if (tsk == NULL)
+		return -EINVAL;
+	return tsk->tag;
+}
+
+BTF_ID_LIST_SINGLE(btf_sched_task_ids, struct, task_struct)
+
+const struct bpf_func_proto bpf_sched_task_tag_of_proto = {
+	.func		= bpf_sched_task_tag_of,
+	.gpl_only	= false,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= PTR_MAYBE_NULL | ARG_PTR_TO_BTF_ID,
+	.arg1_btf_id	= &btf_sched_task_ids[0],
+};
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 49fea9655920..ae938aef7532 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -3765,6 +3765,13 @@ union bpf_attr {
  *	Return
  *		Task group tag, if CONFIG_CGROUP_SCHED enabled, 0 as default tag, or
  *		a negative error in case of failure.
+ *
+ * long bpf_sched_task_tag_of(struct task_struct *tsk)
+ *	Description
+ *		Return task tag of *tsk*. The bpf prog obtains the tags to detect
+ *		different workloads.
+ *	Return
+ *		Task tag, if used, 0 as default tag, or a negative error in case of failure.
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -3926,6 +3933,7 @@ union bpf_attr {
 	FN(get_sockops_uid_gid),	\
 	FN(sk_original_addr),		\
 	FN(sched_tg_tag_of),		\
+	FN(sched_task_tag_of),		\
 	/* */

 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
--
Gitee

From 7cdbdbf5b6c12b1bff2ffb6b4bf08910ecf1689a Mon Sep 17 00:00:00 2001
From: Ren Zhijie
Date: Fri, 25 Nov 2022 11:56:19 +0800
Subject: [PATCH 11/23] sched: programmable: Add helpers to set tag of task or
 task_group

hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I5KUFB
CVE: NA

--------------------------------

Add the helper functions bpf_sched_set_tg_tag() and
bpf_sched_set_task_tag() to set the tag of a task group or a task.
They must not be called while rq->lock is held.

The use case is that other kernel subsystems, such as the network
stack, can use them to mark key tasks.
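A sketch of the intended use from another subsystem's bpf program
(the fentry attach point, the bpf_get_current_task_btf() helper and
the tag value are assumptions, not part of this series). Because
sched_settag() takes task_rq_lock() internally, these helpers must
run where rq->lock is not held; an fentry probe like this qualifies,
while a scheduler hook invoked under rq->lock would not:

  #define TAG_NET_KEY	2

  SEC("fentry/tcp_connect")
  int BPF_PROG(mark_net_task, struct sock *sk)
  {
  	/* tcp_connect() runs in the context of the connecting task,
  	 * so tag current as a key network task. */
  	struct task_struct *p = bpf_get_current_task_btf();

  	return bpf_sched_set_task_tag(p, TAG_NET_KEY);
  }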
Signed-off-by: Ren Zhijie Signed-off-by: Hui Tang --- include/uapi/linux/bpf.h | 14 +++++++++++ kernel/bpf/helpers.c | 6 +++++ kernel/sched/bpf_sched.c | 45 ++++++++++++++++++++++++++++++++++ tools/include/uapi/linux/bpf.h | 14 +++++++++++ 4 files changed, 79 insertions(+) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index d536d0373063..f3de7d387d5d 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -3772,6 +3772,18 @@ union bpf_attr { * different workloads. * Return * Task tag, if used, 0 as default tag, or a negative error in case of failure. + * + * int bpf_sched_set_tg_tag(struct task_group *tg, s64 tag) + * Description + * Set tag to *tg* and its descendants. + * Return + * 0 on success, or a negative error in case of failure. + * + * int bpf_sched_set_task_tag(struct task_struct *tsk, s64 tag) + * Description + * Set tag to *tsk*. + * Return + * 0 on success, or a negative error in case of failure. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3934,6 +3946,8 @@ union bpf_attr { FN(sk_original_addr), \ FN(sched_tg_tag_of), \ FN(sched_task_tag_of), \ + FN(sched_set_tg_tag), \ + FN(sched_set_task_tag), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index ef2d8cb87807..5fccf33196b5 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -660,6 +660,8 @@ const struct bpf_func_proto bpf_probe_read_kernel_proto __weak; const struct bpf_func_proto bpf_probe_read_kernel_str_proto __weak; const struct bpf_func_proto bpf_sched_tg_tag_of_proto __weak; const struct bpf_func_proto bpf_sched_task_tag_of_proto __weak; +const struct bpf_func_proto bpf_sched_set_tg_tag_proto __weak; +const struct bpf_func_proto bpf_sched_set_task_tag_proto __weak; const struct bpf_func_proto * bpf_base_func_proto(enum bpf_func_id func_id) @@ -721,6 +723,10 @@ bpf_base_func_proto(enum bpf_func_id func_id) return &bpf_per_cpu_ptr_proto; case BPF_FUNC_this_cpu_ptr: return &bpf_this_cpu_ptr_proto; + case BPF_FUNC_sched_set_tg_tag: + return &bpf_sched_set_tg_tag_proto; + case BPF_FUNC_sched_set_task_tag: + return &bpf_sched_set_task_tag_proto; default: break; } diff --git a/kernel/sched/bpf_sched.c b/kernel/sched/bpf_sched.c index ad42e521cee3..31a76cf337e6 100644 --- a/kernel/sched/bpf_sched.c +++ b/kernel/sched/bpf_sched.c @@ -102,3 +102,48 @@ const struct bpf_func_proto bpf_sched_task_tag_of_proto = { .arg1_type = PTR_MAYBE_NULL | ARG_PTR_TO_BTF_ID, .arg1_btf_id = &btf_sched_task_ids[0], }; + +BPF_CALL_2(bpf_sched_set_tg_tag, struct task_group *, tg, s64, tag) +{ +#if CONFIG_CGROUP_SCHED + if (tg == NULL || tg == &root_task_group) + return -EINVAL; + + if (tg->tag == tag) + return 0; + + rcu_read_lock(); + walk_tg_tree_from(tg, tg_change_tag, tg_nop, (void *)(&tag)); + rcu_read_unlock(); + + return 0; +#endif + return -EPERM; +} + +const struct bpf_func_proto bpf_sched_set_tg_tag_proto = { + .func = bpf_sched_set_tg_tag, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = PTR_MAYBE_NULL | ARG_PTR_TO_BTF_ID, + .arg1_btf_id = &btf_sched_tg_ids[0], + .arg2_type = ARG_ANYTHING, +}; + +BPF_CALL_2(bpf_sched_set_task_tag, struct task_struct *, tsk, s64, tag) +{ + if (tsk == NULL) + return -EINVAL; + + sched_settag(tsk, tag); + return 0; +} + +const struct bpf_func_proto bpf_sched_set_task_tag_proto = { + .func = bpf_sched_set_task_tag, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = PTR_MAYBE_NULL | ARG_PTR_TO_BTF_ID, + .arg1_btf_id = 
&btf_sched_task_ids[0],
+	.arg2_type	= ARG_ANYTHING,
+};
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index ae938aef7532..5249add91b7c 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -3772,6 +3772,18 @@ union bpf_attr {
  *		different workloads.
  *	Return
  *		Task tag, if used, 0 as default tag, or a negative error in case of failure.
+ *
+ * int bpf_sched_set_tg_tag(struct task_group *tg, s64 tag)
+ *	Description
+ *		Set tag to *tg* and its descendants.
+ *	Return
+ *		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_sched_set_task_tag(struct task_struct *tsk, s64 tag)
+ *	Description
+ *		Set tag to *tsk*.
+ *	Return
+ *		0 on success, or a negative error in case of failure.
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -3946,6 +3946,8 @@ union bpf_attr {
 	FN(sk_original_addr),		\
 	FN(sched_tg_tag_of),		\
 	FN(sched_task_tag_of),		\
+	FN(sched_set_tg_tag),		\
+	FN(sched_set_task_tag),		\
 	/* */

 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
--
Gitee

From 3c2d0f4d8e88d5ea5d05e62577db32210388ed1e Mon Sep 17 00:00:00 2001
From: Chen Hui
Date: Fri, 25 Nov 2022 12:05:42 +0800
Subject: [PATCH 12/23] bpf: sched: Add helper functions to get cpu statistics

hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I5KUFB
CVE: NA

--------------------------------

Add the helper functions to get cpu statistics, as follows:
1. acquire cfs/rt/irq cpu load statistics.
2. acquire multiple types of nr_running statistics.
3. acquire cpu idle statistics.
4. acquire cpu capacity.

Based on CPU statistics in different dimensions, specific scheduling
policies can be implemented in a bpf program.

Signed-off-by: Chen Hui
Signed-off-by: Hui Tang
Signed-off-by: Ren Zhijie
Signed-off-by: Hui Tang
---
 include/linux/sched.h          | 32 ++++++++++++++
 include/uapi/linux/bpf.h       |  7 ++++
 kernel/sched/bpf_sched.c       | 70 ++++++++++++++++++++++++++++++++
 scripts/bpf_helpers_doc.py     |  2 +
 tools/include/uapi/linux/bpf.h |  7 ++++
 5 files changed, 118 insertions(+)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 5d7d037bb797..1bccbf4040b3 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2209,5 +2209,37 @@ static inline int sched_qos_cpu_overload(void)

 #ifdef CONFIG_BPF_SCHED
 extern void sched_settag(struct task_struct *tsk, s64 tag);
+
+struct bpf_sched_cpu_stats {
+	/* load/util */
+	unsigned long cfs_load_avg;
+	unsigned long cfs_runnable_avg;
+	unsigned long cfs_util_avg;
+	unsigned long rt_load_avg;
+	unsigned long rt_runnable_avg;
+	unsigned long rt_util_avg;
+	unsigned long irq_load_avg;
+	unsigned long irq_runnable_avg;
+	unsigned long irq_util_avg;
+
+	/* nr_running */
+	unsigned int nr_running;
+	unsigned int cfs_nr_running;
+	unsigned int cfs_h_nr_running;
+	unsigned int cfs_idle_h_nr_running;
+	unsigned int rt_nr_running;
+	unsigned int rr_nr_running;
+
+	/* idle statistics */
+	int available_idle;
+	unsigned int exit_latency;
+	unsigned long idle_stamp;
+	unsigned long avg_idle;
+
+	/* capacity */
+	unsigned long capacity;
+	unsigned long capacity_orig;
+};
+
 #endif
 #endif
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index f3de7d387d5d..6d0cdc618f2f 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -3784,6 +3784,12 @@ union bpf_attr {
  *		Set tag to *tsk*.
  *	Return
  *		0 on success, or a negative error in case of failure.
+ * + * int bpf_sched_cpu_stats_of(int cpu, struct bpf_sched_cpu_stats *ctx, int len) + * Description + * Get multiple types of *cpu* statistics and store in *ctx*. + * Return + * 0 on success, or a negative error in case of failure. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3948,6 +3954,7 @@ union bpf_attr { FN(sched_task_tag_of), \ FN(sched_set_tg_tag), \ FN(sched_set_task_tag), \ + FN(sched_cpu_stats_of), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper diff --git a/kernel/sched/bpf_sched.c b/kernel/sched/bpf_sched.c index 31a76cf337e6..26954d74b0e9 100644 --- a/kernel/sched/bpf_sched.c +++ b/kernel/sched/bpf_sched.c @@ -44,12 +44,82 @@ int bpf_sched_verify_prog(struct bpf_verifier_log *vlog, return 0; } +BPF_CALL_3(bpf_sched_cpu_stats_of, int, cpu, + struct bpf_sched_cpu_stats *, ctx, + int, len) +{ + struct cpuidle_state *idle; + struct rq *rq; + + if (len != sizeof(*ctx)) + return -EINVAL; + + if ((unsigned int)cpu >= nr_cpu_ids) + return -EINVAL; + + rq = cpu_rq(cpu); + memset(ctx, 0, sizeof(struct bpf_sched_cpu_stats)); + + /* load/util */ +#ifdef CONFIG_SMP + SCHED_WARN_ON(!rcu_read_lock_held()); + ctx->cfs_load_avg = rq->cfs.avg.load_avg; + ctx->cfs_runnable_avg = rq->cfs.avg.runnable_avg; + ctx->cfs_util_avg = rq->cfs.avg.util_avg; + ctx->rt_load_avg = rq->avg_rt.load_avg; + ctx->rt_runnable_avg = rq->avg_rt.runnable_avg; + ctx->rt_util_avg = rq->avg_rt.util_avg; +#ifdef CONFIG_HAVE_SCHED_AVG_IRQ + ctx->irq_load_avg = rq->avg_irq.load_avg; + ctx->irq_runnable_avg = rq->avg_irq.runnable_avg; + ctx->irq_util_avg = rq->avg_irq.util_avg; +#endif +#endif + + /* nr_running */ + ctx->nr_running = rq->nr_running; + ctx->cfs_nr_running = rq->cfs.nr_running; + ctx->cfs_h_nr_running = rq->cfs.h_nr_running; + ctx->cfs_idle_h_nr_running = rq->cfs.idle_h_nr_running; + ctx->rt_nr_running = rq->rt.rt_nr_running; + ctx->rr_nr_running = rq->rt.rr_nr_running; + + /* idle statistics */ + ctx->available_idle = available_idle_cpu(cpu); + idle = idle_get_state(rq); + if (idle) + ctx->exit_latency = idle->exit_latency; +#ifdef CONFIG_SMP + ctx->idle_stamp = rq->idle_stamp; + ctx->avg_idle = rq->avg_idle; +#endif + + /* capacity */ +#ifdef CONFIG_SMP + ctx->capacity = rq->cpu_capacity; + ctx->capacity_orig = rq->cpu_capacity_orig; +#endif + + return 0; +} + +static const struct bpf_func_proto bpf_sched_cpu_stats_of_proto = { + .func = bpf_sched_cpu_stats_of, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_ANYTHING, + .arg2_type = ARG_PTR_TO_UNINIT_MEM, + .arg3_type = ARG_CONST_SIZE, +}; + static const struct bpf_func_proto * bpf_sched_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { case BPF_FUNC_trace_printk: return bpf_get_trace_printk_proto(); + case BPF_FUNC_sched_cpu_stats_of: + return &bpf_sched_cpu_stats_of_proto; default: return bpf_base_func_proto(func_id); } diff --git a/scripts/bpf_helpers_doc.py b/scripts/bpf_helpers_doc.py index 40b845cd5126..383fea7d1d18 100755 --- a/scripts/bpf_helpers_doc.py +++ b/scripts/bpf_helpers_doc.py @@ -436,6 +436,7 @@ class PrinterHelpers(Printer): 'struct path', 'struct btf_ptr', 'struct task_group', + 'struct bpf_sched_cpu_stats', ] known_types = { '...', @@ -480,6 +481,7 @@ class PrinterHelpers(Printer): 'struct path', 'struct btf_ptr', 'struct task_group', + 'struct bpf_sched_cpu_stats', } mapped_types = { 'u8': '__u8', diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 5249add91b7c..1e91aea266ac 100644 --- 
a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -3784,6 +3784,12 @@ union bpf_attr { * Set tag to *tsk*. * Return * 0 on success, or a negative error in case of failure. + * + * int bpf_sched_cpu_stats_of(int cpu, struct bpf_sched_cpu_stats *ctx, int len) + * Description + * Get multiple types of *cpu* statistics and store in *ctx*. + * Return + * 0 on success, or a negative error in case of failure. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3948,6 +3954,7 @@ union bpf_attr { FN(sched_task_tag_of), \ FN(sched_set_tg_tag), \ FN(sched_set_task_tag), \ + FN(sched_cpu_stats_of), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper -- Gitee From f333bd6882e79c7082fb9ce56022319be38f9ee1 Mon Sep 17 00:00:00 2001 From: Ren Zhijie Date: Tue, 29 Nov 2022 19:11:13 +0800 Subject: [PATCH 13/23] sched: programmable: Add helper function for cpu topology. hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I5KUFB CVE: NA -------------------------------- Add bpf helper function bpf_init_cpu_topology() which obtains cpu topology info through the macros topology_* that are defined by include/linux/topology.h, and save it in BPF MAP. The cpu topology info are useful to select core in userspace. Signed-off-by: Chen Hui Signed-off-by: Ren Zhijie Signed-off-by: Hui Tang --- include/linux/bpf_topology.h | 46 ++++++++++++++++++ include/uapi/linux/bpf.h | 14 ++++++ kernel/sched/Makefile | 1 + kernel/sched/bpf_sched.c | 8 +++ kernel/sched/bpf_topology.c | 89 ++++++++++++++++++++++++++++++++++ scripts/bpf_helpers_doc.py | 4 ++ tools/include/uapi/linux/bpf.h | 14 ++++++ 7 files changed, 176 insertions(+) create mode 100644 include/linux/bpf_topology.h create mode 100644 kernel/sched/bpf_topology.c diff --git a/include/linux/bpf_topology.h b/include/linux/bpf_topology.h new file mode 100644 index 000000000000..0c7ee492edde --- /dev/null +++ b/include/linux/bpf_topology.h @@ -0,0 +1,46 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +/* + * Copyright (C) 2022. Huawei Technologies Co., Ltd. All rights reserved. + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 and + * only version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ */ + +#ifndef _LINUX_BPF_TOPOLOGY_H +#define _LINUX_BPF_TOPOLOGY_H + +#include + +struct bpf_cpu_topology { + int cpu; + int core_id; + int cluster_id; + int die_id; + int physical_package_id; + int numa_node; + struct cpumask thread_siblings; + struct cpumask core_siblings; + struct cpumask cluster_cpus; + struct cpumask die_cpus; + struct cpumask package_cpus; + struct cpumask node_cpu_lists; +}; + +struct bpf_cpumask_info { + unsigned int nums_possible_cpus; + unsigned int nums_active_cpus; + unsigned int nums_isolate_cpus; + unsigned int nr_cpu_ids; + unsigned int bpf_nr_cpumask_bits; + struct cpumask cpu_possible_cpumask; + struct cpumask cpu_active_cpumask; + struct cpumask cpu_isolate_cpumask; +}; + +#endif /* _LINUX_BPF_TOPOLOGY_H */ diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 6d0cdc618f2f..94f563c925ad 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -3790,6 +3790,18 @@ union bpf_attr { * Get multiple types of *cpu* statistics and store in *ctx*. * Return * 0 on success, or a negative error in case of failure. + * + * long bpf_init_cpu_topology(struct bpf_map *map) + * Description + * Initializing the cpu topology which used for bpf prog. + * Return + * 0 on success, or a negative error in case of failure. + * + * int bpf_get_cpumask_info(struct bpf_map *map, struct bpf_cpumask_info *cpus) + * Description + * Get system cpus returned in *cpus*. + * Return + * 0 on success, or a negative error in case of failure. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3955,6 +3967,8 @@ union bpf_attr { FN(sched_set_tg_tag), \ FN(sched_set_task_tag), \ FN(sched_cpu_stats_of), \ + FN(init_cpu_topology), \ + FN(get_cpumask_info), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index a67401cc4d5d..6f3106774d05 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile @@ -38,3 +38,4 @@ obj-$(CONFIG_CPU_ISOLATION) += isolation.o obj-$(CONFIG_PSI) += psi.o obj-$(CONFIG_SCHED_CORE) += core_sched.o obj-$(CONFIG_BPF_SCHED) += bpf_sched.o +obj-$(CONFIG_BPF_SCHED) += bpf_topology.o diff --git a/kernel/sched/bpf_sched.c b/kernel/sched/bpf_sched.c index 26954d74b0e9..cf53bead8313 100644 --- a/kernel/sched/bpf_sched.c +++ b/kernel/sched/bpf_sched.c @@ -4,6 +4,7 @@ #include #include #include +#include #include "sched.h" DEFINE_STATIC_KEY_FALSE(bpf_sched_enabled_key); @@ -26,6 +27,9 @@ BTF_SET_START(bpf_sched_hooks) #undef BPF_SCHED_HOOK BTF_SET_END(bpf_sched_hooks) +const struct bpf_func_proto bpf_init_cpu_topology_proto __weak; +const struct bpf_func_proto bpf_get_cpumask_info_proto __weak; + int bpf_sched_verify_prog(struct bpf_verifier_log *vlog, const struct bpf_prog *prog) { @@ -120,6 +124,10 @@ bpf_sched_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return bpf_get_trace_printk_proto(); case BPF_FUNC_sched_cpu_stats_of: return &bpf_sched_cpu_stats_of_proto; + case BPF_FUNC_init_cpu_topology: + return &bpf_init_cpu_topology_proto; + case BPF_FUNC_get_cpumask_info: + return &bpf_get_cpumask_info_proto; default: return bpf_base_func_proto(func_id); } diff --git a/kernel/sched/bpf_topology.c b/kernel/sched/bpf_topology.c new file mode 100644 index 000000000000..a843aebd0c14 --- /dev/null +++ b/kernel/sched/bpf_topology.c @@ -0,0 +1,89 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* + * Copyright (C) 2022. Huawei Technologies Co., Ltd. All rights reserved. 
+ * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 and + * only version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#include +#include +#include +#include +#include +#include +#include + +static void bpf_update_cpu_topology(struct bpf_cpu_topology *cpu_topology, int cpu) +{ + cpu_topology->cpu = cpu; + cpu_topology->core_id = topology_core_id(cpu); + cpu_topology->cluster_id = topology_cluster_id(cpu); + cpu_topology->die_id = topology_die_id(cpu); + cpu_topology->physical_package_id = topology_physical_package_id(cpu); + cpu_topology->numa_node = cpu_to_node(cpu); + cpumask_copy(&cpu_topology->thread_siblings, topology_sibling_cpumask(cpu)); + cpumask_copy(&cpu_topology->core_siblings, topology_core_cpumask(cpu)); + cpumask_copy(&cpu_topology->cluster_cpus, topology_cluster_cpumask(cpu)); + cpumask_copy(&cpu_topology->die_cpus, topology_die_cpumask(cpu)); + cpumask_copy(&cpu_topology->package_cpus, topology_core_cpumask(cpu)); + cpumask_copy(&cpu_topology->node_cpu_lists, cpumask_of_node(cpu_to_node(cpu))); +} + +BPF_CALL_1(bpf_init_cpu_topology, struct bpf_map *, map) +{ + const struct cpumask *cpu_map = cpu_active_mask; + struct bpf_cpu_topology *topo; + int i = -1; + + WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_bh_held()); + for_each_cpu(i, cpu_map) { + topo = map->ops->map_lookup_elem(map, &i); + if (!topo) + return -ENOMEM; + + bpf_update_cpu_topology(topo, i); + } + + return 0; +} + +BTF_ID_LIST_SINGLE(bpf_cpu_topology_ids, struct, bpf_cpu_topology) + +const struct bpf_func_proto bpf_init_cpu_topology_proto = { + .func = bpf_init_cpu_topology, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_CONST_MAP_PTR, +}; + +BPF_CALL_2(bpf_get_cpumask_info, struct bpf_map *, map, struct bpf_cpumask_info *, cpus) +{ + if (!cpus) + return -EINVAL; + + cpumask_copy(&cpus->cpu_possible_cpumask, cpu_possible_mask); + cpumask_copy(&cpus->cpu_active_cpumask, cpu_active_mask); + cpumask_copy(&cpus->cpu_isolate_cpumask, housekeeping_cpumask(HK_FLAG_DOMAIN)); + cpus->nums_possible_cpus = num_possible_cpus(); + cpus->nums_active_cpus = num_active_cpus(); + cpus->nums_isolate_cpus = cpumask_weight(&cpus->cpu_isolate_cpumask); + cpus->nr_cpu_ids = nr_cpu_ids; + cpus->bpf_nr_cpumask_bits = nr_cpumask_bits; + + return 0; +} + +const struct bpf_func_proto bpf_get_cpumask_info_proto = { + .func = bpf_get_cpumask_info, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_CONST_MAP_PTR, + .arg2_type = ARG_PTR_TO_MAP_VALUE_OR_NULL, +}; diff --git a/scripts/bpf_helpers_doc.py b/scripts/bpf_helpers_doc.py index 383fea7d1d18..344ec8093e8b 100755 --- a/scripts/bpf_helpers_doc.py +++ b/scripts/bpf_helpers_doc.py @@ -437,6 +437,8 @@ class PrinterHelpers(Printer): 'struct btf_ptr', 'struct task_group', 'struct bpf_sched_cpu_stats', + 'struct bpf_cpu_topology', + 'struct bpf_cpumask_info', ] known_types = { '...', @@ -482,6 +484,8 @@ class PrinterHelpers(Printer): 'struct btf_ptr', 'struct task_group', 'struct bpf_sched_cpu_stats', + 'struct bpf_cpu_topology', + 'struct bpf_cpumask_info', } mapped_types = { 'u8': '__u8', diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 1e91aea266ac..d111513b9241 
100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -3790,6 +3790,18 @@ union bpf_attr {
 *		Get multiple types of *cpu* statistics and store in *ctx*.
 *	Return
 *		0 on success, or a negative error in case of failure.
+ *
+ * long bpf_init_cpu_topology(struct bpf_map *map)
+ *	Description
+ *		Initialize the cpu topology used by bpf progs.
+ *	Return
+ *		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_get_cpumask_info(struct bpf_map *map, struct bpf_cpumask_info *cpus)
+ *	Description
+ *		Get system cpu masks and counts and return them in *cpus*.
+ *	Return
+ *		0 on success, or a negative error in case of failure.
 */
#define __BPF_FUNC_MAPPER(FN) \
	FN(unspec), \
@@ -3955,6 +3967,8 @@ union bpf_attr {
	FN(sched_set_tg_tag), \
	FN(sched_set_task_tag), \
	FN(sched_cpu_stats_of), \
+	FN(init_cpu_topology), \
+	FN(get_cpumask_info), \
	/* */

/* integer value in 'imm' field of BPF_CALL instruction selects which helper

-- Gitee

From 28b91df0178b108df4cc6ded1dfb68b8c39f5476 Mon Sep 17 00:00:00 2001
From: Chen Hui
Date: Fri, 25 Nov 2022 11:56:22 +0800
Subject: [PATCH 14/23] sched: programmable: Add convenient helper functions to
 convert sched entity

hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I5KUFB
CVE: NA

--------------------------------

Add three helper functions:
1) bpf_sched_entity_is_task checks whether the sched entity is a
   task struct.
2) bpf_sched_entity_to_task converts the sched entity to a task struct.
3) bpf_sched_entity_to_tg converts the sched entity to a task group.

Signed-off-by: Chen Hui
Signed-off-by: Ren Zhijie
Signed-off-by: Hui Tang
---
 include/uapi/linux/bpf.h | 21 +++++++++++
 kernel/sched/bpf_sched.c | 69 ++++++++++++++++++++++++++++++++--
 scripts/bpf_helpers_doc.py | 2 +
 tools/include/uapi/linux/bpf.h | 21 +++++++++++
 4 files changed, 109 insertions(+), 4 deletions(-)

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 94f563c925ad..f17d2c88981b 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -3802,6 +3802,24 @@ union bpf_attr {
 *		Get system cpu masks and counts and return them in *cpus*.
 *	Return
 *		0 on success, or a negative error in case of failure.
+ *
+ * long bpf_sched_entity_is_task(struct sched_entity *se)
+ *	Description
+ *		Checks whether the sched entity is a task.
+ *	Return
+ *		1 if true, 0 otherwise.
+ *
+ * struct task_struct *bpf_sched_entity_to_task(struct sched_entity *se)
+ *	Description
+ *		Return task struct of *se* if se is a task.
+ *	Return
+ *		Task struct if se is a task, NULL otherwise.
+ *
+ * struct task_group *bpf_sched_entity_to_tg(struct sched_entity *se)
+ *	Description
+ *		Return task group of *se* if se is a task group.
+ *	Return
+ *		Task group struct if se is a task group, NULL otherwise.
 */
#define __BPF_FUNC_MAPPER(FN) \
	FN(unspec), \
@@ -3969,6 +3987,9 @@ union bpf_attr {
	FN(sched_cpu_stats_of), \
	FN(init_cpu_topology), \
	FN(get_cpumask_info), \
+	FN(sched_entity_is_task), \
+	FN(sched_entity_to_task), \
+	FN(sched_entity_to_tg), \
	/* */

/* integer value in 'imm' field of BPF_CALL instruction selects which helper
diff --git a/kernel/sched/bpf_sched.c b/kernel/sched/bpf_sched.c
index cf53bead8313..602f34c29eba 100644
--- a/kernel/sched/bpf_sched.c
+++ b/kernel/sched/bpf_sched.c
@@ -116,6 +116,65 @@ static const struct bpf_func_proto bpf_sched_cpu_stats_of_proto = {
 	.arg3_type = ARG_CONST_SIZE,
 };

+BTF_ID_LIST_SINGLE(btf_sched_entity_ids, struct, sched_entity)
+BTF_ID_LIST_SINGLE(btf_sched_task_ids, struct, task_struct)
+BTF_ID_LIST_SINGLE(btf_sched_tg_ids, struct, task_group)
+
+BPF_CALL_1(bpf_sched_entity_is_task, struct sched_entity *, se)
+{
+	return entity_is_task(se) ? 1 : 0;
+}
+
+static const struct bpf_func_proto bpf_sched_entity_is_task_proto = {
+	.func = bpf_sched_entity_is_task,
+	.gpl_only = false,
+	.ret_type = RET_INTEGER,
+	.arg1_type = ARG_PTR_TO_BTF_ID,
+	.arg1_btf_id = &btf_sched_entity_ids[0],
+};
+
+BPF_CALL_1(bpf_sched_entity_to_task, struct sched_entity *, se)
+{
+	if (entity_is_task(se)) {
+		struct task_struct *tsk = task_of(se);
+
+		return (unsigned long)tsk;
+	}
+
+	return (unsigned long)NULL;
+}
+
+static const struct bpf_func_proto bpf_sched_entity_to_task_proto = {
+	.func = bpf_sched_entity_to_task,
+	.gpl_only = false,
+	.ret_type = RET_PTR_TO_BTF_ID_OR_NULL,
+	.ret_btf_id = &btf_sched_task_ids[0],
+	.arg1_type = ARG_PTR_TO_BTF_ID,
+	.arg1_btf_id = &btf_sched_entity_ids[0],
+};
+
+BPF_CALL_1(bpf_sched_entity_to_tg, struct sched_entity *, se)
+{
+#ifdef CONFIG_FAIR_GROUP_SCHED
+	if (!entity_is_task(se)) {
+		struct task_group *tg = group_cfs_rq(se)->tg;
+
+		return (unsigned long)tg;
+	}
+#endif
+
+	return (unsigned long)NULL;
+}
+
+static const struct bpf_func_proto bpf_sched_entity_to_tg_proto = {
+	.func = bpf_sched_entity_to_tg,
+	.gpl_only = false,
+	.ret_type = RET_PTR_TO_BTF_ID_OR_NULL,
+	.ret_btf_id = &btf_sched_tg_ids[0],
+	.arg1_type = ARG_PTR_TO_BTF_ID,
+	.arg1_btf_id = &btf_sched_entity_ids[0],
+};
+
 static const struct bpf_func_proto *
 bpf_sched_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 {
@@ -128,6 +187,12 @@ bpf_sched_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
		return &bpf_init_cpu_topology_proto;
	case BPF_FUNC_get_cpumask_info:
		return &bpf_get_cpumask_info_proto;
+	case BPF_FUNC_sched_entity_is_task:
+		return &bpf_sched_entity_is_task_proto;
+	case BPF_FUNC_sched_entity_to_task:
+		return &bpf_sched_entity_to_task_proto;
+	case BPF_FUNC_sched_entity_to_tg:
+		return &bpf_sched_entity_to_tg_proto;
	default:
		return bpf_base_func_proto(func_id);
	}
@@ -154,8 +219,6 @@ BPF_CALL_1(bpf_sched_tg_tag_of, struct task_group *, tg)
 	return ret;
 }

-BTF_ID_LIST_SINGLE(btf_sched_tg_ids, struct, task_group)
-
 const struct bpf_func_proto bpf_sched_tg_tag_of_proto = {
 	.func = bpf_sched_tg_tag_of,
 	.gpl_only = false,
@@ -171,8 +234,6 @@ BPF_CALL_1(bpf_sched_task_tag_of, struct task_struct *, tsk)
 	return tsk->tag;
 }

-BTF_ID_LIST_SINGLE(btf_sched_task_ids, struct, task_struct)
-
 const struct bpf_func_proto bpf_sched_task_tag_of_proto = {
 	.func = bpf_sched_task_tag_of,
 	.gpl_only = false,
diff --git a/scripts/bpf_helpers_doc.py b/scripts/bpf_helpers_doc.py
index 344ec8093e8b..1f2b1133f47a 100755
--- a/scripts/bpf_helpers_doc.py
+++ b/scripts/bpf_helpers_doc.py
@@ -439,6 +439,7 @@ class PrinterHelpers(Printer):
 'struct bpf_sched_cpu_stats',
 'struct bpf_cpu_topology',
 'struct bpf_cpumask_info',
+	'struct sched_entity',
 ]
 known_types = {
 '...',
@@ -486,6 +487,7 @@ class PrinterHelpers(Printer):
 'struct bpf_sched_cpu_stats',
 'struct bpf_cpu_topology',
 'struct bpf_cpumask_info',
+	'struct sched_entity',
 }
 mapped_types = {
 'u8': '__u8',
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index d111513b9241..145b21c2b56a 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -3802,6 +3802,24 @@ union bpf_attr {
 *		Get system cpu masks and counts and return them in *cpus*.
 *	Return
 *		0 on success, or a negative error in case of failure.
+ *
+ * long bpf_sched_entity_is_task(struct sched_entity *se)
+ *	Description
+ *		Checks whether the sched entity is a task.
+ *	Return
+ *		1 if true, 0 otherwise.
+ *
+ * struct task_struct *bpf_sched_entity_to_task(struct sched_entity *se)
+ *	Description
+ *		Return task struct of *se* if se is a task.
+ *	Return
+ *		Task struct if se is a task, NULL otherwise.
+ *
+ * struct task_group *bpf_sched_entity_to_tg(struct sched_entity *se)
+ *	Description
+ *		Return task group of *se* if se is a task group.
+ *	Return
+ *		Task group struct if se is a task group, NULL otherwise.
 */
#define __BPF_FUNC_MAPPER(FN) \
	FN(unspec), \
@@ -3969,6 +3987,9 @@ union bpf_attr {
	FN(sched_cpu_stats_of), \
	FN(init_cpu_topology), \
	FN(get_cpumask_info), \
+	FN(sched_entity_is_task), \
+	FN(sched_entity_to_task), \
+	FN(sched_entity_to_tg), \
	/* */

/* integer value in 'imm' field of BPF_CALL instruction selects which helper

-- Gitee

From 2bc344acbb2b930cac3e6b9c555b586b6d140d58 Mon Sep 17 00:00:00 2001
From: Chen Hui
Date: Fri, 25 Nov 2022 11:56:23 +0800
Subject: [PATCH 15/23] bpf: programmable: Add cpumask ops collection

hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I5KUFB
CVE: NA

--------------------------------

Add a collection of cpumask ops, such as cpumask_empty, cpumask_and,
cpumask_andnot, cpumask_subset, cpumask_equal and cpumask_copy.

Signed-off-by: Chen Hui
Signed-off-by: Hui Tang
---
 include/linux/sched.h | 23 +++++++++
 include/uapi/linux/bpf.h | 43 +++++++++++++++++
 kernel/sched/bpf_sched.c | 87 ++++++++++++++++++++++++++++++++++
 scripts/bpf_helpers_doc.py | 4 ++
 tools/include/uapi/linux/bpf.h | 43 +++++++++++++++++
 5 files changed, 200 insertions(+)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 1bccbf4040b3..a61f0a00f43a 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2241,5 +2241,28 @@ struct bpf_sched_cpu_stats {
 	unsigned long capacity_orig;
 };

+struct cpumask_op_args {
+	unsigned int op_type;
+	void *arg1;
+	void *arg2;
+	void *arg3;
+	void *arg4;
+};
+
+enum cpumask_op_type {
+	CPUMASK_EMPTY,
+	CPUMASK_AND,
+	CPUMASK_ANDNOT,
+	CPUMASK_SUBSET,
+	CPUMASK_EQUAL,
+	CPUMASK_TEST_CPU,
+	CPUMASK_COPY,
+	CPUMASK_WEIGHT,
+	CPUMASK_NEXT,
+	CPUMASK_NEXT_WRAP,
+	CPUMASK_NEXT_AND,
+	CPUMASK_CPULIST_PARSE
+};
+
 #endif
 #endif
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index f17d2c88981b..f5569885f5d3 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -3820,6 +3820,48 @@ union bpf_attr {
 *		Return task group of *se* if se is a task group.
 *	Return
 *		Task group struct if se is a task group, NULL otherwise.
+ *
+ * int bpf_cpumask_op(struct cpumask_op_args *op, int len)
+ *	Description
+ *		A series of cpumask-related operations. Perform different
+ *		operations based on *op*->type. The user also needs to fill
+ *		in the other *op* fields based on *op*->type.
*op*->type is one of the following:
+ *
+ *		**CPUMASK_EMPTY**
+ *			*(op->arg1) == 0 returned.
+ *		**CPUMASK_AND**
+ *			*(op->arg1) = *(op->arg2) & *(op->arg3)
+ *		**CPUMASK_ANDNOT**
+ *			*(op->arg1) = *(op->arg2) & ~*(op->arg3)
+ *		**CPUMASK_SUBSET**
+ *			*(op->arg1) & ~*(op->arg2) == 0 returned
+ *		**CPUMASK_EQUAL**
+ *			*(op->arg1) == *(op->arg2) returned
+ *		**CPUMASK_TEST_CPU**
+ *			test for a cpu *(int *)(op->arg1) in *(op->arg2)
+ *			returns 1 if *op*->arg1 is set in *op*->arg2, else returns 0
+ *		**CPUMASK_COPY**
+ *			*(op->arg1) = *(op->arg2), return 0 always
+ *		**CPUMASK_WEIGHT**
+ *			count of bits in *(op->arg1)
+ *		**CPUMASK_NEXT**
+ *			get the next cpu in *(struct cpumask *)(op->arg2)
+ *			*(int *)(op->arg1): the cpu prior to the place to search
+ *		**CPUMASK_NEXT_WRAP**
+ *			helper to implement for_each_cpu_wrap
+ *			@op->arg1: the cpu prior to the place to search
+ *			@op->arg2: the cpumask pointer
+ *			@op->arg3: the start point of the iteration
+ *			@op->arg4: assume @op->arg1 crossing @op->arg3 terminates the iteration
+ *			returns >= nr_cpu_ids on completion
+ *		**CPUMASK_NEXT_AND**
+ *			get the next cpu in *(op->arg2) & *(op->arg3)
+ *			*(int *)(op->arg1): the cpu prior to the place to search
+ *		**CPUMASK_CPULIST_PARSE**
+ *			extract a cpumask from a user string of ranges.
+ *			(char *)op->arg1 -> (struct cpumask *)(op->arg2)
+ *			0 on success, or a negative error in case of failure.
+ *	Return
+ *		See above.
 */
#define __BPF_FUNC_MAPPER(FN) \
	FN(unspec), \
@@ -3990,6 +4032,7 @@ union bpf_attr {
	FN(sched_entity_is_task), \
	FN(sched_entity_to_task), \
	FN(sched_entity_to_tg), \
+	FN(cpumask_op), \
	/* */

/* integer value in 'imm' field of BPF_CALL instruction selects which helper
diff --git a/kernel/sched/bpf_sched.c b/kernel/sched/bpf_sched.c
index 602f34c29eba..e33eea10373d 100644
--- a/kernel/sched/bpf_sched.c
+++ b/kernel/sched/bpf_sched.c
@@ -175,6 +175,91 @@ static const struct bpf_func_proto bpf_sched_entity_to_tg_proto = {
 	.arg1_btf_id = &btf_sched_entity_ids[0],
 };

+BPF_CALL_2(bpf_cpumask_op, struct cpumask_op_args *, op, int, len)
+{
+	int ret;
+
+	if (len != sizeof(*op) || !op->arg1)
+		return -EINVAL;
+
+	switch (op->op_type) {
+	case CPUMASK_EMPTY:
+		return cpumask_empty((const struct cpumask *)op->arg1);
+	case CPUMASK_AND:
+		if (!op->arg2 || !op->arg3)
+			return -EINVAL;
+		return cpumask_and((struct cpumask *)op->arg1,
+				   (const struct cpumask *)op->arg2,
+				   (const struct cpumask *)op->arg3);
+	case CPUMASK_ANDNOT:
+		if (!op->arg2 || !op->arg3)
+			return -EINVAL;
+		cpumask_andnot((struct cpumask *)op->arg1,
+			       (const struct cpumask *)op->arg2,
+			       (const struct cpumask *)op->arg3);
+		break;
+	case CPUMASK_SUBSET:
+		if (!op->arg2)
+			return -EINVAL;
+		return cpumask_subset((const struct cpumask *)op->arg1,
+				      (const struct cpumask *)op->arg2);
+	case CPUMASK_EQUAL:
+		if (!op->arg2)
+			return -EINVAL;
+		return cpumask_equal((const struct cpumask *)op->arg1,
+				     (const struct cpumask *)op->arg2);
+	case CPUMASK_TEST_CPU:
+		if (!op->arg2)
+			return -EINVAL;
+		return cpumask_test_cpu(*(int *)op->arg1, op->arg2);
+	case CPUMASK_COPY:
+		if (!op->arg2)
+			return -EINVAL;
+		cpumask_copy((struct cpumask *)op->arg1,
+			     (const struct cpumask *)op->arg2);
+		break;
+	case CPUMASK_WEIGHT:
+		return cpumask_weight((const struct cpumask *)op->arg1);
+	case CPUMASK_NEXT:
+		if (!op->arg2)
+			return -EINVAL;
+		return cpumask_next(*(int *)op->arg1,
+				    (const struct cpumask *)op->arg2);
+	case CPUMASK_NEXT_WRAP:
+		if (!op->arg2 || !op->arg3 || !op->arg4)
+			return -EINVAL;
+		return cpumask_next_wrap(*(int *)op->arg1,
+					 (const struct cpumask *)op->arg2,
+					 *(int *)op->arg3, *(int *)op->arg4);
+	case CPUMASK_NEXT_AND:
+		if (!op->arg2 || !op->arg3)
+			return -EINVAL;
+		return cpumask_next_and(*(int *)op->arg1,
+					(const struct cpumask *)op->arg2,
+					(const struct cpumask *)op->arg3);
+	case CPUMASK_CPULIST_PARSE:
+		if (!op->arg2)
+			return -EINVAL;
+
+		op->arg1 = (void *)strstrip((void *)op->arg1);
+		ret = cpulist_parse((void *)op->arg1,
+				    (struct cpumask *)op->arg2);
+		return ret;
+	default:
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static const struct bpf_func_proto bpf_cpumask_op_proto = {
+	.func = bpf_cpumask_op,
+	.gpl_only = false,
+	.ret_type = RET_INTEGER,
+	.arg1_type = ARG_PTR_TO_MEM,
+	.arg2_type = ARG_CONST_SIZE,
+};
+
 static const struct bpf_func_proto *
 bpf_sched_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 {
@@ -193,6 +278,8 @@ bpf_sched_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
		return &bpf_sched_entity_to_task_proto;
	case BPF_FUNC_sched_entity_to_tg:
		return &bpf_sched_entity_to_tg_proto;
+	case BPF_FUNC_cpumask_op:
+		return &bpf_cpumask_op_proto;
	default:
		return bpf_base_func_proto(func_id);
	}
diff --git a/scripts/bpf_helpers_doc.py b/scripts/bpf_helpers_doc.py
index 1f2b1133f47a..e6bf1c47e25f 100755
--- a/scripts/bpf_helpers_doc.py
+++ b/scripts/bpf_helpers_doc.py
@@ -440,6 +440,8 @@ class PrinterHelpers(Printer):
 'struct bpf_cpu_topology',
 'struct bpf_cpumask_info',
 'struct sched_entity',
+	'struct cpumask',
+	'struct cpumask_op_args',
 ]
 known_types = {
 '...',
@@ -488,6 +490,8 @@ class PrinterHelpers(Printer):
 'struct bpf_cpu_topology',
 'struct bpf_cpumask_info',
 'struct sched_entity',
+	'struct cpumask',
+	'struct cpumask_op_args',
 }
 mapped_types = {
 'u8': '__u8',
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 145b21c2b56a..6a25880e9e6b 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -3820,6 +3820,48 @@ union bpf_attr {
 *		Return task group of *se* if se is a task group.
 *	Return
 *		Task group struct if se is a task group, NULL otherwise.
+ *
+ * int bpf_cpumask_op(struct cpumask_op_args *op, int len)
+ *	Description
+ *		A series of cpumask-related operations. Perform different
+ *		operations based on *op*->type. The user also needs to fill
+ *		in the other *op* fields based on *op*->type. *op*->type is
+ *		one of the following:
+ *
+ *		**CPUMASK_EMPTY**
+ *			*(op->arg1) == 0 returned.
+ *		**CPUMASK_AND**
+ *			*(op->arg1) = *(op->arg2) & *(op->arg3)
+ *		**CPUMASK_ANDNOT**
+ *			*(op->arg1) = *(op->arg2) & ~*(op->arg3)
+ *		**CPUMASK_SUBSET**
+ *			*(op->arg1) & ~*(op->arg2) == 0 returned
+ *		**CPUMASK_EQUAL**
+ *			*(op->arg1) == *(op->arg2) returned
+ *		**CPUMASK_TEST_CPU**
+ *			test for a cpu *(int *)(op->arg1) in *(op->arg2)
+ *			returns 1 if *op*->arg1 is set in *op*->arg2, else returns 0
+ *		**CPUMASK_COPY**
+ *			*(op->arg1) = *(op->arg2), return 0 always
+ *		**CPUMASK_WEIGHT**
+ *			count of bits in *(op->arg1)
+ *		**CPUMASK_NEXT**
+ *			get the next cpu in *(struct cpumask *)(op->arg2)
+ *			*(int *)(op->arg1): the cpu prior to the place to search
+ *		**CPUMASK_NEXT_WRAP**
+ *			helper to implement for_each_cpu_wrap
+ *			@op->arg1: the cpu prior to the place to search
+ *			@op->arg2: the cpumask pointer
+ *			@op->arg3: the start point of the iteration
+ *			@op->arg4: assume @op->arg1 crossing @op->arg3 terminates the iteration
+ *			returns >= nr_cpu_ids on completion
+ *		**CPUMASK_NEXT_AND**
+ *			get the next cpu in *(op->arg2) & *(op->arg3)
+ *			*(int *)(op->arg1): the cpu prior to the place to search
+ *		**CPUMASK_CPULIST_PARSE**
+ *			extract a cpumask from a user string of ranges.
+ *			(char *)op->arg1 -> (struct cpumask *)(op->arg2)
+ *			0 on success, or a negative error in case of failure.
+ *	Return
+ *		See above.
 */
#define __BPF_FUNC_MAPPER(FN) \
	FN(unspec), \
@@ -3990,6 +4032,7 @@ union bpf_attr {
	FN(sched_entity_is_task), \
	FN(sched_entity_to_task), \
	FN(sched_entity_to_tg), \
+	FN(cpumask_op), \
	/* */

/* integer value in 'imm' field of BPF_CALL instruction selects which helper

-- Gitee

From 37aa04a66c340e6d46e8a8c017866af422d067c9 Mon Sep 17 00:00:00 2001
From: Hui Tang
Date: Fri, 25 Nov 2022 11:56:24 +0800
Subject: [PATCH 16/23] bpf: programmable: Add helper func to check whether
 cpus share cache

hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I5KUFB
CVE: NA

--------------------------------

Add a helper function to check whether two cpus share the same LLC
cache.

Signed-off-by: Hui Tang
---
 include/uapi/linux/bpf.h | 7 +++++++
 kernel/sched/bpf_sched.c | 19 +++++++++++++++
 tools/include/uapi/linux/bpf.h | 7 +++++++
 3 files changed, 33 insertions(+)

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index f5569885f5d3..561d0f2fb58d 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -3862,6 +3862,12 @@ union bpf_attr {
 *		0 on success, or a negative error in case of failure.
 *	Return
 *		See above.
+ *
+ * int bpf_cpus_share_cache(int src_cpu, int dst_cpu)
+ *	Description
+ *		Check whether *src_cpu* shares cache with *dst_cpu*.
+ *	Return
+ *		1 if they share cache, 0 otherwise.
 */
#define __BPF_FUNC_MAPPER(FN) \
	FN(unspec), \
@@ -4033,6 +4039,7 @@ union bpf_attr {
	FN(sched_entity_to_task), \
	FN(sched_entity_to_tg), \
	FN(cpumask_op), \
+	FN(cpus_share_cache), \
	/* */

/* integer value in 'imm' field of BPF_CALL instruction selects which helper
diff --git a/kernel/sched/bpf_sched.c b/kernel/sched/bpf_sched.c
index e33eea10373d..220ba83fc5f4 100644
--- a/kernel/sched/bpf_sched.c
+++ b/kernel/sched/bpf_sched.c
@@ -260,6 +260,23 @@ static const struct bpf_func_proto bpf_cpumask_op_proto = {
 	.arg2_type = ARG_CONST_SIZE,
 };

+BPF_CALL_2(bpf_cpus_share_cache, int, src_cpu, int, dst_cpu)
+{
+	if ((unsigned int)src_cpu >= nr_cpu_ids ||
+	    (unsigned int)dst_cpu >= nr_cpu_ids)
+		return 0;
+
+	return cpus_share_cache(src_cpu, dst_cpu);
+}
+
+static const struct bpf_func_proto bpf_cpus_share_cache_proto = {
+	.func = bpf_cpus_share_cache,
+	.gpl_only = false,
+	.ret_type = RET_INTEGER,
+	.arg1_type = ARG_ANYTHING,
+	.arg2_type = ARG_ANYTHING,
+};
+
 static const struct bpf_func_proto *
 bpf_sched_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 {
@@ -280,6 +297,8 @@ bpf_sched_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
		return &bpf_sched_entity_to_tg_proto;
	case BPF_FUNC_cpumask_op:
		return &bpf_cpumask_op_proto;
+	case BPF_FUNC_cpus_share_cache:
+		return &bpf_cpus_share_cache_proto;
	default:
		return bpf_base_func_proto(func_id);
	}
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 6a25880e9e6b..dc6263c41238 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -3862,6 +3862,12 @@ union bpf_attr {
 *		0 on success, or a negative error in case of failure.
 *	Return
 *		See above.
+ *
+ * int bpf_cpus_share_cache(int src_cpu, int dst_cpu)
+ *	Description
+ *		Check whether *src_cpu* shares cache with *dst_cpu*.
+ *	Return
+ *		1 if they share cache, 0 otherwise.
 */
#define __BPF_FUNC_MAPPER(FN) \
	FN(unspec), \
@@ -4033,6 +4039,7 @@ union bpf_attr {
	FN(sched_entity_to_task), \
	FN(sched_entity_to_tg), \
	FN(cpumask_op), \
+	FN(cpus_share_cache), \
	/* */

/* integer value in 'imm' field of BPF_CALL instruction selects which helper

-- Gitee

From ee61845bf07126bb6b90b42b07c9a21403c97faa Mon Sep 17 00:00:00 2001
From: Roman Gushchin
Date: Fri, 25 Nov 2022 12:04:00 +0800
Subject: [PATCH 17/23] sched: cfs: add bpf hooks to control wakeup and tick
 preemption

maillist inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I5F6X6
CVE: NA

Reference: https://lore.kernel.org/all/20210916162451.709260-1-guro@fb.com/

-------------------

This patch adds 3 hooks to control wakeup and tick preemption:
  cfs_check_preempt_tick
  cfs_check_preempt_wakeup
  cfs_wakeup_preempt_entity

The first one allows forcing or suppressing a preemption from the tick
context. An obvious usage example is to minimize the number of
non-voluntary context switches and decrease the associated latency
penalty by (conditionally) providing tasks or task groups an extended
execution slice. It can be used instead of tweaking
sysctl_sched_min_granularity.

The second one is called from the wakeup preemption code and allows
redefining whether a newly woken task should preempt the execution of
the current task. This is useful to minimize the number of preemptions
of latency-sensitive tasks. To some extent it's a more flexible analog
of sysctl_sched_wakeup_granularity.

The third one is similar, but it tweaks the wakeup_preempt_entity()
function, which is called not only from a wakeup context but also from
pick_next_task(), which makes it possible to influence the decision on
which task will run next.

Whether we need both of these hooks or only one of them is open for
discussion: the second is more powerful, but depends more on the
current implementation. In any case, bpf hooks are not an ABI, so it's
not a deal breaker.

The idea of the wakeup_preempt_entity hook belongs to Rik van Riel. He
also contributed a lot to the whole patchset by providing ideas,
recommendations and feedback for earlier (non-public) versions.
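
For illustration, a minimal scheduler bpf program driving the tick hook
could look like the sketch below. This is a hypothetical example, not
part of the patch: it assumes the loader maps a SEC("sched/<hook name>")
section to BPF_PROG_TYPE_SCHED and that bpf_helper_defs.h has been
regenerated by the modified bpf_helpers_doc.py.

  #include "vmlinux.h"
  #include <bpf/bpf_helpers.h>
  #include <bpf/bpf_tracing.h>

  /* Let the current entity run at least 10 ms before the tick may preempt it. */
  SEC("sched/cfs_check_preempt_tick")
  int BPF_PROG(min_slice, struct sched_entity *curr, unsigned long delta_exec)
  {
  	if (delta_exec < 10000000UL)
  		return -1;	/* ret < 0: suppress the preemption */
  	return 0;		/* ret == 0: fall back to the default check */
  }

  char _license[] SEC("license") = "GPL";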
Signed-off-by: Roman Gushchin
Signed-off-by: Chen Hui
Signed-off-by: Ren Zhijie
Signed-off-by: Hui Tang
---
 include/linux/sched_hook_defs.h | 5 ++++-
 kernel/sched/fair.c | 36 +++++++++++++++++++++++++++++++++
 2 files changed, 40 insertions(+), 1 deletion(-)

diff --git a/include/linux/sched_hook_defs.h b/include/linux/sched_hook_defs.h
index 14344004e335..e2f65e4b8895 100644
--- a/include/linux/sched_hook_defs.h
+++ b/include/linux/sched_hook_defs.h
@@ -1,2 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0 */
-BPF_SCHED_HOOK(int, 0, dummy, void)
+BPF_SCHED_HOOK(int, 0, cfs_check_preempt_tick, struct sched_entity *curr, unsigned long delta_exec)
+BPF_SCHED_HOOK(int, 0, cfs_check_preempt_wakeup, struct task_struct *curr, struct task_struct *p)
+BPF_SCHED_HOOK(int, 0, cfs_wakeup_preempt_entity, struct sched_entity *curr,
+	       struct sched_entity *se)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index fe2f527c71ed..9eb9a77781d3 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -28,6 +28,7 @@
 #include
 #include
 #endif
+#include

 /*
  * Targeted preemption latency for CPU-bound tasks:
@@ -4434,6 +4435,21 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
 	ideal_runtime = sched_slice(cfs_rq, curr);
 	delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
+
+#ifdef CONFIG_BPF_SCHED
+	if (bpf_sched_enabled()) {
+		int ret = bpf_sched_cfs_check_preempt_tick(curr, delta_exec);
+
+		if (ret < 0)
+			return;
+		else if (ret > 0) {
+			resched_curr(rq_of(cfs_rq));
+			clear_buddies(cfs_rq, curr);
+			return;
+		}
+	}
+#endif
+
 	if (delta_exec > ideal_runtime) {
 		resched_curr(rq_of(cfs_rq));
 		/*
@@ -7015,6 +7031,15 @@ wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se)
 {
 	s64 gran, vdiff = curr->vruntime - se->vruntime;

+#ifdef CONFIG_BPF_SCHED
+	if (bpf_sched_enabled()) {
+		int ret = bpf_sched_cfs_wakeup_preempt_entity(curr, se);
+
+		if (ret)
+			return ret;
+	}
+#endif
+
 	if (vdiff <= 0)
 		return -1;

@@ -7101,6 +7126,17 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
 	    likely(!task_has_idle_policy(p)))
 		goto preempt;

+#ifdef CONFIG_BPF_SCHED
+	if (bpf_sched_enabled()) {
+		int ret = bpf_sched_cfs_check_preempt_wakeup(current, p);
+
+		if (ret < 0)
+			return;
+		else if (ret > 0)
+			goto preempt;
+	}
+#endif
+
 	/*
 	 * Batch and idle tasks do not preempt non-idle tasks (their preemption
 	 * is driven by the tick):

-- Gitee

From 324e7c64f74b2073921d162c7cca2bf0f82c4194 Mon Sep 17 00:00:00 2001
From: Guan Jing
Date: Fri, 25 Nov 2022 11:56:26 +0800
Subject: [PATCH 18/23] sched: programmable: Add hook for pick next task
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I5KUFB
CVE: NA

--------------------------------

This hook point can change the position of a sched entity on the
red-black tree, e.g. in cloud scenarios there are online tasks that
need to respond in time and offline tasks that do not. The hook gives
users a way to customize which class of tasks runs first.

The pick-next decision is derived from system state such as the
red-black tree, and modifying that state directly would affect the
whole system. Therefore the hook only changes the position of the task
on the red-black tree; the value of vruntime is not changed.
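
As a hypothetical sketch (under the same SEC() and helper-declaration
assumptions as the earlier example), a policy could order tagged
entities ahead of untagged ones using the tag helpers added earlier in
this series:

  #include "vmlinux.h"
  #include <bpf/bpf_helpers.h>
  #include <bpf/bpf_tracing.h>

  SEC("sched/cfs_tag_pick_next_entity")
  int BPF_PROG(prefer_tagged, struct sched_entity *curr, struct sched_entity *next)
  {
  	long curr_tag = 0, next_tag = 0;

  	if (bpf_sched_entity_is_task(curr)) {
  		struct task_struct *p = bpf_sched_entity_to_task(curr);

  		if (p)
  			curr_tag = bpf_sched_task_tag_of(p);
  	}
  	if (bpf_sched_entity_is_task(next)) {
  		struct task_struct *p = bpf_sched_entity_to_task(next);

  		if (p)
  			next_tag = bpf_sched_task_tag_of(p);
  	}

  	/* Returning 1 orders *curr* before *next*; 0 keeps vruntime order. */
  	return curr_tag > next_tag ? 1 : 0;
  }

  char _license[] SEC("license") = "GPL";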
Signed-off-by: Guan Jing
Signed-off-by: Hui Tang
---
 include/linux/sched_hook_defs.h | 2 ++
 kernel/sched/fair.c | 9 +++++++++
 2 files changed, 11 insertions(+)

diff --git a/include/linux/sched_hook_defs.h b/include/linux/sched_hook_defs.h
index e2f65e4b8895..f1e19d8ff735 100644
--- a/include/linux/sched_hook_defs.h
+++ b/include/linux/sched_hook_defs.h
@@ -3,3 +3,5 @@ BPF_SCHED_HOOK(int, 0, cfs_check_preempt_tick, struct sched_entity *curr, unsign
 BPF_SCHED_HOOK(int, 0, cfs_check_preempt_wakeup, struct task_struct *curr, struct task_struct *p)
 BPF_SCHED_HOOK(int, 0, cfs_wakeup_preempt_entity, struct sched_entity *curr,
	       struct sched_entity *se)
+BPF_SCHED_HOOK(int, 0, cfs_tag_pick_next_entity, struct sched_entity *curr,
+	       struct sched_entity *next)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 9eb9a77781d3..eaec5febfd45 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -509,6 +509,15 @@ static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime)
 static inline bool entity_before(struct sched_entity *a,
				 struct sched_entity *b)
 {
+#ifdef CONFIG_BPF_SCHED
+	if (bpf_sched_enabled()) {
+		int ret = bpf_sched_cfs_tag_pick_next_entity(a, b);
+
+		if (ret == 1)
+			return 1;
+	}
+#endif
+
 	return (s64)(a->vruntime - b->vruntime) < 0;
 }

-- Gitee

From 8085949faf0a5ec22db5426403c405a5ee967081 Mon Sep 17 00:00:00 2001
From: Ren Zhijie
Date: Fri, 25 Nov 2022 11:56:27 +0800
Subject: [PATCH 19/23] sched: programmable: add bpf hooks to update rq and
 task state in enqueue_task/dequeue_task of CFS

Add 2 hooks to update rq and task state in enqueue_task_fair() and
dequeue_task_fair():
  cfs_enqueue_task
  cfs_dequeue_task

When a task enters or leaves the runqueue, these hooks are used to get
and update context of interest to the bpf prog after the runqueue
completes the status change.
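
A hypothetical consumer of these hooks (same assumptions as the earlier
sketches) could, for example, count enqueues:

  #include "vmlinux.h"
  #include <bpf/bpf_helpers.h>
  #include <bpf/bpf_tracing.h>

  struct {
  	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
  	__type(key, __u32);
  	__type(value, __u64);
  	__uint(max_entries, 1);
  } nr_enqueued SEC(".maps");

  SEC("sched/cfs_enqueue_task")
  int BPF_PROG(count_enqueue, struct rq *rq, struct task_struct *p)
  {
  	__u32 key = 0;
  	__u64 *cnt = bpf_map_lookup_elem(&nr_enqueued, &key);

  	/* Per-cpu array, so a plain increment is race-free here. */
  	if (cnt)
  		*cnt += 1;
  	return 0;
  }

  char _license[] SEC("license") = "GPL";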
Signed-off-by: Ren Zhijie
Signed-off-by: Hui Tang
---
 include/linux/sched_hook_defs.h | 2 ++
 kernel/sched/fair.c | 10 ++++++++++
 2 files changed, 12 insertions(+)

diff --git a/include/linux/sched_hook_defs.h b/include/linux/sched_hook_defs.h
index f1e19d8ff735..2b598f2053d5 100644
--- a/include/linux/sched_hook_defs.h
+++ b/include/linux/sched_hook_defs.h
@@ -5,3 +5,5 @@ BPF_SCHED_HOOK(int, 0, cfs_wakeup_preempt_entity, struct sched_entity *curr,
	       struct sched_entity *se)
 BPF_SCHED_HOOK(int, 0, cfs_tag_pick_next_entity, struct sched_entity *curr,
	       struct sched_entity *next)
+BPF_SCHED_HOOK(void, (void) 0, cfs_enqueue_task, struct rq *rq, struct task_struct *p)
+BPF_SCHED_HOOK(void, (void) 0, cfs_dequeue_task, struct rq *rq, struct task_struct *p)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index eaec5febfd45..4206bb6117bb 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5703,6 +5703,11 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 	assert_list_leaf_cfs_rq(rq);

 	hrtick_update(rq);
+
+#ifdef CONFIG_BPF_SCHED
+	if (bpf_sched_enabled())
+		bpf_sched_cfs_enqueue_task(rq, p);
+#endif
 }

 static void set_next_buddy(struct sched_entity *se);
@@ -5777,6 +5782,11 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 dequeue_throttle:
 	util_est_update(&rq->cfs, p, task_sleep);
 	hrtick_update(rq);
+
+#ifdef CONFIG_BPF_SCHED
+	if (bpf_sched_enabled())
+		bpf_sched_cfs_dequeue_task(rq, p);
+#endif
 }

 #ifdef CONFIG_SMP

-- Gitee

From 0abedfba7a86c53937d53c5f154fdc761b705f80 Mon Sep 17 00:00:00 2001
From: Chen Hui
Date: Fri, 25 Nov 2022 11:56:28 +0800
Subject: [PATCH 20/23] sched: programmable: Add three hooks in
 select_task_rq_fair()

hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I5KUFB
CVE: NA

--------------------------------

Add three hooks of sched type in select_task_rq_fair(), as follows:

'cfs_select_rq'
    Replace the original core selection policy or implement dynamic
    CPU affinity.

'cfs_select_rq_exit'
    Restore the CPU affinity of the task before select_task_rq_fair()
    returns. To be used with the 'cfs_select_rq' hook to implement
    dynamic CPU affinity.

'cfs_wake_affine'
    Determine on which CPU the task can run soonest. Allows users to
    implement different policies.
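
A hypothetical sketch of the wake-affine hook (same assumptions as
above): pull synchronously woken tasks to the waker's cpu and otherwise
defer to the kernel's default wake_affine() heuristics.

  #include "vmlinux.h"
  #include <bpf/bpf_helpers.h>
  #include <bpf/bpf_tracing.h>
  #include <bpf/bpf_core_read.h>

  SEC("sched/cfs_wake_affine")
  int BPF_PROG(sync_to_waker, struct sched_affine_ctx *ctx)
  {
  	/* A return value in [0, nr_cpumask_bits) is used as the target cpu. */
  	if (BPF_CORE_READ(ctx, is_sync))
  		return BPF_CORE_READ(ctx, curr_cpu);
  	return -1;	/* negative: fall back to the default heuristics */
  }

  char _license[] SEC("license") = "GPL";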
Signed-off-by: Chen Hui Signed-off-by: Hui Tang --- include/linux/sched.h | 33 ++++++++++++++++++ include/linux/sched_hook_defs.h | 3 ++ kernel/sched/fair.c | 61 +++++++++++++++++++++++++++++++++ scripts/bpf_helpers_doc.py | 4 +++ 4 files changed, 101 insertions(+) diff --git a/include/linux/sched.h b/include/linux/sched.h index a61f0a00f43a..60749a11253a 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2239,6 +2239,11 @@ struct bpf_sched_cpu_stats { /* capacity */ unsigned long capacity; unsigned long capacity_orig; + + KABI_RESERVE(1) + KABI_RESERVE(2) + KABI_RESERVE(3) + KABI_RESERVE(4) }; struct cpumask_op_args { @@ -2264,5 +2269,33 @@ enum cpumask_op_type { CPUMASK_CPULIST_PARSE }; +struct sched_migrate_ctx { + struct task_struct *task; + struct cpumask *select_idle_mask; + int prev_cpu; + int curr_cpu; + int is_sync; + int want_affine; + int wake_flags; + int sd_flag; + int new_cpu; + + KABI_RESERVE(1) + KABI_RESERVE(2) + KABI_RESERVE(3) + KABI_RESERVE(4) +}; + +struct sched_affine_ctx { + struct task_struct *task; + int prev_cpu; + int curr_cpu; + int is_sync; + + KABI_RESERVE(1) + KABI_RESERVE(2) + KABI_RESERVE(3) + KABI_RESERVE(4) +}; #endif #endif diff --git a/include/linux/sched_hook_defs.h b/include/linux/sched_hook_defs.h index 2b598f2053d5..818b1244a018 100644 --- a/include/linux/sched_hook_defs.h +++ b/include/linux/sched_hook_defs.h @@ -7,3 +7,6 @@ BPF_SCHED_HOOK(int, 0, cfs_tag_pick_next_entity, struct sched_entity *curr, struct sched_entity *next) BPF_SCHED_HOOK(void, (void) 0, cfs_enqueue_task, struct rq *rq, struct task_struct *p) BPF_SCHED_HOOK(void, (void) 0, cfs_dequeue_task, struct rq *rq, struct task_struct *p) +BPF_SCHED_HOOK(int, -1, cfs_select_rq, struct sched_migrate_ctx *ctx) +BPF_SCHED_HOOK(int, -1, cfs_wake_affine, struct sched_affine_ctx *ctx) +BPF_SCHED_HOOK(int, -1, cfs_select_rq_exit, struct sched_migrate_ctx *ctx) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 4206bb6117bb..43881fe8f959 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6002,6 +6002,22 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, { int target = nr_cpumask_bits; +#ifdef CONFIG_BPF_SCHED + if (bpf_sched_enabled()) { + struct sched_affine_ctx ctx; + int ret; + + ctx.task = p; + ctx.prev_cpu = prev_cpu; + ctx.curr_cpu = this_cpu; + ctx.is_sync = sync; + + ret = bpf_sched_cfs_wake_affine(&ctx); + if (ret >= 0 && ret < nr_cpumask_bits) + return ret; + } +#endif + if (sched_feat(WA_IDLE)) target = wake_affine_idle(this_cpu, prev_cpu, sync); @@ -6874,6 +6890,12 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f int new_cpu = prev_cpu; int want_affine = 0; int sync = (wake_flags & WF_SYNC) && !(current->flags & PF_EXITING); +#ifdef CONFIG_BPF_SCHED + struct sched_migrate_ctx ctx; + cpumask_t *cpus_prev = NULL; + cpumask_t *cpus; + int ret; +#endif time = schedstat_start_time(); @@ -6895,6 +6917,32 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f } rcu_read_lock(); +#ifdef CONFIG_BPF_SCHED + if (bpf_sched_enabled()) { + ctx.task = p; + ctx.prev_cpu = prev_cpu; + ctx.curr_cpu = cpu; + ctx.is_sync = sync; + ctx.wake_flags = wake_flags; + ctx.want_affine = want_affine; + ctx.sd_flag = sd_flag; + ctx.select_idle_mask = this_cpu_cpumask_var_ptr(select_idle_mask); + + ret = bpf_sched_cfs_select_rq(&ctx); + if (ret >= 0) { + rcu_read_unlock(); + return ret; + } else if (ret != -1) { + cpus = this_cpu_cpumask_var_ptr(select_idle_mask); + if (cpumask_subset(cpus, 
p->cpus_ptr) &&
+			    !cpumask_empty(cpus)) {
+				cpus_prev = (void *)p->cpus_ptr;
+				p->cpus_ptr = cpus;
+			}
+		}
+	}
+#endif
+
 	for_each_domain(cpu, tmp) {
 		/*
 		 * If both 'cpu' and 'prev_cpu' are part of this domain,
@@ -6926,6 +6974,19 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
 		if (want_affine)
 			current->recent_used_cpu = cpu;
 	}
+
+#ifdef CONFIG_BPF_SCHED
+	if (bpf_sched_enabled()) {
+		ctx.new_cpu = new_cpu;
+		ret = bpf_sched_cfs_select_rq_exit(&ctx);
+		if (ret >= 0)
+			new_cpu = ret;
+
+		if (cpus_prev)
+			p->cpus_ptr = cpus_prev;
+	}
+#endif
+
 	rcu_read_unlock();

 	schedstat_end_time(cpu_rq(cpu), time);
diff --git a/scripts/bpf_helpers_doc.py b/scripts/bpf_helpers_doc.py
index e6bf1c47e25f..fc51d6f0d447 100755
--- a/scripts/bpf_helpers_doc.py
+++ b/scripts/bpf_helpers_doc.py
@@ -442,6 +442,8 @@ class PrinterHelpers(Printer):
 'struct sched_entity',
 'struct cpumask',
 'struct cpumask_op_args',
+	'struct sched_migrate_ctx',
+	'struct sched_affine_ctx',
 ]
 known_types = {
 '...',
@@ -492,6 +494,8 @@ class PrinterHelpers(Printer):
 'struct sched_entity',
 'struct cpumask',
 'struct cpumask_op_args',
+	'struct sched_migrate_ctx',
+	'struct sched_affine_ctx',
 }
 mapped_types = {
 'u8': '__u8',

-- Gitee

From b741025dc64d2c16498e2823effe5b27d779c52f Mon Sep 17 00:00:00 2001
From: Chen Hui
Date: Fri, 25 Nov 2022 11:56:29 +0800
Subject: [PATCH 21/23] sched: programmable: Add lib for sched programmable

hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I5KUFB
CVE: NA

--------------------------------

Add a library for the programmable scheduler; these functions help
users write programs more easily. The main functions are as follows:
1. Wrap the helper functions to make them easier to use.
2. Implement some generic methods and policies for the scheduler.

Signed-off-by: Chen Hui
Signed-off-by: Hui Tang
---
 tools/lib/bpf/libbpf_sched.h | 510 +++++++++++++++++++++++++++++++++++
 1 file changed, 510 insertions(+)
 create mode 100644 tools/lib/bpf/libbpf_sched.h

diff --git a/tools/lib/bpf/libbpf_sched.h b/tools/lib/bpf/libbpf_sched.h
new file mode 100644
index 000000000000..b2008ff6bb25
--- /dev/null
+++ b/tools/lib/bpf/libbpf_sched.h
@@ -0,0 +1,510 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+/*
+ * Copyright (C) 2022. Huawei Technologies Co., Ltd. All rights reserved.
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 and
+ * only version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ */
+#ifndef __LIBBPF_LIBSCHED_H
+#define __LIBBPF_LIBSCHED_H
+
+#include
+#include
+#include
+#include
+#include
+
+/* setting a bigger value may make the verifier fail */
+#define BPF_SCHED_LOOP_MAX	1024
+#define INVALID_PTR		((void *)(0UL))
+#define getVal(P)						\
+	({							\
+		typeof(P) val = 0;				\
+		bpf_probe_read_kernel(&val, sizeof(val), &(P));	\
+		val;						\
+	})
+
+static __always_inline long libbpf_cpumask_next(int n, struct cpumask *mask);
+static __always_inline long libbpf_cpumask_next_wrap(int n,
+						     struct cpumask *mask,
+						     int start, int wrap);
+static __always_inline long libbpf_cpumask_next_and(int n,
+						    struct cpumask *mask1,
+						    struct cpumask *mask2);
+static __always_inline int libbpf_nr_cpus_ids(void);
+static __always_inline int libbpf_nr_cpumask_bits(void);
+
+#if NR_CPUS == 1
+
+#define libbpf_for_each_cpu(cpu, mask)			\
+	for ((cpu) = 0; (cpu) < 1; (cpu)++, (void)mask)
+#define libbpf_for_each_cpu_wrap(cpu, mask, start)	\
+	for ((cpu) = 0; (cpu) < 1; (cpu)++, (void)mask, (void)(start))
+#define libbpf_for_each_cpu_and(cpu, mask1, mask2)	\
+	for ((cpu) = 0; (cpu) < 1; (cpu)++, (void)mask1, (void)mask2)
+
+#else
+
+#define libbpf_for_each_cpu(cpu, mask)					\
+	for (int __i = 0, (cpu) = -1;					\
+	     (cpu) = libbpf_cpumask_next((cpu), (mask)),		\
+	     (cpu) < libbpf_nr_cpus_ids() && __i < NR_CPUS; __i++)
+
+#define libbpf_for_each_cpu_wrap(cpu, mask, start)			\
+	for (int __i = 0, (cpu) = libbpf_cpumask_next_wrap((start) - 1,\
+	     (mask), (start), false);					\
+	     (cpu) < libbpf_nr_cpumask_bits() && __i < NR_CPUS;		\
+	     (cpu) = libbpf_cpumask_next_wrap((cpu), (mask), (start),	\
+	     true), __i++)
+
+#define libbpf_for_each_cpu_and(cpu, mask1, mask2)			\
+	for (int __i = 0, (cpu) = -1;					\
+	     (cpu) = libbpf_cpumask_next_and((cpu), (mask1), (mask2)),	\
+	     (cpu) < libbpf_nr_cpus_ids() && __i < NR_CPUS; __i++)
+
+#endif
+
+struct {
+	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
+	__type(key, u32);
+	__type(value, struct bpf_cpumask_info);
+	__uint(max_entries, 1);
+} map_cpumask_info SEC(".maps");
+
+static __always_inline long libbpf_cpumask_copy(struct cpumask *dst,
+						struct cpumask *src)
+{
+	struct cpumask_op_args op;
+
+	op.op_type = CPUMASK_COPY;
+	op.arg1 = dst;
+	op.arg2 = src;
+	op.arg3 = INVALID_PTR;
+	op.arg4 = INVALID_PTR;
+	return bpf_cpumask_op(&op, sizeof(op));
+}
+
+static __always_inline long libbpf_cpumask_empty(struct cpumask *mask)
+{
+	struct cpumask_op_args op;
+
+	op.op_type = CPUMASK_EMPTY;
+	op.arg1 = mask;
+	op.arg2 = INVALID_PTR;
+	op.arg3 = INVALID_PTR;
+	op.arg4 = INVALID_PTR;
+	return bpf_cpumask_op(&op, sizeof(op));
+}
+
+static __always_inline long libbpf_cpumask_and(struct cpumask *dst,
+					       struct cpumask *src1,
+					       struct cpumask *src2)
+{
+	struct cpumask_op_args op;
+
+	op.op_type = CPUMASK_AND;
+	op.arg1 = dst;
+	op.arg2 = src1;
+	op.arg3 = src2;
+	op.arg4 = INVALID_PTR;
+	return bpf_cpumask_op(&op, sizeof(op));
+}
+
+static __always_inline long libbpf_cpumask_andnot(struct cpumask *dst,
+						  struct cpumask *src1,
+						  struct cpumask *src2)
+{
+	struct cpumask_op_args op;
+
+	op.op_type = CPUMASK_ANDNOT;
+	op.arg1 = dst;
+	op.arg2 = src1;
+	op.arg3 = src2;
+	op.arg4 = INVALID_PTR;
+	return bpf_cpumask_op(&op, sizeof(op));
+}
+
+static __always_inline long libbpf_cpumask_subset(struct cpumask *src1,
+						  struct cpumask *src2)
+{
+	struct cpumask_op_args op;
+
+	op.op_type = CPUMASK_SUBSET;
+	op.arg1 = src1;
+	op.arg2 = src2;
+	op.arg3 = INVALID_PTR;
+	op.arg4 = INVALID_PTR;
+	return bpf_cpumask_op(&op, sizeof(op));
+}
+
+static __always_inline long libbpf_cpumask_equal(struct cpumask *src1,
+						 struct cpumask
*src2) +{ + struct cpumask_op_args op; + + op.op_type = CPUMASK_EQUAL; + op.arg1 = src1; + op.arg2 = src2; + op.arg3 = INVALID_PTR; + op.arg4 = INVALID_PTR; + return bpf_cpumask_op(&op, sizeof(op)); +} + +static __always_inline long libbpf_cpumask_weight(struct cpumask *src1) +{ + struct cpumask_op_args op; + + op.op_type = CPUMASK_WEIGHT; + op.arg1 = src1; + op.arg2 = INVALID_PTR; + op.arg3 = INVALID_PTR; + op.arg4 = INVALID_PTR; + return bpf_cpumask_op(&op, sizeof(op)); +} + +static __always_inline long libbpf_cpumask_test_cpu(int cpu, + struct cpumask *mask) +{ + struct cpumask_op_args op; + + op.op_type = CPUMASK_TEST_CPU; + op.arg1 = &cpu; + op.arg2 = mask; + op.arg3 = INVALID_PTR; + op.arg4 = INVALID_PTR; + return bpf_cpumask_op(&op, sizeof(op)); +} + +static __always_inline long libbpf_cpumask_next(int n, struct cpumask *mask) +{ + struct cpumask_op_args op; + + op.op_type = CPUMASK_NEXT; + op.arg1 = &n; + op.arg2 = mask; + op.arg3 = INVALID_PTR; + op.arg4 = INVALID_PTR; + return bpf_cpumask_op(&op, sizeof(op)); +} + +static __always_inline long libbpf_cpumask_next_wrap(int n, + struct cpumask *mask, + int start, int wrap) +{ + struct cpumask_op_args op; + + op.op_type = CPUMASK_NEXT_WRAP; + op.arg1 = &n; + op.arg2 = mask; + op.arg3 = &start; + op.arg4 = &wrap; + return bpf_cpumask_op(&op, sizeof(op)); +} + +static __always_inline long libbpf_cpumask_next_and(int n, + struct cpumask *mask1, + struct cpumask *mask2) +{ + struct cpumask_op_args op; + + op.op_type = CPUMASK_NEXT_AND; + op.arg1 = &n; + op.arg2 = mask1; + op.arg3 = mask2; + op.arg4 = INVALID_PTR; + return bpf_cpumask_op(&op, sizeof(op)); +} + +static __always_inline long libbpf_cpumask_cpulist_parse(char *src1, + struct cpumask *src2) +{ + struct cpumask_op_args op; + + op.op_type = CPUMASK_CPULIST_PARSE; + op.arg1 = src1; + op.arg2 = src2; + op.arg3 = INVALID_PTR; + op.arg4 = INVALID_PTR; + return bpf_cpumask_op(&op, sizeof(op)); +} + +static __always_inline int libbpf_num_active_cpus(void) +{ + struct bpf_cpumask_info *cpus; + int key = 0; + + cpus = bpf_map_lookup_elem(&map_cpumask_info, &key); + if (!cpus) + return -1; + + bpf_get_cpumask_info(&map_cpumask_info, cpus); + return getVal(cpus->nums_active_cpus); +} + +static __always_inline int libbpf_num_possible_cpus(void) +{ + struct bpf_cpumask_info *cpus; + int key = 0; + + cpus = bpf_map_lookup_elem(&map_cpumask_info, &key); + if (!cpus) + return -1; + + bpf_get_cpumask_info(&map_cpumask_info, cpus); + return getVal(cpus->nums_possible_cpus); +} + +static __always_inline void libbpf_possible_cpus_mask(struct cpumask *mask) +{ + struct bpf_cpumask_info *cpus; + int key = 0; + + cpus = bpf_map_lookup_elem(&map_cpumask_info, &key); + if (!cpus) + return; + + bpf_get_cpumask_info(&map_cpumask_info, cpus); + libbpf_cpumask_copy(mask, &cpus->cpu_possible_cpumask); +} + +static __always_inline void libbpf_active_cpus_mask(struct cpumask *mask) +{ + struct bpf_cpumask_info *cpus; + int key = 0; + + cpus = bpf_map_lookup_elem(&map_cpumask_info, &key); + if (!cpus) + return; + + bpf_get_cpumask_info(&map_cpumask_info, cpus); + libbpf_cpumask_copy(mask, &cpus->cpu_active_cpumask); +} + +static __always_inline void libbpf_isolate_cpus_mask(struct cpumask *mask) +{ + struct bpf_cpumask_info *cpus; + int key = 0; + + cpus = bpf_map_lookup_elem(&map_cpumask_info, &key); + if (!cpus) + return; + + bpf_get_cpumask_info(&map_cpumask_info, cpus); + libbpf_cpumask_copy(mask, &cpus->cpu_isolate_cpumask); +} + +static __always_inline int libbpf_nr_cpus_ids(void) +{ + struct 
bpf_cpumask_info *cpus; + int key = 0; + + cpus = bpf_map_lookup_elem(&map_cpumask_info, &key); + if (!cpus) + return -1; + + bpf_get_cpumask_info(&map_cpumask_info, cpus); + return getVal(cpus->nr_cpu_ids); +} + +static __always_inline int libbpf_nr_cpumask_bits(void) +{ + struct bpf_cpumask_info *cpus; + int key = 0; + + cpus = bpf_map_lookup_elem(&map_cpumask_info, &key); + if (!cpus) + return -1; + + bpf_get_cpumask_info(&map_cpumask_info, cpus); + return getVal(cpus->bpf_nr_cpumask_bits); +} + +static __always_inline unsigned long libbpf_cfs_load_avg_of(int cpu) +{ + struct bpf_sched_cpu_stats load; + + bpf_sched_cpu_stats_of(cpu, &load, sizeof(load)); + return getVal(load.cfs_load_avg); +} + +static __always_inline unsigned long libbpf_cfs_runnable_avg_of(int cpu) +{ + struct bpf_sched_cpu_stats load; + + bpf_sched_cpu_stats_of(cpu, &load, sizeof(load)); + return getVal(load.cfs_runnable_avg); +} + +static __always_inline unsigned long libbpf_cfs_util_avg_of(int cpu) +{ + struct bpf_sched_cpu_stats load; + + bpf_sched_cpu_stats_of(cpu, &load, sizeof(load)); + return getVal(load.cfs_util_avg); +} + +static __always_inline unsigned long libbpf_rt_load_avg_of(int cpu) +{ + struct bpf_sched_cpu_stats load; + + bpf_sched_cpu_stats_of(cpu, &load, sizeof(load)); + return load.rt_load_avg; +} + +static __always_inline unsigned long libbpf_rt_runnable_avg_of(int cpu) +{ + struct bpf_sched_cpu_stats load; + + bpf_sched_cpu_stats_of(cpu, &load, sizeof(load)); + return load.rt_runnable_avg; +} + +static __always_inline unsigned long libbpf_rt_util_avg_of(int cpu) +{ + struct bpf_sched_cpu_stats load; + + bpf_sched_cpu_stats_of(cpu, &load, sizeof(load)); + return load.rt_util_avg; +} + +static __always_inline unsigned long libbpf_irq_load_avg_of(int cpu) +{ + struct bpf_sched_cpu_stats load; + + bpf_sched_cpu_stats_of(cpu, &load, sizeof(load)); + return load.irq_load_avg; +} + +static __always_inline unsigned long libbpf_irq_util_avg_of(int cpu) +{ + struct bpf_sched_cpu_stats load; + + bpf_sched_cpu_stats_of(cpu, &load, sizeof(load)); + return load.irq_util_avg; +} + +static __always_inline unsigned int libbpf_nr_running_of(int cpu) +{ + struct bpf_sched_cpu_stats running; + + bpf_sched_cpu_stats_of(cpu, &running, sizeof(running)); + return getVal(running.nr_running); +} + +static __always_inline unsigned int libbpf_cfs_nr_running_of(int cpu) +{ + struct bpf_sched_cpu_stats running; + + bpf_sched_cpu_stats_of(cpu, &running, sizeof(running)); + return getVal(running.cfs_nr_running); +} + +static __always_inline unsigned int libbpf_cfs_h_nr_running_of(int cpu) +{ + struct bpf_sched_cpu_stats running; + + bpf_sched_cpu_stats_of(cpu, &running, sizeof(running)); + return getVal(running.cfs_h_nr_running); +} + +static __always_inline unsigned int libbpf_cfs_idle_h_nr_running_of(int cpu) +{ + struct bpf_sched_cpu_stats running; + + bpf_sched_cpu_stats_of(cpu, &running, sizeof(running)); + return running.cfs_idle_h_nr_running; +} + +static __always_inline unsigned int libbpf_rt_nr_running_of(int cpu) +{ + struct bpf_sched_cpu_stats running; + + bpf_sched_cpu_stats_of(cpu, &running, sizeof(running)); + return getVal(running.rt_nr_running); +} + +static __always_inline unsigned int libbpf_rr_nr_running_of(int cpu) +{ + struct bpf_sched_cpu_stats running; + + bpf_sched_cpu_stats_of(cpu, &running, sizeof(running)); + return running.rr_nr_running; +} + +static __always_inline unsigned int libbpf_exit_latency_of(int cpu) +{ + struct bpf_sched_cpu_stats stat; + + bpf_sched_cpu_stats_of(cpu, &stat, 
sizeof(stat));
+	return stat.exit_latency;
+}
+
+static __always_inline unsigned long libbpf_idle_stamp_of(int cpu)
+{
+	struct bpf_sched_cpu_stats stat;
+
+	bpf_sched_cpu_stats_of(cpu, &stat, sizeof(stat));
+	return stat.idle_stamp;
+}
+
+static __always_inline unsigned long libbpf_avg_idle_of(int cpu)
+{
+	struct bpf_sched_cpu_stats stat;
+
+	bpf_sched_cpu_stats_of(cpu, &stat, sizeof(stat));
+	return stat.avg_idle;
+}
+
+static __always_inline unsigned long libbpf_available_idle_cpu(int cpu)
+{
+	struct bpf_sched_cpu_stats stat;
+
+	bpf_sched_cpu_stats_of(cpu, &stat, sizeof(stat));
+	return getVal(stat.available_idle);
+}
+
+static __always_inline unsigned long libbpf_capacity_of(int cpu)
+{
+	struct bpf_sched_cpu_stats cap;
+
+	bpf_sched_cpu_stats_of(cpu, &cap, sizeof(cap));
+	return getVal(cap.capacity);
+}
+
+static __always_inline unsigned long libbpf_capacity_orig_of(int cpu)
+{
+	struct bpf_sched_cpu_stats cap;
+
+	bpf_sched_cpu_stats_of(cpu, &cap, sizeof(cap));
+	return cap.capacity_orig;
+}
+
+static __always_inline int libbpf_cpus_share_cache(int src_cpu, int dst_cpu)
+{
+	return bpf_cpus_share_cache(src_cpu, dst_cpu);
+}
+
+static __always_inline int libbpf_sched_se_tag_of(struct sched_entity *se)
+{
+	int se_tag = 0;
+
+	if (bpf_sched_entity_is_task(se)) {
+		struct task_struct *task = bpf_sched_entity_to_task(se);
+
+		se_tag = bpf_sched_task_tag_of(task);
+	} else {
+		struct task_group *tg = bpf_sched_entity_to_tg(se);
+
+		se_tag = bpf_sched_tg_tag_of(tg);
+	}
+
+	return se_tag;
+}
+#endif

-- Gitee

From 5a36c0b232ef6aa7fec057b45dda4c4e363b29f6 Mon Sep 17 00:00:00 2001
From: Ren Zhijie
Date: Fri, 25 Nov 2022 11:56:30 +0800
Subject: [PATCH 22/23] openeuler_defconfig: enable CONFIG_BPF_SCHED for x86

hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I5F6X6
CVE: NA

--------------------------------

Enable CONFIG_BPF_SCHED for x86 by default.

Signed-off-by: Ren Zhijie
Signed-off-by: Hui Tang
---
 arch/x86/configs/openeuler_defconfig | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/x86/configs/openeuler_defconfig b/arch/x86/configs/openeuler_defconfig
index 300444226d6a..81909a59d98d 100644
--- a/arch/x86/configs/openeuler_defconfig
+++ b/arch/x86/configs/openeuler_defconfig
@@ -235,6 +235,7 @@ CONFIG_KALLSYMS_ALL=y
 CONFIG_KALLSYMS_ABSOLUTE_PERCPU=y
 CONFIG_KALLSYMS_BASE_RELATIVE=y
 # CONFIG_BPF_LSM is not set
+CONFIG_BPF_SCHED=y
 CONFIG_BPF_SYSCALL=y
 CONFIG_ARCH_WANT_DEFAULT_BPF_JIT=y
 CONFIG_BPF_JIT_ALWAYS_ON=y

-- Gitee

From 503dd306436a306cdf3f8613ee5325adfacfccdc Mon Sep 17 00:00:00 2001
From: Jialin Zhang
Date: Mon, 28 Nov 2022 19:03:41 +0800
Subject: [PATCH 23/23] kabi: test fix kabi for enum bpf_prog_type and
 bpf_attach_type

BPF_SCHED and BPF_PROG_TYPE_SCHED break the KABI of enum bpf_prog_type
and enum bpf_attach_type. This patch uses KABI_EXTEND_ENUM and
KABI_BROKEN_INSERT_ENUM to fix it.
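
The tools-side header below uses explicit __GENKSYMS__ guards for the
same purpose; the kernel-side KABI_* macros can be assumed to expand to
something along these lines (a sketch of the presumed shape, not the
actual openEuler kabi.h definitions):

  /* Presumed expansion, for illustration only */
  #ifndef __GENKSYMS__
  #define KABI_BROKEN_INSERT_ENUM(_new)	_new,
  #else
  #define KABI_BROKEN_INSERT_ENUM(_new)
  #endif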
Signed-off-by: Jialin Zhang Signed-off-by: Hui Tang --- include/uapi/linux/bpf.h | 5 +++-- tools/include/uapi/linux/bpf.h | 4 ++++ 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 561d0f2fb58d..374250bcf9d1 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -10,6 +10,7 @@ #include #include +#include /* Extended instruction set based on top of classic BPF */ @@ -199,7 +200,7 @@ enum bpf_prog_type { BPF_PROG_TYPE_EXT, BPF_PROG_TYPE_LSM, BPF_PROG_TYPE_SK_LOOKUP, - BPF_PROG_TYPE_SCHED, + KABI_EXTEND_ENUM(BPF_PROG_TYPE_SCHED) }; enum bpf_attach_type { @@ -241,7 +242,7 @@ enum bpf_attach_type { BPF_XDP_CPUMAP, BPF_SK_LOOKUP, BPF_XDP, - BPF_SCHED, + KABI_BROKEN_INSERT_ENUM(BPF_SCHED) __MAX_BPF_ATTACH_TYPE }; diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index dc6263c41238..abf8023d606b 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -199,7 +199,9 @@ enum bpf_prog_type { BPF_PROG_TYPE_EXT, BPF_PROG_TYPE_LSM, BPF_PROG_TYPE_SK_LOOKUP, +#ifndef __GENKSYMS__ BPF_PROG_TYPE_SCHED, +#endif }; enum bpf_attach_type { @@ -241,7 +243,9 @@ enum bpf_attach_type { BPF_XDP_CPUMAP, BPF_SK_LOOKUP, BPF_XDP, +#ifndef __GENKSYMS__ BPF_SCHED, +#endif __MAX_BPF_ATTACH_TYPE }; -- Gitee
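
As a closing illustration of how the series fits together, a
core-selection program built on libbpf_sched.h might look like this
sketch (hypothetical: it assumes the SEC("sched/...") convention for
BPF_PROG_TYPE_SCHED programs and the regenerated helper declarations
used in the sketches above):

  #include "vmlinux.h"
  #include <bpf/bpf_helpers.h>
  #include <bpf/bpf_tracing.h>
  #include "libbpf_sched.h"

  SEC("sched/cfs_select_rq")
  int BPF_PROG(pick_cache_local, struct sched_migrate_ctx *ctx)
  {
  	int prev = getVal(ctx->prev_cpu);
  	int curr = getVal(ctx->curr_cpu);

  	/* Stay put if the previous cpu is still idle. */
  	if (libbpf_available_idle_cpu(prev))
  		return prev;
  	/* Otherwise prefer the waker's cpu when it shares LLC and is idle. */
  	if (libbpf_cpus_share_cache(prev, curr) &&
  	    libbpf_available_idle_cpu(curr))
  		return curr;
  	return -1;	/* -1: fall through to the kernel's default selection */
  }

  char _license[] SEC("license") = "GPL";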