From 270f49a9dbe66d44b1ad9bab75f02b3c45e518be Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 25 Feb 2019 14:28:39 -0800 Subject: [PATCH 01/23] bpf: enable program stats ANBZ: #5530 commit 492ecee892c2a4ba6a14903d5d586ff750b7e805 upstream. JITed BPF programs are indistinguishable from kernel functions, but unlike kernel code BPF code can be changed often. Typical approach of "perf record" + "perf report" profiling and tuning of kernel code works just as well for BPF programs, but kernel code doesn't need to be monitored whereas BPF programs do. Users load and run large amount of BPF programs. These BPF stats allow tools monitor the usage of BPF on the server. The monitoring tools will turn sysctl kernel.bpf_stats_enabled on and off for few seconds to sample average cost of the programs. Aggregated data over hours and days will provide an insight into cost of BPF and alarms can trigger in case given program suddenly gets more expensive. The cost of two sched_clock() per program invocation adds ~20 nsec. Fast BPF progs (like selftests/bpf/progs/test_pkt_access.c) will slow down from ~10 nsec to ~30 nsec. static_key minimizes the cost of the stats collection. There is no measurable difference before/after this patch with kernel.bpf_stats_enabled=0 Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Signed-off-by: Yuanhe Shu --- include/linux/bpf.h | 9 +++++++++ include/linux/filter.h | 20 +++++++++++++++++++- kernel/bpf/core.c | 31 +++++++++++++++++++++++++++++-- kernel/bpf/syscall.c | 34 ++++++++++++++++++++++++++++++++-- kernel/bpf/verifier.c | 7 ++++++- kernel/sysctl.c | 34 ++++++++++++++++++++++++++++++++++ 6 files changed, 129 insertions(+), 6 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index b4bbc1dc3386..832088d0bc1e 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -16,6 +16,7 @@ #include #include #include +#include struct bpf_verifier_env; struct perf_event; @@ -345,6 +346,12 @@ enum bpf_cgroup_storage_type { #define MAX_BPF_CGROUP_STORAGE_TYPE __BPF_CGROUP_STORAGE_MAX +struct bpf_prog_stats { + u64 cnt; + u64 nsecs; + struct u64_stats_sync syncp; +}; + struct bpf_prog_aux { atomic_t refcnt; u32 used_map_cnt; @@ -394,6 +401,7 @@ struct bpf_prog_aux { * main prog always has linfo_idx == 0 */ u32 linfo_idx; + struct bpf_prog_stats __percpu *stats; union { struct work_struct work; struct rcu_head rcu; @@ -570,6 +578,7 @@ void bpf_map_area_free(void *base); void bpf_map_init_from_attr(struct bpf_map *map, union bpf_attr *attr); extern int sysctl_unprivileged_bpf_disabled; +extern int sysctl_bpf_stats_enabled; int bpf_map_new_fd(struct bpf_map *map, int flags); int bpf_prog_new_fd(struct bpf_prog *prog); diff --git a/include/linux/filter.h b/include/linux/filter.h index dc3c1044f247..59f15a2aeee1 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -563,7 +563,24 @@ struct sk_filter { struct bpf_prog *prog; }; -#define BPF_PROG_RUN(filter, ctx) ({ cant_sleep(); (*(filter)->bpf_func)(ctx, (filter)->insnsi); }) +DECLARE_STATIC_KEY_FALSE(bpf_stats_enabled_key); + +#define BPF_PROG_RUN(prog, ctx) ({ \ + u32 ret; \ + cant_sleep(); \ + if (static_branch_unlikely(&bpf_stats_enabled_key)) { \ + struct bpf_prog_stats *stats; \ + u64 start = sched_clock(); \ + ret = (*(prog)->bpf_func)(ctx, (prog)->insnsi); \ + stats = this_cpu_ptr(prog->aux->stats); \ + u64_stats_update_begin(&stats->syncp); \ + stats->cnt++; \ + stats->nsecs += sched_clock() - start; \ + u64_stats_update_end(&stats->syncp); \ + } else { \ + ret = 
(*(prog)->bpf_func)(ctx, (prog)->insnsi); \ + } \ + ret; }) #define BPF_SKB_CB_LEN QDISC_CB_PRIV_LEN @@ -796,6 +813,7 @@ void bpf_prog_free_jited_linfo(struct bpf_prog *prog); void bpf_prog_free_unused_jited_linfo(struct bpf_prog *prog); struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags); +struct bpf_prog *bpf_prog_alloc_no_stats(unsigned int size, gfp_t gfp_extra_flags); struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size, gfp_t gfp_extra_flags); void __bpf_prog_free(struct bpf_prog *fp); diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 0d1586d4f3a0..6db62aa6d928 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -78,7 +78,7 @@ void *bpf_internal_load_pointer_neg_helper(const struct sk_buff *skb, int k, uns return NULL; } -struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags) +struct bpf_prog *bpf_prog_alloc_no_stats(unsigned int size, gfp_t gfp_extra_flags) { gfp_t gfp_flags = GFP_KERNEL | __GFP_ZERO | gfp_extra_flags; struct bpf_prog_aux *aux; @@ -104,6 +104,26 @@ struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags) return fp; } + +struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags) +{ + gfp_t gfp_flags = GFP_KERNEL | __GFP_ZERO | gfp_extra_flags; + struct bpf_prog *prog; + + prog = bpf_prog_alloc_no_stats(size, gfp_extra_flags); + if (!prog) + return NULL; + + prog->aux->stats = alloc_percpu_gfp(struct bpf_prog_stats, gfp_flags); + if (!prog->aux->stats) { + kfree(prog->aux); + vfree(prog); + return NULL; + } + + u64_stats_init(&prog->aux->stats->syncp); + return prog; +} EXPORT_SYMBOL_GPL(bpf_prog_alloc); int bpf_prog_alloc_jited_linfo(struct bpf_prog *prog) @@ -231,7 +251,10 @@ struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size, void __bpf_prog_free(struct bpf_prog *fp) { - kfree(fp->aux); + if (fp->aux) { + free_percpu(fp->aux->stats); + kfree(fp->aux); + } vfree(fp); } @@ -2067,6 +2090,10 @@ int __weak skb_copy_bits(const struct sk_buff *skb, int offset, void *to, return -EFAULT; } +DEFINE_STATIC_KEY_FALSE(bpf_stats_enabled_key); +EXPORT_SYMBOL(bpf_stats_enabled_key); +int sysctl_bpf_stats_enabled __read_mostly; + /* All definitions of tracepoints related to BPF. 
*/ #define CREATE_TRACE_POINTS #include diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 7ddd79726778..5f616df11ff1 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1291,24 +1291,54 @@ static int bpf_prog_release(struct inode *inode, struct file *filp) return 0; } +static void bpf_prog_get_stats(const struct bpf_prog *prog, + struct bpf_prog_stats *stats) +{ + u64 nsecs = 0, cnt = 0; + int cpu; + + for_each_possible_cpu(cpu) { + const struct bpf_prog_stats *st; + unsigned int start; + u64 tnsecs, tcnt; + + st = per_cpu_ptr(prog->aux->stats, cpu); + do { + start = u64_stats_fetch_begin_irq(&st->syncp); + tnsecs = st->nsecs; + tcnt = st->cnt; + } while (u64_stats_fetch_retry_irq(&st->syncp, start)); + nsecs += tnsecs; + cnt += tcnt; + } + stats->nsecs = nsecs; + stats->cnt = cnt; +} + #ifdef CONFIG_PROC_FS static void bpf_prog_show_fdinfo(struct seq_file *m, struct file *filp) { const struct bpf_prog *prog = filp->private_data; char prog_tag[sizeof(prog->tag) * 2 + 1] = { }; + struct bpf_prog_stats stats; + bpf_prog_get_stats(prog, &stats); bin2hex(prog_tag, prog->tag, sizeof(prog->tag)); seq_printf(m, "prog_type:\t%u\n" "prog_jited:\t%u\n" "prog_tag:\t%s\n" "memlock:\t%llu\n" - "prog_id:\t%u\n", + "prog_id:\t%u\n" + "run_time_ns:\t%llu\n" + "run_cnt:\t%llu\n", prog->type, prog->jited, prog_tag, prog->pages * 1ULL << PAGE_SHIFT, - prog->aux->id); + prog->aux->id, + stats.nsecs, + stats.cnt); } #endif diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 148c41290c32..c283210c53c6 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -7532,7 +7532,12 @@ static int jit_subprogs(struct bpf_verifier_env *env) subprog_end = env->subprog_info[i + 1].start; len = subprog_end - subprog_start; - func[i] = bpf_prog_alloc(bpf_prog_size(len), GFP_USER); + /* BPF_PROG_RUN doesn't call subprogs directly, + * hence main prog stats include the runtime of subprogs. 
+ * subprogs don't have IDs and not reachable via prog_get_next_id + * func[i]->aux->stats will never be accessed and stays NULL + */ + func[i] = bpf_prog_alloc_no_stats(bpf_prog_size(len), GFP_USER); if (!func[i]) goto out_free; memcpy(func[i]->insnsi, &prog->insnsi[subprog_start], diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 0b6dc1f8fdda..827ca50fecc5 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -240,6 +240,9 @@ static int proc_dostring_coredump(struct ctl_table *table, int write, #endif static int proc_dopipe_max_size(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos); +static int proc_dointvec_minmax_bpf_stats(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos); #ifdef CONFIG_MAGIC_SYSRQ /* Note: sysrq code uses its own private copy */ @@ -1413,6 +1416,15 @@ static struct ctl_table kern_table[] = { .extra2 = &one, }, #endif + { + .procname = "bpf_stats_enabled", + .data = &sysctl_bpf_stats_enabled, + .maxlen = sizeof(sysctl_bpf_stats_enabled), + .mode = 0644, + .proc_handler = proc_dointvec_minmax_bpf_stats, + .extra1 = &zero, + .extra2 = &one, + }, #if defined(CONFIG_TREE_RCU) || defined(CONFIG_PREEMPT_RCU) { .procname = "panic_on_rcu_stall", @@ -3738,6 +3750,28 @@ int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int write, #endif /* CONFIG_PROC_SYSCTL */ +static int proc_dointvec_minmax_bpf_stats(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + int ret, bpf_stats = *(int *)table->data; + struct ctl_table tmp = *table; + + if (write && !capable(CAP_SYS_ADMIN)) + return -EPERM; + + tmp.data = &bpf_stats; + ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos); + if (write && !ret) { + *(int *)table->data = bpf_stats; + if (bpf_stats) + static_branch_enable(&bpf_stats_enabled_key); + else + static_branch_disable(&bpf_stats_enabled_key); + } + return ret; +} + /* * No sense putting this after each symbol definition, twice, * exception granted :-) -- Gitee From c45c5b53a3effb7db12162f36f58a650d50bf7f8 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 25 Feb 2019 14:28:40 -0800 Subject: [PATCH 02/23] bpf: expose program stats via bpf_prog_info ANBZ: #5530 commit 5f8f8b93aeb8371c54af08bece2bd04bc2d48707 upstream. 
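This change exposes the counters collected by the previous patch to user space. As a hedged illustration (not part of the patch), a monitoring tool could read them roughly as follows, assuming libbpf's bpf_obj_get_info_by_fd() wrapper and a prog_fd obtained elsewhere; the counters only advance while kernel.bpf_stats_enabled (previous patch) is set to 1:

	struct bpf_prog_info info = {};
	__u32 info_len = sizeof(info);

	/* run_time_ns/run_cnt stay at zero unless stats collection is enabled */
	if (!bpf_obj_get_info_by_fd(prog_fd, &info, &info_len))
		printf("avg cost: %llu ns over %llu runs\n",
		       info.run_cnt ? info.run_time_ns / info.run_cnt : 0ULL,
		       info.run_cnt);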
Return bpf program run_time_ns and run_cnt via bpf_prog_info Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Signed-off-by: Yuanhe Shu --- include/uapi/linux/bpf.h | 2 ++ kernel/bpf/syscall.c | 5 +++++ 2 files changed, 7 insertions(+) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 991b92610531..d6b8a1dc60e8 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -2814,6 +2814,8 @@ struct bpf_prog_info { __u32 jited_line_info_rec_size; __u32 nr_prog_tags; __aligned_u64 prog_tags; + __u64 run_time_ns; + __u64 run_cnt; } __attribute__((aligned(8))); struct bpf_map_info { diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 5f616df11ff1..2c2ee3b61884 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -2171,6 +2171,7 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog, struct bpf_prog_info __user *uinfo = u64_to_user_ptr(attr->info.info); struct bpf_prog_info info = {}; u32 info_len = attr->info.info_len; + struct bpf_prog_stats stats; char __user *uinsns; u32 ulen; int err; @@ -2210,6 +2211,10 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog, if (err) return err; + bpf_prog_get_stats(prog, &stats); + info.run_time_ns = stats.nsecs; + info.run_cnt = stats.cnt; + if (!capable(CAP_SYS_ADMIN)) { info.jited_prog_len = 0; info.xlated_prog_len = 0; -- Gitee From 94bc38da7c5e36a22146086c44b0bd3cc047349c Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 25 Feb 2019 14:28:41 -0800 Subject: [PATCH 03/23] tools/bpf: sync bpf.h into tools ANBZ: #5530 commit b1eca86db68b48f59b9f195c8b4f8114d2a9918c upstream. sync bpf.h into tools directory Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Signed-off-by: Yuanhe Shu --- tools/include/uapi/linux/bpf.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index a837314ae674..73a4b9a51b95 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -2809,6 +2809,8 @@ struct bpf_prog_info { __u32 jited_line_info_rec_size; __u32 nr_prog_tags; __aligned_u64 prog_tags; + __u64 run_time_ns; + __u64 run_cnt; } __attribute__((aligned(8))); struct bpf_map_info { -- Gitee From c2e95c5594c1385cfbcf26200d60f336370a74e5 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 25 Feb 2019 14:28:42 -0800 Subject: [PATCH 04/23] tools/bpftool: recognize bpf_prog_info run_time_ns and run_cnt ANBZ: #5530 commit 88ad472b8a4ad2292d11835652462fd9f745245e upstream. 
$ bpftool p s 1: kprobe tag a56587d488d216c9 gpl run_time_ns 79786 run_cnt 8 loaded_at 2019-02-22T12:22:51-0800 uid 0 xlated 352B not jited memlock 4096B $ bpftool --json --pretty p s [{ "id": 1, "type": "kprobe", "tag": "a56587d488d216c9", "gpl_compatible": true, "run_time_ns": 79786, "run_cnt": 8, "loaded_at": 1550866971, "uid": 0, "bytes_xlated": 352, "jited": false, "bytes_memlock": 4096 } ] Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Signed-off-by: Yuanhe Shu --- tools/bpf/bpftool/Documentation/bpftool-prog.rst | 4 +++- tools/bpf/bpftool/prog.c | 7 +++++++ 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/tools/bpf/bpftool/Documentation/bpftool-prog.rst b/tools/bpf/bpftool/Documentation/bpftool-prog.rst index 12bc1e2d4b46..9386bd6e0396 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-prog.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-prog.rst @@ -171,7 +171,7 @@ EXAMPLES :: - 10: xdp name some_prog tag 005a3d2123620c8b gpl + 10: xdp name some_prog tag 005a3d2123620c8b gpl run_time_ns 81632 run_cnt 10 loaded_at 2017-09-29T20:11:00+0000 uid 0 xlated 528B jited 370B memlock 4096B map_ids 10 @@ -184,6 +184,8 @@ EXAMPLES "type": "xdp", "tag": "005a3d2123620c8b", "gpl_compatible": true, + "run_time_ns": 81632, + "run_cnt": 10, "loaded_at": 1506715860, "uid": 0, "bytes_xlated": 528, diff --git a/tools/bpf/bpftool/prog.c b/tools/bpf/bpftool/prog.c index 759c4ea9b56e..9027554a2d37 100644 --- a/tools/bpf/bpftool/prog.c +++ b/tools/bpf/bpftool/prog.c @@ -214,6 +214,10 @@ static void print_prog_json(struct bpf_prog_info *info, int fd) info->tag[4], info->tag[5], info->tag[6], info->tag[7]); jsonw_bool_field(json_wtr, "gpl_compatible", info->gpl_compatible); + if (info->run_time_ns) { + jsonw_uint_field(json_wtr, "run_time_ns", info->run_time_ns); + jsonw_uint_field(json_wtr, "run_cnt", info->run_cnt); + } print_dev_json(info->ifindex, info->netns_dev, info->netns_ino); @@ -277,6 +281,9 @@ static void print_prog_plain(struct bpf_prog_info *info, int fd) fprint_hex(stdout, info->tag, BPF_TAG_SIZE, ""); print_dev_plain(info->ifindex, info->netns_dev, info->netns_ino); printf("%s", info->gpl_compatible ? " gpl" : ""); + if (info->run_time_ns) + printf(" run_time_ns %lld run_cnt %lld", + info->run_time_ns, info->run_cnt); printf("\n"); if (info->load_time) { -- Gitee From a281928e6f31a605f5375849503f6b2c9fdc0773 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Wed, 27 Feb 2019 18:30:44 -0800 Subject: [PATCH 05/23] bpf: fix build without bpf_syscall ANBZ: #5530 commit 3fcc5530bcb2a879e32bd940e6eafee328ac3647 upstream. 
wrap bpf_stats_enabled sysctl with #ifdef Reported-by: Stephen Rothwell Fixes: 492ecee892c2 ("bpf: enable program stats") Signed-off-by: Alexei Starovoitov Acked-by: Song Liu Signed-off-by: Daniel Borkmann Signed-off-by: Yuanhe Shu --- kernel/sysctl.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 827ca50fecc5..d8015709ffb2 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -240,9 +240,11 @@ static int proc_dostring_coredump(struct ctl_table *table, int write, #endif static int proc_dopipe_max_size(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos); +#ifdef CONFIG_BPF_SYSCALL static int proc_dointvec_minmax_bpf_stats(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos); +#endif #ifdef CONFIG_MAGIC_SYSRQ /* Note: sysrq code uses its own private copy */ @@ -1415,7 +1417,6 @@ static struct ctl_table kern_table[] = { .extra1 = &one, .extra2 = &one, }, -#endif { .procname = "bpf_stats_enabled", .data = &sysctl_bpf_stats_enabled, @@ -1425,6 +1426,7 @@ static struct ctl_table kern_table[] = { .extra1 = &zero, .extra2 = &one, }, +#endif #if defined(CONFIG_TREE_RCU) || defined(CONFIG_PREEMPT_RCU) { .procname = "panic_on_rcu_stall", @@ -3750,6 +3752,7 @@ int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int write, #endif /* CONFIG_PROC_SYSCTL */ +#ifdef CONFIG_BPF_SYSCALL static int proc_dointvec_minmax_bpf_stats(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) @@ -3771,7 +3774,7 @@ static int proc_dointvec_minmax_bpf_stats(struct ctl_table *table, int write, } return ret; } - +#endif /* * No sense putting this after each symbol definition, twice, * exception granted :-) -- Gitee From 91d9cf7e1234f2831141c53e3deab5cf818af1ed Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 1 Mar 2019 14:33:11 -0800 Subject: [PATCH 06/23] bpf: fix u64_stats_init() usage in bpf_prog_alloc() ANBZ: #5530 commit 4b9113045b1745ec8512d6743680809edca6a74e upstream. We need to iterate through all possible cpus. Fixes: 492ecee892c2 ("bpf: enable program stats") Signed-off-by: Eric Dumazet Reported-by: Guenter Roeck Tested-by: Guenter Roeck Acked-by: Song Liu Signed-off-by: Daniel Borkmann Signed-off-by: Yuanhe Shu --- kernel/bpf/core.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 6db62aa6d928..594dd9fe6d51 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -109,6 +109,7 @@ struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags) { gfp_t gfp_flags = GFP_KERNEL | __GFP_ZERO | gfp_extra_flags; struct bpf_prog *prog; + int cpu; prog = bpf_prog_alloc_no_stats(size, gfp_extra_flags); if (!prog) @@ -121,7 +122,12 @@ struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags) return NULL; } - u64_stats_init(&prog->aux->stats->syncp); + for_each_possible_cpu(cpu) { + struct bpf_prog_stats *pstats; + + pstats = per_cpu_ptr(prog->aux->stats, cpu); + u64_stats_init(&pstats->syncp); + } return prog; } EXPORT_SYMBOL_GPL(bpf_prog_alloc); -- Gitee From f3d6a9d6821f7a81bc8354f99d6259e20b23fd9e Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 4 Mar 2019 21:34:12 +0100 Subject: [PATCH 07/23] bpf: fix sysctl.c warning ANBZ: #5530 commit 78c3aff834f7939d1a2570e366e33f2a880d4d9e upstream. 
When CONFIG_BPF_SYSCALL or CONFIG_SYSCTL is disabled, we get a warning about an unused function: kernel/sysctl.c:3331:12: error: 'proc_dointvec_minmax_bpf_stats' defined but not used [-Werror=unused-function] static int proc_dointvec_minmax_bpf_stats(struct ctl_table *table, int write, The CONFIG_BPF_SYSCALL check was already handled, but the SYSCTL check is needed on top. Fixes: 492ecee892c2 ("bpf: enable program stats") Signed-off-by: Arnd Bergmann Reviewed-by: Kees Cook Reviewed-by: Christian Brauner Acked-by: Song Liu Signed-off-by: Daniel Borkmann Signed-off-by: Yuanhe Shu --- kernel/sysctl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/sysctl.c b/kernel/sysctl.c index d8015709ffb2..803c16c30bd7 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -3752,7 +3752,7 @@ int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int write, #endif /* CONFIG_PROC_SYSCTL */ -#ifdef CONFIG_BPF_SYSCALL +#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_SYSCTL) static int proc_dointvec_minmax_bpf_stats(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) -- Gitee From a23aa009990da1c196a235732587105486601404 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Wed, 27 Feb 2019 11:08:06 -0500 Subject: [PATCH 08/23] bpf: add missing entries to bpf_helpers.h ANBZ: #5530 commit f2bb53887eb3e8f859ac7cfc09d1a3801492c009 upstream. This header defines the BPF functions enumerated in uapi/linux.bpf.h in a callable format. Expand to include all registered functions. Signed-off-by: Willem de Bruijn Acked-by: Song Liu Signed-off-by: Daniel Borkmann Signed-off-by: Yuanhe Shu --- tools/testing/selftests/bpf/bpf_helpers.h | 30 +++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/tools/testing/selftests/bpf/bpf_helpers.h b/tools/testing/selftests/bpf/bpf_helpers.h index dc94528078d3..a908dd501bdd 100644 --- a/tools/testing/selftests/bpf/bpf_helpers.h +++ b/tools/testing/selftests/bpf/bpf_helpers.h @@ -230,6 +230,36 @@ static int (*bpf_skb_change_head)(void *, int len, int flags) = (void *) BPF_FUNC_skb_change_head; static int (*bpf_skb_pull_data)(void *, int len) = (void *) BPF_FUNC_skb_pull_data; +static unsigned int (*bpf_get_cgroup_classid)(void *ctx) = + (void *) BPF_FUNC_get_cgroup_classid; +static unsigned int (*bpf_get_route_realm)(void *ctx) = + (void *) BPF_FUNC_get_route_realm; +static int (*bpf_skb_change_proto)(void *ctx, __be16 proto, __u64 flags) = + (void *) BPF_FUNC_skb_change_proto; +static int (*bpf_skb_change_type)(void *ctx, __u32 type) = + (void *) BPF_FUNC_skb_change_type; +static unsigned int (*bpf_get_hash_recalc)(void *ctx) = + (void *) BPF_FUNC_get_hash_recalc; +static unsigned long long (*bpf_get_current_task)(void *ctx) = + (void *) BPF_FUNC_get_current_task; +static int (*bpf_skb_change_tail)(void *ctx, __u32 len, __u64 flags) = + (void *) BPF_FUNC_skb_change_tail; +static long long (*bpf_csum_update)(void *ctx, __u32 csum) = + (void *) BPF_FUNC_csum_update; +static void (*bpf_set_hash_invalid)(void *ctx) = + (void *) BPF_FUNC_set_hash_invalid; +static int (*bpf_get_numa_node_id)(void) = + (void *) BPF_FUNC_get_numa_node_id; +static int (*bpf_probe_read_str)(void *ctx, __u32 size, + const void *unsafe_ptr) = + (void *) BPF_FUNC_probe_read_str; +static unsigned int (*bpf_get_socket_uid)(void *ctx) = + (void *) BPF_FUNC_get_socket_uid; +static unsigned int (*bpf_set_hash)(void *ctx, __u32 hash) = + (void *) BPF_FUNC_set_hash; +static int (*bpf_skb_adjust_room)(void *ctx, __s32 len_diff, __u32 mode, + 
unsigned long long flags) = + (void *) BPF_FUNC_skb_adjust_room; /* Scan the ARCH passed in from ARCH env variable (see Makefile) */ #if defined(__TARGET_ARCH_x86) -- Gitee From ec449d797f18d3656f1b6069e4e83da696b40ddc Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 27 Feb 2019 19:04:12 -0800 Subject: [PATCH 09/23] tools: libbpf: add a correctly named define for map iteration ANBZ: #5530 commit f74a53d9a567f6bc6f6d8460e84c76bd2a45d016 upstream. For historical reasons the helper to loop over maps in an object is called bpf_map__for_each while it really should be called bpf_object__for_each_map. Rename and add a correctly named define for backward compatibility. Switch all in-tree users to the correct name (Quentin). Signed-off-by: Jakub Kicinski Reviewed-by: Quentin Monnet Signed-off-by: Daniel Borkmann Signed-off-by: Yuanhe Shu --- tools/bpf/bpftool/prog.c | 4 ++-- tools/lib/bpf/libbpf.c | 8 ++++---- tools/lib/bpf/libbpf.h | 3 ++- tools/perf/util/bpf-loader.c | 4 ++-- tools/testing/selftests/bpf/test_libbpf_open.c | 2 +- 5 files changed, 11 insertions(+), 10 deletions(-) diff --git a/tools/bpf/bpftool/prog.c b/tools/bpf/bpftool/prog.c index 9027554a2d37..9d3a891a34ad 100644 --- a/tools/bpf/bpftool/prog.c +++ b/tools/bpf/bpftool/prog.c @@ -1057,7 +1057,7 @@ static int load_with_options(int argc, char **argv, bool first_prog_only) j = 0; while (j < old_map_fds && map_replace[j].name) { i = 0; - bpf_map__for_each(map, obj) { + bpf_object__for_each_map(map, obj) { if (!strcmp(bpf_map__name(map), map_replace[j].name)) { map_replace[j].idx = i; break; @@ -1078,7 +1078,7 @@ static int load_with_options(int argc, char **argv, bool first_prog_only) /* Set ifindex and name reuse */ j = 0; idx = 0; - bpf_map__for_each(map, obj) { + bpf_object__for_each_map(map, obj) { if (!bpf_map__is_offload_neutral(map)) bpf_map__set_ifindex(map, ifindex); diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index b6e9d22ee0ae..638c6bd89b84 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -2109,7 +2109,7 @@ int bpf_object__pin_maps(struct bpf_object *obj, const char *path) if (err) return err; - bpf_map__for_each(map, obj) { + bpf_object__for_each_map(map, obj) { char buf[PATH_MAX]; int len; @@ -2156,7 +2156,7 @@ int bpf_object__unpin_maps(struct bpf_object *obj, const char *path) if (!obj) return -ENOENT; - bpf_map__for_each(map, obj) { + bpf_object__for_each_map(map, obj) { char buf[PATH_MAX]; int len; @@ -2844,7 +2844,7 @@ bpf_object__find_map_by_name(struct bpf_object *obj, const char *name) { struct bpf_map *pos; - bpf_map__for_each(pos, obj) { + bpf_object__for_each_map(pos, obj) { if (pos->name && !strcmp(pos->name, name)) return pos; } @@ -2937,7 +2937,7 @@ int bpf_prog_load_xattr(const struct bpf_prog_load_attr *attr, first_prog = prog; } - bpf_map__for_each(map, obj) { + bpf_object__for_each_map(map, obj) { if (!bpf_map__is_offload_neutral(map)) map->map_ifindex = attr->ifindex; } diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h index 0409781c44cc..aa1521a51687 100644 --- a/tools/lib/bpf/libbpf.h +++ b/tools/lib/bpf/libbpf.h @@ -279,10 +279,11 @@ bpf_object__find_map_by_offset(struct bpf_object *obj, size_t offset); LIBBPF_API struct bpf_map * bpf_map__next(struct bpf_map *map, struct bpf_object *obj); -#define bpf_map__for_each(pos, obj) \ +#define bpf_object__for_each_map(pos, obj) \ for ((pos) = bpf_map__next(NULL, (obj)); \ (pos) != NULL; \ (pos) = bpf_map__next((pos), (obj))) +#define bpf_map__for_each bpf_object__for_each_map LIBBPF_API struct bpf_map 
* bpf_map__prev(struct bpf_map *map, struct bpf_object *obj); diff --git a/tools/perf/util/bpf-loader.c b/tools/perf/util/bpf-loader.c index 30f28daaf51a..0e03c4e9351a 100644 --- a/tools/perf/util/bpf-loader.c +++ b/tools/perf/util/bpf-loader.c @@ -1489,7 +1489,7 @@ apply_obj_config_object(struct bpf_object *obj) struct bpf_map *map; int err; - bpf_map__for_each(map, obj) { + bpf_object__for_each_map(map, obj) { err = apply_obj_config_map(map); if (err) return err; @@ -1513,7 +1513,7 @@ int bpf__apply_obj_config(void) #define bpf__for_each_map(pos, obj, objtmp) \ bpf_object__for_each_safe(obj, objtmp) \ - bpf_map__for_each(pos, obj) + bpf_object__for_each_map(pos, obj) #define bpf__for_each_map_named(pos, obj, objtmp, name) \ bpf__for_each_map(pos, obj, objtmp) \ diff --git a/tools/testing/selftests/bpf/test_libbpf_open.c b/tools/testing/selftests/bpf/test_libbpf_open.c index c413fb96afef..9e9db202d218 100644 --- a/tools/testing/selftests/bpf/test_libbpf_open.c +++ b/tools/testing/selftests/bpf/test_libbpf_open.c @@ -69,7 +69,7 @@ int test_walk_maps(struct bpf_object *obj, bool verbose) struct bpf_map *map; int cnt = 0; - bpf_map__for_each(map, obj) { + bpf_object__for_each_map(map, obj) { cnt++; if (verbose) printf("Map (count:%d) name: %s\n", cnt, -- Gitee From 04448655f37da3de33ff2cac1e4378ac9966d5b5 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 28 Feb 2019 15:31:22 -0800 Subject: [PATCH 10/23] libbpf: fix formatting for btf_ext__get_raw_data ANBZ: #5530 commit 1baabdc1089eb807cdcabebad50b36c8b9895a48 upstream. Fix invalid formatting of pointer arg. Signed-off-by: Andrii Nakryiko Acked-by: Song Liu Signed-off-by: Daniel Borkmann Signed-off-by: Yuanhe Shu --- tools/lib/bpf/btf.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/lib/bpf/btf.h b/tools/lib/bpf/btf.h index 94bbc249b0f1..b60bb7cf5fff 100644 --- a/tools/lib/bpf/btf.h +++ b/tools/lib/bpf/btf.h @@ -76,7 +76,7 @@ LIBBPF_API int btf__get_map_kv_tids(const struct btf *btf, const char *map_name, LIBBPF_API struct btf_ext *btf_ext__new(__u8 *data, __u32 size); LIBBPF_API void btf_ext__free(struct btf_ext *btf_ext); -LIBBPF_API const void *btf_ext__get_raw_data(const struct btf_ext* btf_ext, +LIBBPF_API const void *btf_ext__get_raw_data(const struct btf_ext *btf_ext, __u32 *size); LIBBPF_API int btf_ext__reloc_func_info(const struct btf *btf, const struct btf_ext *btf_ext, -- Gitee From 87b6c26ddbf2da3b04f164ac75c897966e740a40 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 28 Feb 2019 15:31:23 -0800 Subject: [PATCH 11/23] btf: allow to customize dedup hash table size ANBZ: #5530 commit 51edf5f6e015c48b62e24ab2fbcad8885ca1c74e upstream. Default size of dedup table (16k) is good enough for most binaries, even typical vmlinux images. But there are cases of binaries with huge amount of BTF types (e.g., allyesconfig variants of kernel), which benefit from having bigger dedup table size to lower amount of unnecessary hash collisions. Tools like pahole, thus, can tune this parameter to reach optimal performance. This change also serves double purpose of allowing tests to force hash collisions to test some corner cases, used in follow up patch. 
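A hedged usage sketch (not part of the patch; btf and btf_ext are assumed to be already loaded by the caller). libbpf rounds the requested size up to a power of two and caps it at 2^31, falling back to the 16k default when 0 is passed:

	struct btf_dedup_opts opts = {
		.dedup_table_size = 1U << 20,	/* e.g. for allyesconfig-sized BTF */
		.dont_resolve_fwds = false,
	};
	int err = btf__dedup(btf, btf_ext, &opts);

	if (err)
		fprintf(stderr, "btf__dedup failed: %d\n", err);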
Signed-off-by: Andrii Nakryiko Acked-by: Yonghong Song Signed-off-by: Daniel Borkmann Signed-off-by: Yuanhe Shu --- tools/lib/bpf/btf.c | 53 ++++++++++++++++++++++++++++++--------------- tools/lib/bpf/btf.h | 1 + 2 files changed, 37 insertions(+), 17 deletions(-) diff --git a/tools/lib/bpf/btf.c b/tools/lib/bpf/btf.c index 9dc1c1cccf63..2b179c68722d 100644 --- a/tools/lib/bpf/btf.c +++ b/tools/lib/bpf/btf.c @@ -1069,8 +1069,8 @@ int btf__dedup(struct btf *btf, struct btf_ext *btf_ext, return err; } -#define BTF_DEDUP_TABLE_SIZE_LOG 14 -#define BTF_DEDUP_TABLE_MOD ((1 << BTF_DEDUP_TABLE_SIZE_LOG) - 1) +#define BTF_DEDUP_TABLE_DEFAULT_SIZE (1 << 14) +#define BTF_DEDUP_TABLE_MAX_SIZE_LOG 31 #define BTF_UNPROCESSED_ID ((__u32)-1) #define BTF_IN_PROGRESS_ID ((__u32)-2) @@ -1127,18 +1127,21 @@ static inline __u32 hash_combine(__u32 h, __u32 value) #undef GOLDEN_RATIO_PRIME } -#define for_each_hash_node(table, hash, node) \ - for (node = table[hash & BTF_DEDUP_TABLE_MOD]; node; node = node->next) +#define for_each_dedup_cand(d, hash, node) \ + for (node = d->dedup_table[hash & (d->opts.dedup_table_size - 1)]; \ + node; \ + node = node->next) static int btf_dedup_table_add(struct btf_dedup *d, __u32 hash, __u32 type_id) { struct btf_dedup_node *node = malloc(sizeof(struct btf_dedup_node)); + int bucket = hash & (d->opts.dedup_table_size - 1); if (!node) return -ENOMEM; node->type_id = type_id; - node->next = d->dedup_table[hash & BTF_DEDUP_TABLE_MOD]; - d->dedup_table[hash & BTF_DEDUP_TABLE_MOD] = node; + node->next = d->dedup_table[bucket]; + d->dedup_table[bucket] = node; return 0; } @@ -1176,7 +1179,7 @@ static void btf_dedup_table_free(struct btf_dedup *d) if (!d->dedup_table) return; - for (i = 0; i < (1 << BTF_DEDUP_TABLE_SIZE_LOG); i++) { + for (i = 0; i < d->opts.dedup_table_size; i++) { while (d->dedup_table[i]) { tmp = d->dedup_table[i]; d->dedup_table[i] = tmp->next; @@ -1211,19 +1214,37 @@ static void btf_dedup_free(struct btf_dedup *d) free(d); } +/* Find closest power of two >= to size, capped at 2^max_size_log */ +static __u32 roundup_pow2_max(__u32 size, int max_size_log) +{ + int i; + + for (i = 0; i < max_size_log && (1U << i) < size; i++) + ; + return 1U << i; +} + + static struct btf_dedup *btf_dedup_new(struct btf *btf, struct btf_ext *btf_ext, const struct btf_dedup_opts *opts) { struct btf_dedup *d = calloc(1, sizeof(struct btf_dedup)); int i, err = 0; + __u32 sz; if (!d) return ERR_PTR(-ENOMEM); + d->opts.dont_resolve_fwds = opts && opts->dont_resolve_fwds; + sz = opts && opts->dedup_table_size ? 
opts->dedup_table_size + : BTF_DEDUP_TABLE_DEFAULT_SIZE; + sz = roundup_pow2_max(sz, BTF_DEDUP_TABLE_MAX_SIZE_LOG); + d->opts.dedup_table_size = sz; + d->btf = btf; d->btf_ext = btf_ext; - d->dedup_table = calloc(1 << BTF_DEDUP_TABLE_SIZE_LOG, + d->dedup_table = calloc(d->opts.dedup_table_size, sizeof(struct btf_dedup_node *)); if (!d->dedup_table) { err = -ENOMEM; @@ -1248,8 +1269,6 @@ static struct btf_dedup *btf_dedup_new(struct btf *btf, struct btf_ext *btf_ext, for (i = 0; i <= btf->nr_types; i++) d->hypot_map[i] = BTF_UNPROCESSED_ID; - d->opts.dont_resolve_fwds = opts && opts->dont_resolve_fwds; - done: if (err) { btf_dedup_free(d); @@ -1823,7 +1842,7 @@ static int btf_dedup_prim_type(struct btf_dedup *d, __u32 type_id) case BTF_KIND_INT: h = btf_hash_int(t); - for_each_hash_node(d->dedup_table, h, cand_node) { + for_each_dedup_cand(d, h, cand_node) { cand = d->btf->types[cand_node->type_id]; if (btf_equal_int(t, cand)) { new_id = cand_node->type_id; @@ -1834,7 +1853,7 @@ static int btf_dedup_prim_type(struct btf_dedup *d, __u32 type_id) case BTF_KIND_ENUM: h = btf_hash_enum(t); - for_each_hash_node(d->dedup_table, h, cand_node) { + for_each_dedup_cand(d, h, cand_node) { cand = d->btf->types[cand_node->type_id]; if (btf_equal_enum(t, cand)) { new_id = cand_node->type_id; @@ -1845,7 +1864,7 @@ static int btf_dedup_prim_type(struct btf_dedup *d, __u32 type_id) case BTF_KIND_FWD: h = btf_hash_common(t); - for_each_hash_node(d->dedup_table, h, cand_node) { + for_each_dedup_cand(d, h, cand_node) { cand = d->btf->types[cand_node->type_id]; if (btf_equal_common(t, cand)) { new_id = cand_node->type_id; @@ -2262,7 +2281,7 @@ static int btf_dedup_struct_type(struct btf_dedup *d, __u32 type_id) return 0; h = btf_hash_struct(t); - for_each_hash_node(d->dedup_table, h, cand_node) { + for_each_dedup_cand(d, h, cand_node) { int eq; btf_dedup_clear_hypot_map(d); @@ -2349,7 +2368,7 @@ static int btf_dedup_ref_type(struct btf_dedup *d, __u32 type_id) t->type = ref_type_id; h = btf_hash_common(t); - for_each_hash_node(d->dedup_table, h, cand_node) { + for_each_dedup_cand(d, h, cand_node) { cand = d->btf->types[cand_node->type_id]; if (btf_equal_common(t, cand)) { new_id = cand_node->type_id; @@ -2372,7 +2391,7 @@ static int btf_dedup_ref_type(struct btf_dedup *d, __u32 type_id) info->index_type = ref_type_id; h = btf_hash_array(t); - for_each_hash_node(d->dedup_table, h, cand_node) { + for_each_dedup_cand(d, h, cand_node) { cand = d->btf->types[cand_node->type_id]; if (btf_equal_array(t, cand)) { new_id = cand_node->type_id; @@ -2403,7 +2422,7 @@ static int btf_dedup_ref_type(struct btf_dedup *d, __u32 type_id) } h = btf_hash_fnproto(t); - for_each_hash_node(d->dedup_table, h, cand_node) { + for_each_dedup_cand(d, h, cand_node) { cand = d->btf->types[cand_node->type_id]; if (btf_equal_fnproto(t, cand)) { new_id = cand_node->type_id; diff --git a/tools/lib/bpf/btf.h b/tools/lib/bpf/btf.h index b60bb7cf5fff..28a1e1e59861 100644 --- a/tools/lib/bpf/btf.h +++ b/tools/lib/bpf/btf.h @@ -90,6 +90,7 @@ LIBBPF_API __u32 btf_ext__func_info_rec_size(const struct btf_ext *btf_ext); LIBBPF_API __u32 btf_ext__line_info_rec_size(const struct btf_ext *btf_ext); struct btf_dedup_opts { + unsigned int dedup_table_size; bool dont_resolve_fwds; }; -- Gitee From 8a8aa2467fb3f606f4f80d0fc4ff95327039dcb2 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 28 Feb 2019 15:31:24 -0800 Subject: [PATCH 12/23] btf: fix bug with resolving STRUCT/UNION into corresponding FWD ANBZ: #5530 commit 
91097fbee4c025ac72f91ae41feba3a822cc1316 upstream. When checking available canonical candidates for struct/union algorithm utilizes btf_dedup_is_equiv to determine if candidate is suitable. This check is not enough when candidate is corresponding FWD for that struct/union, because according to equivalence logic they are equivalent. When it so happens that FWD and STRUCT/UNION end in hashing to the same bucket, it's possible to create remapping loop from FWD to STRUCT and STRUCT to same FWD, which will cause btf_dedup() to loop forever. This patch fixes the issue by additionally checking that type and canonical candidate are strictly equal (utilizing btf_equal_struct). Fixes: d5caef5b5655 ("btf: add BTF types deduplication algorithm") Reported-by: Arnaldo Carvalho de Melo Signed-off-by: Andrii Nakryiko Acked-by: Song Liu Acked-by: Yonghong Song Signed-off-by: Daniel Borkmann Signed-off-by: Yuanhe Shu --- tools/lib/bpf/btf.c | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/tools/lib/bpf/btf.c b/tools/lib/bpf/btf.c index 2b179c68722d..8aeb80d73285 100644 --- a/tools/lib/bpf/btf.c +++ b/tools/lib/bpf/btf.c @@ -1662,7 +1662,7 @@ static __u32 btf_hash_struct(struct btf_type *t) * IDs. This check is performed during type graph equivalence check and * referenced types equivalence is checked separately. */ -static bool btf_equal_struct(struct btf_type *t1, struct btf_type *t2) +static bool btf_shallow_equal_struct(struct btf_type *t1, struct btf_type *t2) { struct btf_member *m1, *m2; __u16 vlen; @@ -2123,7 +2123,7 @@ static int btf_dedup_is_equiv(struct btf_dedup *d, __u32 cand_id, struct btf_member *cand_m, *canon_m; __u16 vlen; - if (!btf_equal_struct(cand_type, canon_type)) + if (!btf_shallow_equal_struct(cand_type, canon_type)) return 0; vlen = BTF_INFO_VLEN(cand_type->info); cand_m = (struct btf_member *)(cand_type + 1); @@ -2264,7 +2264,7 @@ static void btf_dedup_merge_hypot_map(struct btf_dedup *d) static int btf_dedup_struct_type(struct btf_dedup *d, __u32 type_id) { struct btf_dedup_node *cand_node; - struct btf_type *t; + struct btf_type *cand_type, *t; /* if we don't find equivalent type, then we are canonical */ __u32 new_id = type_id; __u16 kind; @@ -2284,6 +2284,20 @@ static int btf_dedup_struct_type(struct btf_dedup *d, __u32 type_id) for_each_dedup_cand(d, h, cand_node) { int eq; + /* + * Even though btf_dedup_is_equiv() checks for + * btf_shallow_equal_struct() internally when checking two + * structs (unions) for equivalence, we need to guard here + * from picking matching FWD type as a dedup candidate. + * This can happen due to hash collision. In such case just + * relying on btf_dedup_is_equiv() would lead to potentially + * creating a loop (FWD -> STRUCT and STRUCT -> FWD), because + * FWD and compatible STRUCT/UNION are considered equivalent. + */ + cand_type = d->btf->types[cand_node->type_id]; + if (!btf_shallow_equal_struct(t, cand_type)) + continue; + btf_dedup_clear_hypot_map(d); eq = btf_dedup_is_equiv(d, type_id, cand_node->type_id); if (eq < 0) -- Gitee From e8406d8a4cddafcb76761e006c33563ee71b4ea3 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 28 Feb 2019 15:31:25 -0800 Subject: [PATCH 13/23] selftests/bpf: add btf_dedup test of FWD/STRUCT resolution ANBZ: #5530 commit 7c7a4890c87dd2eb77ef144e5153a7df4c0d6f53 upstream. This patch adds a btf_dedup test exercising logic of STRUCT<->FWD resolution and validating that STRUCT is not resolved to a FWD. 
It also forces hash collisions, forcing both FWD and STRUCT to be candidates for each other. Previously this condition caused infinite loop due to FWD pointing to STRUCT and STRUCT pointing to its FWD. Reported-by: Arnaldo Carvalho de Melo Signed-off-by: Andrii Nakryiko Acked-by: Song Liu Signed-off-by: Daniel Borkmann Signed-off-by: Yuanhe Shu --- tools/testing/selftests/bpf/test_btf.c | 45 ++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) diff --git a/tools/testing/selftests/bpf/test_btf.c b/tools/testing/selftests/bpf/test_btf.c index 591c9b40dae6..f01006873b09 100644 --- a/tools/testing/selftests/bpf/test_btf.c +++ b/tools/testing/selftests/bpf/test_btf.c @@ -5731,6 +5731,51 @@ const struct btf_dedup_test dedup_tests[] = { .dont_resolve_fwds = false, }, }, +{ + .descr = "dedup: struct <-> fwd resolution w/ hash collision", + /* + * // CU 1: + * struct x; + * struct s { + * struct x *x; + * }; + * // CU 2: + * struct x {}; + * struct s { + * struct x *x; + * }; + */ + .input = { + .raw_types = { + /* CU 1 */ + BTF_FWD_ENC(NAME_TBD, 0 /* struct fwd */), /* [1] fwd x */ + BTF_PTR_ENC(1), /* [2] ptr -> [1] */ + BTF_STRUCT_ENC(NAME_TBD, 1, 8), /* [3] struct s */ + BTF_MEMBER_ENC(NAME_TBD, 2, 0), + /* CU 2 */ + BTF_STRUCT_ENC(NAME_TBD, 0, 0), /* [4] struct x */ + BTF_PTR_ENC(4), /* [5] ptr -> [4] */ + BTF_STRUCT_ENC(NAME_TBD, 1, 8), /* [6] struct s */ + BTF_MEMBER_ENC(NAME_TBD, 5, 0), + BTF_END_RAW, + }, + BTF_STR_SEC("\0x\0s\0x\0x\0s\0x\0"), + }, + .expect = { + .raw_types = { + BTF_PTR_ENC(3), /* [1] ptr -> [3] */ + BTF_STRUCT_ENC(NAME_TBD, 1, 8), /* [2] struct s */ + BTF_MEMBER_ENC(NAME_TBD, 1, 0), + BTF_STRUCT_ENC(NAME_NTH(2), 0, 0), /* [3] struct x */ + BTF_END_RAW, + }, + BTF_STR_SEC("\0s\0x"), + }, + .opts = { + .dont_resolve_fwds = false, + .dedup_table_size = 1, /* force hash collisions */ + }, +}, { .descr = "dedup: all possible kinds (no duplicates)", .input = { -- Gitee From c9daac93ed5f4390669cb4c877b2837a9d8ad0ad Mon Sep 17 00:00:00 2001 From: brakmo Date: Fri, 1 Mar 2019 12:38:46 -0800 Subject: [PATCH 14/23] bpf: add bpf helper bpf_skb_ecn_set_ce ANBZ: #5530 commit f7c917ba11a67632a8452ea99fe132f626a7a2cc upstream. This patch adds a new bpf helper BPF_FUNC_skb_ecn_set_ce "int bpf_skb_ecn_set_ce(struct sk_buff *skb)". It is added to BPF_PROG_TYPE_CGROUP_SKB typed bpf_prog which currently can be attached to the ingress and egress path. The helper is needed because his type of bpf_prog cannot modify the skb directly. This helper is used to set the ECN field of ECN capable IP packets to ce (congestion encountered) in the IPv6 or IPv4 header of the skb. It can be used by a bpf_prog to manage egress or ingress network bandwdith limit per cgroupv2 by inducing an ECN response in the TCP sender. This works best when using DCTCP. [backport note] The former patch "bpf: Add bpf_get_listener_sock(struct bpf_sock *sk) helper" introduced bpf_get_listener_sock which is introduced after bpf_skb_ecn_set_ce in upstream so code change in description and __BPF_FUNC_MAPPER in include/uapi/linux/bpf.h has already applied in the former patch. 
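For illustration only, a minimal (hypothetical) cgroup-skb egress program using the new helper, written against the selftests' bpf_helpers.h declarations; the HBM sample added later in this series is the real consumer and gates the call on its credit state:

	#include <linux/bpf.h>
	#include "bpf_helpers.h"

	SEC("cgroup_skb/egress")
	int ecn_ce_example(struct __sk_buff *skb)
	{
		/* Unconditionally CE-mark ECN-capable IPv4/IPv6 packets;
		 * the helper is a no-op for everything else.
		 */
		bpf_skb_ecn_set_ce(skb);
		return 1;	/* 1 == allow packet for cgroup skb programs */
	}

	char _license[] SEC("license") = "GPL";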
Signed-off-by: Lawrence Brakmo Signed-off-by: Martin KaFai Lau Acked-by: Song Liu Signed-off-by: Alexei Starovoitov Signed-off-by: Yuanhe Shu --- net/core/filter.c | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/net/core/filter.c b/net/core/filter.c index da08f2b51743..f250d398015f 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -5380,6 +5380,32 @@ static const struct bpf_func_proto bpf_get_listener_sock_proto = { .arg1_type = ARG_PTR_TO_SOCK_COMMON, }; +BPF_CALL_1(bpf_skb_ecn_set_ce, struct sk_buff *, skb) +{ + unsigned int iphdr_len; + + if (skb->protocol == cpu_to_be16(ETH_P_IP)) + iphdr_len = sizeof(struct iphdr); + else if (skb->protocol == cpu_to_be16(ETH_P_IPV6)) + iphdr_len = sizeof(struct ipv6hdr); + else + return 0; + + if (skb_headlen(skb) < iphdr_len) + return 0; + + if (skb_cloned(skb) && !skb_clone_writable(skb, iphdr_len)) + return 0; + + return INET_ECN_set_ce(skb); +} + +static const struct bpf_func_proto bpf_skb_ecn_set_ce_proto = { + .func = bpf_skb_ecn_set_ce, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, +}; #endif /* CONFIG_INET */ bool bpf_helper_changes_pkt_data(void *func) @@ -5538,6 +5564,8 @@ cg_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) #ifdef CONFIG_INET case BPF_FUNC_tcp_sock: return &bpf_tcp_sock_proto; + case BPF_FUNC_skb_ecn_set_ce: + return &bpf_skb_ecn_set_ce_proto; case BPF_FUNC_get_listener_sock: return &bpf_get_listener_sock_proto; #endif -- Gitee From ef0efa031717f75ad74ba0814d465b2ba982efe3 Mon Sep 17 00:00:00 2001 From: brakmo Date: Fri, 1 Mar 2019 12:38:47 -0800 Subject: [PATCH 15/23] bpf: sync bpf.h to tools and update bpf_helpers.h ANBZ: #5530 commit 5cce85c640ccc9d9aab8b05c77d7d076a44d4db2 upstream. This patch syncs the uapi bpf.h to tools/ and also updates bpf_herlpers.h in tools/ Signed-off-by: Lawrence Brakmo Acked-by: Song Liu Signed-off-by: Alexei Starovoitov Signed-off-by: Yuanhe Shu --- tools/testing/selftests/bpf/bpf_helpers.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tools/testing/selftests/bpf/bpf_helpers.h b/tools/testing/selftests/bpf/bpf_helpers.h index a908dd501bdd..eaf911037a11 100644 --- a/tools/testing/selftests/bpf/bpf_helpers.h +++ b/tools/testing/selftests/bpf/bpf_helpers.h @@ -176,6 +176,8 @@ static struct bpf_sock *(*bpf_sk_fullsock)(struct bpf_sock *sk) = (void *) BPF_FUNC_sk_fullsock; static struct bpf_tcp_sock *(*bpf_tcp_sock)(struct bpf_sock *sk) = (void *) BPF_FUNC_tcp_sock; +static int (*bpf_skb_ecn_set_ce)(void *ctx) = + (void *) BPF_FUNC_skb_ecn_set_ce; static struct bpf_sock *(*bpf_get_listener_sock)(struct bpf_sock *sk) = (void *) BPF_FUNC_get_listener_sock; -- Gitee From 8960db216a46fba3fda838b085bd9c02e38aa784 Mon Sep 17 00:00:00 2001 From: brakmo Date: Fri, 1 Mar 2019 12:38:48 -0800 Subject: [PATCH 16/23] bpf: Sample HBM BPF program to limit egress bw ANBZ: #5530 commit 187d0738ff351f725a58be3d606d3a7fc8db8aed upstream. A cgroup skb BPF program to limit cgroup output bandwidth. It uses a modified virtual token bucket queue to limit average egress bandwidth. The implementation uses credits instead of tokens. Negative credits imply that queueing would have happened (this is a virtual queue, so no queueing is done by it. However, queueing may occur at the actual qdisc (which is not used for rate limiting). 
This implementation uses 3 thresholds, one to start marking packets and the other two to drop packets: CREDIT - <--------------------------|------------------------> + | | | 0 | Large pkt | | drop thresh | Small pkt drop Mark threshold thresh The effect of marking depends on the type of packet: a) If the packet is ECN enabled, then the packet is ECN ce marked. The current mark threshold is tuned for DCTCP. c) Else, it is dropped if it is a large packet. If the credit is below the drop threshold, the packet is dropped. Note that dropping a packet through the BPF program does not trigger CWR (Congestion Window Reduction) in TCP packets. A future patch will add support for triggering CWR. This BPF program actually uses 2 drop thresholds, one threshold for larger packets (>= 120 bytes) and another for smaller packets. This protects smaller packets such as SYNs, ACKs, etc. The default bandwidth limit is set at 1Gbps but this can be changed by a user program through a shared BPF map. In addition, by default this BPF program does not limit connections using loopback. This behavior can be overwritten by the user program. There is also an option to calculate some statistics, such as percent of packets marked or dropped, which the user program can access. A latter patch provides such a program (hbm.c) Signed-off-by: Lawrence Brakmo Signed-off-by: Alexei Starovoitov Signed-off-by: Yuanhe Shu --- samples/bpf/Makefile | 2 + samples/bpf/hbm.h | 31 ++++++++ samples/bpf/hbm_kern.h | 137 ++++++++++++++++++++++++++++++++ samples/bpf/hbm_out_kern.c | 157 +++++++++++++++++++++++++++++++++++++ 4 files changed, 327 insertions(+) create mode 100644 samples/bpf/hbm.h create mode 100644 samples/bpf/hbm_kern.h create mode 100644 samples/bpf/hbm_out_kern.c diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile index d6b9da7b5bcf..ef32859701fe 100644 --- a/samples/bpf/Makefile +++ b/samples/bpf/Makefile @@ -166,6 +166,7 @@ always += xdpsock_kern.o always += xdp_fwd_kern.o always += task_fd_query_kern.o always += xdp_sample_pkts_kern.o +always += hbm_out_kern.o KBUILD_HOSTCFLAGS += -I$(objtree)/usr/include KBUILD_HOSTCFLAGS += -I$(srctree)/tools/lib/ @@ -267,6 +268,7 @@ $(BPF_SAMPLES_PATH)/*.c: verify_target_bpf $(LIBBPF) $(src)/*.c: verify_target_bpf $(LIBBPF) $(obj)/tracex5_kern.o: $(obj)/syscall_nrs.h +$(obj)/hbm_out_kern.o: $(src)/hbm.h $(src)/hbm_kern.h # asm/sysreg.h - inline assembly used by it is incompatible with llvm. # But, there is no easy way to fix it, so just exclude it since it is diff --git a/samples/bpf/hbm.h b/samples/bpf/hbm.h new file mode 100644 index 000000000000..518e8147d084 --- /dev/null +++ b/samples/bpf/hbm.h @@ -0,0 +1,31 @@ +/* SPDX-License-Identifier: GPL-2.0 + * + * Copyright (c) 2019 Facebook + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Include file for Host Bandwidth Management (HBM) programs + */ +struct hbm_vqueue { + struct bpf_spin_lock lock; + /* 4 byte hole */ + unsigned long long lasttime; /* In ns */ + int credit; /* In bytes */ + unsigned int rate; /* In bytes per NS << 20 */ +}; + +struct hbm_queue_stats { + unsigned long rate; /* in Mbps*/ + unsigned long stats:1, /* get HBM stats (marked, dropped,..) 
*/ + loopback:1; /* also limit flows using loopback */ + unsigned long long pkts_marked; + unsigned long long bytes_marked; + unsigned long long pkts_dropped; + unsigned long long bytes_dropped; + unsigned long long pkts_total; + unsigned long long bytes_total; + unsigned long long firstPacketTime; + unsigned long long lastPacketTime; +}; diff --git a/samples/bpf/hbm_kern.h b/samples/bpf/hbm_kern.h new file mode 100644 index 000000000000..c5635d924193 --- /dev/null +++ b/samples/bpf/hbm_kern.h @@ -0,0 +1,137 @@ +/* SPDX-License-Identifier: GPL-2.0 + * + * Copyright (c) 2019 Facebook + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Include file for sample Host Bandwidth Manager (HBM) BPF programs + */ +#define KBUILD_MODNAME "foo" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "bpf_endian.h" +#include "bpf_helpers.h" +#include "hbm.h" + +#define DROP_PKT 0 +#define ALLOW_PKT 1 +#define TCP_ECN_OK 1 + +#define HBM_DEBUG 0 // Set to 1 to enable debugging +#if HBM_DEBUG +#define bpf_printk(fmt, ...) \ +({ \ + char ____fmt[] = fmt; \ + bpf_trace_printk(____fmt, sizeof(____fmt), \ + ##__VA_ARGS__); \ +}) +#else +#define bpf_printk(fmt, ...) +#endif + +#define INITIAL_CREDIT_PACKETS 100 +#define MAX_BYTES_PER_PACKET 1500 +#define MARK_THRESH (40 * MAX_BYTES_PER_PACKET) +#define DROP_THRESH (80 * 5 * MAX_BYTES_PER_PACKET) +#define LARGE_PKT_DROP_THRESH (DROP_THRESH - (15 * MAX_BYTES_PER_PACKET)) +#define MARK_REGION_SIZE (LARGE_PKT_DROP_THRESH - MARK_THRESH) +#define LARGE_PKT_THRESH 120 +#define MAX_CREDIT (100 * MAX_BYTES_PER_PACKET) +#define INIT_CREDIT (INITIAL_CREDIT_PACKETS * MAX_BYTES_PER_PACKET) + +// rate in bytes per ns << 20 +#define CREDIT_PER_NS(delta, rate) ((((u64)(delta)) * (rate)) >> 20) + +struct bpf_map_def SEC("maps") queue_state = { + .type = BPF_MAP_TYPE_CGROUP_STORAGE, + .key_size = sizeof(struct bpf_cgroup_storage_key), + .value_size = sizeof(struct hbm_vqueue), +}; +BPF_ANNOTATE_KV_PAIR(queue_state, struct bpf_cgroup_storage_key, + struct hbm_vqueue); + +struct bpf_map_def SEC("maps") queue_stats = { + .type = BPF_MAP_TYPE_ARRAY, + .key_size = sizeof(u32), + .value_size = sizeof(struct hbm_queue_stats), + .max_entries = 1, +}; +BPF_ANNOTATE_KV_PAIR(queue_stats, int, struct hbm_queue_stats); + +struct hbm_pkt_info { + bool is_ip; + bool is_tcp; + short ecn; +}; + +static __always_inline void hbm_get_pkt_info(struct __sk_buff *skb, + struct hbm_pkt_info *pkti) +{ + struct iphdr iph; + struct ipv6hdr *ip6h; + + bpf_skb_load_bytes(skb, 0, &iph, 12); + if (iph.version == 6) { + ip6h = (struct ipv6hdr *)&iph; + pkti->is_ip = true; + pkti->is_tcp = (ip6h->nexthdr == 6); + pkti->ecn = (ip6h->flow_lbl[0] >> 4) & INET_ECN_MASK; + } else if (iph.version == 4) { + pkti->is_ip = true; + pkti->is_tcp = (iph.protocol == 6); + pkti->ecn = iph.tos & INET_ECN_MASK; + } else { + pkti->is_ip = false; + pkti->is_tcp = false; + pkti->ecn = 0; + } +} + +static __always_inline void hbm_init_vqueue(struct hbm_vqueue *qdp, int rate) +{ + bpf_printk("Initializing queue_state, rate:%d\n", rate * 128); + qdp->lasttime = bpf_ktime_get_ns(); + qdp->credit = INIT_CREDIT; + qdp->rate = rate * 128; +} + +static __always_inline void hbm_update_stats(struct hbm_queue_stats *qsp, + int len, + unsigned long long curtime, + bool congestion_flag, + bool drop_flag) +{ 
+ if (qsp != NULL) { + // Following is needed for work conserving + __sync_add_and_fetch(&(qsp->bytes_total), len); + if (qsp->stats) { + // Optionally update statistics + if (qsp->firstPacketTime == 0) + qsp->firstPacketTime = curtime; + qsp->lastPacketTime = curtime; + __sync_add_and_fetch(&(qsp->pkts_total), 1); + if (congestion_flag || drop_flag) { + __sync_add_and_fetch(&(qsp->pkts_marked), 1); + __sync_add_and_fetch(&(qsp->bytes_marked), len); + } + if (drop_flag) { + __sync_add_and_fetch(&(qsp->pkts_dropped), 1); + __sync_add_and_fetch(&(qsp->bytes_dropped), + len); + } + } + } +} diff --git a/samples/bpf/hbm_out_kern.c b/samples/bpf/hbm_out_kern.c new file mode 100644 index 000000000000..f806863d0b79 --- /dev/null +++ b/samples/bpf/hbm_out_kern.c @@ -0,0 +1,157 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2019 Facebook + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Sample Host Bandwidth Manager (HBM) BPF program. + * + * A cgroup skb BPF egress program to limit cgroup output bandwidth. + * It uses a modified virtual token bucket queue to limit average + * egress bandwidth. The implementation uses credits instead of tokens. + * Negative credits imply that queueing would have happened (this is + * a virtual queue, so no queueing is done by it. However, queueing may + * occur at the actual qdisc (which is not used for rate limiting). + * + * This implementation uses 3 thresholds, one to start marking packets and + * the other two to drop packets: + * CREDIT + * - <--------------------------|------------------------> + + * | | | 0 + * | Large pkt | + * | drop thresh | + * Small pkt drop Mark threshold + * thresh + * + * The effect of marking depends on the type of packet: + * a) If the packet is ECN enabled and it is a TCP packet, then the packet + * is ECN marked. + * b) If the packet is a TCP packet, then we probabilistically call tcp_cwr + * to reduce the congestion window. The current implementation uses a linear + * distribution (0% probability at marking threshold, 100% probability + * at drop threshold). + * c) If the packet is not a TCP packet, then it is dropped. + * + * If the credit is below the drop threshold, the packet is dropped. If it + * is a TCP packet, then it also calls tcp_cwr since packets dropped by + * by a cgroup skb BPF program do not automatically trigger a call to + * tcp_cwr in the current kernel code. + * + * This BPF program actually uses 2 drop thresholds, one threshold + * for larger packets (>= 120 bytes) and another for smaller packets. This + * protects smaller packets such as SYNs, ACKs, etc. + * + * The default bandwidth limit is set at 1Gbps but this can be changed by + * a user program through a shared BPF map. In addition, by default this BPF + * program does not limit connections using loopback. This behavior can be + * overwritten by the user program. There is also an option to calculate + * some statistics, such as percent of packets marked or dropped, which + * the user program can access. 
+ * + * A latter patch provides such a program (hbm.c) + */ + +#include "hbm_kern.h" + +SEC("cgroup_skb/egress") +int _hbm_out_cg(struct __sk_buff *skb) +{ + struct hbm_pkt_info pkti; + int len = skb->len; + unsigned int queue_index = 0; + unsigned long long curtime; + int credit; + signed long long delta = 0, zero = 0; + int max_credit = MAX_CREDIT; + bool congestion_flag = false; + bool drop_flag = false; + bool cwr_flag = false; + struct hbm_vqueue *qdp; + struct hbm_queue_stats *qsp = NULL; + int rv = ALLOW_PKT; + + qsp = bpf_map_lookup_elem(&queue_stats, &queue_index); + if (qsp != NULL && !qsp->loopback && (skb->ifindex == 1)) + return ALLOW_PKT; + + hbm_get_pkt_info(skb, &pkti); + + // We may want to account for the length of headers in len + // calculation, like ETH header + overhead, specially if it + // is a gso packet. But I am not doing it right now. + + qdp = bpf_get_local_storage(&queue_state, 0); + if (!qdp) + return ALLOW_PKT; + else if (qdp->lasttime == 0) + hbm_init_vqueue(qdp, 1024); + + curtime = bpf_ktime_get_ns(); + + // Begin critical section + bpf_spin_lock(&qdp->lock); + credit = qdp->credit; + delta = curtime - qdp->lasttime; + /* delta < 0 implies that another process with a curtime greater + * than ours beat us to the critical section and already added + * the new credit, so we should not add it ourselves + */ + if (delta > 0) { + qdp->lasttime = curtime; + credit += CREDIT_PER_NS(delta, qdp->rate); + if (credit > MAX_CREDIT) + credit = MAX_CREDIT; + } + credit -= len; + qdp->credit = credit; + bpf_spin_unlock(&qdp->lock); + // End critical section + + // Check if we should update rate + if (qsp != NULL && (qsp->rate * 128) != qdp->rate) { + qdp->rate = qsp->rate * 128; + bpf_printk("Updating rate: %d (1sec:%llu bits)\n", + (int)qdp->rate, + CREDIT_PER_NS(1000000000, qdp->rate) * 8); + } + + // Set flags (drop, congestion, cwr) + // Dropping => we are congested, so ignore congestion flag + if (credit < -DROP_THRESH || + (len > LARGE_PKT_THRESH && + credit < -LARGE_PKT_DROP_THRESH)) { + // Very congested, set drop flag + drop_flag = true; + } else if (credit < 0) { + // Congested, set congestion flag + if (pkti.ecn) { + if (credit < -MARK_THRESH) + congestion_flag = true; + else + congestion_flag = false; + } else { + congestion_flag = true; + } + } + + if (congestion_flag) { + if (!bpf_skb_ecn_set_ce(skb)) { + if (len > LARGE_PKT_THRESH) { + // Problem if too many small packets? + drop_flag = true; + } + } + } + + if (drop_flag) + rv = DROP_PKT; + + hbm_update_stats(qsp, len, curtime, congestion_flag, drop_flag); + + if (rv == DROP_PKT) + __sync_add_and_fetch(&(qdp->credit), len); + + return rv; +} +char _license[] SEC("license") = "GPL"; -- Gitee From 5549cbc0580ab41a9a2f7c43ce19f6bdf749fc18 Mon Sep 17 00:00:00 2001 From: brakmo Date: Fri, 1 Mar 2019 12:38:49 -0800 Subject: [PATCH 17/23] bpf: User program for testing HBM ANBZ: #5530 commit a1270fe95b74eb3195b107c494ed1f11b932a278 upstream. The program nrm creates a cgroup and attaches a BPF program to the cgroup for testing HBM (Host Bandwidth Manager) for egress traffic. One still needs to create network traffic. This can be done through netesto, netperf or iperf3. A follow-up patch contains a script to create traffic. USAGE: hbm [-d] [-l] [-n ] [-r ] [-s] [-t ] [-w] [-h] [prog] Where: -d Print BPF trace debug buffer -l Also limit flows doing loopback -n <#> To create cgroup "/hbm#" and attach prog. 
              Default is /nrm1
              This is convenient when testing HBM in more than 1 cgroup
    -r        Rate limit in Mbps
    -s        Get HBM stats (marked, dropped, etc.)
    -t