From 27ed06aca4f9431e7f5b5aeeb28c6814f8cf81be Mon Sep 17 00:00:00 2001
From: Paul Chaignon
Date: Wed, 24 Apr 2019 21:50:42 +0200
Subject: [PATCH 01/26] bpf: mark registers in all frames after pkt/null checks

ANBZ: #5530

commit c6a9efa1d8353d8960d152e7d469d952b01495c0 upstream.

In case of a null check on a pointer inside a subprog, we should mark all
registers with this pointer as either safe or unknown, in both the
current and previous frames. Currently, only spilled registers and
registers in the current frame are marked. Packet bounds checks in
subprogs have the same issue. This patch fixes it to mark registers in
previous frames as well.

A good reproducer for null checks looks as follows:

1: ptr = bpf_map_lookup_elem(map, &key);
2: ret = subprog(ptr) {
3:     return ptr != NULL;
4: }
5: if (ret)
6:     value = *ptr;

With the above, the verifier will complain on line 6 because it sees ptr
as map_value_or_null despite the null check in subprog 1.

Note that this patch fixes another resulting bug when using
bpf_sk_release():

1: sk = bpf_sk_lookup_tcp(...);
2: subprog(sk) {
3:     if (sk)
4:         bpf_sk_release(sk);
5: }
6: if (!sk)
7:     return 0;
8: return 1;

In the above, mark_ptr_or_null_regs will warn on line 6 because it will
try to free the reference state, even though it was already freed on
line 3.

Fixes: f4d7e40a5b71 ("bpf: introduce function calls (verification)")
Signed-off-by: Paul Chaignon
Signed-off-by: Alexei Starovoitov
Signed-off-by: Yuanhe Shu
---
 kernel/bpf/verifier.c | 76 ++++++++++++++++++++++++++----------------
 1 file changed, 46 insertions(+), 30 deletions(-)

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 3efc91b1376a..376fb6f8dd00 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -4465,15 +4465,35 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
 	return 0;
 }
 
+static void __find_good_pkt_pointers(struct bpf_func_state *state,
+				     struct bpf_reg_state *dst_reg,
+				     enum bpf_reg_type type, u16 new_range)
+{
+	struct bpf_reg_state *reg;
+	int i;
+
+	for (i = 0; i < MAX_BPF_REG; i++) {
+		reg = &state->regs[i];
+		if (reg->type == type && reg->id == dst_reg->id)
+			/* keep the maximum range already checked */
+			reg->range = max(reg->range, new_range);
+	}
+
+	bpf_for_each_spilled_reg(i, state, reg) {
+		if (!reg)
+			continue;
+		if (reg->type == type && reg->id == dst_reg->id)
+			reg->range = max(reg->range, new_range);
+	}
+}
+
 static void find_good_pkt_pointers(struct bpf_verifier_state *vstate,
 				   struct bpf_reg_state *dst_reg,
 				   enum bpf_reg_type type,
 				   bool range_right_open)
 {
-	struct bpf_func_state *state = vstate->frame[vstate->curframe];
-	struct bpf_reg_state *regs = state->regs, *reg;
 	u16 new_range;
-	int i, j;
+	int i;
 
 	if (dst_reg->off < 0 ||
 	    (dst_reg->off == 0 && range_right_open))
@@ -4538,20 +4558,9 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *vstate,
 	 * the range won't allow anything.
	 * dst_reg->off is known < MAX_PACKET_OFF, therefore it fits in a u16.
*/ - for (i = 0; i < MAX_BPF_REG; i++) - if (regs[i].type == type && regs[i].id == dst_reg->id) - /* keep the maximum range already checked */ - regs[i].range = max(regs[i].range, new_range); - - for (j = 0; j <= vstate->curframe; j++) { - state = vstate->frame[j]; - bpf_for_each_spilled_reg(i, state, reg) { - if (!reg) - continue; - if (reg->type == type && reg->id == dst_reg->id) - reg->range = max(reg->range, new_range); - } - } + for (i = 0; i <= vstate->curframe; i++) + __find_good_pkt_pointers(vstate->frame[i], dst_reg, type, + new_range); } /* compute branch direction of the expression "if (reg opcode val) goto target;" @@ -5025,6 +5034,22 @@ static void mark_ptr_or_null_reg(struct bpf_func_state *state, } } +static void __mark_ptr_or_null_regs(struct bpf_func_state *state, u32 id, + bool is_null) +{ + struct bpf_reg_state *reg; + int i; + + for (i = 0; i < MAX_BPF_REG; i++) + mark_ptr_or_null_reg(state, &state->regs[i], id, is_null); + + bpf_for_each_spilled_reg(i, state, reg) { + if (!reg) + continue; + mark_ptr_or_null_reg(state, reg, id, is_null); + } +} + /* The logic is similar to find_good_pkt_pointers(), both could eventually * be folded together at some point. */ @@ -5032,10 +5057,10 @@ static void mark_ptr_or_null_regs(struct bpf_verifier_state *vstate, u32 regno, bool is_null) { struct bpf_func_state *state = vstate->frame[vstate->curframe]; - struct bpf_reg_state *reg, *regs = state->regs; + struct bpf_reg_state *regs = state->regs; u32 ref_obj_id = regs[regno].ref_obj_id; u32 id = regs[regno].id; - int i, j; + int i; if (ref_obj_id && ref_obj_id == id && is_null) /* regs[regno] is in the " == NULL" branch. @@ -5044,17 +5069,8 @@ static void mark_ptr_or_null_regs(struct bpf_verifier_state *vstate, u32 regno, */ WARN_ON_ONCE(release_reference_state(state, id)); - for (i = 0; i < MAX_BPF_REG; i++) - mark_ptr_or_null_reg(state, ®s[i], id, is_null); - - for (j = 0; j <= vstate->curframe; j++) { - state = vstate->frame[j]; - bpf_for_each_spilled_reg(i, state, reg) { - if (!reg) - continue; - mark_ptr_or_null_reg(state, reg, id, is_null); - } - } + for (i = 0; i <= vstate->curframe; i++) + __mark_ptr_or_null_regs(vstate->frame[i], id, is_null); } static bool try_match_pkt_pointers(const struct bpf_insn *insn, -- Gitee From 6220b943cbe6fcfc2e98397824d691f3aab06507 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Thu, 25 Apr 2019 14:37:23 -0700 Subject: [PATCH 02/26] bpf: support BPF_PROG_QUERY for BPF_FLOW_DISSECTOR attach_type ANBZ: #5530 commit 118c8e9ae629d35fa9b3d27a7b9d59298b1b4183 upstream. target_fd is target namespace. If there is a flow dissector BPF program attached to that namespace, its (single) id is returned. 
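For illustration only, a minimal user-space sketch of the new query. It
uses libbpf's bpf_prog_query(), the same call the bpftool patch later in
this series relies on; the header paths assume an installed libbpf, and
error handling is trimmed:

	#include <fcntl.h>
	#include <stdio.h>
	#include <unistd.h>
	#include <bpf/bpf.h>

	int main(void)
	{
		__u32 attach_flags = 0, prog_ids[1], prog_cnt = 1;
		/* target_fd is a network namespace fd, here our own netns */
		int fd = open("/proc/self/ns/net", O_RDONLY);

		if (fd < 0)
			return 1;
		if (!bpf_prog_query(fd, BPF_FLOW_DISSECTOR, 0, &attach_flags,
				    prog_ids, &prog_cnt) && prog_cnt == 1)
			printf("attached flow dissector prog id: %u\n",
			       prog_ids[0]);
		close(fd);
		return 0;
	}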
v5: * drop net ref right after rcu unlock (Daniel Borkmann) v4: * add missing put_net (Jann Horn) v3: * add missing inline to skb_flow_dissector_prog_query static def (kbuild test robot ) v2: * don't sleep in rcu critical section (Jakub Kicinski) * check input prog_cnt (exit early) Cc: Jann Horn Signed-off-by: Stanislav Fomichev Signed-off-by: Daniel Borkmann Signed-off-by: Yuanhe Shu --- include/linux/skbuff.h | 8 ++++++++ kernel/bpf/syscall.c | 2 ++ net/core/flow_dissector.c | 39 +++++++++++++++++++++++++++++++++++++++ 3 files changed, 49 insertions(+) diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 16523a7c4f18..b32df2de5993 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -1216,11 +1216,19 @@ void skb_flow_dissector_init(struct flow_dissector *flow_dissector, unsigned int key_count); #ifdef CONFIG_NET +int skb_flow_dissector_prog_query(const union bpf_attr *attr, + union bpf_attr __user *uattr); int skb_flow_dissector_bpf_prog_attach(const union bpf_attr *attr, struct bpf_prog *prog); int skb_flow_dissector_bpf_prog_detach(const union bpf_attr *attr); #else +static inline int skb_flow_dissector_prog_query(const union bpf_attr *attr, + union bpf_attr __user *uattr) +{ + return -EOPNOTSUPP; +} + static inline int skb_flow_dissector_bpf_prog_attach(const union bpf_attr *attr, struct bpf_prog *prog) { diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index beeddb1abf55..565d2ac21478 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -2027,6 +2027,8 @@ static int bpf_prog_query(const union bpf_attr *attr, break; case BPF_LIRC_MODE2: return lirc_prog_query(attr, uattr); + case BPF_FLOW_DISSECTOR: + return skb_flow_dissector_prog_query(attr, uattr); default: return -EINVAL; } diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 4725fcf9e8ea..09db923f398f 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -65,6 +65,45 @@ void skb_flow_dissector_init(struct flow_dissector *flow_dissector, } EXPORT_SYMBOL(skb_flow_dissector_init); +int skb_flow_dissector_prog_query(const union bpf_attr *attr, + union bpf_attr __user *uattr) +{ + __u32 __user *prog_ids = u64_to_user_ptr(attr->query.prog_ids); + u32 prog_id, prog_cnt = 0, flags = 0; + struct bpf_prog *attached; + struct net *net; + + if (attr->query.query_flags) + return -EINVAL; + + net = get_net_ns_by_fd(attr->query.target_fd); + if (IS_ERR(net)) + return PTR_ERR(net); + + rcu_read_lock(); + attached = rcu_dereference(net->flow_dissector_prog); + if (attached) { + prog_cnt = 1; + prog_id = attached->aux->id; + } + rcu_read_unlock(); + + put_net(net); + + if (copy_to_user(&uattr->query.attach_flags, &flags, sizeof(flags))) + return -EFAULT; + if (copy_to_user(&uattr->query.prog_cnt, &prog_cnt, sizeof(prog_cnt))) + return -EFAULT; + + if (!attr->query.prog_cnt || !prog_ids || !prog_cnt) + return 0; + + if (copy_to_user(prog_ids, &prog_id, sizeof(u32))) + return -EFAULT; + + return 0; +} + int skb_flow_dissector_bpf_prog_attach(const union bpf_attr *attr, struct bpf_prog *prog) { -- Gitee From 30ffefea8a3cad44117853ecddf82fe9d283d13e Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Thu, 25 Apr 2019 14:37:24 -0700 Subject: [PATCH 03/26] bpftool: show flow_dissector attachment status ANBZ: #5530 commit 7f0c57fec80f198ae9fcd06e5bbca13196815a4b upstream. Right now there is no way to query whether BPF flow_dissector program is attached to a network namespace or not. 
In previous commit, I added support for querying that info, show it when doing `bpftool net`: $ bpftool prog loadall ./bpf_flow.o \ /sys/fs/bpf/flow type flow_dissector \ pinmaps /sys/fs/bpf/flow $ bpftool prog 3: flow_dissector name _dissect tag 8c9e917b513dd5cc gpl loaded_at 2019-04-23T16:14:48-0700 uid 0 xlated 656B jited 461B memlock 4096B map_ids 1,2 btf_id 1 ... $ bpftool net -j [{"xdp":[],"tc":[],"flow_dissector":[]}] $ bpftool prog attach pinned \ /sys/fs/bpf/flow/flow_dissector flow_dissector $ bpftool net -j [{"xdp":[],"tc":[],"flow_dissector":["id":3]}] Doesn't show up in a different net namespace: $ ip netns add test $ ip netns exec test bpftool net -j [{"xdp":[],"tc":[],"flow_dissector":[]}] Non-json output: $ bpftool net xdp: tc: flow_dissector: id 3 v2: * initialization order (Jakub Kicinski) * clear errno for batch mode (Quentin Monnet) Signed-off-by: Stanislav Fomichev Reviewed-by: Quentin Monnet Reviewed-by: Jakub Kicinski Signed-off-by: Daniel Borkmann Signed-off-by: Yuanhe Shu --- tools/bpf/bpftool/net.c | 54 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 54 insertions(+) diff --git a/tools/bpf/bpftool/net.c b/tools/bpf/bpftool/net.c index 10f9693c0213..d4432af7d0cf 100644 --- a/tools/bpf/bpftool/net.c +++ b/tools/bpf/bpftool/net.c @@ -3,6 +3,7 @@ #define _GNU_SOURCE #include +#include #include #include #include @@ -11,6 +12,8 @@ #include #include #include +#include +#include #include #include @@ -47,6 +50,10 @@ struct bpf_filter_t { int ifindex; }; +struct bpf_attach_info { + __u32 flow_dissector_id; +}; + static int dump_link_nlmsg(void *cookie, void *msg, struct nlattr **tb) { struct bpf_netdev_t *netinfo = cookie; @@ -179,8 +186,45 @@ static int show_dev_tc_bpf(int sock, unsigned int nl_pid, return 0; } +static int query_flow_dissector(struct bpf_attach_info *attach_info) +{ + __u32 attach_flags; + __u32 prog_ids[1]; + __u32 prog_cnt; + int err; + int fd; + + fd = open("/proc/self/ns/net", O_RDONLY); + if (fd < 0) { + p_err("can't open /proc/self/ns/net: %d", + strerror(errno)); + return -1; + } + prog_cnt = ARRAY_SIZE(prog_ids); + err = bpf_prog_query(fd, BPF_FLOW_DISSECTOR, 0, + &attach_flags, prog_ids, &prog_cnt); + close(fd); + if (err) { + if (errno == EINVAL) { + /* Older kernel's don't support querying + * flow dissector programs. + */ + errno = 0; + return 0; + } + p_err("can't query prog: %s", strerror(errno)); + return -1; + } + + if (prog_cnt == 1) + attach_info->flow_dissector_id = prog_ids[0]; + + return 0; +} + static int do_show(int argc, char **argv) { + struct bpf_attach_info attach_info = {}; int i, sock, ret, filter_idx = -1; struct bpf_netdev_t dev_array; unsigned int nl_pid; @@ -198,6 +242,10 @@ static int do_show(int argc, char **argv) usage(); } + ret = query_flow_dissector(&attach_info); + if (ret) + return -1; + sock = libbpf_netlink_open(&nl_pid); if (sock < 0) { fprintf(stderr, "failed to open netlink sock\n"); @@ -226,6 +274,12 @@ static int do_show(int argc, char **argv) } NET_END_ARRAY("\n"); } + + NET_START_ARRAY("flow_dissector", "%s:\n"); + if (attach_info.flow_dissector_id > 0) + NET_DUMP_UINT("id", "id %u", attach_info.flow_dissector_id); + NET_END_ARRAY("\n"); + NET_END_OBJECT; if (json_output) jsonw_end_array(json_wtr); -- Gitee From 0dce03fe702d0966654d997c59f21591cbf7891a Mon Sep 17 00:00:00 2001 From: Quentin Monnet Date: Thu, 15 Aug 2019 15:32:18 +0100 Subject: [PATCH 04/26] tools: bpftool: fix format string for p_err() in query_flow_dissector() ANBZ: #5530 commit 8a15d5ced8c626c0331974c7281c1d651f7b0d83 upstream. 
The format string passed to one call to the p_err() function in
query_flow_dissector() does not match the value that should be printed,
resulting in some garbage integer being printed instead of
strerror(errno) if /proc/self/ns/net cannot be opened. Let's fix the
format string.

Fixes: 7f0c57fec80f ("bpftool: show flow_dissector attachment status")
Signed-off-by: Quentin Monnet
Reviewed-by: Jakub Kicinski
Signed-off-by: Alexei Starovoitov
Signed-off-by: Yuanhe Shu
---
 tools/bpf/bpftool/net.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/bpf/bpftool/net.c b/tools/bpf/bpftool/net.c
index d4432af7d0cf..b47cc2fff11e 100644
--- a/tools/bpf/bpftool/net.c
+++ b/tools/bpf/bpftool/net.c
@@ -196,7 +196,7 @@ static int query_flow_dissector(struct bpf_attach_info *attach_info)
 
 	fd = open("/proc/self/ns/net", O_RDONLY);
 	if (fd < 0) {
-		p_err("can't open /proc/self/ns/net: %d",
+		p_err("can't open /proc/self/ns/net: %s",
 		      strerror(errno));
 		return -1;
 	}
--
Gitee

From 6d0cb8e7b0b04c24a56bd4e74c590dd7cae45142 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko
Date: Thu, 25 Apr 2019 15:30:08 -0700
Subject: [PATCH 05/26] bpftool: add ability to dump BTF types

ANBZ: #5530

commit c93cc69004df340d71a9ab3433b8e5c9fd1fca7a upstream.

Add a new `btf dump` sub-command to bpftool. It allows dumping a
human-readable, low-level representation of BTF types. BTF can be
retrieved from a few different sources:
- from a BTF object by ID;
- from a PROG, if it has associated BTF;
- from a MAP, if it has associated BTF data; it's possible to narrow
  down types to either the key type, the value type, both, or all BTF
  types;
- from an ELF file (.BTF section).

Output format mostly follows the BPF verifier log format, with a few
notable exceptions:
- all the type/field/param/etc names are enclosed in single quotes to
  allow easier grepping and to stand out a little bit more;
- FUNC_PROTO output follows the STRUCT/UNION/ENUM format of having one
  line per argument; this is more uniform and allows easy grepping, as
  opposed to the succinct but inconvenient format that the BPF verifier
  log uses.
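For example, each of the supported source specifiers can be exercised as
follows (the IDs and the object file name here are purely illustrative):

	# bpftool btf dump id 1226
	# bpftool btf dump prog id 3
	# bpftool btf dump map id 123 value
	# bpftool btf dump file ./bpf_prog.o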
Cc: Daniel Borkmann Cc: Alexei Starovoitov Cc: Yonghong Song Cc: Martin KaFai Lau Cc: Song Liu Cc: Arnaldo Carvalho de Melo Acked-by: Yonghong Song Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Signed-off-by: Yuanhe Shu --- tools/bpf/bpftool/btf.c | 586 +++++++++++++++++++++++++++++++++++++++ tools/bpf/bpftool/main.c | 3 +- tools/bpf/bpftool/main.h | 1 + 3 files changed, 589 insertions(+), 1 deletion(-) create mode 100644 tools/bpf/bpftool/btf.c diff --git a/tools/bpf/bpftool/btf.c b/tools/bpf/bpftool/btf.c new file mode 100644 index 000000000000..58a2cd002a4b --- /dev/null +++ b/tools/bpf/bpftool/btf.c @@ -0,0 +1,586 @@ +// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +/* Copyright (C) 2019 Facebook */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "btf.h" +#include "json_writer.h" +#include "main.h" + +static const char * const btf_kind_str[NR_BTF_KINDS] = { + [BTF_KIND_UNKN] = "UNKNOWN", + [BTF_KIND_INT] = "INT", + [BTF_KIND_PTR] = "PTR", + [BTF_KIND_ARRAY] = "ARRAY", + [BTF_KIND_STRUCT] = "STRUCT", + [BTF_KIND_UNION] = "UNION", + [BTF_KIND_ENUM] = "ENUM", + [BTF_KIND_FWD] = "FWD", + [BTF_KIND_TYPEDEF] = "TYPEDEF", + [BTF_KIND_VOLATILE] = "VOLATILE", + [BTF_KIND_CONST] = "CONST", + [BTF_KIND_RESTRICT] = "RESTRICT", + [BTF_KIND_FUNC] = "FUNC", + [BTF_KIND_FUNC_PROTO] = "FUNC_PROTO", + [BTF_KIND_VAR] = "VAR", + [BTF_KIND_DATASEC] = "DATASEC", +}; + +static const char *btf_int_enc_str(__u8 encoding) +{ + switch (encoding) { + case 0: + return "(none)"; + case BTF_INT_SIGNED: + return "SIGNED"; + case BTF_INT_CHAR: + return "CHAR"; + case BTF_INT_BOOL: + return "BOOL"; + default: + return "UNKN"; + } +} + +static const char *btf_var_linkage_str(__u32 linkage) +{ + switch (linkage) { + case BTF_VAR_STATIC: + return "static"; + case BTF_VAR_GLOBAL_ALLOCATED: + return "global-alloc"; + default: + return "(unknown)"; + } +} + +static const char *btf_str(const struct btf *btf, __u32 off) +{ + if (!off) + return "(anon)"; + return btf__name_by_offset(btf, off) ? : "(invalid)"; +} + +static int dump_btf_type(const struct btf *btf, __u32 id, + const struct btf_type *t) +{ + json_writer_t *w = json_wtr; + int kind, safe_kind; + + kind = BTF_INFO_KIND(t->info); + safe_kind = kind <= BTF_KIND_MAX ? 
kind : BTF_KIND_UNKN; + + if (json_output) { + jsonw_start_object(w); + jsonw_uint_field(w, "id", id); + jsonw_string_field(w, "kind", btf_kind_str[safe_kind]); + jsonw_string_field(w, "name", btf_str(btf, t->name_off)); + } else { + printf("[%u] %s '%s'", id, btf_kind_str[safe_kind], + btf_str(btf, t->name_off)); + } + + switch (BTF_INFO_KIND(t->info)) { + case BTF_KIND_INT: { + __u32 v = *(__u32 *)(t + 1); + const char *enc; + + enc = btf_int_enc_str(BTF_INT_ENCODING(v)); + + if (json_output) { + jsonw_uint_field(w, "size", t->size); + jsonw_uint_field(w, "bits_offset", BTF_INT_OFFSET(v)); + jsonw_uint_field(w, "nr_bits", BTF_INT_BITS(v)); + jsonw_string_field(w, "encoding", enc); + } else { + printf(" size=%u bits_offset=%u nr_bits=%u encoding=%s", + t->size, BTF_INT_OFFSET(v), BTF_INT_BITS(v), + enc); + } + break; + } + case BTF_KIND_PTR: + case BTF_KIND_CONST: + case BTF_KIND_VOLATILE: + case BTF_KIND_RESTRICT: + case BTF_KIND_TYPEDEF: + if (json_output) + jsonw_uint_field(w, "type_id", t->type); + else + printf(" type_id=%u", t->type); + break; + case BTF_KIND_ARRAY: { + const struct btf_array *arr = (const void *)(t + 1); + + if (json_output) { + jsonw_uint_field(w, "type_id", arr->type); + jsonw_uint_field(w, "index_type_id", arr->index_type); + jsonw_uint_field(w, "nr_elems", arr->nelems); + } else { + printf(" type_id=%u index_type_id=%u nr_elems=%u", + arr->type, arr->index_type, arr->nelems); + } + break; + } + case BTF_KIND_STRUCT: + case BTF_KIND_UNION: { + const struct btf_member *m = (const void *)(t + 1); + __u16 vlen = BTF_INFO_VLEN(t->info); + int i; + + if (json_output) { + jsonw_uint_field(w, "size", t->size); + jsonw_uint_field(w, "vlen", vlen); + jsonw_name(w, "members"); + jsonw_start_array(w); + } else { + printf(" size=%u vlen=%u", t->size, vlen); + } + for (i = 0; i < vlen; i++, m++) { + const char *name = btf_str(btf, m->name_off); + __u32 bit_off, bit_sz; + + if (BTF_INFO_KFLAG(t->info)) { + bit_off = BTF_MEMBER_BIT_OFFSET(m->offset); + bit_sz = BTF_MEMBER_BITFIELD_SIZE(m->offset); + } else { + bit_off = m->offset; + bit_sz = 0; + } + + if (json_output) { + jsonw_start_object(w); + jsonw_string_field(w, "name", name); + jsonw_uint_field(w, "type_id", m->type); + jsonw_uint_field(w, "bits_offset", bit_off); + if (bit_sz) { + jsonw_uint_field(w, "bitfield_size", + bit_sz); + } + jsonw_end_object(w); + } else { + printf("\n\t'%s' type_id=%u bits_offset=%u", + name, m->type, bit_off); + if (bit_sz) + printf(" bitfield_size=%u", bit_sz); + } + } + if (json_output) + jsonw_end_array(w); + break; + } + case BTF_KIND_ENUM: { + const struct btf_enum *v = (const void *)(t + 1); + __u16 vlen = BTF_INFO_VLEN(t->info); + int i; + + if (json_output) { + jsonw_uint_field(w, "size", t->size); + jsonw_uint_field(w, "vlen", vlen); + jsonw_name(w, "values"); + jsonw_start_array(w); + } else { + printf(" size=%u vlen=%u", t->size, vlen); + } + for (i = 0; i < vlen; i++, v++) { + const char *name = btf_str(btf, v->name_off); + + if (json_output) { + jsonw_start_object(w); + jsonw_string_field(w, "name", name); + jsonw_uint_field(w, "val", v->val); + jsonw_end_object(w); + } else { + printf("\n\t'%s' val=%u", name, v->val); + } + } + if (json_output) + jsonw_end_array(w); + break; + } + case BTF_KIND_FWD: { + const char *fwd_kind = BTF_INFO_KIND(t->info) ? 
"union" + : "struct"; + + if (json_output) + jsonw_string_field(w, "fwd_kind", fwd_kind); + else + printf(" fwd_kind=%s", fwd_kind); + break; + } + case BTF_KIND_FUNC: + if (json_output) + jsonw_uint_field(w, "type_id", t->type); + else + printf(" type_id=%u", t->type); + break; + case BTF_KIND_FUNC_PROTO: { + const struct btf_param *p = (const void *)(t + 1); + __u16 vlen = BTF_INFO_VLEN(t->info); + int i; + + if (json_output) { + jsonw_uint_field(w, "ret_type_id", t->type); + jsonw_uint_field(w, "vlen", vlen); + jsonw_name(w, "params"); + jsonw_start_array(w); + } else { + printf(" ret_type_id=%u vlen=%u", t->type, vlen); + } + for (i = 0; i < vlen; i++, p++) { + const char *name = btf_str(btf, p->name_off); + + if (json_output) { + jsonw_start_object(w); + jsonw_string_field(w, "name", name); + jsonw_uint_field(w, "type_id", p->type); + jsonw_end_object(w); + } else { + printf("\n\t'%s' type_id=%u", name, p->type); + } + } + if (json_output) + jsonw_end_array(w); + break; + } + case BTF_KIND_VAR: { + const struct btf_var *v = (const void *)(t + 1); + const char *linkage; + + linkage = btf_var_linkage_str(v->linkage); + + if (json_output) { + jsonw_uint_field(w, "type_id", t->type); + jsonw_string_field(w, "linkage", linkage); + } else { + printf(" type_id=%u, linkage=%s", t->type, linkage); + } + break; + } + case BTF_KIND_DATASEC: { + const struct btf_var_secinfo *v = (const void *)(t+1); + __u16 vlen = BTF_INFO_VLEN(t->info); + int i; + + if (json_output) { + jsonw_uint_field(w, "size", t->size); + jsonw_uint_field(w, "vlen", vlen); + jsonw_name(w, "vars"); + jsonw_start_array(w); + } else { + printf(" size=%u vlen=%u", t->size, vlen); + } + for (i = 0; i < vlen; i++, v++) { + if (json_output) { + jsonw_start_object(w); + jsonw_uint_field(w, "type_id", v->type); + jsonw_uint_field(w, "offset", v->offset); + jsonw_uint_field(w, "size", v->size); + jsonw_end_object(w); + } else { + printf("\n\ttype_id=%u offset=%u size=%u", + v->type, v->offset, v->size); + } + } + if (json_output) + jsonw_end_array(w); + break; + } + default: + break; + } + + if (json_output) + jsonw_end_object(json_wtr); + else + printf("\n"); + + return 0; +} + +static int dump_btf_raw(const struct btf *btf, + __u32 *root_type_ids, int root_type_cnt) +{ + const struct btf_type *t; + int i; + + if (json_output) { + jsonw_start_object(json_wtr); + jsonw_name(json_wtr, "types"); + jsonw_start_array(json_wtr); + } + + if (root_type_cnt) { + for (i = 0; i < root_type_cnt; i++) { + t = btf__type_by_id(btf, root_type_ids[i]); + dump_btf_type(btf, root_type_ids[i], t); + } + } else { + int cnt = btf__get_nr_types(btf); + + for (i = 1; i <= cnt; i++) { + t = btf__type_by_id(btf, i); + dump_btf_type(btf, i, t); + } + } + + if (json_output) { + jsonw_end_array(json_wtr); + jsonw_end_object(json_wtr); + } + return 0; +} + +static bool check_btf_endianness(GElf_Ehdr *ehdr) +{ + static unsigned int const endian = 1; + + switch (ehdr->e_ident[EI_DATA]) { + case ELFDATA2LSB: + return *(unsigned char const *)&endian == 1; + case ELFDATA2MSB: + return *(unsigned char const *)&endian == 0; + default: + return 0; + } +} + +static int btf_load_from_elf(const char *path, struct btf **btf) +{ + int err = -1, fd = -1, idx = 0; + Elf_Data *btf_data = NULL; + Elf_Scn *scn = NULL; + Elf *elf = NULL; + GElf_Ehdr ehdr; + + if (elf_version(EV_CURRENT) == EV_NONE) { + p_err("failed to init libelf for %s", path); + return -1; + } + + fd = open(path, O_RDONLY); + if (fd < 0) { + p_err("failed to open %s: %s", path, strerror(errno)); + return -1; + } 
+ + elf = elf_begin(fd, ELF_C_READ, NULL); + if (!elf) { + p_err("failed to open %s as ELF file", path); + goto done; + } + if (!gelf_getehdr(elf, &ehdr)) { + p_err("failed to get EHDR from %s", path); + goto done; + } + if (!check_btf_endianness(&ehdr)) { + p_err("non-native ELF endianness is not supported"); + goto done; + } + if (!elf_rawdata(elf_getscn(elf, ehdr.e_shstrndx), NULL)) { + p_err("failed to get e_shstrndx from %s\n", path); + goto done; + } + + while ((scn = elf_nextscn(elf, scn)) != NULL) { + GElf_Shdr sh; + char *name; + + idx++; + if (gelf_getshdr(scn, &sh) != &sh) { + p_err("failed to get section(%d) header from %s", + idx, path); + goto done; + } + name = elf_strptr(elf, ehdr.e_shstrndx, sh.sh_name); + if (!name) { + p_err("failed to get section(%d) name from %s", + idx, path); + goto done; + } + if (strcmp(name, BTF_ELF_SEC) == 0) { + btf_data = elf_getdata(scn, 0); + if (!btf_data) { + p_err("failed to get section(%d, %s) data from %s", + idx, name, path); + goto done; + } + break; + } + } + + if (!btf_data) { + p_err("%s ELF section not found in %s", BTF_ELF_SEC, path); + goto done; + } + + *btf = btf__new(btf_data->d_buf, btf_data->d_size); + if (IS_ERR(*btf)) { + err = PTR_ERR(*btf); + *btf = NULL; + p_err("failed to load BTF data from %s: %s", + path, strerror(err)); + goto done; + } + + err = 0; +done: + if (err) { + if (*btf) { + btf__free(*btf); + *btf = NULL; + } + } + if (elf) + elf_end(elf); + close(fd); + return err; +} + +static int do_dump(int argc, char **argv) +{ + struct btf *btf = NULL; + __u32 root_type_ids[2]; + int root_type_cnt = 0; + __u32 btf_id = -1; + const char *src; + int fd = -1; + int err; + + if (!REQ_ARGS(2)) { + usage(); + return -1; + } + src = GET_ARG(); + + if (is_prefix(src, "map")) { + struct bpf_map_info info = {}; + __u32 len = sizeof(info); + + if (!REQ_ARGS(2)) { + usage(); + return -1; + } + + fd = map_parse_fd_and_info(&argc, &argv, &info, &len); + if (fd < 0) + return -1; + + btf_id = info.btf_id; + if (argc && is_prefix(*argv, "key")) { + root_type_ids[root_type_cnt++] = info.btf_key_type_id; + NEXT_ARG(); + } else if (argc && is_prefix(*argv, "value")) { + root_type_ids[root_type_cnt++] = info.btf_value_type_id; + NEXT_ARG(); + } else if (argc && is_prefix(*argv, "all")) { + NEXT_ARG(); + } else if (argc && is_prefix(*argv, "kv")) { + root_type_ids[root_type_cnt++] = info.btf_key_type_id; + root_type_ids[root_type_cnt++] = info.btf_value_type_id; + NEXT_ARG(); + } else { + root_type_ids[root_type_cnt++] = info.btf_key_type_id; + root_type_ids[root_type_cnt++] = info.btf_value_type_id; + } + } else if (is_prefix(src, "prog")) { + struct bpf_prog_info info = {}; + __u32 len = sizeof(info); + + if (!REQ_ARGS(2)) { + usage(); + return -1; + } + + fd = prog_parse_fd(&argc, &argv); + if (fd < 0) + return -1; + + err = bpf_obj_get_info_by_fd(fd, &info, &len); + if (err) { + p_err("can't get prog info: %s", strerror(errno)); + goto done; + } + + btf_id = info.btf_id; + } else if (is_prefix(src, "id")) { + char *endptr; + + btf_id = strtoul(*argv, &endptr, 0); + if (*endptr) { + p_err("can't parse %s as ID", **argv); + return -1; + } + NEXT_ARG(); + } else if (is_prefix(src, "file")) { + err = btf_load_from_elf(*argv, &btf); + if (err) + goto done; + NEXT_ARG(); + } else { + err = -1; + p_err("unrecognized BTF source specifier: '%s'", src); + goto done; + } + + if (!btf) { + err = btf__get_from_id(btf_id, &btf); + if (err) { + p_err("get btf by id (%u): %s", btf_id, strerror(err)); + goto done; + } + if (!btf) { + err = ENOENT; + 
p_err("can't find btf with ID (%u)", btf_id); + goto done; + } + } + + dump_btf_raw(btf, root_type_ids, root_type_cnt); + +done: + close(fd); + btf__free(btf); + return err; +} + +static int do_help(int argc, char **argv) +{ + if (json_output) { + jsonw_null(json_wtr); + return 0; + } + + fprintf(stderr, + "Usage: %s btf dump BTF_SRC\n" + " %s btf help\n" + "\n" + " BTF_SRC := { id BTF_ID | prog PROG | map MAP [{key | value | kv | all}] | file FILE }\n" + " " HELP_SPEC_MAP "\n" + " " HELP_SPEC_PROGRAM "\n" + " " HELP_SPEC_OPTIONS "\n" + "", + bin_name, bin_name); + + return 0; +} + +static const struct cmd cmds[] = { + { "help", do_help }, + { "dump", do_dump }, + { 0 } +}; + +int do_btf(int argc, char **argv) +{ + return cmd_select(cmds, argc, argv, do_help); +} diff --git a/tools/bpf/bpftool/main.c b/tools/bpf/bpftool/main.c index a3658eb73ffc..c85809a5c2b5 100644 --- a/tools/bpf/bpftool/main.c +++ b/tools/bpf/bpftool/main.c @@ -56,7 +56,7 @@ static int do_help(int argc, char **argv) " %s batch file FILE\n" " %s version\n" "\n" - " OBJECT := { prog | map | cgroup | perf | net | feature }\n" + " OBJECT := { prog | map | cgroup | perf | net | feature | btf }\n" " " HELP_SPEC_OPTIONS "\n" "", bin_name, bin_name, bin_name); @@ -188,6 +188,7 @@ static const struct cmd cmds[] = { { "perf", do_perf }, { "net", do_net }, { "feature", do_feature }, + { "btf", do_btf }, { "version", do_version }, { 0 } }; diff --git a/tools/bpf/bpftool/main.h b/tools/bpf/bpftool/main.h index 1ccc46169a19..3d63feb7f852 100644 --- a/tools/bpf/bpftool/main.h +++ b/tools/bpf/bpftool/main.h @@ -150,6 +150,7 @@ int do_perf(int argc, char **arg); int do_net(int argc, char **arg); int do_tracelog(int argc, char **arg); int do_feature(int argc, char **argv); +int do_btf(int argc, char **argv); int parse_u32_arg(int *argc, char ***argv, __u32 *val, const char *what); int prog_parse_fd(int *argc, char ***argv); -- Gitee From c5e54f35af9b9702889202c1eb3782feecc55bce Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 25 Apr 2019 15:30:09 -0700 Subject: [PATCH 06/26] bpftool/docs: add btf sub-command documentation ANBZ: #5530 commit ca253339af928528ceb34770b076d3230ed7d629 upstream. Document usage and sample output format for `btf dump` sub-command. 
Cc: Daniel Borkmann Cc: Alexei Starovoitov Cc: Yonghong Song Cc: Martin KaFai Lau Cc: Song Liu Cc: Arnaldo Carvalho de Melo Acked-by: Yonghong Song Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Signed-off-by: Yuanhe Shu --- .../bpf/bpftool/Documentation/bpftool-btf.rst | 222 ++++++++++++++++++ .../bpftool/Documentation/bpftool-cgroup.rst | 3 +- .../bpftool/Documentation/bpftool-feature.rst | 3 +- .../bpf/bpftool/Documentation/bpftool-map.rst | 3 +- .../bpf/bpftool/Documentation/bpftool-net.rst | 3 +- .../bpftool/Documentation/bpftool-perf.rst | 3 +- .../bpftool/Documentation/bpftool-prog.rst | 3 +- tools/bpf/bpftool/Documentation/bpftool.rst | 3 +- 8 files changed, 236 insertions(+), 7 deletions(-) create mode 100644 tools/bpf/bpftool/Documentation/bpftool-btf.rst diff --git a/tools/bpf/bpftool/Documentation/bpftool-btf.rst b/tools/bpf/bpftool/Documentation/bpftool-btf.rst new file mode 100644 index 000000000000..2dbc1413fabd --- /dev/null +++ b/tools/bpf/bpftool/Documentation/bpftool-btf.rst @@ -0,0 +1,222 @@ +================ +bpftool-btf +================ +------------------------------------------------------------------------------- +tool for inspection of BTF data +------------------------------------------------------------------------------- + +:Manual section: 8 + +SYNOPSIS +======== + + **bpftool** [*OPTIONS*] **btf** *COMMAND* + + *OPTIONS* := { { **-j** | **--json** } [{ **-p** | **--pretty** }] } + + *COMMANDS* := { **dump** | **help** } + +BTF COMMANDS +============= + +| **bpftool** **btf dump** *BTF_SRC* +| **bpftool** **btf help** +| +| *BTF_SRC* := { **id** *BTF_ID* | **prog** *PROG* | **map** *MAP* [{**key** | **value** | **kv** | **all**}] | **file** *FILE* } +| *MAP* := { **id** *MAP_ID* | **pinned** *FILE* } +| *PROG* := { **id** *PROG_ID* | **pinned** *FILE* | **tag** *PROG_TAG* } + +DESCRIPTION +=========== + **bpftool btf dump** *BTF_SRC* + Dump BTF entries from a given *BTF_SRC*. + + When **id** is specified, BTF object with that ID will be + loaded and all its BTF types emitted. + + When **map** is provided, it's expected that map has + associated BTF object with BTF types describing key and + value. It's possible to select whether to dump only BTF + type(s) associated with key (**key**), value (**value**), + both key and value (**kv**), or all BTF types present in + associated BTF object (**all**). If not specified, **kv** + is assumed. + + When **prog** is provided, it's expected that program has + associated BTF object with BTF types. + + When specifying *FILE*, an ELF file is expected, containing + .BTF section with well-defined BTF binary format data, + typically produced by clang or pahole. + + **bpftool btf help** + Print short help message. + +OPTIONS +======= + -h, --help + Print short generic help message (similar to **bpftool help**). + + -V, --version + Print version number (similar to **bpftool version**). + + -j, --json + Generate JSON output. For commands that cannot produce JSON, this + option has no effect. + + -p, --pretty + Generate human-readable JSON output. Implies **-j**. + +EXAMPLES +======== +**# bpftool btf dump id 1226** +:: + + [1] PTR '(anon)' type_id=2 + [2] STRUCT 'dummy_tracepoint_args' size=16 vlen=2 + 'pad' type_id=3 bits_offset=0 + 'sock' type_id=4 bits_offset=64 + [3] INT 'long long unsigned int' size=8 bits_offset=0 nr_bits=64 encoding=(none) + [4] PTR '(anon)' type_id=5 + [5] FWD 'sock' fwd_kind=union + +This gives an example of default output for all supported BTF kinds. 
+ +**$ cat prog.c** +:: + + struct fwd_struct; + + enum my_enum { + VAL1 = 3, + VAL2 = 7, + }; + + typedef struct my_struct my_struct_t; + + struct my_struct { + const unsigned int const_int_field; + int bitfield_field: 4; + char arr_field[16]; + const struct fwd_struct *restrict fwd_field; + enum my_enum enum_field; + volatile my_struct_t *typedef_ptr_field; + }; + + union my_union { + int a; + struct my_struct b; + }; + + struct my_struct struct_global_var __attribute__((section("data_sec"))) = { + .bitfield_field = 3, + .enum_field = VAL1, + }; + int global_var __attribute__((section("data_sec"))) = 7; + + __attribute__((noinline)) + int my_func(union my_union *arg1, int arg2) + { + static int static_var __attribute__((section("data_sec"))) = 123; + static_var++; + return static_var; + } + +**$ bpftool btf dump file prog.o** +:: + + [1] PTR '(anon)' type_id=2 + [2] UNION 'my_union' size=48 vlen=2 + 'a' type_id=3 bits_offset=0 + 'b' type_id=4 bits_offset=0 + [3] INT 'int' size=4 bits_offset=0 nr_bits=32 encoding=SIGNED + [4] STRUCT 'my_struct' size=48 vlen=6 + 'const_int_field' type_id=5 bits_offset=0 + 'bitfield_field' type_id=3 bits_offset=32 bitfield_size=4 + 'arr_field' type_id=8 bits_offset=40 + 'fwd_field' type_id=10 bits_offset=192 + 'enum_field' type_id=14 bits_offset=256 + 'typedef_ptr_field' type_id=15 bits_offset=320 + [5] CONST '(anon)' type_id=6 + [6] INT 'unsigned int' size=4 bits_offset=0 nr_bits=32 encoding=(none) + [7] INT 'char' size=1 bits_offset=0 nr_bits=8 encoding=SIGNED + [8] ARRAY '(anon)' type_id=7 index_type_id=9 nr_elems=16 + [9] INT '__ARRAY_SIZE_TYPE__' size=4 bits_offset=0 nr_bits=32 encoding=(none) + [10] RESTRICT '(anon)' type_id=11 + [11] PTR '(anon)' type_id=12 + [12] CONST '(anon)' type_id=13 + [13] FWD 'fwd_struct' fwd_kind=union + [14] ENUM 'my_enum' size=4 vlen=2 + 'VAL1' val=3 + 'VAL2' val=7 + [15] PTR '(anon)' type_id=16 + [16] VOLATILE '(anon)' type_id=17 + [17] TYPEDEF 'my_struct_t' type_id=4 + [18] FUNC_PROTO '(anon)' ret_type_id=3 vlen=2 + 'arg1' type_id=1 + 'arg2' type_id=3 + [19] FUNC 'my_func' type_id=18 + [20] VAR 'struct_global_var' type_id=4, linkage=global-alloc + [21] VAR 'global_var' type_id=3, linkage=global-alloc + [22] VAR 'my_func.static_var' type_id=3, linkage=static + [23] DATASEC 'data_sec' size=0 vlen=3 + type_id=20 offset=0 size=48 + type_id=21 offset=0 size=4 + type_id=22 offset=52 size=4 + +The following commands print BTF types associated with specified map's key, +value, both key and value, and all BTF types, respectively. By default, both +key and value types will be printed. + +**# bpftool btf dump map id 123 key** + +:: + + [39] TYPEDEF 'u32' type_id=37 + +**# bpftool btf dump map id 123 value** + +:: + + [86] PTR '(anon)' type_id=87 + +**# bpftool btf dump map id 123 kv** + +:: + + [39] TYPEDEF 'u32' type_id=37 + [86] PTR '(anon)' type_id=87 + +**# bpftool btf dump map id 123 all** + +:: + + [1] PTR '(anon)' type_id=0 + . + . + . 
+ [2866] ARRAY '(anon)' type_id=52 index_type_id=51 nr_elems=4 + +All the standard ways to specify map or program are supported: + +**# bpftool btf dump map id 123** + +**# bpftool btf dump map pinned /sys/fs/bpf/map_name** + +**# bpftool btf dump prog id 456** + +**# bpftool btf dump prog tag b88e0a09b1d9759d** + +**# bpftool btf dump prog pinned /sys/fs/bpf/prog_name** + +SEE ALSO +======== + **bpf**\ (2), + **bpf-helpers**\ (7), + **bpftool**\ (8), + **bpftool-map**\ (8), + **bpftool-prog**\ (8), + **bpftool-cgroup**\ (8), + **bpftool-feature**\ (8), + **bpftool-net**\ (8), + **bpftool-perf**\ (8) diff --git a/tools/bpf/bpftool/Documentation/bpftool-cgroup.rst b/tools/bpf/bpftool/Documentation/bpftool-cgroup.rst index 89b6b10e2183..ac26876389c2 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-cgroup.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-cgroup.rst @@ -145,4 +145,5 @@ SEE ALSO **bpftool-map**\ (8), **bpftool-feature**\ (8), **bpftool-net**\ (8), - **bpftool-perf**\ (8) + **bpftool-perf**\ (8), + **bpftool-btf**\ (8) diff --git a/tools/bpf/bpftool/Documentation/bpftool-feature.rst b/tools/bpf/bpftool/Documentation/bpftool-feature.rst index 10177343b7ce..14180e887082 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-feature.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-feature.rst @@ -82,4 +82,5 @@ SEE ALSO **bpftool-map**\ (8), **bpftool-cgroup**\ (8), **bpftool-net**\ (8), - **bpftool-perf**\ (8) + **bpftool-perf**\ (8), + **bpftool-btf**\ (8) diff --git a/tools/bpf/bpftool/Documentation/bpftool-map.rst b/tools/bpf/bpftool/Documentation/bpftool-map.rst index 55ecf80ca03e..13ef27b39f20 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-map.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-map.rst @@ -258,4 +258,5 @@ SEE ALSO **bpftool-cgroup**\ (8), **bpftool-feature**\ (8), **bpftool-net**\ (8), - **bpftool-perf**\ (8) + **bpftool-perf**\ (8), + **bpftool-btf**\ (8) diff --git a/tools/bpf/bpftool/Documentation/bpftool-net.rst b/tools/bpf/bpftool/Documentation/bpftool-net.rst index 6b692b428373..934580850f42 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-net.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-net.rst @@ -143,4 +143,5 @@ SEE ALSO **bpftool-map**\ (8), **bpftool-cgroup**\ (8), **bpftool-feature**\ (8), - **bpftool-perf**\ (8) + **bpftool-perf**\ (8), + **bpftool-btf**\ (8) diff --git a/tools/bpf/bpftool/Documentation/bpftool-perf.rst b/tools/bpf/bpftool/Documentation/bpftool-perf.rst index 86154740fabb..0c7576523a21 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-perf.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-perf.rst @@ -85,4 +85,5 @@ SEE ALSO **bpftool-map**\ (8), **bpftool-cgroup**\ (8), **bpftool-feature**\ (8), - **bpftool-net**\ (8) + **bpftool-net**\ (8), + **bpftool-btf**\ (8) diff --git a/tools/bpf/bpftool/Documentation/bpftool-prog.rst b/tools/bpf/bpftool/Documentation/bpftool-prog.rst index 2f183ffd8351..e8118544d118 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-prog.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-prog.rst @@ -271,4 +271,5 @@ SEE ALSO **bpftool-cgroup**\ (8), **bpftool-feature**\ (8), **bpftool-net**\ (8), - **bpftool-perf**\ (8) + **bpftool-perf**\ (8), + **bpftool-btf**\ (8) diff --git a/tools/bpf/bpftool/Documentation/bpftool.rst b/tools/bpf/bpftool/Documentation/bpftool.rst index deb2ca911ddf..3e562d7fd56f 100644 --- a/tools/bpf/bpftool/Documentation/bpftool.rst +++ b/tools/bpf/bpftool/Documentation/bpftool.rst @@ -76,4 +76,5 @@ SEE ALSO **bpftool-cgroup**\ (8), **bpftool-feature**\ (8), 
**bpftool-net**\ (8), - **bpftool-perf**\ (8) + **bpftool-perf**\ (8), + **bpftool-btf**\ (8) -- Gitee From bc5427ae133b8d82e4f85385dcf16fb22524f4e1 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 25 Apr 2019 15:30:10 -0700 Subject: [PATCH 07/26] bpftool: add bash completions for btf command ANBZ: #5530 commit 4a714feefd99c25c7304b43ac58c9d5c0304e7cb upstream. Add full support for btf command in bash-completion script. Cc: Quentin Monnet Cc: Yonghong Song Cc: Alexei Starovoitov Cc: Daniel Borkmann Reviewed-by: Quentin Monnet Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Signed-off-by: Yuanhe Shu --- tools/bpf/bpftool/bash-completion/bpftool | 46 +++++++++++++++++++++++ 1 file changed, 46 insertions(+) diff --git a/tools/bpf/bpftool/bash-completion/bpftool b/tools/bpf/bpftool/bash-completion/bpftool index 9f3ffe1e26ab..bca91d04ed35 100644 --- a/tools/bpf/bpftool/bash-completion/bpftool +++ b/tools/bpf/bpftool/bash-completion/bpftool @@ -217,6 +217,7 @@ _bpftool() done cur=${words[cword]} prev=${words[cword - 1]} + pprev=${words[cword - 2]} local object=${words[1]} command=${words[2]} @@ -607,6 +608,51 @@ _bpftool() ;; esac ;; + btf) + local PROG_TYPE='id pinned tag' + local MAP_TYPE='id pinned' + case $command in + dump) + case $prev in + $command) + COMPREPLY+=( $( compgen -W "id map prog file" -- \ + "$cur" ) ) + return 0 + ;; + prog) + COMPREPLY=( $( compgen -W "$PROG_TYPE" -- "$cur" ) ) + return 0 + ;; + map) + COMPREPLY=( $( compgen -W "$MAP_TYPE" -- "$cur" ) ) + return 0 + ;; + id) + case $pprev in + prog) + _bpftool_get_prog_ids + ;; + map) + _bpftool_get_map_ids + ;; + esac + return 0 + ;; + *) + if [[ $cword == 6 ]] && [[ ${words[3]} == "map" ]]; then + COMPREPLY+=( $( compgen -W 'key value kv all' -- \ + "$cur" ) ) + fi + return 0 + ;; + esac + ;; + *) + [[ $prev == $object ]] && \ + COMPREPLY=( $( compgen -W 'dump help' -- "$cur" ) ) + ;; + esac + ;; cgroup) case $command in show|list) -- Gitee From 7f526ddff53c39bc64a612f19ee5e57ccaf5626d Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 25 Apr 2019 15:30:11 -0700 Subject: [PATCH 08/26] bpftool: fix indendation in bash-completion/bpftool ANBZ: #5530 commit 8ed1875bf3a77d1653b895e41774aec9a341a97f upstream. Fix misaligned default case branch for `prog dump` sub-command. 
Reported-by: Quentin Monnet Cc: Yonghong Song Reviewed-by: Quentin Monnet Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Signed-off-by: Yuanhe Shu --- tools/bpf/bpftool/bash-completion/bpftool | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/tools/bpf/bpftool/bash-completion/bpftool b/tools/bpf/bpftool/bash-completion/bpftool index bca91d04ed35..50e402a5a9c8 100644 --- a/tools/bpf/bpftool/bash-completion/bpftool +++ b/tools/bpf/bpftool/bash-completion/bpftool @@ -273,17 +273,17 @@ _bpftool() "$cur" ) ) return 0 ;; - *) - _bpftool_once_attr 'file' - if _bpftool_search_list 'xlated'; then - COMPREPLY+=( $( compgen -W 'opcodes visual linum' -- \ - "$cur" ) ) - else - COMPREPLY+=( $( compgen -W 'opcodes linum' -- \ - "$cur" ) ) - fi - return 0 - ;; + *) + _bpftool_once_attr 'file' + if _bpftool_search_list 'xlated'; then + COMPREPLY+=( $( compgen -W 'opcodes visual linum' -- \ + "$cur" ) ) + else + COMPREPLY+=( $( compgen -W 'opcodes linum' -- \ + "$cur" ) ) + fi + return 0 + ;; esac ;; pin) -- Gitee From 55972e3d29c48b8d92c22091108d5d197b00cb40 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 16 May 2019 23:21:29 -0700 Subject: [PATCH 09/26] bpftool: fix BTF raw dump of FWD's fwd_kind ANBZ: #5530 commit 9c3ddee1246411a3c9c39bfa5457e49579027f0c upstream. kflag bit determines whether FWD is for struct or union. Use that bit. Fixes: c93cc69004df ("bpftool: add ability to dump BTF types") Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Signed-off-by: Yuanhe Shu --- tools/bpf/bpftool/btf.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/bpf/bpftool/btf.c b/tools/bpf/bpftool/btf.c index 58a2cd002a4b..7317438ecd9e 100644 --- a/tools/bpf/bpftool/btf.c +++ b/tools/bpf/bpftool/btf.c @@ -208,8 +208,8 @@ static int dump_btf_type(const struct btf *btf, __u32 id, break; } case BTF_KIND_FWD: { - const char *fwd_kind = BTF_INFO_KIND(t->info) ? "union" - : "struct"; + const char *fwd_kind = BTF_INFO_KFLAG(t->info) ? "union" + : "struct"; if (json_output) jsonw_string_field(w, "fwd_kind", fwd_kind); -- Gitee From 5a1290b9987bf48b14498d13c5216f876a8463ea Mon Sep 17 00:00:00 2001 From: Quentin Monnet Date: Thu, 15 Aug 2019 15:32:17 +0100 Subject: [PATCH 10/26] tools: bpftool: fix argument for p_err() in BTF do_dump() ANBZ: #5530 commit ed4a3983cd3eb93aaf80de8d8a36efed808acff2 upstream. The last argument passed to one call to the p_err() function is not correct, it should be "*argv" instead of "**argv". This may lead to a segmentation fault error if BTF id cannot be parsed correctly. Let's fix this. 
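For reference, a minimal sketch of why the old call was broken (plain
printf() stands in for p_err() here, since both follow the usual varargs
conversion rules; this is not the actual bpftool code):

	#include <stdio.h>

	int main(int argc, char **argv)
	{
		(void)argc;
		printf("ok:  %s\n", *argv);	/* *argv is a char *, matches %s */
		/*
		 * **argv is a single char; passed for %s it would be read as
		 * a pointer value and may well crash:
		 *
		 *	printf("bad: %s\n", **argv);
		 */
		return 0;
	}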
Fixes: c93cc69004df ("bpftool: add ability to dump BTF types")
Signed-off-by: Quentin Monnet
Reviewed-by: Jakub Kicinski
Signed-off-by: Alexei Starovoitov
Signed-off-by: Yuanhe Shu
---
 tools/bpf/bpftool/btf.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/bpf/bpftool/btf.c b/tools/bpf/bpftool/btf.c
index 7317438ecd9e..1fbf353364d6 100644
--- a/tools/bpf/bpftool/btf.c
+++ b/tools/bpf/bpftool/btf.c
@@ -517,7 +517,7 @@ static int do_dump(int argc, char **argv)
 
 		btf_id = strtoul(*argv, &endptr, 0);
 		if (*endptr) {
-			p_err("can't parse %s as ID", **argv);
+			p_err("can't parse %s as ID", *argv);
 			return -1;
 		}
 		NEXT_ARG();
--
Gitee

From 8c942a4d93926d1bb51c10b019b0d702cb6e2ead Mon Sep 17 00:00:00 2001
From: Tianjia Zhang
Date: Sun, 2 Aug 2020 19:15:40 +0800
Subject: [PATCH 11/26] tools, bpftool: Fix wrong return value in do_dump()

ANBZ: #5530

commit 041549b7b2c7811ec40e705c439211f00ade2dda upstream.

In case the btf_id does not exist, a negative error code -ENOENT should
be returned.

Fixes: c93cc69004df3 ("bpftool: add ability to dump BTF types")
Signed-off-by: Tianjia Zhang
Signed-off-by: Daniel Borkmann
Reviewed-by: Tobias Klauser
Acked-by: Andrii Nakryiko
Acked-by: John Fastabend
Link: https://lore.kernel.org/bpf/20200802111540.5384-1-tianjia.zhang@linux.alibaba.com
Signed-off-by: Yuanhe Shu
---
 tools/bpf/bpftool/btf.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/bpf/bpftool/btf.c b/tools/bpf/bpftool/btf.c
index 1fbf353364d6..5c67bbcdf914 100644
--- a/tools/bpf/bpftool/btf.c
+++ b/tools/bpf/bpftool/btf.c
@@ -539,7 +539,7 @@ static int do_dump(int argc, char **argv)
 			goto done;
 		}
 		if (!btf) {
-			err = ENOENT;
+			err = -ENOENT;
 			p_err("can't find btf with ID (%u)", btf_id);
 			goto done;
 		}
--
Gitee

From 88147221ec1954b23ce6ef73bb8b4b86454c62c7 Mon Sep 17 00:00:00 2001
From: Nadav Amit
Date: Thu, 25 Apr 2019 17:11:42 -0700
Subject: [PATCH 12/26] mm/tlb: Provide default nmi_uaccess_okay()

ANBZ: #5530

commit 5932c9fd19e6e5ac84756c5c32fe5155d9a6b458 upstream.

x86 has an nmi_uaccess_okay(), but other architectures do not.
Arch-independent code might need to know whether access to user
addresses is ok in an NMI context or in other code whose execution
context is unknown. Specifically, this function is needed for
bpf_probe_write_user().

Add a default implementation of nmi_uaccess_okay() for architectures
that do not have such a function.

Signed-off-by: Nadav Amit
Signed-off-by: Rick Edgecombe
Signed-off-by: Peter Zijlstra (Intel)
Cc: Andy Lutomirski
Cc: Borislav Petkov
Cc: Dave Hansen
Cc: H. Peter Anvin
Cc: Linus Torvalds
Cc: Rik van Riel
Cc: Thomas Gleixner
Link: https://lkml.kernel.org/r/20190426001143.4983-23-namit@vmware.com
Signed-off-by: Ingo Molnar
Signed-off-by: Yuanhe Shu
---
 arch/x86/include/asm/tlbflush.h | 2 ++
 include/asm-generic/tlb.h       | 9 +++++++++
 2 files changed, 11 insertions(+)

diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index 79ec7add5f98..91690778faf9 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -290,6 +290,8 @@ static inline bool nmi_uaccess_okay(void)
 	return true;
 }
 
+#define nmi_uaccess_okay nmi_uaccess_okay
+
 /* Initialize cr4 shadow for this CPU. */
 static inline void cr4_init_shadow(void)
 {
diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h
index db72ad39853b..9c1c3b35bd30 100644
--- a/include/asm-generic/tlb.h
+++ b/include/asm-generic/tlb.h
@@ -20,6 +20,15 @@
 #include
 #include
 
+/*
+ * Blindly accessing user memory from NMI context can be dangerous
+ * if we're in the middle of switching the current user task or switching
+ * the loaded mm.
+ */
+#ifndef nmi_uaccess_okay
+# define nmi_uaccess_okay() true
+#endif
+
 #ifdef CONFIG_HAVE_RCU_TABLE_FREE
 /*
  * Semi RCU freeing of the page directories.
--
Gitee

From 358aaf4e856746ae44ac652562b9f7378542ba8c Mon Sep 17 00:00:00 2001
From: Nadav Amit
Date: Thu, 25 Apr 2019 17:11:43 -0700
Subject: [PATCH 13/26] bpf: Fail bpf_probe_write_user() while mm is switched

ANBZ: #5530

commit c7b6f29b6257532792fc722b68fcc0e00b5a856c upstream.

When using a temporary mm, bpf_probe_write_user() should not be able to
write to user memory, since user memory addresses may be used to map
kernel memory. Detect these cases and fail bpf_probe_write_user() in
such cases.

Suggested-by: Jann Horn
Reported-by: Jann Horn
Signed-off-by: Nadav Amit
Signed-off-by: Rick Edgecombe
Signed-off-by: Peter Zijlstra (Intel)
Cc: Alexei Starovoitov
Cc: Andy Lutomirski
Cc: Borislav Petkov
Cc: Daniel Borkmann
Cc: Dave Hansen
Cc: H. Peter Anvin
Cc: Linus Torvalds
Cc: Rik van Riel
Cc: Thomas Gleixner
Link: https://lkml.kernel.org/r/20190426001143.4983-24-namit@vmware.com
Signed-off-by: Ingo Molnar
Signed-off-by: Yuanhe Shu
---
 kernel/trace/bpf_trace.c | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 07bccf9fb697..6143aa8b9aeb 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -14,6 +14,8 @@
 #include
 #include
 
+#include
+
 #include "trace_probe.h"
 #include "trace.h"
 
@@ -126,6 +128,10 @@ BPF_CALL_3(bpf_probe_write_user, void *, unsafe_ptr, const void *, src,
 	 * access_ok() should prevent writing to non-user memory, but in
 	 * some situations (nommu, temporary switch, etc) access_ok() does
 	 * not provide enough validation, hence the check on KERNEL_DS.
+	 *
+	 * nmi_uaccess_okay() ensures the probe is not run in an interim
+	 * state, when the task or mm are switched. This is specifically
+	 * required to prevent the use of temporary mm.
 	 */
 
 	if (unlikely(in_interrupt() ||
@@ -133,6 +139,8 @@ BPF_CALL_3(bpf_probe_write_user, void *, unsafe_ptr, const void *, src,
 		return -EPERM;
 	if (unlikely(uaccess_kernel()))
 		return -EPERM;
+	if (unlikely(!nmi_uaccess_okay()))
+		return -EPERM;
 	if (!access_ok(VERIFY_WRITE, unsafe_ptr, size))
 		return -EPERM;
 
--
Gitee

From 1c216b20c3103ed2eef98080ca4a9f0de54f5f96 Mon Sep 17 00:00:00 2001
From: Matt Mullins
Date: Fri, 26 Apr 2019 11:49:47 -0700
Subject: [PATCH 14/26] bpf: add writable context for raw tracepoints

ANBZ: #5530

commit 9df1c28bb75217b244257152ab7d788bb2a386d0 upstream.

This is an opt-in interface that allows a tracepoint to provide a safe
buffer that can be written from a BPF_PROG_TYPE_RAW_TRACEPOINT program.
The size of the buffer must be a compile-time constant, and is checked
before allowing a BPF program to attach to a tracepoint that uses this
feature.

The pointer to this buffer will be the first argument of tracepoints
that opt in; the pointer is valid and can be bpf_probe_read() by both
BPF_PROG_TYPE_RAW_TRACEPOINT and BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE
programs that attach to such a tracepoint, but the buffer to which it
points may only be written by the latter.
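For illustration, a hedged sketch of how a tracepoint might opt in
through the DEFINE_EVENT_WRITABLE() macro added below (the template,
event, and struct names here are hypothetical; the nbd patch later in
this series is a real user):

	/*
	 * The writable buffer must be the first TP_PROTO argument, and the
	 * size passed here must match sizeof(*first argument), which the
	 * macro enforces at build time via BUILD_BUG_ON_ZERO().
	 */
	DEFINE_EVENT_WRITABLE(foo_template, foo_event,
		TP_PROTO(struct foo_buf *buf, int len),
		TP_ARGS(buf, len),
		sizeof(struct foo_buf));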
Signed-off-by: Matt Mullins Acked-by: Yonghong Song Signed-off-by: Alexei Starovoitov Signed-off-by: Yuanhe Shu --- include/linux/bpf.h | 2 ++ include/linux/bpf_types.h | 1 + include/linux/tracepoint-defs.h | 1 + include/trace/bpf_probe.h | 27 +++++++++++++++++++++++++-- include/uapi/linux/bpf.h | 1 + kernel/bpf/syscall.c | 8 ++++++-- kernel/bpf/verifier.c | 31 +++++++++++++++++++++++++++++++ kernel/trace/bpf_trace.c | 24 ++++++++++++++++++++++++ 8 files changed, 91 insertions(+), 4 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index a8653137b52c..e9fde0bdfe0b 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -280,6 +280,7 @@ enum bpf_reg_type { PTR_TO_SOCK_COMMON_OR_NULL, /* reg points to sock_common or NULL */ PTR_TO_TCP_SOCK, /* reg points to struct tcp_sock */ PTR_TO_TCP_SOCK_OR_NULL, /* reg points to struct tcp_sock or NULL */ + PTR_TO_TP_BUFFER, /* reg points to a writable raw tp's buffer */ }; /* The information passed from prog-specific *_is_valid_access @@ -369,6 +370,7 @@ struct bpf_prog_aux { u32 used_map_cnt; u32 max_ctx_offset; u32 max_pkt_offset; + u32 max_tp_access; u32 stack_depth; u32 id; u32 func_cnt; /* used by non-func prog as the number of func progs */ diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h index eb1e4f867a5e..5a9975678d6f 100644 --- a/include/linux/bpf_types.h +++ b/include/linux/bpf_types.h @@ -25,6 +25,7 @@ BPF_PROG_TYPE(BPF_PROG_TYPE_KPROBE, kprobe) BPF_PROG_TYPE(BPF_PROG_TYPE_TRACEPOINT, tracepoint) BPF_PROG_TYPE(BPF_PROG_TYPE_PERF_EVENT, perf_event) BPF_PROG_TYPE(BPF_PROG_TYPE_RAW_TRACEPOINT, raw_tracepoint) +BPF_PROG_TYPE(BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE, raw_tracepoint_writable) #endif #ifdef CONFIG_CGROUP_BPF BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_DEVICE, cg_dev) diff --git a/include/linux/tracepoint-defs.h b/include/linux/tracepoint-defs.h index 49ba9cde7e4b..b29950a19205 100644 --- a/include/linux/tracepoint-defs.h +++ b/include/linux/tracepoint-defs.h @@ -45,6 +45,7 @@ struct bpf_raw_event_map { struct tracepoint *tp; void *bpf_func; u32 num_args; + u32 writable_size; } __aligned(32); #endif diff --git a/include/trace/bpf_probe.h b/include/trace/bpf_probe.h index 505dae0bed80..d6e556c0a085 100644 --- a/include/trace/bpf_probe.h +++ b/include/trace/bpf_probe.h @@ -69,8 +69,7 @@ __bpf_trace_##call(void *__data, proto) \ * to make sure that if the tracepoint handling changes, the * bpf probe will fail to compile unless it too is updated. */ -#undef DEFINE_EVENT -#define DEFINE_EVENT(template, call, proto, args) \ +#define __DEFINE_EVENT(template, call, proto, args, size) \ static inline void bpf_test_probe_##call(void) \ { \ check_trace_callback_type_##call(__bpf_trace_##template); \ @@ -81,12 +80,36 @@ __bpf_trace_tp_map_##call = { \ .tp = &__tracepoint_##call, \ .bpf_func = (void *)__bpf_trace_##template, \ .num_args = COUNT_ARGS(args), \ + .writable_size = size, \ }; +#define FIRST(x, ...) x + +#undef DEFINE_EVENT_WRITABLE +#define DEFINE_EVENT_WRITABLE(template, call, proto, args, size) \ +static inline void bpf_test_buffer_##call(void) \ +{ \ + /* BUILD_BUG_ON() is ignored if the code is completely eliminated, but \ + * BUILD_BUG_ON_ZERO() uses a different mechanism that is not \ + * dead-code-eliminated. 
\ + */ \ + FIRST(proto); \ + (void)BUILD_BUG_ON_ZERO(size != sizeof(*FIRST(args))); \ +} \ +__DEFINE_EVENT(template, call, PARAMS(proto), PARAMS(args), size) + +#undef DEFINE_EVENT +#define DEFINE_EVENT(template, call, proto, args) \ + __DEFINE_EVENT(template, call, PARAMS(proto), PARAMS(args), 0) #undef DEFINE_EVENT_PRINT #define DEFINE_EVENT_PRINT(template, name, proto, args, print) \ DEFINE_EVENT(template, name, PARAMS(proto), PARAMS(args)) #include TRACE_INCLUDE(TRACE_INCLUDE_FILE) + +#undef DEFINE_EVENT_WRITABLE +#undef __DEFINE_EVENT +#undef FIRST + #endif /* CONFIG_BPF_EVENTS */ diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index a8d87ab34ef0..3d62b8beefef 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -169,6 +169,7 @@ enum bpf_prog_type { BPF_PROG_TYPE_SK_REUSEPORT, BPF_PROG_TYPE_FLOW_DISSECTOR, BPF_PROG_TYPE_CGROUP_SYSCTL, + BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE, }; enum bpf_attach_type { diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 565d2ac21478..cd7c9da22a56 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1803,12 +1803,16 @@ static int bpf_raw_tracepoint_open(const union bpf_attr *attr) return -ENOMEM; raw_tp->btp = btp; - prog = bpf_prog_get_type(attr->raw_tracepoint.prog_fd, - BPF_PROG_TYPE_RAW_TRACEPOINT); + prog = bpf_prog_get(attr->raw_tracepoint.prog_fd); if (IS_ERR(prog)) { err = PTR_ERR(prog); goto out_free_tp; } + if (prog->type != BPF_PROG_TYPE_RAW_TRACEPOINT && + prog->type != BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE) { + err = -EINVAL; + goto out_put_prog; + } err = bpf_probe_register(raw_tp->btp, prog); if (err) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 376fb6f8dd00..616aa8162381 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -404,6 +404,7 @@ static const char * const reg_type_str[] = { [PTR_TO_SOCK_COMMON_OR_NULL] = "sock_common_or_null", [PTR_TO_TCP_SOCK] = "tcp_sock", [PTR_TO_TCP_SOCK_OR_NULL] = "tcp_sock_or_null", + [PTR_TO_TP_BUFFER] = "tp_buffer", }; static char slot_type_char[] = { @@ -1987,6 +1988,32 @@ static int check_ctx_reg(struct bpf_verifier_env *env, return 0; } +static int check_tp_buffer_access(struct bpf_verifier_env *env, + const struct bpf_reg_state *reg, + int regno, int off, int size) +{ + if (off < 0) { + verbose(env, + "R%d invalid tracepoint buffer access: off=%d, size=%d", + regno, off, size); + return -EACCES; + } + if (!tnum_is_const(reg->var_off) || reg->var_off.value) { + char tn_buf[48]; + + tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); + verbose(env, + "R%d invalid variable buffer offset: off=%d, var_off=%s", + regno, off, tn_buf); + return -EACCES; + } + if (off + size > env->prog->aux->max_tp_access) + env->prog->aux->max_tp_access = off + size; + + return 0; +} + + /* truncate register to smaller size (in bytes) * must be called with size < BPF_REG_SIZE */ @@ -2131,6 +2158,10 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn err = check_sock_access(env, insn_idx, regno, off, size, t); if (!err && value_regno >= 0) mark_reg_unknown(env, regs, value_regno); + } else if (reg->type == PTR_TO_TP_BUFFER) { + err = check_tp_buffer_access(env, reg, regno, off, size); + if (!err && t == BPF_READ && value_regno >= 0) + mark_reg_unknown(env, regs, value_regno); } else { verbose(env, "R%d invalid mem access '%s'\n", regno, reg_type_str[reg->type]); diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 6143aa8b9aeb..a29375b9d9a1 100644 --- a/kernel/trace/bpf_trace.c +++ 
b/kernel/trace/bpf_trace.c @@ -971,6 +971,27 @@ const struct bpf_verifier_ops raw_tracepoint_verifier_ops = { const struct bpf_prog_ops raw_tracepoint_prog_ops = { }; +static bool raw_tp_writable_prog_is_valid_access(int off, int size, + enum bpf_access_type type, + const struct bpf_prog *prog, + struct bpf_insn_access_aux *info) +{ + if (off == 0) { + if (size != sizeof(u64) || type != BPF_READ) + return false; + info->reg_type = PTR_TO_TP_BUFFER; + } + return raw_tp_prog_is_valid_access(off, size, type, prog, info); +} + +const struct bpf_verifier_ops raw_tracepoint_writable_verifier_ops = { + .get_func_proto = raw_tp_prog_func_proto, + .is_valid_access = raw_tp_writable_prog_is_valid_access, +}; + +const struct bpf_prog_ops raw_tracepoint_writable_prog_ops = { +}; + static bool pe_prog_is_valid_access(int off, int size, enum bpf_access_type type, const struct bpf_prog *prog, struct bpf_insn_access_aux *info) @@ -1251,6 +1272,9 @@ static int __bpf_probe_register(struct bpf_raw_event_map *btp, struct bpf_prog * if (prog->aux->max_ctx_offset > btp->num_args * sizeof(u64)) return -EINVAL; + if (prog->aux->max_tp_access > btp->writable_size) + return -EINVAL; + return tracepoint_probe_register(tp, (void *)btp->bpf_func, prog); } -- Gitee From 6f7c9e6b47cac9ff4eeadc06647c8c2e741c3ff8 Mon Sep 17 00:00:00 2001 From: Matt Mullins Date: Fri, 26 Apr 2019 11:49:50 -0700 Subject: [PATCH 15/26] tools: sync bpf.h MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ANBZ: #5530 commit 4635b0ae4d26f87cf68dbab6740955dd1ad67cf4 upstream. This adds BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE, and fixes up the error: enumeration value ‘BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE’ not handled in switch [-Werror=switch-enum] build errors it would otherwise cause in libbpf. Signed-off-by: Matt Mullins Acked-by: Yonghong Song Signed-off-by: Alexei Starovoitov Signed-off-by: Yuanhe Shu --- tools/include/uapi/linux/bpf.h | 10 +++++++++- tools/lib/bpf/libbpf.c | 1 + tools/lib/bpf/libbpf_probes.c | 1 + 3 files changed, 11 insertions(+), 1 deletion(-) diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 140e2e3f6122..763dcff87f92 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -169,6 +169,7 @@ enum bpf_prog_type { BPF_PROG_TYPE_SK_REUSEPORT, BPF_PROG_TYPE_FLOW_DISSECTOR, BPF_PROG_TYPE_CGROUP_SYSCTL, + BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE, }; enum bpf_attach_type { @@ -1738,12 +1739,19 @@ union bpf_attr { * error if an eBPF program tries to set a callback that is not * supported in the current kernel. * - * The supported callback values that *argval* can combine are: + * *argval* is a flag array which can combine these flags: * * * **BPF_SOCK_OPS_RTO_CB_FLAG** (retransmission time out) * * **BPF_SOCK_OPS_RETRANS_CB_FLAG** (retransmission) * * **BPF_SOCK_OPS_STATE_CB_FLAG** (TCP state change) * + * Therefore, this function can be used to clear a callback flag by + * setting the appropriate bit to zero. e.g. 
to disable the RTO + * callback: + * + * **bpf_sock_ops_cb_flags_set(bpf_sock,** + * **bpf_sock->bpf_sock_ops_cb_flags & ~BPF_SOCK_OPS_RTO_CB_FLAG)** + * + * Here are some examples of where one could call such eBPF + * program: + * diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 4428ebcac988..e277d20c1671 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -2300,6 +2300,7 @@ static bool bpf_prog_type__needs_kver(enum bpf_prog_type type) case BPF_PROG_TYPE_UNSPEC: case BPF_PROG_TYPE_TRACEPOINT: case BPF_PROG_TYPE_RAW_TRACEPOINT: + case BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE: case BPF_PROG_TYPE_PERF_EVENT: case BPF_PROG_TYPE_CGROUP_SYSCTL: return false; diff --git a/tools/lib/bpf/libbpf_probes.c b/tools/lib/bpf/libbpf_probes.c index 880c81228bbe..6635a31a7a16 100644 --- a/tools/lib/bpf/libbpf_probes.c +++ b/tools/lib/bpf/libbpf_probes.c @@ -95,6 +95,7 @@ probe_load(enum bpf_prog_type prog_type, const struct bpf_insn *insns, case BPF_PROG_TYPE_CGROUP_DEVICE: case BPF_PROG_TYPE_SK_MSG: case BPF_PROG_TYPE_RAW_TRACEPOINT: + case BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE: case BPF_PROG_TYPE_LWT_SEG6LOCAL: case BPF_PROG_TYPE_LIRC_MODE2: case BPF_PROG_TYPE_SK_REUSEPORT: -- Gitee From 2dd525345b6338b810d00f4b816c8754a1fee14e Mon Sep 17 00:00:00 2001 From: Matt Mullins Date: Fri, 26 Apr 2019 11:49:48 -0700 Subject: [PATCH 16/26] nbd: trace sending nbd requests ANBZ: #5530 commit ea106722c76f08002b69a6983ed84dc18958ba48 upstream. This adds a tracepoint that can both observe the nbd request being sent to the server and modify that request, e.g., setting a flag in the request that will cause the server to collect detailed tracing data. The struct request * being handled is included to permit correlation with the block tracepoints.
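For a sense of how the writable buffer is meant to be consumed, a minimal sketch of a BPF program against this tracepoint is shown below. The section prefix, the context struct, and the flag bit are assumptions for illustration only; what comes from this series is the nbd_send_request name and its writable struct nbd_request buffer at offset 0 of the context:

  #include <linux/types.h>
  #include <linux/nbd.h>
  #include "bpf_helpers.h"

  /* offset 0 of a writable raw tracepoint context is the one place the
   * verifier hands out PTR_TO_TP_BUFFER; here it points at the
   * struct nbd_request about to go out on the wire
   */
  struct nbd_send_request_args {
          struct nbd_request *req;
  };

  SEC("raw_tracepoint.w/nbd_send_request") /* assumed section prefix */
  int nbd_mark_request(struct nbd_send_request_args *ctx)
  {
          struct nbd_request *req = ctx->req;

          /* hypothetical flag bit a cooperating server could recognize;
           * any write past sizeof(struct nbd_request) would be rejected
           * at attach time
           */
          req->type |= __builtin_bswap32(1U << 16);
          return 0;
  }

  char _license[] SEC("license") = "GPL";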
Signed-off-by: Matt Mullins Reviewed-by: Josef Bacik Signed-off-by: Alexei Starovoitov Signed-off-by: Yuanhe Shu --- MAINTAINERS | 1 + drivers/block/nbd.c | 5 ++++ include/trace/events/nbd.h | 56 ++++++++++++++++++++++++++++++++++++++ 3 files changed, 62 insertions(+) create mode 100644 include/trace/events/nbd.h diff --git a/MAINTAINERS b/MAINTAINERS index 459fdf97ff75..6fedb88397cf 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -10117,6 +10117,7 @@ L: linux-block@vger.kernel.org L: nbd@other.debian.org F: Documentation/blockdev/nbd.txt F: drivers/block/nbd.c +F: include/trace/events/nbd.h F: include/uapi/linux/nbd.h NETWORK DROP MONITOR diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index 2a9138dab13e..55e07dce7fa7 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -44,6 +44,9 @@ #include <linux/nbd-netlink.h> #include <net/genetlink.h> +#define CREATE_TRACE_POINTS +#include <trace/events/nbd.h> + static DEFINE_IDR(nbd_index_idr); static DEFINE_MUTEX(nbd_index_mutex); static int nbd_total_devices = 0; @@ -529,6 +532,8 @@ static int nbd_send_cmd(struct nbd_device *nbd, struct nbd_cmd *cmd, int index) handle = nbd_cmd_handle(cmd); memcpy(request.handle, &handle, sizeof(handle)); + trace_nbd_send_request(&request, nbd->index, blk_mq_rq_from_pdu(cmd)); + dev_dbg(nbd_to_dev(nbd), "request %p: sending control (%s@%llu,%uB)\n", req, nbdcmd_to_ascii(type), (unsigned long long)blk_rq_pos(req) << 9, blk_rq_bytes(req)); diff --git a/include/trace/events/nbd.h b/include/trace/events/nbd.h new file mode 100644 index 000000000000..5928255ed02e --- /dev/null +++ b/include/trace/events/nbd.h @@ -0,0 +1,56 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM nbd + +#if !defined(_TRACE_NBD_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_NBD_H + +#include <linux/tracepoint.h> + +DECLARE_EVENT_CLASS(nbd_send_request, + + TP_PROTO(struct nbd_request *nbd_request, int index, + struct request *rq), + + TP_ARGS(nbd_request, index, rq), + + TP_STRUCT__entry( + __field(struct nbd_request *, nbd_request) + __field(u64, dev_index) + __field(struct request *, request) + ), + + TP_fast_assign( + __entry->nbd_request = 0; + __entry->dev_index = index; + __entry->request = rq; + ), + + TP_printk("nbd%lld: request %p", __entry->dev_index, __entry->request) +); + +#ifdef DEFINE_EVENT_WRITABLE +#undef NBD_DEFINE_EVENT +#define NBD_DEFINE_EVENT(template, call, proto, args, size) \ + DEFINE_EVENT_WRITABLE(template, call, PARAMS(proto), \ + PARAMS(args), size) +#else +#undef NBD_DEFINE_EVENT +#define NBD_DEFINE_EVENT(template, call, proto, args, size) \ + DEFINE_EVENT(template, call, PARAMS(proto), PARAMS(args)) +#endif + +NBD_DEFINE_EVENT(nbd_send_request, nbd_send_request, + + TP_PROTO(struct nbd_request *nbd_request, int index, + struct request *rq), + + TP_ARGS(nbd_request, index, rq), + + sizeof(struct nbd_request) +); + +#endif + +/* This part must be outside protection */ +#include <trace/define_trace.h> -- Gitee From 008bb11c78387edb73bf95824c5bc68d3440edb7 Mon Sep 17 00:00:00 2001 From: Andrew Hall Date: Fri, 26 Apr 2019 11:49:49 -0700 Subject: [PATCH 17/26] nbd: add tracepoints for send/receive timing ANBZ: #5530 commit 2abd2de712cd891321a06b0890a85aef1e506cb5 upstream. This adds four tracepoints to nbd, enabling separate tracing of payload and header sending/receipt. In the send path for headers that have already been sent, we also explicitly initialize the handle so it can be referenced by the later tracepoint.
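As a usage sketch, the paired events can be consumed from BPF by keying on the handle in args[1]; the map shape, program names, and attachment as ordinary read-only raw tracepoints are assumptions in the style of the selftests, not part of this patch:

  #include <linux/bpf.h>
  #include "bpf_helpers.h"

  /* one send-start timestamp per in-flight nbd handle */
  struct bpf_map_def SEC("maps") start_ns = {
          .type = BPF_MAP_TYPE_HASH,
          .key_size = sizeof(__u64),
          .value_size = sizeof(__u64),
          .max_entries = 1024,
  };

  SEC("raw_tracepoint/nbd_header_sent")
  int on_header_sent(struct bpf_raw_tracepoint_args *ctx)
  {
          __u64 handle = ctx->args[1]; /* second tracepoint argument */
          __u64 ts = bpf_ktime_get_ns();

          bpf_map_update_elem(&start_ns, &handle, &ts, BPF_ANY);
          return 0;
  }

  SEC("raw_tracepoint/nbd_payload_sent")
  int on_payload_sent(struct bpf_raw_tracepoint_args *ctx)
  {
          __u64 handle = ctx->args[1];
          __u64 *tsp = bpf_map_lookup_elem(&start_ns, &handle);
          char fmt[] = "nbd send %llu ns\n";

          if (tsp)
                  bpf_trace_printk(fmt, sizeof(fmt),
                                   bpf_ktime_get_ns() - *tsp);
          bpf_map_delete_elem(&start_ns, &handle);
          return 0;
  }

  char _license[] SEC("license") = "GPL";

The receive-side pair (nbd_header_received/nbd_payload_received) can be timed the same way.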
Signed-off-by: Andrew Hall Signed-off-by: Matt Mullins Reviewed-by: Josef Bacik Signed-off-by: Alexei Starovoitov Signed-off-by: Yuanhe Shu --- drivers/block/nbd.c | 8 ++++++ include/trace/events/nbd.h | 51 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 59 insertions(+) diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index 55e07dce7fa7..e6f836f1da04 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -516,6 +516,10 @@ static int nbd_send_cmd(struct nbd_device *nbd, struct nbd_cmd *cmd, int index) if (sent) { if (sent >= sizeof(request)) { skip = sent - sizeof(request); + + /* initialize handle for tracing purposes */ + handle = nbd_cmd_handle(cmd); + goto send_pages; } iov_iter_advance(&from, sent); @@ -539,6 +543,7 @@ static int nbd_send_cmd(struct nbd_device *nbd, struct nbd_cmd *cmd, int index) (unsigned long long)blk_rq_pos(req) << 9, blk_rq_bytes(req)); result = sock_xmit(nbd, index, 1, &from, (type == NBD_CMD_WRITE) ? MSG_MORE : 0, &sent); + trace_nbd_header_sent(req, handle); if (result <= 0) { if (was_interrupted(result)) { /* If we havne't sent anything we can just return BUSY, @@ -611,6 +616,7 @@ static int nbd_send_cmd(struct nbd_device *nbd, struct nbd_cmd *cmd, int index) bio = next; } out: + trace_nbd_payload_sent(req, handle); nsock->pending = NULL; nsock->sent = 0; return 0; @@ -658,6 +664,7 @@ static struct nbd_cmd *nbd_read_stat(struct nbd_device *nbd, int index) tag, req); return ERR_PTR(-ENOENT); } + trace_nbd_header_received(req, handle); cmd = blk_mq_rq_to_pdu(req); mutex_lock(&cmd->lock); @@ -717,6 +724,7 @@ static struct nbd_cmd *nbd_read_stat(struct nbd_device *nbd, int index) } } out: + trace_nbd_payload_received(req, handle); mutex_unlock(&cmd->lock); return ret ? ERR_PTR(ret) : cmd; } diff --git a/include/trace/events/nbd.h b/include/trace/events/nbd.h index 5928255ed02e..9849956f34d8 100644 --- a/include/trace/events/nbd.h +++ b/include/trace/events/nbd.h @@ -7,6 +7,57 @@ #include <linux/tracepoint.h> +DECLARE_EVENT_CLASS(nbd_transport_event, + + TP_PROTO(struct request *req, u64 handle), + + TP_ARGS(req, handle), + + TP_STRUCT__entry( + __field(struct request *, req) + __field(u64, handle) + ), + + TP_fast_assign( + __entry->req = req; + __entry->handle = handle; + ), + + TP_printk( + "nbd transport event: request %p, handle 0x%016llx", + __entry->req, + __entry->handle + ) +); + +DEFINE_EVENT(nbd_transport_event, nbd_header_sent, + + TP_PROTO(struct request *req, u64 handle), + + TP_ARGS(req, handle) +); + +DEFINE_EVENT(nbd_transport_event, nbd_payload_sent, + + TP_PROTO(struct request *req, u64 handle), + + TP_ARGS(req, handle) +); + +DEFINE_EVENT(nbd_transport_event, nbd_header_received, + + TP_PROTO(struct request *req, u64 handle), + + TP_ARGS(req, handle) +); + +DEFINE_EVENT(nbd_transport_event, nbd_payload_received, + + TP_PROTO(struct request *req, u64 handle), + + TP_ARGS(req, handle) +); + DECLARE_EVENT_CLASS(nbd_send_request, TP_PROTO(struct nbd_request *nbd_request, int index, -- Gitee From 101f34f2815533c39643040883581d3f9768560a Mon Sep 17 00:00:00 2001 From: Matt Mullins Date: Fri, 26 Apr 2019 11:49:51 -0700 Subject: [PATCH 18/26] selftests: bpf: test writable buffers in raw tps ANBZ: #5530 commit e950e843367d7990b9d7ea964e3c33876d477c4b upstream.
This tests that: * a BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE cannot be attached if it uses either: * a variable offset to the tracepoint buffer, or * an offset beyond the size of the tracepoint buffer * a tracer can modify the buffer provided when attached to a writable tracepoint in bpf_prog_test_run Signed-off-by: Matt Mullins Acked-by: Yonghong Song Signed-off-by: Alexei Starovoitov Signed-off-by: Yuanhe Shu --- include/trace/events/bpf_test_run.h | 50 +++++++++ net/bpf/test_run.c | 4 + tools/testing/selftests/bpf/test_progs.c | 117 ++++++++++++++++++++ tools/testing/selftests/bpf/test_verifier.c | 34 ++++++ 4 files changed, 205 insertions(+) create mode 100644 include/trace/events/bpf_test_run.h diff --git a/include/trace/events/bpf_test_run.h b/include/trace/events/bpf_test_run.h new file mode 100644 index 000000000000..265447e3f71a --- /dev/null +++ b/include/trace/events/bpf_test_run.h @@ -0,0 +1,50 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM bpf_test_run + +#if !defined(_TRACE_BPF_TEST_RUN_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_BPF_TEST_RUN_H + +#include <linux/tracepoint.h> + +DECLARE_EVENT_CLASS(bpf_test_finish, + + TP_PROTO(int *err), + + TP_ARGS(err), + + TP_STRUCT__entry( + __field(int, err) + ), + + TP_fast_assign( + __entry->err = *err; + ), + + TP_printk("bpf_test_finish with err=%d", __entry->err) +); + +#ifdef DEFINE_EVENT_WRITABLE +#undef BPF_TEST_RUN_DEFINE_EVENT +#define BPF_TEST_RUN_DEFINE_EVENT(template, call, proto, args, size) \ + DEFINE_EVENT_WRITABLE(template, call, PARAMS(proto), \ + PARAMS(args), size) +#else +#undef BPF_TEST_RUN_DEFINE_EVENT +#define BPF_TEST_RUN_DEFINE_EVENT(template, call, proto, args, size) \ + DEFINE_EVENT(template, call, PARAMS(proto), PARAMS(args)) +#endif + +BPF_TEST_RUN_DEFINE_EVENT(bpf_test_finish, bpf_test_finish, + + TP_PROTO(int *err), + + TP_ARGS(err), + + sizeof(int) +); + +#endif + +/* This part must be outside protection */ +#include <trace/define_trace.h> diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c index 8c9331913753..33e0dc168c16 100644 --- a/net/bpf/test_run.c +++ b/net/bpf/test_run.c @@ -14,6 +14,9 @@ #include <net/sock.h> #include <net/tcp.h> +#define CREATE_TRACE_POINTS +#include <trace/events/bpf_test_run.h> + static int bpf_test_run(struct bpf_prog *prog, void *ctx, u32 repeat, u32 *retval, u32 *time) { @@ -101,6 +104,7 @@ static int bpf_test_finish(const union bpf_attr *kattr, if (err != -ENOSPC) err = 0; out: + trace_bpf_test_finish(&err); return err; } diff --git a/tools/testing/selftests/bpf/test_progs.c b/tools/testing/selftests/bpf/test_progs.c index 9a598f602acd..f1ed62dda796 100644 --- a/tools/testing/selftests/bpf/test_progs.c +++ b/tools/testing/selftests/bpf/test_progs.c @@ -27,6 +27,7 @@ typedef __u16 __sum16; #include #include #include +#include <linux/nbd.h> #include #include @@ -2617,6 +2618,120 @@ static void test_flow_dissector_load_bytes(void) close(fd); } +static void test_raw_tp_writable_reject_nbd_invalid(void) +{ + __u32 duration = 0; + char error[4096]; + int bpf_fd = -1, tp_fd = -1; + + const struct bpf_insn program[] = { + /* r6 is our tp buffer */ + BPF_LDX_MEM(BPF_DW, BPF_REG_6, BPF_REG_1, 0), + /* one byte beyond the end of the nbd_request struct */ + BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_6, + sizeof(struct nbd_request)), + BPF_EXIT_INSN(), + }; + + struct bpf_load_program_attr load_attr = { + .prog_type = BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE, + .license = "GPL v2", + .insns = program, + .insns_cnt = sizeof(program) / sizeof(struct bpf_insn), + .log_level = 2, + }; + + bpf_fd = bpf_load_program_xattr(&load_attr, error, sizeof(error)); +
if (CHECK(bpf_fd < 0, "bpf_raw_tracepoint_writable load", + "failed: %d errno %d\n", bpf_fd, errno)) + return; + + tp_fd = bpf_raw_tracepoint_open("nbd_send_request", bpf_fd); + if (CHECK(tp_fd >= 0, "bpf_raw_tracepoint_writable open", + "erroneously succeeded\n")) + goto out_bpffd; + + close(tp_fd); +out_bpffd: + close(bpf_fd); +} + +static void test_raw_tp_writable_test_run(void) +{ + __u32 duration = 0; + char error[4096]; + + const struct bpf_insn trace_program[] = { + BPF_LDX_MEM(BPF_DW, BPF_REG_6, BPF_REG_1, 0), + BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_6, 0), + BPF_MOV64_IMM(BPF_REG_0, 42), + BPF_STX_MEM(BPF_W, BPF_REG_6, BPF_REG_0, 0), + BPF_EXIT_INSN(), + }; + + struct bpf_load_program_attr load_attr = { + .prog_type = BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE, + .license = "GPL v2", + .insns = trace_program, + .insns_cnt = sizeof(trace_program) / sizeof(struct bpf_insn), + .log_level = 2, + }; + + int bpf_fd = bpf_load_program_xattr(&load_attr, error, sizeof(error)); + if (CHECK(bpf_fd < 0, "bpf_raw_tracepoint_writable loaded", + "failed: %d errno %d\n", bpf_fd, errno)) + return; + + const struct bpf_insn skb_program[] = { + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }; + + struct bpf_load_program_attr skb_load_attr = { + .prog_type = BPF_PROG_TYPE_SOCKET_FILTER, + .license = "GPL v2", + .insns = skb_program, + .insns_cnt = sizeof(skb_program) / sizeof(struct bpf_insn), + }; + + int filter_fd = + bpf_load_program_xattr(&skb_load_attr, error, sizeof(error)); + if (CHECK(filter_fd < 0, "test_program_loaded", "failed: %d errno %d\n", + filter_fd, errno)) + goto out_bpffd; + + int tp_fd = bpf_raw_tracepoint_open("bpf_test_finish", bpf_fd); + if (CHECK(tp_fd < 0, "bpf_raw_tracepoint_writable opened", + "failed: %d errno %d\n", tp_fd, errno)) + goto out_filterfd; + + char test_skb[128] = { + 0, + }; + + __u32 prog_ret; + int err = bpf_prog_test_run(filter_fd, 1, test_skb, sizeof(test_skb), 0, + 0, &prog_ret, 0); + CHECK(err != 42, "test_run", + "tracepoint did not modify return value\n"); + CHECK(prog_ret != 0, "test_run_ret", + "socket_filter did not return 0\n"); + + close(tp_fd); + + err = bpf_prog_test_run(filter_fd, 1, test_skb, sizeof(test_skb), 0, 0, + &prog_ret, 0); + CHECK(err != 0, "test_run_notrace", + "test_run failed with %d errno %d\n", err, errno); + CHECK(prog_ret != 0, "test_run_ret_notrace", + "socket_filter did not return 0\n"); + +out_filterfd: + close(filter_fd); +out_bpffd: + close(bpf_fd); +} + int main(int ac, char **av) { srand(time(NULL)); @@ -2655,6 +2770,8 @@ int main(int ac, char **av) test_signal_pending(BPF_PROG_TYPE_FLOW_DISSECTOR); test_bpf_verif_scale(); test_global_data(); + test_raw_tp_writable_reject_nbd_invalid(); + test_raw_tp_writable_test_run(); printf("Summary: %d PASSED, %d FAILED\n", pass_cnt, error_cnt); return error_cnt ? 
EXIT_FAILURE : EXIT_SUCCESS; diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c index eeea520f2348..8eb6317f4038 100644 --- a/tools/testing/selftests/bpf/test_verifier.c +++ b/tools/testing/selftests/bpf/test_verifier.c @@ -16978,6 +16978,40 @@ static struct bpf_test tests[] = { .result = ACCEPT, .retval = 2, }, + { + "raw_tracepoint_writable: reject variable offset", + .insns = { + /* r6 is our tp buffer */ + BPF_LDX_MEM(BPF_DW, BPF_REG_6, BPF_REG_1, 0), + + BPF_LD_MAP_FD(BPF_REG_1, 0), + /* move the key (== 0) to r10-8 */ + BPF_MOV32_IMM(BPF_REG_0, 0), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_STX_MEM(BPF_DW, BPF_REG_2, BPF_REG_0, 0), + /* lookup in the map */ + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, + BPF_FUNC_map_lookup_elem), + + /* exit clean if null */ + BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 1), + BPF_EXIT_INSN(), + + /* shift the buffer pointer to a variable location */ + BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_0, 0), + BPF_ALU64_REG(BPF_ADD, BPF_REG_6, BPF_REG_0), + /* clobber whatever's there */ + BPF_MOV64_IMM(BPF_REG_7, 4242), + BPF_STX_MEM(BPF_DW, BPF_REG_6, BPF_REG_7, 0), + + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .fixup_map_hash_8b = { 1, }, + .prog_type = BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE, + .errstr = "R6 invalid variable buffer offset: off=0, var_off=(0x0; 0xffffffff)", + }, }; static int probe_filter_length(const struct bpf_insn *fp) -- Gitee From eb2532e0db914ee2bbccc45736c7ea13cb5bd956 Mon Sep 17 00:00:00 2001 From: "Daniel T. Lee" Date: Fri, 12 Jul 2019 10:35:39 +0900 Subject: [PATCH 19/26] tools: bpftool: add raw_tracepoint_writable prog type to header ANBZ: #5530 commit 216b65fb706e34128a5317d71b300daac9c428c4 upstream. From commit 9df1c28bb752 ("bpf: add writable context for raw tracepoints"), a new type of BPF_PROG, RAW_TRACEPOINT_WRITABLE, has been added. Since BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE is not listed in bpftool's header, it causes a segfault when executing 'bpftool feature'. This commit adds a BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE entry to the prog_type_name enum, which fixes the segfault issue. Fixes: 9df1c28bb752 ("bpf: add writable context for raw tracepoints") Signed-off-by: Daniel T. Lee Signed-off-by: Daniel Borkmann Signed-off-by: Yuanhe Shu --- tools/bpf/bpftool/main.h | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/bpf/bpftool/main.h b/tools/bpf/bpftool/main.h index 3d63feb7f852..ce75b76d17b1 100644 --- a/tools/bpf/bpftool/main.h +++ b/tools/bpf/bpftool/main.h @@ -74,6 +74,7 @@ static const char * const prog_type_name[] = { [BPF_PROG_TYPE_SK_REUSEPORT] = "sk_reuseport", [BPF_PROG_TYPE_FLOW_DISSECTOR] = "flow_dissector", [BPF_PROG_TYPE_CGROUP_SYSCTL] = "cgroup_sysctl", + [BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE] = "raw_tracepoint_writable", }; extern const char * const map_type_name[]; -- Gitee From 24a33fce1ff4b257d9a0c03b3c8dcc243898971c Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Tue, 21 May 2019 20:14:19 -0700 Subject: [PATCH 20/26] bpf: bump jmp sequence limit ANBZ: #5530 commit b285fcb760da7aa87d6d31e6c6a4907d82d9299c upstream. The limit of 1024 subsequent jumps was causing otherwise valid programs to be rejected. Bump it to 8192 and make the error more verbose.
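To see what such a sequence looks like at the source level, here is an illustrative sketch (section name and loop bound are assumptions); each unrolled data-dependent branch forks verification, and on a straight-line walk the untaken side stays pending on the verifier's stack until an exit is reached:

  #include <linux/bpf.h>
  #include "bpf_helpers.h"

  SEC("socket")
  int long_branch_chain(void *ctx)
  {
          __u32 acc = 0;
          int i;

          /* fully unrolled: ~2000 conditional jumps in a row, over the
           * old 1024-pending-state limit but well under the new 8192 one
           */
  #pragma unroll
          for (i = 0; i < 2000; i++)
                  if (bpf_get_prandom_u32() & 1)
                          acc++;

          return acc & 1;
  }

  char _license[] SEC("license") = "GPL";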
Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Signed-off-by: Yuanhe Shu --- kernel/bpf/verifier.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 616aa8162381..5cc8113072b8 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -176,7 +176,7 @@ struct bpf_verifier_stack_elem { struct bpf_verifier_stack_elem *next; }; -#define BPF_COMPLEXITY_LIMIT_STACK 1024 +#define BPF_COMPLEXITY_LIMIT_JMP_SEQ 8192 #define BPF_COMPLEXITY_LIMIT_STATES 64 #define BPF_MAP_PTR_UNPRIV 1UL @@ -781,8 +781,9 @@ static struct bpf_verifier_state *push_stack(struct bpf_verifier_env *env, if (err) goto err; elem->st.speculative |= speculative; - if (env->stack_size > BPF_COMPLEXITY_LIMIT_STACK) { - verbose(env, "BPF program is too complex\n"); + if (env->stack_size > BPF_COMPLEXITY_LIMIT_JMP_SEQ) { + verbose(env, "The sequence of %d jumps is too complex.\n", + env->stack_size); goto err; } return &elem->st; -- Gitee From 03d0d0e8be93f53896b4c57614ead12a35bcc80b Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Tue, 21 May 2019 20:14:20 -0700 Subject: [PATCH 21/26] selftests/bpf: adjust verifier scale test ANBZ: #5530 commit 7c0c6095d48dcd0e67c917aa73cdbb2715aafc36 upstream. Adjust scale tests to check for new jmp sequence limit. BPF_JGT had to be changed to BPF_JEQ because the verifier was too smart. It tracked the known safe range of R0 values and pruned the search earlier before hitting exact 8192 limit. bpf_semi_rand_get() was too (un)?lucky. k = 0; was missing in bpf_fill_scale2. It was testing a bit shorter sequence of jumps than intended. Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Signed-off-by: Yuanhe Shu --- tools/testing/selftests/bpf/test_verifier.c | 31 +++++++++++---------- 1 file changed, 17 insertions(+), 14 deletions(-) diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c index 8eb6317f4038..3e0502af3307 100644 --- a/tools/testing/selftests/bpf/test_verifier.c +++ b/tools/testing/selftests/bpf/test_verifier.c @@ -210,33 +210,35 @@ static void bpf_fill_rand_ld_dw(struct bpf_test *self) self->retval = (uint32_t)res; } -/* test the sequence of 1k jumps */ +#define MAX_JMP_SEQ 8192 + +/* test the sequence of 8k jumps */ static void bpf_fill_scale1(struct bpf_test *self) { struct bpf_insn *insn = self->fill_insns; int i = 0, k = 0; insn[i++] = BPF_MOV64_REG(BPF_REG_6, BPF_REG_1); - /* test to check that the sequence of 1024 jumps is acceptable */ - while (k++ < 1024) { + /* test to check that the long sequence of jumps is acceptable */ + while (k++ < MAX_JMP_SEQ) { insn[i++] = BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_get_prandom_u32); - insn[i++] = BPF_JMP_IMM(BPF_JGT, BPF_REG_0, bpf_semi_rand_get(), 2); + insn[i++] = BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, bpf_semi_rand_get(), 2); insn[i++] = BPF_MOV64_REG(BPF_REG_1, BPF_REG_10); insn[i++] = BPF_STX_MEM(BPF_DW, BPF_REG_1, BPF_REG_6, -8 * (k % 64 + 1)); } - /* every jump adds 1024 steps to insn_processed, so to stay exactly - * within 1m limit add MAX_TEST_INSNS - 1025 MOVs and 1 EXIT + /* every jump adds 1 step to insn_processed, so to stay exactly + * within 1m limit add MAX_TEST_INSNS - MAX_JMP_SEQ - 1 MOVs and 1 EXIT */ - while (i < MAX_TEST_INSNS - 1025) + while (i < MAX_TEST_INSNS - MAX_JMP_SEQ - 1) insn[i++] = BPF_ALU32_IMM(BPF_MOV, BPF_REG_0, 42); insn[i] = BPF_EXIT_INSN(); self->prog_len = i + 1; self->retval = 
42; } -/* test the sequence of 1k jumps in inner most function (function depth 8)*/ +/* test the sequence of 8k jumps in inner most function (function depth 8)*/ static void bpf_fill_scale2(struct bpf_test *self) { struct bpf_insn *insn = self->fill_insns; @@ -248,19 +250,20 @@ static void bpf_fill_scale2(struct bpf_test *self) insn[i++] = BPF_EXIT_INSN(); } insn[i++] = BPF_MOV64_REG(BPF_REG_6, BPF_REG_1); - /* test to check that the sequence of 1024 jumps is acceptable */ - while (k++ < 1024) { + /* test to check that the long sequence of jumps is acceptable */ + k = 0; + while (k++ < MAX_JMP_SEQ) { insn[i++] = BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_get_prandom_u32); - insn[i++] = BPF_JMP_IMM(BPF_JGT, BPF_REG_0, bpf_semi_rand_get(), 2); + insn[i++] = BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, bpf_semi_rand_get(), 2); insn[i++] = BPF_MOV64_REG(BPF_REG_1, BPF_REG_10); insn[i++] = BPF_STX_MEM(BPF_DW, BPF_REG_1, BPF_REG_6, -8 * (k % (64 - 4 * FUNC_NEST) + 1)); } - /* every jump adds 1024 steps to insn_processed, so to stay exactly - * within 1m limit add MAX_TEST_INSNS - 1025 MOVs and 1 EXIT + /* every jump adds 1 step to insn_processed, so to stay exactly + * within 1m limit add MAX_TEST_INSNS - MAX_JMP_SEQ - 1 MOVs and 1 EXIT */ - while (i < MAX_TEST_INSNS - 1025) + while (i < MAX_TEST_INSNS - MAX_JMP_SEQ - 1) insn[i++] = BPF_ALU32_IMM(BPF_MOV, BPF_REG_0, 42); insn[i] = BPF_EXIT_INSN(); self->prog_len = i + 1; -- Gitee From becd6a2f7812d8d74a8d7d33724cd3fcb79f784e Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Tue, 21 May 2019 20:14:21 -0700 Subject: [PATCH 22/26] selftests/bpf: add pyperf scale test ANBZ: #5530 commit 7c9441066ab53168093c79477aabd575f7c14129 upstream. Add a snippet of pyperf bpf program used to collect python stack traces as a scale test for the verifier. At 189 loop iterations llvm 9.0 starts ignoring '#pragma unroll' and generates partially unrolled loop instead. Hence use 50, 100, and 180 loop iterations to stress test. 
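The construct being stress tested boils down to a compile-time-bounded loop the program needs llvm to unroll fully, roughly as in the sketch below (names and the array-based walk are illustrative; pyperf chases f_back pointers instead, but the unrolling question is the same):

  #define STACK_MAX_LEN 180 /* the 50/100/180 variants just change this */

  /* walk a NULL-terminated prefix of at most STACK_MAX_LEN frames */
  static inline __attribute__((__always_inline__))
  int walk_frames(void * const *frames)
  {
          int depth = 0, i;

  #pragma unroll
          for (i = 0; i < STACK_MAX_LEN; ++i) {
                  if (!frames[i])
                          break;
                  depth++;
          }
          return depth;
  }

If llvm only partially unrolls the loop, a back-edge survives into the object file and the verifier of this era rejects the program, which is why the test pins three separate iteration counts rather than one large one.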
Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Signed-off-by: Yuanhe Shu --- tools/testing/selftests/bpf/progs/pyperf.h | 268 ++++++++++++++++++ tools/testing/selftests/bpf/progs/pyperf100.c | 4 + tools/testing/selftests/bpf/progs/pyperf180.c | 4 + tools/testing/selftests/bpf/progs/pyperf50.c | 4 + tools/testing/selftests/bpf/test_progs.c | 31 +- 5 files changed, 298 insertions(+), 13 deletions(-) create mode 100644 tools/testing/selftests/bpf/progs/pyperf.h create mode 100644 tools/testing/selftests/bpf/progs/pyperf100.c create mode 100644 tools/testing/selftests/bpf/progs/pyperf180.c create mode 100644 tools/testing/selftests/bpf/progs/pyperf50.c diff --git a/tools/testing/selftests/bpf/progs/pyperf.h b/tools/testing/selftests/bpf/progs/pyperf.h new file mode 100644 index 000000000000..0cc5e4ee90bd --- /dev/null +++ b/tools/testing/selftests/bpf/progs/pyperf.h @@ -0,0 +1,268 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (c) 2019 Facebook +#include <linux/sched.h> +#include <linux/ptrace.h> +#include <stdint.h> +#include <stddef.h> +#include <stdbool.h> +#include <linux/bpf.h> +#include "bpf_helpers.h" + +#define FUNCTION_NAME_LEN 64 +#define FILE_NAME_LEN 128 +#define TASK_COMM_LEN 16 + +typedef struct { + int PyThreadState_frame; + int PyThreadState_thread; + int PyFrameObject_back; + int PyFrameObject_code; + int PyFrameObject_lineno; + int PyCodeObject_filename; + int PyCodeObject_name; + int String_data; + int String_size; +} OffsetConfig; + +typedef struct { + uintptr_t current_state_addr; + uintptr_t tls_key_addr; + OffsetConfig offsets; + bool use_tls; +} PidData; + +typedef struct { + uint32_t success; +} Stats; + +typedef struct { + char name[FUNCTION_NAME_LEN]; + char file[FILE_NAME_LEN]; +} Symbol; + +typedef struct { + uint32_t pid; + uint32_t tid; + char comm[TASK_COMM_LEN]; + int32_t kernel_stack_id; + int32_t user_stack_id; + bool thread_current; + bool pthread_match; + bool stack_complete; + int16_t stack_len; + int32_t stack[STACK_MAX_LEN]; + + int has_meta; + int metadata; + char dummy_safeguard; +} Event; + + +struct bpf_elf_map { + __u32 type; + __u32 size_key; + __u32 size_value; + __u32 max_elem; + __u32 flags; +}; + +typedef int pid_t; + +typedef struct { + void* f_back; // PyFrameObject.f_back, previous frame + void* f_code; // PyFrameObject.f_code, pointer to PyCodeObject + void* co_filename; // PyCodeObject.co_filename + void* co_name; // PyCodeObject.co_name +} FrameData; + +static inline __attribute__((__always_inline__)) void* +get_thread_state(void* tls_base, PidData* pidData) +{ + void* thread_state; + int key; + + bpf_probe_read(&key, sizeof(key), (void*)(long)pidData->tls_key_addr); + bpf_probe_read(&thread_state, sizeof(thread_state), + tls_base + 0x310 + key * 0x10 + 0x08); + return thread_state; +} + +static inline __attribute__((__always_inline__)) bool +get_frame_data(void* frame_ptr, PidData* pidData, FrameData* frame, Symbol* symbol) +{ + // read data from PyFrameObject + bpf_probe_read(&frame->f_back, + sizeof(frame->f_back), + frame_ptr + pidData->offsets.PyFrameObject_back); + bpf_probe_read(&frame->f_code, + sizeof(frame->f_code), + frame_ptr + pidData->offsets.PyFrameObject_code); + + // read data from PyCodeObject + if (!frame->f_code) + return false; + bpf_probe_read(&frame->co_filename, + sizeof(frame->co_filename), + frame->f_code + pidData->offsets.PyCodeObject_filename); + bpf_probe_read(&frame->co_name, + sizeof(frame->co_name), + frame->f_code + pidData->offsets.PyCodeObject_name); + // read actual names into symbol + if (frame->co_filename) +
bpf_probe_read_str(&symbol->file, + sizeof(symbol->file), + frame->co_filename + pidData->offsets.String_data); + if (frame->co_name) + bpf_probe_read_str(&symbol->name, + sizeof(symbol->name), + frame->co_name + pidData->offsets.String_data); + return true; +} + +struct bpf_elf_map SEC("maps") pidmap = { + .type = BPF_MAP_TYPE_HASH, + .size_key = sizeof(int), + .size_value = sizeof(PidData), + .max_elem = 1, +}; + +struct bpf_elf_map SEC("maps") eventmap = { + .type = BPF_MAP_TYPE_HASH, + .size_key = sizeof(int), + .size_value = sizeof(Event), + .max_elem = 1, +}; + +struct bpf_elf_map SEC("maps") symbolmap = { + .type = BPF_MAP_TYPE_HASH, + .size_key = sizeof(Symbol), + .size_value = sizeof(int), + .max_elem = 1, +}; + +struct bpf_elf_map SEC("maps") statsmap = { + .type = BPF_MAP_TYPE_ARRAY, + .size_key = sizeof(Stats), + .size_value = sizeof(int), + .max_elem = 1, +}; + +struct bpf_elf_map SEC("maps") perfmap = { + .type = BPF_MAP_TYPE_PERF_EVENT_ARRAY, + .size_key = sizeof(int), + .size_value = sizeof(int), + .max_elem = 32, +}; + +struct bpf_elf_map SEC("maps") stackmap = { + .type = BPF_MAP_TYPE_STACK_TRACE, + .size_key = sizeof(int), + .size_value = sizeof(long long) * 127, + .max_elem = 1000, +}; + +static inline __attribute__((__always_inline__)) int __on_event(struct pt_regs *ctx) +{ + uint64_t pid_tgid = bpf_get_current_pid_tgid(); + pid_t pid = (pid_t)(pid_tgid >> 32); + PidData* pidData = bpf_map_lookup_elem(&pidmap, &pid); + if (!pidData) + return 0; + + int zero = 0; + Event* event = bpf_map_lookup_elem(&eventmap, &zero); + if (!event) + return 0; + + event->pid = pid; + + event->tid = (pid_t)pid_tgid; + bpf_get_current_comm(&event->comm, sizeof(event->comm)); + + event->user_stack_id = bpf_get_stackid(ctx, &stackmap, BPF_F_USER_STACK); + event->kernel_stack_id = bpf_get_stackid(ctx, &stackmap, 0); + + void* thread_state_current = (void*)0; + bpf_probe_read(&thread_state_current, + sizeof(thread_state_current), + (void*)(long)pidData->current_state_addr); + + struct task_struct* task = (struct task_struct*)bpf_get_current_task(); + void* tls_base = (void*)task; + + void* thread_state = pidData->use_tls ? 
get_thread_state(tls_base, pidData) + : thread_state_current; + event->thread_current = thread_state == thread_state_current; + + if (pidData->use_tls) { + uint64_t pthread_created; + uint64_t pthread_self; + bpf_probe_read(&pthread_self, sizeof(pthread_self), tls_base + 0x10); + + bpf_probe_read(&pthread_created, + sizeof(pthread_created), + thread_state + pidData->offsets.PyThreadState_thread); + event->pthread_match = pthread_created == pthread_self; + } else { + event->pthread_match = 1; + } + + if (event->pthread_match || !pidData->use_tls) { + void* frame_ptr; + FrameData frame; + Symbol sym = {}; + int cur_cpu = bpf_get_smp_processor_id(); + + bpf_probe_read(&frame_ptr, + sizeof(frame_ptr), + thread_state + pidData->offsets.PyThreadState_frame); + + int32_t* symbol_counter = bpf_map_lookup_elem(&symbolmap, &sym); + if (symbol_counter == NULL) + return 0; +#pragma unroll + /* Unwind python stack */ + for (int i = 0; i < STACK_MAX_LEN; ++i) { + if (frame_ptr && get_frame_data(frame_ptr, pidData, &frame, &sym)) { + int32_t new_symbol_id = *symbol_counter * 64 + cur_cpu; + int32_t *symbol_id = bpf_map_lookup_elem(&symbolmap, &sym); + if (!symbol_id) { + bpf_map_update_elem(&symbolmap, &sym, &zero, 0); + symbol_id = bpf_map_lookup_elem(&symbolmap, &sym); + if (!symbol_id) + return 0; + } + if (*symbol_id == new_symbol_id) + (*symbol_counter)++; + event->stack[i] = *symbol_id; + event->stack_len = i + 1; + frame_ptr = frame.f_back; + } + } + event->stack_complete = frame_ptr == NULL; + } else { + event->stack_complete = 1; + } + + Stats* stats = bpf_map_lookup_elem(&statsmap, &zero); + if (stats) + stats->success++; + + event->has_meta = 0; + bpf_perf_event_output(ctx, &perfmap, 0, event, offsetof(Event, metadata)); + return 0; +} + +SEC("raw_tracepoint/kfree_skb") +int on_event(struct pt_regs* ctx) +{ + int i, ret = 0; + ret |= __on_event(ctx); + ret |= __on_event(ctx); + ret |= __on_event(ctx); + ret |= __on_event(ctx); + ret |= __on_event(ctx); + return ret; +} + +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/pyperf100.c b/tools/testing/selftests/bpf/progs/pyperf100.c new file mode 100644 index 000000000000..29786325db54 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/pyperf100.c @@ -0,0 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (c) 2019 Facebook +#define STACK_MAX_LEN 100 +#include "pyperf.h" diff --git a/tools/testing/selftests/bpf/progs/pyperf180.c b/tools/testing/selftests/bpf/progs/pyperf180.c new file mode 100644 index 000000000000..c39f559d3100 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/pyperf180.c @@ -0,0 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (c) 2019 Facebook +#define STACK_MAX_LEN 180 +#include "pyperf.h" diff --git a/tools/testing/selftests/bpf/progs/pyperf50.c b/tools/testing/selftests/bpf/progs/pyperf50.c new file mode 100644 index 000000000000..ef7ce340a292 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/pyperf50.c @@ -0,0 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (c) 2019 Facebook +#define STACK_MAX_LEN 50 +#include "pyperf.h" diff --git a/tools/testing/selftests/bpf/test_progs.c b/tools/testing/selftests/bpf/test_progs.c index f1ed62dda796..ab69a38ff8bd 100644 --- a/tools/testing/selftests/bpf/test_progs.c +++ b/tools/testing/selftests/bpf/test_progs.c @@ -2381,7 +2381,7 @@ static int libbpf_debug_print_verifier_scale(enum libbpf_print_level level, return vfprintf(stderr, "%s", args); } -static int check_load(const char *file) +static int 
check_load(const char *file, enum bpf_prog_type type) { struct bpf_prog_load_attr attr; struct bpf_object *obj; @@ -2389,7 +2389,7 @@ static int check_load(const char *file) memset(&attr, 0, sizeof(struct bpf_prog_load_attr)); attr.file = file; - attr.prog_type = BPF_PROG_TYPE_SCHED_CLS; + attr.prog_type = type; attr.log_level = 4; err = bpf_prog_load_xattr(&attr, &obj, &prog_fd); bpf_object__close(obj); @@ -2400,21 +2400,26 @@ void test_bpf_verif_scale(void) { - const char *file1 = "./test_verif_scale1.o"; - const char *file2 = "./test_verif_scale2.o"; - const char *file3 = "./test_verif_scale3.o"; - int err; + const char *scale[] = { + "./test_verif_scale1.o", "./test_verif_scale2.o", "./test_verif_scale3.o" + }; + const char *pyperf[] = { + "./pyperf50.o", "./pyperf100.o", "./pyperf180.o" + }; + int err, i; if (verifier_stats) libbpf_set_print(libbpf_debug_print_verifier_scale); - err = check_load(file1); - err |= check_load(file2); - err |= check_load(file3); - if (!err) - printf("test_verif_scale:OK\n"); - else - printf("test_verif_scale:FAIL\n"); + for (i = 0; i < ARRAY_SIZE(scale); i++) { + err = check_load(scale[i], BPF_PROG_TYPE_SCHED_CLS); + printf("test_scale:%s:%s\n", scale[i], err ? "FAIL" : "OK"); + } + + for (i = 0; i < ARRAY_SIZE(pyperf); i++) { + err = check_load(pyperf[i], BPF_PROG_TYPE_RAW_TRACEPOINT); + printf("test_scale:%s:%s\n", pyperf[i], err ? "FAIL" : "OK"); + } } static void test_global_data_number(struct bpf_object *obj, __u32 duration) -- Gitee From a6d17e3d018136f505d893dff244c265f1257327 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Tue, 21 May 2019 20:17:05 -0700 Subject: [PATCH 23/26] bpf: cleanup explored_states ANBZ: #5530 commit 5d839021675a2e1b76653189cc6a90cfd8e30a69 upstream. clean up explored_states to prep for introduction of hashtable. No functional changes.
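Why bother with an accessor that only wraps an array index? A simplified sketch with stand-in types (not the kernel structs) shows how the follow-up conversion then lands entirely inside the helper, leaving call sites untouched:

  struct state_list { struct state_list *next; };

  struct env {
          struct state_list **explored_states;
          unsigned int prog_len;
  };

  /* this patch: same direct indexing as before, behind a helper */
  static struct state_list **explored_state(struct env *env, int idx)
  {
          return &env->explored_states[idx];
  }

  /* later patch in this series: identical call sites, hashed bucket */
  static struct state_list **explored_state_hashed(struct env *env, int idx,
                                                   unsigned int callsite)
  {
          return &env->explored_states[(idx ^ callsite) % env->prog_len];
  }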
Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Signed-off-by: Yuanhe Shu --- kernel/bpf/verifier.c | 29 ++++++++++++++++++++--------- 1 file changed, 20 insertions(+), 9 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 5cc8113072b8..6d155fd94061 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5605,6 +5605,17 @@ enum { }; #define STATE_LIST_MARK ((struct bpf_verifier_state_list *) -1L) +static struct bpf_verifier_state_list **explored_state( + struct bpf_verifier_env *env, + int idx) +{ + return &env->explored_states[idx]; +} + +static void init_explored_state(struct bpf_verifier_env *env, int idx) +{ + env->explored_states[idx] = STATE_LIST_MARK; +} /* t, w, e - match pseudo-code above: * t - index of current instruction @@ -5630,7 +5641,7 @@ static int push_insn(int t, int w, int e, struct bpf_verifier_env *env) if (e == BRANCH) /* mark branch target for state pruning */ - env->explored_states[w] = STATE_LIST_MARK; + init_explored_state(env, w); if (insn_state[w] == 0) { /* tree-edge */ @@ -5698,9 +5709,9 @@ static int check_cfg(struct bpf_verifier_env *env) else if (ret < 0) goto err_free; if (t + 1 < insn_cnt) - env->explored_states[t + 1] = STATE_LIST_MARK; + init_explored_state(env, t + 1); if (insns[t].src_reg == BPF_PSEUDO_CALL) { - env->explored_states[t] = STATE_LIST_MARK; + init_explored_state(env, t); ret = push_insn(t, t + insns[t].imm + 1, BRANCH, env); if (ret == 1) goto peek_stack; @@ -5723,10 +5734,10 @@ static int check_cfg(struct bpf_verifier_env *env) * after every call and jump */ if (t + 1 < insn_cnt) - env->explored_states[t + 1] = STATE_LIST_MARK; + init_explored_state(env, t + 1); } else { /* conditional jump with two edges */ - env->explored_states[t] = STATE_LIST_MARK; + init_explored_state(env, t); ret = push_insn(t, t + 1, FALLTHROUGH, env); if (ret == 1) goto peek_stack; @@ -6174,7 +6185,7 @@ static void clean_live_states(struct bpf_verifier_env *env, int insn, struct bpf_verifier_state_list *sl; int i; - sl = env->explored_states[insn]; + sl = *explored_state(env, insn); if (!sl) return; @@ -6533,7 +6544,7 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) struct bpf_verifier_state *cur = env->cur_state, *new; int i, j, err, states_cnt = 0; - pprev = &env->explored_states[insn_idx]; + pprev = explored_state(env, insn_idx); sl = *pprev; if (!sl) @@ -6620,8 +6631,8 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) kfree(new_sl); return err; } - new_sl->next = env->explored_states[insn_idx]; - env->explored_states[insn_idx] = new_sl; + new_sl->next = *explored_state(env, insn_idx); + *explored_state(env, insn_idx) = new_sl; /* connect new state to parentage chain. Current frame needs all * registers connected. Only r6 - r9 of the callers are alive (pushed * to the stack implicitly by JITs) so in callers' frames connect just -- Gitee From 59ac6489278b5d8893bcdff78feaab586cfed4f0 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Tue, 21 May 2019 20:17:06 -0700 Subject: [PATCH 24/26] bpf: split explored_states ANBZ: #5530 commit a8f500af0ccffc3d2aaf9018537981cb173865a1 upstream. split explored_states into prune_point boolean mark and link list of explored states. This removes STATE_LIST_MARK hack and allows marks to be separate from states. 
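Reduced to stand-in types, the split looks like the sketch below; the old sentinel overloaded one pointer to mean both "this insn is a prune point" and "these states were explored here", which every list walk had to special-case:

  #include <stdbool.h>
  #include <stddef.h>

  struct state_list { struct state_list *next; };

  #define STATE_LIST_MARK ((struct state_list *)-1L)

  /* old: lists are terminated by the sentinel, not NULL */
  static int count_states_old(struct state_list *sl)
  {
          int n = 0;

          if (!sl)
                  return 0; /* not a prune point */
          while (sl != STATE_LIST_MARK) {
                  n++;
                  sl = sl->next;
          }
          return n;
  }

  /* new: a per-insn flag marks prune points, the list holds only states */
  struct insn_aux { bool prune_point; };

  static int count_states_new(const struct insn_aux *aux, int idx,
                              struct state_list *sl)
  {
          int n = 0;

          if (!aux[idx].prune_point)
                  return 0;
          while (sl) {
                  n++;
                  sl = sl->next;
          }
          return n;
  }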
Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Signed-off-by: Yuanhe Shu --- include/linux/bpf_verifier.h | 1 + kernel/bpf/verifier.c | 31 +++++++++++++------------------ 2 files changed, 14 insertions(+), 18 deletions(-) diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 346a1dc18218..e0c8717c776f 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -234,6 +234,7 @@ struct bpf_insn_aux_data { int sanitize_stack_off; /* stack slot to be cleared */ bool seen; /* this insn was processed by the verifier */ u8 alu_state; /* used in combination with alu_limit */ + bool prune_point; unsigned int orig_idx; /* original instruction index */ }; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 6d155fd94061..85975564943e 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5604,7 +5604,6 @@ enum { BRANCH = 2, }; -#define STATE_LIST_MARK ((struct bpf_verifier_state_list *) -1L) static struct bpf_verifier_state_list **explored_state( struct bpf_verifier_env *env, int idx) @@ -5614,7 +5613,7 @@ static struct bpf_verifier_state_list **explored_state( static void init_explored_state(struct bpf_verifier_env *env, int idx) { - env->explored_states[idx] = STATE_LIST_MARK; + env->insn_aux_data[idx].prune_point = true; } /* t, w, e - match pseudo-code above: @@ -6186,10 +6185,7 @@ static void clean_live_states(struct bpf_verifier_env *env, int insn, int i; sl = *explored_state(env, insn); - if (!sl) - return; - - while (sl != STATE_LIST_MARK) { + while (sl) { if (sl->state.curframe != cur->curframe) goto next; for (i = 0; i <= cur->curframe; i++) @@ -6544,18 +6540,18 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) struct bpf_verifier_state *cur = env->cur_state, *new; int i, j, err, states_cnt = 0; - pprev = explored_state(env, insn_idx); - sl = *pprev; - - if (!sl) + if (!env->insn_aux_data[insn_idx].prune_point) /* this 'insn_idx' instruction wasn't marked, so we will not * be doing state search here */ return 0; + pprev = explored_state(env, insn_idx); + sl = *pprev; + clean_live_states(env, insn_idx, cur); - while (sl != STATE_LIST_MARK) { + while (sl) { if (states_equal(env, &sl->state, cur)) { sl->hit_cnt++; /* reached equivalent register/stack state, @@ -8318,13 +8314,12 @@ static void free_states(struct bpf_verifier_env *env) for (i = 0; i < env->prog->len; i++) { sl = env->explored_states[i]; - if (sl) - while (sl != STATE_LIST_MARK) { - sln = sl->next; - free_verifier_state(&sl->state, false); - kfree(sl); - sl = sln; - } + while (sl) { + sln = sl->next; + free_verifier_state(&sl->state, false); + kfree(sl); + sl = sln; + } } kvfree(env->explored_states); -- Gitee From 9a98728a085838a2674faa53670f0f190683d892 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Tue, 21 May 2019 20:17:07 -0700 Subject: [PATCH 25/26] bpf: convert explored_states to hash table ANBZ: #5530 commit dc2a4ebc0b44a212fcf72242210e56aa17e7317b upstream. All prune points inside a callee bpf function most likely will have different callsites. For example, if function foo() is called from two callsites the half of explored states in all prune points in foo() will be useless for subsequent walking of one of those callsites. Fortunately explored_states pruning heuristics keeps the number of states per prune point small, but walking these states is still a waste of cpu time when the callsite of the current state is different from the callsite of the explored state. 
To improve pruning logic convert explored_states into hash table and use simple insn_idx ^ callsite hash to select hash bucket. This optimization has no effect on programs without bpf2bpf calls and drastically improves programs with calls. In the later case it reduces total memory consumption in 1M scale tests by almost 3 times (peak_states drops from 5752 to 2016). Care should be taken when comparing the states for equivalency. Since the same hash bucket can now contain states with different indices the insn_idx has to be part of verifier_state and compared. Different hash table sizes and different hash functions were explored, but the results were not significantly better vs this patch. They can be improved in the future. Hit/miss heuristic is not counting index miscompare as a miss. Otherwise verifier stats become unstable when experimenting with different hash functions. Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Signed-off-by: Yuanhe Shu --- include/linux/bpf_verifier.h | 1 + kernel/bpf/verifier.c | 23 ++++++++++++++++++----- 2 files changed, 19 insertions(+), 5 deletions(-) diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index e0c8717c776f..e590137e7f27 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -187,6 +187,7 @@ struct bpf_func_state { struct bpf_verifier_state { /* call stack tracking */ struct bpf_func_state *frame[MAX_CALL_FRAMES]; + u32 insn_idx; u32 curframe; u32 active_spin_lock; bool speculative; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 85975564943e..bff1e01282e7 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5604,11 +5604,19 @@ enum { BRANCH = 2, }; +static u32 state_htab_size(struct bpf_verifier_env *env) +{ + return env->prog->len; +} + static struct bpf_verifier_state_list **explored_state( struct bpf_verifier_env *env, int idx) { - return &env->explored_states[idx]; + struct bpf_verifier_state *cur = env->cur_state; + struct bpf_func_state *state = cur->frame[cur->curframe]; + + return &env->explored_states[(idx ^ state->callsite) % state_htab_size(env)]; } static void init_explored_state(struct bpf_verifier_env *env, int idx) @@ -6186,7 +6194,8 @@ static void clean_live_states(struct bpf_verifier_env *env, int insn, sl = *explored_state(env, insn); while (sl) { - if (sl->state.curframe != cur->curframe) + if (sl->state.insn_idx != insn || + sl->state.curframe != cur->curframe) goto next; for (i = 0; i <= cur->curframe; i++) if (sl->state.frame[i]->callsite != cur->frame[i]->callsite) @@ -6552,6 +6561,9 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) clean_live_states(env, insn_idx, cur); while (sl) { + states_cnt++; + if (sl->state.insn_idx != insn_idx) + goto next; if (states_equal(env, &sl->state, cur)) { sl->hit_cnt++; /* reached equivalent register/stack state, @@ -6569,7 +6581,6 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) return err; return 1; } - states_cnt++; sl->miss_cnt++; /* heuristic to determine whether this state is beneficial * to keep checking from state equivalence point of view. 
@@ -6596,6 +6607,7 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) sl = *pprev; continue; } +next: pprev = &sl->next; sl = *pprev; } @@ -6627,6 +6639,7 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) kfree(new_sl); return err; } + new->insn_idx = insn_idx; new_sl->next = *explored_state(env, insn_idx); *explored_state(env, insn_idx) = new_sl; /* connect new state to parentage chain. Current frame needs all @@ -8311,7 +8324,7 @@ static void free_states(struct bpf_verifier_env *env) if (!env->explored_states) return; - for (i = 0; i < env->prog->len; i++) { + for (i = 0; i < state_htab_size(env); i++) { sl = env->explored_states[i]; while (sl) { @@ -8419,7 +8432,7 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, goto skip_full_check; } - env->explored_states = kvcalloc(env->prog->len, + env->explored_states = kvcalloc(state_htab_size(env), sizeof(struct bpf_verifier_state_list *), GFP_USER); ret = -ENOMEM; -- Gitee From 32a89df0c03a6e8361806bd1a814beb520c7d166 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Wed, 22 May 2019 10:51:28 -0700 Subject: [PATCH 26/26] libbpf: emit diff of mismatched public API, if any ANBZ: #5530 commit 9efc7794496d531593751a3fb008030d5f9ba87c upstream. It's easy to have a mismatch of "intended to be public" vs really exposed API functions. While Makefile does check for this mismatch, if it actually occurs it's not trivial to determine which functions are accidentally exposed. This patch dumps out a diff showing what's not supposed to be exposed facilitating easier fixing. Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Signed-off-by: Yuanhe Shu --- tools/lib/bpf/Makefile | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/tools/lib/bpf/Makefile b/tools/lib/bpf/Makefile index 079e3f2b1560..23879feef1e8 100644 --- a/tools/lib/bpf/Makefile +++ b/tools/lib/bpf/Makefile @@ -204,6 +204,16 @@ check_abi: $(OUTPUT)libbpf.so "versioned symbols in $^ ($(VERSIONED_SYM_COUNT))." \ "Please make sure all LIBBPF_API symbols are" \ "versioned in $(VERSION_SCRIPT)." >&2; \ + readelf -s --wide $(OUTPUT)libbpf-in.o | \ + awk '/GLOBAL/ && /DEFAULT/ && !/UND/ {print $$8}'| \ + sort -u > $(OUTPUT)libbpf_global_syms.tmp; \ + readelf -s --wide $(OUTPUT)libbpf.so | \ + grep -Eo '[^ ]+@LIBBPF_' | cut -d@ -f1 | \ + sort -u > $(OUTPUT)libbpf_versioned_syms.tmp; \ + diff -u $(OUTPUT)libbpf_global_syms.tmp \ + $(OUTPUT)libbpf_versioned_syms.tmp; \ + rm $(OUTPUT)libbpf_global_syms.tmp \ + $(OUTPUT)libbpf_versioned_syms.tmp; \ exit 1; \ fi -- Gitee