From bcca15085cc71a7fddee1b9165eb33eecd400e0c Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Wed, 24 Jul 2019 14:47:53 -0700 Subject: [PATCH 01/26] libbpf: silence GCC8 warning about string truncation ANBZ: #5530 commit cb8ffde5694ae5fffb456eae932aac442aa3a207 upstream. Despite a proper NULL-termination after strncpy(..., ..., IFNAMSIZ - 1), GCC8 still complains about *expected* string truncation: xsk.c:330:2: error: 'strncpy' output may be truncated copying 15 bytes from a string of length 15 [-Werror=stringop-truncation] strncpy(ifr.ifr_name, xsk->ifname, IFNAMSIZ - 1); This patch gets rid of the issue altogether by using memcpy instead. There is no performance regression, as strncpy will still copy and fill all of the bytes anyway. v1->v2: - rebase against bpf tree. Cc: Magnus Karlsson Signed-off-by: Andrii Nakryiko Acked-by: Magnus Karlsson Acked-by: Song Liu Signed-off-by: Alexei Starovoitov Signed-off-by: Yuanhe Shu --- tools/lib/bpf/xsk.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/lib/bpf/xsk.c b/tools/lib/bpf/xsk.c index f5357230e16c..a2b002964267 100644 --- a/tools/lib/bpf/xsk.c +++ b/tools/lib/bpf/xsk.c @@ -326,7 +326,7 @@ static int xsk_get_max_queues(struct xsk_socket *xsk) return -errno; ifr.ifr_data = (void *)&channels; - strncpy(ifr.ifr_name, xsk->ifname, IFNAMSIZ - 1); + memcpy(ifr.ifr_name, xsk->ifname, IFNAMSIZ - 1); ifr.ifr_name[IFNAMSIZ - 1] = '\0'; err = ioctl(fd, SIOCETHTOOL, &ifr); if (err && errno != EOPNOTSUPP) { @@ -519,7 +519,7 @@ int xsk_socket__create(struct xsk_socket **xsk_ptr, const char *ifname, err = -errno; goto out_socket; } - strncpy(xsk->ifname, ifname, IFNAMSIZ - 1); + memcpy(xsk->ifname, ifname, IFNAMSIZ - 1); xsk->ifname[IFNAMSIZ - 1] = '\0'; err = xsk_set_xdp_socket_config(&xsk->config, usr_config); -- Gitee From 54a5fd4160fe46d1b670dc17baaea324d1f55939 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Fri, 31 May 2019 14:05:06 -0700 Subject: [PATCH 02/26] flow_dissector: remove unused FLOW_DISSECTOR_F_STOP_AT_L3 flag ANBZ: #5530 commit 1cc26450a855aa35a6d515be14c539944d5f9648 upstream. This flag is not used by any caller, remove it. [backport note] @flags was not introduced thus ignore it. Signed-off-by: Stanislav Fomichev Signed-off-by: David S. Miller Signed-off-by: Yuanhe Shu --- include/net/flow_dissector.h | 5 ++--- net/core/flow_dissector.c | 8 -------- 2 files changed, 2 insertions(+), 11 deletions(-) diff --git a/include/net/flow_dissector.h b/include/net/flow_dissector.h index 468a7df5ca80..068e17ce86d1 100644 --- a/include/net/flow_dissector.h +++ b/include/net/flow_dissector.h @@ -230,9 +230,8 @@ enum flow_dissector_key_id { }; #define FLOW_DISSECTOR_F_PARSE_1ST_FRAG BIT(0) -#define FLOW_DISSECTOR_F_STOP_AT_L3 BIT(1) -#define FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL BIT(2) -#define FLOW_DISSECTOR_F_STOP_AT_ENCAP BIT(3) +#define FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL BIT(1) +#define FLOW_DISSECTOR_F_STOP_AT_ENCAP BIT(2) struct flow_dissector_key { enum flow_dissector_key_id key_id; diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 09db923f398f..43dcb2b33e6e 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -920,11 +920,6 @@ bool __skb_flow_dissect(const struct net *net, __skb_flow_dissect_ipv4(skb, flow_dissector, target_container, data, iph); - if (flags & FLOW_DISSECTOR_F_STOP_AT_L3) { - fdret = FLOW_DISSECT_RET_OUT_GOOD; - break; - } - break; } case htons(ETH_P_IPV6): { @@ -973,9 +968,6 @@ bool __skb_flow_dissect(const struct net *net, __skb_flow_dissect_ipv6(skb, flow_dissector, target_container, data, iph); - if (flags & FLOW_DISSECTOR_F_STOP_AT_L3) - fdret = FLOW_DISSECT_RET_OUT_GOOD; - break; } case htons(ETH_P_8021AD): -- Gitee From 3a9f356f1c103f43e97aa50d3a9be69d5d7aba4b Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Thu, 25 Jul 2019 15:52:25 -0700 Subject: [PATCH 03/26] bpf/flow_dissector: pass input flags to BPF flow dissector program ANBZ: #5530 commit 086f95682114fd2d1790bd3226e76cbae9a2d192 upstream. C flow dissector supports input flags that tell it to customize parsing by either stopping early or trying to parse as deep as possible. Pass those flags to the BPF flow dissector so it can make the same decisions. In the next commits I'll add support for those flags to our reference bpf_flow.c v3: * Export copy of flow dissector flags instead of moving (Alexei Starovoitov) Acked-by: Petar Penkov Acked-by: Willem de Bruijn Acked-by: Song Liu Cc: Song Liu Cc: Willem de Bruijn Cc: Petar Penkov Signed-off-by: Stanislav Fomichev Signed-off-by: Alexei Starovoitov Signed-off-by: Yuanhe Shu --- include/linux/skbuff.h | 2 +- include/uapi/linux/bpf.h | 5 +++++ net/bpf/test_run.c | 2 +- net/core/flow_dissector.c | 12 ++++++++++-- 4 files changed, 17 insertions(+), 4 deletions(-) diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index b32df2de5993..5875a6e6c6d6 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -1243,7 +1243,7 @@ static inline int skb_flow_dissector_bpf_prog_detach(const union bpf_attr *attr) struct bpf_flow_dissector; bool bpf_flow_dissect(struct bpf_prog *prog, struct bpf_flow_dissector *ctx, - __be16 proto, int nhoff, int hlen); + __be16 proto, int nhoff, int hlen, unsigned int flags); bool __skb_flow_dissect(const struct net *net, const struct sk_buff *skb, diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 57f96a592ad5..778ea2e223f8 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -3485,6 +3485,10 @@ enum bpf_task_fd_type { BPF_FD_TYPE_URETPROBE, /* filename + offset */ }; +#define BPF_FLOW_DISSECTOR_F_PARSE_1ST_FRAG (1U << 0) +#define BPF_FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL (1U << 1) +#define BPF_FLOW_DISSECTOR_F_STOP_AT_ENCAP (1U << 2) + struct bpf_flow_keys { __u16 nhoff; __u16 thoff; @@ -3506,6 +3510,7 @@ struct bpf_flow_keys { __u32 ipv6_dst[4]; /* in6_addr; network order */ }; }; + __u32 flags; }; struct bpf_func_info { diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c index 33e0dc168c16..4df2b2ae0759 100644 --- a/net/bpf/test_run.c +++ b/net/bpf/test_run.c @@ -422,7 +422,7 @@ int bpf_prog_test_run_flow_dissector(struct bpf_prog *prog, time_start = ktime_get_ns(); for (i = 0; i < repeat; i++) { retval = bpf_flow_dissect(prog, &ctx, eth->h_proto, ETH_HLEN, - size); + size, 0); if (signal_pending(current)) { preempt_enable(); diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 43dcb2b33e6e..3540d3afe50b 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -723,7 +723,7 @@ static void __skb_flow_bpf_to_target(const struct bpf_flow_keys *flow_keys, } bool bpf_flow_dissect(struct bpf_prog *prog, struct bpf_flow_dissector *ctx, - __be16 proto, int nhoff, int hlen) + __be16 proto, int nhoff, int hlen, unsigned int flags) { struct bpf_flow_keys *flow_keys = ctx->flow_keys; u32 result; @@ -734,6 +734,14 @@ bool bpf_flow_dissect(struct bpf_prog *prog, struct bpf_flow_dissector *ctx, flow_keys->nhoff = nhoff; flow_keys->thoff = flow_keys->nhoff; + BUILD_BUG_ON((int)BPF_FLOW_DISSECTOR_F_PARSE_1ST_FRAG != + (int)FLOW_DISSECTOR_F_PARSE_1ST_FRAG); + BUILD_BUG_ON((int)BPF_FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL != + (int)FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL); + BUILD_BUG_ON((int)BPF_FLOW_DISSECTOR_F_STOP_AT_ENCAP != + (int)FLOW_DISSECTOR_F_STOP_AT_ENCAP); + flow_keys->flags = flags; + preempt_disable(); result = BPF_PROG_RUN(prog, ctx); preempt_enable(); @@ -852,7 +860,7 @@ bool __skb_flow_dissect(const struct net *net, } ret = bpf_flow_dissect(attached, &ctx, n_proto, nhoff, - hlen); + hlen, flags); __skb_flow_bpf_to_target(&flow_keys, flow_dissector, target_container); rcu_read_unlock(); -- Gitee From 21347fea35f4c958723ddebff13888a71b351845 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Thu, 25 Jul 2019 15:52:26 -0700 Subject: [PATCH 04/26] bpf/flow_dissector: document flags ANBZ: #5530 commit 1ac6b126dbe8108c1fe6e29d023a0f2989cd3fea upstream. Describe what each input flag does and who uses it. Acked-by: Petar Penkov Acked-by: Willem de Bruijn Acked-by: Song Liu Cc: Song Liu Cc: Willem de Bruijn Cc: Petar Penkov Signed-off-by: Stanislav Fomichev Signed-off-by: Alexei Starovoitov Signed-off-by: Yuanhe Shu --- .../networking/bpf_flow_dissector.rst | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/Documentation/networking/bpf_flow_dissector.rst b/Documentation/networking/bpf_flow_dissector.rst index b375ae2ec2c4..b12a8dfbc369 100644 --- a/Documentation/networking/bpf_flow_dissector.rst +++ b/Documentation/networking/bpf_flow_dissector.rst @@ -26,6 +26,7 @@ The inputs are: * ``nhoff`` - initial offset of the networking header * ``thoff`` - initial offset of the transport header, initialized to nhoff * ``n_proto`` - L3 protocol type, parsed out of L2 header + * ``flags`` - optional flags Flow dissector BPF program should fill out the rest of the ``struct bpf_flow_keys`` fields. Input arguments ``nhoff/thoff/n_proto`` should be @@ -101,6 +102,23 @@ can be called for both cases and would have to be written carefully to handle both cases. +Flags +===== + +``flow_keys->flags`` might contain optional input flags that work as follows: + +* ``BPF_FLOW_DISSECTOR_F_PARSE_1ST_FRAG`` - tells BPF flow dissector to + continue parsing first fragment; the default expected behavior is that + flow dissector returns as soon as it finds out that the packet is fragmented; + used by ``eth_get_headlen`` to estimate length of all headers for GRO. +* ``BPF_FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL`` - tells BPF flow dissector to + stop parsing as soon as it reaches IPv6 flow label; used by + ``___skb_get_hash`` and ``__skb_get_hash_symmetric`` to get flow hash. +* ``BPF_FLOW_DISSECTOR_F_STOP_AT_ENCAP`` - tells BPF flow dissector to stop + parsing as soon as it reaches encapsulated headers; used by routing + infrastructure. + + Reference Implementation ======================== -- Gitee From c7d908c1d20a5ffb7320e8d1c73bb533f2dfe815 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Thu, 25 Jul 2019 15:52:27 -0700 Subject: [PATCH 05/26] bpf/flow_dissector: support flags in BPF_PROG_TEST_RUN ANBZ: #5530 commit b2ca4e1cfa7d3d755e1ec637d1235f89af9bd01f upstream. This will allow us to write tests for those flags. v2: * Swap kfree(data) and kfree(user_ctx) (Song Liu) Acked-by: Petar Penkov Acked-by: Willem de Bruijn Acked-by: Song Liu Cc: Song Liu Cc: Willem de Bruijn Cc: Petar Penkov Signed-off-by: Stanislav Fomichev Signed-off-by: Alexei Starovoitov Signed-off-by: Yuanhe Shu --- net/bpf/test_run.c | 39 +++++++++++++++++++++++++++++++++++---- 1 file changed, 35 insertions(+), 4 deletions(-) diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c index 4df2b2ae0759..c6f8408fc3bf 100644 --- a/net/bpf/test_run.c +++ b/net/bpf/test_run.c @@ -380,6 +380,22 @@ int bpf_prog_test_run_xdp(struct bpf_prog *prog, const union bpf_attr *kattr, return ret; } +static int verify_user_bpf_flow_keys(struct bpf_flow_keys *ctx) +{ + /* make sure the fields we don't use are zeroed */ + if (!range_is_zero(ctx, 0, offsetof(struct bpf_flow_keys, flags))) + return -EINVAL; + + /* flags is allowed */ + + if (!range_is_zero(ctx, offsetof(struct bpf_flow_keys, flags) + + FIELD_SIZEOF(struct bpf_flow_keys, flags), + sizeof(struct bpf_flow_keys))) + return -EINVAL; + + return 0; +} + int bpf_prog_test_run_flow_dissector(struct bpf_prog *prog, const union bpf_attr *kattr, union bpf_attr __user *uattr) @@ -387,9 +403,11 @@ int bpf_prog_test_run_flow_dissector(struct bpf_prog *prog, u32 size = kattr->test.data_size_in; struct bpf_flow_dissector ctx = {}; u32 repeat = kattr->test.repeat; + struct bpf_flow_keys *user_ctx; struct bpf_flow_keys flow_keys; u64 time_start, time_spent = 0; const struct ethhdr *eth; + unsigned int flags = 0; u32 retval, duration; void *data; int ret; @@ -398,9 +416,6 @@ int bpf_prog_test_run_flow_dissector(struct bpf_prog *prog, if (prog->type != BPF_PROG_TYPE_FLOW_DISSECTOR) return -EINVAL; - if (kattr->test.ctx_in || kattr->test.ctx_out) - return -EINVAL; - if (size < ETH_HLEN) return -EINVAL; @@ -413,6 +428,18 @@ int bpf_prog_test_run_flow_dissector(struct bpf_prog *prog, if (!repeat) repeat = 1; + user_ctx = bpf_ctx_init(kattr, sizeof(struct bpf_flow_keys)); + if (IS_ERR(user_ctx)) { + kfree(data); + return PTR_ERR(user_ctx); + } + if (user_ctx) { + ret = verify_user_bpf_flow_keys(user_ctx); + if (ret) + goto out; + flags = user_ctx->flags; + } + ctx.flow_keys = &flow_keys; ctx.data = data; ctx.data_end = (__u8 *)data + size; @@ -422,7 +449,7 @@ int bpf_prog_test_run_flow_dissector(struct bpf_prog *prog, time_start = ktime_get_ns(); for (i = 0; i < repeat; i++) { retval = bpf_flow_dissect(prog, &ctx, eth->h_proto, ETH_HLEN, - size, 0); + size, flags); if (signal_pending(current)) { preempt_enable(); @@ -453,8 +480,12 @@ int bpf_prog_test_run_flow_dissector(struct bpf_prog *prog, ret = bpf_test_finish(kattr, uattr, &flow_keys, sizeof(flow_keys), retval, duration); + if (!ret) + ret = bpf_ctx_finish(kattr, uattr, user_ctx, + sizeof(struct bpf_flow_keys)); out: + kfree(user_ctx); kfree(data); return ret; } -- Gitee From b5fc8cbb088c82282256530330802e68df469dcb Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Thu, 25 Jul 2019 15:52:28 -0700 Subject: [PATCH 06/26] tools/bpf: sync bpf_flow_keys flags ANBZ: #5530 commit 57debff23c4cd47f25850a5f42a903aebe535e95 upstream. Export bpf_flow_keys flags to tools/libbpf/selftests. Acked-by: Petar Penkov Acked-by: Willem de Bruijn Acked-by: Song Liu Cc: Song Liu Cc: Willem de Bruijn Cc: Petar Penkov Signed-off-by: Stanislav Fomichev Signed-off-by: Alexei Starovoitov Signed-off-by: Yuanhe Shu --- tools/include/uapi/linux/bpf.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 5ac3af15dc94..b2812c9cae50 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -3482,6 +3482,10 @@ enum bpf_task_fd_type { BPF_FD_TYPE_URETPROBE, /* filename + offset */ }; +#define BPF_FLOW_DISSECTOR_F_PARSE_1ST_FRAG (1U << 0) +#define BPF_FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL (1U << 1) +#define BPF_FLOW_DISSECTOR_F_STOP_AT_ENCAP (1U << 2) + struct bpf_flow_keys { __u16 nhoff; __u16 thoff; @@ -3503,6 +3507,7 @@ struct bpf_flow_keys { __u32 ipv6_dst[4]; /* in6_addr; network order */ }; }; + __u32 flags; }; struct bpf_func_info { -- Gitee From 0c7331e8f685e5775af9c753a6ef035b0e29a761 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Thu, 25 Jul 2019 15:52:29 -0700 Subject: [PATCH 07/26] selftests/bpf: support BPF_FLOW_DISSECTOR_F_PARSE_1ST_FRAG ANBZ: #5530 commit ae173a915785e55574c1fc54edf58b9b87b28c22 upstream. bpf_flow.c: exit early unless BPF_FLOW_DISSECTOR_F_PARSE_1ST_FRAG is passed in flags. Also, set ip_proto earlier, this makes sure we have correct value with fragmented packets. Add selftest cases to test ipv4/ipv6 fragments and skip eth_get_headlen tests that don't have BPF_FLOW_DISSECTOR_F_PARSE_1ST_FRAG flag. eth_get_headlen calls flow dissector with BPF_FLOW_DISSECTOR_F_PARSE_1ST_FRAG flag so we can't run tests that have different set of input flags against it. v2: * sefltests -> selftests (Willem de Bruijn) * Reword a comment about eth_get_headlen flags (Song Liu) Acked-by: Petar Penkov Acked-by: Willem de Bruijn Acked-by: Song Liu Cc: Song Liu Cc: Willem de Bruijn Cc: Petar Penkov Signed-off-by: Stanislav Fomichev Signed-off-by: Alexei Starovoitov Signed-off-by: Yuanhe Shu --- tools/testing/selftests/bpf/progs/bpf_flow.c | 29 +++- tools/testing/selftests/bpf/test_progs.c | 133 ++++++++++++++++++- 2 files changed, 155 insertions(+), 7 deletions(-) diff --git a/tools/testing/selftests/bpf/progs/bpf_flow.c b/tools/testing/selftests/bpf/progs/bpf_flow.c index 12a50a707149..7a2722ef47db 100644 --- a/tools/testing/selftests/bpf/progs/bpf_flow.c +++ b/tools/testing/selftests/bpf/progs/bpf_flow.c @@ -152,7 +152,6 @@ static __always_inline int parse_ip_proto(struct __sk_buff *skb, __u8 proto) struct tcphdr *tcp, _tcp; struct udphdr *udp, _udp; - keys->ip_proto = proto; switch (proto) { case IPPROTO_ICMP: icmp = bpf_flow_dissect_get_header(skb, sizeof(*icmp), &_icmp); @@ -230,7 +229,6 @@ static __always_inline int parse_ipv6_proto(struct __sk_buff *skb, __u8 nexthdr) { struct bpf_flow_keys *keys = skb->flow_keys; - keys->ip_proto = nexthdr; switch (nexthdr) { case IPPROTO_HOPOPTS: case IPPROTO_DSTOPTS: @@ -265,6 +263,7 @@ PROG(IP)(struct __sk_buff *skb) keys->addr_proto = ETH_P_IP; keys->ipv4_src = iph->saddr; keys->ipv4_dst = iph->daddr; + keys->ip_proto = iph->protocol; keys->thoff += iph->ihl << 2; if (data + keys->thoff > data_end) @@ -272,13 +271,20 @@ PROG(IP)(struct __sk_buff *skb) if (iph->frag_off & bpf_htons(IP_MF | IP_OFFSET)) { keys->is_frag = true; - if (iph->frag_off & bpf_htons(IP_OFFSET)) + if (iph->frag_off & bpf_htons(IP_OFFSET)) { /* From second fragment on, packets do not have headers * we can parse. */ done = true; - else + } else { keys->is_first_frag = true; + /* No need to parse fragmented packet unless + * explicitly asked for. + */ + if (!(keys->flags & + BPF_FLOW_DISSECTOR_F_PARSE_1ST_FRAG)) + done = true; + } } if (done) @@ -300,6 +306,7 @@ PROG(IPV6)(struct __sk_buff *skb) memcpy(&keys->ipv6_src, &ip6h->saddr, 2*sizeof(ip6h->saddr)); keys->thoff += sizeof(struct ipv6hdr); + keys->ip_proto = ip6h->nexthdr; return parse_ipv6_proto(skb, ip6h->nexthdr); } @@ -316,7 +323,8 @@ PROG(IPV6OP)(struct __sk_buff *skb) /* hlen is in 8-octets and does not include the first 8 bytes * of the header */ - skb->flow_keys->thoff += (1 + ip6h->hdrlen) << 3; + keys->thoff += (1 + ip6h->hdrlen) << 3; + keys->ip_proto = ip6h->nexthdr; return parse_ipv6_proto(skb, ip6h->nexthdr); } @@ -332,9 +340,18 @@ PROG(IPV6FR)(struct __sk_buff *skb) keys->thoff += sizeof(*fragh); keys->is_frag = true; - if (!(fragh->frag_off & bpf_htons(IP6_OFFSET))) + keys->ip_proto = fragh->nexthdr; + + if (!(fragh->frag_off & bpf_htons(IP6_OFFSET))) { keys->is_first_frag = true; + /* No need to parse fragmented packet unless + * explicitly asked for. + */ + if (!(keys->flags & BPF_FLOW_DISSECTOR_F_PARSE_1ST_FRAG)) + return export_flow_keys(keys, BPF_OK); + } + return parse_ipv6_proto(skb, fragh->nexthdr); } diff --git a/tools/testing/selftests/bpf/test_progs.c b/tools/testing/selftests/bpf/test_progs.c index bbe72073daee..91622e46b527 100644 --- a/tools/testing/selftests/bpf/test_progs.c +++ b/tools/testing/selftests/bpf/test_progs.c @@ -64,6 +64,10 @@ bool verifier_stats = false; #define SYS_KPROBE_NAME "sys_nanosleep" #endif +#ifndef IP_MF +#define IP_MF 0x2000 +#endif + /* ipv4 test vector */ static struct { struct ethhdr eth; @@ -1986,6 +1990,18 @@ struct ipv6_pkt { struct tcphdr tcp; } __packed; +struct ipv6_frag_pkt { + struct ethhdr eth; + struct ipv6hdr iph; + struct frag_hdr { + __u8 nexthdr; + __u8 reserved; + __be16 frag_off; + __be32 identification; + } ipf; + struct tcphdr tcp; +} __packed; + struct dvlan_ipv6_pkt { struct ethhdr eth; __u16 vlan_tci; @@ -2002,9 +2018,11 @@ struct test { struct ipv4_pkt ipv4; struct svlan_ipv4_pkt svlan_ipv4; struct ipv6_pkt ipv6; + struct ipv6_frag_pkt ipv6_frag; struct dvlan_ipv6_pkt dvlan_ipv6; } pkt; struct bpf_flow_keys keys; + __u32 flags; }; #define VLAN_HLEN 4 @@ -2096,6 +2114,102 @@ struct test tests[] = { .dport = 8080, }, }, + { + .name = "ipv4-frag", + .pkt.ipv4 = { + .eth.h_proto = __bpf_constant_htons(ETH_P_IP), + .iph.ihl = 5, + .iph.protocol = IPPROTO_TCP, + .iph.tot_len = __bpf_constant_htons(MAGIC_BYTES), + .iph.frag_off = __bpf_constant_htons(IP_MF), + .tcp.doff = 5, + .tcp.source = 80, + .tcp.dest = 8080, + }, + .keys = { + .flags = BPF_FLOW_DISSECTOR_F_PARSE_1ST_FRAG, + .nhoff = ETH_HLEN, + .thoff = ETH_HLEN + sizeof(struct iphdr), + .addr_proto = ETH_P_IP, + .ip_proto = IPPROTO_TCP, + .n_proto = __bpf_constant_htons(ETH_P_IP), + .is_frag = true, + .is_first_frag = true, + .sport = 80, + .dport = 8080, + }, + .flags = BPF_FLOW_DISSECTOR_F_PARSE_1ST_FRAG, + }, + { + .name = "ipv4-no-frag", + .pkt.ipv4 = { + .eth.h_proto = __bpf_constant_htons(ETH_P_IP), + .iph.ihl = 5, + .iph.protocol = IPPROTO_TCP, + .iph.tot_len = __bpf_constant_htons(MAGIC_BYTES), + .iph.frag_off = __bpf_constant_htons(IP_MF), + .tcp.doff = 5, + .tcp.source = 80, + .tcp.dest = 8080, + }, + .keys = { + .nhoff = ETH_HLEN, + .thoff = ETH_HLEN + sizeof(struct iphdr), + .addr_proto = ETH_P_IP, + .ip_proto = IPPROTO_TCP, + .n_proto = __bpf_constant_htons(ETH_P_IP), + .is_frag = true, + .is_first_frag = true, + }, + }, + { + .name = "ipv6-frag", + .pkt.ipv6_frag = { + .eth.h_proto = __bpf_constant_htons(ETH_P_IPV6), + .iph.nexthdr = IPPROTO_FRAGMENT, + .iph.payload_len = __bpf_constant_htons(MAGIC_BYTES), + .ipf.nexthdr = IPPROTO_TCP, + .tcp.doff = 5, + .tcp.source = 80, + .tcp.dest = 8080, + }, + .keys = { + .flags = BPF_FLOW_DISSECTOR_F_PARSE_1ST_FRAG, + .nhoff = ETH_HLEN, + .thoff = ETH_HLEN + sizeof(struct ipv6hdr) + + sizeof(struct frag_hdr), + .addr_proto = ETH_P_IPV6, + .ip_proto = IPPROTO_TCP, + .n_proto = __bpf_constant_htons(ETH_P_IPV6), + .is_frag = true, + .is_first_frag = true, + .sport = 80, + .dport = 8080, + }, + .flags = BPF_FLOW_DISSECTOR_F_PARSE_1ST_FRAG, + }, + { + .name = "ipv6-no-frag", + .pkt.ipv6_frag = { + .eth.h_proto = __bpf_constant_htons(ETH_P_IPV6), + .iph.nexthdr = IPPROTO_FRAGMENT, + .iph.payload_len = __bpf_constant_htons(MAGIC_BYTES), + .ipf.nexthdr = IPPROTO_TCP, + .tcp.doff = 5, + .tcp.source = 80, + .tcp.dest = 8080, + }, + .keys = { + .nhoff = ETH_HLEN, + .thoff = ETH_HLEN + sizeof(struct ipv6hdr) + + sizeof(struct frag_hdr), + .addr_proto = ETH_P_IPV6, + .ip_proto = IPPROTO_TCP, + .n_proto = __bpf_constant_htons(ETH_P_IPV6), + .is_frag = true, + .is_first_frag = true, + }, + }, }; static int create_tap(const char *ifname) @@ -2178,6 +2292,13 @@ static void test_flow_dissector(void) .data_size_in = sizeof(tests[i].pkt), .data_out = &flow_keys, }; + static struct bpf_flow_keys ctx = {}; + + if (tests[i].flags) { + tattr.ctx_in = &ctx; + tattr.ctx_size_in = sizeof(ctx); + ctx.flags = tests[i].flags; + } err = bpf_prog_test_run_xattr(&tattr); CHECK_ATTR(tattr.data_size_out != sizeof(flow_keys) || @@ -2204,11 +2325,21 @@ static void test_flow_dissector(void) CHECK(err, "ifup", "err %d errno %d\n", err, errno); for (i = 0; i < ARRAY_SIZE(tests); i++) { - struct bpf_flow_keys flow_keys = {}; + /* Keep in sync with 'flags' from eth_get_headlen. */ + __u32 eth_get_headlen_flags = + BPF_FLOW_DISSECTOR_F_PARSE_1ST_FRAG; struct bpf_prog_test_run_attr tattr = {}; + struct bpf_flow_keys flow_keys = {}; __u32 key = (__u32)(tests[i].keys.sport) << 16 | tests[i].keys.dport; + /* For skb-less case we can't pass input flags; run + * only the tests that have a matching set of flags. + */ + + if (tests[i].flags != eth_get_headlen_flags) + continue; + err = tx_tap(tap_fd, &tests[i].pkt, sizeof(tests[i].pkt)); CHECK(err < 0, "tx_tap", "err %d errno %d\n", err, errno); -- Gitee From 1d7efc9304ecd921a14bbf353f807a950eb24682 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Thu, 25 Jul 2019 15:52:30 -0700 Subject: [PATCH 08/26] bpf/flow_dissector: support ipv6 flow_label and BPF_FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL ANBZ: #5530 commit 71c99e32b926159ea628352751f66383d7d04d17 upstream. Add support for exporting ipv6 flow label via bpf_flow_keys. Export flow label from bpf_flow.c and also return early when BPF_FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL is passed. Acked-by: Petar Penkov Acked-by: Willem de Bruijn Acked-by: Song Liu Cc: Song Liu Cc: Willem de Bruijn Cc: Petar Penkov Signed-off-by: Stanislav Fomichev Signed-off-by: Alexei Starovoitov Signed-off-by: Yuanhe Shu --- include/uapi/linux/bpf.h | 1 + net/core/flow_dissector.c | 9 ++++ tools/include/uapi/linux/bpf.h | 1 + tools/testing/selftests/bpf/progs/bpf_flow.c | 10 +++++ tools/testing/selftests/bpf/test_progs.c | 46 ++++++++++++++++++++ 5 files changed, 67 insertions(+) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 778ea2e223f8..36b37da0f02c 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -3511,6 +3511,7 @@ struct bpf_flow_keys { }; }; __u32 flags; + __be32 flow_label; }; struct bpf_func_info { diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 3540d3afe50b..c3228517c4f3 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -676,6 +676,7 @@ static void __skb_flow_bpf_to_target(const struct bpf_flow_keys *flow_keys, struct flow_dissector_key_basic *key_basic; struct flow_dissector_key_addrs *key_addrs; struct flow_dissector_key_ports *key_ports; + struct flow_dissector_key_tags *key_tags; key_control = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_CONTROL, @@ -720,6 +721,14 @@ static void __skb_flow_bpf_to_target(const struct bpf_flow_keys *flow_keys, key_ports->src = flow_keys->sport; key_ports->dst = flow_keys->dport; } + + if (dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_FLOW_LABEL)) { + key_tags = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_FLOW_LABEL, + target_container); + key_tags->flow_label = ntohl(flow_keys->flow_label); + } } bool bpf_flow_dissect(struct bpf_prog *prog, struct bpf_flow_dissector *ctx, diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index b2812c9cae50..79f88e532a71 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -3508,6 +3508,7 @@ struct bpf_flow_keys { }; }; __u32 flags; + __be32 flow_label; }; struct bpf_func_info { diff --git a/tools/testing/selftests/bpf/progs/bpf_flow.c b/tools/testing/selftests/bpf/progs/bpf_flow.c index 7a2722ef47db..d4044cded431 100644 --- a/tools/testing/selftests/bpf/progs/bpf_flow.c +++ b/tools/testing/selftests/bpf/progs/bpf_flow.c @@ -82,6 +82,12 @@ static __always_inline int export_flow_keys(struct bpf_flow_keys *keys, return ret; } +#define IPV6_FLOWLABEL_MASK __bpf_constant_htonl(0x000FFFFF) +static inline __be32 ip6_flowlabel(const struct ipv6hdr *hdr) +{ + return *(__be32 *)hdr & IPV6_FLOWLABEL_MASK; +} + static __always_inline void *bpf_flow_dissect_get_header(struct __sk_buff *skb, __u16 hdr_size, void *buffer) @@ -307,6 +313,10 @@ PROG(IPV6)(struct __sk_buff *skb) keys->thoff += sizeof(struct ipv6hdr); keys->ip_proto = ip6h->nexthdr; + keys->flow_label = ip6_flowlabel(ip6h); + + if (keys->flags & BPF_FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL) + return export_flow_keys(keys, BPF_OK); return parse_ipv6_proto(skb, ip6h->nexthdr); } diff --git a/tools/testing/selftests/bpf/test_progs.c b/tools/testing/selftests/bpf/test_progs.c index 91622e46b527..c510405fb256 100644 --- a/tools/testing/selftests/bpf/test_progs.c +++ b/tools/testing/selftests/bpf/test_progs.c @@ -1957,6 +1957,7 @@ static void test_queue_stack_map(int type) "is_encap=%u/%u " \ "ip_proto=0x%x/0x%x " \ "n_proto=0x%x/0x%x " \ + "flow_label=0x%x/0x%x " \ "sport=%u/%u " \ "dport=%u/%u\n", \ got.nhoff, expected.nhoff, \ @@ -1967,6 +1968,7 @@ static void test_queue_stack_map(int type) got.is_encap, expected.is_encap, \ got.ip_proto, expected.ip_proto, \ got.n_proto, expected.n_proto, \ + got.flow_label, expected.flow_label, \ got.sport, expected.sport, \ got.dport, expected.dport) @@ -2210,6 +2212,50 @@ struct test tests[] = { .is_first_frag = true, }, }, + { + .name = "ipv6-flow-label", + .pkt.ipv6 = { + .eth.h_proto = __bpf_constant_htons(ETH_P_IPV6), + .iph.nexthdr = IPPROTO_TCP, + .iph.payload_len = __bpf_constant_htons(MAGIC_BYTES), + .iph.flow_lbl = { 0xb, 0xee, 0xef }, + .tcp.doff = 5, + .tcp.source = 80, + .tcp.dest = 8080, + }, + .keys = { + .nhoff = ETH_HLEN, + .thoff = ETH_HLEN + sizeof(struct ipv6hdr), + .addr_proto = ETH_P_IPV6, + .ip_proto = IPPROTO_TCP, + .n_proto = __bpf_constant_htons(ETH_P_IPV6), + .sport = 80, + .dport = 8080, + .flow_label = __bpf_constant_htonl(0xbeeef), + }, + }, + { + .name = "ipv6-no-flow-label", + .pkt.ipv6 = { + .eth.h_proto = __bpf_constant_htons(ETH_P_IPV6), + .iph.nexthdr = IPPROTO_TCP, + .iph.payload_len = __bpf_constant_htons(MAGIC_BYTES), + .iph.flow_lbl = { 0xb, 0xee, 0xef }, + .tcp.doff = 5, + .tcp.source = 80, + .tcp.dest = 8080, + }, + .keys = { + .flags = BPF_FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL, + .nhoff = ETH_HLEN, + .thoff = ETH_HLEN + sizeof(struct ipv6hdr), + .addr_proto = ETH_P_IPV6, + .ip_proto = IPPROTO_TCP, + .n_proto = __bpf_constant_htons(ETH_P_IPV6), + .flow_label = __bpf_constant_htonl(0xbeeef), + }, + .flags = BPF_FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL, + }, }; static int create_tap(const char *ifname) -- Gitee From d002800bba7f82538a7fc8ba78746d9497d1dea1 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Thu, 25 Jul 2019 15:52:31 -0700 Subject: [PATCH 09/26] selftests/bpf: support BPF_FLOW_DISSECTOR_F_STOP_AT_ENCAP ANBZ: #5530 commit e853ae776a58633492b59badab04f53a6b730d62 upstream. Exit as soon as we found that packet is encapped when BPF_FLOW_DISSECTOR_F_STOP_AT_ENCAP is passed. Add appropriate selftest cases. v2: * Subtract sizeof(struct iphdr) from .iph_inner.tot_len (Willem de Bruijn) Acked-by: Petar Penkov Acked-by: Willem de Bruijn Acked-by: Song Liu Cc: Song Liu Cc: Willem de Bruijn Cc: Petar Penkov Signed-off-by: Stanislav Fomichev Signed-off-by: Alexei Starovoitov Signed-off-by: Yuanhe Shu --- tools/testing/selftests/bpf/progs/bpf_flow.c | 8 +++ tools/testing/selftests/bpf/test_progs.c | 65 ++++++++++++++++++++ 2 files changed, 73 insertions(+) diff --git a/tools/testing/selftests/bpf/progs/bpf_flow.c b/tools/testing/selftests/bpf/progs/bpf_flow.c index d4044cded431..040a44206f29 100644 --- a/tools/testing/selftests/bpf/progs/bpf_flow.c +++ b/tools/testing/selftests/bpf/progs/bpf_flow.c @@ -166,9 +166,15 @@ static __always_inline int parse_ip_proto(struct __sk_buff *skb, __u8 proto) return export_flow_keys(keys, BPF_OK); case IPPROTO_IPIP: keys->is_encap = true; + if (keys->flags & BPF_FLOW_DISSECTOR_F_STOP_AT_ENCAP) + return export_flow_keys(keys, BPF_OK); + return parse_eth_proto(skb, bpf_htons(ETH_P_IP)); case IPPROTO_IPV6: keys->is_encap = true; + if (keys->flags & BPF_FLOW_DISSECTOR_F_STOP_AT_ENCAP) + return export_flow_keys(keys, BPF_OK); + return parse_eth_proto(skb, bpf_htons(ETH_P_IPV6)); case IPPROTO_GRE: gre = bpf_flow_dissect_get_header(skb, sizeof(*gre), &_gre); @@ -188,6 +194,8 @@ static __always_inline int parse_ip_proto(struct __sk_buff *skb, __u8 proto) keys->thoff += 4; /* Step over sequence number */ keys->is_encap = true; + if (keys->flags & BPF_FLOW_DISSECTOR_F_STOP_AT_ENCAP) + return export_flow_keys(keys, BPF_OK); if (gre->proto == bpf_htons(ETH_P_TEB)) { eth = bpf_flow_dissect_get_header(skb, sizeof(*eth), diff --git a/tools/testing/selftests/bpf/test_progs.c b/tools/testing/selftests/bpf/test_progs.c index c510405fb256..6bb6d4ca3e16 100644 --- a/tools/testing/selftests/bpf/test_progs.c +++ b/tools/testing/selftests/bpf/test_progs.c @@ -1978,6 +1978,13 @@ struct ipv4_pkt { struct tcphdr tcp; } __packed; +struct ipip_pkt { + struct ethhdr eth; + struct iphdr iph; + struct iphdr iph_inner; + struct tcphdr tcp; +} __packed; + struct svlan_ipv4_pkt { struct ethhdr eth; __u16 vlan_tci; @@ -2019,6 +2026,7 @@ struct test { union { struct ipv4_pkt ipv4; struct svlan_ipv4_pkt svlan_ipv4; + struct ipip_pkt ipip; struct ipv6_pkt ipv6; struct ipv6_frag_pkt ipv6_frag; struct dvlan_ipv6_pkt dvlan_ipv6; @@ -2256,6 +2264,63 @@ struct test tests[] = { }, .flags = BPF_FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL, }, + { + .name = "ipip-encap", + .pkt.ipip = { + .eth.h_proto = __bpf_constant_htons(ETH_P_IP), + .iph.ihl = 5, + .iph.protocol = IPPROTO_IPIP, + .iph.tot_len = __bpf_constant_htons(MAGIC_BYTES), + .iph_inner.ihl = 5, + .iph_inner.protocol = IPPROTO_TCP, + .iph_inner.tot_len = + __bpf_constant_htons(MAGIC_BYTES) - + sizeof(struct iphdr), + .tcp.doff = 5, + .tcp.source = 80, + .tcp.dest = 8080, + }, + .keys = { + .nhoff = 0, + .nhoff = ETH_HLEN, + .thoff = ETH_HLEN + sizeof(struct iphdr) + + sizeof(struct iphdr), + .addr_proto = ETH_P_IP, + .ip_proto = IPPROTO_TCP, + .n_proto = __bpf_constant_htons(ETH_P_IP), + .is_encap = true, + .sport = 80, + .dport = 8080, + }, + }, + { + .name = "ipip-no-encap", + .pkt.ipip = { + .eth.h_proto = __bpf_constant_htons(ETH_P_IP), + .iph.ihl = 5, + .iph.protocol = IPPROTO_IPIP, + .iph.tot_len = __bpf_constant_htons(MAGIC_BYTES), + .iph_inner.ihl = 5, + .iph_inner.protocol = IPPROTO_TCP, + .iph_inner.tot_len = + __bpf_constant_htons(MAGIC_BYTES) - + sizeof(struct iphdr), + .tcp.doff = 5, + .tcp.source = 80, + .tcp.dest = 8080, + }, + .keys = { + .flags = BPF_FLOW_DISSECTOR_F_STOP_AT_ENCAP, + .nhoff = ETH_HLEN, + .thoff = ETH_HLEN + sizeof(struct iphdr), + .addr_proto = ETH_P_IP, + .ip_proto = IPPROTO_IPIP, + .n_proto = __bpf_constant_htons(ETH_P_IP), + .is_encap = true, + }, + .flags = BPF_FLOW_DISSECTOR_F_STOP_AT_ENCAP, + }, + }; static int create_tap(const char *ifname) -- Gitee From 9a798f6243f804793c6f717d99e90513f7d14fd8 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Fri, 26 Jul 2019 14:24:38 -0700 Subject: [PATCH 10/26] libbpf: fix erroneous multi-closing of BTF FD ANBZ: #5530 commit 5d01ab7bac467edfc530e6ccf953921def935c62 upstream. Libbpf stores associated BTF FD per each instance of bpf_program. When program is unloaded, that FD is closed. This is wrong, because leads to a race and possibly closing of unrelated files, if application simultaneously opens new files while bpf_programs are unloaded. It's also unnecessary, because struct btf "owns" that FD, and btf__free(), called from bpf_object__close() will close it. Thus the fix is to never have per-program BTF FD and fetch it from obj->btf, when necessary. Fixes: 2993e0515bb4 ("tools/bpf: add support to read .BTF.ext sections") Reported-by: Andrey Ignatov Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Signed-off-by: Yuanhe Shu --- tools/lib/bpf/libbpf.c | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 4d8ec9068c49..6e3ffc58c9cb 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -184,7 +184,6 @@ struct bpf_program { bpf_program_clear_priv_t clear_priv; enum bpf_attach_type expected_attach_type; - int btf_fd; void *func_info; __u32 func_info_rec_size; __u32 func_info_cnt; @@ -315,7 +314,6 @@ void bpf_program__unload(struct bpf_program *prog) prog->instances.nr = -1; zfree(&prog->instances.fds); - zclose(prog->btf_fd); zfree(&prog->func_info); zfree(&prog->line_info); } @@ -394,7 +392,6 @@ bpf_program__init(void *data, size_t size, char *section_name, int idx, prog->instances.fds = NULL; prog->instances.nr = -1; prog->type = BPF_PROG_TYPE_UNSPEC; - prog->btf_fd = -1; return 0; errout: @@ -2333,9 +2330,6 @@ bpf_program_reloc_btf_ext(struct bpf_program *prog, struct bpf_object *obj, prog->line_info_rec_size = btf_ext__line_info_rec_size(obj->btf_ext); } - if (!insn_offset) - prog->btf_fd = btf__fd(obj->btf); - return 0; } @@ -2508,7 +2502,7 @@ load_program(struct bpf_program *prog, struct bpf_insn *insns, int insns_cnt, char *cp, errmsg[STRERR_BUFSIZE]; int log_buf_size = BPF_LOG_BUF_SIZE; char *log_buf; - int ret; + int btf_fd, ret; if (!insns || !insns_cnt) return -EINVAL; @@ -2523,7 +2517,8 @@ load_program(struct bpf_program *prog, struct bpf_insn *insns, int insns_cnt, load_attr.license = license; load_attr.kern_version = kern_version; load_attr.prog_ifindex = prog->prog_ifindex; - load_attr.prog_btf_fd = prog->btf_fd >= 0 ? prog->btf_fd : 0; + btf_fd = bpf_object__btf_fd(prog->obj); + load_attr.prog_btf_fd = btf_fd >= 0 ? btf_fd : 0; load_attr.func_info = prog->func_info; load_attr.func_info_rec_size = prog->func_info_rec_size; load_attr.func_info_cnt = prog->func_info_cnt; -- Gitee From e799ffedc680ff5b4a5e0d479e4a7b61b6fbd23e Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 1 Aug 2019 00:24:05 -0700 Subject: [PATCH 11/26] libbpf: set BTF FD for prog only when there is supported .BTF.ext data ANBZ: #5530 commit 3415ec643e7bd644b03026efbe2f2b36cbe9b34b upstream. 5d01ab7bac46 ("libbpf: fix erroneous multi-closing of BTF FD") introduced backwards-compatibility issue, manifesting itself as -E2BIG error returned on program load due to unknown non-zero btf_fd attribute value for BPF_PROG_LOAD sys_bpf() sub-command. This patch fixes bug by ensuring that we only ever associate BTF FD with program if there is a BTF.ext data that was successfully loaded into kernel, which automatically means kernel supports func_info/line_info and associated BTF FD for progs (checked and ensured also by BTF sanitization code). Fixes: 5d01ab7bac46 ("libbpf: fix erroneous multi-closing of BTF FD") Reported-by: Andrey Ignatov Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Signed-off-by: Yuanhe Shu --- tools/lib/bpf/libbpf.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 6e3ffc58c9cb..5c136af2b01d 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -2517,7 +2517,11 @@ load_program(struct bpf_program *prog, struct bpf_insn *insns, int insns_cnt, load_attr.license = license; load_attr.kern_version = kern_version; load_attr.prog_ifindex = prog->prog_ifindex; - btf_fd = bpf_object__btf_fd(prog->obj); + /* if .BTF.ext was loaded, kernel supports associated BTF for prog */ + if (prog->obj->btf_ext) + btf_fd = bpf_object__btf_fd(prog->obj); + else + btf_fd = -1; load_attr.prog_btf_fd = btf_fd >= 0 ? btf_fd : 0; load_attr.func_info = prog->func_info; load_attr.func_info_rec_size = prog->func_info_rec_size; -- Gitee From 75fe9a78ec8c458f5d2e2966dbda67252bbf397b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Toke=20H=C3=B8iland-J=C3=B8rgensen?= Date: Fri, 28 Jun 2019 11:12:34 +0200 Subject: [PATCH 12/26] xskmap: Move non-standard list manipulation to helper MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ANBZ: #5530 commit c8af5cd75e2411d5a5aacf115f59a5ff6b87f3fa upstream. Add a helper in list.h for the non-standard way of clearing a list that is used in xskmap. This makes it easier to reuse it in the other map types, and also makes sure this usage is not forgotten in any list refactorings in the future. Signed-off-by: Toke Høiland-Jørgensen Signed-off-by: Daniel Borkmann Signed-off-by: Yuanhe Shu --- include/linux/list.h | 14 ++++++++++++++ kernel/bpf/xskmap.c | 3 +-- 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/include/linux/list.h b/include/linux/list.h index 79f126d0abf0..a809668a6da3 100644 --- a/include/linux/list.h +++ b/include/linux/list.h @@ -106,6 +106,20 @@ static inline void __list_del(struct list_head * prev, struct list_head * next) WRITE_ONCE(prev->next, next); } +/* + * Delete a list entry and clear the 'prev' pointer. + * + * This is a special-purpose list clearing method used in the networking code + * for lists allocated as per-cpu, where we don't want to incur the extra + * WRITE_ONCE() overhead of a regular list_del_init(). The code that uses this + * needs to check the node 'prev' pointer instead of calling list_empty(). + */ +static inline void __list_del_clearprev(struct list_head *entry) +{ + __list_del(entry->prev, entry->next); + entry->prev = NULL; +} + /** * list_del - deletes entry from list. * @entry: the element to delete from the list. diff --git a/kernel/bpf/xskmap.c b/kernel/bpf/xskmap.c index 0479017d3449..cdbb6f1dec66 100644 --- a/kernel/bpf/xskmap.c +++ b/kernel/bpf/xskmap.c @@ -197,8 +197,7 @@ void __xsk_map_flush(struct bpf_map *map) list_for_each_entry_safe(xs, tmp, flush_list, flush_node) { xsk_flush(xs); - __list_del(xs->flush_node.prev, xs->flush_node.next); - xs->flush_node.prev = NULL; + __list_del_clearprev(&xs->flush_node); } } -- Gitee From 26b0f3cc11f11deee4aee2840c071096dbc8b4ff Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Toke=20H=C3=B8iland-J=C3=B8rgensen?= Date: Fri, 28 Jun 2019 11:12:34 +0200 Subject: [PATCH 13/26] devmap/cpumap: Use flush list instead of bitmap MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ANBZ: #5530 commit d5df2830ca9922d03a33940ea424c9a5f39f1162 upstream. The socket map uses a linked list instead of a bitmap to keep track of which entries to flush. Do the same for devmap and cpumap, as this means we don't have to care about the map index when enqueueing things into the map (and so we can cache the map lookup). Signed-off-by: Toke Høiland-Jørgensen Acked-by: Jonathan Lemon Acked-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Signed-off-by: Yuanhe Shu --- kernel/bpf/cpumap.c | 105 ++++++++++++++++++++----------------------- kernel/bpf/devmap.c | 107 +++++++++++++++++++------------------------- net/core/filter.c | 2 - 3 files changed, 95 insertions(+), 119 deletions(-) diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index 809297152a17..2a98e429f447 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -32,14 +32,19 @@ /* General idea: XDP packets getting XDP redirected to another CPU, * will maximum be stored/queued for one driver ->poll() call. It is - * guaranteed that setting flush bit and flush operation happen on + * guaranteed that queueing the frame and the flush operation happen on * same CPU. Thus, cpu_map_flush operation can deduct via this_cpu_ptr() * which queue in bpf_cpu_map_entry contains packets. */ #define CPU_MAP_BULK_SIZE 8 /* 8 == one cacheline on 64-bit archs */ +struct bpf_cpu_map_entry; +struct bpf_cpu_map; + struct xdp_bulk_queue { void *q[CPU_MAP_BULK_SIZE]; + struct list_head flush_node; + struct bpf_cpu_map_entry *obj; unsigned int count; }; @@ -52,6 +57,8 @@ struct bpf_cpu_map_entry { /* XDP can run multiple RX-ring queues, need __percpu enqueue store */ struct xdp_bulk_queue __percpu *bulkq; + struct bpf_cpu_map *cmap; + /* Queue with potential multi-producers, and single-consumer kthread */ struct ptr_ring *queue; struct task_struct *kthread; @@ -65,23 +72,17 @@ struct bpf_cpu_map { struct bpf_map map; /* Below members specific for map type */ struct bpf_cpu_map_entry **cpu_map; - unsigned long __percpu *flush_needed; + struct list_head __percpu *flush_list; }; -static int bq_flush_to_queue(struct bpf_cpu_map_entry *rcpu, - struct xdp_bulk_queue *bq, bool in_napi_ctx); - -static u64 cpu_map_bitmap_size(const union bpf_attr *attr) -{ - return BITS_TO_LONGS(attr->max_entries) * sizeof(unsigned long); -} +static int bq_flush_to_queue(struct xdp_bulk_queue *bq, bool in_napi_ctx); static struct bpf_map *cpu_map_alloc(union bpf_attr *attr) { struct bpf_cpu_map *cmap; int err = -ENOMEM; + int ret, cpu; u64 cost; - int ret; if (!capable(CAP_SYS_ADMIN)) return ERR_PTR(-EPERM); @@ -105,7 +106,7 @@ static struct bpf_map *cpu_map_alloc(union bpf_attr *attr) /* make sure page count doesn't overflow */ cost = (u64) cmap->map.max_entries * sizeof(struct bpf_cpu_map_entry *); - cost += cpu_map_bitmap_size(attr) * num_possible_cpus(); + cost += sizeof(struct list_head) * num_possible_cpus(); /* Notice returns -EPERM on if map size is larger than memlock limit */ ret = bpf_map_charge_init(&cmap->map.memory, cost); @@ -114,12 +115,13 @@ static struct bpf_map *cpu_map_alloc(union bpf_attr *attr) goto free_cmap; } - /* A per cpu bitfield with a bit per possible CPU in map */ - cmap->flush_needed = __alloc_percpu(cpu_map_bitmap_size(attr), - __alignof__(unsigned long)); - if (!cmap->flush_needed) + cmap->flush_list = alloc_percpu(struct list_head); + if (!cmap->flush_list) goto free_charge; + for_each_possible_cpu(cpu) + INIT_LIST_HEAD(per_cpu_ptr(cmap->flush_list, cpu)); + /* Alloc array for possible remote "destination" CPUs */ cmap->cpu_map = bpf_map_area_alloc(cmap->map.max_entries * sizeof(struct bpf_cpu_map_entry *), @@ -129,7 +131,7 @@ static struct bpf_map *cpu_map_alloc(union bpf_attr *attr) return &cmap->map; free_percpu: - free_percpu(cmap->flush_needed); + free_percpu(cmap->flush_list); free_charge: bpf_map_charge_finish(&cmap->map.memory); free_cmap: @@ -331,7 +333,8 @@ static struct bpf_cpu_map_entry *__cpu_map_entry_alloc(u32 qsize, u32 cpu, { gfp_t gfp = GFP_KERNEL | __GFP_NOWARN; struct bpf_cpu_map_entry *rcpu; - int numa, err; + struct xdp_bulk_queue *bq; + int numa, err, i; /* Have map->numa_node, but choose node of redirect target CPU */ numa = cpu_to_node(cpu); @@ -346,6 +349,11 @@ static struct bpf_cpu_map_entry *__cpu_map_entry_alloc(u32 qsize, u32 cpu, if (!rcpu->bulkq) goto free_rcu; + for_each_possible_cpu(i) { + bq = per_cpu_ptr(rcpu->bulkq, i); + bq->obj = rcpu; + } + /* Alloc queue */ rcpu->queue = kzalloc_node(sizeof(*rcpu->queue), gfp, numa); if (!rcpu->queue) @@ -402,7 +410,7 @@ static void __cpu_map_entry_free(struct rcu_head *rcu) struct xdp_bulk_queue *bq = per_cpu_ptr(rcpu->bulkq, cpu); /* No concurrent bq_enqueue can run at this point */ - bq_flush_to_queue(rcpu, bq, false); + bq_flush_to_queue(bq, false); } free_percpu(rcpu->bulkq); /* Cannot kthread_stop() here, last put free rcpu resources */ @@ -485,6 +493,7 @@ static int cpu_map_update_elem(struct bpf_map *map, void *key, void *value, rcpu = __cpu_map_entry_alloc(qsize, key_cpu, map->id); if (!rcpu) return -ENOMEM; + rcpu->cmap = cmap; } rcu_read_lock(); __cpu_map_entry_replace(cmap, key_cpu, rcpu); @@ -511,14 +520,14 @@ static void cpu_map_free(struct bpf_map *map) synchronize_rcu(); /* To ensure all pending flush operations have completed wait for flush - * bitmap to indicate all flush_needed bits to be zero on _all_ cpus. - * Because the above synchronize_rcu() ensures the map is disconnected - * from the program we can assume no new bits will be set. + * list be empty on _all_ cpus. Because the above synchronize_rcu() + * ensures the map is disconnected from the program we can assume no new + * items will be added to the list. */ for_each_online_cpu(cpu) { - unsigned long *bitmap = per_cpu_ptr(cmap->flush_needed, cpu); + struct list_head *flush_list = per_cpu_ptr(cmap->flush_list, cpu); - while (!bitmap_empty(bitmap, cmap->map.max_entries)) + while (!list_empty(flush_list)) cond_resched(); } @@ -535,7 +544,7 @@ static void cpu_map_free(struct bpf_map *map) /* bq flush and cleanup happens after RCU graze-period */ __cpu_map_entry_replace(cmap, i, NULL); /* call_rcu */ } - free_percpu(cmap->flush_needed); + free_percpu(cmap->flush_list); bpf_map_area_free(cmap->cpu_map); kfree(cmap); } @@ -587,9 +596,9 @@ const struct bpf_map_ops cpu_map_ops = { .map_check_btf = map_check_no_btf, }; -static int bq_flush_to_queue(struct bpf_cpu_map_entry *rcpu, - struct xdp_bulk_queue *bq, bool in_napi_ctx) +static int bq_flush_to_queue(struct xdp_bulk_queue *bq, bool in_napi_ctx) { + struct bpf_cpu_map_entry *rcpu = bq->obj; unsigned int processed = 0, drops = 0; const int to_cpu = rcpu->cpu; struct ptr_ring *q; @@ -618,6 +627,8 @@ static int bq_flush_to_queue(struct bpf_cpu_map_entry *rcpu, bq->count = 0; spin_unlock(&q->producer_lock); + __list_del_clearprev(&bq->flush_node); + /* Feedback loop via tracepoints */ trace_xdp_cpumap_enqueue(rcpu->map_id, processed, drops, to_cpu); return 0; @@ -628,10 +639,11 @@ static int bq_flush_to_queue(struct bpf_cpu_map_entry *rcpu, */ static int bq_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_frame *xdpf) { + struct list_head *flush_list = this_cpu_ptr(rcpu->cmap->flush_list); struct xdp_bulk_queue *bq = this_cpu_ptr(rcpu->bulkq); if (unlikely(bq->count == CPU_MAP_BULK_SIZE)) - bq_flush_to_queue(rcpu, bq, true); + bq_flush_to_queue(bq, true); /* Notice, xdp_buff/page MUST be queued here, long enough for * driver to code invoking us to finished, due to driver @@ -643,6 +655,10 @@ static int bq_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_frame *xdpf) * operation, when completing napi->poll call. */ bq->q[bq->count++] = xdpf; + + if (!bq->flush_node.prev) + list_add(&bq->flush_node, flush_list); + return 0; } @@ -662,41 +678,16 @@ int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_buff *xdp, return 0; } -void __cpu_map_insert_ctx(struct bpf_map *map, u32 bit) -{ - struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map); - unsigned long *bitmap = this_cpu_ptr(cmap->flush_needed); - - __set_bit(bit, bitmap); -} - void __cpu_map_flush(struct bpf_map *map) { struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map); - unsigned long *bitmap = this_cpu_ptr(cmap->flush_needed); - u32 bit; - - /* The napi->poll softirq makes sure __cpu_map_insert_ctx() - * and __cpu_map_flush() happen on same CPU. Thus, the percpu - * bitmap indicate which percpu bulkq have packets. - */ - for_each_set_bit(bit, bitmap, map->max_entries) { - struct bpf_cpu_map_entry *rcpu = READ_ONCE(cmap->cpu_map[bit]); - struct xdp_bulk_queue *bq; - - /* This is possible if entry is removed by user space - * between xdp redirect and flush op. - */ - if (unlikely(!rcpu)) - continue; - - __clear_bit(bit, bitmap); + struct list_head *flush_list = this_cpu_ptr(cmap->flush_list); + struct xdp_bulk_queue *bq, *tmp; - /* Flush all frames in bulkq to real queue */ - bq = this_cpu_ptr(rcpu->bulkq); - bq_flush_to_queue(rcpu, bq, true); + list_for_each_entry_safe(bq, tmp, flush_list, flush_node) { + bq_flush_to_queue(bq, true); /* If already running, costs spin_lock_irqsave + smb_mb */ - wake_up_process(rcpu->kthread); + wake_up_process(bq->obj->kthread); } } diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index d50907193c09..d9dac010a2bf 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -25,9 +25,8 @@ * datapath always has a valid copy. However, the datapath does a "flush" * operation that pushes any pending packets in the driver outside the RCU * critical section. Each bpf_dtab_netdev tracks these pending operations using - * an atomic per-cpu bitmap. The bpf_dtab_netdev object will not be destroyed - * until all bits are cleared indicating outstanding flush operations have - * completed. + * a per-cpu flush list. The bpf_dtab_netdev object will not be destroyed until + * this list is empty, indicating outstanding flush operations have completed. * * BPF syscalls may race with BPF program calls on any of the update, delete * or lookup operations. As noted above the xchg() operation also keep the @@ -56,9 +55,13 @@ (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY) #define DEV_MAP_BULK_SIZE 16 +struct bpf_dtab_netdev; + struct xdp_bulk_queue { struct xdp_frame *q[DEV_MAP_BULK_SIZE]; + struct list_head flush_node; struct net_device *dev_rx; + struct bpf_dtab_netdev *obj; unsigned int count; }; @@ -73,23 +76,18 @@ struct bpf_dtab_netdev { struct bpf_dtab { struct bpf_map map; struct bpf_dtab_netdev **netdev_map; - unsigned long __percpu *flush_needed; + struct list_head __percpu *flush_list; struct list_head list; }; static DEFINE_SPINLOCK(dev_map_lock); static LIST_HEAD(dev_map_list); -static u64 dev_map_bitmap_size(const union bpf_attr *attr) -{ - return BITS_TO_LONGS((u64) attr->max_entries) * sizeof(unsigned long); -} - static struct bpf_map *dev_map_alloc(union bpf_attr *attr) { struct bpf_dtab *dtab; + int err, cpu; u64 cost; - int err; if (!capable(CAP_NET_ADMIN)) return ERR_PTR(-EPERM); @@ -107,7 +105,7 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr) /* make sure page count doesn't overflow */ cost = (u64) dtab->map.max_entries * sizeof(struct bpf_dtab_netdev *); - cost += dev_map_bitmap_size(attr) * num_possible_cpus(); + cost += sizeof(struct list_head) * num_possible_cpus(); /* if map size is larger than memlock limit, reject it */ err = bpf_map_charge_init(&dtab->map.memory, cost); @@ -116,28 +114,30 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr) err = -ENOMEM; - /* A per cpu bitfield with a bit per possible net device */ - dtab->flush_needed = __alloc_percpu_gfp(dev_map_bitmap_size(attr), - __alignof__(unsigned long), - GFP_KERNEL | __GFP_NOWARN); - if (!dtab->flush_needed) + dtab->flush_list = alloc_percpu(struct list_head); + if (!dtab->flush_list) goto free_charge; + for_each_possible_cpu(cpu) + INIT_LIST_HEAD(per_cpu_ptr(dtab->flush_list, cpu)); + dtab->netdev_map = bpf_map_area_alloc(dtab->map.max_entries * sizeof(struct bpf_dtab_netdev *), dtab->map.numa_node); if (!dtab->netdev_map) - goto free_charge; + goto free_percpu; spin_lock(&dev_map_lock); list_add_tail_rcu(&dtab->list, &dev_map_list); spin_unlock(&dev_map_lock); return &dtab->map; + +free_percpu: + free_percpu(dtab->flush_list); free_charge: bpf_map_charge_finish(&dtab->map.memory); free_dtab: - free_percpu(dtab->flush_needed); kfree(dtab); return ERR_PTR(err); } @@ -166,14 +166,14 @@ static void dev_map_free(struct bpf_map *map) rcu_barrier(); /* To ensure all pending flush operations have completed wait for flush - * bitmap to indicate all flush_needed bits to be zero on _all_ cpus. + * list to empty on _all_ cpus. * Because the above synchronize_rcu() ensures the map is disconnected - * from the program we can assume no new bits will be set. + * from the program we can assume no new items will be added. */ for_each_online_cpu(cpu) { - unsigned long *bitmap = per_cpu_ptr(dtab->flush_needed, cpu); + struct list_head *flush_list = per_cpu_ptr(dtab->flush_list, cpu); - while (!bitmap_empty(bitmap, dtab->map.max_entries)) + while (!list_empty(flush_list)) cond_resched(); } @@ -189,7 +189,7 @@ static void dev_map_free(struct bpf_map *map) kfree(dev); } - free_percpu(dtab->flush_needed); + free_percpu(dtab->flush_list); bpf_map_area_free(dtab->netdev_map); kfree(dtab); } @@ -211,18 +211,10 @@ static int dev_map_get_next_key(struct bpf_map *map, void *key, void *next_key) return 0; } -void __dev_map_insert_ctx(struct bpf_map *map, u32 bit) -{ - struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); - unsigned long *bitmap = this_cpu_ptr(dtab->flush_needed); - - __set_bit(bit, bitmap); -} - -static int bq_xmit_all(struct bpf_dtab_netdev *obj, - struct xdp_bulk_queue *bq, u32 flags, +static int bq_xmit_all(struct xdp_bulk_queue *bq, u32 flags, bool in_napi_ctx) { + struct bpf_dtab_netdev *obj = bq->obj; struct net_device *dev = obj->dev; int sent = 0, drops = 0, err = 0; int i; @@ -249,6 +241,7 @@ static int bq_xmit_all(struct bpf_dtab_netdev *obj, trace_xdp_devmap_xmit(&obj->dtab->map, obj->bit, sent, drops, bq->dev_rx, dev, err); bq->dev_rx = NULL; + __list_del_clearprev(&bq->flush_node); return 0; error: /* If ndo_xdp_xmit fails with an errno, no frames have been @@ -271,31 +264,18 @@ static int bq_xmit_all(struct bpf_dtab_netdev *obj, * from the driver before returning from its napi->poll() routine. The poll() * routine is called either from busy_poll context or net_rx_action signaled * from NET_RX_SOFTIRQ. Either way the poll routine must complete before the - * net device can be torn down. On devmap tear down we ensure the ctx bitmap - * is zeroed before completing to ensure all flush operations have completed. + * net device can be torn down. On devmap tear down we ensure the flush list + * is empty before completing to ensure all flush operations have completed. */ void __dev_map_flush(struct bpf_map *map) { struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); - unsigned long *bitmap = this_cpu_ptr(dtab->flush_needed); - u32 bit; + struct list_head *flush_list = this_cpu_ptr(dtab->flush_list); + struct xdp_bulk_queue *bq, *tmp; rcu_read_lock(); - for_each_set_bit(bit, bitmap, map->max_entries) { - struct bpf_dtab_netdev *dev = READ_ONCE(dtab->netdev_map[bit]); - struct xdp_bulk_queue *bq; - - /* This is possible if the dev entry is removed by user space - * between xdp redirect and flush op. - */ - if (unlikely(!dev)) - continue; - - bq = this_cpu_ptr(dev->bulkq); - bq_xmit_all(dev, bq, XDP_XMIT_FLUSH, true); - - __clear_bit(bit, bitmap); - } + list_for_each_entry_safe(bq, tmp, flush_list, flush_node) + bq_xmit_all(bq, XDP_XMIT_FLUSH, true); rcu_read_unlock(); } @@ -322,10 +302,11 @@ static int bq_enqueue(struct bpf_dtab_netdev *obj, struct xdp_frame *xdpf, struct net_device *dev_rx) { + struct list_head *flush_list = this_cpu_ptr(obj->dtab->flush_list); struct xdp_bulk_queue *bq = this_cpu_ptr(obj->bulkq); if (unlikely(bq->count == DEV_MAP_BULK_SIZE)) - bq_xmit_all(obj, bq, 0, true); + bq_xmit_all(bq, 0, true); /* Ingress dev_rx will be the same for all xdp_frame's in * bulk_queue, because bq stored per-CPU and must be flushed @@ -335,6 +316,10 @@ static int bq_enqueue(struct bpf_dtab_netdev *obj, struct xdp_frame *xdpf, bq->dev_rx = dev_rx; bq->q[bq->count++] = xdpf; + + if (!bq->flush_node.prev) + list_add(&bq->flush_node, flush_list); + return 0; } @@ -385,17 +370,12 @@ static void dev_map_flush_old(struct bpf_dtab_netdev *dev) { if (dev->dev->netdev_ops->ndo_xdp_xmit) { struct xdp_bulk_queue *bq; - unsigned long *bitmap; - int cpu; rcu_read_lock(); for_each_online_cpu(cpu) { - bitmap = per_cpu_ptr(dev->dtab->flush_needed, cpu); - __clear_bit(dev->bit, bitmap); - bq = per_cpu_ptr(dev->bulkq, cpu); - bq_xmit_all(dev, bq, XDP_XMIT_FLUSH, false); + bq_xmit_all(bq, XDP_XMIT_FLUSH, false); } rcu_read_unlock(); } @@ -442,8 +422,10 @@ static int dev_map_update_elem(struct bpf_map *map, void *key, void *value, struct net *net = current->nsproxy->net_ns; gfp_t gfp = GFP_ATOMIC | __GFP_NOWARN; struct bpf_dtab_netdev *dev, *old_dev; - u32 i = *(u32 *)key; u32 ifindex = *(u32 *)value; + struct xdp_bulk_queue *bq; + u32 i = *(u32 *)key; + int cpu; if (unlikely(map_flags > BPF_EXIST)) return -EINVAL; @@ -466,6 +448,11 @@ static int dev_map_update_elem(struct bpf_map *map, void *key, void *value, return -ENOMEM; } + for_each_possible_cpu(cpu) { + bq = per_cpu_ptr(dev->bulkq, cpu); + bq->obj = dev; + } + dev->dev = dev_get_by_index(net, ifindex); if (!dev->dev) { free_percpu(dev->bulkq); diff --git a/net/core/filter.c b/net/core/filter.c index 84ea1683e751..2b5ba884becf 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -3497,7 +3497,6 @@ static int __bpf_tx_xdp_map(struct net_device *dev_rx, void *fwd, err = dev_map_enqueue(dst, xdp, dev_rx); if (err) return err; - __dev_map_insert_ctx(map, index); break; } case BPF_MAP_TYPE_CPUMAP: { @@ -3506,7 +3505,6 @@ static int __bpf_tx_xdp_map(struct net_device *dev_rx, void *fwd, err = cpu_map_enqueue(rcpu, xdp, dev_rx); if (err) return err; - __cpu_map_insert_ctx(map, index); break; } case BPF_MAP_TYPE_XSKMAP: { -- Gitee From 77add879587e2ad4628edbcb58b370660fcc217a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Toke=20H=C3=B8iland-J=C3=B8rgensen?= Date: Fri, 28 Jun 2019 11:12:35 +0200 Subject: [PATCH 14/26] devmap: Allow map lookups from eBPF MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ANBZ: #5530 commit 0cdbb4b09a0658b72c563638d476113aadd91afb upstream. We don't currently allow lookups into a devmap from eBPF, because the map lookup returns a pointer directly to the dev->ifindex, which shouldn't be modifiable from eBPF. However, being able to do lookups in devmaps is useful to know (e.g.) whether forwarding to a specific interface is enabled. Currently, programs work around this by keeping a shadow map of another type which indicates whether a map index is valid. Since we now have a flag to make maps read-only from the eBPF side, we can simply lift the lookup restriction if we make sure this flag is always set. Signed-off-by: Toke Høiland-Jørgensen Acked-by: Jonathan Lemon Acked-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Signed-off-by: Yuanhe Shu --- kernel/bpf/devmap.c | 5 +++++ kernel/bpf/verifier.c | 7 ++----- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index d9dac010a2bf..555875ebf56b 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -97,6 +97,11 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr) attr->value_size != 4 || attr->map_flags & ~DEV_CREATE_FLAG_MASK) return ERR_PTR(-EINVAL); + /* Lookup returns a pointer straight to dev->ifindex, so make sure the + * verifier prevents writes from the BPF side + */ + attr->map_flags |= BPF_F_RDONLY_PROG; + dtab = kzalloc(sizeof(*dtab), GFP_USER); if (!dtab) return ERR_PTR(-ENOMEM); diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 709c7a350a1d..c356d6d5a091 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -3469,12 +3469,9 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, if (func_id != BPF_FUNC_get_local_storage) goto error; break; - /* devmap returns a pointer to a live net_device ifindex that we cannot - * allow to be modified from bpf side. So do not allow lookup elements - * for now. - */ case BPF_MAP_TYPE_DEVMAP: - if (func_id != BPF_FUNC_redirect_map) + if (func_id != BPF_FUNC_redirect_map && + func_id != BPF_FUNC_map_lookup_elem) goto error; break; /* Restrict bpf side of cpumap and xskmap, open when use-cases -- Gitee From 656373b1d7ccfb1ee8426110e7817b92cf45bb7a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20T=C3=B6pel?= Date: Tue, 22 Oct 2019 09:22:06 +0200 Subject: [PATCH 15/26] libbpf: Use implicit XSKMAP lookup from AF_XDP XDP program MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ANBZ: #5530 commit d7d962a095474fcd94861d5f4cccada2e4215aae upstream. In commit 43e74c0267a3 ("bpf_xdp_redirect_map: Perform map lookup in eBPF helper") the bpf_redirect_map() helper learned to do map lookup, which means that the explicit lookup in the XDP program for AF_XDP is not needed for post-5.3 kernels. This commit adds the implicit map lookup with default action, which improves the performance for the "rx_drop" [1] scenario with ~4%. For pre-5.3 kernels, the bpf_redirect_map() returns XDP_ABORTED, and a fallback path for backward compatibility is entered, where explicit lookup is still performed. This means a slight regression for older kernels (an additional bpf_redirect_map() call), but I consider that a fair punishment for users not upgrading their kernels. ;-) v1->v2: Backward compatibility (Toke) [2] v2->v3: Avoid masking/zero-extension by using JMP32 [3] [1] # xdpsock -i eth0 -z -r [2] https://lore.kernel.org/bpf/87pnirb3dc.fsf@toke.dk/ [3] https://lore.kernel.org/bpf/87v9sip0i8.fsf@toke.dk/ Suggested-by: Toke Høiland-Jørgensen Signed-off-by: Björn Töpel Signed-off-by: Alexei Starovoitov Acked-by: Toke Høiland-Jørgensen Link: https://lore.kernel.org/bpf/20191022072206.6318-1-bjorn.topel@gmail.com Signed-off-by: Yuanhe Shu --- tools/lib/bpf/xsk.c | 42 ++++++++++++++++++++++++++++++++---------- 1 file changed, 32 insertions(+), 10 deletions(-) diff --git a/tools/lib/bpf/xsk.c b/tools/lib/bpf/xsk.c index a2b002964267..1795b5c56b63 100644 --- a/tools/lib/bpf/xsk.c +++ b/tools/lib/bpf/xsk.c @@ -264,33 +264,55 @@ static int xsk_load_xdp_prog(struct xsk_socket *xsk) /* This is the C-program: * SEC("xdp_sock") int xdp_sock_prog(struct xdp_md *ctx) * { - * int index = ctx->rx_queue_index; + * int ret, index = ctx->rx_queue_index; * * // A set entry here means that the correspnding queue_id * // has an active AF_XDP socket bound to it. + * ret = bpf_redirect_map(&xsks_map, index, XDP_PASS); + * if (ret > 0) + * return ret; + * + * // Fallback for pre-5.3 kernels, not supporting default + * // action in the flags parameter. * if (bpf_map_lookup_elem(&xsks_map, &index)) * return bpf_redirect_map(&xsks_map, index, 0); - * * return XDP_PASS; * } */ struct bpf_insn prog[] = { - /* r1 = *(u32 *)(r1 + 16) */ - BPF_LDX_MEM(BPF_W, BPF_REG_1, BPF_REG_1, 16), - /* *(u32 *)(r10 - 4) = r1 */ - BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_1, -4), + /* r2 = *(u32 *)(r1 + 16) */ + BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, 16), + /* *(u32 *)(r10 - 4) = r2 */ + BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_2, -4), + /* r1 = xskmap[] */ + BPF_LD_MAP_FD(BPF_REG_1, xsk->xsks_map_fd), + /* r3 = XDP_PASS */ + BPF_MOV64_IMM(BPF_REG_3, 2), + /* call bpf_redirect_map */ + BPF_EMIT_CALL(BPF_FUNC_redirect_map), + /* if w0 != 0 goto pc+13 */ + BPF_JMP32_IMM(BPF_JSGT, BPF_REG_0, 0, 13), + /* r2 = r10 */ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + /* r2 += -4 */ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4), + /* r1 = xskmap[] */ BPF_LD_MAP_FD(BPF_REG_1, xsk->xsks_map_fd), + /* call bpf_map_lookup_elem */ BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem), + /* r1 = r0 */ BPF_MOV64_REG(BPF_REG_1, BPF_REG_0), - BPF_MOV32_IMM(BPF_REG_0, 2), - /* if r1 == 0 goto +5 */ + /* r0 = XDP_PASS */ + BPF_MOV64_IMM(BPF_REG_0, 2), + /* if r1 == 0 goto pc+5 */ BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, 0, 5), /* r2 = *(u32 *)(r10 - 4) */ - BPF_LD_MAP_FD(BPF_REG_1, xsk->xsks_map_fd), BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_10, -4), - BPF_MOV32_IMM(BPF_REG_3, 0), + /* r1 = xskmap[] */ + BPF_LD_MAP_FD(BPF_REG_1, xsk->xsks_map_fd), + /* r3 = 0 */ + BPF_MOV64_IMM(BPF_REG_3, 0), + /* call bpf_redirect_map */ BPF_EMIT_CALL(BPF_FUNC_redirect_map), /* The jumps are to this instruction */ BPF_EXIT_INSN(), -- Gitee From 8356826a6d3ab3d18eedeb0826cec6bce6a99bee Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Toke=20H=C3=B8iland-J=C3=B8rgensen?= Date: Fri, 26 Jul 2019 18:06:52 +0200 Subject: [PATCH 16/26] include/bpf.h: Remove map_insert_ctx() stubs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ANBZ: #5530 commit 6dbff13ca8a2ad2fddd904c2e789dd5e59a8644c upstream. When we changed the device and CPU maps to use linked lists instead of bitmaps, we also removed the need for the map_insert_ctx() helpers to keep track of the bitmaps inside each map. However, it seems I forgot to remove the function definitions stubs, so remove those here. Signed-off-by: Toke Høiland-Jørgensen Acked-by: Yonghong Song Acked-by: Jesper Dangaard Brouer Signed-off-by: Alexei Starovoitov Signed-off-by: Yuanhe Shu --- include/linux/bpf.h | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 1c10e1d0f482..88c9a1521b87 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -727,7 +727,6 @@ struct xdp_buff; struct sk_buff; struct bpf_dtab_netdev *__dev_map_lookup_elem(struct bpf_map *map, u32 key); -void __dev_map_insert_ctx(struct bpf_map *map, u32 index); void __dev_map_flush(struct bpf_map *map); int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp, struct net_device *dev_rx); @@ -735,7 +734,6 @@ int dev_map_generic_redirect(struct bpf_dtab_netdev *dst, struct sk_buff *skb, struct bpf_prog *xdp_prog); struct bpf_cpu_map_entry *__cpu_map_lookup_elem(struct bpf_map *map, u32 key); -void __cpu_map_insert_ctx(struct bpf_map *map, u32 index); void __cpu_map_flush(struct bpf_map *map); int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_buff *xdp, struct net_device *dev_rx); @@ -821,10 +819,6 @@ static inline struct net_device *__dev_map_lookup_elem(struct bpf_map *map, return NULL; } -static inline void __dev_map_insert_ctx(struct bpf_map *map, u32 index) -{ -} - static inline void __dev_map_flush(struct bpf_map *map) { } @@ -854,10 +848,6 @@ struct bpf_cpu_map_entry *__cpu_map_lookup_elem(struct bpf_map *map, u32 key) return NULL; } -static inline void __cpu_map_insert_ctx(struct bpf_map *map, u32 index) -{ -} - static inline void __cpu_map_flush(struct bpf_map *map) { } -- Gitee From 3919b11ad8b1d628cb10b1c81c6a8218c4443d3f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Toke=20H=C3=B8iland-J=C3=B8rgensen?= Date: Fri, 26 Jul 2019 18:06:53 +0200 Subject: [PATCH 17/26] xdp: Refactor devmap allocation code for reuse MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ANBZ: #5530 commit fca16e51078e8e5c0af839426b3d2dcd2bede135 upstream. The subsequent patch to add a new devmap sub-type can re-use much of the initialisation and allocation code, so refactor it into separate functions. Signed-off-by: Toke Høiland-Jørgensen Acked-by: Yonghong Song Acked-by: Jesper Dangaard Brouer Signed-off-by: Alexei Starovoitov Signed-off-by: Yuanhe Shu --- kernel/bpf/devmap.c | 136 +++++++++++++++++++++++++++----------------- 1 file changed, 83 insertions(+), 53 deletions(-) diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index 555875ebf56b..cfc445b29247 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -68,9 +68,9 @@ struct xdp_bulk_queue { struct bpf_dtab_netdev { struct net_device *dev; /* must be first member, due to tracepoint */ struct bpf_dtab *dtab; - unsigned int bit; struct xdp_bulk_queue __percpu *bulkq; struct rcu_head rcu; + unsigned int idx; /* keep track of map index for tracepoint */ }; struct bpf_dtab { @@ -83,28 +83,21 @@ struct bpf_dtab { static DEFINE_SPINLOCK(dev_map_lock); static LIST_HEAD(dev_map_list); -static struct bpf_map *dev_map_alloc(union bpf_attr *attr) +static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr) { - struct bpf_dtab *dtab; int err, cpu; u64 cost; - if (!capable(CAP_NET_ADMIN)) - return ERR_PTR(-EPERM); - /* check sanity of attributes */ if (attr->max_entries == 0 || attr->key_size != 4 || attr->value_size != 4 || attr->map_flags & ~DEV_CREATE_FLAG_MASK) - return ERR_PTR(-EINVAL); + return -EINVAL; /* Lookup returns a pointer straight to dev->ifindex, so make sure the * verifier prevents writes from the BPF side */ attr->map_flags |= BPF_F_RDONLY_PROG; - dtab = kzalloc(sizeof(*dtab), GFP_USER); - if (!dtab) - return ERR_PTR(-ENOMEM); bpf_map_init_from_attr(&dtab->map, attr); @@ -115,9 +108,7 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr) /* if map size is larger than memlock limit, reject it */ err = bpf_map_charge_init(&dtab->map.memory, cost); if (err) - goto free_dtab; - - err = -ENOMEM; + return -EINVAL; dtab->flush_list = alloc_percpu(struct list_head); if (!dtab->flush_list) @@ -132,19 +123,38 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr) if (!dtab->netdev_map) goto free_percpu; - spin_lock(&dev_map_lock); - list_add_tail_rcu(&dtab->list, &dev_map_list); - spin_unlock(&dev_map_lock); - - return &dtab->map; + return 0; free_percpu: free_percpu(dtab->flush_list); free_charge: bpf_map_charge_finish(&dtab->map.memory); -free_dtab: - kfree(dtab); - return ERR_PTR(err); + return -ENOMEM; +} + +static struct bpf_map *dev_map_alloc(union bpf_attr *attr) +{ + struct bpf_dtab *dtab; + int err; + + if (!capable(CAP_NET_ADMIN)) + return ERR_PTR(-EPERM); + + dtab = kzalloc(sizeof(*dtab), GFP_USER); + if (!dtab) + return ERR_PTR(-ENOMEM); + + err = dev_map_init_map(dtab, attr); + if (err) { + kfree(dtab); + return ERR_PTR(err); + } + + spin_lock(&dev_map_lock); + list_add_tail_rcu(&dtab->list, &dev_map_list); + spin_unlock(&dev_map_lock); + + return &dtab->map; } static void dev_map_free(struct bpf_map *map) @@ -243,7 +253,7 @@ static int bq_xmit_all(struct xdp_bulk_queue *bq, u32 flags, out: bq->count = 0; - trace_xdp_devmap_xmit(&obj->dtab->map, obj->bit, + trace_xdp_devmap_xmit(&obj->dtab->map, obj->idx, sent, drops, bq->dev_rx, dev, err); bq->dev_rx = NULL; __list_del_clearprev(&bq->flush_node); @@ -420,17 +430,52 @@ static int dev_map_delete_elem(struct bpf_map *map, void *key) return 0; } -static int dev_map_update_elem(struct bpf_map *map, void *key, void *value, - u64 map_flags) +static struct bpf_dtab_netdev *__dev_map_alloc_node(struct net *net, + struct bpf_dtab *dtab, + u32 ifindex, + unsigned int idx) { - struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); - struct net *net = current->nsproxy->net_ns; gfp_t gfp = GFP_ATOMIC | __GFP_NOWARN; + struct bpf_dtab_netdev *dev; + struct xdp_bulk_queue *bq; + int cpu; + + dev = kmalloc_node(sizeof(*dev), gfp, dtab->map.numa_node); + if (!dev) + return ERR_PTR(-ENOMEM); + + dev->bulkq = __alloc_percpu_gfp(sizeof(*dev->bulkq), + sizeof(void *), gfp); + if (!dev->bulkq) { + kfree(dev); + return ERR_PTR(-ENOMEM); + } + + for_each_possible_cpu(cpu) { + bq = per_cpu_ptr(dev->bulkq, cpu); + bq->obj = dev; + } + + dev->dev = dev_get_by_index(net, ifindex); + if (!dev->dev) { + free_percpu(dev->bulkq); + kfree(dev); + return ERR_PTR(-EINVAL); + } + + dev->idx = idx; + dev->dtab = dtab; + + return dev; +} + +static int __dev_map_update_elem(struct net *net, struct bpf_map *map, + void *key, void *value, u64 map_flags) +{ + struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); struct bpf_dtab_netdev *dev, *old_dev; u32 ifindex = *(u32 *)value; - struct xdp_bulk_queue *bq; u32 i = *(u32 *)key; - int cpu; if (unlikely(map_flags > BPF_EXIST)) return -EINVAL; @@ -442,31 +487,9 @@ static int dev_map_update_elem(struct bpf_map *map, void *key, void *value, if (!ifindex) { dev = NULL; } else { - dev = kmalloc_node(sizeof(*dev), gfp, map->numa_node); - if (!dev) - return -ENOMEM; - - dev->bulkq = __alloc_percpu_gfp(sizeof(*dev->bulkq), - sizeof(void *), gfp); - if (!dev->bulkq) { - kfree(dev); - return -ENOMEM; - } - - for_each_possible_cpu(cpu) { - bq = per_cpu_ptr(dev->bulkq, cpu); - bq->obj = dev; - } - - dev->dev = dev_get_by_index(net, ifindex); - if (!dev->dev) { - free_percpu(dev->bulkq); - kfree(dev); - return -EINVAL; - } - - dev->bit = i; - dev->dtab = dtab; + dev = __dev_map_alloc_node(net, dtab, ifindex, i); + if (IS_ERR(dev)) + return PTR_ERR(dev); } /* Use call_rcu() here to ensure rcu critical sections have completed @@ -480,6 +503,13 @@ static int dev_map_update_elem(struct bpf_map *map, void *key, void *value, return 0; } +static int dev_map_update_elem(struct bpf_map *map, void *key, void *value, + u64 map_flags) +{ + return __dev_map_update_elem(current->nsproxy->net_ns, + map, key, value, map_flags); +} + const struct bpf_map_ops dev_map_ops = { .map_alloc = dev_map_alloc, .map_free = dev_map_free, -- Gitee From 12d21b4efb42db911c4b45fdedee9deb3b150943 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Toke=20H=C3=B8iland-J=C3=B8rgensen?= Date: Fri, 26 Jul 2019 18:06:55 +0200 Subject: [PATCH 18/26] xdp: Add devmap_hash map type for looking up devices by hashed index MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ANBZ: #5530 commit 6f9d451ab1a33728adb72d7ff66a7b374d665176 upstream. A common pattern when using xdp_redirect_map() is to create a device map where the lookup key is simply ifindex. Because device maps are arrays, this leaves holes in the map, and the map has to be sized to fit the largest ifindex, regardless of how many devices actually are actually needed in the map. This patch adds a second type of device map where the key is looked up using a hashmap, instead of being used as an array index. This allows maps to be densely packed, so they can be smaller. Signed-off-by: Toke Høiland-Jørgensen Acked-by: Yonghong Song Acked-by: Jesper Dangaard Brouer Signed-off-by: Alexei Starovoitov Signed-off-by: Yuanhe Shu --- include/linux/bpf.h | 7 ++ include/linux/bpf_types.h | 1 + include/trace/events/xdp.h | 3 +- include/uapi/linux/bpf.h | 1 + kernel/bpf/devmap.c | 200 +++++++++++++++++++++++++++++++++++++ kernel/bpf/verifier.c | 2 + net/core/filter.c | 9 +- 7 files changed, 220 insertions(+), 3 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 88c9a1521b87..998365350256 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -727,6 +727,7 @@ struct xdp_buff; struct sk_buff; struct bpf_dtab_netdev *__dev_map_lookup_elem(struct bpf_map *map, u32 key); +struct bpf_dtab_netdev *__dev_map_hash_lookup_elem(struct bpf_map *map, u32 key); void __dev_map_flush(struct bpf_map *map); int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp, struct net_device *dev_rx); @@ -819,6 +820,12 @@ static inline struct net_device *__dev_map_lookup_elem(struct bpf_map *map, return NULL; } +static inline struct net_device *__dev_map_hash_lookup_elem(struct bpf_map *map, + u32 key) +{ + return NULL; +} + static inline void __dev_map_flush(struct bpf_map *map) { } diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h index eec5aeeeaf92..36a9c2325176 100644 --- a/include/linux/bpf_types.h +++ b/include/linux/bpf_types.h @@ -62,6 +62,7 @@ BPF_MAP_TYPE(BPF_MAP_TYPE_ARRAY_OF_MAPS, array_of_maps_map_ops) BPF_MAP_TYPE(BPF_MAP_TYPE_HASH_OF_MAPS, htab_of_maps_map_ops) #ifdef CONFIG_NET BPF_MAP_TYPE(BPF_MAP_TYPE_DEVMAP, dev_map_ops) +BPF_MAP_TYPE(BPF_MAP_TYPE_DEVMAP_HASH, dev_map_hash_ops) BPF_MAP_TYPE(BPF_MAP_TYPE_SK_STORAGE, sk_storage_map_ops) #if defined(CONFIG_BPF_STREAM_PARSER) BPF_MAP_TYPE(BPF_MAP_TYPE_SOCKMAP, sock_map_ops) diff --git a/include/trace/events/xdp.h b/include/trace/events/xdp.h index 31a081f58e4c..6179af019ac4 100644 --- a/include/trace/events/xdp.h +++ b/include/trace/events/xdp.h @@ -146,7 +146,8 @@ struct _bpf_dtab_netdev { #endif /* __DEVMAP_OBJ_TYPE */ #define devmap_ifindex(fwd, map) \ - ((map->map_type == BPF_MAP_TYPE_DEVMAP) ? \ + ((map->map_type == BPF_MAP_TYPE_DEVMAP || \ + map->map_type == BPF_MAP_TYPE_DEVMAP_HASH) ? \ ((struct _bpf_dtab_netdev *)fwd)->dev->ifindex : 0) #define _trace_xdp_redirect_map(dev, xdp, fwd, map, idx) \ diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 36b37da0f02c..e7f524b1f059 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -134,6 +134,7 @@ enum bpf_map_type { BPF_MAP_TYPE_QUEUE, BPF_MAP_TYPE_STACK, BPF_MAP_TYPE_SK_STORAGE, + BPF_MAP_TYPE_DEVMAP_HASH, }; /* Note that tracing related programs such as diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index cfc445b29247..ad87873076da 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -45,6 +45,12 @@ * notifier hook walks the map we know that new dev references can not be * added by the user because core infrastructure ensures dev_get_by_index() * calls will fail at this point. + * + * The devmap_hash type is a map type which interprets keys as ifindexes and + * indexes these using a hashmap. This allows maps that use ifindex as key to be + * densely packed instead of having holes in the lookup array for unused + * ifindexes. The setup and packet enqueue/send code is shared between the two + * types of devmap; only the lookup and insertion is different. */ #include #include @@ -67,6 +73,7 @@ struct xdp_bulk_queue { struct bpf_dtab_netdev { struct net_device *dev; /* must be first member, due to tracepoint */ + struct hlist_node index_hlist; struct bpf_dtab *dtab; struct xdp_bulk_queue __percpu *bulkq; struct rcu_head rcu; @@ -78,11 +85,30 @@ struct bpf_dtab { struct bpf_dtab_netdev **netdev_map; struct list_head __percpu *flush_list; struct list_head list; + + /* these are only used for DEVMAP_HASH type maps */ + struct hlist_head *dev_index_head; + spinlock_t index_lock; + unsigned int items; + u32 n_buckets; }; static DEFINE_SPINLOCK(dev_map_lock); static LIST_HEAD(dev_map_list); +static struct hlist_head *dev_map_create_hash(unsigned int entries) +{ + int i; + struct hlist_head *hash; + + hash = kmalloc_array(entries, sizeof(*hash), GFP_KERNEL); + if (hash != NULL) + for (i = 0; i < entries; i++) + INIT_HLIST_HEAD(&hash[i]); + + return hash; +} + static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr) { int err, cpu; @@ -105,6 +131,14 @@ static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr) cost = (u64) dtab->map.max_entries * sizeof(struct bpf_dtab_netdev *); cost += sizeof(struct list_head) * num_possible_cpus(); + if (attr->map_type == BPF_MAP_TYPE_DEVMAP_HASH) { + dtab->n_buckets = roundup_pow_of_two(dtab->map.max_entries); + + if (!dtab->n_buckets) /* Overflow check */ + return -EINVAL; + cost += sizeof(struct hlist_head) * dtab->n_buckets; + } + /* if map size is larger than memlock limit, reject it */ err = bpf_map_charge_init(&dtab->map.memory, cost); if (err) @@ -123,8 +157,18 @@ static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr) if (!dtab->netdev_map) goto free_percpu; + if (attr->map_type == BPF_MAP_TYPE_DEVMAP_HASH) { + dtab->dev_index_head = dev_map_create_hash(dtab->n_buckets); + if (!dtab->dev_index_head) + goto free_map_area; + + spin_lock_init(&dtab->index_lock); + } + return 0; +free_map_area: + bpf_map_area_free(dtab->netdev_map); free_percpu: free_percpu(dtab->flush_list); free_charge: @@ -206,6 +250,7 @@ static void dev_map_free(struct bpf_map *map) free_percpu(dtab->flush_list); bpf_map_area_free(dtab->netdev_map); + kfree(dtab->dev_index_head); kfree(dtab); } @@ -226,6 +271,70 @@ static int dev_map_get_next_key(struct bpf_map *map, void *key, void *next_key) return 0; } +static inline struct hlist_head *dev_map_index_hash(struct bpf_dtab *dtab, + int idx) +{ + return &dtab->dev_index_head[idx & (dtab->n_buckets - 1)]; +} + +struct bpf_dtab_netdev *__dev_map_hash_lookup_elem(struct bpf_map *map, u32 key) +{ + struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); + struct hlist_head *head = dev_map_index_hash(dtab, key); + struct bpf_dtab_netdev *dev; + + hlist_for_each_entry_rcu(dev, head, index_hlist) + if (dev->idx == key) + return dev; + + return NULL; +} + +static int dev_map_hash_get_next_key(struct bpf_map *map, void *key, + void *next_key) +{ + struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); + u32 idx, *next = next_key; + struct bpf_dtab_netdev *dev, *next_dev; + struct hlist_head *head; + int i = 0; + + if (!key) + goto find_first; + + idx = *(u32 *)key; + + dev = __dev_map_hash_lookup_elem(map, idx); + if (!dev) + goto find_first; + + next_dev = hlist_entry_safe(rcu_dereference_raw(hlist_next_rcu(&dev->index_hlist)), + struct bpf_dtab_netdev, index_hlist); + + if (next_dev) { + *next = next_dev->idx; + return 0; + } + + i = idx & (dtab->n_buckets - 1); + i++; + + find_first: + for (; i < dtab->n_buckets; i++) { + head = dev_map_index_hash(dtab, i); + + next_dev = hlist_entry_safe(rcu_dereference_raw(hlist_first_rcu(head)), + struct bpf_dtab_netdev, + index_hlist); + if (next_dev) { + *next = next_dev->idx; + return 0; + } + } + + return -ENOENT; +} + static int bq_xmit_all(struct xdp_bulk_queue *bq, u32 flags, bool in_napi_ctx) { @@ -381,6 +490,15 @@ static void *dev_map_lookup_elem(struct bpf_map *map, void *key) return dev ? &dev->ifindex : NULL; } +static void *dev_map_hash_lookup_elem(struct bpf_map *map, void *key) +{ + struct bpf_dtab_netdev *obj = __dev_map_hash_lookup_elem(map, + *(u32 *)key); + struct net_device *dev = obj ? obj->dev : NULL; + + return dev ? &dev->ifindex : NULL; +} + static void dev_map_flush_old(struct bpf_dtab_netdev *dev) { if (dev->dev->netdev_ops->ndo_xdp_xmit) { @@ -430,6 +548,28 @@ static int dev_map_delete_elem(struct bpf_map *map, void *key) return 0; } +static int dev_map_hash_delete_elem(struct bpf_map *map, void *key) +{ + struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); + struct bpf_dtab_netdev *old_dev; + int k = *(u32 *)key; + unsigned long flags; + int ret = -ENOENT; + + spin_lock_irqsave(&dtab->index_lock, flags); + + old_dev = __dev_map_hash_lookup_elem(map, k); + if (old_dev) { + dtab->items--; + hlist_del_init_rcu(&old_dev->index_hlist); + call_rcu(&old_dev->rcu, __dev_map_entry_free); + ret = 0; + } + spin_unlock_irqrestore(&dtab->index_lock, flags); + + return ret; +} + static struct bpf_dtab_netdev *__dev_map_alloc_node(struct net *net, struct bpf_dtab *dtab, u32 ifindex, @@ -510,6 +650,56 @@ static int dev_map_update_elem(struct bpf_map *map, void *key, void *value, map, key, value, map_flags); } +static int __dev_map_hash_update_elem(struct net *net, struct bpf_map *map, + void *key, void *value, u64 map_flags) +{ + struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); + struct bpf_dtab_netdev *dev, *old_dev; + u32 ifindex = *(u32 *)value; + u32 idx = *(u32 *)key; + unsigned long flags; + + if (unlikely(map_flags > BPF_EXIST || !ifindex)) + return -EINVAL; + + old_dev = __dev_map_hash_lookup_elem(map, idx); + if (old_dev && (map_flags & BPF_NOEXIST)) + return -EEXIST; + + dev = __dev_map_alloc_node(net, dtab, ifindex, idx); + if (IS_ERR(dev)) + return PTR_ERR(dev); + + spin_lock_irqsave(&dtab->index_lock, flags); + + if (old_dev) { + hlist_del_rcu(&old_dev->index_hlist); + } else { + if (dtab->items >= dtab->map.max_entries) { + spin_unlock_irqrestore(&dtab->index_lock, flags); + call_rcu(&dev->rcu, __dev_map_entry_free); + return -E2BIG; + } + dtab->items++; + } + + hlist_add_head_rcu(&dev->index_hlist, + dev_map_index_hash(dtab, idx)); + spin_unlock_irqrestore(&dtab->index_lock, flags); + + if (old_dev) + call_rcu(&old_dev->rcu, __dev_map_entry_free); + + return 0; +} + +static int dev_map_hash_update_elem(struct bpf_map *map, void *key, void *value, + u64 map_flags) +{ + return __dev_map_hash_update_elem(current->nsproxy->net_ns, + map, key, value, map_flags); +} + const struct bpf_map_ops dev_map_ops = { .map_alloc = dev_map_alloc, .map_free = dev_map_free, @@ -520,6 +710,16 @@ const struct bpf_map_ops dev_map_ops = { .map_check_btf = map_check_no_btf, }; +const struct bpf_map_ops dev_map_hash_ops = { + .map_alloc = dev_map_alloc, + .map_free = dev_map_free, + .map_get_next_key = dev_map_hash_get_next_key, + .map_lookup_elem = dev_map_hash_lookup_elem, + .map_update_elem = dev_map_hash_update_elem, + .map_delete_elem = dev_map_hash_delete_elem, + .map_check_btf = map_check_no_btf, +}; + static int dev_map_notification(struct notifier_block *notifier, ulong event, void *ptr) { diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index c356d6d5a091..6f2b7f3b1a7b 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -3470,6 +3470,7 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, goto error; break; case BPF_MAP_TYPE_DEVMAP: + case BPF_MAP_TYPE_DEVMAP_HASH: if (func_id != BPF_FUNC_redirect_map && func_id != BPF_FUNC_map_lookup_elem) goto error; @@ -3552,6 +3553,7 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, break; case BPF_FUNC_redirect_map: if (map->map_type != BPF_MAP_TYPE_DEVMAP && + map->map_type != BPF_MAP_TYPE_DEVMAP_HASH && map->map_type != BPF_MAP_TYPE_CPUMAP && map->map_type != BPF_MAP_TYPE_XSKMAP) goto error; diff --git a/net/core/filter.c b/net/core/filter.c index 2b5ba884becf..ca1bd466b3c1 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -3491,7 +3491,8 @@ static int __bpf_tx_xdp_map(struct net_device *dev_rx, void *fwd, int err; switch (map->map_type) { - case BPF_MAP_TYPE_DEVMAP: { + case BPF_MAP_TYPE_DEVMAP: + case BPF_MAP_TYPE_DEVMAP_HASH: { struct bpf_dtab_netdev *dst = fwd; err = dev_map_enqueue(dst, xdp, dev_rx); @@ -3528,6 +3529,7 @@ void xdp_do_flush_map(void) if (map) { switch (map->map_type) { case BPF_MAP_TYPE_DEVMAP: + case BPF_MAP_TYPE_DEVMAP_HASH: __dev_map_flush(map); break; case BPF_MAP_TYPE_CPUMAP: @@ -3548,6 +3550,8 @@ static void *__xdp_map_lookup_elem(struct bpf_map *map, u32 index) switch (map->map_type) { case BPF_MAP_TYPE_DEVMAP: return __dev_map_lookup_elem(map, index); + case BPF_MAP_TYPE_DEVMAP_HASH: + return __dev_map_hash_lookup_elem(map, index); case BPF_MAP_TYPE_CPUMAP: return __cpu_map_lookup_elem(map, index); case BPF_MAP_TYPE_XSKMAP: @@ -3647,7 +3651,8 @@ static int xdp_do_generic_redirect_map(struct net_device *dev, ri->tgt_value = NULL; WRITE_ONCE(ri->map, NULL); - if (map->map_type == BPF_MAP_TYPE_DEVMAP) { + if (map->map_type == BPF_MAP_TYPE_DEVMAP || + map->map_type == BPF_MAP_TYPE_DEVMAP_HASH) { struct bpf_dtab_netdev *dst = fwd; err = dev_map_generic_redirect(dst, skb, xdp_prog); -- Gitee From 788b5402c8874259d298a28dbd8692203b15bfa5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Toke=20H=C3=B8iland-J=C3=B8rgensen?= Date: Fri, 26 Jul 2019 18:06:56 +0200 Subject: [PATCH 19/26] tools/include/uapi: Add devmap_hash BPF map type MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ANBZ: #5530 commit 10fbe21163fc6f05f5991d9cf9a2da9bdee4e834 upstream. This adds the devmap_hash BPF map type to the uapi headers in tools/. Signed-off-by: Toke Høiland-Jørgensen Acked-by: Yonghong Song Acked-by: Jesper Dangaard Brouer Signed-off-by: Alexei Starovoitov Signed-off-by: Yuanhe Shu --- tools/include/uapi/linux/bpf.h | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 79f88e532a71..178a40f8a298 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -134,6 +134,7 @@ enum bpf_map_type { BPF_MAP_TYPE_QUEUE, BPF_MAP_TYPE_STACK, BPF_MAP_TYPE_SK_STORAGE, + BPF_MAP_TYPE_DEVMAP_HASH, }; /* Note that tracing related programs such as -- Gitee From 735e1892b0741f8478eca6494e13bbd483ec3618 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Toke=20H=C3=B8iland-J=C3=B8rgensen?= Date: Fri, 26 Jul 2019 18:06:57 +0200 Subject: [PATCH 20/26] tools/libbpf_probes: Add new devmap_hash type MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ANBZ: #5530 commit e42346192c9f179a9a47845e4663ad2f453d2b7b upstream. This adds the definition for BPF_MAP_TYPE_DEVMAP_HASH to libbpf_probes.c in tools/lib/bpf. Signed-off-by: Toke Høiland-Jørgensen Acked-by: Yonghong Song Acked-by: Jesper Dangaard Brouer Signed-off-by: Yuanhe Shu --- tools/lib/bpf/libbpf_probes.c | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/lib/bpf/libbpf_probes.c b/tools/lib/bpf/libbpf_probes.c index 236a37c8896c..84a0a0ff6671 100644 --- a/tools/lib/bpf/libbpf_probes.c +++ b/tools/lib/bpf/libbpf_probes.c @@ -246,6 +246,7 @@ bool bpf_probe_map_type(enum bpf_map_type map_type, __u32 ifindex) case BPF_MAP_TYPE_ARRAY_OF_MAPS: case BPF_MAP_TYPE_HASH_OF_MAPS: case BPF_MAP_TYPE_DEVMAP: + case BPF_MAP_TYPE_DEVMAP_HASH: case BPF_MAP_TYPE_SOCKMAP: case BPF_MAP_TYPE_CPUMAP: case BPF_MAP_TYPE_XSKMAP: -- Gitee From f883878576d8c88a3cd6581806bec3a37ab7da1c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Toke=20H=C3=B8iland-J=C3=B8rgensen?= Date: Fri, 26 Jul 2019 18:06:58 +0200 Subject: [PATCH 21/26] tools: Add definitions for devmap_hash map type MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ANBZ: #5530 commit 1375dc4a4579d5e767dd8c2d2abcd929ff59d0a7 upstream. This adds selftest and bpftool updates for the devmap_hash map type. Signed-off-by: Toke Høiland-Jørgensen Acked-by: Jesper Dangaard Brouer Signed-off-by: Alexei Starovoitov Signed-off-by: Yuanhe Shu --- tools/bpf/bpftool/Documentation/bpftool-map.rst | 2 +- tools/bpf/bpftool/bash-completion/bpftool | 4 ++-- tools/bpf/bpftool/map.c | 3 ++- tools/testing/selftests/bpf/test_maps.c | 16 ++++++++++++++++ 4 files changed, 21 insertions(+), 4 deletions(-) diff --git a/tools/bpf/bpftool/Documentation/bpftool-map.rst b/tools/bpf/bpftool/Documentation/bpftool-map.rst index 490b4501cb6e..61d1d270eb5e 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-map.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-map.rst @@ -46,7 +46,7 @@ MAP COMMANDS | *TYPE* := { **hash** | **array** | **prog_array** | **perf_event_array** | **percpu_hash** | | **percpu_array** | **stack_trace** | **cgroup_array** | **lru_hash** | | **lru_percpu_hash** | **lpm_trie** | **array_of_maps** | **hash_of_maps** -| | **devmap** | **sockmap** | **cpumap** | **xskmap** | **sockhash** +| | **devmap** | **devmap_hash** | **sockmap** | **cpumap** | **xskmap** | **sockhash** | | **cgroup_storage** | **reuseport_sockarray** | **percpu_cgroup_storage** | | **queue** | **stack** } diff --git a/tools/bpf/bpftool/bash-completion/bpftool b/tools/bpf/bpftool/bash-completion/bpftool index c8f42e1fcbc9..6b961a5ed100 100644 --- a/tools/bpf/bpftool/bash-completion/bpftool +++ b/tools/bpf/bpftool/bash-completion/bpftool @@ -489,8 +489,8 @@ _bpftool() perf_event_array percpu_hash percpu_array \ stack_trace cgroup_array lru_hash \ lru_percpu_hash lpm_trie array_of_maps \ - hash_of_maps devmap sockmap cpumap xskmap \ - sockhash cgroup_storage reuseport_sockarray \ + hash_of_maps devmap devmap_hash sockmap cpumap \ + xskmap sockhash cgroup_storage reuseport_sockarray \ percpu_cgroup_storage queue stack' -- \ "$cur" ) ) return 0 diff --git a/tools/bpf/bpftool/map.c b/tools/bpf/bpftool/map.c index 68e49d75d06c..d0288e16e23f 100644 --- a/tools/bpf/bpftool/map.c +++ b/tools/bpf/bpftool/map.c @@ -37,6 +37,7 @@ const char * const map_type_name[] = { [BPF_MAP_TYPE_ARRAY_OF_MAPS] = "array_of_maps", [BPF_MAP_TYPE_HASH_OF_MAPS] = "hash_of_maps", [BPF_MAP_TYPE_DEVMAP] = "devmap", + [BPF_MAP_TYPE_DEVMAP_HASH] = "devmap_hash", [BPF_MAP_TYPE_SOCKMAP] = "sockmap", [BPF_MAP_TYPE_CPUMAP] = "cpumap", [BPF_MAP_TYPE_XSKMAP] = "xskmap", @@ -1242,7 +1243,7 @@ static int do_help(int argc, char **argv) " TYPE := { hash | array | prog_array | perf_event_array | percpu_hash |\n" " percpu_array | stack_trace | cgroup_array | lru_hash |\n" " lru_percpu_hash | lpm_trie | array_of_maps | hash_of_maps |\n" - " devmap | sockmap | cpumap | xskmap | sockhash |\n" + " devmap | devmap_hash | sockmap | cpumap | xskmap | sockhash |\n" " cgroup_storage | reuseport_sockarray | percpu_cgroup_storage }\n" " " HELP_SPEC_OPTIONS "\n" "", diff --git a/tools/testing/selftests/bpf/test_maps.c b/tools/testing/selftests/bpf/test_maps.c index 8af7f18a1ff9..4ff1dbf949fb 100644 --- a/tools/testing/selftests/bpf/test_maps.c +++ b/tools/testing/selftests/bpf/test_maps.c @@ -527,6 +527,21 @@ static void test_devmap(int task, void *data) close(fd); } +static void test_devmap_hash(unsigned int task, void *data) +{ + int fd; + __u32 key, value; + + fd = bpf_create_map(BPF_MAP_TYPE_DEVMAP_HASH, sizeof(key), sizeof(value), + 2, 0); + if (fd < 0) { + printf("Failed to create devmap_hash '%s'!\n", strerror(errno)); + exit(1); + } + + close(fd); +} + static void test_queuemap(int task, void *data) { const int MAP_SIZE = 32; @@ -2303,6 +2318,7 @@ static void run_all_tests(void) test_arraymap_percpu_many_keys(); test_devmap(0, NULL); + test_devmap_hash(0, NULL); test_sockmap(0, NULL); test_map_large(); -- Gitee From f1c9fffeccba068a2b527e0a17a4edba00a520b4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Toke=20H=C3=B8iland-J=C3=B8rgensen?= Date: Sun, 8 Sep 2019 09:20:16 +0100 Subject: [PATCH 22/26] xdp: Fix race in dev_map_hash_update_elem() when replacing element MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ANBZ: #5530 commit af58e7ee6a8d83726ad8a2696e98d86400a7639c upstream. syzbot found a crash in dev_map_hash_update_elem(), when replacing an element with a new one. Jesper correctly identified the cause of the crash as a race condition between the initial lookup in the map (which is done before taking the lock), and the removal of the old element. Rather than just add a second lookup into the hashmap after taking the lock, fix this by reworking the function logic to take the lock before the initial lookup. Fixes: 6f9d451ab1a3 ("xdp: Add devmap_hash map type for looking up devices by hashed index") Reported-and-tested-by: syzbot+4e7a85b1432052e8d6f8@syzkaller.appspotmail.com Signed-off-by: Toke Høiland-Jørgensen Acked-by: Jesper Dangaard Brouer Signed-off-by: Daniel Borkmann Signed-off-by: Yuanhe Shu --- kernel/bpf/devmap.c | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index ad87873076da..0d0d671379ab 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -658,19 +658,22 @@ static int __dev_map_hash_update_elem(struct net *net, struct bpf_map *map, u32 ifindex = *(u32 *)value; u32 idx = *(u32 *)key; unsigned long flags; + int err = -EEXIST; if (unlikely(map_flags > BPF_EXIST || !ifindex)) return -EINVAL; + spin_lock_irqsave(&dtab->index_lock, flags); + old_dev = __dev_map_hash_lookup_elem(map, idx); if (old_dev && (map_flags & BPF_NOEXIST)) - return -EEXIST; + goto out_err; dev = __dev_map_alloc_node(net, dtab, ifindex, idx); - if (IS_ERR(dev)) - return PTR_ERR(dev); - - spin_lock_irqsave(&dtab->index_lock, flags); + if (IS_ERR(dev)) { + err = PTR_ERR(dev); + goto out_err; + } if (old_dev) { hlist_del_rcu(&old_dev->index_hlist); @@ -691,6 +694,10 @@ static int __dev_map_hash_update_elem(struct net *net, struct bpf_map *map, call_rcu(&old_dev->rcu, __dev_map_entry_free); return 0; + +out_err: + spin_unlock_irqrestore(&dtab->index_lock, flags); + return err; } static int dev_map_hash_update_elem(struct bpf_map *map, void *key, void *value, -- Gitee From d63dfe63bc5a092f798a69d938287fd31d637df0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Toke=20H=C3=B8iland-J=C3=B8rgensen?= Date: Thu, 17 Oct 2019 12:57:02 +0200 Subject: [PATCH 23/26] xdp: Prevent overflow in devmap_hash cost calculation for 32-bit builds MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ANBZ: #5530 commit 05679ca6feebc1ef3bf743563315d9975adcf6fb upstream. Tetsuo pointed out that without an explicit cast, the cost calculation for devmap_hash type maps could overflow on 32-bit builds. This adds the missing cast. Fixes: 6f9d451ab1a3 ("xdp: Add devmap_hash map type for looking up devices by hashed index") Reported-by: Tetsuo Handa Signed-off-by: Toke Høiland-Jørgensen Signed-off-by: Alexei Starovoitov Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20191017105702.2807093-1-toke@redhat.com Signed-off-by: Yuanhe Shu --- kernel/bpf/devmap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index 0d0d671379ab..59d61c27f033 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -136,7 +136,7 @@ static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr) if (!dtab->n_buckets) /* Overflow check */ return -EINVAL; - cost += sizeof(struct hlist_head) * dtab->n_buckets; + cost += (u64) sizeof(struct hlist_head) * dtab->n_buckets; } /* if map size is larger than memlock limit, reject it */ -- Gitee From f7dd76808bb74ea69a83aae7a81302fa1fd7d533 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Toke=20H=C3=B8iland-J=C3=B8rgensen?= Date: Sat, 19 Oct 2019 13:19:31 +0200 Subject: [PATCH 24/26] xdp: Handle device unregister for devmap_hash map type MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ANBZ: #5530 commit ce197d83a9fc42795c248c90983bf05faf0f013b upstream. It seems I forgot to add handling of devmap_hash type maps to the device unregister hook for devmaps. This omission causes devices to not be properly released, which causes hangs. Fix this by adding the missing handler. Fixes: 6f9d451ab1a3 ("xdp: Add devmap_hash map type for looking up devices by hashed index") Reported-by: Tetsuo Handa Signed-off-by: Toke Høiland-Jørgensen Signed-off-by: Alexei Starovoitov Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20191019111931.2981954-1-toke@redhat.com Signed-off-by: Yuanhe Shu --- kernel/bpf/devmap.c | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index 59d61c27f033..c07ea75599e6 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -727,6 +727,32 @@ const struct bpf_map_ops dev_map_hash_ops = { .map_check_btf = map_check_no_btf, }; +static void dev_map_hash_remove_netdev(struct bpf_dtab *dtab, + struct net_device *netdev) +{ + unsigned long flags; + u32 i; + + spin_lock_irqsave(&dtab->index_lock, flags); + for (i = 0; i < dtab->n_buckets; i++) { + struct bpf_dtab_netdev *dev; + struct hlist_head *head; + struct hlist_node *next; + + head = dev_map_index_hash(dtab, i); + + hlist_for_each_entry_safe(dev, next, head, index_hlist) { + if (netdev != dev->dev) + continue; + + dtab->items--; + hlist_del_rcu(&dev->index_hlist); + call_rcu(&dev->rcu, __dev_map_entry_free); + } + } + spin_unlock_irqrestore(&dtab->index_lock, flags); +} + static int dev_map_notification(struct notifier_block *notifier, ulong event, void *ptr) { @@ -743,6 +769,11 @@ static int dev_map_notification(struct notifier_block *notifier, */ rcu_read_lock(); list_for_each_entry_rcu(dtab, &dev_map_list, list) { + if (dtab->map.map_type == BPF_MAP_TYPE_DEVMAP_HASH) { + dev_map_hash_remove_netdev(dtab, netdev); + continue; + } + for (i = 0; i < dtab->map.max_entries; i++) { struct bpf_dtab_netdev *dev, *odev; -- Gitee From 24834b92a54a656d91f83f4e2dd90cee9e797795 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Toke=20H=C3=B8iland-J=C3=B8rgensen?= Date: Thu, 21 Nov 2019 14:36:12 +0100 Subject: [PATCH 25/26] xdp: Fix cleanup on map free for devmap_hash map type MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ANBZ: #5530 commit 071cdecec57fb5d5df78e6a12114ad7bccea5b0e upstream. Tetsuo pointed out that it was not only the device unregister hook that was broken for devmap_hash types, it was also cleanup on map free. So better fix this as well. While we're at it, there's no reason to allocate the netdev_map array for DEVMAP_HASH, so skip that and adjust the cost accordingly. Fixes: 6f9d451ab1a3 ("xdp: Add devmap_hash map type for looking up devices by hashed index") Reported-by: Tetsuo Handa Signed-off-by: Toke Høiland-Jørgensen Signed-off-by: Alexei Starovoitov Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20191121133612.430414-1-toke@redhat.com Signed-off-by: Yuanhe Shu --- kernel/bpf/devmap.c | 74 ++++++++++++++++++++++++++++----------------- 1 file changed, 46 insertions(+), 28 deletions(-) diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index c07ea75599e6..ad609166f501 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -82,7 +82,7 @@ struct bpf_dtab_netdev { struct bpf_dtab { struct bpf_map map; - struct bpf_dtab_netdev **netdev_map; + struct bpf_dtab_netdev **netdev_map; /* DEVMAP type only */ struct list_head __percpu *flush_list; struct list_head list; @@ -109,6 +109,12 @@ static struct hlist_head *dev_map_create_hash(unsigned int entries) return hash; } +static inline struct hlist_head *dev_map_index_hash(struct bpf_dtab *dtab, + int idx) +{ + return &dtab->dev_index_head[idx & (dtab->n_buckets - 1)]; +} + static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr) { int err, cpu; @@ -128,8 +134,7 @@ static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr) bpf_map_init_from_attr(&dtab->map, attr); /* make sure page count doesn't overflow */ - cost = (u64) dtab->map.max_entries * sizeof(struct bpf_dtab_netdev *); - cost += sizeof(struct list_head) * num_possible_cpus(); + cost = (u64) sizeof(struct list_head) * num_possible_cpus(); if (attr->map_type == BPF_MAP_TYPE_DEVMAP_HASH) { dtab->n_buckets = roundup_pow_of_two(dtab->map.max_entries); @@ -137,6 +142,8 @@ static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr) if (!dtab->n_buckets) /* Overflow check */ return -EINVAL; cost += (u64) sizeof(struct hlist_head) * dtab->n_buckets; + } else { + cost += (u64) dtab->map.max_entries * sizeof(struct bpf_dtab_netdev *); } /* if map size is larger than memlock limit, reject it */ @@ -151,24 +158,22 @@ static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr) for_each_possible_cpu(cpu) INIT_LIST_HEAD(per_cpu_ptr(dtab->flush_list, cpu)); - dtab->netdev_map = bpf_map_area_alloc(dtab->map.max_entries * - sizeof(struct bpf_dtab_netdev *), - dtab->map.numa_node); - if (!dtab->netdev_map) - goto free_percpu; - if (attr->map_type == BPF_MAP_TYPE_DEVMAP_HASH) { dtab->dev_index_head = dev_map_create_hash(dtab->n_buckets); if (!dtab->dev_index_head) - goto free_map_area; + goto free_percpu; spin_lock_init(&dtab->index_lock); + } else { + dtab->netdev_map = bpf_map_area_alloc(dtab->map.max_entries * + sizeof(struct bpf_dtab_netdev *), + dtab->map.numa_node); + if (!dtab->netdev_map) + goto free_percpu; } return 0; -free_map_area: - bpf_map_area_free(dtab->netdev_map); free_percpu: free_percpu(dtab->flush_list); free_charge: @@ -236,21 +241,40 @@ static void dev_map_free(struct bpf_map *map) cond_resched(); } - for (i = 0; i < dtab->map.max_entries; i++) { - struct bpf_dtab_netdev *dev; + if (dtab->map.map_type == BPF_MAP_TYPE_DEVMAP_HASH) { + for (i = 0; i < dtab->n_buckets; i++) { + struct bpf_dtab_netdev *dev; + struct hlist_head *head; + struct hlist_node *next; - dev = dtab->netdev_map[i]; - if (!dev) - continue; + head = dev_map_index_hash(dtab, i); - free_percpu(dev->bulkq); - dev_put(dev->dev); - kfree(dev); + hlist_for_each_entry_safe(dev, next, head, index_hlist) { + hlist_del_rcu(&dev->index_hlist); + free_percpu(dev->bulkq); + dev_put(dev->dev); + kfree(dev); + } + } + + kfree(dtab->dev_index_head); + } else { + for (i = 0; i < dtab->map.max_entries; i++) { + struct bpf_dtab_netdev *dev; + + dev = dtab->netdev_map[i]; + if (!dev) + continue; + + free_percpu(dev->bulkq); + dev_put(dev->dev); + kfree(dev); + } + + bpf_map_area_free(dtab->netdev_map); } free_percpu(dtab->flush_list); - bpf_map_area_free(dtab->netdev_map); - kfree(dtab->dev_index_head); kfree(dtab); } @@ -271,12 +295,6 @@ static int dev_map_get_next_key(struct bpf_map *map, void *key, void *next_key) return 0; } -static inline struct hlist_head *dev_map_index_hash(struct bpf_dtab *dtab, - int idx) -{ - return &dtab->dev_index_head[idx & (dtab->n_buckets - 1)]; -} - struct bpf_dtab_netdev *__dev_map_hash_lookup_elem(struct bpf_map *map, u32 key) { struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); -- Gitee From 3ff00f9f33a8f8555cf742a7ca6113a60673bb89 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Toke=20H=C3=B8iland-J=C3=B8rgensen?= Date: Tue, 16 Jun 2020 16:28:29 +0200 Subject: [PATCH 26/26] devmap: Use bpf_map_area_alloc() for allocating hash buckets MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ANBZ: #5530 commit 99c51064fb06146b3d494b745c947e438a10aaa7 upstream. Syzkaller discovered that creating a hash of type devmap_hash with a large number of entries can hit the memory allocator limit for allocating contiguous memory regions. There's really no reason to use kmalloc_array() directly in the devmap code, so just switch it to the existing bpf_map_area_alloc() function that is used elsewhere. Fixes: 6f9d451ab1a3 ("xdp: Add devmap_hash map type for looking up devices by hashed index") Reported-by: Xiumei Mu Signed-off-by: Toke Høiland-Jørgensen Signed-off-by: Alexei Starovoitov Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20200616142829.114173-1-toke@redhat.com Signed-off-by: Yuanhe Shu --- kernel/bpf/devmap.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index ad609166f501..e9fca629601c 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -96,12 +96,13 @@ struct bpf_dtab { static DEFINE_SPINLOCK(dev_map_lock); static LIST_HEAD(dev_map_list); -static struct hlist_head *dev_map_create_hash(unsigned int entries) +static struct hlist_head *dev_map_create_hash(unsigned int entries, + int numa_node) { int i; struct hlist_head *hash; - hash = kmalloc_array(entries, sizeof(*hash), GFP_KERNEL); + hash = bpf_map_area_alloc(entries * sizeof(*hash), numa_node); if (hash != NULL) for (i = 0; i < entries; i++) INIT_HLIST_HEAD(&hash[i]); @@ -159,7 +160,8 @@ static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr) INIT_LIST_HEAD(per_cpu_ptr(dtab->flush_list, cpu)); if (attr->map_type == BPF_MAP_TYPE_DEVMAP_HASH) { - dtab->dev_index_head = dev_map_create_hash(dtab->n_buckets); + dtab->dev_index_head = dev_map_create_hash(dtab->n_buckets, + dtab->map.numa_node); if (!dtab->dev_index_head) goto free_percpu; @@ -257,7 +259,7 @@ static void dev_map_free(struct bpf_map *map) } } - kfree(dtab->dev_index_head); + bpf_map_area_free(dtab->dev_index_head); } else { for (i = 0; i < dtab->map.max_entries; i++) { struct bpf_dtab_netdev *dev; -- Gitee