From 484b1c35a814ed664652ce945134c918f51558ab Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Fri, 22 Mar 2019 14:32:48 -0400 Subject: [PATCH 01/25] bpf: in bpf_skb_adjust_room avoid copy in tx fast path ANBZ: #5530 commit 908adce6465394ea4a09c144507a40848e1d7db5 upstream. bpf_skb_adjust_room calls skb_cow on grow. This expensive operation can be avoided in the fast path when the only other clone has released the header. This is the common case for TCP, where one headerless clone is kept on the retransmit queue. It is safe to do so even when touching the gso fields in skb_shinfo. Regular tunnel encap with iptunnel_handle_offloads takes the same optimization. The tcp stack unclones in the unlikely case that it accesses these fields through headerless cloned packets on the retransmit queue (see __tcp_retransmit_skb). If any other clones are present, e.g., from packet sockets, skb_cow_head returns the same value as skb_cow(). Signed-off-by: Willem de Bruijn Signed-off-by: Alexei Starovoitov Signed-off-by: Yuanhe Shu --- net/core/filter.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/core/filter.c b/net/core/filter.c index 47f85d50ef1b..2d50bae096c6 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2977,7 +2977,7 @@ static int bpf_skb_net_grow(struct sk_buff *skb, u32 len_diff) if (skb_is_gso(skb) && !skb_is_gso_tcp(skb)) return -ENOTSUPP; - ret = skb_cow(skb, len_diff); + ret = skb_cow_head(skb, len_diff); if (unlikely(ret < 0)) return ret; -- Gitee From ef65eadb97d1e6eb2ee34d7184f73dd1bf25a389 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Fri, 22 Mar 2019 14:32:49 -0400 Subject: [PATCH 02/25] selftests/bpf: bpf tunnel encap test ANBZ: #5530 commit 98cdabcd0798bd9991821493120b928ed0dfab73 upstream. Validate basic tunnel encapsulation using ipip. Set up two namespaces connected by veth. Connect a client and server. Do this with and without bpf encap. 
Signed-off-by: Willem de Bruijn Signed-off-by: Alexei Starovoitov Signed-off-by: Yuanhe Shu --- tools/testing/selftests/bpf/Makefile | 3 +- .../selftests/bpf/progs/test_tc_tunnel.c | 83 +++++++++++++++++++ tools/testing/selftests/bpf/test_tc_tunnel.sh | 75 +++++++++++++++++ 3 files changed, 160 insertions(+), 1 deletion(-) create mode 100644 tools/testing/selftests/bpf/progs/test_tc_tunnel.c create mode 100755 tools/testing/selftests/bpf/test_tc_tunnel.sh diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index d648de9e2a85..d01e7801811a 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -50,7 +50,8 @@ TEST_PROGS := test_kmod.sh \ test_lirc_mode2.sh \ test_skb_cgroup_id.sh \ test_flow_dissector.sh \ - test_tcp_check_syncookie.sh + test_tcp_check_syncookie.sh \ + test_tc_tunnel.sh TEST_PROGS_EXTENDED := with_addr.sh \ with_tunnels.sh diff --git a/tools/testing/selftests/bpf/progs/test_tc_tunnel.c b/tools/testing/selftests/bpf/progs/test_tc_tunnel.c new file mode 100644 index 000000000000..8223e4347be8 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_tc_tunnel.c @@ -0,0 +1,83 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* In-place tunneling */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "bpf_endian.h" +#include "bpf_helpers.h" + +static const int cfg_port = 8000; + +static __always_inline void set_ipv4_csum(struct iphdr *iph) +{ + __u16 *iph16 = (__u16 *)iph; + __u32 csum; + int i; + + iph->check = 0; + +#pragma clang loop unroll(full) + for (i = 0, csum = 0; i < sizeof(*iph) >> 1; i++) + csum += *iph16++; + + iph->check = ~((csum & 0xffff) + (csum >> 16)); +} + +SEC("encap") +int encap_f(struct __sk_buff *skb) +{ + struct iphdr iph_outer, iph_inner; + struct tcphdr tcph; + + if (skb->protocol != __bpf_constant_htons(ETH_P_IP)) + return TC_ACT_OK; + + if (bpf_skb_load_bytes(skb, ETH_HLEN, &iph_inner, + sizeof(iph_inner)) < 0) + 
return TC_ACT_OK; + + /* filter only packets we want */ + if (iph_inner.ihl != 5 || iph_inner.protocol != IPPROTO_TCP) + return TC_ACT_OK; + + if (bpf_skb_load_bytes(skb, ETH_HLEN + sizeof(iph_inner), + &tcph, sizeof(tcph)) < 0) + return TC_ACT_OK; + + if (tcph.dest != __bpf_constant_htons(cfg_port)) + return TC_ACT_OK; + + /* add room between mac and network header */ + if (bpf_skb_adjust_room(skb, sizeof(iph_outer), BPF_ADJ_ROOM_NET, 0)) + return TC_ACT_SHOT; + + /* prepare new outer network header */ + iph_outer = iph_inner; + iph_outer.protocol = IPPROTO_IPIP; + iph_outer.tot_len = bpf_htons(sizeof(iph_outer) + + bpf_htons(iph_outer.tot_len)); + set_ipv4_csum(&iph_outer); + + /* store new outer network header */ + if (bpf_skb_store_bytes(skb, ETH_HLEN, &iph_outer, sizeof(iph_outer), + BPF_F_INVALIDATE_HASH) < 0) + return TC_ACT_SHOT; + + /* bpf_skb_adjust_room has moved header to start of room: restore */ + if (bpf_skb_store_bytes(skb, ETH_HLEN + sizeof(iph_outer), + &iph_inner, sizeof(iph_inner), + BPF_F_INVALIDATE_HASH) < 0) + return TC_ACT_SHOT; + + return TC_ACT_OK; +} + +char __license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/test_tc_tunnel.sh b/tools/testing/selftests/bpf/test_tc_tunnel.sh new file mode 100755 index 000000000000..6ebb288a3afc --- /dev/null +++ b/tools/testing/selftests/bpf/test_tc_tunnel.sh @@ -0,0 +1,75 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 +# +# In-place tunneling + +# must match the port that the bpf program filters on +readonly port=8000 + +readonly ns_prefix="ns-$$-" +readonly ns1="${ns_prefix}1" +readonly ns2="${ns_prefix}2" + +readonly ns1_v4=192.168.1.1 +readonly ns2_v4=192.168.1.2 + +setup() { + ip netns add "${ns1}" + ip netns add "${ns2}" + + ip link add dev veth1 mtu 1500 netns "${ns1}" type veth \ + peer name veth2 mtu 1500 netns "${ns2}" + + ip -netns "${ns1}" link set veth1 up + ip -netns "${ns2}" link set veth2 up + + ip -netns "${ns1}" -4 addr add "${ns1_v4}/24" dev veth1 + ip 
-netns "${ns2}" -4 addr add "${ns2_v4}/24" dev veth2 + + sleep 1 +} + +cleanup() { + ip netns del "${ns2}" + ip netns del "${ns1}" +} + +server_listen() { + ip netns exec "${ns2}" nc -l -p "${port}" & + sleep 0.2 +} + +client_connect() { + ip netns exec "${ns1}" nc -z -w 1 "${ns2_v4}" "${port}" + echo $? +} + +set -e +trap cleanup EXIT + +setup + +# basic communication works +echo "test basic connectivity" +server_listen +client_connect + +# clientside, insert bpf program to encap all TCP to port ${port} +# client can no longer connect +ip netns exec "${ns1}" tc qdisc add dev veth1 clsact +ip netns exec "${ns1}" tc filter add dev veth1 egress \ + bpf direct-action object-file ./test_tc_tunnel.o section encap +echo "test bpf encap without decap (expect failure)" +server_listen +! client_connect + +# serverside, insert decap module +# server is still running +# client can connect again +ip netns exec "${ns2}" ip link add dev testtun0 type ipip \ + remote "${ns1_v4}" local "${ns2_v4}" +ip netns exec "${ns2}" ip link set dev testtun0 up +echo "test bpf encap with tunnel device decap" +client_connect + +echo OK -- Gitee From 2205064cfe4955d3360e7a05c4cb3aad723af893 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Fri, 22 Mar 2019 14:32:50 -0400 Subject: [PATCH 03/25] selftests/bpf: expand bpf tunnel test with decap ANBZ: #5530 commit ccd34cd3577dd6e244269bb8ccfab228360aa53d upstream. The bpf tunnel test encapsulates using bpf, then decapsulates using a standard tunnel device to verify correctness. Once encap is verified, also test decap, by replacing the tunnel device on decap with another bpf program. 
Signed-off-by: Willem de Bruijn Signed-off-by: Alexei Starovoitov Signed-off-by: Yuanhe Shu --- .../selftests/bpf/progs/test_tc_tunnel.c | 31 +++++++++++++++++++ tools/testing/selftests/bpf/test_tc_tunnel.sh | 9 ++++++ 2 files changed, 40 insertions(+) diff --git a/tools/testing/selftests/bpf/progs/test_tc_tunnel.c b/tools/testing/selftests/bpf/progs/test_tc_tunnel.c index 8223e4347be8..25db148635ab 100644 --- a/tools/testing/selftests/bpf/progs/test_tc_tunnel.c +++ b/tools/testing/selftests/bpf/progs/test_tc_tunnel.c @@ -80,4 +80,35 @@ int encap_f(struct __sk_buff *skb) return TC_ACT_OK; } +SEC("decap") +int decap_f(struct __sk_buff *skb) +{ + struct iphdr iph_outer, iph_inner; + + if (skb->protocol != __bpf_constant_htons(ETH_P_IP)) + return TC_ACT_OK; + + if (bpf_skb_load_bytes(skb, ETH_HLEN, &iph_outer, + sizeof(iph_outer)) < 0) + return TC_ACT_OK; + + if (iph_outer.ihl != 5 || iph_outer.protocol != IPPROTO_IPIP) + return TC_ACT_OK; + + if (bpf_skb_load_bytes(skb, ETH_HLEN + sizeof(iph_outer), + &iph_inner, sizeof(iph_inner)) < 0) + return TC_ACT_OK; + + if (bpf_skb_adjust_room(skb, -(int)sizeof(iph_outer), + BPF_ADJ_ROOM_NET, 0)) + return TC_ACT_SHOT; + + /* bpf_skb_adjust_room has moved outer over inner header: restore */ + if (bpf_skb_store_bytes(skb, ETH_HLEN, &iph_inner, sizeof(iph_inner), + BPF_F_INVALIDATE_HASH) < 0) + return TC_ACT_SHOT; + + return TC_ACT_OK; +} + char __license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/test_tc_tunnel.sh b/tools/testing/selftests/bpf/test_tc_tunnel.sh index 6ebb288a3afc..91151d91e5a1 100755 --- a/tools/testing/selftests/bpf/test_tc_tunnel.sh +++ b/tools/testing/selftests/bpf/test_tc_tunnel.sh @@ -72,4 +72,13 @@ ip netns exec "${ns2}" ip link set dev testtun0 up echo "test bpf encap with tunnel device decap" client_connect +# serverside, use BPF for decap +ip netns exec "${ns2}" ip link del dev testtun0 +ip netns exec "${ns2}" tc qdisc add dev veth2 clsact +ip netns exec "${ns2}" tc filter add 
dev veth2 ingress \ + bpf direct-action object-file ./test_tc_tunnel.o section decap +server_listen +echo "test bpf encap with bpf decap" +client_connect + echo OK -- Gitee From c860a68fcd71f1ef47128a0124806d566770c9e8 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Fri, 22 Mar 2019 14:32:51 -0400 Subject: [PATCH 04/25] selftests/bpf: expand bpf tunnel test to ipv6 ANBZ: #5530 commit ef81bd054942e2bd8289c91a3528e6fc0ca26c1c upstream. The test only uses ipv4 so far, expand to ipv6. This is mostly a boilerplate near copy of the ipv4 path. Signed-off-by: Willem de Bruijn Signed-off-by: Alexei Starovoitov Signed-off-by: Yuanhe Shu --- tools/testing/selftests/bpf/config | 1 + .../selftests/bpf/progs/test_tc_tunnel.c | 116 +++++++++++++++--- tools/testing/selftests/bpf/test_tc_tunnel.sh | 53 +++++++- 3 files changed, 148 insertions(+), 22 deletions(-) diff --git a/tools/testing/selftests/bpf/config b/tools/testing/selftests/bpf/config index 3655508f95fd..13825ebc9472 100644 --- a/tools/testing/selftests/bpf/config +++ b/tools/testing/selftests/bpf/config @@ -13,6 +13,7 @@ CONFIG_IPV6=y CONFIG_NET_IPGRE_DEMUX=y CONFIG_NET_IPGRE=y CONFIG_IPV6_GRE=y +CONFIG_IPV6_TUNNEL=y CONFIG_CRYPTO_USER_API_HASH=m CONFIG_CRYPTO_HMAC=m CONFIG_CRYPTO_SHA256=m diff --git a/tools/testing/selftests/bpf/progs/test_tc_tunnel.c b/tools/testing/selftests/bpf/progs/test_tc_tunnel.c index 25db148635ab..591f540ce513 100644 --- a/tools/testing/selftests/bpf/progs/test_tc_tunnel.c +++ b/tools/testing/selftests/bpf/progs/test_tc_tunnel.c @@ -7,6 +7,7 @@ #include #include #include +#include #include #include #include @@ -31,15 +32,11 @@ static __always_inline void set_ipv4_csum(struct iphdr *iph) iph->check = ~((csum & 0xffff) + (csum >> 16)); } -SEC("encap") -int encap_f(struct __sk_buff *skb) +static int encap_ipv4(struct __sk_buff *skb) { struct iphdr iph_outer, iph_inner; struct tcphdr tcph; - if (skb->protocol != __bpf_constant_htons(ETH_P_IP)) - return TC_ACT_OK; - if 
(bpf_skb_load_bytes(skb, ETH_HLEN, &iph_inner, sizeof(iph_inner)) < 0) return TC_ACT_OK; @@ -80,35 +77,118 @@ int encap_f(struct __sk_buff *skb) return TC_ACT_OK; } -SEC("decap") -int decap_f(struct __sk_buff *skb) +static int encap_ipv6(struct __sk_buff *skb) { - struct iphdr iph_outer, iph_inner; + struct ipv6hdr iph_outer, iph_inner; + struct tcphdr tcph; - if (skb->protocol != __bpf_constant_htons(ETH_P_IP)) + if (bpf_skb_load_bytes(skb, ETH_HLEN, &iph_inner, + sizeof(iph_inner)) < 0) return TC_ACT_OK; - if (bpf_skb_load_bytes(skb, ETH_HLEN, &iph_outer, - sizeof(iph_outer)) < 0) + /* filter only packets we want */ + if (bpf_skb_load_bytes(skb, ETH_HLEN + sizeof(iph_inner), + &tcph, sizeof(tcph)) < 0) return TC_ACT_OK; - if (iph_outer.ihl != 5 || iph_outer.protocol != IPPROTO_IPIP) + if (tcph.dest != __bpf_constant_htons(cfg_port)) + return TC_ACT_OK; + + /* add room between mac and network header */ + if (bpf_skb_adjust_room(skb, sizeof(iph_outer), BPF_ADJ_ROOM_NET, 0)) + return TC_ACT_SHOT; + + /* prepare new outer network header */ + iph_outer = iph_inner; + iph_outer.nexthdr = IPPROTO_IPV6; + iph_outer.payload_len = bpf_htons(sizeof(iph_outer) + + bpf_ntohs(iph_outer.payload_len)); + + /* store new outer network header */ + if (bpf_skb_store_bytes(skb, ETH_HLEN, &iph_outer, sizeof(iph_outer), + BPF_F_INVALIDATE_HASH) < 0) + return TC_ACT_SHOT; + + /* bpf_skb_adjust_room has moved header to start of room: restore */ + if (bpf_skb_store_bytes(skb, ETH_HLEN + sizeof(iph_outer), + &iph_inner, sizeof(iph_inner), + BPF_F_INVALIDATE_HASH) < 0) + return TC_ACT_SHOT; + + return TC_ACT_OK; +} + +SEC("encap") +int encap_f(struct __sk_buff *skb) +{ + switch (skb->protocol) { + case __bpf_constant_htons(ETH_P_IP): + return encap_ipv4(skb); + case __bpf_constant_htons(ETH_P_IPV6): + return encap_ipv6(skb); + default: + /* does not match, ignore */ return TC_ACT_OK; + } +} - if (bpf_skb_load_bytes(skb, ETH_HLEN + sizeof(iph_outer), - &iph_inner, sizeof(iph_inner)) < 0) 
+static int decap_internal(struct __sk_buff *skb, int off, int len) +{ + char buf[sizeof(struct ipv6hdr)]; + + if (bpf_skb_load_bytes(skb, off + len, &buf, len) < 0) return TC_ACT_OK; - if (bpf_skb_adjust_room(skb, -(int)sizeof(iph_outer), - BPF_ADJ_ROOM_NET, 0)) + if (bpf_skb_adjust_room(skb, -len, BPF_ADJ_ROOM_NET, 0)) return TC_ACT_SHOT; /* bpf_skb_adjust_room has moved outer over inner header: restore */ - if (bpf_skb_store_bytes(skb, ETH_HLEN, &iph_inner, sizeof(iph_inner), - BPF_F_INVALIDATE_HASH) < 0) + if (bpf_skb_store_bytes(skb, off, buf, len, BPF_F_INVALIDATE_HASH) < 0) return TC_ACT_SHOT; return TC_ACT_OK; } +static int decap_ipv4(struct __sk_buff *skb) +{ + struct iphdr iph_outer; + + if (bpf_skb_load_bytes(skb, ETH_HLEN, &iph_outer, + sizeof(iph_outer)) < 0) + return TC_ACT_OK; + + if (iph_outer.ihl != 5 || iph_outer.protocol != IPPROTO_IPIP) + return TC_ACT_OK; + + return decap_internal(skb, ETH_HLEN, sizeof(iph_outer)); +} + +static int decap_ipv6(struct __sk_buff *skb) +{ + struct ipv6hdr iph_outer; + + if (bpf_skb_load_bytes(skb, ETH_HLEN, &iph_outer, + sizeof(iph_outer)) < 0) + return TC_ACT_OK; + + if (iph_outer.nexthdr != IPPROTO_IPV6) + return TC_ACT_OK; + + return decap_internal(skb, ETH_HLEN, sizeof(iph_outer)); +} + +SEC("decap") +int decap_f(struct __sk_buff *skb) +{ + switch (skb->protocol) { + case __bpf_constant_htons(ETH_P_IP): + return decap_ipv4(skb); + case __bpf_constant_htons(ETH_P_IPV6): + return decap_ipv6(skb); + default: + /* does not match, ignore */ + return TC_ACT_OK; + } +} + char __license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/test_tc_tunnel.sh b/tools/testing/selftests/bpf/test_tc_tunnel.sh index 91151d91e5a1..7b1758f3006b 100755 --- a/tools/testing/selftests/bpf/test_tc_tunnel.sh +++ b/tools/testing/selftests/bpf/test_tc_tunnel.sh @@ -12,6 +12,9 @@ readonly ns2="${ns_prefix}2" readonly ns1_v4=192.168.1.1 readonly ns2_v4=192.168.1.2 +readonly ns1_v6=fd::1 +readonly ns2_v6=fd::2 + setup() { ip 
netns add "${ns1}" @@ -25,6 +28,8 @@ setup() { ip -netns "${ns1}" -4 addr add "${ns1_v4}/24" dev veth1 ip -netns "${ns2}" -4 addr add "${ns2_v4}/24" dev veth2 + ip -netns "${ns1}" -6 addr add "${ns1_v6}/64" dev veth1 nodad + ip -netns "${ns2}" -6 addr add "${ns2_v6}/64" dev veth2 nodad sleep 1 } @@ -35,16 +40,56 @@ cleanup() { } server_listen() { - ip netns exec "${ns2}" nc -l -p "${port}" & + ip netns exec "${ns2}" nc "${netcat_opt}" -l -p "${port}" & sleep 0.2 } client_connect() { - ip netns exec "${ns1}" nc -z -w 1 "${ns2_v4}" "${port}" + ip netns exec "${ns1}" nc "${netcat_opt}" -z -w 1 "${addr2}" "${port}" echo $? } set -e + +# no arguments: automated test, run all +if [[ "$#" -eq "0" ]]; then + echo "ipip" + $0 ipv4 + + echo "ip6ip6" + $0 ipv6 + + echo "OK. All tests passed" + exit 0 +fi + +if [[ "$#" -ne "1" ]]; then + echo "Usage: $0" + echo " or: $0 " + exit 1 +fi + +case "$1" in +"ipv4") + readonly tuntype=ipip + readonly addr1="${ns1_v4}" + readonly addr2="${ns2_v4}" + readonly netcat_opt=-4 + ;; +"ipv6") + readonly tuntype=ip6tnl + readonly addr1="${ns1_v6}" + readonly addr2="${ns2_v6}" + readonly netcat_opt=-6 + ;; +*) + echo "unknown arg: $1" + exit 1 + ;; +esac + +echo "encap ${addr1} to ${addr2}, type ${tuntype}" + trap cleanup EXIT setup @@ -66,8 +111,8 @@ server_listen # serverside, insert decap module # server is still running # client can connect again -ip netns exec "${ns2}" ip link add dev testtun0 type ipip \ - remote "${ns1_v4}" local "${ns2_v4}" +ip netns exec "${ns2}" ip link add dev testtun0 type "${tuntype}" \ + remote "${addr1}" local "${addr2}" ip netns exec "${ns2}" ip link set dev testtun0 up echo "test bpf encap with tunnel device decap" client_connect -- Gitee From fa4a3e8174837338347cee67655adb08251154a4 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Fri, 22 Mar 2019 14:32:52 -0400 Subject: [PATCH 05/25] selftests/bpf: extend bpf tunnel test with gre ANBZ: #5530 commit 7255fade7b93e7e84e12f27ae5e8af9cf8b93745 upstream. 
GRE is a commonly used protocol. Add GRE cases for both IPv4 and IPv6. It also inserts different sized headers, which can expose some unexpected edge cases. Signed-off-by: Willem de Bruijn Signed-off-by: Alexei Starovoitov Signed-off-by: Yuanhe Shu --- .../selftests/bpf/progs/test_tc_tunnel.c | 148 +++++++++++++----- tools/testing/selftests/bpf/test_tc_tunnel.sh | 21 ++- 2 files changed, 123 insertions(+), 46 deletions(-) diff --git a/tools/testing/selftests/bpf/progs/test_tc_tunnel.c b/tools/testing/selftests/bpf/progs/test_tc_tunnel.c index 591f540ce513..900c5653105f 100644 --- a/tools/testing/selftests/bpf/progs/test_tc_tunnel.c +++ b/tools/testing/selftests/bpf/progs/test_tc_tunnel.c @@ -2,6 +2,9 @@ /* In-place tunneling */ +#include +#include + #include #include #include @@ -17,6 +20,18 @@ static const int cfg_port = 8000; +struct grev4hdr { + struct iphdr ip; + __be16 flags; + __be16 protocol; +} __attribute__((packed)); + +struct grev6hdr { + struct ipv6hdr ip; + __be16 flags; + __be16 protocol; +} __attribute__((packed)); + static __always_inline void set_ipv4_csum(struct iphdr *iph) { __u16 *iph16 = (__u16 *)iph; @@ -32,10 +47,12 @@ static __always_inline void set_ipv4_csum(struct iphdr *iph) iph->check = ~((csum & 0xffff) + (csum >> 16)); } -static int encap_ipv4(struct __sk_buff *skb) +static __always_inline int encap_ipv4(struct __sk_buff *skb, bool with_gre) { - struct iphdr iph_outer, iph_inner; + struct grev4hdr h_outer; + struct iphdr iph_inner; struct tcphdr tcph; + int olen; if (bpf_skb_load_bytes(skb, ETH_HLEN, &iph_inner, sizeof(iph_inner)) < 0) @@ -52,24 +69,33 @@ static int encap_ipv4(struct __sk_buff *skb) if (tcph.dest != __bpf_constant_htons(cfg_port)) return TC_ACT_OK; + olen = with_gre ? 
sizeof(h_outer) : sizeof(h_outer.ip); + /* add room between mac and network header */ - if (bpf_skb_adjust_room(skb, sizeof(iph_outer), BPF_ADJ_ROOM_NET, 0)) + if (bpf_skb_adjust_room(skb, olen, BPF_ADJ_ROOM_NET, 0)) return TC_ACT_SHOT; /* prepare new outer network header */ - iph_outer = iph_inner; - iph_outer.protocol = IPPROTO_IPIP; - iph_outer.tot_len = bpf_htons(sizeof(iph_outer) + - bpf_htons(iph_outer.tot_len)); - set_ipv4_csum(&iph_outer); + h_outer.ip = iph_inner; + h_outer.ip.tot_len = bpf_htons(olen + + bpf_htons(h_outer.ip.tot_len)); + if (with_gre) { + h_outer.ip.protocol = IPPROTO_GRE; + h_outer.protocol = bpf_htons(ETH_P_IP); + h_outer.flags = 0; + } else { + h_outer.ip.protocol = IPPROTO_IPIP; + } + + set_ipv4_csum((void *)&h_outer.ip); /* store new outer network header */ - if (bpf_skb_store_bytes(skb, ETH_HLEN, &iph_outer, sizeof(iph_outer), + if (bpf_skb_store_bytes(skb, ETH_HLEN, &h_outer, olen, BPF_F_INVALIDATE_HASH) < 0) return TC_ACT_SHOT; /* bpf_skb_adjust_room has moved header to start of room: restore */ - if (bpf_skb_store_bytes(skb, ETH_HLEN + sizeof(iph_outer), + if (bpf_skb_store_bytes(skb, ETH_HLEN + olen, &iph_inner, sizeof(iph_inner), BPF_F_INVALIDATE_HASH) < 0) return TC_ACT_SHOT; @@ -77,10 +103,12 @@ static int encap_ipv4(struct __sk_buff *skb) return TC_ACT_OK; } -static int encap_ipv6(struct __sk_buff *skb) +static __always_inline int encap_ipv6(struct __sk_buff *skb, bool with_gre) { - struct ipv6hdr iph_outer, iph_inner; + struct ipv6hdr iph_inner; + struct grev6hdr h_outer; struct tcphdr tcph; + int olen; if (bpf_skb_load_bytes(skb, ETH_HLEN, &iph_inner, sizeof(iph_inner)) < 0) @@ -94,23 +122,31 @@ static int encap_ipv6(struct __sk_buff *skb) if (tcph.dest != __bpf_constant_htons(cfg_port)) return TC_ACT_OK; + olen = with_gre ? 
sizeof(h_outer) : sizeof(h_outer.ip); + /* add room between mac and network header */ - if (bpf_skb_adjust_room(skb, sizeof(iph_outer), BPF_ADJ_ROOM_NET, 0)) + if (bpf_skb_adjust_room(skb, olen, BPF_ADJ_ROOM_NET, 0)) return TC_ACT_SHOT; /* prepare new outer network header */ - iph_outer = iph_inner; - iph_outer.nexthdr = IPPROTO_IPV6; - iph_outer.payload_len = bpf_htons(sizeof(iph_outer) + - bpf_ntohs(iph_outer.payload_len)); + h_outer.ip = iph_inner; + h_outer.ip.payload_len = bpf_htons(olen + + bpf_ntohs(h_outer.ip.payload_len)); + if (with_gre) { + h_outer.ip.nexthdr = IPPROTO_GRE; + h_outer.protocol = bpf_htons(ETH_P_IPV6); + h_outer.flags = 0; + } else { + h_outer.ip.nexthdr = IPPROTO_IPV6; + } /* store new outer network header */ - if (bpf_skb_store_bytes(skb, ETH_HLEN, &iph_outer, sizeof(iph_outer), + if (bpf_skb_store_bytes(skb, ETH_HLEN, &h_outer, olen, BPF_F_INVALIDATE_HASH) < 0) return TC_ACT_SHOT; /* bpf_skb_adjust_room has moved header to start of room: restore */ - if (bpf_skb_store_bytes(skb, ETH_HLEN + sizeof(iph_outer), + if (bpf_skb_store_bytes(skb, ETH_HLEN + olen, &iph_inner, sizeof(iph_inner), BPF_F_INVALIDATE_HASH) < 0) return TC_ACT_SHOT; @@ -118,28 +154,63 @@ static int encap_ipv6(struct __sk_buff *skb) return TC_ACT_OK; } -SEC("encap") -int encap_f(struct __sk_buff *skb) +SEC("encap_ipip") +int __encap_ipip(struct __sk_buff *skb) { - switch (skb->protocol) { - case __bpf_constant_htons(ETH_P_IP): - return encap_ipv4(skb); - case __bpf_constant_htons(ETH_P_IPV6): - return encap_ipv6(skb); - default: - /* does not match, ignore */ + if (skb->protocol == __bpf_constant_htons(ETH_P_IP)) + return encap_ipv4(skb, false); + else return TC_ACT_OK; - } } -static int decap_internal(struct __sk_buff *skb, int off, int len) +SEC("encap_gre") +int __encap_gre(struct __sk_buff *skb) { - char buf[sizeof(struct ipv6hdr)]; + if (skb->protocol == __bpf_constant_htons(ETH_P_IP)) + return encap_ipv4(skb, true); + else + return TC_ACT_OK; +} - if 
(bpf_skb_load_bytes(skb, off + len, &buf, len) < 0) +SEC("encap_ip6tnl") +int __encap_ip6tnl(struct __sk_buff *skb) +{ + if (skb->protocol == __bpf_constant_htons(ETH_P_IPV6)) + return encap_ipv6(skb, false); + else + return TC_ACT_OK; +} + +SEC("encap_ip6gre") +int __encap_ip6gre(struct __sk_buff *skb) +{ + if (skb->protocol == __bpf_constant_htons(ETH_P_IPV6)) + return encap_ipv6(skb, true); + else return TC_ACT_OK; +} - if (bpf_skb_adjust_room(skb, -len, BPF_ADJ_ROOM_NET, 0)) +static int decap_internal(struct __sk_buff *skb, int off, int len, char proto) +{ + char buf[sizeof(struct grev6hdr)]; + int olen; + + switch (proto) { + case IPPROTO_IPIP: + case IPPROTO_IPV6: + olen = len; + break; + case IPPROTO_GRE: + olen = len + 4 /* gre hdr */; + break; + default: + return TC_ACT_OK; + } + + if (bpf_skb_load_bytes(skb, off + olen, &buf, olen) < 0) + return TC_ACT_OK; + + if (bpf_skb_adjust_room(skb, -olen, BPF_ADJ_ROOM_NET, 0)) return TC_ACT_SHOT; /* bpf_skb_adjust_room has moved outer over inner header: restore */ @@ -157,10 +228,11 @@ static int decap_ipv4(struct __sk_buff *skb) sizeof(iph_outer)) < 0) return TC_ACT_OK; - if (iph_outer.ihl != 5 || iph_outer.protocol != IPPROTO_IPIP) + if (iph_outer.ihl != 5) return TC_ACT_OK; - return decap_internal(skb, ETH_HLEN, sizeof(iph_outer)); + return decap_internal(skb, ETH_HLEN, sizeof(iph_outer), + iph_outer.protocol); } static int decap_ipv6(struct __sk_buff *skb) @@ -171,10 +243,8 @@ static int decap_ipv6(struct __sk_buff *skb) sizeof(iph_outer)) < 0) return TC_ACT_OK; - if (iph_outer.nexthdr != IPPROTO_IPV6) - return TC_ACT_OK; - - return decap_internal(skb, ETH_HLEN, sizeof(iph_outer)); + return decap_internal(skb, ETH_HLEN, sizeof(iph_outer), + iph_outer.nexthdr); } SEC("decap") diff --git a/tools/testing/selftests/bpf/test_tc_tunnel.sh b/tools/testing/selftests/bpf/test_tc_tunnel.sh index 7b1758f3006b..c78922048610 100755 --- a/tools/testing/selftests/bpf/test_tc_tunnel.sh +++ 
b/tools/testing/selftests/bpf/test_tc_tunnel.sh @@ -54,30 +54,36 @@ set -e # no arguments: automated test, run all if [[ "$#" -eq "0" ]]; then echo "ipip" - $0 ipv4 + $0 ipv4 ipip echo "ip6ip6" - $0 ipv6 + $0 ipv6 ip6tnl + + echo "ip gre" + $0 ipv4 gre + + echo "ip6 gre" + $0 ipv6 ip6gre echo "OK. All tests passed" exit 0 fi -if [[ "$#" -ne "1" ]]; then +if [[ "$#" -ne "2" ]]; then echo "Usage: $0" - echo " or: $0 " + echo " or: $0 " exit 1 fi case "$1" in "ipv4") - readonly tuntype=ipip + readonly tuntype=$2 readonly addr1="${ns1_v4}" readonly addr2="${ns2_v4}" readonly netcat_opt=-4 ;; "ipv6") - readonly tuntype=ip6tnl + readonly tuntype=$2 readonly addr1="${ns1_v6}" readonly addr2="${ns2_v6}" readonly netcat_opt=-6 @@ -103,7 +109,8 @@ client_connect # client can no longer connect ip netns exec "${ns1}" tc qdisc add dev veth1 clsact ip netns exec "${ns1}" tc filter add dev veth1 egress \ - bpf direct-action object-file ./test_tc_tunnel.o section encap + bpf direct-action object-file ./test_tc_tunnel.o \ + section "encap_${tuntype}" echo "test bpf encap without decap (expect failure)" server_listen ! client_connect -- Gitee From 9752085817bbd693d972391f68b549c71e93827d Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Fri, 22 Mar 2019 14:32:53 -0400 Subject: [PATCH 06/25] selftests/bpf: extend bpf tunnel test with tso ANBZ: #5530 commit 8142958954d17a31e0ac9e3a9c91103a1c171179 upstream. Segmentation offload takes a longer path. Verify that the feature works with large packets. The test succeeds if not setting dodgy in bpf_skb_adjust_room, as veth TSO is permissive. If not setting SKB_GSO_DODGY, this enables tunneled TSO offload on supporting NICs. The feature sets SKB_GSO_DODGY because the caller is untrusted. As a result the packets traverse through the gso stack at least up to TCP. 
And fail the gso_type validation, such as the skb->encapsulation check in gre_gso_segment and the gso_type checks introduced in commit 418e897e0716 ("gso: validate gso_type on ipip style tunnel"). This will be addressed in a follow-on feature patch. In the meantime, disable the new gso tests. Changes v1->v2: - not all netcat versions support flag '-q', use timeout instead Signed-off-by: Willem de Bruijn Signed-off-by: Alexei Starovoitov Signed-off-by: Yuanhe Shu --- tools/testing/selftests/bpf/test_tc_tunnel.sh | 60 +++++++++++++++---- 1 file changed, 49 insertions(+), 11 deletions(-) diff --git a/tools/testing/selftests/bpf/test_tc_tunnel.sh b/tools/testing/selftests/bpf/test_tc_tunnel.sh index c78922048610..9e18754f2354 100755 --- a/tools/testing/selftests/bpf/test_tc_tunnel.sh +++ b/tools/testing/selftests/bpf/test_tc_tunnel.sh @@ -15,6 +15,8 @@ readonly ns2_v4=192.168.1.2 readonly ns1_v6=fd::1 readonly ns2_v6=fd::2 +readonly infile="$(mktemp)" +readonly outfile="$(mktemp)" setup() { ip netns add "${ns1}" @@ -23,6 +25,8 @@ setup() { ip link add dev veth1 mtu 1500 netns "${ns1}" type veth \ peer name veth2 mtu 1500 netns "${ns2}" + ip netns exec "${ns1}" ethtool -K veth1 tso off + ip -netns "${ns1}" link set veth1 up ip -netns "${ns2}" link set veth2 up @@ -32,58 +36,86 @@ setup() { ip -netns "${ns2}" -6 addr add "${ns2_v6}/64" dev veth2 nodad sleep 1 + + dd if=/dev/urandom of="${infile}" bs="${datalen}" count=1 status=none } cleanup() { ip netns del "${ns2}" ip netns del "${ns1}" + + if [[ -f "${outfile}" ]]; then + rm "${outfile}" + fi + if [[ -f "${infile}" ]]; then + rm "${infile}" + fi } server_listen() { - ip netns exec "${ns2}" nc "${netcat_opt}" -l -p "${port}" & + ip netns exec "${ns2}" nc "${netcat_opt}" -l -p "${port}" > "${outfile}" & + server_pid=$! 
sleep 0.2 } client_connect() { - ip netns exec "${ns1}" nc "${netcat_opt}" -z -w 1 "${addr2}" "${port}" + ip netns exec "${ns1}" timeout 2 nc "${netcat_opt}" -w 1 "${addr2}" "${port}" < "${infile}" echo $? } +verify_data() { + wait "${server_pid}" + # sha1sum returns two fields [sha1] [filepath] + # convert to bash array and access first elem + insum=($(sha1sum ${infile})) + outsum=($(sha1sum ${outfile})) + if [[ "${insum[0]}" != "${outsum[0]}" ]]; then + echo "data mismatch" + exit 1 + fi +} + set -e # no arguments: automated test, run all if [[ "$#" -eq "0" ]]; then echo "ipip" - $0 ipv4 ipip + $0 ipv4 ipip 100 echo "ip6ip6" - $0 ipv6 ip6tnl + $0 ipv6 ip6tnl 100 echo "ip gre" - $0 ipv4 gre + $0 ipv4 gre 100 echo "ip6 gre" - $0 ipv6 ip6gre + $0 ipv6 ip6gre 100 + + # disabled until passes SKB_GSO_DODGY checks + # echo "ip gre gso" + # $0 ipv4 gre 2000 + + # disabled until passes SKB_GSO_DODGY checks + # echo "ip6 gre gso" + # $0 ipv6 ip6gre 2000 echo "OK. All tests passed" exit 0 fi -if [[ "$#" -ne "2" ]]; then +if [[ "$#" -ne "3" ]]; then echo "Usage: $0" - echo " or: $0 " + echo " or: $0 " exit 1 fi case "$1" in "ipv4") - readonly tuntype=$2 readonly addr1="${ns1_v4}" readonly addr2="${ns2_v4}" readonly netcat_opt=-4 ;; "ipv6") - readonly tuntype=$2 readonly addr1="${ns1_v6}" readonly addr2="${ns2_v6}" readonly netcat_opt=-6 @@ -94,7 +126,10 @@ case "$1" in ;; esac -echo "encap ${addr1} to ${addr2}, type ${tuntype}" +readonly tuntype=$2 +readonly datalen=$3 + +echo "encap ${addr1} to ${addr2}, type ${tuntype}, len ${datalen}" trap cleanup EXIT @@ -104,6 +139,7 @@ setup echo "test basic connectivity" server_listen client_connect +verify_data # clientside, insert bpf program to encap all TCP to port ${port} # client can no longer connect @@ -123,6 +159,7 @@ ip netns exec "${ns2}" ip link add dev testtun0 type "${tuntype}" \ ip netns exec "${ns2}" ip link set dev testtun0 up echo "test bpf encap with tunnel device decap" client_connect +verify_data # serverside, use 
BPF for decap ip netns exec "${ns2}" ip link del dev testtun0 @@ -132,5 +169,6 @@ ip netns exec "${ns2}" tc filter add dev veth2 ingress \ server_listen echo "test bpf encap with bpf decap" client_connect +verify_data echo OK -- Gitee From ba81938fb3fe822bd9792a2b536378c57ad4b1de Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Fri, 22 Mar 2019 14:32:54 -0400 Subject: [PATCH 07/25] bpf: add bpf_skb_adjust_room mode BPF_ADJ_ROOM_MAC ANBZ: #5530 commit 14aa31929b724b70fb63a9b0e7877da325b25cfe upstream. bpf_skb_adjust_room net allows inserting room in an skb. Existing mode BPF_ADJ_ROOM_NET inserts room after the network header by pulling the skb, moving the network header forward and zeroing the new space. Add new mode BPF_ADJ_ROOM_MAC that inserts room after the mac header. This allows inserting tunnel headers in front of the network header without having to recreate the network header in the original space, avoiding two copies. Signed-off-by: Willem de Bruijn Signed-off-by: Alexei Starovoitov Signed-off-by: Yuanhe Shu --- include/uapi/linux/bpf.h | 6 +++++- net/core/filter.c | 38 ++++++++++++++++++++------------------ 2 files changed, 25 insertions(+), 19 deletions(-) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 19f3fcf445d2..f1d889f70348 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1482,7 +1482,10 @@ union bpf_attr { * Grow or shrink the room for data in the packet associated to * *skb* by *len_diff*, and according to the selected *mode*. * - * There is a single supported mode at this time: + * There are two supported modes at this time: + * + * * **BPF_ADJ_ROOM_MAC**: Adjust room at the mac layer + * (room space is added or removed below the layer 2 header). * * * **BPF_ADJ_ROOM_NET**: Adjust room at the network layer * (room space is added or removed below the layer 3 header). @@ -2746,6 +2749,7 @@ enum bpf_func_id { /* Mode for BPF_FUNC_skb_adjust_room helper. 
*/ enum bpf_adj_room_mode { BPF_ADJ_ROOM_NET, + BPF_ADJ_ROOM_MAC, }; /* Mode for BPF_FUNC_skb_load_bytes_relative helper. */ diff --git a/net/core/filter.c b/net/core/filter.c index 2d50bae096c6..3f0c967e7a5d 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2969,9 +2969,8 @@ static u32 bpf_skb_net_base_len(const struct sk_buff *skb) } } -static int bpf_skb_net_grow(struct sk_buff *skb, u32 len_diff) +static int bpf_skb_net_grow(struct sk_buff *skb, u32 off, u32 len_diff) { - u32 off = skb_mac_header_len(skb) + bpf_skb_net_base_len(skb); int ret; if (skb_is_gso(skb) && !skb_is_gso_tcp(skb)) @@ -2998,9 +2997,8 @@ static int bpf_skb_net_grow(struct sk_buff *skb, u32 len_diff) return 0; } -static int bpf_skb_net_shrink(struct sk_buff *skb, u32 len_diff) +static int bpf_skb_net_shrink(struct sk_buff *skb, u32 off, u32 len_diff) { - u32 off = skb_mac_header_len(skb) + bpf_skb_net_base_len(skb); int ret; if (skb_is_gso(skb) && !skb_is_gso_tcp(skb)) @@ -3033,7 +3031,8 @@ static u32 __bpf_skb_max_len(const struct sk_buff *skb) SKB_MAX_ALLOC; } -static int bpf_skb_adjust_net(struct sk_buff *skb, s32 len_diff) +BPF_CALL_4(bpf_skb_adjust_room, struct sk_buff *, skb, s32, len_diff, + u32, mode, u64, flags) { bool trans_same = skb->transport_header == skb->network_header; u32 len_cur, len_diff_abs = abs(len_diff); @@ -3041,14 +3040,28 @@ static int bpf_skb_adjust_net(struct sk_buff *skb, s32 len_diff) u32 len_max = __bpf_skb_max_len(skb); __be16 proto = skb->protocol; bool shrink = len_diff < 0; + u32 off; int ret; + if (unlikely(flags)) + return -EINVAL; if (unlikely(len_diff_abs > 0xfffU)) return -EFAULT; if (unlikely(proto != htons(ETH_P_IP) && proto != htons(ETH_P_IPV6))) return -ENOTSUPP; + off = skb_mac_header_len(skb); + switch (mode) { + case BPF_ADJ_ROOM_NET: + off += bpf_skb_net_base_len(skb); + break; + case BPF_ADJ_ROOM_MAC: + break; + default: + return -ENOTSUPP; + } + len_cur = skb->len - skb_network_offset(skb); if (skb_transport_header_was_set(skb) && 
!trans_same) len_cur = skb_network_header_len(skb); @@ -3058,24 +3071,13 @@ static int bpf_skb_adjust_net(struct sk_buff *skb, s32 len_diff) !skb_is_gso(skb)))) return -ENOTSUPP; - ret = shrink ? bpf_skb_net_shrink(skb, len_diff_abs) : - bpf_skb_net_grow(skb, len_diff_abs); + ret = shrink ? bpf_skb_net_shrink(skb, off, len_diff_abs) : + bpf_skb_net_grow(skb, off, len_diff_abs); bpf_compute_data_pointers(skb); return ret; } -BPF_CALL_4(bpf_skb_adjust_room, struct sk_buff *, skb, s32, len_diff, - u32, mode, u64, flags) -{ - if (unlikely(flags)) - return -EINVAL; - if (likely(mode == BPF_ADJ_ROOM_NET)) - return bpf_skb_adjust_net(skb, len_diff); - - return -ENOTSUPP; -} - static const struct bpf_func_proto bpf_skb_adjust_room_proto = { .func = bpf_skb_adjust_room, .gpl_only = false, -- Gitee From 63d20615f3af9ecf6410acf0dc7aaa56bf7aaf01 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Fri, 22 Mar 2019 14:32:55 -0400 Subject: [PATCH 08/25] bpf: add bpf_skb_adjust_room flag BPF_F_ADJ_ROOM_FIXED_GSO ANBZ: #5530 commit 2278f6cc151a8bef6ba0b3fe3009d14dc3c51c4a upstream. bpf_skb_adjust_room adjusts gso_size of gso packets to account for the pushed or popped header room. This is not allowed with UDP, where gso_size delineates datagrams. Add an option to avoid these updates and allow this call for datagrams. It can also be used with TCP, when MSS is known to allow headroom, e.g., through MSS clamping or route MTU. Changes v1->v2: - document flag BPF_F_ADJ_ROOM_FIXED_GSO - do not expose BPF_F_ADJ_ROOM_MASK through uapi, as it may change. 
Link: https://patchwork.ozlabs.org/patch/1052497/ Signed-off-by: Willem de Bruijn Signed-off-by: Alexei Starovoitov Signed-off-by: Yuanhe Shu --- include/uapi/linux/bpf.h | 9 +++++++-- net/core/filter.c | 38 +++++++++++++++++++++++++++----------- 2 files changed, 34 insertions(+), 13 deletions(-) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index f1d889f70348..e5c29c8eaee4 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1490,8 +1490,10 @@ union bpf_attr { * * **BPF_ADJ_ROOM_NET**: Adjust room at the network layer * (room space is added or removed below the layer 3 header). * - * All values for *flags* are reserved for future usage, and must - * be left at zero. + * There is one supported flag at this time: + * + * * **BPF_F_ADJ_ROOM_FIXED_GSO**: Do not adjust gso_size. + * Adjusting mss in this way is not allowed for datagrams. * * A call to this helper is susceptible to change the underlaying * packet buffer. Therefore, at load time, all checks on pointers @@ -2743,6 +2745,9 @@ enum bpf_func_id { /* Current network namespace */ #define BPF_F_CURRENT_NETNS (-1L) +/* BPF_FUNC_skb_adjust_room flags. */ +#define BPF_F_ADJ_ROOM_FIXED_GSO (1ULL << 0) + /* BPF_FUNC_sysctl_get_name flags. 
*/ #define BPF_F_SYSCTL_BASE_NAME (1ULL << 0) diff --git a/net/core/filter.c b/net/core/filter.c index 3f0c967e7a5d..2cb5aa338028 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2969,12 +2969,19 @@ static u32 bpf_skb_net_base_len(const struct sk_buff *skb) } } -static int bpf_skb_net_grow(struct sk_buff *skb, u32 off, u32 len_diff) +#define BPF_F_ADJ_ROOM_MASK (BPF_F_ADJ_ROOM_FIXED_GSO) + +static int bpf_skb_net_grow(struct sk_buff *skb, u32 off, u32 len_diff, + u64 flags) { int ret; - if (skb_is_gso(skb) && !skb_is_gso_tcp(skb)) - return -ENOTSUPP; + if (skb_is_gso(skb) && !skb_is_gso_tcp(skb)) { + /* udp gso_size delineates datagrams, only allow if fixed */ + if (!(skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4) || + !(flags & BPF_F_ADJ_ROOM_FIXED_GSO)) + return -ENOTSUPP; + } ret = skb_cow_head(skb, len_diff); if (unlikely(ret < 0)) @@ -2988,7 +2995,9 @@ static int bpf_skb_net_grow(struct sk_buff *skb, u32 off, u32 len_diff) struct skb_shared_info *shinfo = skb_shinfo(skb); /* Due to header grow, MSS needs to be downgraded. */ - skb_decrease_gso_size(shinfo, len_diff); + if (!(flags & BPF_F_ADJ_ROOM_FIXED_GSO)) + skb_decrease_gso_size(shinfo, len_diff); + /* Header must be checked, and gso_segs recomputed. 
*/ shinfo->gso_type |= SKB_GSO_DODGY; shinfo->gso_segs = 0; @@ -2997,12 +3006,17 @@ static int bpf_skb_net_grow(struct sk_buff *skb, u32 off, u32 len_diff) return 0; } -static int bpf_skb_net_shrink(struct sk_buff *skb, u32 off, u32 len_diff) +static int bpf_skb_net_shrink(struct sk_buff *skb, u32 off, u32 len_diff, + u64 flags) { int ret; - if (skb_is_gso(skb) && !skb_is_gso_tcp(skb)) - return -ENOTSUPP; + if (skb_is_gso(skb) && !skb_is_gso_tcp(skb)) { + /* udp gso_size delineates datagrams, only allow if fixed */ + if (!(skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4) || + !(flags & BPF_F_ADJ_ROOM_FIXED_GSO)) + return -ENOTSUPP; + } ret = skb_unclone(skb, GFP_ATOMIC); if (unlikely(ret < 0)) @@ -3016,7 +3030,9 @@ static int bpf_skb_net_shrink(struct sk_buff *skb, u32 off, u32 len_diff) struct skb_shared_info *shinfo = skb_shinfo(skb); /* Due to header shrink, MSS can be upgraded. */ - skb_increase_gso_size(shinfo, len_diff); + if (!(flags & BPF_F_ADJ_ROOM_FIXED_GSO)) + skb_increase_gso_size(shinfo, len_diff); + /* Header must be checked, and gso_segs recomputed. */ shinfo->gso_type |= SKB_GSO_DODGY; shinfo->gso_segs = 0; @@ -3043,7 +3059,7 @@ BPF_CALL_4(bpf_skb_adjust_room, struct sk_buff *, skb, s32, len_diff, u32 off; int ret; - if (unlikely(flags)) + if (unlikely(flags & ~BPF_F_ADJ_ROOM_MASK)) return -EINVAL; if (unlikely(len_diff_abs > 0xfffU)) return -EFAULT; @@ -3071,8 +3087,8 @@ BPF_CALL_4(bpf_skb_adjust_room, struct sk_buff *, skb, s32, len_diff, !skb_is_gso(skb)))) return -ENOTSUPP; - ret = shrink ? bpf_skb_net_shrink(skb, off, len_diff_abs) : - bpf_skb_net_grow(skb, off, len_diff_abs); + ret = shrink ? 
bpf_skb_net_shrink(skb, off, len_diff_abs, flags) : + bpf_skb_net_grow(skb, off, len_diff_abs, flags); bpf_compute_data_pointers(skb); return ret; -- Gitee From ad653aaf8c65cd28c2782e38a470b847bfd646c2 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Fri, 22 Mar 2019 14:32:56 -0400 Subject: [PATCH 09/25] bpf: add bpf_skb_adjust_room encap flags ANBZ: #5530 commit 868d523535c2d00b696753ece606e641a816e91e upstream. When pushing tunnel headers, annotate skbs in the same way as tunnel devices. For GSO packets, the network stack requires certain fields set to segment packets with tunnel headers. gre_gso_segment depends on transport and inner mac header, for instance. Add an option to pass this information. Remove the restriction on len_diff to network header length, which is too short, e.g., for GRE protocols. Changes v1->v2: - document new flags - BPF_F_ADJ_ROOM_MASK moved v2->v3: - BPF_F_ADJ_ROOM_ENCAP_L3_MASK moved Signed-off-by: Willem de Bruijn Signed-off-by: Alexei Starovoitov Signed-off-by: Yuanhe Shu --- include/uapi/linux/bpf.h | 16 +++++++++- net/core/filter.c | 66 +++++++++++++++++++++++++++++++++++++--- 2 files changed, 76 insertions(+), 6 deletions(-) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index e5c29c8eaee4..ea1066927b6c 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1490,11 +1490,20 @@ union bpf_attr { * * **BPF_ADJ_ROOM_NET**: Adjust room at the network layer * (room space is added or removed below the layer 3 header). * - * There is one supported flag at this time: + * The following flags are supported at this time: * * * **BPF_F_ADJ_ROOM_FIXED_GSO**: Do not adjust gso_size. * Adjusting mss in this way is not allowed for datagrams. * + * * **BPF_F_ADJ_ROOM_ENCAP_L3_IPV4 **: + * * **BPF_F_ADJ_ROOM_ENCAP_L3_IPV6 **: + * Any new space is reserved to hold a tunnel header. + * Configure skb offsets and other fields accordingly.
+ * + * * **BPF_F_ADJ_ROOM_ENCAP_L4_GRE **: + * * **BPF_F_ADJ_ROOM_ENCAP_L4_UDP **: + * Use with ENCAP_L3 flags to further specify the tunnel type. + * * A call to this helper is susceptible to change the underlaying * packet buffer. Therefore, at load time, all checks on pointers * previously done by the verifier are invalidated and must be @@ -2751,6 +2760,11 @@ enum bpf_func_id { /* BPF_FUNC_sysctl_get_name flags. */ #define BPF_F_SYSCTL_BASE_NAME (1ULL << 0) +#define BPF_F_ADJ_ROOM_ENCAP_L3_IPV4 (1ULL << 1) +#define BPF_F_ADJ_ROOM_ENCAP_L3_IPV6 (1ULL << 2) +#define BPF_F_ADJ_ROOM_ENCAP_L4_GRE (1ULL << 3) +#define BPF_F_ADJ_ROOM_ENCAP_L4_UDP (1ULL << 4) + /* Mode for BPF_FUNC_skb_adjust_room helper. */ enum bpf_adj_room_mode { BPF_ADJ_ROOM_NET, diff --git a/net/core/filter.c b/net/core/filter.c index 2cb5aa338028..45068c9f00d9 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2969,11 +2969,20 @@ static u32 bpf_skb_net_base_len(const struct sk_buff *skb) } } -#define BPF_F_ADJ_ROOM_MASK (BPF_F_ADJ_ROOM_FIXED_GSO) +#define BPF_F_ADJ_ROOM_ENCAP_L3_MASK (BPF_F_ADJ_ROOM_ENCAP_L3_IPV4 | \ + BPF_F_ADJ_ROOM_ENCAP_L3_IPV6) + +#define BPF_F_ADJ_ROOM_MASK (BPF_F_ADJ_ROOM_FIXED_GSO | \ + BPF_F_ADJ_ROOM_ENCAP_L3_MASK | \ + BPF_F_ADJ_ROOM_ENCAP_L4_GRE | \ + BPF_F_ADJ_ROOM_ENCAP_L4_UDP) static int bpf_skb_net_grow(struct sk_buff *skb, u32 off, u32 len_diff, u64 flags) { + bool encap = flags & BPF_F_ADJ_ROOM_ENCAP_L3_MASK; + unsigned int gso_type = SKB_GSO_DODGY; + u16 mac_len, inner_net, inner_trans; int ret; if (skb_is_gso(skb) && !skb_is_gso_tcp(skb)) { @@ -2987,10 +2996,60 @@ static int bpf_skb_net_grow(struct sk_buff *skb, u32 off, u32 len_diff, if (unlikely(ret < 0)) return ret; + if (encap) { + if (skb->protocol != htons(ETH_P_IP) && + skb->protocol != htons(ETH_P_IPV6)) + return -ENOTSUPP; + + if (flags & BPF_F_ADJ_ROOM_ENCAP_L3_IPV4 && + flags & BPF_F_ADJ_ROOM_ENCAP_L3_IPV6) + return -EINVAL; + + if (flags & BPF_F_ADJ_ROOM_ENCAP_L4_GRE && + flags & 
BPF_F_ADJ_ROOM_ENCAP_L4_UDP) + return -EINVAL; + + if (skb->encapsulation) + return -EALREADY; + + mac_len = skb->network_header - skb->mac_header; + inner_net = skb->network_header; + inner_trans = skb->transport_header; + } + ret = bpf_skb_net_hdr_push(skb, off, len_diff); if (unlikely(ret < 0)) return ret; + if (encap) { + /* inner mac == inner_net on l3 encap */ + skb->inner_mac_header = inner_net; + skb->inner_network_header = inner_net; + skb->inner_transport_header = inner_trans; + skb_set_inner_protocol(skb, skb->protocol); + + skb->encapsulation = 1; + skb_set_network_header(skb, mac_len); + + if (flags & BPF_F_ADJ_ROOM_ENCAP_L4_UDP) + gso_type |= SKB_GSO_UDP_TUNNEL; + else if (flags & BPF_F_ADJ_ROOM_ENCAP_L4_GRE) + gso_type |= SKB_GSO_GRE; + else if (flags & BPF_F_ADJ_ROOM_ENCAP_L3_IPV6) + gso_type |= SKB_GSO_IPXIP6; + else + gso_type |= SKB_GSO_IPXIP4; + + if (flags & BPF_F_ADJ_ROOM_ENCAP_L4_GRE || + flags & BPF_F_ADJ_ROOM_ENCAP_L4_UDP) { + int nh_len = flags & BPF_F_ADJ_ROOM_ENCAP_L3_IPV6 ? + sizeof(struct ipv6hdr) : + sizeof(struct iphdr); + + skb_set_transport_header(skb, mac_len + nh_len); + } + } + if (skb_is_gso(skb)) { struct skb_shared_info *shinfo = skb_shinfo(skb); @@ -2999,7 +3058,7 @@ static int bpf_skb_net_grow(struct sk_buff *skb, u32 off, u32 len_diff, skb_decrease_gso_size(shinfo, len_diff); /* Header must be checked, and gso_segs recomputed. 
*/ - shinfo->gso_type |= SKB_GSO_DODGY; + shinfo->gso_type |= gso_type; shinfo->gso_segs = 0; } @@ -3050,7 +3109,6 @@ static u32 __bpf_skb_max_len(const struct sk_buff *skb) BPF_CALL_4(bpf_skb_adjust_room, struct sk_buff *, skb, s32, len_diff, u32, mode, u64, flags) { - bool trans_same = skb->transport_header == skb->network_header; u32 len_cur, len_diff_abs = abs(len_diff); u32 len_min = bpf_skb_net_base_len(skb); u32 len_max = __bpf_skb_max_len(skb); @@ -3079,8 +3137,6 @@ BPF_CALL_4(bpf_skb_adjust_room, struct sk_buff *, skb, s32, len_diff, } len_cur = skb->len - skb_network_offset(skb); - if (skb_transport_header_was_set(skb) && !trans_same) - len_cur = skb_network_header_len(skb); if ((shrink && (len_diff_abs >= len_cur || len_cur - len_diff_abs < len_min)) || (!shrink && (skb->len + len_diff_abs > len_max && -- Gitee From 16822846a943d44664a7e4bb6fedc715a8540ede Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Fri, 22 Mar 2019 14:32:57 -0400 Subject: [PATCH 10/25] bpf: Sync bpf.h to tools ANBZ: #5530 commit 6c408decbdc8a12a12a93c4d763cbc2a264f9332 upstream. Sync include/uapi/linux/bpf.h with tools/ Changes v1->v2: - BPF_F_ADJ_ROOM_MASK moved, no longer in this commit v2->v3: - BPF_F_ADJ_ROOM_ENCAP_L3_MASK moved, no longer in this commit Signed-off-by: Willem de Bruijn Signed-off-by: Alexei Starovoitov Signed-off-by: Yuanhe Shu --- tools/include/uapi/linux/bpf.h | 29 ++++++++++++++++++++++++++--- 1 file changed, 26 insertions(+), 3 deletions(-) diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 19845a3f2da0..361a22796976 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -1480,13 +1480,27 @@ union bpf_attr { * Grow or shrink the room for data in the packet associated to * *skb* by *len_diff*, and according to the selected *mode*. 
* - * There is a single supported mode at this time: + * There are two supported modes at this time: + * + * * **BPF_ADJ_ROOM_MAC**: Adjust room at the mac layer + * (room space is added or removed below the layer 2 header). * * * **BPF_ADJ_ROOM_NET**: Adjust room at the network layer * (room space is added or removed below the layer 3 header). * - * All values for *flags* are reserved for future usage, and must - * be left at zero. + * The following flags are supported at this time: + * + * * **BPF_F_ADJ_ROOM_FIXED_GSO**: Do not adjust gso_size. + * Adjusting mss in this way is not allowed for datagrams. + * + * * **BPF_F_ADJ_ROOM_ENCAP_L3_IPV4 **: + * * **BPF_F_ADJ_ROOM_ENCAP_L3_IPV6 **: + * Any new space is reserved to hold a tunnel header. + * Configure skb offsets and other fields accordingly. + * + * * **BPF_F_ADJ_ROOM_ENCAP_L4_GRE **: + * * **BPF_F_ADJ_ROOM_ENCAP_L4_UDP **: + * Use with ENCAP_L3 flags to further specify the tunnel type. * * A call to this helper is susceptible to change the underlaying * packet buffer. Therefore, at load time, all checks on pointers @@ -2735,12 +2749,21 @@ enum bpf_func_id { /* Current network namespace */ #define BPF_F_CURRENT_NETNS (-1L) +/* BPF_FUNC_skb_adjust_room flags. */ +#define BPF_F_ADJ_ROOM_FIXED_GSO (1ULL << 0) + +#define BPF_F_ADJ_ROOM_ENCAP_L3_IPV4 (1ULL << 1) +#define BPF_F_ADJ_ROOM_ENCAP_L3_IPV6 (1ULL << 2) +#define BPF_F_ADJ_ROOM_ENCAP_L4_GRE (1ULL << 3) +#define BPF_F_ADJ_ROOM_ENCAP_L4_UDP (1ULL << 4) + /* BPF_FUNC_sysctl_get_name flags. */ #define BPF_F_SYSCTL_BASE_NAME (1ULL << 0) /* Mode for BPF_FUNC_skb_adjust_room helper. */ enum bpf_adj_room_mode { BPF_ADJ_ROOM_NET, + BPF_ADJ_ROOM_MAC, }; /* Mode for BPF_FUNC_skb_load_bytes_relative helper. 
*/ -- Gitee From bd3faf64864392f9c7415cb36c5ecb992a5186c7 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Fri, 22 Mar 2019 14:32:58 -0400 Subject: [PATCH 11/25] selftests/bpf: convert bpf tunnel test to BPF_ADJ_ROOM_MAC ANBZ: #5530 commit 005edd16562b78f416e2f576a64789c90d96882f upstream. Avoid moving the network layer header when prefixing tunnel headers. This avoids an explicit call to bpf_skb_store_bytes and an implicit move of the network header bytes in bpf_skb_adjust_room. Signed-off-by: Willem de Bruijn Signed-off-by: Alexei Starovoitov Signed-off-by: Yuanhe Shu --- .../selftests/bpf/progs/test_tc_tunnel.c | 25 +++---------------- 1 file changed, 3 insertions(+), 22 deletions(-) diff --git a/tools/testing/selftests/bpf/progs/test_tc_tunnel.c b/tools/testing/selftests/bpf/progs/test_tc_tunnel.c index 900c5653105f..f6a16fd23dbd 100644 --- a/tools/testing/selftests/bpf/progs/test_tc_tunnel.c +++ b/tools/testing/selftests/bpf/progs/test_tc_tunnel.c @@ -72,7 +72,7 @@ static __always_inline int encap_ipv4(struct __sk_buff *skb, bool with_gre) olen = with_gre ? sizeof(h_outer) : sizeof(h_outer.ip); /* add room between mac and network header */ - if (bpf_skb_adjust_room(skb, olen, BPF_ADJ_ROOM_NET, 0)) + if (bpf_skb_adjust_room(skb, olen, BPF_ADJ_ROOM_MAC, 0)) return TC_ACT_SHOT; /* prepare new outer network header */ @@ -94,12 +94,6 @@ static __always_inline int encap_ipv4(struct __sk_buff *skb, bool with_gre) BPF_F_INVALIDATE_HASH) < 0) return TC_ACT_SHOT; - /* bpf_skb_adjust_room has moved header to start of room: restore */ - if (bpf_skb_store_bytes(skb, ETH_HLEN + olen, - &iph_inner, sizeof(iph_inner), - BPF_F_INVALIDATE_HASH) < 0) - return TC_ACT_SHOT; - return TC_ACT_OK; } @@ -125,7 +119,7 @@ static __always_inline int encap_ipv6(struct __sk_buff *skb, bool with_gre) olen = with_gre ? 
sizeof(h_outer) : sizeof(h_outer.ip); /* add room between mac and network header */ - if (bpf_skb_adjust_room(skb, olen, BPF_ADJ_ROOM_NET, 0)) + if (bpf_skb_adjust_room(skb, olen, BPF_ADJ_ROOM_MAC, 0)) return TC_ACT_SHOT; /* prepare new outer network header */ @@ -145,12 +139,6 @@ static __always_inline int encap_ipv6(struct __sk_buff *skb, bool with_gre) BPF_F_INVALIDATE_HASH) < 0) return TC_ACT_SHOT; - /* bpf_skb_adjust_room has moved header to start of room: restore */ - if (bpf_skb_store_bytes(skb, ETH_HLEN + olen, - &iph_inner, sizeof(iph_inner), - BPF_F_INVALIDATE_HASH) < 0) - return TC_ACT_SHOT; - return TC_ACT_OK; } @@ -207,14 +195,7 @@ static int decap_internal(struct __sk_buff *skb, int off, int len, char proto) return TC_ACT_OK; } - if (bpf_skb_load_bytes(skb, off + olen, &buf, olen) < 0) - return TC_ACT_OK; - - if (bpf_skb_adjust_room(skb, -olen, BPF_ADJ_ROOM_NET, 0)) - return TC_ACT_SHOT; - - /* bpf_skb_adjust_room has moved outer over inner header: restore */ - if (bpf_skb_store_bytes(skb, off, buf, len, BPF_F_INVALIDATE_HASH) < 0) + if (bpf_skb_adjust_room(skb, -olen, BPF_ADJ_ROOM_MAC, 0)) return TC_ACT_SHOT; return TC_ACT_OK; -- Gitee From 7aa9fa7eb518b839270f7d657231df2e672e9007 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Fri, 22 Mar 2019 14:32:59 -0400 Subject: [PATCH 12/25] selftests/bpf: convert bpf tunnel test to BPF_F_ADJ_ROOM_FIXED_GSO ANBZ: #5530 commit 94f16813e1b297d31f8fe6217cd9be19e080f998 upstream. Lower route MTU to ensure packets fit in device MTU after encap, then skip the gso_size changes. 
Signed-off-by: Willem de Bruijn Signed-off-by: Alexei Starovoitov Signed-off-by: Yuanhe Shu --- tools/testing/selftests/bpf/progs/test_tc_tunnel.c | 11 ++++++++--- tools/testing/selftests/bpf/test_tc_tunnel.sh | 6 ++++++ 2 files changed, 14 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/bpf/progs/test_tc_tunnel.c b/tools/testing/selftests/bpf/progs/test_tc_tunnel.c index f6a16fd23dbd..3b79dffb8103 100644 --- a/tools/testing/selftests/bpf/progs/test_tc_tunnel.c +++ b/tools/testing/selftests/bpf/progs/test_tc_tunnel.c @@ -52,6 +52,7 @@ static __always_inline int encap_ipv4(struct __sk_buff *skb, bool with_gre) struct grev4hdr h_outer; struct iphdr iph_inner; struct tcphdr tcph; + __u64 flags; int olen; if (bpf_skb_load_bytes(skb, ETH_HLEN, &iph_inner, @@ -69,10 +70,11 @@ static __always_inline int encap_ipv4(struct __sk_buff *skb, bool with_gre) if (tcph.dest != __bpf_constant_htons(cfg_port)) return TC_ACT_OK; + flags = BPF_F_ADJ_ROOM_FIXED_GSO; olen = with_gre ? sizeof(h_outer) : sizeof(h_outer.ip); /* add room between mac and network header */ - if (bpf_skb_adjust_room(skb, olen, BPF_ADJ_ROOM_MAC, 0)) + if (bpf_skb_adjust_room(skb, olen, BPF_ADJ_ROOM_MAC, flags)) return TC_ACT_SHOT; /* prepare new outer network header */ @@ -102,6 +104,7 @@ static __always_inline int encap_ipv6(struct __sk_buff *skb, bool with_gre) struct ipv6hdr iph_inner; struct grev6hdr h_outer; struct tcphdr tcph; + __u64 flags; int olen; if (bpf_skb_load_bytes(skb, ETH_HLEN, &iph_inner, @@ -116,10 +119,11 @@ static __always_inline int encap_ipv6(struct __sk_buff *skb, bool with_gre) if (tcph.dest != __bpf_constant_htons(cfg_port)) return TC_ACT_OK; + flags = BPF_F_ADJ_ROOM_FIXED_GSO; olen = with_gre ? 
sizeof(h_outer) : sizeof(h_outer.ip); /* add room between mac and network header */ - if (bpf_skb_adjust_room(skb, olen, BPF_ADJ_ROOM_MAC, 0)) + if (bpf_skb_adjust_room(skb, olen, BPF_ADJ_ROOM_MAC, flags)) return TC_ACT_SHOT; /* prepare new outer network header */ @@ -195,7 +199,8 @@ static int decap_internal(struct __sk_buff *skb, int off, int len, char proto) return TC_ACT_OK; } - if (bpf_skb_adjust_room(skb, -olen, BPF_ADJ_ROOM_MAC, 0)) + if (bpf_skb_adjust_room(skb, -olen, BPF_ADJ_ROOM_MAC, + BPF_F_ADJ_ROOM_FIXED_GSO)) return TC_ACT_SHOT; return TC_ACT_OK; diff --git a/tools/testing/selftests/bpf/test_tc_tunnel.sh b/tools/testing/selftests/bpf/test_tc_tunnel.sh index 9e18754f2354..cda5317790d2 100755 --- a/tools/testing/selftests/bpf/test_tc_tunnel.sh +++ b/tools/testing/selftests/bpf/test_tc_tunnel.sh @@ -35,6 +35,12 @@ setup() { ip -netns "${ns1}" -6 addr add "${ns1_v6}/64" dev veth1 nodad ip -netns "${ns2}" -6 addr add "${ns2_v6}/64" dev veth2 nodad + # clamp route to reserve room for tunnel headers + ip -netns "${ns1}" -4 route flush table main + ip -netns "${ns1}" -6 route flush table main + ip -netns "${ns1}" -4 route add "${ns2_v4}" mtu 1476 dev veth1 + ip -netns "${ns1}" -6 route add "${ns2_v6}" mtu 1456 dev veth1 + sleep 1 dd if=/dev/urandom of="${infile}" bs="${datalen}" count=1 status=none -- Gitee From 436137ba8015f41aa93ddaebd19fc9c5efdbb77c Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Fri, 22 Mar 2019 14:33:00 -0400 Subject: [PATCH 13/25] selftests/bpf: convert bpf tunnel test to encap modes ANBZ: #5530 commit 75a1a9fa2e20de6319a19161ce4e2e1817d70e28 upstream. Make the tests correctly annotate skbs with tunnel metadata. This makes the gso tests succeed. Enable them. 
Signed-off-by: Willem de Bruijn Signed-off-by: Alexei Starovoitov Signed-off-by: Yuanhe Shu --- .../selftests/bpf/progs/test_tc_tunnel.c | 19 +++++++++++++++---- tools/testing/selftests/bpf/test_tc_tunnel.sh | 10 ++++------ 2 files changed, 19 insertions(+), 10 deletions(-) diff --git a/tools/testing/selftests/bpf/progs/test_tc_tunnel.c b/tools/testing/selftests/bpf/progs/test_tc_tunnel.c index 3b79dffb8103..f541c2de947d 100644 --- a/tools/testing/selftests/bpf/progs/test_tc_tunnel.c +++ b/tools/testing/selftests/bpf/progs/test_tc_tunnel.c @@ -70,8 +70,13 @@ static __always_inline int encap_ipv4(struct __sk_buff *skb, bool with_gre) if (tcph.dest != __bpf_constant_htons(cfg_port)) return TC_ACT_OK; - flags = BPF_F_ADJ_ROOM_FIXED_GSO; - olen = with_gre ? sizeof(h_outer) : sizeof(h_outer.ip); + flags = BPF_F_ADJ_ROOM_FIXED_GSO | BPF_F_ADJ_ROOM_ENCAP_L3_IPV4; + if (with_gre) { + flags |= BPF_F_ADJ_ROOM_ENCAP_L4_GRE; + olen = sizeof(h_outer); + } else { + olen = sizeof(h_outer.ip); + } /* add room between mac and network header */ if (bpf_skb_adjust_room(skb, olen, BPF_ADJ_ROOM_MAC, flags)) @@ -119,8 +124,14 @@ static __always_inline int encap_ipv6(struct __sk_buff *skb, bool with_gre) if (tcph.dest != __bpf_constant_htons(cfg_port)) return TC_ACT_OK; - flags = BPF_F_ADJ_ROOM_FIXED_GSO; - olen = with_gre ? 
sizeof(h_outer) : sizeof(h_outer.ip); + flags = BPF_F_ADJ_ROOM_FIXED_GSO | BPF_F_ADJ_ROOM_ENCAP_L3_IPV6; + if (with_gre) { + flags |= BPF_F_ADJ_ROOM_ENCAP_L4_GRE; + olen = sizeof(h_outer); + } else { + olen = sizeof(h_outer.ip); + } + /* add room between mac and network header */ if (bpf_skb_adjust_room(skb, olen, BPF_ADJ_ROOM_MAC, flags)) diff --git a/tools/testing/selftests/bpf/test_tc_tunnel.sh b/tools/testing/selftests/bpf/test_tc_tunnel.sh index cda5317790d2..dcf320626931 100755 --- a/tools/testing/selftests/bpf/test_tc_tunnel.sh +++ b/tools/testing/selftests/bpf/test_tc_tunnel.sh @@ -97,13 +97,11 @@ if [[ "$#" -eq "0" ]]; then echo "ip6 gre" $0 ipv6 ip6gre 100 - # disabled until passes SKB_GSO_DODGY checks - # echo "ip gre gso" - # $0 ipv4 gre 2000 + echo "ip gre gso" + $0 ipv4 gre 2000 - # disabled until passes SKB_GSO_DODGY checks - # echo "ip6 gre gso" - # $0 ipv6 ip6gre 2000 + echo "ip6 gre gso" + $0 ipv6 ip6gre 2000 echo "OK. All tests passed" exit 0 -- Gitee From b99d85577ab1b7ae74986ce6bcd6a28b306f84ad Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Sat, 23 Mar 2019 12:23:07 -0400 Subject: [PATCH 14/25] bpf: silence uninitialized var warning in bpf_skb_net_grow ANBZ: #5530 commit 62b31b42cff924c7d1e9a095b68ff3bbfc49b15b upstream. These three variables are set in one branch and used in another with the same condition. But on some architectures they still generate compiler warnings of the kind: warning: 'inner_trans' may be used uninitialized in this function [-Wmaybe-uninitialized] Silence these false positives. Use the straightforward approach to always initialize them, if a bit superfluous. 
Fixes: 868d523535c2 ("bpf: add bpf_skb_adjust_room encap flags") Reported-by: kbuild test robot Signed-off-by: Willem de Bruijn Signed-off-by: Alexei Starovoitov Signed-off-by: Yuanhe Shu --- net/core/filter.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/core/filter.c b/net/core/filter.c index 45068c9f00d9..e2c33681f7c0 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2981,8 +2981,8 @@ static int bpf_skb_net_grow(struct sk_buff *skb, u32 off, u32 len_diff, u64 flags) { bool encap = flags & BPF_F_ADJ_ROOM_ENCAP_L3_MASK; + u16 mac_len = 0, inner_net = 0, inner_trans = 0; unsigned int gso_type = SKB_GSO_DODGY; - u16 mac_len, inner_net, inner_trans; int ret; if (skb_is_gso(skb) && !skb_is_gso_tcp(skb)) { -- Gitee From 101d76667e05ea26209400fa81408f7a67d4a0a6 Mon Sep 17 00:00:00 2001 From: Alan Maguire Date: Mon, 25 Mar 2019 09:36:37 +0000 Subject: [PATCH 15/25] bpf: test_tc_tunnel.sh needs reverse path filtering disabled ANBZ: #5530 commit 0c4ea7f87abbdb56df616678bc23f10e51a0b4f8 upstream. test_tc_tunnel.sh sets up a pair of namespaces connected by a veth pair to verify encap/decap using bpf_skb_adjust_room. In testing this, it uses tunnel links as the peer of the bpf-based encap/decap. However because the same IP header is used for inner and outer IP, when packets arrive at the tunnel interface they will be dropped by reverse path filtering as those packets are expected on the veth interface (where the destination IP of the decapped packet is configured). To avoid this, ensure reverse path filtering is disabled for the namespace using tunneling. 
Fixes: 98cdabcd0798 ("selftests/bpf: bpf tunnel encap test") Signed-off-by: Alan Maguire Acked-by: Willem de Bruijn Signed-off-by: Daniel Borkmann Signed-off-by: Yuanhe Shu --- tools/testing/selftests/bpf/test_tc_tunnel.sh | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/tools/testing/selftests/bpf/test_tc_tunnel.sh b/tools/testing/selftests/bpf/test_tc_tunnel.sh index dcf320626931..c805adb88f3a 100755 --- a/tools/testing/selftests/bpf/test_tc_tunnel.sh +++ b/tools/testing/selftests/bpf/test_tc_tunnel.sh @@ -160,6 +160,14 @@ server_listen # client can connect again ip netns exec "${ns2}" ip link add dev testtun0 type "${tuntype}" \ remote "${addr1}" local "${addr2}" +# Because packets are decapped by the tunnel they arrive on testtun0 from +# the IP stack perspective. Ensure reverse path filtering is disabled +# otherwise we drop the TCP SYN as arriving on testtun0 instead of the +# expected veth2 (veth2 is where 192.168.1.2 is configured). +ip netns exec "${ns2}" sysctl -qw net.ipv4.conf.all.rp_filter=0 +# rp needs to be disabled for both all and testtun0 as the rp value is +# selected as the max of the "all" and device-specific values. +ip netns exec "${ns2}" sysctl -qw net.ipv4.conf.testtun0.rp_filter=0 ip netns exec "${ns2}" ip link set dev testtun0 up echo "test bpf encap with tunnel device decap" client_connect -- Gitee From 9a37d249964da1da8b4564d7f1548fd0f5b553bf Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Fri, 12 Apr 2019 09:30:48 -0400 Subject: [PATCH 16/25] bpf: reserve flags in bpf_skb_net_shrink ANBZ: #5530 commit 43537b8e2dc515e037e855504db3f6c7cf73c79f upstream. The ENCAP flags in bpf_skb_adjust_room are ignored on decap with bpf_skb_net_shrink. Reserve these bits for future use. 
Fixes: 868d523535c2d ("bpf: add bpf_skb_adjust_room encap flags") Signed-off-by: Willem de Bruijn Reviewed-by: Alan Maguire Signed-off-by: Daniel Borkmann Signed-off-by: Yuanhe Shu --- net/core/filter.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/core/filter.c b/net/core/filter.c index e2c33681f7c0..c630f917d311 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -3070,6 +3070,9 @@ static int bpf_skb_net_shrink(struct sk_buff *skb, u32 off, u32 len_diff, { int ret; + if (flags & ~BPF_F_ADJ_ROOM_FIXED_GSO) + return -EINVAL; + if (skb_is_gso(skb) && !skb_is_gso_tcp(skb)) { /* udp gso_size delineates datagrams, only allow if fixed */ if (!(skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4) || -- Gitee From 84ed42ae3deebf18d3745d78a5b44740ca22bd4e Mon Sep 17 00:00:00 2001 From: Tobias Klauser Date: Tue, 13 Jul 2021 12:27:19 +0200 Subject: [PATCH 17/25] selftests/bpf: Remove unused variable in tc_tunnel prog ANBZ: #5530 commit de587d564f957b685e47da1848d428b86173766d upstream. The variable buf is unused since commit 005edd16562b ("selftests/bpf: convert bpf tunnel test to BPF_ADJ_ROOM_MAC"). 
Remove it to fix the following warning: test_tc_tunnel.c:531:7: warning: unused variable 'buf' [-Wunused-variable] Fixes: 005edd16562b ("selftests/bpf: convert bpf tunnel test to BPF_ADJ_ROOM_MAC") Signed-off-by: Tobias Klauser Signed-off-by: Daniel Borkmann Acked-by: Willem de Bruijn Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20210713102719.8890-1-tklauser@distanz.ch Signed-off-by: Yuanhe Shu --- tools/testing/selftests/bpf/progs/test_tc_tunnel.c | 1 - 1 file changed, 1 deletion(-) diff --git a/tools/testing/selftests/bpf/progs/test_tc_tunnel.c b/tools/testing/selftests/bpf/progs/test_tc_tunnel.c index f541c2de947d..4cbbab2de5e6 100644 --- a/tools/testing/selftests/bpf/progs/test_tc_tunnel.c +++ b/tools/testing/selftests/bpf/progs/test_tc_tunnel.c @@ -195,7 +195,6 @@ int __encap_ip6gre(struct __sk_buff *skb) static int decap_internal(struct __sk_buff *skb, int off, int len, char proto) { - char buf[sizeof(struct grev6hdr)]; int olen; switch (proto) { -- Gitee From db1543b2f175709ba3c6bbb20c028e9972d61f44 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20T=C3=B6pel?= Date: Wed, 27 Mar 2019 14:51:14 +0100 Subject: [PATCH 18/25] libbpf: add libelf dependency to shared library build MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ANBZ: #5530 commit 89dedaef49d36adc2bb5e7e4c38b52fa3013c7c8 upstream. The DPDK project is moving forward with its AF_XDP PMD, and during that process some libbpf issues surfaced [1]: When libbpf was built as a shared library, libelf was not included in the linking phase. Since libelf is an internal dependency to libbpf, libelf should be included. This patch adds '-lelf' to resolve that.
[1] https://patches.dpdk.org/patch/50704/#93571 Fixes: 1b76c13e4b36 ("bpf tools: Introduce 'bpf' library and add bpf feature check") Suggested-by: Luca Boccassi Signed-off-by: Björn Töpel Signed-off-by: Daniel Borkmann Signed-off-by: Yuanhe Shu --- tools/lib/bpf/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/lib/bpf/Makefile b/tools/lib/bpf/Makefile index 9965dff66410..6c8f1d6f7dcc 100644 --- a/tools/lib/bpf/Makefile +++ b/tools/lib/bpf/Makefile @@ -177,7 +177,7 @@ $(OUTPUT)libbpf.so: $(OUTPUT)libbpf.so.$(LIBBPF_VERSION) $(OUTPUT)libbpf.so.$(LIBBPF_VERSION): $(BPF_IN) $(QUIET_LINK)$(CC) --shared -Wl,-soname,libbpf.so.$(VERSION) \ - -Wl,--version-script=$(VERSION_SCRIPT) $^ -o $@ + -Wl,--version-script=$(VERSION_SCRIPT) $^ -lelf -o $@ @ln -sf $(@F) $(OUTPUT)libbpf.so @ln -sf $(@F) $(OUTPUT)libbpf.so.$(VERSION) -- Gitee From 17530e5f445963302f22fb54195bf57597bc8268 Mon Sep 17 00:00:00 2001 From: Luca Boccassi Date: Thu, 28 Mar 2019 11:33:53 +0000 Subject: [PATCH 19/25] tools/bpf: generate pkg-config file for libbpf ANBZ: #5530 commit dd399ac9e343c7573c47d6820e4a23013c54749d upstream. Generate a libbpf.pc file at build time so that users can rely on pkg-config to find the library, its CFLAGS and LDFLAGS. 
Signed-off-by: Luca Boccassi Acked-by: Andrey Ignatov Signed-off-by: Daniel Borkmann Signed-off-by: Yuanhe Shu --- tools/lib/bpf/.gitignore | 1 + tools/lib/bpf/Makefile | 18 +++++++++++++++--- tools/lib/bpf/libbpf.pc.template | 12 ++++++++++++ 3 files changed, 28 insertions(+), 3 deletions(-) create mode 100644 tools/lib/bpf/libbpf.pc.template diff --git a/tools/lib/bpf/.gitignore b/tools/lib/bpf/.gitignore index c7ae37ea488d..82b8f7d41531 100644 --- a/tools/lib/bpf/.gitignore +++ b/tools/lib/bpf/.gitignore @@ -1,4 +1,5 @@ # SPDX-License-Identifier: GPL-2.0-only libbpf_version.h +libbpf.pc FEATURE-DUMP.libbpf test_libbpf diff --git a/tools/lib/bpf/Makefile b/tools/lib/bpf/Makefile index 6c8f1d6f7dcc..f3f8218aa9fd 100644 --- a/tools/lib/bpf/Makefile +++ b/tools/lib/bpf/Makefile @@ -90,6 +90,7 @@ LIBBPF_VERSION = $(BPF_VERSION).$(BPF_PATCHLEVEL).$(BPF_EXTRAVERSION) LIB_TARGET = libbpf.a libbpf.so.$(LIBBPF_VERSION) LIB_FILE = libbpf.a libbpf.so* +PC_FILE = libbpf.pc # Set compile option CFLAGS ifdef EXTRA_CFLAGS @@ -134,13 +135,14 @@ VERSION_SCRIPT := libbpf.map LIB_TARGET := $(addprefix $(OUTPUT),$(LIB_TARGET)) LIB_FILE := $(addprefix $(OUTPUT),$(LIB_FILE)) +PC_FILE := $(addprefix $(OUTPUT),$(PC_FILE)) GLOBAL_SYM_COUNT = $(shell readelf -s --wide $(BPF_IN) | \ awk '/GLOBAL/ && /DEFAULT/ && !/UND/ {s++} END{print s}') VERSIONED_SYM_COUNT = $(shell readelf -s --wide $(OUTPUT)libbpf.so | \ grep -Eo '[^ ]+@LIBBPF_' | cut -d@ -f1 | sort -u | wc -l) -CMD_TARGETS = $(LIB_TARGET) +CMD_TARGETS = $(LIB_TARGET) $(PC_FILE) CXX_TEST_TARGET = $(OUTPUT)test_libbpf @@ -187,6 +189,12 @@ $(OUTPUT)libbpf.a: $(BPF_IN) $(OUTPUT)test_libbpf: test_libbpf.cpp $(OUTPUT)libbpf.a $(QUIET_LINK)$(CXX) $(INCLUDES) $^ -lelf -o $@ +$(OUTPUT)libbpf.pc: + $(QUIET_GEN)sed -e "s|@PREFIX@|$(prefix)|" \ + -e "s|@LIBDIR@|$(libdir_SQ)|" \ + -e "s|@VERSION@|$(LIBBPF_VERSION)|" \ + < libbpf.pc.template > $@ + check: check_abi check_abi: $(OUTPUT)libbpf.so @@ -224,7 +232,11 @@ install_headers: $(call 
do_install,btf.h,$(prefix)/include/bpf,644); \ $(call do_install,xsk.h,$(prefix)/include/bpf,644); -install: install_lib +install_pkgconfig: $(PC_FILE) + $(call QUIET_INSTALL, $(PC_FILE)) \ + $(call do_install,$(PC_FILE),$(libdir_SQ)/pkgconfig,644) + +install: install_lib install_pkgconfig ### Cleaning rules @@ -234,7 +246,7 @@ config-clean: clean: $(call QUIET_CLEAN, libbpf) $(RM) $(TARGETS) $(CXX_TEST_TARGET) \ - *.o *~ *.a *.so *.so.$(VERSION) .*.d .*.cmd LIBBPF-CFLAGS + *.o *~ *.a *.so *.so.$(VERSION) .*.d .*.cmd *.pc LIBBPF-CFLAGS $(call QUIET_CLEAN, core-gen) $(RM) $(OUTPUT)FEATURE-DUMP.libbpf diff --git a/tools/lib/bpf/libbpf.pc.template b/tools/lib/bpf/libbpf.pc.template new file mode 100644 index 000000000000..ac17fcef2108 --- /dev/null +++ b/tools/lib/bpf/libbpf.pc.template @@ -0,0 +1,12 @@ +# SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) + +prefix=@PREFIX@ +libdir=@LIBDIR@ +includedir=${prefix}/include + +Name: libbpf +Description: BPF library +Version: @VERSION@ +Libs: -L${libdir} -lbpf +Requires.private: libelf +Cflags: -I${includedir} -- Gitee From 48fe45ed2feb93289e7d3017e6abacc9807150cf Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Mon, 1 Apr 2019 13:57:30 -0700 Subject: [PATCH 20/25] selftests/bpf: fix vlan handling in flow dissector program ANBZ: #5530 commit 2c3af7d901c61c101c02f431cfb520af9ff56ab4 upstream. When we tail call PROG(VLAN) from parse_eth_proto we don't need to peek back to handle vlan proto because we didn't adjust nhoff/thoff yet. Use flow_keys->n_proto, that we set in parse_eth_proto instead and properly increment nhoff as well. Also, always use skb->protocol and don't look at skb->vlan_present. skb->vlan_present indicates that vlan information is stored out-of-band in skb->vlan_{tci,proto} and vlan header is already pulled from skb. That means, skb->vlan_present == true is not relevant for BPF flow dissector. 
Add simple test cases with VLAN tagged frames: * single vlan for ipv4 * double vlan for ipv6 Signed-off-by: Stanislav Fomichev Signed-off-by: Daniel Borkmann Signed-off-by: Yuanhe Shu --- tools/testing/selftests/bpf/progs/bpf_flow.c | 15 ++--- tools/testing/selftests/bpf/test_progs.c | 68 ++++++++++++++++++++ 2 files changed, 72 insertions(+), 11 deletions(-) diff --git a/tools/testing/selftests/bpf/progs/bpf_flow.c b/tools/testing/selftests/bpf/progs/bpf_flow.c index 284660f5aa95..f177c7a6a6c7 100644 --- a/tools/testing/selftests/bpf/progs/bpf_flow.c +++ b/tools/testing/selftests/bpf/progs/bpf_flow.c @@ -119,10 +119,7 @@ static __always_inline int parse_eth_proto(struct __sk_buff *skb, __be16 proto) SEC("flow_dissector") int _dissect(struct __sk_buff *skb) { - if (!skb->vlan_present) - return parse_eth_proto(skb, skb->protocol); - else - return parse_eth_proto(skb, skb->vlan_proto); + return parse_eth_proto(skb, skb->protocol); } /* Parses on IPPROTO_* */ @@ -336,15 +333,9 @@ PROG(VLAN)(struct __sk_buff *skb) { struct bpf_flow_keys *keys = skb->flow_keys; struct vlan_hdr *vlan, _vlan; - __be16 proto; - - /* Peek back to see if single or double-tagging */ - if (bpf_skb_load_bytes(skb, keys->thoff - sizeof(proto), &proto, - sizeof(proto))) - return BPF_DROP; /* Account for double-tagging */ - if (proto == bpf_htons(ETH_P_8021AD)) { + if (keys->n_proto == bpf_htons(ETH_P_8021AD)) { vlan = bpf_flow_dissect_get_header(skb, sizeof(*vlan), &_vlan); if (!vlan) return BPF_DROP; @@ -352,6 +343,7 @@ PROG(VLAN)(struct __sk_buff *skb) if (vlan->h_vlan_encapsulated_proto != bpf_htons(ETH_P_8021Q)) return BPF_DROP; + keys->nhoff += sizeof(*vlan); keys->thoff += sizeof(*vlan); } @@ -359,6 +351,7 @@ PROG(VLAN)(struct __sk_buff *skb) if (!vlan) return BPF_DROP; + keys->nhoff += sizeof(*vlan); keys->thoff += sizeof(*vlan); /* Only allow 8021AD + 8021Q double tagging and no triple tagging.*/ if (vlan->h_vlan_encapsulated_proto == bpf_htons(ETH_P_8021AD) || diff --git 
a/tools/testing/selftests/bpf/test_progs.c b/tools/testing/selftests/bpf/test_progs.c index 26a09347c062..b5a8706a1f36 100644 --- a/tools/testing/selftests/bpf/test_progs.c +++ b/tools/testing/selftests/bpf/test_progs.c @@ -1965,6 +1965,58 @@ static struct bpf_flow_keys pkt_v6_flow_keys = { .n_proto = __bpf_constant_htons(ETH_P_IPV6), }; +#define VLAN_HLEN 4 + +static struct { + struct ethhdr eth; + __u16 vlan_tci; + __u16 vlan_proto; + struct iphdr iph; + struct tcphdr tcp; +} __packed pkt_vlan_v4 = { + .eth.h_proto = __bpf_constant_htons(ETH_P_8021Q), + .vlan_proto = __bpf_constant_htons(ETH_P_IP), + .iph.ihl = 5, + .iph.protocol = IPPROTO_TCP, + .iph.tot_len = __bpf_constant_htons(MAGIC_BYTES), + .tcp.urg_ptr = 123, + .tcp.doff = 5, +}; + +static struct bpf_flow_keys pkt_vlan_v4_flow_keys = { + .nhoff = VLAN_HLEN, + .thoff = VLAN_HLEN + sizeof(struct iphdr), + .addr_proto = ETH_P_IP, + .ip_proto = IPPROTO_TCP, + .n_proto = __bpf_constant_htons(ETH_P_IP), +}; + +static struct { + struct ethhdr eth; + __u16 vlan_tci; + __u16 vlan_proto; + __u16 vlan_tci2; + __u16 vlan_proto2; + struct ipv6hdr iph; + struct tcphdr tcp; +} __packed pkt_vlan_v6 = { + .eth.h_proto = __bpf_constant_htons(ETH_P_8021AD), + .vlan_proto = __bpf_constant_htons(ETH_P_8021Q), + .vlan_proto2 = __bpf_constant_htons(ETH_P_IPV6), + .iph.nexthdr = IPPROTO_TCP, + .iph.payload_len = __bpf_constant_htons(MAGIC_BYTES), + .tcp.urg_ptr = 123, + .tcp.doff = 5, +}; + +static struct bpf_flow_keys pkt_vlan_v6_flow_keys = { + .nhoff = VLAN_HLEN * 2, + .thoff = VLAN_HLEN * 2 + sizeof(struct ipv6hdr), + .addr_proto = ETH_P_IPV6, + .ip_proto = IPPROTO_TCP, + .n_proto = __bpf_constant_htons(ETH_P_IPV6), +}; + static void test_flow_dissector(void) { struct bpf_flow_keys flow_keys; @@ -1994,6 +2046,22 @@ static void test_flow_dissector(void) err, errno, retval, duration, size, sizeof(flow_keys)); CHECK_FLOW_KEYS("ipv6_flow_keys", flow_keys, pkt_v6_flow_keys); + err = bpf_prog_test_run(prog_fd, 10, &pkt_vlan_v4, 
sizeof(pkt_vlan_v4), + &flow_keys, &size, &retval, &duration); + CHECK(size != sizeof(flow_keys) || err || retval != 1, "vlan_ipv4", + "err %d errno %d retval %d duration %d size %u/%lu\n", + err, errno, retval, duration, size, sizeof(flow_keys)); + CHECK_FLOW_KEYS("vlan_ipv4_flow_keys", flow_keys, + pkt_vlan_v4_flow_keys); + + err = bpf_prog_test_run(prog_fd, 10, &pkt_vlan_v6, sizeof(pkt_vlan_v6), + &flow_keys, &size, &retval, &duration); + CHECK(size != sizeof(flow_keys) || err || retval != 1, "vlan_ipv6", + "err %d errno %d retval %d duration %d size %u/%lu\n", + err, errno, retval, duration, size, sizeof(flow_keys)); + CHECK_FLOW_KEYS("vlan_ipv6_flow_keys", flow_keys, + pkt_vlan_v6_flow_keys); + bpf_object__close(obj); } -- Gitee From 4fb643ab733fa57ff45ca6633b7b1d612742717c Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Mon, 1 Apr 2019 13:57:31 -0700 Subject: [PATCH 21/25] net/flow_dissector: pass flow_keys->n_proto to BPF programs ANBZ: #5530 commit 822fe61795018265ae14731d4e5399e5bde36864 upstream. This is a preparation for the next commit that would prohibit access to the most fields of __sk_buff from the BPF programs. Instead of requiring BPF flow dissector programs to look into skb, pass all input data in the flow_keys. 
Signed-off-by: Stanislav Fomichev Signed-off-by: Daniel Borkmann Signed-off-by: Yuanhe Shu --- net/core/flow_dissector.c | 1 + tools/testing/selftests/bpf/progs/bpf_flow.c | 6 ++++-- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 64689a928a72..0d924b39a9f8 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -707,6 +707,7 @@ bool __skb_flow_bpf_dissect(struct bpf_prog *prog, /* Pass parameters to the BPF program */ memset(flow_keys, 0, sizeof(*flow_keys)); cb->qdisc_cb.flow_keys = flow_keys; + flow_keys->n_proto = skb->protocol; flow_keys->nhoff = skb_network_offset(skb); flow_keys->thoff = flow_keys->nhoff; diff --git a/tools/testing/selftests/bpf/progs/bpf_flow.c b/tools/testing/selftests/bpf/progs/bpf_flow.c index f177c7a6a6c7..75b17cada539 100644 --- a/tools/testing/selftests/bpf/progs/bpf_flow.c +++ b/tools/testing/selftests/bpf/progs/bpf_flow.c @@ -92,7 +92,6 @@ static __always_inline int parse_eth_proto(struct __sk_buff *skb, __be16 proto) { struct bpf_flow_keys *keys = skb->flow_keys; - keys->n_proto = proto; switch (proto) { case bpf_htons(ETH_P_IP): bpf_tail_call(skb, &jmp_table, IP); @@ -119,7 +118,9 @@ static __always_inline int parse_eth_proto(struct __sk_buff *skb, __be16 proto) SEC("flow_dissector") int _dissect(struct __sk_buff *skb) { - return parse_eth_proto(skb, skb->protocol); + struct bpf_flow_keys *keys = skb->flow_keys; + + return parse_eth_proto(skb, keys->n_proto); } /* Parses on IPPROTO_* */ @@ -358,6 +359,7 @@ PROG(VLAN)(struct __sk_buff *skb) vlan->h_vlan_encapsulated_proto == bpf_htons(ETH_P_8021Q)) return BPF_DROP; + keys->n_proto = vlan->h_vlan_encapsulated_proto; return parse_eth_proto(skb, vlan->h_vlan_encapsulated_proto); } -- Gitee From da83ac7d4e5178263ab1cb621e92942699858125 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Mon, 1 Apr 2019 13:57:32 -0700 Subject: [PATCH 22/25] flow_dissector: fix clamping of BPF flow_keys for 
non-zero nhoff ANBZ: #5530 commit b9e9c8599f0f23e3d2051befc9966a84b639f64f upstream. Don't allow BPF program to set flow_keys->nhoff to less than initial value. We currently don't read the value afterwards in anything but the tests, but it's still a good practice to return consistent values to the test programs. Signed-off-by: Stanislav Fomichev Signed-off-by: Daniel Borkmann Signed-off-by: Yuanhe Shu --- net/core/flow_dissector.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 0d924b39a9f8..f8a08701d3f5 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -717,7 +717,8 @@ bool __skb_flow_bpf_dissect(struct bpf_prog *prog, /* Restore state */ memcpy(cb, &cb_saved, sizeof(cb_saved)); - flow_keys->nhoff = clamp_t(u16, flow_keys->nhoff, 0, skb->len); + flow_keys->nhoff = clamp_t(u16, flow_keys->nhoff, + skb_network_offset(skb), skb->len); flow_keys->thoff = clamp_t(u16, flow_keys->thoff, flow_keys->nhoff, skb->len); -- Gitee From 4d11a932fbb488a0970ec11a788d70459f3e3ce8 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Mon, 1 Apr 2019 13:57:33 -0700 Subject: [PATCH 23/25] flow_dissector: allow access only to a subset of __sk_buff fields ANBZ: #5530 commit 2ee7fba0d62d638d8b6dbe30cada3a531ec042af upstream. Use whitelist instead of a blacklist and allow only a small set of fields that might be relevant in the context of flow dissector: * data * data_end * flow_keys This is required for the eth_get_headlen case where we have only a chunk of data to dissect (i.e. trying to read the other skb fields doesn't make sense). Note, that it is a breaking API change! However, we've provided flow_keys->n_proto as a substitute for skb->protocol; and there is no need to manually handle skb->vlan_present. So even if we break somebody, the migration is trivial. Unfortunately, we can't support eth_get_headlen use-case without those breaking changes. 
Signed-off-by: Stanislav Fomichev Signed-off-by: Daniel Borkmann Signed-off-by: Yuanhe Shu --- net/core/filter.c | 16 +++------------- 1 file changed, 3 insertions(+), 13 deletions(-) diff --git a/net/core/filter.c b/net/core/filter.c index c630f917d311..3e1613324362 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -6850,14 +6850,8 @@ static bool flow_dissector_is_valid_access(int off, int size, const struct bpf_prog *prog, struct bpf_insn_access_aux *info) { - if (type == BPF_WRITE) { - switch (off) { - case bpf_ctx_range_till(struct __sk_buff, cb[0], cb[4]): - break; - default: - return false; - } - } + if (type == BPF_WRITE) + return false; switch (off) { case bpf_ctx_range(struct __sk_buff, data): @@ -6869,11 +6863,7 @@ static bool flow_dissector_is_valid_access(int off, int size, case bpf_ctx_range_ptr(struct __sk_buff, flow_keys): info->reg_type = PTR_TO_FLOW_KEYS; break; - case bpf_ctx_range(struct __sk_buff, tc_classid): - case bpf_ctx_range(struct __sk_buff, data_meta): - case bpf_ctx_range_till(struct __sk_buff, family, local_port): - case bpf_ctx_range(struct __sk_buff, tstamp): - case bpf_ctx_range(struct __sk_buff, wire_len): + default: return false; } -- Gitee From e56ba842747b53bbe960fbc2f347034aac4bc3ae Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Mon, 1 Apr 2019 13:57:34 -0700 Subject: [PATCH 24/25] flow_dissector: document BPF flow dissector environment ANBZ: #5530 commit ae82899bbe92a7777ded9a562ee602dd5917bcd8 upstream. Short doc on what BPF flow dissector should expect in the input __sk_buff and flow_keys. 
Signed-off-by: Stanislav Fomichev Signed-off-by: Daniel Borkmann Signed-off-by: Yuanhe Shu --- .../networking/bpf_flow_dissector.txt | 115 ++++++++++++++++++ 1 file changed, 115 insertions(+) create mode 100644 Documentation/networking/bpf_flow_dissector.txt diff --git a/Documentation/networking/bpf_flow_dissector.txt b/Documentation/networking/bpf_flow_dissector.txt new file mode 100644 index 000000000000..70f66a2d57e7 --- /dev/null +++ b/Documentation/networking/bpf_flow_dissector.txt @@ -0,0 +1,115 @@ +================== +BPF Flow Dissector +================== + +Overview +======== + +Flow dissector is a routine that parses metadata out of the packets. It's +used in the various places in the networking subsystem (RFS, flow hash, etc). + +BPF flow dissector is an attempt to reimplement C-based flow dissector logic +in BPF to gain all the benefits of BPF verifier (namely, limits on the +number of instructions and tail calls). + +API +=== + +BPF flow dissector programs operate on an __sk_buff. However, only the +limited set of fields is allowed: data, data_end and flow_keys. flow_keys +is 'struct bpf_flow_keys' and contains flow dissector input and +output arguments. + +The inputs are: + * nhoff - initial offset of the networking header + * thoff - initial offset of the transport header, initialized to nhoff + * n_proto - L3 protocol type, parsed out of L2 header + +Flow dissector BPF program should fill out the rest of the 'struct +bpf_flow_keys' fields. Input arguments nhoff/thoff/n_proto should be also +adjusted accordingly. + +The return code of the BPF program is either BPF_OK to indicate successful +dissection, or BPF_DROP to indicate parsing error. 
+ +__sk_buff->data +=============== + +In the VLAN-less case, this is what the initial state of the BPF flow +dissector looks like: ++------+------+------------+-----------+ +| DMAC | SMAC | ETHER_TYPE | L3_HEADER | ++------+------+------------+-----------+ + ^ + | + +-- flow dissector starts here + +skb->data + flow_keys->nhoff point to the first byte of L3_HEADER. +flow_keys->thoff = nhoff +flow_keys->n_proto = ETHER_TYPE + + +In case of VLAN, flow dissector can be called with the two different states. + +Pre-VLAN parsing: ++------+------+------+-----+-----------+-----------+ +| DMAC | SMAC | TPID | TCI |ETHER_TYPE | L3_HEADER | ++------+------+------+-----+-----------+-----------+ + ^ + | + +-- flow dissector starts here + +skb->data + flow_keys->nhoff point the to first byte of TCI. +flow_keys->thoff = nhoff +flow_keys->n_proto = TPID + +Please note that TPID can be 802.1AD and, hence, BPF program would +have to parse VLAN information twice for double tagged packets. + + +Post-VLAN parsing: ++------+------+------+-----+-----------+-----------+ +| DMAC | SMAC | TPID | TCI |ETHER_TYPE | L3_HEADER | ++------+------+------+-----+-----------+-----------+ + ^ + | + +-- flow dissector starts here + +skb->data + flow_keys->nhoff point the to first byte of L3_HEADER. +flow_keys->thoff = nhoff +flow_keys->n_proto = ETHER_TYPE + +In this case VLAN information has been processed before the flow dissector +and BPF flow dissector is not required to handle it. + + +The takeaway here is as follows: BPF flow dissector program can be called with +the optional VLAN header and should gracefully handle both cases: when single +or double VLAN is present and when it is not present. The same program +can be called for both cases and would have to be written carefully to +handle both cases. 
+ + +Reference Implementation +======================== + +See tools/testing/selftests/bpf/progs/bpf_flow.c for the reference +implementation and tools/testing/selftests/bpf/flow_dissector_load.[hc] for +the loader. bpftool can be used to load BPF flow dissector program as well. + +The reference implementation is organized as follows: +* jmp_table map that contains sub-programs for each supported L3 protocol +* _dissect routine - entry point; it does input n_proto parsing and does + bpf_tail_call to the appropriate L3 handler + +Since BPF at this point doesn't support looping (or any jumping back), +jmp_table is used instead to handle multiple levels of encapsulation (and +IPv6 options). + + +Current Limitations +=================== +BPF flow dissector doesn't support exporting all the metadata that in-kernel +C-based implementation can export. Notable example is single VLAN (802.1Q) +and double VLAN (802.1AD) tags. Please refer to the 'struct bpf_flow_keys' +for a set of information that's currently can be exported from the BPF context. -- Gitee From a9448b7e730620394f43cbea31204c18379f57a2 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Wed, 3 Apr 2019 13:53:18 -0700 Subject: [PATCH 25/25] flow_dissector: rst'ify documentation ANBZ: #5530 commit 5eed7898626bedd6405421550c0c6e8ab9591bb2 upstream. Rename bpf_flow_dissector.txt to bpf_flow_dissector.rst and fix formatting. Also, link it from the Documentation/networking/index.rst. Tested with 'make htmldocs' to make sure it looks reasonable. 
Fixes: ae82899bbe92 ("flow_dissector: document BPF flow dissector environment") Signed-off-by: Stanislav Fomichev Acked-by: Martin KaFai Lau Signed-off-by: Daniel Borkmann Signed-off-by: Yuanhe Shu --- ...w_dissector.txt => bpf_flow_dissector.rst} | 105 ++++++++++-------- Documentation/networking/index.rst | 1 + 2 files changed, 59 insertions(+), 47 deletions(-) rename Documentation/networking/{bpf_flow_dissector.txt => bpf_flow_dissector.rst} (43%) diff --git a/Documentation/networking/bpf_flow_dissector.txt b/Documentation/networking/bpf_flow_dissector.rst similarity index 43% rename from Documentation/networking/bpf_flow_dissector.txt rename to Documentation/networking/bpf_flow_dissector.rst index 70f66a2d57e7..b375ae2ec2c4 100644 --- a/Documentation/networking/bpf_flow_dissector.txt +++ b/Documentation/networking/bpf_flow_dissector.rst @@ -1,3 +1,5 @@ +.. SPDX-License-Identifier: GPL-2.0 + ================== BPF Flow Dissector ================== @@ -15,19 +17,19 @@ number of instructions and tail calls). API === -BPF flow dissector programs operate on an __sk_buff. However, only the -limited set of fields is allowed: data, data_end and flow_keys. flow_keys -is 'struct bpf_flow_keys' and contains flow dissector input and -output arguments. +BPF flow dissector programs operate on an ``__sk_buff``. However, only the +limited set of fields is allowed: ``data``, ``data_end`` and ``flow_keys``. +``flow_keys`` is ``struct bpf_flow_keys`` and contains flow dissector input +and output arguments. The inputs are: - * nhoff - initial offset of the networking header - * thoff - initial offset of the transport header, initialized to nhoff - * n_proto - L3 protocol type, parsed out of L2 header + * ``nhoff`` - initial offset of the networking header + * ``thoff`` - initial offset of the transport header, initialized to nhoff + * ``n_proto`` - L3 protocol type, parsed out of L2 header -Flow dissector BPF program should fill out the rest of the 'struct -bpf_flow_keys' fields. 
Input arguments nhoff/thoff/n_proto should be also -adjusted accordingly. +Flow dissector BPF program should fill out the rest of the ``struct +bpf_flow_keys`` fields. Input arguments ``nhoff/thoff/n_proto`` should be +also adjusted accordingly. The return code of the BPF program is either BPF_OK to indicate successful dissection, or BPF_DROP to indicate parsing error. @@ -36,48 +38,57 @@ __sk_buff->data =============== In the VLAN-less case, this is what the initial state of the BPF flow -dissector looks like: -+------+------+------------+-----------+ -| DMAC | SMAC | ETHER_TYPE | L3_HEADER | -+------+------+------------+-----------+ - ^ - | - +-- flow dissector starts here +dissector looks like:: + + +------+------+------------+-----------+ + | DMAC | SMAC | ETHER_TYPE | L3_HEADER | + +------+------+------------+-----------+ + ^ + | + +-- flow dissector starts here -skb->data + flow_keys->nhoff point to the first byte of L3_HEADER. -flow_keys->thoff = nhoff -flow_keys->n_proto = ETHER_TYPE +.. code:: c + + skb->data + flow_keys->nhoff point to the first byte of L3_HEADER + flow_keys->thoff = nhoff + flow_keys->n_proto = ETHER_TYPE In case of VLAN, flow dissector can be called with the two different states. -Pre-VLAN parsing: -+------+------+------+-----+-----------+-----------+ -| DMAC | SMAC | TPID | TCI |ETHER_TYPE | L3_HEADER | -+------+------+------+-----+-----------+-----------+ - ^ - | - +-- flow dissector starts here +Pre-VLAN parsing:: + + +------+------+------+-----+-----------+-----------+ + | DMAC | SMAC | TPID | TCI |ETHER_TYPE | L3_HEADER | + +------+------+------+-----+-----------+-----------+ + ^ + | + +-- flow dissector starts here -skb->data + flow_keys->nhoff point the to first byte of TCI. -flow_keys->thoff = nhoff -flow_keys->n_proto = TPID +.. 
code:: c + + skb->data + flow_keys->nhoff point the to first byte of TCI + flow_keys->thoff = nhoff + flow_keys->n_proto = TPID Please note that TPID can be 802.1AD and, hence, BPF program would have to parse VLAN information twice for double tagged packets. -Post-VLAN parsing: -+------+------+------+-----+-----------+-----------+ -| DMAC | SMAC | TPID | TCI |ETHER_TYPE | L3_HEADER | -+------+------+------+-----+-----------+-----------+ - ^ - | - +-- flow dissector starts here +Post-VLAN parsing:: + + +------+------+------+-----+-----------+-----------+ + | DMAC | SMAC | TPID | TCI |ETHER_TYPE | L3_HEADER | + +------+------+------+-----+-----------+-----------+ + ^ + | + +-- flow dissector starts here + +.. code:: c -skb->data + flow_keys->nhoff point the to first byte of L3_HEADER. -flow_keys->thoff = nhoff -flow_keys->n_proto = ETHER_TYPE + skb->data + flow_keys->nhoff point the to first byte of L3_HEADER + flow_keys->thoff = nhoff + flow_keys->n_proto = ETHER_TYPE In this case VLAN information has been processed before the flow dissector and BPF flow dissector is not required to handle it. @@ -93,14 +104,14 @@ handle both cases. Reference Implementation ======================== -See tools/testing/selftests/bpf/progs/bpf_flow.c for the reference -implementation and tools/testing/selftests/bpf/flow_dissector_load.[hc] for -the loader. bpftool can be used to load BPF flow dissector program as well. +See ``tools/testing/selftests/bpf/progs/bpf_flow.c`` for the reference +implementation and ``tools/testing/selftests/bpf/flow_dissector_load.[hc]`` +for the loader. bpftool can be used to load BPF flow dissector program as well. 
The reference implementation is organized as follows: -* jmp_table map that contains sub-programs for each supported L3 protocol -* _dissect routine - entry point; it does input n_proto parsing and does - bpf_tail_call to the appropriate L3 handler + * ``jmp_table`` map that contains sub-programs for each supported L3 protocol + * ``_dissect`` routine - entry point; it does input ``n_proto`` parsing and + does ``bpf_tail_call`` to the appropriate L3 handler Since BPF at this point doesn't support looping (or any jumping back), jmp_table is used instead to handle multiple levels of encapsulation (and @@ -111,5 +122,5 @@ Current Limitations =================== BPF flow dissector doesn't support exporting all the metadata that in-kernel C-based implementation can export. Notable example is single VLAN (802.1Q) -and double VLAN (802.1AD) tags. Please refer to the 'struct bpf_flow_keys' +and double VLAN (802.1AD) tags. Please refer to the ``struct bpf_flow_keys`` for a set of information that's currently can be exported from the BPF context. diff --git a/Documentation/networking/index.rst b/Documentation/networking/index.rst index fcd710f2cc7a..609fd35b84d9 100644 --- a/Documentation/networking/index.rst +++ b/Documentation/networking/index.rst @@ -9,6 +9,7 @@ Contents: netdev-FAQ af_xdp batman-adv + bpf_flow_dissector can can_ucan_protocol dpaa2/index -- Gitee