From 84a060c37e7940681d035a74e45224dba3eaf2b2 Mon Sep 17 00:00:00 2001 From: John Haxby Date: Fri, 16 Jun 2023 15:49:57 +0800 Subject: [PATCH 01/39] ipv6: fix restrict IPV6_ADDRFORM operation ANBZ: #5530 commit 82c9ae440857840c56e05d4fb1427ee032531346 upstream. Commit b6f6118901d1 ("ipv6: restrict IPV6_ADDRFORM operation") fixed a problem found by syzbot an unfortunate logic error meant that it also broke IPV6_ADDRFORM. Rearrange the checks so that the earlier test is just one of the series of checks made before moving the socket from IPv6 to IPv4. Fixes: b6f6118901d1 ("ipv6: restrict IPV6_ADDRFORM operation") Signed-off-by: John Haxby Cc: stable@vger.kernel.org Signed-off-by: David S. Miller Signed-off-by: Xuezhi Zhao Acked-by: Tianchen Ding Reviewed-by: Yuanhe Shu Acked-by: Tony Lu Link: https://gitee.com/anolis/cloud-kernel/pulls/2035 --- net/ipv6/ipv6_sockglue.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c index 667162acf844..5767e2e9b7b3 100644 --- a/net/ipv6/ipv6_sockglue.c +++ b/net/ipv6/ipv6_sockglue.c @@ -193,15 +193,14 @@ static int do_ipv6_setsockopt(struct sock *sk, int level, int optname, retv = -EBUSY; break; } - } else if (sk->sk_protocol == IPPROTO_TCP) { - if (sk->sk_prot != &tcpv6_prot) { - retv = -EBUSY; - break; - } - break; - } else { + } + if (sk->sk_protocol == IPPROTO_TCP && + sk->sk_prot != &tcpv6_prot) { + retv = -EBUSY; break; } + if (sk->sk_protocol != IPPROTO_TCP) + break; if (sk->sk_state != TCP_ESTABLISHED) { retv = -ENOTCONN; break; -- Gitee From 00e9bf28628fa10e98974394e478e85c3414a810 Mon Sep 17 00:00:00 2001 From: Xiyu Yang Date: Fri, 16 Jun 2023 15:56:07 +0800 Subject: [PATCH 02/39] bpf: Fix sk_psock refcnt leak when receiving message ANBZ: #5530 commit 18f02ad19e2c2a1d9e1d55a4e1c0cbf51419151c upstream. tcp_bpf_recvmsg() invokes sk_psock_get(), which returns a reference of the specified sk_psock object to "psock" with increased refcnt. When tcp_bpf_recvmsg() returns, local variable "psock" becomes invalid, so the refcount should be decreased to keep refcount balanced. The reference counting issue happens in several exception handling paths of tcp_bpf_recvmsg(). When those error scenarios occur such as "flags" includes MSG_ERRQUEUE, the function forgets to decrease the refcnt increased by sk_psock_get(), causing a refcnt leak. Fix this issue by calling sk_psock_put() or pulling up the error queue read handling when those error scenarios occur. Fixes: e7a5f1f1cd000 ("bpf/sockmap: Read psock ingress_msg before sk_receive_queue") Signed-off-by: Xiyu Yang Signed-off-by: Xin Tan Signed-off-by: Daniel Borkmann Reviewed-by: Jakub Sitnicki Link: https://lore.kernel.org/bpf/1587872115-42805-1-git-send-email-xiyuyang19@fudan.edu.cn Signed-off-by: Xuezhi Zhao Acked-by: Tianchen Ding Reviewed-by: Yuanhe Shu Acked-by: Tony Lu Link: https://gitee.com/anolis/cloud-kernel/pulls/2035 --- net/ipv4/tcp_bpf.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c index 30284cae4b2d..286455b70750 100644 --- a/net/ipv4/tcp_bpf.c +++ b/net/ipv4/tcp_bpf.c @@ -108,14 +108,17 @@ int tcp_bpf_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, struct sk_psock *psock; int copied, ret; + if (unlikely(flags & MSG_ERRQUEUE)) + return inet_recv_error(sk, msg, len, addr_len); + psock = sk_psock_get(sk); if (unlikely(!psock)) return tcp_recvmsg(sk, msg, len, nonblock, flags, addr_len); - if (unlikely(flags & MSG_ERRQUEUE)) - return inet_recv_error(sk, msg, len, addr_len); if (!skb_queue_empty(&sk->sk_receive_queue) && - sk_psock_queue_empty(psock)) - return tcp_recvmsg(sk, msg, len, nonblock, flags, addr_len); + sk_psock_queue_empty(psock)) { + sk_psock_put(sk, psock); + return tcp_recvmsg(sk, msg, len, nonblock, flags, addr_len); + } lock_sock(sk); msg_bytes_ready: copied = __tcp_bpf_recvmsg(sk, psock, msg, len); -- Gitee From f989cc5ccc067d250f67b94f47de972b87c19b4c Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Fri, 16 Jun 2023 16:04:07 +0800 Subject: [PATCH 03/39] bpf, sockmap: bpf_tcp_ingress needs to subtract bytes from sg.size ANBZ: #5530 commit 81aabbb9fb7b4b1efd073b62f0505d3adad442f3 upstream. In bpf_tcp_ingress we used apply_bytes to subtract bytes from sg.size which is used to track total bytes in a message. But this is not correct because apply_bytes is itself modified in the main loop doing the mem_charge. Then at the end of this we have sg.size incorrectly set and out of sync with actual sk values. Then we can get a splat if we try to cork the data later and again try to redirect the msg to ingress. To fix instead of trying to track msg.size do the easy thing and include it as part of the sk_msg_xfer logic so that when the msg is moved the sg.size is always correct. To reproduce the below users will need ingress + cork and hit an error path that will then try to 'free' the skmsg. [ 173.699981] BUG: KASAN: null-ptr-deref in sk_msg_free_elem+0xdd/0x120 [ 173.699987] Read of size 8 at addr 0000000000000008 by task test_sockmap/5317 [ 173.700000] CPU: 2 PID: 5317 Comm: test_sockmap Tainted: G I 5.7.0-rc1+ #43 [ 173.700005] Hardware name: Dell Inc. Precision 5820 Tower/002KVM, BIOS 1.9.2 01/24/2019 [ 173.700009] Call Trace: [ 173.700021] dump_stack+0x8e/0xcb [ 173.700029] ? sk_msg_free_elem+0xdd/0x120 [ 173.700034] ? sk_msg_free_elem+0xdd/0x120 [ 173.700042] __kasan_report+0x102/0x15f [ 173.700052] ? sk_msg_free_elem+0xdd/0x120 [ 173.700060] kasan_report+0x32/0x50 [ 173.700070] sk_msg_free_elem+0xdd/0x120 [ 173.700080] __sk_msg_free+0x87/0x150 [ 173.700094] tcp_bpf_send_verdict+0x179/0x4f0 [ 173.700109] tcp_bpf_sendpage+0x3ce/0x5d0 Fixes: 604326b41a6fb ("bpf, sockmap: convert to generic sk_msg interface") Signed-off-by: John Fastabend Signed-off-by: Daniel Borkmann Reviewed-by: Jakub Sitnicki Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/158861290407.14306.5327773422227552482.stgit@john-Precision-5820-Tower Signed-off-by: Xuezhi Zhao Acked-by: Tianchen Ding Reviewed-by: Yuanhe Shu Acked-by: Tony Lu Link: https://gitee.com/anolis/cloud-kernel/pulls/2035 --- include/linux/skmsg.h | 1 + net/ipv4/tcp_bpf.c | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h index 3a357f91d515..0cbbeda20f64 100644 --- a/include/linux/skmsg.h +++ b/include/linux/skmsg.h @@ -175,6 +175,7 @@ static inline void sk_msg_xfer(struct sk_msg *dst, struct sk_msg *src, dst->sg.data[which] = src->sg.data[which]; dst->sg.data[which].length = size; dst->sg.size += size; + src->sg.size -= size; src->sg.data[which].length -= size; src->sg.data[which].offset += size; } diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c index 286455b70750..4ec01f76a784 100644 --- a/net/ipv4/tcp_bpf.c +++ b/net/ipv4/tcp_bpf.c @@ -190,7 +190,6 @@ static int bpf_tcp_ingress(struct sock *sk, struct sk_psock *psock, if (!ret) { msg->sg.start = i; - msg->sg.size -= apply_bytes; sk_psock_queue_msg(psock, tmp); sk->sk_data_ready(sk); } else { -- Gitee From 3d427790e3277f7f6a9d060c0c21551c1bad5207 Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Fri, 16 Jun 2023 16:08:13 +0800 Subject: [PATCH 04/39] ipv6: fix IPV6_ADDRFORM operation logic ANBZ: #5530 commit 79a1f0ccdbb4ad700590f61b00525b390cb53905 upstream. Socket option IPV6_ADDRFORM supports UDP/UDPLITE and TCP at present. Previously the checking logic looks like: if (sk->sk_protocol == IPPROTO_UDP || sk->sk_protocol == IPPROTO_UDPLITE) do_some_check; else if (sk->sk_protocol != IPPROTO_TCP) break; After commit b6f6118901d1 ("ipv6: restrict IPV6_ADDRFORM operation"), TCP was blocked as the logic changed to: if (sk->sk_protocol == IPPROTO_UDP || sk->sk_protocol == IPPROTO_UDPLITE) do_some_check; else if (sk->sk_protocol == IPPROTO_TCP) do_some_check; break; else break; Then after commit 82c9ae440857 ("ipv6: fix restrict IPV6_ADDRFORM operation") UDP/UDPLITE were blocked as the logic changed to: if (sk->sk_protocol == IPPROTO_UDP || sk->sk_protocol == IPPROTO_UDPLITE) do_some_check; if (sk->sk_protocol == IPPROTO_TCP) do_some_check; if (sk->sk_protocol != IPPROTO_TCP) break; Fix it by using Eric's code and simply remove the break in TCP check, which looks like: if (sk->sk_protocol == IPPROTO_UDP || sk->sk_protocol == IPPROTO_UDPLITE) do_some_check; else if (sk->sk_protocol == IPPROTO_TCP) do_some_check; else break; Fixes: 82c9ae440857 ("ipv6: fix restrict IPV6_ADDRFORM operation") Signed-off-by: Hangbin Liu Signed-off-by: David S. Miller Signed-off-by: Xuezhi Zhao Acked-by: Tianchen Ding Reviewed-by: Yuanhe Shu Acked-by: Tony Lu Link: https://gitee.com/anolis/cloud-kernel/pulls/2035 --- net/ipv6/ipv6_sockglue.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c index 5767e2e9b7b3..e46bad9bc160 100644 --- a/net/ipv6/ipv6_sockglue.c +++ b/net/ipv6/ipv6_sockglue.c @@ -193,14 +193,15 @@ static int do_ipv6_setsockopt(struct sock *sk, int level, int optname, retv = -EBUSY; break; } - } - if (sk->sk_protocol == IPPROTO_TCP && - sk->sk_prot != &tcpv6_prot) { - retv = -EBUSY; + } else if (sk->sk_protocol == IPPROTO_TCP) { + if (sk->sk_prot != &tcpv6_prot) { + retv = -EBUSY; + break; + } + } else { break; } - if (sk->sk_protocol != IPPROTO_TCP) - break; + if (sk->sk_state != TCP_ESTABLISHED) { retv = -ENOTCONN; break; -- Gitee From 08f294652609af578b0c19be54ec9342f3abc280 Mon Sep 17 00:00:00 2001 From: Jakub Sitnicki Date: Fri, 16 Jun 2023 16:28:56 +0800 Subject: [PATCH 05/39] bpf, sockhash: Fix memory leak when unlinking sockets in sock_hash_free ANBZ: #5530 commit 33a7c831565c43a7ee2f38c7df4c4a40e1dfdfed upstream. When sockhash gets destroyed while sockets are still linked to it, we will walk the bucket lists and delete the links. However, we are not freeing the list elements after processing them, leaking the memory. The leak can be triggered by close()'ing a sockhash map when it still contains sockets, and observed with kmemleak: unreferenced object 0xffff888116e86f00 (size 64): comm "race_sock_unlin", pid 223, jiffies 4294731063 (age 217.404s) hex dump (first 32 bytes): 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ 81 de e8 41 00 00 00 00 c0 69 2f 15 81 88 ff ff ...A.....i/..... backtrace: [<00000000dd089ebb>] sock_hash_update_common+0x4ca/0x760 [<00000000b8219bd5>] sock_hash_update_elem+0x1d2/0x200 [<000000005e2c23de>] __do_sys_bpf+0x2046/0x2990 [<00000000d0084618>] do_syscall_64+0xad/0x9a0 [<000000000d96f263>] entry_SYSCALL_64_after_hwframe+0x49/0xb3 Fix it by freeing the list element when we're done with it. Fixes: 604326b41a6f ("bpf, sockmap: convert to generic sk_msg interface") Signed-off-by: Jakub Sitnicki Signed-off-by: Alexei Starovoitov Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20200607205229.2389672-2-jakub@cloudflare.com Signed-off-by: Xuezhi Zhao Acked-by: Tianchen Ding Reviewed-by: Yuanhe Shu Acked-by: Tony Lu Link: https://gitee.com/anolis/cloud-kernel/pulls/2035 --- net/core/sock_map.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/core/sock_map.c b/net/core/sock_map.c index 402ebfe273a1..f44672172edb 100644 --- a/net/core/sock_map.c +++ b/net/core/sock_map.c @@ -884,6 +884,7 @@ static void sock_hash_free(struct bpf_map *map) sock_map_unref(elem->sk, elem); rcu_read_unlock(); release_sock(elem->sk); + sock_hash_free_elem(htab, elem); } } -- Gitee From a61ffa53eb182c56291fb5719d494eb07f6ebbeb Mon Sep 17 00:00:00 2001 From: Jakub Sitnicki Date: Fri, 16 Jun 2023 16:34:22 +0800 Subject: [PATCH 06/39] bpf, sockhash: Synchronize delete from bucket list on map free ANBZ: #5530 commit 75e68e5bf2c7fa9d3e874099139df03d5952a3e1 upstream. We can end up modifying the sockhash bucket list from two CPUs when a sockhash is being destroyed (sock_hash_free) on one CPU, while a socket that is in the sockhash is unlinking itself from it on another CPU it (sock_hash_delete_from_link). This results in accessing a list element that is in an undefined state as reported by KASAN: | ================================================================== | BUG: KASAN: wild-memory-access in sock_hash_free+0x13c/0x280 | Write of size 8 at addr dead000000000122 by task kworker/2:1/95 | | CPU: 2 PID: 95 Comm: kworker/2:1 Not tainted 5.7.0-rc7-02961-ge22c35ab0038-dirty #691 | Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS ?-20190727_073836-buildvm-ppc64le-16.ppc.fedoraproject.org-3.fc31 04/01/2014 | Workqueue: events bpf_map_free_deferred | Call Trace: | dump_stack+0x97/0xe0 | ? sock_hash_free+0x13c/0x280 | __kasan_report.cold+0x5/0x40 | ? mark_lock+0xbc1/0xc00 | ? sock_hash_free+0x13c/0x280 | kasan_report+0x38/0x50 | ? sock_hash_free+0x152/0x280 | sock_hash_free+0x13c/0x280 | bpf_map_free_deferred+0xb2/0xd0 | ? bpf_map_charge_finish+0x50/0x50 | ? rcu_read_lock_sched_held+0x81/0xb0 | ? rcu_read_lock_bh_held+0x90/0x90 | process_one_work+0x59a/0xac0 | ? lock_release+0x3b0/0x3b0 | ? pwq_dec_nr_in_flight+0x110/0x110 | ? rwlock_bug.part.0+0x60/0x60 | worker_thread+0x7a/0x680 | ? _raw_spin_unlock_irqrestore+0x4c/0x60 | kthread+0x1cc/0x220 | ? process_one_work+0xac0/0xac0 | ? kthread_create_on_node+0xa0/0xa0 | ret_from_fork+0x24/0x30 | ================================================================== Fix it by reintroducing spin-lock protected critical section around the code that removes the elements from the bucket on sockhash free. To do that we also need to defer processing of removed elements, until out of atomic context so that we can unlink the socket from the map when holding the sock lock. Fixes: 90db6d772f74 ("bpf, sockmap: Remove bucket->lock from sock_{hash|map}_free") Reported-by: Eric Dumazet Signed-off-by: Jakub Sitnicki Signed-off-by: Alexei Starovoitov Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20200607205229.2389672-3-jakub@cloudflare.com Signed-off-by: Xuezhi Zhao Acked-by: Tianchen Ding Reviewed-by: Yuanhe Shu Acked-by: Tony Lu Link: https://gitee.com/anolis/cloud-kernel/pulls/2035 --- net/core/sock_map.c | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/net/core/sock_map.c b/net/core/sock_map.c index f44672172edb..89eb94ce435d 100644 --- a/net/core/sock_map.c +++ b/net/core/sock_map.c @@ -866,6 +866,7 @@ static void sock_hash_free(struct bpf_map *map) { struct bpf_htab *htab = container_of(map, struct bpf_htab, map); struct bpf_htab_bucket *bucket; + struct hlist_head unlink_list; struct bpf_htab_elem *elem; struct hlist_node *node; int i; @@ -877,13 +878,31 @@ static void sock_hash_free(struct bpf_map *map) synchronize_rcu(); for (i = 0; i < htab->buckets_num; i++) { bucket = sock_hash_select_bucket(htab, i); - hlist_for_each_entry_safe(elem, node, &bucket->head, node) { - hlist_del_rcu(&elem->node); + + /* We are racing with sock_hash_delete_from_link to + * enter the spin-lock critical section. Every socket on + * the list is still linked to sockhash. Since link + * exists, psock exists and holds a ref to socket. That + * lets us to grab a socket ref too. + */ + raw_spin_lock_bh(&bucket->lock); + hlist_for_each_entry(elem, &bucket->head, node) + sock_hold(elem->sk); + hlist_move_list(&bucket->head, &unlink_list); + raw_spin_unlock_bh(&bucket->lock); + + /* Process removed entries out of atomic context to + * block for socket lock before deleting the psock's + * link to sockhash. + */ + hlist_for_each_entry_safe(elem, node, &unlink_list, node) { + hlist_del(&elem->node); lock_sock(elem->sk); rcu_read_lock(); sock_map_unref(elem->sk, elem); rcu_read_unlock(); release_sock(elem->sk); + sock_put(elem->sk); sock_hash_free_elem(htab, elem); } } -- Gitee From 06b09b583267010f49e70304bd6f9f79aef11e39 Mon Sep 17 00:00:00 2001 From: Sabrina Dubroca Date: Fri, 16 Jun 2023 16:39:05 +0800 Subject: [PATCH 07/39] bpf: tcp: Recv() should return 0 when the peer socket is closed ANBZ: #5530 commit 2c7269b231194aae23fb90ab65842573a91acbc9 upstream. If the peer is closed, we will never get more data, so tcp_bpf_wait_data will get stuck forever. In case we passed MSG_DONTWAIT to recv(), we get EAGAIN but we should actually get 0. >From man 2 recv: RETURN VALUE When a stream socket peer has performed an orderly shutdown, the return value will be 0 (the traditional "end-of-file" return). This patch makes tcp_bpf_wait_data always return 1 when the peer socket has been shutdown. Either we have data available, and it would have returned 1 anyway, or there isn't, in which case we'll call tcp_recvmsg which does the right thing in this situation. Fixes: 604326b41a6f ("bpf, sockmap: convert to generic sk_msg interface") Signed-off-by: Sabrina Dubroca Signed-off-by: Alexei Starovoitov Acked-by: Jakub Sitnicki Link: https://lore.kernel.org/bpf/26038a28c21fea5d04d4bd4744c5686d3f2e5504.1591784177.git.sd@queasysnail.net Signed-off-by: Xuezhi Zhao Acked-by: Tianchen Ding Reviewed-by: Yuanhe Shu Acked-by: Tony Lu Link: https://gitee.com/anolis/cloud-kernel/pulls/2035 --- net/ipv4/tcp_bpf.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c index 4ec01f76a784..98df24756a95 100644 --- a/net/ipv4/tcp_bpf.c +++ b/net/ipv4/tcp_bpf.c @@ -28,6 +28,9 @@ static int tcp_bpf_wait_data(struct sock *sk, struct sk_psock *psock, DEFINE_WAIT_FUNC(wait, woken_wake_function); int ret = 0; + if (sk->sk_shutdown & RCV_SHUTDOWN) + return 1; + if (!timeo) return ret; -- Gitee From 0bdd4652f4bad5e36fdb4b4de6446a20730288d6 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Mon, 19 Jun 2023 10:58:51 +0800 Subject: [PATCH 08/39] bpf, sockmap, tcp: sk_prot needs inuse_idx set for proc stats ANBZ: #5530 commit 228a4a7ba8e99bb9ef980b62f71e3be33f4aae69 upstream. The proc socket stats use sk_prot->inuse_idx value to record inuse sock stats. We currently do not set this correctly from sockmap side. The result is reading sock stats '/proc/net/sockstat' gives incorrect values. The socket counter is incremented correctly, but because we don't set the counter correctly when we replace sk_prot we may omit the decrement. To get the correct inuse_idx value move the core_initcall that initializes the TCP proto handlers to late_initcall. This way it is initialized after TCP has the chance to assign the inuse_idx value from the register protocol handler. Fixes: 604326b41a6fb ("bpf, sockmap: convert to generic sk_msg interface") Suggested-by: Jakub Sitnicki Signed-off-by: John Fastabend Signed-off-by: Daniel Borkmann Reviewed-by: Cong Wang Link: https://lore.kernel.org/bpf/20210712195546.423990-3-john.fastabend@gmail.com Signed-off-by: Xuezhi Zhao Acked-by: Tianchen Ding Reviewed-by: Yuanhe Shu Acked-by: Tony Lu Link: https://gitee.com/anolis/cloud-kernel/pulls/2035 --- net/ipv4/tcp_bpf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c index 98df24756a95..db519b9d0dfb 100644 --- a/net/ipv4/tcp_bpf.c +++ b/net/ipv4/tcp_bpf.c @@ -597,7 +597,7 @@ static int __init tcp_bpf_v4_build_proto(void) tcp_bpf_rebuild_protos(tcp_bpf_prots[TCP_BPF_IPV4], &tcp_prot); return 0; } -core_initcall(tcp_bpf_v4_build_proto); +late_initcall(tcp_bpf_v4_build_proto); static void tcp_bpf_update_sk_prot(struct sock *sk, struct sk_psock *psock) { -- Gitee From 46ce86da5090261733c6dc597a43b0fe6a1ddb42 Mon Sep 17 00:00:00 2001 From: Liu Jian Date: Mon, 19 Jun 2023 11:15:46 +0800 Subject: [PATCH 09/39] tcp_bpf: Fix one concurrency problem in the tcp_bpf_send_verdict function ANBZ: #5530 commit cd9733f5d75c94a32544d6ce5be47e14194cf137 upstream. With two Msgs, msgA and msgB and a user doing nonblocking sendmsg calls (or multiple cores) on a single socket 'sk' we could get the following flow. msgA, sk msgB, sk ----------- --------------- tcp_bpf_sendmsg() lock(sk) psock = sk->psock tcp_bpf_sendmsg() lock(sk) ... blocking tcp_bpf_send_verdict if (psock->eval == NONE) psock->eval = sk_psock_msg_verdict .. < handle SK_REDIRECT case > release_sock(sk) < lock dropped so grab here > ret = tcp_bpf_sendmsg_redir psock = sk->psock tcp_bpf_send_verdict lock_sock(sk) ... blocking on B if (psock->eval == NONE) <- boom. psock->eval will have msgA state The problem here is we dropped the lock on msgA and grabbed it with msgB. Now we have old state in psock and importantly psock->eval has not been cleared. So msgB will run whatever action was done on A and the verdict program may never see it. Fixes: 604326b41a6fb ("bpf, sockmap: convert to generic sk_msg interface") Signed-off-by: Liu Jian Signed-off-by: Alexei Starovoitov Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20211012052019.184398-1-liujian56@huawei.com Signed-off-by: Xuezhi Zhao Acked-by: Tianchen Ding Reviewed-by: Yuanhe Shu Acked-by: Tony Lu Link: https://gitee.com/anolis/cloud-kernel/pulls/2035 --- net/ipv4/tcp_bpf.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c index db519b9d0dfb..546ff3428a2a 100644 --- a/net/ipv4/tcp_bpf.c +++ b/net/ipv4/tcp_bpf.c @@ -286,6 +286,7 @@ static int tcp_bpf_send_verdict(struct sock *sk, struct sk_psock *psock, bool cork = false, enospc = msg->sg.start == msg->sg.end; struct sock *sk_redir; u32 tosend; + u32 eval = __SK_NONE; int ret; more_data: @@ -321,13 +322,24 @@ static int tcp_bpf_send_verdict(struct sock *sk, struct sk_psock *psock, case __SK_REDIRECT: sk_redir = psock->sk_redir; sk_msg_apply_bytes(psock, tosend); + if (!psock->apply_bytes) { + /* Clean up before releasing the sock lock. */ + eval = psock->eval; + psock->eval = __SK_NONE; + psock->sk_redir = NULL; + } if (psock->cork) { cork = true; psock->cork = NULL; } sk_msg_return(sk, msg, tosend); release_sock(sk); + ret = tcp_bpf_sendmsg_redir(sk_redir, msg, tosend, flags); + + if (eval == __SK_REDIRECT) + sock_put(sk_redir); + lock_sock(sk); if (unlikely(ret < 0)) { int free = sk_msg_free_nocharge(sk, msg); -- Gitee From cd746d39dc54917e9d2d2a15428ae05a4ddcf14c Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Mon, 19 Jun 2023 11:38:13 +0800 Subject: [PATCH 10/39] bpf: sockmap, strparser, and tls are reusing qdisc_skb_cb and colliding ANBZ: #5530 commit e0dc3b93bd7bcff8c3813d1df43e0908499c7cf0 upstream. Strparser is reusing the qdisc_skb_cb struct to stash the skb message handling progress, e.g. offset and length of the skb. First this is poorly named and inherits a struct from qdisc that doesn't reflect the actual usage of cb[] at this layer. But, more importantly strparser is using the following to access its metadata. (struct _strp_msg *)((void *)skb->cb + offsetof(struct qdisc_skb_cb, data)) Where _strp_msg is defined as: struct _strp_msg { struct strp_msg strp; /* 0 8 */ int accum_len; /* 8 4 */ /* size: 12, cachelines: 1, members: 2 */ /* last cacheline: 12 bytes */ }; So we use 12 bytes of ->data[] in struct. However in BPF code running parser and verdict the user has read capabilities into the data[] array as well. Its not too problematic, but we should not be exposing internal state to BPF program. If its really needed then we can use the probe_read() APIs which allow reading kernel memory. And I don't believe cb[] layer poses any API breakage by moving this around because programs can't depend on cb[] across layers. In order to fix another issue with a ctx rewrite we need to stash a temp variable somewhere. To make this work cleanly this patch builds a cb struct for sk_skb types called sk_skb_cb struct. Then we can use this consistently in the strparser, sockmap space. Additionally we can start allowing ->cb[] Fixes: 604326b41a6fb ("bpf, sockmap: convert to generic sk_msg interface") Signed-off-by: John Fastabend Signed-off-by: Daniel Borkmann Tested-by: Jussi Maki Reviewed-by: Jakub Sitnicki Link: https://lore.kernel.org/bpf/20211103204736.248403-5-john.fastabend@gmail.com Signed-off-by: Xuezhi Zhao Acked-by: Tianchen Ding Reviewed-by: Yuanhe Shu Acked-by: Tony Lu Link: https://gitee.com/anolis/cloud-kernel/pulls/2035 --- include/net/strparser.h | 16 +++++++++++++++- net/core/filter.c | 19 +++++++++++++++++++ net/strparser/strparser.c | 10 +--------- 3 files changed, 35 insertions(+), 10 deletions(-) diff --git a/include/net/strparser.h b/include/net/strparser.h index f177c87ce38b..ef0822d110a8 100644 --- a/include/net/strparser.h +++ b/include/net/strparser.h @@ -57,10 +57,24 @@ struct strp_msg { int offset; }; +struct _strp_msg { + /* Internal cb structure. struct strp_msg must be first for passing + * to upper layer. + */ + struct strp_msg strp; + int accum_len; +}; + +struct sk_skb_cb { +#define SK_SKB_CB_PRIV_LEN 20 + unsigned char data[SK_SKB_CB_PRIV_LEN]; + struct _strp_msg strp; +}; + static inline struct strp_msg *strp_msg(struct sk_buff *skb) { return (struct strp_msg *)((void *)skb->cb + - offsetof(struct qdisc_skb_cb, data)); + offsetof(struct sk_skb_cb, strp)); } /* Structure for an attached lower socket */ diff --git a/net/core/filter.c b/net/core/filter.c index d85e3c6f29d2..aab7889860c2 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -6850,6 +6850,25 @@ static u32 sk_skb_convert_ctx_access(enum bpf_access_type type, *insn++ = BPF_LDX_MEM(BPF_SIZEOF(void *), si->dst_reg, si->src_reg, off); break; + case offsetof(struct __sk_buff, cb[0]) ... + offsetofend(struct __sk_buff, cb[4]) - 1: + BUILD_BUG_ON(sizeof_field(struct sk_skb_cb, data) < 20); + BUILD_BUG_ON((offsetof(struct sk_buff, cb) + + offsetof(struct sk_skb_cb, data)) % + sizeof(__u64)); + + prog->cb_access = 1; + off = si->off; + off -= offsetof(struct __sk_buff, cb[0]); + off += offsetof(struct sk_buff, cb); + off += offsetof(struct sk_skb_cb, data); + if (type == BPF_WRITE) + *insn++ = BPF_STX_MEM(BPF_SIZE(si->code), si->dst_reg, + si->src_reg, off); + else + *insn++ = BPF_LDX_MEM(BPF_SIZE(si->code), si->dst_reg, + si->src_reg, off); + break; default: return bpf_convert_ctx_access(type, si, insn_buf, prog, target_size); diff --git a/net/strparser/strparser.c b/net/strparser/strparser.c index 0f4e42792878..e57d25191c4d 100644 --- a/net/strparser/strparser.c +++ b/net/strparser/strparser.c @@ -29,18 +29,10 @@ static struct workqueue_struct *strp_wq; -struct _strp_msg { - /* Internal cb structure. struct strp_msg must be first for passing - * to upper layer. - */ - struct strp_msg strp; - int accum_len; -}; - static inline struct _strp_msg *_strp_msg(struct sk_buff *skb) { return (struct _strp_msg *)((void *)skb->cb + - offsetof(struct qdisc_skb_cb, data)); + offsetof(struct sk_skb_cb, strp)); } /* Lower lock held */ -- Gitee From f20f038c8f63b51877a902aabcf0eb2e9d1e8823 Mon Sep 17 00:00:00 2001 From: Wang Yufen Date: Mon, 19 Jun 2023 11:43:17 +0800 Subject: [PATCH 11/39] bpf, sockmap: Fix memleak in tcp_bpf_sendmsg while sk msg is full ANBZ: #5530 commit 9c34e38c4a870eb30b13f42f5b44f42e9d19ccb8 upstream. If tcp_bpf_sendmsg() is running while sk msg is full. When sk_msg_alloc() returns -ENOMEM error, tcp_bpf_sendmsg() goes to wait_for_memory. If partial memory has been alloced by sk_msg_alloc(), that is, msg_tx->sg.size is greater than osize after sk_msg_alloc(), memleak occurs. To fix we use sk_msg_trim() to release the allocated memory, then goto wait for memory. Other call paths of sk_msg_alloc() have the similar issue, such as tls_sw_sendmsg(), so handle sk_msg_trim logic inside sk_msg_alloc(), as Cong Wang suggested. This issue can cause the following info: WARNING: CPU: 3 PID: 7950 at net/core/stream.c:208 sk_stream_kill_queues+0xd4/0x1a0 Call Trace: inet_csk_destroy_sock+0x55/0x110 __tcp_close+0x279/0x470 tcp_close+0x1f/0x60 inet_release+0x3f/0x80 __sock_release+0x3d/0xb0 sock_close+0x11/0x20 __fput+0x92/0x250 task_work_run+0x6a/0xa0 do_exit+0x33b/0xb60 do_group_exit+0x2f/0xa0 get_signal+0xb6/0x950 arch_do_signal_or_restart+0xac/0x2a0 exit_to_user_mode_prepare+0xa9/0x200 syscall_exit_to_user_mode+0x12/0x30 do_syscall_64+0x46/0x80 entry_SYSCALL_64_after_hwframe+0x44/0xae WARNING: CPU: 3 PID: 2094 at net/ipv4/af_inet.c:155 inet_sock_destruct+0x13c/0x260 Call Trace: __sk_destruct+0x24/0x1f0 sk_psock_destroy+0x19b/0x1c0 process_one_work+0x1b3/0x3c0 kthread+0xe6/0x110 ret_from_fork+0x22/0x30 Fixes: 604326b41a6f ("bpf, sockmap: convert to generic sk_msg interface") Signed-off-by: Wang Yufen Signed-off-by: Daniel Borkmann Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20220304081145.2037182-3-wangyufen@huawei.com Signed-off-by: Xuezhi Zhao Acked-by: Tianchen Ding Reviewed-by: Yuanhe Shu Acked-by: Tony Lu Link: https://gitee.com/anolis/cloud-kernel/pulls/2035 --- net/core/skmsg.c | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/net/core/skmsg.c b/net/core/skmsg.c index ed3de15d8c87..1d30115150b1 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -26,6 +26,7 @@ int sk_msg_alloc(struct sock *sk, struct sk_msg *msg, int len, int elem_first_coalesce) { struct page_frag *pfrag = sk_page_frag(sk); + u32 osize = msg->sg.size; int ret = 0; len -= msg->sg.size; @@ -34,13 +35,17 @@ int sk_msg_alloc(struct sock *sk, struct sk_msg *msg, int len, u32 orig_offset; int use, i; - if (!sk_page_frag_refill(sk, pfrag)) - return -ENOMEM; + if (!sk_page_frag_refill(sk, pfrag)) { + ret = -ENOMEM; + goto msg_trim; + } orig_offset = pfrag->offset; use = min_t(int, len, pfrag->size - orig_offset); - if (!sk_wmem_schedule(sk, use)) - return -ENOMEM; + if (!sk_wmem_schedule(sk, use)) { + ret = -ENOMEM; + goto msg_trim; + } i = msg->sg.end; sk_msg_iter_var_prev(i); @@ -70,6 +75,10 @@ int sk_msg_alloc(struct sock *sk, struct sk_msg *msg, int len, } return ret; + +msg_trim: + sk_msg_trim(sk, msg, osize); + return ret; } EXPORT_SYMBOL_GPL(sk_msg_alloc); -- Gitee From 8f8896593a2aafcc983dec46361710ed887a5524 Mon Sep 17 00:00:00 2001 From: Wang Yufen Date: Mon, 19 Jun 2023 13:13:51 +0800 Subject: [PATCH 12/39] bpf, sockmap: Fix more uncharged while msg has more_data ANBZ: #5530 commit 84472b436e760ba439e1969a9e3c5ae7c86de39d upstream. In tcp_bpf_send_verdict(), if msg has more data after tcp_bpf_sendmsg_redir(): tcp_bpf_send_verdict() tosend = msg->sg.size //msg->sg.size = 22220 case __SK_REDIRECT: sk_msg_return() //uncharged msg->sg.size(22220) sk->sk_forward_alloc tcp_bpf_sendmsg_redir() //after tcp_bpf_sendmsg_redir, msg->sg.size=11000 goto more_data; tosend = msg->sg.size //msg->sg.size = 11000 case __SK_REDIRECT: sk_msg_return() //uncharged msg->sg.size(11000) to sk->sk_forward_alloc The msg->sg.size(11000) has been uncharged twice, to fix we can charge the remaining msg->sg.size before goto more data. This issue can cause the following info: WARNING: CPU: 0 PID: 9860 at net/core/stream.c:208 sk_stream_kill_queues+0xd4/0x1a0 Call Trace: inet_csk_destroy_sock+0x55/0x110 __tcp_close+0x279/0x470 tcp_close+0x1f/0x60 inet_release+0x3f/0x80 __sock_release+0x3d/0xb0 sock_close+0x11/0x20 __fput+0x92/0x250 task_work_run+0x6a/0xa0 do_exit+0x33b/0xb60 do_group_exit+0x2f/0xa0 get_signal+0xb6/0x950 arch_do_signal_or_restart+0xac/0x2a0 ? vfs_write+0x237/0x290 exit_to_user_mode_prepare+0xa9/0x200 syscall_exit_to_user_mode+0x12/0x30 do_syscall_64+0x46/0x80 entry_SYSCALL_64_after_hwframe+0x44/0xae WARNING: CPU: 0 PID: 2136 at net/ipv4/af_inet.c:155 inet_sock_destruct+0x13c/0x260 Call Trace: __sk_destruct+0x24/0x1f0 sk_psock_destroy+0x19b/0x1c0 process_one_work+0x1b3/0x3c0 worker_thread+0x30/0x350 ? process_one_work+0x3c0/0x3c0 kthread+0xe6/0x110 ? kthread_complete_and_exit+0x20/0x20 ret_from_fork+0x22/0x30 Fixes: 604326b41a6f ("bpf, sockmap: convert to generic sk_msg interface") Signed-off-by: Wang Yufen Signed-off-by: Daniel Borkmann Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20220304081145.2037182-4-wangyufen@huawei.com Signed-off-by: Xuezhi Zhao Acked-by: Tianchen Ding Reviewed-by: Yuanhe Shu Acked-by: Tony Lu Link: https://gitee.com/anolis/cloud-kernel/pulls/2035 --- net/ipv4/tcp_bpf.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c index 546ff3428a2a..a0f1c2eb4272 100644 --- a/net/ipv4/tcp_bpf.c +++ b/net/ipv4/tcp_bpf.c @@ -332,7 +332,7 @@ static int tcp_bpf_send_verdict(struct sock *sk, struct sk_psock *psock, cork = true; psock->cork = NULL; } - sk_msg_return(sk, msg, tosend); + sk_msg_return(sk, msg, msg->sg.size); release_sock(sk); ret = tcp_bpf_sendmsg_redir(sk_redir, msg, tosend, flags); @@ -372,8 +372,11 @@ static int tcp_bpf_send_verdict(struct sock *sk, struct sk_psock *psock, } if (msg && msg->sg.data[msg->sg.start].page_link && - msg->sg.data[msg->sg.start].length) + msg->sg.data[msg->sg.start].length) { + if (eval == __SK_REDIRECT) + sk_mem_charge(sk, msg->sg.size); goto more_data; + } } return ret; } -- Gitee From 7c57a6f2714dcbb01c7159ccd68a040703b2c33b Mon Sep 17 00:00:00 2001 From: Wang Yufen Date: Mon, 19 Jun 2023 13:22:20 +0800 Subject: [PATCH 13/39] bpf, sockmap: Fix double uncharge the mem of sk_msg ANBZ: #5530 commit 2486ab434b2c2a14e9237296db00b1e1b7ae3273 upstream. If tcp_bpf_sendmsg is running during a tear down operation, psock may be freed. tcp_bpf_sendmsg() tcp_bpf_send_verdict() sk_msg_return() tcp_bpf_sendmsg_redir() unlikely(!psock)) sk_msg_free() The mem of msg has been uncharged in tcp_bpf_send_verdict() by sk_msg_return(), and would be uncharged by sk_msg_free() again. When psock is null, we can simply returning an error code, this would then trigger the sk_msg_free_nocharge in the error path of __SK_REDIRECT and would have the side effect of throwing an error up to user space. This would be a slight change in behavior from user side but would look the same as an error if the redirect on the socket threw an error. This issue can cause the following info: WARNING: CPU: 0 PID: 2136 at net/ipv4/af_inet.c:155 inet_sock_destruct+0x13c/0x260 Call Trace: __sk_destruct+0x24/0x1f0 sk_psock_destroy+0x19b/0x1c0 process_one_work+0x1b3/0x3c0 worker_thread+0x30/0x350 ? process_one_work+0x3c0/0x3c0 kthread+0xe6/0x110 ? kthread_complete_and_exit+0x20/0x20 ret_from_fork+0x22/0x30 Fixes: 604326b41a6f ("bpf, sockmap: convert to generic sk_msg interface") Signed-off-by: Wang Yufen Signed-off-by: Daniel Borkmann Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20220304081145.2037182-5-wangyufen@huawei.com Signed-off-by: Xuezhi Zhao Acked-by: Tianchen Ding Reviewed-by: Yuanhe Shu Acked-by: Tony Lu Link: https://gitee.com/anolis/cloud-kernel/pulls/2035 --- net/ipv4/tcp_bpf.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c index a0f1c2eb4272..e6adca0b9efa 100644 --- a/net/ipv4/tcp_bpf.c +++ b/net/ipv4/tcp_bpf.c @@ -269,10 +269,9 @@ int tcp_bpf_sendmsg_redir(struct sock *sk, struct sk_msg *msg, struct sk_psock *psock = sk_psock_get(sk); int ret; - if (unlikely(!psock)) { - sk_msg_free(sk, msg); - return 0; - } + if (unlikely(!psock)) + return -EPIPE; + ret = ingress ? bpf_tcp_ingress(sk, psock, msg, bytes, flags) : tcp_bpf_push_locked(sk, msg, bytes, flags, false); sk_psock_put(sk, psock); -- Gitee From be6a5eed472282645c0e4838c673affe2654629e Mon Sep 17 00:00:00 2001 From: Wang Yufen Date: Mon, 19 Jun 2023 13:42:18 +0800 Subject: [PATCH 14/39] bpf, sockmap: Fix the sk->sk_forward_alloc warning of sk_stream_kill_queues ANBZ: #5530 commit 8ec95b94716a1e4d126edc3fb2bc426a717e2dba upstream. When running `test_sockmap` selftests, the following warning appears: WARNING: CPU: 2 PID: 197 at net/core/stream.c:205 sk_stream_kill_queues+0xd3/0xf0 Call Trace: inet_csk_destroy_sock+0x55/0x110 tcp_rcv_state_process+0xd28/0x1380 ? tcp_v4_do_rcv+0x77/0x2c0 tcp_v4_do_rcv+0x77/0x2c0 __release_sock+0x106/0x130 __tcp_close+0x1a7/0x4e0 tcp_close+0x20/0x70 inet_release+0x3c/0x80 __sock_release+0x3a/0xb0 sock_close+0x14/0x20 __fput+0xa3/0x260 task_work_run+0x59/0xb0 exit_to_user_mode_prepare+0x1b3/0x1c0 syscall_exit_to_user_mode+0x19/0x50 do_syscall_64+0x48/0x90 entry_SYSCALL_64_after_hwframe+0x44/0xae The root case is in commit 84472b436e76 ("bpf, sockmap: Fix more uncharged while msg has more_data"), where I used msg->sg.size to replace the tosend, causing breakage: if (msg->apply_bytes && msg->apply_bytes < tosend) tosend = psock->apply_bytes; Fixes: 84472b436e76 ("bpf, sockmap: Fix more uncharged while msg has more_data") Reported-by: Jakub Sitnicki Signed-off-by: Wang Yufen Signed-off-by: Daniel Borkmann Acked-by: John Fastabend Acked-by: Jakub Sitnicki Link: https://lore.kernel.org/bpf/1667266296-8794-1-git-send-email-wangyufen@huawei.com Signed-off-by: Xuezhi Zhao Acked-by: Tianchen Ding Reviewed-by: Yuanhe Shu Acked-by: Tony Lu Link: https://gitee.com/anolis/cloud-kernel/pulls/2035 --- net/ipv4/tcp_bpf.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c index e6adca0b9efa..263c25b2c7aa 100644 --- a/net/ipv4/tcp_bpf.c +++ b/net/ipv4/tcp_bpf.c @@ -284,7 +284,7 @@ static int tcp_bpf_send_verdict(struct sock *sk, struct sk_psock *psock, { bool cork = false, enospc = msg->sg.start == msg->sg.end; struct sock *sk_redir; - u32 tosend; + u32 tosend, origsize, sent; u32 eval = __SK_NONE; int ret; @@ -331,10 +331,12 @@ static int tcp_bpf_send_verdict(struct sock *sk, struct sk_psock *psock, cork = true; psock->cork = NULL; } - sk_msg_return(sk, msg, msg->sg.size); + sk_msg_return(sk, msg, tosend); release_sock(sk); + origsize = msg->sg.size; ret = tcp_bpf_sendmsg_redir(sk_redir, msg, tosend, flags); + sent = origsize - msg->sg.size; if (eval == __SK_REDIRECT) sock_put(sk_redir); @@ -373,7 +375,7 @@ static int tcp_bpf_send_verdict(struct sock *sk, struct sk_psock *psock, msg->sg.data[msg->sg.start].page_link && msg->sg.data[msg->sg.start].length) { if (eval == __SK_REDIRECT) - sk_mem_charge(sk, msg->sg.size); + sk_mem_charge(sk, tosend - sent); goto more_data; } } -- Gitee From 5a7ee173723987cd237d2222955993efc7dea9f6 Mon Sep 17 00:00:00 2001 From: Pengcheng Yang Date: Mon, 19 Jun 2023 13:56:53 +0800 Subject: [PATCH 15/39] bpf, sockmap: Fix repeated calls to sock_put() when msg has more_data ANBZ: #5530 commit 7a9841ca025275b5b0edfb0b618934abb6ceec15 upstream. In tcp_bpf_send_verdict() redirection, the eval variable is assigned to __SK_REDIRECT after the apply_bytes data is sent, if msg has more_data, sock_put() will be called multiple times. We should reset the eval variable to __SK_NONE every time more_data starts. This causes: IPv4: Attempt to release TCP socket in state 1 00000000b4c925d7 ------------[ cut here ]------------ refcount_t: addition on 0; use-after-free. WARNING: CPU: 5 PID: 4482 at lib/refcount.c:25 refcount_warn_saturate+0x7d/0x110 Modules linked in: CPU: 5 PID: 4482 Comm: sockhash_bypass Kdump: loaded Not tainted 6.0.0 #1 Hardware name: Red Hat KVM, BIOS 1.11.0-2.el7 04/01/2014 Call Trace: __tcp_transmit_skb+0xa1b/0xb90 ? __alloc_skb+0x8c/0x1a0 ? __kmalloc_node_track_caller+0x184/0x320 tcp_write_xmit+0x22a/0x1110 __tcp_push_pending_frames+0x32/0xf0 do_tcp_sendpages+0x62d/0x640 tcp_bpf_push+0xae/0x2c0 tcp_bpf_sendmsg_redir+0x260/0x410 ? preempt_count_add+0x70/0xa0 tcp_bpf_send_verdict+0x386/0x4b0 tcp_bpf_sendmsg+0x21b/0x3b0 sock_sendmsg+0x58/0x70 __sys_sendto+0xfa/0x170 ? xfd_validate_state+0x1d/0x80 ? switch_fpu_return+0x59/0xe0 __x64_sys_sendto+0x24/0x30 do_syscall_64+0x37/0x90 entry_SYSCALL_64_after_hwframe+0x63/0xcd Fixes: cd9733f5d75c ("tcp_bpf: Fix one concurrency problem in the tcp_bpf_send_verdict function") Signed-off-by: Pengcheng Yang Signed-off-by: Daniel Borkmann Acked-by: Jakub Sitnicki Link: https://lore.kernel.org/bpf/1669718441-2654-2-git-send-email-yangpc@wangsu.com Signed-off-by: Xuezhi Zhao Acked-by: Tianchen Ding Reviewed-by: Yuanhe Shu Acked-by: Tony Lu Link: https://gitee.com/anolis/cloud-kernel/pulls/2035 --- net/ipv4/tcp_bpf.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c index 263c25b2c7aa..da70b4f4eda0 100644 --- a/net/ipv4/tcp_bpf.c +++ b/net/ipv4/tcp_bpf.c @@ -285,7 +285,7 @@ static int tcp_bpf_send_verdict(struct sock *sk, struct sk_psock *psock, bool cork = false, enospc = msg->sg.start == msg->sg.end; struct sock *sk_redir; u32 tosend, origsize, sent; - u32 eval = __SK_NONE; + u32 eval; int ret; more_data: @@ -308,6 +308,7 @@ static int tcp_bpf_send_verdict(struct sock *sk, struct sk_psock *psock, tosend = msg->sg.size; if (psock->apply_bytes && psock->apply_bytes < tosend) tosend = psock->apply_bytes; + eval = __SK_NONE; switch (psock->eval) { case __SK_PASS: -- Gitee From 2d6087c3801c77fd2d9faa94930771abe772cf48 Mon Sep 17 00:00:00 2001 From: Pengcheng Yang Date: Mon, 19 Jun 2023 14:02:34 +0800 Subject: [PATCH 16/39] bpf, sockmap: Fix data loss caused by using apply_bytes on ingress redirect ANBZ: #5530 commit 9072931f020bfd907d6d89ee21ff1481cd78b407 upstream. Use apply_bytes on ingress redirect, when apply_bytes is less than the length of msg data, some data may be skipped and lost in bpf_tcp_ingress(). If there is still data in the scatterlist that has not been consumed, we cannot move the msg iter. Fixes: 604326b41a6f ("bpf, sockmap: convert to generic sk_msg interface") Signed-off-by: Pengcheng Yang Signed-off-by: Daniel Borkmann Acked-by: Jakub Sitnicki Link: https://lore.kernel.org/bpf/1669718441-2654-4-git-send-email-yangpc@wangsu.com Signed-off-by: Xuezhi Zhao Acked-by: Tianchen Ding Reviewed-by: Yuanhe Shu Acked-by: Tony Lu Link: https://gitee.com/anolis/cloud-kernel/pulls/2035 --- net/ipv4/tcp_bpf.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c index da70b4f4eda0..116484c92d40 100644 --- a/net/ipv4/tcp_bpf.c +++ b/net/ipv4/tcp_bpf.c @@ -186,8 +186,11 @@ static int bpf_tcp_ingress(struct sock *sk, struct sk_psock *psock, tmp->sg.end = i; if (apply) { apply_bytes -= size; - if (!apply_bytes) + if (!apply_bytes) { + if (sge->length) + sk_msg_iter_var_prev(i); break; + } } } while (i != msg->sg.end); -- Gitee From 1f4ae0630f1380ec6930ffa69c36501c513d7783 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 19 Jun 2023 14:08:34 +0800 Subject: [PATCH 17/39] bpf, sockmap: fix race in sock_map_free() ANBZ: #5530 commit 0a182f8d607464911756b4dbef5d6cad8de22469 upstream. sock_map_free() calls release_sock(sk) without owning a reference on the socket. This can cause use-after-free as syzbot found [1] Jakub Sitnicki already took care of a similar issue in sock_hash_free() in commit 75e68e5bf2c7 ("bpf, sockhash: Synchronize delete from bucket list on map free") [1] refcount_t: decrement hit 0; leaking memory. WARNING: CPU: 0 PID: 3785 at lib/refcount.c:31 refcount_warn_saturate+0x17c/0x1a0 lib/refcount.c:31 Modules linked in: CPU: 0 PID: 3785 Comm: kworker/u4:6 Not tainted 6.1.0-rc7-syzkaller-00103-gef4d3ea40565 #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 10/26/2022 Workqueue: events_unbound bpf_map_free_deferred RIP: 0010:refcount_warn_saturate+0x17c/0x1a0 lib/refcount.c:31 Code: 68 8b 31 c0 e8 75 71 15 fd 0f 0b e9 64 ff ff ff e8 d9 6e 4e fd c6 05 62 9c 3d 0a 01 48 c7 c7 80 bb 68 8b 31 c0 e8 54 71 15 fd <0f> 0b e9 43 ff ff ff 89 d9 80 e1 07 80 c1 03 38 c1 0f 8c a2 fe ff RSP: 0018:ffffc9000456fb60 EFLAGS: 00010246 RAX: eae59bab72dcd700 RBX: 0000000000000004 RCX: ffff8880207057c0 RDX: 0000000000000000 RSI: 0000000000000201 RDI: 0000000000000000 RBP: 0000000000000004 R08: ffffffff816fdabd R09: fffff520008adee5 R10: fffff520008adee5 R11: 1ffff920008adee4 R12: 0000000000000004 R13: dffffc0000000000 R14: ffff88807b1c6c00 R15: 1ffff1100f638dcf FS: 0000000000000000(0000) GS:ffff8880b9800000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000001b30c30000 CR3: 000000000d08e000 CR4: 00000000003506f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: __refcount_dec include/linux/refcount.h:344 [inline] refcount_dec include/linux/refcount.h:359 [inline] __sock_put include/net/sock.h:779 [inline] tcp_release_cb+0x2d0/0x360 net/ipv4/tcp_output.c:1092 release_sock+0xaf/0x1c0 net/core/sock.c:3468 sock_map_free+0x219/0x2c0 net/core/sock_map.c:356 process_one_work+0x81c/0xd10 kernel/workqueue.c:2289 worker_thread+0xb14/0x1330 kernel/workqueue.c:2436 kthread+0x266/0x300 kernel/kthread.c:376 ret_from_fork+0x1f/0x30 arch/x86/entry/entry_64.S:306 Fixes: 7e81a3530206 ("bpf: Sockmap, ensure sock lock held during tear down") Signed-off-by: Eric Dumazet Reported-by: syzbot Cc: Jakub Sitnicki Cc: John Fastabend Cc: Alexei Starovoitov Cc: Daniel Borkmann Cc: Song Liu Acked-by: John Fastabend Link: https://lore.kernel.org/r/20221202111640.2745533-1-edumazet@google.com Signed-off-by: Alexei Starovoitov Signed-off-by: Xuezhi Zhao Acked-by: Tianchen Ding Reviewed-by: Yuanhe Shu Acked-by: Tony Lu Link: https://gitee.com/anolis/cloud-kernel/pulls/2035 --- net/core/sock_map.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/core/sock_map.c b/net/core/sock_map.c index 89eb94ce435d..79f39f8e22b5 100644 --- a/net/core/sock_map.c +++ b/net/core/sock_map.c @@ -249,11 +249,13 @@ static void sock_map_free(struct bpf_map *map) sk = xchg(psk, NULL); if (sk) { + sock_hold(sk); lock_sock(sk); rcu_read_lock(); sock_map_unref(sk, psk); rcu_read_unlock(); release_sock(sk); + sock_put(sk); } } -- Gitee From f8c66b9b2ee741d64a898ba69c64641b8c9177ce Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Mon, 19 Jun 2023 14:35:24 +0800 Subject: [PATCH 18/39] bpf: bpftool, add support for attaching programs to maps ANBZ: #5530 commit b7d3826c2ed6c3e626e7ae796c5df2c0d2551c6a upstream. Sock map/hash introduce support for attaching programs to maps. To date I have been doing this with custom tooling but this is less than ideal as we shift to using bpftool as the single CLI for our BPF uses. This patch adds new sub commands 'attach' and 'detach' to the 'prog' command to attach programs to maps and then detach them. Signed-off-by: John Fastabend Reviewed-by: Jakub Kicinski Signed-off-by: Alexei Starovoitov Signed-off-by: Xuezhi Zhao Acked-by: Tianchen Ding Reviewed-by: Yuanhe Shu Acked-by: Tony Lu Link: https://gitee.com/anolis/cloud-kernel/pulls/2035 --- .../bpftool/Documentation/bpftool-prog.rst | 11 +++ tools/bpf/bpftool/Documentation/bpftool.rst | 2 +- tools/bpf/bpftool/bash-completion/bpftool | 19 +++- tools/bpf/bpftool/prog.c | 99 ++++++++++++++++++- 4 files changed, 128 insertions(+), 3 deletions(-) diff --git a/tools/bpf/bpftool/Documentation/bpftool-prog.rst b/tools/bpf/bpftool/Documentation/bpftool-prog.rst index 64156a16d530..12c803003ab2 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-prog.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-prog.rst @@ -25,6 +25,8 @@ MAP COMMANDS | **bpftool** **prog dump jited** *PROG* [{**file** *FILE* | **opcodes**}] | **bpftool** **prog pin** *PROG* *FILE* | **bpftool** **prog load** *OBJ* *FILE* [**type** *TYPE*] [**map** {**idx** *IDX* | **name** *NAME*} *MAP*] [**dev** *NAME*] +| **bpftool** **prog attach** *PROG* *ATTACH_TYPE* *MAP* +| **bpftool** **prog detach** *PROG* *ATTACH_TYPE* *MAP* | **bpftool** **prog help** | | *MAP* := { **id** *MAP_ID* | **pinned** *FILE* } @@ -37,6 +39,7 @@ MAP COMMANDS | **cgroup/bind4** | **cgroup/bind6** | **cgroup/post_bind4** | **cgroup/post_bind6** | | **cgroup/connect4** | **cgroup/connect6** | **cgroup/sendmsg4** | **cgroup/sendmsg6** | } +| *ATTACH_TYPE* := { **msg_verdict** | **skb_verdict** | **skb_parse** } DESCRIPTION @@ -90,6 +93,14 @@ DESCRIPTION Note: *FILE* must be located in *bpffs* mount. + **bpftool prog attach** *PROG* *ATTACH_TYPE* *MAP* + Attach bpf program *PROG* (with type specified by *ATTACH_TYPE*) + to the map *MAP*. + + **bpftool prog detach** *PROG* *ATTACH_TYPE* *MAP* + Detach bpf program *PROG* (with type specified by *ATTACH_TYPE*) + from the map *MAP*. + **bpftool prog help** Print short help message. diff --git a/tools/bpf/bpftool/Documentation/bpftool.rst b/tools/bpf/bpftool/Documentation/bpftool.rst index 8dda77daeda9..25c08728ca80 100644 --- a/tools/bpf/bpftool/Documentation/bpftool.rst +++ b/tools/bpf/bpftool/Documentation/bpftool.rst @@ -26,7 +26,7 @@ SYNOPSIS | **pin** | **event_pipe** | **help** } *PROG-COMMANDS* := { **show** | **list** | **dump jited** | **dump xlated** | **pin** - | **load** | **help** } + | **load** | **attach** | **detach** | **help** } *CGROUP-COMMANDS* := { **show** | **list** | **attach** | **detach** | **help** } diff --git a/tools/bpf/bpftool/bash-completion/bpftool b/tools/bpf/bpftool/bash-completion/bpftool index ec577af2e5f0..8232d502979f 100644 --- a/tools/bpf/bpftool/bash-completion/bpftool +++ b/tools/bpf/bpftool/bash-completion/bpftool @@ -292,6 +292,23 @@ _bpftool() fi return 0 ;; + attach|detach) + if [[ ${#words[@]} == 7 ]]; then + COMPREPLY=( $( compgen -W "id pinned" -- "$cur" ) ) + return 0 + fi + + if [[ ${#words[@]} == 6 ]]; then + COMPREPLY=( $( compgen -W "msg_verdict skb_verdict skb_parse" -- "$cur" ) ) + return 0 + fi + + if [[ $prev == "$command" ]]; then + COMPREPLY=( $( compgen -W "id pinned" -- "$cur" ) ) + return 0 + fi + return 0 + ;; load) local obj @@ -347,7 +364,7 @@ _bpftool() ;; *) [[ $prev == $object ]] && \ - COMPREPLY=( $( compgen -W 'dump help pin load \ + COMPREPLY=( $( compgen -W 'dump help pin attach detach load \ show list' -- "$cur" ) ) ;; esac diff --git a/tools/bpf/bpftool/prog.c b/tools/bpf/bpftool/prog.c index 108295f97d09..c26628e854d4 100644 --- a/tools/bpf/bpftool/prog.c +++ b/tools/bpf/bpftool/prog.c @@ -77,6 +77,26 @@ static const char * const prog_type_name[] = { [BPF_PROG_TYPE_FLOW_DISSECTOR] = "flow_dissector", }; +static const char * const attach_type_strings[] = { + [BPF_SK_SKB_STREAM_PARSER] = "stream_parser", + [BPF_SK_SKB_STREAM_VERDICT] = "stream_verdict", + [BPF_SK_MSG_VERDICT] = "msg_verdict", + [__MAX_BPF_ATTACH_TYPE] = NULL, +}; + +enum bpf_attach_type parse_attach_type(const char *str) +{ + enum bpf_attach_type type; + + for (type = 0; type < __MAX_BPF_ATTACH_TYPE; type++) { + if (attach_type_strings[type] && + is_prefix(str, attach_type_strings[type])) + return type; + } + + return __MAX_BPF_ATTACH_TYPE; +} + static void print_boot_time(__u64 nsecs, char *buf, unsigned int size) { struct timespec real_time_ts, boot_time_ts; @@ -700,6 +720,77 @@ int map_replace_compar(const void *p1, const void *p2) return a->idx - b->idx; } +static int do_attach(int argc, char **argv) +{ + enum bpf_attach_type attach_type; + int err, mapfd, progfd; + + if (!REQ_ARGS(5)) { + p_err("too few parameters for map attach"); + return -EINVAL; + } + + progfd = prog_parse_fd(&argc, &argv); + if (progfd < 0) + return progfd; + + attach_type = parse_attach_type(*argv); + if (attach_type == __MAX_BPF_ATTACH_TYPE) { + p_err("invalid attach type"); + return -EINVAL; + } + NEXT_ARG(); + + mapfd = map_parse_fd(&argc, &argv); + if (mapfd < 0) + return mapfd; + + err = bpf_prog_attach(progfd, mapfd, attach_type, 0); + if (err) { + p_err("failed prog attach to map"); + return -EINVAL; + } + + if (json_output) + jsonw_null(json_wtr); + return 0; +} + +static int do_detach(int argc, char **argv) +{ + enum bpf_attach_type attach_type; + int err, mapfd, progfd; + + if (!REQ_ARGS(5)) { + p_err("too few parameters for map detach"); + return -EINVAL; + } + + progfd = prog_parse_fd(&argc, &argv); + if (progfd < 0) + return progfd; + + attach_type = parse_attach_type(*argv); + if (attach_type == __MAX_BPF_ATTACH_TYPE) { + p_err("invalid attach type"); + return -EINVAL; + } + NEXT_ARG(); + + mapfd = map_parse_fd(&argc, &argv); + if (mapfd < 0) + return mapfd; + + err = bpf_prog_detach2(progfd, mapfd, attach_type); + if (err) { + p_err("failed prog detach from map"); + return -EINVAL; + } + + if (json_output) + jsonw_null(json_wtr); + return 0; +} static int do_load(int argc, char **argv) { enum bpf_attach_type expected_attach_type; @@ -949,6 +1040,8 @@ static int do_help(int argc, char **argv) " %s %s pin PROG FILE\n" " %s %s load OBJ FILE [type TYPE] [dev NAME] \\\n" " [map { idx IDX | name NAME } MAP]\n" + " %s %s attach PROG ATTACH_TYPE MAP\n" + " %s %s detach PROG ATTACH_TYPE MAP\n" " %s %s help\n" "\n" " " HELP_SPEC_MAP "\n" @@ -960,10 +1053,12 @@ static int do_help(int argc, char **argv) " cgroup/bind4 | cgroup/bind6 | cgroup/post_bind4 |\n" " cgroup/post_bind6 | cgroup/connect4 | cgroup/connect6 |\n" " cgroup/sendmsg4 | cgroup/sendmsg6 }\n" + " ATTACH_TYPE := { msg_verdict | skb_verdict | skb_parse }\n" " " HELP_SPEC_OPTIONS "\n" "", bin_name, argv[-2], bin_name, argv[-2], bin_name, argv[-2], - bin_name, argv[-2], bin_name, argv[-2], bin_name, argv[-2]); + bin_name, argv[-2], bin_name, argv[-2], bin_name, argv[-2], + bin_name, argv[-2], bin_name, argv[-2]); return 0; } @@ -975,6 +1070,8 @@ static const struct cmd cmds[] = { { "dump", do_dump }, { "pin", do_pin }, { "load", do_load }, + { "attach", do_attach }, + { "detach", do_detach }, { 0 } }; -- Gitee From 91f19dbc46690fde741a3ef48b7188cafb5d32ad Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Mon, 19 Jun 2023 14:38:33 +0800 Subject: [PATCH 19/39] bpf: bpftool, add flag to allow non-compat map definitions ANBZ: #5530 commit c034a177d3c898f370f52877e7252da8c4f8235c upstream. Multiple map definition structures exist and user may have non-zero fields in their definition that are not recognized by bpftool and libbpf. The normal behavior is to then fail loading the map. Although this is a good default behavior users may still want to load the map for debugging or other reasons. This patch adds a --mapcompat flag that can be used to override the default behavior and allow loading the map even when it has additional non-zero fields. For now the only user is 'bpftool prog' we can switch over other subcommands as needed. The library exposes an API that consumes a flags field now but I kept the original API around also in case users of the API don't want to expose this. The flags field is an int in case we need more control over how the API call handles errors/features/etc in the future. Signed-off-by: John Fastabend Acked-by: Jakub Kicinski Signed-off-by: Alexei Starovoitov Signed-off-by: Xuezhi Zhao Acked-by: Tianchen Ding Reviewed-by: Yuanhe Shu Acked-by: Tony Lu Link: https://gitee.com/anolis/cloud-kernel/pulls/2035 --- tools/bpf/bpftool/Documentation/bpftool.rst | 4 +++ tools/bpf/bpftool/bash-completion/bpftool | 2 +- tools/bpf/bpftool/main.c | 7 +++++- tools/bpf/bpftool/main.h | 3 ++- tools/bpf/bpftool/prog.c | 2 +- tools/lib/bpf/bpf.h | 3 +++ tools/lib/bpf/libbpf.c | 27 ++++++++++++++------- tools/lib/bpf/libbpf.h | 2 ++ 8 files changed, 37 insertions(+), 13 deletions(-) diff --git a/tools/bpf/bpftool/Documentation/bpftool.rst b/tools/bpf/bpftool/Documentation/bpftool.rst index 25c08728ca80..65488317fefa 100644 --- a/tools/bpf/bpftool/Documentation/bpftool.rst +++ b/tools/bpf/bpftool/Documentation/bpftool.rst @@ -57,6 +57,10 @@ OPTIONS -p, --pretty Generate human-readable JSON output. Implies **-j**. + -m, --mapcompat + Allow loading maps with unknown map definitions. + + SEE ALSO ======== **bpftool-map**\ (8), **bpftool-prog**\ (8), **bpftool-cgroup**\ (8) diff --git a/tools/bpf/bpftool/bash-completion/bpftool b/tools/bpf/bpftool/bash-completion/bpftool index 8232d502979f..70ab5485d7d2 100644 --- a/tools/bpf/bpftool/bash-completion/bpftool +++ b/tools/bpf/bpftool/bash-completion/bpftool @@ -184,7 +184,7 @@ _bpftool() # Deal with options if [[ ${words[cword]} == -* ]]; then - local c='--version --json --pretty --bpffs' + local c='--version --json --pretty --bpffs --mapcompat' COMPREPLY=( $( compgen -W "$c" -- "$cur" ) ) return 0 fi diff --git a/tools/bpf/bpftool/main.c b/tools/bpf/bpftool/main.c index 79dc3f193547..828dde30e9ec 100644 --- a/tools/bpf/bpftool/main.c +++ b/tools/bpf/bpftool/main.c @@ -55,6 +55,7 @@ json_writer_t *json_wtr; bool pretty_output; bool json_output; bool show_pinned; +int bpf_flags; struct pinned_obj_table prog_table; struct pinned_obj_table map_table; @@ -341,6 +342,7 @@ int main(int argc, char **argv) { "pretty", no_argument, NULL, 'p' }, { "version", no_argument, NULL, 'V' }, { "bpffs", no_argument, NULL, 'f' }, + { "mapcompat", no_argument, NULL, 'm' }, { 0 } }; int opt, ret; @@ -355,7 +357,7 @@ int main(int argc, char **argv) hash_init(map_table.table); opterr = 0; - while ((opt = getopt_long(argc, argv, "Vhpjf", + while ((opt = getopt_long(argc, argv, "Vhpjfm", options, NULL)) >= 0) { switch (opt) { case 'V': @@ -379,6 +381,9 @@ int main(int argc, char **argv) case 'f': show_pinned = true; break; + case 'm': + bpf_flags = MAPS_RELAX_COMPAT; + break; default: p_err("unrecognized option '%s'", argv[optind - 1]); if (json_output) diff --git a/tools/bpf/bpftool/main.h b/tools/bpf/bpftool/main.h index 38413a16ab92..13719dc994a0 100644 --- a/tools/bpf/bpftool/main.h +++ b/tools/bpf/bpftool/main.h @@ -74,7 +74,7 @@ #define HELP_SPEC_PROGRAM \ "PROG := { id PROG_ID | pinned FILE | tag PROG_TAG }" #define HELP_SPEC_OPTIONS \ - "OPTIONS := { {-j|--json} [{-p|--pretty}] | {-f|--bpffs} }" + "OPTIONS := { {-j|--json} [{-p|--pretty}] | {-f|--bpffs} | {-m|--mapcompat}" #define HELP_SPEC_MAP \ "MAP := { id MAP_ID | pinned FILE }" @@ -89,6 +89,7 @@ extern const char *bin_name; extern json_writer_t *json_wtr; extern bool json_output; extern bool show_pinned; +extern int bpf_flags; extern struct pinned_obj_table prog_table; extern struct pinned_obj_table map_table; diff --git a/tools/bpf/bpftool/prog.c b/tools/bpf/bpftool/prog.c index c26628e854d4..70f35e9296b2 100644 --- a/tools/bpf/bpftool/prog.c +++ b/tools/bpf/bpftool/prog.c @@ -915,7 +915,7 @@ static int do_load(int argc, char **argv) } } - obj = bpf_object__open_xattr(&attr); + obj = __bpf_object__open_xattr(&attr, bpf_flags); if (IS_ERR_OR_NULL(obj)) { p_err("failed to open object file"); goto err_free_reuse_maps; diff --git a/tools/lib/bpf/bpf.h b/tools/lib/bpf/bpf.h index f57a2a09761d..20380ecac9e8 100644 --- a/tools/lib/bpf/bpf.h +++ b/tools/lib/bpf/bpf.h @@ -70,6 +70,9 @@ struct bpf_load_program_attr { __u32 prog_ifindex; }; +/* Flags to direct loading requirements */ +#define MAPS_RELAX_COMPAT 0x01 + /* Recommend log buffer size */ #define BPF_LOG_BUF_SIZE (256 * 1024) int bpf_load_program_xattr(const struct bpf_load_program_attr *load_attr, diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 22f2702a4144..85c66abad69a 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -564,8 +564,9 @@ static int compare_bpf_map(const void *_a, const void *_b) } static int -bpf_object__init_maps(struct bpf_object *obj) +bpf_object__init_maps(struct bpf_object *obj, int flags) { + bool strict = !(flags & MAPS_RELAX_COMPAT); int i, map_idx, map_def_sz, nr_maps = 0; Elf_Scn *scn; Elf_Data *data; @@ -687,7 +688,8 @@ bpf_object__init_maps(struct bpf_object *obj) "has unrecognized, non-zero " "options\n", obj->path, map_name); - return -EINVAL; + if (strict) + return -EINVAL; } } memcpy(&obj->maps[map_idx].def, def, @@ -718,7 +720,7 @@ static bool section_have_execinstr(struct bpf_object *obj, int idx) return false; } -static int bpf_object__elf_collect(struct bpf_object *obj) +static int bpf_object__elf_collect(struct bpf_object *obj, int flags) { Elf *elf = obj->efile.elf; GElf_Ehdr *ep = &obj->efile.ehdr; @@ -845,7 +847,7 @@ static int bpf_object__elf_collect(struct bpf_object *obj) return LIBBPF_ERRNO__FORMAT; } if (obj->efile.maps_shndx >= 0) { - err = bpf_object__init_maps(obj); + err = bpf_object__init_maps(obj, flags); if (err) goto out; } @@ -1517,7 +1519,7 @@ static int bpf_object__validate(struct bpf_object *obj, bool needs_kver) static struct bpf_object * __bpf_object__open(const char *path, void *obj_buf, size_t obj_buf_sz, - bool needs_kver) + bool needs_kver, int flags) { struct bpf_object *obj; int err; @@ -1533,7 +1535,7 @@ __bpf_object__open(const char *path, void *obj_buf, size_t obj_buf_sz, CHECK_ERR(bpf_object__elf_init(obj), err, out); CHECK_ERR(bpf_object__check_endianness(obj), err, out); - CHECK_ERR(bpf_object__elf_collect(obj), err, out); + CHECK_ERR(bpf_object__elf_collect(obj, flags), err, out); CHECK_ERR(bpf_object__collect_reloc(obj), err, out); CHECK_ERR(bpf_object__validate(obj, needs_kver), err, out); @@ -1544,7 +1546,8 @@ __bpf_object__open(const char *path, void *obj_buf, size_t obj_buf_sz, return ERR_PTR(err); } -struct bpf_object *bpf_object__open_xattr(struct bpf_object_open_attr *attr) +struct bpf_object *__bpf_object__open_xattr(struct bpf_object_open_attr *attr, + int flags) { /* param validation */ if (!attr->file) @@ -1553,7 +1556,13 @@ struct bpf_object *bpf_object__open_xattr(struct bpf_object_open_attr *attr) pr_debug("loading %s\n", attr->file); return __bpf_object__open(attr->file, NULL, 0, - bpf_prog_type__needs_kver(attr->prog_type)); + bpf_prog_type__needs_kver(attr->prog_type), + flags); +} + +struct bpf_object *bpf_object__open_xattr(struct bpf_object_open_attr *attr) +{ + return __bpf_object__open_xattr(attr, 0); } struct bpf_object *bpf_object__open(const char *path) @@ -1586,7 +1595,7 @@ struct bpf_object *bpf_object__open_buffer(void *obj_buf, pr_debug("loading object '%s' from buffer\n", name); - return __bpf_object__open(name, obj_buf, obj_buf_sz, true); + return __bpf_object__open(name, obj_buf, obj_buf_sz, true, true); } int bpf_object__unload(struct bpf_object *obj) diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h index 8af8d3663991..7e9c801a9fdd 100644 --- a/tools/lib/bpf/libbpf.h +++ b/tools/lib/bpf/libbpf.h @@ -61,6 +61,8 @@ struct bpf_object_open_attr { struct bpf_object *bpf_object__open(const char *path); struct bpf_object *bpf_object__open_xattr(struct bpf_object_open_attr *attr); +struct bpf_object *__bpf_object__open_xattr(struct bpf_object_open_attr *attr, + int flags); struct bpf_object *bpf_object__open_buffer(void *obj_buf, size_t obj_buf_sz, const char *name); -- Gitee From 7528948e7f156bf67847d27e4824f9d53846a950 Mon Sep 17 00:00:00 2001 From: Quentin Monnet Date: Mon, 19 Jun 2023 15:20:30 +0800 Subject: [PATCH 20/39] tools: bpftool: fix bash completion for bpftool prog (attach|detach) ANBZ: #5530 commit cad4977344b35ea116ec5fefe91a76b1dfa113f5 upstream. Fix bash completion for "bpftool prog (attach|detach) PROG TYPE MAP" so that the list of indices proposed for MAP are map indices, and not PROG indices. Also use variables for map and prog reference types ("id", "pinned", and "tag" for programs). Fixes: b7d3826c2ed6 ("bpf: bpftool, add support for attaching programs to maps") Signed-off-by: Quentin Monnet Reviewed-by: Jakub Kicinski Signed-off-by: Alexei Starovoitov Signed-off-by: Xuezhi Zhao Acked-by: Tianchen Ding Reviewed-by: Yuanhe Shu Acked-by: Tony Lu Link: https://gitee.com/anolis/cloud-kernel/pulls/2035 --- tools/bpf/bpftool/bash-completion/bpftool | 72 +++++++++++++++-------- 1 file changed, 49 insertions(+), 23 deletions(-) diff --git a/tools/bpf/bpftool/bash-completion/bpftool b/tools/bpf/bpftool/bash-completion/bpftool index 70ab5485d7d2..953fc8385547 100644 --- a/tools/bpf/bpftool/bash-completion/bpftool +++ b/tools/bpf/bpftool/bash-completion/bpftool @@ -243,16 +243,20 @@ _bpftool() # Completion depends on object and command in use case $object in prog) - if [[ $command != "load" ]]; then - case $prev in - id) - _bpftool_get_prog_ids - return 0 - ;; - esac - fi + # Complete id, only for subcommands that use prog (but no map) ids + case $command in + show|list|dump|pin) + case $prev in + id) + _bpftool_get_prog_ids + return 0 + ;; + esac + ;; + esac local PROG_TYPE='id pinned tag' + local MAP_TYPE='id pinned' case $command in show|list) [[ $prev != "$command" ]] && return 0 @@ -293,21 +297,43 @@ _bpftool() return 0 ;; attach|detach) - if [[ ${#words[@]} == 7 ]]; then - COMPREPLY=( $( compgen -W "id pinned" -- "$cur" ) ) - return 0 - fi - - if [[ ${#words[@]} == 6 ]]; then - COMPREPLY=( $( compgen -W "msg_verdict skb_verdict skb_parse" -- "$cur" ) ) - return 0 - fi - - if [[ $prev == "$command" ]]; then - COMPREPLY=( $( compgen -W "id pinned" -- "$cur" ) ) - return 0 - fi - return 0 + case $cword in + 3) + COMPREPLY=( $( compgen -W "$PROG_TYPE" -- "$cur" ) ) + return 0 + ;; + 4) + case $prev in + id) + _bpftool_get_prog_ids + ;; + pinned) + _filedir + ;; + esac + return 0 + ;; + 5) + COMPREPLY=( $( compgen -W 'msg_verdict skb_verdict \ + skb_parse flow_dissector' -- "$cur" ) ) + return 0 + ;; + 6) + COMPREPLY=( $( compgen -W "$MAP_TYPE" -- "$cur" ) ) + return 0 + ;; + 7) + case $prev in + id) + _bpftool_get_map_ids + ;; + pinned) + _filedir + ;; + esac + return 0 + ;; + esac ;; load) local obj -- Gitee From 81d5f87cb3e865878325706f20b5a82e2b3be459 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 19 Jun 2023 15:27:24 +0800 Subject: [PATCH 21/39] tools: bpftool: add map create command ANBZ: #5530 commit 0b592b5a01bef5416472ec610d3191e019c144a5 upstream. Add a way of creating maps from user space. The command takes as parameters most of the attributes of the map creation system call command. After map is created its pinned to bpffs. This makes it possible to easily and dynamically (without rebuilding programs) test various corner cases related to map creation. Map type names are taken from bpftool's array used for printing. In general these days we try to make use of libbpf type names, but there are no map type names in libbpf as of today. As with most features I add the motivation is testing (offloads) :) Signed-off-by: Jakub Kicinski Reviewed-by: Quentin Monnet Signed-off-by: Alexei Starovoitov Signed-off-by: Xuezhi Zhao Acked-by: Tianchen Ding Reviewed-by: Yuanhe Shu Acked-by: Tony Lu Link: https://gitee.com/anolis/cloud-kernel/pulls/2035 --- .../bpf/bpftool/Documentation/bpftool-map.rst | 15 ++- tools/bpf/bpftool/Documentation/bpftool.rst | 4 +- tools/bpf/bpftool/bash-completion/bpftool | 38 +++++- tools/bpf/bpftool/common.c | 21 ++++ tools/bpf/bpftool/main.h | 1 + tools/bpf/bpftool/map.c | 110 +++++++++++++++++- 6 files changed, 183 insertions(+), 6 deletions(-) diff --git a/tools/bpf/bpftool/Documentation/bpftool-map.rst b/tools/bpf/bpftool/Documentation/bpftool-map.rst index a6258bc8ec4f..3497f2d80328 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-map.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-map.rst @@ -15,13 +15,15 @@ SYNOPSIS *OPTIONS* := { { **-j** | **--json** } [{ **-p** | **--pretty** }] | { **-f** | **--bpffs** } } *COMMANDS* := - { **show** | **list** | **dump** | **update** | **lookup** | **getnext** | **delete** - | **pin** | **help** } + { **show** | **list** | **create** | **dump** | **update** | **lookup** | **getnext** + | **delete** | **pin** | **help** } MAP COMMANDS ============= | **bpftool** **map { show | list }** [*MAP*] +| **bpftool** **map create** *FILE* **type** *TYPE* **key** *KEY_SIZE* **value** *VALUE_SIZE* \ +| **entries** *MAX_ENTRIES* **name** *NAME* [**flags** *FLAGS*] [**dev** *NAME*] | **bpftool** **map dump** *MAP* | **bpftool** **map update** *MAP* **key** *DATA* **value** *VALUE* [*UPDATE_FLAGS*] | **bpftool** **map lookup** *MAP* **key** *DATA* @@ -36,6 +38,11 @@ MAP COMMANDS | *PROG* := { **id** *PROG_ID* | **pinned** *FILE* | **tag** *PROG_TAG* } | *VALUE* := { *DATA* | *MAP* | *PROG* } | *UPDATE_FLAGS* := { **any** | **exist** | **noexist** } +| *TYPE* := { **hash** | **array** | **prog_array** | **perf_event_array** | **percpu_hash** +| | **percpu_array** | **stack_trace** | **cgroup_array** | **lru_hash** +| | **lru_percpu_hash** | **lpm_trie** | **array_of_maps** | **hash_of_maps** +| | **devmap** | **sockmap** | **cpumap** | **xskmap** | **sockhash** +| | **cgroup_storage** | **reuseport_sockarray** | **percpu_cgroup_storage** } DESCRIPTION =========== @@ -47,6 +54,10 @@ DESCRIPTION Output will start with map ID followed by map type and zero or more named attributes (depending on kernel version). + **bpftool map create** *FILE* **type** *TYPE* **key** *KEY_SIZE* **value** *VALUE_SIZE* **entries** *MAX_ENTRIES* **name** *NAME* [**flags** *FLAGS*] [**dev** *NAME*] + Create a new map with given parameters and pin it to *bpffs* + as *FILE*. + **bpftool map dump** *MAP* Dump all entries in a given *MAP*. diff --git a/tools/bpf/bpftool/Documentation/bpftool.rst b/tools/bpf/bpftool/Documentation/bpftool.rst index 65488317fefa..04cd4f92ab89 100644 --- a/tools/bpf/bpftool/Documentation/bpftool.rst +++ b/tools/bpf/bpftool/Documentation/bpftool.rst @@ -22,8 +22,8 @@ SYNOPSIS | { **-j** | **--json** } [{ **-p** | **--pretty** }] } *MAP-COMMANDS* := - { **show** | **list** | **dump** | **update** | **lookup** | **getnext** | **delete** - | **pin** | **event_pipe** | **help** } + { **show** | **list** | **create** | **dump** | **update** | **lookup** | **getnext** + | **delete** | **pin** | **event_pipe** | **help** } *PROG-COMMANDS* := { **show** | **list** | **dump jited** | **dump xlated** | **pin** | **load** | **attach** | **detach** | **help** } diff --git a/tools/bpf/bpftool/bash-completion/bpftool b/tools/bpf/bpftool/bash-completion/bpftool index 953fc8385547..9273c4afbc82 100644 --- a/tools/bpf/bpftool/bash-completion/bpftool +++ b/tools/bpf/bpftool/bash-completion/bpftool @@ -413,6 +413,42 @@ _bpftool() ;; esac ;; + create) + case $prev in + $command) + _filedir + return 0 + ;; + type) + COMPREPLY=( $( compgen -W 'hash array prog_array \ + perf_event_array percpu_hash percpu_array \ + stack_trace cgroup_array lru_hash \ + lru_percpu_hash lpm_trie array_of_maps \ + hash_of_maps devmap sockmap cpumap xskmap \ + sockhash cgroup_storage reuseport_sockarray \ + percpu_cgroup_storage' -- \ + "$cur" ) ) + return 0 + ;; + key|value|flags|name|entries) + return 0 + ;; + dev) + _sysfs_get_netdevs + return 0 + ;; + *) + _bpftool_once_attr 'type' + _bpftool_once_attr 'key' + _bpftool_once_attr 'value' + _bpftool_once_attr 'entries' + _bpftool_once_attr 'name' + _bpftool_once_attr 'flags' + _bpftool_once_attr 'dev' + return 0 + ;; + esac + ;; lookup|getnext|delete) case $prev in $command) @@ -526,7 +562,7 @@ _bpftool() *) [[ $prev == $object ]] && \ COMPREPLY=( $( compgen -W 'delete dump getnext help \ - lookup pin event_pipe show list update' -- \ + lookup pin event_pipe show list update create' -- \ "$cur" ) ) ;; esac diff --git a/tools/bpf/bpftool/common.c b/tools/bpf/bpftool/common.c index 158469f57461..aeb4b7035151 100644 --- a/tools/bpf/bpftool/common.c +++ b/tools/bpf/bpftool/common.c @@ -619,3 +619,24 @@ void print_dev_json(__u32 ifindex, __u64 ns_dev, __u64 ns_inode) jsonw_string_field(json_wtr, "ifname", name); jsonw_end_object(json_wtr); } + +int parse_u32_arg(int *argc, char ***argv, __u32 *val, const char *what) +{ + char *endptr; + + NEXT_ARGP(); + + if (*val) { + p_err("%s already specified", what); + return -1; + } + + *val = strtoul(**argv, &endptr, 0); + if (*endptr) { + p_err("can't parse %s as %s", **argv, what); + return -1; + } + NEXT_ARGP(); + + return 0; +} diff --git a/tools/bpf/bpftool/main.h b/tools/bpf/bpftool/main.h index 13719dc994a0..49266f93b856 100644 --- a/tools/bpf/bpftool/main.h +++ b/tools/bpf/bpftool/main.h @@ -139,6 +139,7 @@ int do_cgroup(int argc, char **arg); int do_perf(int argc, char **arg); int do_net(int argc, char **arg); +int parse_u32_arg(int *argc, char ***argv, __u32 *val, const char *what); int prog_parse_fd(int *argc, char ***argv); int map_parse_fd(int *argc, char ***argv); int map_parse_fd_and_info(int *argc, char ***argv, void *info, __u32 *info_len); diff --git a/tools/bpf/bpftool/map.c b/tools/bpf/bpftool/map.c index 37ca559e49cc..4d8824304d0b 100644 --- a/tools/bpf/bpftool/map.c +++ b/tools/bpf/bpftool/map.c @@ -36,6 +36,7 @@ #include #include #include +#include #include #include #include @@ -94,6 +95,17 @@ static bool map_is_map_of_progs(__u32 type) return type == BPF_MAP_TYPE_PROG_ARRAY; } +static int map_type_from_str(const char *type) +{ + unsigned int i; + + for (i = 0; i < ARRAY_SIZE(map_type_name); i++) + /* Don't allow prefixing in case of possible future shadowing */ + if (map_type_name[i] && !strcmp(map_type_name[i], type)) + return i; + return -1; +} + static void *alloc_value(struct bpf_map_info *info) { if (map_is_per_cpu(info->type)) @@ -1045,6 +1057,92 @@ static int do_pin(int argc, char **argv) return err; } +static int do_create(int argc, char **argv) +{ + struct bpf_create_map_attr attr = { NULL, }; + const char *pinfile; + int err, fd; + + if (!REQ_ARGS(7)) + return -1; + pinfile = GET_ARG(); + + while (argc) { + if (!REQ_ARGS(2)) + return -1; + + if (is_prefix(*argv, "type")) { + NEXT_ARG(); + + if (attr.map_type) { + p_err("map type already specified"); + return -1; + } + + attr.map_type = map_type_from_str(*argv); + if ((int)attr.map_type < 0) { + p_err("unrecognized map type: %s", *argv); + return -1; + } + NEXT_ARG(); + } else if (is_prefix(*argv, "name")) { + NEXT_ARG(); + attr.name = GET_ARG(); + } else if (is_prefix(*argv, "key")) { + if (parse_u32_arg(&argc, &argv, &attr.key_size, + "key size")) + return -1; + } else if (is_prefix(*argv, "value")) { + if (parse_u32_arg(&argc, &argv, &attr.value_size, + "value size")) + return -1; + } else if (is_prefix(*argv, "entries")) { + if (parse_u32_arg(&argc, &argv, &attr.max_entries, + "max entries")) + return -1; + } else if (is_prefix(*argv, "flags")) { + if (parse_u32_arg(&argc, &argv, &attr.map_flags, + "flags")) + return -1; + } else if (is_prefix(*argv, "dev")) { + NEXT_ARG(); + + if (attr.map_ifindex) { + p_err("offload device already specified"); + return -1; + } + + attr.map_ifindex = if_nametoindex(*argv); + if (!attr.map_ifindex) { + p_err("unrecognized netdevice '%s': %s", + *argv, strerror(errno)); + return -1; + } + NEXT_ARG(); + } + } + + if (!attr.name) { + p_err("map name not specified"); + return -1; + } + + fd = bpf_create_map_xattr(&attr); + if (fd < 0) { + p_err("map create failed: %s", strerror(errno)); + return -1; + } + + err = do_pin_fd(fd, pinfile); + close(fd); + if (err) + return err; + + if (json_output) + jsonw_null(json_wtr); + return 0; +} + static int do_help(int argc, char **argv) { if (json_output) { @@ -1054,6 +1152,9 @@ static int do_help(int argc, char **argv) fprintf(stderr, "Usage: %s %s { show | list } [MAP]\n" + " %s %s create FILE type TYPE key KEY_SIZE value VALUE_SIZE \\\n" + " entries MAX_ENTRIES name NAME [flags FLAGS] \\\n" + " [dev NAME]\n" " %s %s dump MAP\n" " %s %s update MAP key DATA value VALUE [UPDATE_FLAGS]\n" " %s %s lookup MAP key DATA\n" @@ -1068,11 +1169,17 @@ static int do_help(int argc, char **argv) " " HELP_SPEC_PROGRAM "\n" " VALUE := { DATA | MAP | PROG }\n" " UPDATE_FLAGS := { any | exist | noexist }\n" + " TYPE := { hash | array | prog_array | perf_event_array | percpu_hash |\n" + " percpu_array | stack_trace | cgroup_array | lru_hash |\n" + " lru_percpu_hash | lpm_trie | array_of_maps | hash_of_maps |\n" + " devmap | sockmap | cpumap | xskmap | sockhash |\n" + " cgroup_storage | reuseport_sockarray | percpu_cgroup_storage }\n" " " HELP_SPEC_OPTIONS "\n" "", bin_name, argv[-2], bin_name, argv[-2], bin_name, argv[-2], bin_name, argv[-2], bin_name, argv[-2], bin_name, argv[-2], - bin_name, argv[-2], bin_name, argv[-2], bin_name, argv[-2]); + bin_name, argv[-2], bin_name, argv[-2], bin_name, argv[-2], + bin_name, argv[-2]); return 0; } @@ -1088,6 +1195,7 @@ static const struct cmd cmds[] = { { "delete", do_delete }, { "pin", do_pin }, { "event_pipe", do_event_pipe }, + { "create", do_create }, { 0 } }; -- Gitee From 091d332217f4aad0241dc9e7b510ca32a5e5272c Mon Sep 17 00:00:00 2001 From: Alban Crequy Date: Mon, 19 Jun 2023 15:31:35 +0800 Subject: [PATCH 22/39] tools: bpftool: fix infinite loop in map create ANBZ: #5530 commit 8694d8c1f82cccec9380e0d3720b84eee315dfb7 upstream. "bpftool map create" has an infinite loop on "while (argc)". The error case is missing. Symptoms: when forgetting to type the keyword 'type' in front of 'hash': $ sudo bpftool map create /sys/fs/bpf/dir/foobar hash key 8 value 8 entries 128 (infinite loop, taking all the CPU) ^C After the patch: $ sudo bpftool map create /sys/fs/bpf/dir/foobar hash key 8 value 8 entries 128 Error: unknown arg hash Fixes: 0b592b5a01be ("tools: bpftool: add map create command") Signed-off-by: Alban Crequy Reviewed-by: Quentin Monnet Acked-by: Song Liu Reviewed-by: Jakub Kicinski Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov Signed-off-by: Xuezhi Zhao Acked-by: Tianchen Ding Reviewed-by: Yuanhe Shu Acked-by: Tony Lu Link: https://gitee.com/anolis/cloud-kernel/pulls/2035 --- tools/bpf/bpftool/map.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tools/bpf/bpftool/map.c b/tools/bpf/bpftool/map.c index 4d8824304d0b..c5e2c5ef6acb 100644 --- a/tools/bpf/bpftool/map.c +++ b/tools/bpf/bpftool/map.c @@ -1119,6 +1119,9 @@ static int do_create(int argc, char **argv) return -1; } NEXT_ARG(); + } else { + p_err("unknown arg %s", *argv); + return -1; } } -- Gitee From bb9b02b25ffccb943a89028ffea88405b8ca2a57 Mon Sep 17 00:00:00 2001 From: Andrey Ignatov Date: Tue, 20 Jun 2023 09:40:16 +0800 Subject: [PATCH 23/39] libbpf: Per-symbol visibility for DSO ANBZ: #5530 commit ab9e084821221b2eda57a512535fe35b49e672d8 upstream. Make global symbols in libbpf DSO hidden by default with -fvisibility=hidden and export symbols that are part of ABI explicitly with __attribute__((visibility("default"))). This is common practice that should prevent from accidentally exporting a symbol, that is not supposed to be a part of ABI what, in turn, improves both libbpf developer- and user-experiences. See [1] for more details. Export control becomes more important since more and more projects use libbpf. The patch doesn't export a bunch of netlink related functions since as agreed in [2] they'll be reworked. That doesn't break bpftool since bpftool links libbpf statically. [1] https://www.akkadia.org/drepper/dsohowto.pdf (2.2 Export Control) [2] https://www.mail-archive.com/netdev@vger.kernel.org/msg251434.html Signed-off-by: Andrey Ignatov Signed-off-by: Alexei Starovoitov Signed-off-by: Xuezhi Zhao Acked-by: Tianchen Ding Reviewed-by: Yuanhe Shu Acked-by: Tony Lu Link: https://gitee.com/anolis/cloud-kernel/pulls/2035 --- tools/lib/bpf/Makefile | 1 + tools/lib/bpf/bpf.h | 118 ++++++++++++++------------ tools/lib/bpf/btf.h | 22 +++-- tools/lib/bpf/libbpf.h | 186 ++++++++++++++++++++++------------------- 4 files changed, 179 insertions(+), 148 deletions(-) diff --git a/tools/lib/bpf/Makefile b/tools/lib/bpf/Makefile index b5ff55650724..453df2623751 100644 --- a/tools/lib/bpf/Makefile +++ b/tools/lib/bpf/Makefile @@ -125,6 +125,7 @@ override CFLAGS += $(EXTRA_WARNINGS) override CFLAGS += -Werror -Wall override CFLAGS += -fPIC override CFLAGS += $(INCLUDES) +override CFLAGS += -fvisibility=hidden ifeq ($(VERBOSE),1) Q = diff --git a/tools/lib/bpf/bpf.h b/tools/lib/bpf/bpf.h index 20380ecac9e8..b050c023bfbe 100644 --- a/tools/lib/bpf/bpf.h +++ b/tools/lib/bpf/bpf.h @@ -28,6 +28,10 @@ #include #include +#ifndef LIBBPF_API +#define LIBBPF_API __attribute__((visibility("default"))) +#endif + struct bpf_create_map_attr { const char *name; enum bpf_map_type map_type; @@ -43,21 +47,24 @@ struct bpf_create_map_attr { __u32 inner_map_fd; }; -int bpf_create_map_xattr(const struct bpf_create_map_attr *create_attr); -int bpf_create_map_node(enum bpf_map_type map_type, const char *name, - int key_size, int value_size, int max_entries, - __u32 map_flags, int node); -int bpf_create_map_name(enum bpf_map_type map_type, const char *name, - int key_size, int value_size, int max_entries, - __u32 map_flags); -int bpf_create_map(enum bpf_map_type map_type, int key_size, int value_size, - int max_entries, __u32 map_flags); -int bpf_create_map_in_map_node(enum bpf_map_type map_type, const char *name, - int key_size, int inner_map_fd, int max_entries, - __u32 map_flags, int node); -int bpf_create_map_in_map(enum bpf_map_type map_type, const char *name, - int key_size, int inner_map_fd, int max_entries, - __u32 map_flags); +LIBBPF_API int +bpf_create_map_xattr(const struct bpf_create_map_attr *create_attr); +LIBBPF_API int bpf_create_map_node(enum bpf_map_type map_type, const char *name, + int key_size, int value_size, + int max_entries, __u32 map_flags, int node); +LIBBPF_API int bpf_create_map_name(enum bpf_map_type map_type, const char *name, + int key_size, int value_size, + int max_entries, __u32 map_flags); +LIBBPF_API int bpf_create_map(enum bpf_map_type map_type, int key_size, + int value_size, int max_entries, __u32 map_flags); +LIBBPF_API int bpf_create_map_in_map_node(enum bpf_map_type map_type, + const char *name, int key_size, + int inner_map_fd, int max_entries, + __u32 map_flags, int node); +LIBBPF_API int bpf_create_map_in_map(enum bpf_map_type map_type, + const char *name, int key_size, + int inner_map_fd, int max_entries, + __u32 map_flags); struct bpf_load_program_attr { enum bpf_prog_type prog_type; @@ -75,44 +82,49 @@ struct bpf_load_program_attr { /* Recommend log buffer size */ #define BPF_LOG_BUF_SIZE (256 * 1024) -int bpf_load_program_xattr(const struct bpf_load_program_attr *load_attr, - char *log_buf, size_t log_buf_sz); -int bpf_load_program(enum bpf_prog_type type, const struct bpf_insn *insns, - size_t insns_cnt, const char *license, - __u32 kern_version, char *log_buf, - size_t log_buf_sz); -int bpf_verify_program(enum bpf_prog_type type, const struct bpf_insn *insns, - size_t insns_cnt, int strict_alignment, - const char *license, __u32 kern_version, - char *log_buf, size_t log_buf_sz, int log_level); +LIBBPF_API int +bpf_load_program_xattr(const struct bpf_load_program_attr *load_attr, + char *log_buf, size_t log_buf_sz); +LIBBPF_API int bpf_load_program(enum bpf_prog_type type, + const struct bpf_insn *insns, size_t insns_cnt, + const char *license, __u32 kern_version, + char *log_buf, size_t log_buf_sz); +LIBBPF_API int bpf_verify_program(enum bpf_prog_type type, + const struct bpf_insn *insns, + size_t insns_cnt, int strict_alignment, + const char *license, __u32 kern_version, + char *log_buf, size_t log_buf_sz, + int log_level); -int bpf_map_update_elem(int fd, const void *key, const void *value, - __u64 flags); +LIBBPF_API int bpf_map_update_elem(int fd, const void *key, const void *value, + __u64 flags); -int bpf_map_lookup_elem(int fd, const void *key, void *value); -int bpf_map_delete_elem(int fd, const void *key); -int bpf_map_get_next_key(int fd, const void *key, void *next_key); -int bpf_obj_pin(int fd, const char *pathname); -int bpf_obj_get(const char *pathname); -int bpf_prog_attach(int prog_fd, int attachable_fd, enum bpf_attach_type type, - unsigned int flags); -int bpf_prog_detach(int attachable_fd, enum bpf_attach_type type); -int bpf_prog_detach2(int prog_fd, int attachable_fd, enum bpf_attach_type type); -int bpf_prog_test_run(int prog_fd, int repeat, void *data, __u32 size, - void *data_out, __u32 *size_out, __u32 *retval, - __u32 *duration); -int bpf_prog_get_next_id(__u32 start_id, __u32 *next_id); -int bpf_map_get_next_id(__u32 start_id, __u32 *next_id); -int bpf_prog_get_fd_by_id(__u32 id); -int bpf_map_get_fd_by_id(__u32 id); -int bpf_btf_get_fd_by_id(__u32 id); -int bpf_obj_get_info_by_fd(int prog_fd, void *info, __u32 *info_len); -int bpf_prog_query(int target_fd, enum bpf_attach_type type, __u32 query_flags, - __u32 *attach_flags, __u32 *prog_ids, __u32 *prog_cnt); -int bpf_raw_tracepoint_open(const char *name, int prog_fd); -int bpf_load_btf(void *btf, __u32 btf_size, char *log_buf, __u32 log_buf_size, - bool do_log); -int bpf_task_fd_query(int pid, int fd, __u32 flags, char *buf, __u32 *buf_len, - __u32 *prog_id, __u32 *fd_type, __u64 *probe_offset, - __u64 *probe_addr); +LIBBPF_API int bpf_map_lookup_elem(int fd, const void *key, void *value); +LIBBPF_API int bpf_map_delete_elem(int fd, const void *key); +LIBBPF_API int bpf_map_get_next_key(int fd, const void *key, void *next_key); +LIBBPF_API int bpf_obj_pin(int fd, const char *pathname); +LIBBPF_API int bpf_obj_get(const char *pathname); +LIBBPF_API int bpf_prog_attach(int prog_fd, int attachable_fd, + enum bpf_attach_type type, unsigned int flags); +LIBBPF_API int bpf_prog_detach(int attachable_fd, enum bpf_attach_type type); +LIBBPF_API int bpf_prog_detach2(int prog_fd, int attachable_fd, + enum bpf_attach_type type); +LIBBPF_API int bpf_prog_test_run(int prog_fd, int repeat, void *data, + __u32 size, void *data_out, __u32 *size_out, + __u32 *retval, __u32 *duration); +LIBBPF_API int bpf_prog_get_next_id(__u32 start_id, __u32 *next_id); +LIBBPF_API int bpf_map_get_next_id(__u32 start_id, __u32 *next_id); +LIBBPF_API int bpf_prog_get_fd_by_id(__u32 id); +LIBBPF_API int bpf_map_get_fd_by_id(__u32 id); +LIBBPF_API int bpf_btf_get_fd_by_id(__u32 id); +LIBBPF_API int bpf_obj_get_info_by_fd(int prog_fd, void *info, __u32 *info_len); +LIBBPF_API int bpf_prog_query(int target_fd, enum bpf_attach_type type, + __u32 query_flags, __u32 *attach_flags, + __u32 *prog_ids, __u32 *prog_cnt); +LIBBPF_API int bpf_raw_tracepoint_open(const char *name, int prog_fd); +LIBBPF_API int bpf_load_btf(void *btf, __u32 btf_size, char *log_buf, + __u32 log_buf_size, bool do_log); +LIBBPF_API int bpf_task_fd_query(int pid, int fd, __u32 flags, char *buf, + __u32 *buf_len, __u32 *prog_id, __u32 *fd_type, + __u64 *probe_offset, __u64 *probe_addr); #endif /* __LIBBPF_BPF_H */ diff --git a/tools/lib/bpf/btf.h b/tools/lib/bpf/btf.h index 6db5462bb2ef..b77e7080f7e7 100644 --- a/tools/lib/bpf/btf.h +++ b/tools/lib/bpf/btf.h @@ -6,6 +6,10 @@ #include +#ifndef LIBBPF_API +#define LIBBPF_API __attribute__((visibility("default"))) +#endif + #define BTF_ELF_SEC ".BTF" struct btf; @@ -14,13 +18,15 @@ struct btf_type; typedef int (*btf_print_fn_t)(const char *, ...) __attribute__((format(printf, 1, 2))); -void btf__free(struct btf *btf); -struct btf *btf__new(__u8 *data, __u32 size, btf_print_fn_t err_log); -__s32 btf__find_by_name(const struct btf *btf, const char *type_name); -const struct btf_type *btf__type_by_id(const struct btf *btf, __u32 id); -__s64 btf__resolve_size(const struct btf *btf, __u32 type_id); -int btf__resolve_type(const struct btf *btf, __u32 type_id); -int btf__fd(const struct btf *btf); -const char *btf__name_by_offset(const struct btf *btf, __u32 offset); +LIBBPF_API void btf__free(struct btf *btf); +LIBBPF_API struct btf *btf__new(__u8 *data, __u32 size, btf_print_fn_t err_log); +LIBBPF_API __s32 btf__find_by_name(const struct btf *btf, + const char *type_name); +LIBBPF_API const struct btf_type *btf__type_by_id(const struct btf *btf, + __u32 id); +LIBBPF_API __s64 btf__resolve_size(const struct btf *btf, __u32 type_id); +LIBBPF_API int btf__resolve_type(const struct btf *btf, __u32 type_id); +LIBBPF_API int btf__fd(const struct btf *btf); +LIBBPF_API const char *btf__name_by_offset(const struct btf *btf, __u32 offset); #endif /* __LIBBPF_BTF_H */ diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h index 7e9c801a9fdd..1354cc9f8cba 100644 --- a/tools/lib/bpf/libbpf.h +++ b/tools/lib/bpf/libbpf.h @@ -16,6 +16,10 @@ #include // for size_t #include +#ifndef LIBBPF_API +#define LIBBPF_API __attribute__((visibility("default"))) +#endif + enum libbpf_errno { __LIBBPF_ERRNO__START = 4000, @@ -37,7 +41,7 @@ enum libbpf_errno { __LIBBPF_ERRNO__END, }; -int libbpf_strerror(int err, char *buf, size_t size); +LIBBPF_API int libbpf_strerror(int err, char *buf, size_t size); /* * __printf is defined in include/linux/compiler-gcc.h. However, @@ -47,9 +51,9 @@ int libbpf_strerror(int err, char *buf, size_t size); typedef int (*libbpf_print_fn_t)(const char *, ...) __attribute__((format(printf, 1, 2))); -void libbpf_set_print(libbpf_print_fn_t warn, - libbpf_print_fn_t info, - libbpf_print_fn_t debug); +LIBBPF_API void libbpf_set_print(libbpf_print_fn_t warn, + libbpf_print_fn_t info, + libbpf_print_fn_t debug); /* Hide internal to user */ struct bpf_object; @@ -59,27 +63,28 @@ struct bpf_object_open_attr { enum bpf_prog_type prog_type; }; -struct bpf_object *bpf_object__open(const char *path); -struct bpf_object *bpf_object__open_xattr(struct bpf_object_open_attr *attr); +LIBBPF_API struct bpf_object *bpf_object__open(const char *path); +LIBBPF_API struct bpf_object * +bpf_object__open_xattr(struct bpf_object_open_attr *attr); struct bpf_object *__bpf_object__open_xattr(struct bpf_object_open_attr *attr, int flags); -struct bpf_object *bpf_object__open_buffer(void *obj_buf, - size_t obj_buf_sz, - const char *name); -int bpf_object__pin(struct bpf_object *object, const char *path); -void bpf_object__close(struct bpf_object *object); +LIBBPF_API struct bpf_object *bpf_object__open_buffer(void *obj_buf, + size_t obj_buf_sz, + const char *name); +LIBBPF_API int bpf_object__pin(struct bpf_object *object, const char *path); +LIBBPF_API void bpf_object__close(struct bpf_object *object); /* Load/unload object into/from kernel */ -int bpf_object__load(struct bpf_object *obj); -int bpf_object__unload(struct bpf_object *obj); -const char *bpf_object__name(struct bpf_object *obj); -unsigned int bpf_object__kversion(struct bpf_object *obj); -int bpf_object__btf_fd(const struct bpf_object *obj); +LIBBPF_API int bpf_object__load(struct bpf_object *obj); +LIBBPF_API int bpf_object__unload(struct bpf_object *obj); +LIBBPF_API const char *bpf_object__name(struct bpf_object *obj); +LIBBPF_API unsigned int bpf_object__kversion(struct bpf_object *obj); +LIBBPF_API int bpf_object__btf_fd(const struct bpf_object *obj); -struct bpf_program * +LIBBPF_API struct bpf_program * bpf_object__find_program_by_title(struct bpf_object *obj, const char *title); -struct bpf_object *bpf_object__next(struct bpf_object *prev); +LIBBPF_API struct bpf_object *bpf_object__next(struct bpf_object *prev); #define bpf_object__for_each_safe(pos, tmp) \ for ((pos) = bpf_object__next(NULL), \ (tmp) = bpf_object__next(pos); \ @@ -87,19 +92,20 @@ struct bpf_object *bpf_object__next(struct bpf_object *prev); (pos) = (tmp), (tmp) = bpf_object__next(tmp)) typedef void (*bpf_object_clear_priv_t)(struct bpf_object *, void *); -int bpf_object__set_priv(struct bpf_object *obj, void *priv, - bpf_object_clear_priv_t clear_priv); -void *bpf_object__priv(struct bpf_object *prog); +LIBBPF_API int bpf_object__set_priv(struct bpf_object *obj, void *priv, + bpf_object_clear_priv_t clear_priv); +LIBBPF_API void *bpf_object__priv(struct bpf_object *prog); -int libbpf_prog_type_by_name(const char *name, enum bpf_prog_type *prog_type, - enum bpf_attach_type *expected_attach_type); -int libbpf_attach_type_by_name(const char *name, - enum bpf_attach_type *attach_type); +LIBBPF_API int +libbpf_prog_type_by_name(const char *name, enum bpf_prog_type *prog_type, + enum bpf_attach_type *expected_attach_type); +LIBBPF_API int libbpf_attach_type_by_name(const char *name, + enum bpf_attach_type *attach_type); /* Accessors of bpf_program */ struct bpf_program; -struct bpf_program *bpf_program__next(struct bpf_program *prog, - struct bpf_object *obj); +LIBBPF_API struct bpf_program *bpf_program__next(struct bpf_program *prog, + struct bpf_object *obj); #define bpf_object__for_each_program(pos, obj) \ for ((pos) = bpf_program__next(NULL, (obj)); \ @@ -109,21 +115,24 @@ struct bpf_program *bpf_program__next(struct bpf_program *prog, typedef void (*bpf_program_clear_priv_t)(struct bpf_program *, void *); -int bpf_program__set_priv(struct bpf_program *prog, void *priv, - bpf_program_clear_priv_t clear_priv); +LIBBPF_API int bpf_program__set_priv(struct bpf_program *prog, void *priv, + bpf_program_clear_priv_t clear_priv); -void *bpf_program__priv(struct bpf_program *prog); -void bpf_program__set_ifindex(struct bpf_program *prog, __u32 ifindex); +LIBBPF_API void *bpf_program__priv(struct bpf_program *prog); +LIBBPF_API void bpf_program__set_ifindex(struct bpf_program *prog, + __u32 ifindex); -const char *bpf_program__title(struct bpf_program *prog, bool needs_copy); +LIBBPF_API const char *bpf_program__title(struct bpf_program *prog, + bool needs_copy); -int bpf_program__load(struct bpf_program *prog, char *license, - __u32 kern_version); -int bpf_program__fd(struct bpf_program *prog); -int bpf_program__pin_instance(struct bpf_program *prog, const char *path, - int instance); -int bpf_program__pin(struct bpf_program *prog, const char *path); -void bpf_program__unload(struct bpf_program *prog); +LIBBPF_API int bpf_program__load(struct bpf_program *prog, char *license, + __u32 kern_version); +LIBBPF_API int bpf_program__fd(struct bpf_program *prog); +LIBBPF_API int bpf_program__pin_instance(struct bpf_program *prog, + const char *path, + int instance); +LIBBPF_API int bpf_program__pin(struct bpf_program *prog, const char *path); +LIBBPF_API void bpf_program__unload(struct bpf_program *prog); struct bpf_insn; @@ -184,34 +193,36 @@ typedef int (*bpf_program_prep_t)(struct bpf_program *prog, int n, struct bpf_insn *insns, int insns_cnt, struct bpf_prog_prep_result *res); -int bpf_program__set_prep(struct bpf_program *prog, int nr_instance, - bpf_program_prep_t prep); +LIBBPF_API int bpf_program__set_prep(struct bpf_program *prog, int nr_instance, + bpf_program_prep_t prep); -int bpf_program__nth_fd(struct bpf_program *prog, int n); +LIBBPF_API int bpf_program__nth_fd(struct bpf_program *prog, int n); /* * Adjust type of BPF program. Default is kprobe. */ -int bpf_program__set_socket_filter(struct bpf_program *prog); -int bpf_program__set_tracepoint(struct bpf_program *prog); -int bpf_program__set_raw_tracepoint(struct bpf_program *prog); -int bpf_program__set_kprobe(struct bpf_program *prog); -int bpf_program__set_sched_cls(struct bpf_program *prog); -int bpf_program__set_sched_act(struct bpf_program *prog); -int bpf_program__set_xdp(struct bpf_program *prog); -int bpf_program__set_perf_event(struct bpf_program *prog); -void bpf_program__set_type(struct bpf_program *prog, enum bpf_prog_type type); -void bpf_program__set_expected_attach_type(struct bpf_program *prog, - enum bpf_attach_type type); - -bool bpf_program__is_socket_filter(struct bpf_program *prog); -bool bpf_program__is_tracepoint(struct bpf_program *prog); -bool bpf_program__is_raw_tracepoint(struct bpf_program *prog); -bool bpf_program__is_kprobe(struct bpf_program *prog); -bool bpf_program__is_sched_cls(struct bpf_program *prog); -bool bpf_program__is_sched_act(struct bpf_program *prog); -bool bpf_program__is_xdp(struct bpf_program *prog); -bool bpf_program__is_perf_event(struct bpf_program *prog); +LIBBPF_API int bpf_program__set_socket_filter(struct bpf_program *prog); +LIBBPF_API int bpf_program__set_tracepoint(struct bpf_program *prog); +LIBBPF_API int bpf_program__set_raw_tracepoint(struct bpf_program *prog); +LIBBPF_API int bpf_program__set_kprobe(struct bpf_program *prog); +LIBBPF_API int bpf_program__set_sched_cls(struct bpf_program *prog); +LIBBPF_API int bpf_program__set_sched_act(struct bpf_program *prog); +LIBBPF_API int bpf_program__set_xdp(struct bpf_program *prog); +LIBBPF_API int bpf_program__set_perf_event(struct bpf_program *prog); +LIBBPF_API void bpf_program__set_type(struct bpf_program *prog, + enum bpf_prog_type type); +LIBBPF_API void +bpf_program__set_expected_attach_type(struct bpf_program *prog, + enum bpf_attach_type type); + +LIBBPF_API bool bpf_program__is_socket_filter(struct bpf_program *prog); +LIBBPF_API bool bpf_program__is_tracepoint(struct bpf_program *prog); +LIBBPF_API bool bpf_program__is_raw_tracepoint(struct bpf_program *prog); +LIBBPF_API bool bpf_program__is_kprobe(struct bpf_program *prog); +LIBBPF_API bool bpf_program__is_sched_cls(struct bpf_program *prog); +LIBBPF_API bool bpf_program__is_sched_act(struct bpf_program *prog); +LIBBPF_API bool bpf_program__is_xdp(struct bpf_program *prog); +LIBBPF_API bool bpf_program__is_perf_event(struct bpf_program *prog); /* * No need for __attribute__((packed)), all members of 'bpf_map_def' @@ -232,39 +243,39 @@ struct bpf_map_def { * so no need to worry about a name clash. */ struct bpf_map; -struct bpf_map * +LIBBPF_API struct bpf_map * bpf_object__find_map_by_name(struct bpf_object *obj, const char *name); /* * Get bpf_map through the offset of corresponding struct bpf_map_def * in the BPF object file. */ -struct bpf_map * +LIBBPF_API struct bpf_map * bpf_object__find_map_by_offset(struct bpf_object *obj, size_t offset); -struct bpf_map * +LIBBPF_API struct bpf_map * bpf_map__next(struct bpf_map *map, struct bpf_object *obj); #define bpf_map__for_each(pos, obj) \ for ((pos) = bpf_map__next(NULL, (obj)); \ (pos) != NULL; \ (pos) = bpf_map__next((pos), (obj))) -int bpf_map__fd(struct bpf_map *map); -const struct bpf_map_def *bpf_map__def(struct bpf_map *map); -const char *bpf_map__name(struct bpf_map *map); -__u32 bpf_map__btf_key_type_id(const struct bpf_map *map); -__u32 bpf_map__btf_value_type_id(const struct bpf_map *map); +LIBBPF_API int bpf_map__fd(struct bpf_map *map); +LIBBPF_API const struct bpf_map_def *bpf_map__def(struct bpf_map *map); +LIBBPF_API const char *bpf_map__name(struct bpf_map *map); +LIBBPF_API __u32 bpf_map__btf_key_type_id(const struct bpf_map *map); +LIBBPF_API __u32 bpf_map__btf_value_type_id(const struct bpf_map *map); typedef void (*bpf_map_clear_priv_t)(struct bpf_map *, void *); -int bpf_map__set_priv(struct bpf_map *map, void *priv, - bpf_map_clear_priv_t clear_priv); -void *bpf_map__priv(struct bpf_map *map); -int bpf_map__reuse_fd(struct bpf_map *map, int fd); -bool bpf_map__is_offload_neutral(struct bpf_map *map); -void bpf_map__set_ifindex(struct bpf_map *map, __u32 ifindex); -int bpf_map__pin(struct bpf_map *map, const char *path); +LIBBPF_API int bpf_map__set_priv(struct bpf_map *map, void *priv, + bpf_map_clear_priv_t clear_priv); +LIBBPF_API void *bpf_map__priv(struct bpf_map *map); +LIBBPF_API int bpf_map__reuse_fd(struct bpf_map *map, int fd); +LIBBPF_API bool bpf_map__is_offload_neutral(struct bpf_map *map); +LIBBPF_API void bpf_map__set_ifindex(struct bpf_map *map, __u32 ifindex); +LIBBPF_API int bpf_map__pin(struct bpf_map *map, const char *path); -long libbpf_get_error(const void *ptr); +LIBBPF_API long libbpf_get_error(const void *ptr); struct bpf_prog_load_attr { const char *file; @@ -273,12 +284,12 @@ struct bpf_prog_load_attr { int ifindex; }; -int bpf_prog_load_xattr(const struct bpf_prog_load_attr *attr, - struct bpf_object **pobj, int *prog_fd); -int bpf_prog_load(const char *file, enum bpf_prog_type type, - struct bpf_object **pobj, int *prog_fd); +LIBBPF_API int bpf_prog_load_xattr(const struct bpf_prog_load_attr *attr, + struct bpf_object **pobj, int *prog_fd); +LIBBPF_API int bpf_prog_load(const char *file, enum bpf_prog_type type, + struct bpf_object **pobj, int *prog_fd); -int bpf_set_link_xdp_fd(int ifindex, int fd, __u32 flags); +LIBBPF_API int bpf_set_link_xdp_fd(int ifindex, int fd, __u32 flags); enum bpf_perf_event_ret { LIBBPF_PERF_EVENT_DONE = 0, @@ -288,10 +299,11 @@ enum bpf_perf_event_ret { typedef enum bpf_perf_event_ret (*bpf_perf_event_print_t)(void *event, void *priv); -int bpf_perf_event_read_simple(void *mem, unsigned long size, - unsigned long page_size, - void **buf, size_t *buf_len, - bpf_perf_event_print_t fn, void *priv); +LIBBPF_API int bpf_perf_event_read_simple(void *mem, unsigned long size, + unsigned long page_size, + void **buf, size_t *buf_len, + bpf_perf_event_print_t fn, + void *priv); struct nlattr; typedef int (*libbpf_dump_nlmsg_t)(void *cookie, void *msg, struct nlattr **tb); -- Gitee From 61119248247416b0589b62f221efab98c8e4e534 Mon Sep 17 00:00:00 2001 From: Nicolas Dichtel Date: Mon, 19 Jun 2023 15:52:33 +0800 Subject: [PATCH 24/39] bpf: fix doc of bpf_skb_adjust_room() in uapi ANBZ: #5530 commit b55cbc8d9b44aaee94f19e995a5f241d453763ee upstream. len_diff is signed. Fixes: fa15601ab31e ("bpf: add documentation for eBPF helpers (33-41)") CC: Quentin Monnet Signed-off-by: Nicolas Dichtel Reviewed-by: Quentin Monnet Signed-off-by: Alexei Starovoitov Signed-off-by: Xuezhi Zhao Acked-by: Tianchen Ding Reviewed-by: Yuanhe Shu Acked-by: Tony Lu Link: https://gitee.com/anolis/cloud-kernel/pulls/2035 --- include/uapi/linux/bpf.h | 2 +- tools/include/uapi/linux/bpf.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index e9d5c881eebd..af27ca7c6e3e 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1435,7 +1435,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_skb_adjust_room(struct sk_buff *skb, u32 len_diff, u32 mode, u64 flags) + * int bpf_skb_adjust_room(struct sk_buff *skb, s32 len_diff, u32 mode, u64 flags) * Description * Grow or shrink the room for data in the packet associated to * *skb* by *len_diff*, and according to the selected *mode*. diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 1f5c94f443ca..9e5ee7fa9391 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -1433,7 +1433,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_skb_adjust_room(struct sk_buff *skb, u32 len_diff, u32 mode, u64 flags) + * int bpf_skb_adjust_room(struct sk_buff *skb, s32 len_diff, u32 mode, u64 flags) * Description * Grow or shrink the room for data in the packet associated to * *skb* by *len_diff*, and according to the selected *mode*. -- Gitee From 30f1db83e0474d3f4aea582639acad9b13c18b2e Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 19 Jun 2023 15:55:48 +0800 Subject: [PATCH 25/39] tools: bpftool: use 4 context mode for the NFP disasm ANBZ: #5530 commit 3ddeac6705aba31b7528c7d7a528eabb74475622 upstream. The nfp driver is currently always JITing the BPF for 4 context/thread mode of the NFP flow processors. Tell this to the disassembler, otherwise some registers may be incorrectly decoded. Signed-off-by: Jakub Kicinski Reviewed-by: Jiong Wang Signed-off-by: Daniel Borkmann Signed-off-by: Xuezhi Zhao Acked-by: Tianchen Ding Reviewed-by: Yuanhe Shu Acked-by: Tony Lu Link: https://gitee.com/anolis/cloud-kernel/pulls/2035 --- tools/bpf/bpftool/common.c | 5 ++++- tools/bpf/bpftool/jit_disasm.c | 4 +++- tools/bpf/bpftool/main.h | 6 ++++-- tools/bpf/bpftool/prog.c | 14 +++++++++----- 4 files changed, 20 insertions(+), 9 deletions(-) diff --git a/tools/bpf/bpftool/common.c b/tools/bpf/bpftool/common.c index aeb4b7035151..e7bba3b7aaca 100644 --- a/tools/bpf/bpftool/common.c +++ b/tools/bpf/bpftool/common.c @@ -555,7 +555,9 @@ static int read_sysfs_netdev_hex_int(char *devname, const char *entry_name) return read_sysfs_hex_int(full_path); } -const char *ifindex_to_bfd_name_ns(__u32 ifindex, __u64 ns_dev, __u64 ns_ino) +const char * +ifindex_to_bfd_params(__u32 ifindex, __u64 ns_dev, __u64 ns_ino, + const char **opt) { char devname[IF_NAMESIZE]; int vendor_id; @@ -580,6 +582,7 @@ const char *ifindex_to_bfd_name_ns(__u32 ifindex, __u64 ns_dev, __u64 ns_ino) device_id != 0x6000 && device_id != 0x6003) p_info("Unknown NFP device ID, assuming it is NFP-6xxx arch"); + *opt = "ctx4"; return "NFP-6xxx"; default: p_err("Can't get bfd arch name for device vendor id 0x%04x", diff --git a/tools/bpf/bpftool/jit_disasm.c b/tools/bpf/bpftool/jit_disasm.c index 73d7252729fa..b61f6b475e95 100644 --- a/tools/bpf/bpftool/jit_disasm.c +++ b/tools/bpf/bpftool/jit_disasm.c @@ -80,7 +80,7 @@ static int fprintf_json(void *out, const char *fmt, ...) } void disasm_print_insn(unsigned char *image, ssize_t len, int opcodes, - const char *arch) + const char *arch, const char *disassembler_options) { disassembler_ftype disassemble; struct disassemble_info info; @@ -119,6 +119,8 @@ void disasm_print_insn(unsigned char *image, ssize_t len, int opcodes, info.arch = bfd_get_arch(bfdf); info.mach = bfd_get_mach(bfdf); + if (disassembler_options) + info.disassembler_options = disassembler_options; info.buffer = image; info.buffer_length = len; diff --git a/tools/bpf/bpftool/main.h b/tools/bpf/bpftool/main.h index 49266f93b856..a8bf1e2d9818 100644 --- a/tools/bpf/bpftool/main.h +++ b/tools/bpf/bpftool/main.h @@ -145,13 +145,15 @@ int map_parse_fd(int *argc, char ***argv); int map_parse_fd_and_info(int *argc, char ***argv, void *info, __u32 *info_len); void disasm_print_insn(unsigned char *image, ssize_t len, int opcodes, - const char *arch); + const char *arch, const char *disassembler_options); void print_data_json(uint8_t *data, size_t len); void print_hex_data_json(uint8_t *data, size_t len); unsigned int get_page_size(void); unsigned int get_possible_cpus(void); -const char *ifindex_to_bfd_name_ns(__u32 ifindex, __u64 ns_dev, __u64 ns_ino); +const char * +ifindex_to_bfd_params(__u32 ifindex, __u64 ns_dev, __u64 ns_ino, + const char **opt); struct btf_dumper { const struct btf *btf; diff --git a/tools/bpf/bpftool/prog.c b/tools/bpf/bpftool/prog.c index 70f35e9296b2..3a2bac48a965 100644 --- a/tools/bpf/bpftool/prog.c +++ b/tools/bpf/bpftool/prog.c @@ -452,6 +452,7 @@ static int do_dump(int argc, char **argv) unsigned long *func_ksyms = NULL; struct bpf_prog_info info = {}; unsigned int *func_lens = NULL; + const char *disasm_opt = NULL; unsigned int nr_func_ksyms; unsigned int nr_func_lens; struct dump_data dd = {}; @@ -610,9 +611,10 @@ static int do_dump(int argc, char **argv) const char *name = NULL; if (info.ifindex) { - name = ifindex_to_bfd_name_ns(info.ifindex, - info.netns_dev, - info.netns_ino); + name = ifindex_to_bfd_params(info.ifindex, + info.netns_dev, + info.netns_ino, + &disasm_opt); if (!name) goto err_free; } @@ -654,7 +656,8 @@ static int do_dump(int argc, char **argv) printf("%s:\n", sym_name); } - disasm_print_insn(img, lens[i], opcodes, name); + disasm_print_insn(img, lens[i], opcodes, name, + disasm_opt); img += lens[i]; if (json_output) @@ -666,7 +669,8 @@ static int do_dump(int argc, char **argv) if (json_output) jsonw_end_array(json_wtr); } else { - disasm_print_insn(buf, *member_len, opcodes, name); + disasm_print_insn(buf, *member_len, opcodes, name, + disasm_opt); } } else if (visual) { if (json_output) -- Gitee From 0b3e6ee29d8de97980aac1bf3f4614da28b9fb03 Mon Sep 17 00:00:00 2001 From: Mauricio Vasquez B Date: Mon, 19 Jun 2023 16:18:21 +0800 Subject: [PATCH 26/39] bpf: rename stack trace map operations ANBZ: #5530 commit 144991602e6a14d667b295f1b099e609ce857772 upstream. In the following patches queue and stack maps (FIFO and LIFO datastructures) will be implemented. In order to avoid confusion and a possible name clash rename stack_map_ops to stack_trace_map_ops Signed-off-by: Mauricio Vasquez B Acked-by: Song Liu Signed-off-by: Alexei Starovoitov Signed-off-by: Xuezhi Zhao Acked-by: Tianchen Ding Reviewed-by: Yuanhe Shu Acked-by: Tony Lu Link: https://gitee.com/anolis/cloud-kernel/pulls/2035 --- include/linux/bpf_types.h | 2 +- kernel/bpf/stackmap.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h index fa48343a5ea1..7bad4e1947ed 100644 --- a/include/linux/bpf_types.h +++ b/include/linux/bpf_types.h @@ -51,7 +51,7 @@ BPF_MAP_TYPE(BPF_MAP_TYPE_LRU_HASH, htab_lru_map_ops) BPF_MAP_TYPE(BPF_MAP_TYPE_LRU_PERCPU_HASH, htab_lru_percpu_map_ops) BPF_MAP_TYPE(BPF_MAP_TYPE_LPM_TRIE, trie_map_ops) #ifdef CONFIG_PERF_EVENTS -BPF_MAP_TYPE(BPF_MAP_TYPE_STACK_TRACE, stack_map_ops) +BPF_MAP_TYPE(BPF_MAP_TYPE_STACK_TRACE, stack_trace_map_ops) #endif BPF_MAP_TYPE(BPF_MAP_TYPE_ARRAY_OF_MAPS, array_of_maps_map_ops) BPF_MAP_TYPE(BPF_MAP_TYPE_HASH_OF_MAPS, htab_of_maps_map_ops) diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index c0f1152bc27b..b036849f4adc 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -613,7 +613,7 @@ static void stack_map_free(struct bpf_map *map) put_callchain_buffers(); } -const struct bpf_map_ops stack_map_ops = { +const struct bpf_map_ops stack_trace_map_ops = { .map_alloc = stack_map_alloc, .map_free = stack_map_free, .map_get_next_key = stack_map_get_next_key, -- Gitee From d21db0136597745510cb9632e3a1503c0ea540c9 Mon Sep 17 00:00:00 2001 From: Mauricio Vasquez B Date: Mon, 19 Jun 2023 16:27:57 +0800 Subject: [PATCH 27/39] bpf/syscall: allow key to be null in map functions ANBZ: #5530 commit c9d29f4658a5a6d2c2ba2afeb20ff763fc6286f9 upstream. This commit adds the required logic to allow key being NULL in case the key_size of the map is 0. A new __bpf_copy_key function helper only copies the key from userpsace when key_size != 0, otherwise it enforces that key must be null. Signed-off-by: Mauricio Vasquez B Acked-by: Song Liu Signed-off-by: Alexei Starovoitov Signed-off-by: Xuezhi Zhao Acked-by: Tianchen Ding Reviewed-by: Yuanhe Shu Acked-by: Tony Lu Link: https://gitee.com/anolis/cloud-kernel/pulls/2035 --- kernel/bpf/syscall.c | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 57dfd43a3126..84dd9a8dd583 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -652,6 +652,17 @@ int __weak bpf_stackmap_copy(struct bpf_map *map, void *key, void *value) return -ENOTSUPP; } +static void *__bpf_copy_key(void __user *ukey, u64 key_size) +{ + if (key_size) + return memdup_user(ukey, key_size); + + if (ukey) + return ERR_PTR(-EINVAL); + + return NULL; +} + /* last field in 'union bpf_attr' used by this command */ #define BPF_MAP_LOOKUP_ELEM_LAST_FIELD value @@ -679,7 +690,7 @@ static int map_lookup_elem(union bpf_attr *attr) goto err_put; } - key = memdup_user(ukey, map->key_size); + key = __bpf_copy_key(ukey, map->key_size); if (IS_ERR(key)) { err = PTR_ERR(key); goto err_put; @@ -798,7 +809,7 @@ static int map_update_elem(union bpf_attr *attr) goto err_put; } - key = memdup_user(ukey, map->key_size); + key = __bpf_copy_key(ukey, map->key_size); if (IS_ERR(key)) { err = PTR_ERR(key); goto err_put; @@ -901,7 +912,7 @@ static int map_delete_elem(union bpf_attr *attr) goto err_put; } - key = memdup_user(ukey, map->key_size); + key = __bpf_copy_key(ukey, map->key_size); if (IS_ERR(key)) { err = PTR_ERR(key); goto err_put; @@ -954,7 +965,7 @@ static int map_get_next_key(union bpf_attr *attr) } if (ukey) { - key = memdup_user(ukey, map->key_size); + key = __bpf_copy_key(ukey, map->key_size); if (IS_ERR(key)) { err = PTR_ERR(key); goto err_put; -- Gitee From 524346e1b5f749d8fd380203fa6f4d08e2a9678c Mon Sep 17 00:00:00 2001 From: Mauricio Vasquez B Date: Mon, 19 Jun 2023 16:48:54 +0800 Subject: [PATCH 28/39] bpf/verifier: add ARG_PTR_TO_UNINIT_MAP_VALUE ANBZ: #5530 commit 2ea864c58f19bf70a0e2415f9f1c53814e07f1b4 upstream. ARG_PTR_TO_UNINIT_MAP_VALUE argument is a pointer to a memory zone used to save the value of a map. Basically the same as ARG_PTR_TO_UNINIT_MEM, but the size has not be passed as an extra argument. This will be used in the following patch that implements some new helpers that receive a pointer to be filled with a map value. Signed-off-by: Mauricio Vasquez B Acked-by: Song Liu Signed-off-by: Alexei Starovoitov Signed-off-by: Xuezhi Zhao Acked-by: Tianchen Ding Reviewed-by: Yuanhe Shu Acked-by: Tony Lu Link: https://gitee.com/anolis/cloud-kernel/pulls/2035 --- include/linux/bpf.h | 1 + kernel/bpf/verifier.c | 9 ++++++--- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index bcb4d59d97bc..48db5c3f7edc 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -144,6 +144,7 @@ enum bpf_arg_type { ARG_CONST_MAP_PTR, /* const argument used as pointer to bpf_map */ ARG_PTR_TO_MAP_KEY, /* pointer to stack used as map key */ ARG_PTR_TO_MAP_VALUE, /* pointer to stack used as map value */ + ARG_PTR_TO_UNINIT_MAP_VALUE, /* pointer to valid memory used to store a map value */ /* the following constraints used to prototype bpf_memcmp() and other * functions that access data on eBPF program stack diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index b0a1e040edb4..e187e6b0780e 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -2250,7 +2250,8 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, } if (arg_type == ARG_PTR_TO_MAP_KEY || - arg_type == ARG_PTR_TO_MAP_VALUE) { + arg_type == ARG_PTR_TO_MAP_VALUE || + arg_type == ARG_PTR_TO_UNINIT_MAP_VALUE) { expected_type = PTR_TO_STACK; if (!type_is_pkt_pointer(type) && type != PTR_TO_MAP_VALUE && type != expected_type) @@ -2320,7 +2321,8 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, err = check_helper_mem_access(env, regno, meta->map_ptr->key_size, false, NULL); - } else if (arg_type == ARG_PTR_TO_MAP_VALUE) { + } else if (arg_type == ARG_PTR_TO_MAP_VALUE || + arg_type == ARG_PTR_TO_UNINIT_MAP_VALUE) { /* bpf_map_xxx(..., map_ptr, ..., value) call: * check [value, value + map->value_size) validity */ @@ -2329,9 +2331,10 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, verbose(env, "invalid map_ptr to access map->value\n"); return -EACCES; } + meta->raw_mode = (arg_type == ARG_PTR_TO_UNINIT_MAP_VALUE); err = check_helper_mem_access(env, regno, meta->map_ptr->value_size, false, - NULL); + meta); } else if (arg_type_is_mem_size(arg_type)) { bool zero_size_allowed = (arg_type == ARG_CONST_SIZE_OR_ZERO); -- Gitee From 4e80e42972b26a7b39ae97119e8ed05b8452a0dd Mon Sep 17 00:00:00 2001 From: Mauricio Vasquez B Date: Mon, 19 Jun 2023 17:20:10 +0800 Subject: [PATCH 29/39] bpf: add queue and stack maps ANBZ: #5530 commit f1a2e44a3aeccb3ff18d3ccc0b0203e70b95bd92 upstream. Queue/stack maps implement a FIFO/LIFO data storage for ebpf programs. These maps support peek, pop and push operations that are exposed to eBPF programs through the new bpf_map[peek/pop/push] helpers. Those operations are exposed to userspace applications through the already existing syscalls in the following way: BPF_MAP_LOOKUP_ELEM -> peek BPF_MAP_LOOKUP_AND_DELETE_ELEM -> pop BPF_MAP_UPDATE_ELEM -> push Queue/stack maps are implemented using a buffer, tail and head indexes, hence BPF_F_NO_PREALLOC is not supported. As opposite to other maps, queue and stack do not use RCU for protecting maps values, the bpf_map[peek/pop] have a ARG_PTR_TO_UNINIT_MAP_VALUE argument that is a pointer to a memory zone where to save the value of a map. Basically the same as ARG_PTR_TO_UNINIT_MEM, but the size has not be passed as an extra argument. Our main motivation for implementing queue/stack maps was to keep track of a pool of elements, like network ports in a SNAT, however we forsee other use cases, like for exampling saving last N kernel events in a map and then analysing from userspace. Signed-off-by: Mauricio Vasquez B Acked-by: Song Liu Signed-off-by: Alexei Starovoitov Signed-off-by: Xuezhi Zhao Acked-by: Tianchen Ding Reviewed-by: Yuanhe Shu Acked-by: Tony Lu Link: https://gitee.com/anolis/cloud-kernel/pulls/2035 --- include/linux/bpf.h | 6 + include/linux/bpf_types.h | 2 + include/uapi/linux/bpf.h | 29 +++- kernel/bpf/Makefile | 2 +- kernel/bpf/core.c | 3 + kernel/bpf/helpers.c | 43 +++++ kernel/bpf/queue_stack_maps.c | 288 ++++++++++++++++++++++++++++++++++ kernel/bpf/syscall.c | 6 + kernel/bpf/verifier.c | 19 ++- net/core/filter.c | 6 + 10 files changed, 401 insertions(+), 3 deletions(-) create mode 100644 kernel/bpf/queue_stack_maps.c diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 48db5c3f7edc..7109dbf297f3 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -40,6 +40,9 @@ struct bpf_map_ops { void *(*map_lookup_elem)(struct bpf_map *map, void *key); int (*map_update_elem)(struct bpf_map *map, void *key, void *value, u64 flags); int (*map_delete_elem)(struct bpf_map *map, void *key); + int (*map_push_elem)(struct bpf_map *map, void *value, u64 flags); + int (*map_pop_elem)(struct bpf_map *map, void *value); + int (*map_peek_elem)(struct bpf_map *map, void *value); /* funcs called by prog_array and perf_event_array map */ void *(*map_fd_get_ptr)(struct bpf_map *map, struct file *map_file, @@ -834,6 +837,9 @@ static inline int bpf_fd_reuseport_array_update_elem(struct bpf_map *map, extern const struct bpf_func_proto bpf_map_lookup_elem_proto; extern const struct bpf_func_proto bpf_map_update_elem_proto; extern const struct bpf_func_proto bpf_map_delete_elem_proto; +extern const struct bpf_func_proto bpf_map_push_elem_proto; +extern const struct bpf_func_proto bpf_map_pop_elem_proto; +extern const struct bpf_func_proto bpf_map_peek_elem_proto; extern const struct bpf_func_proto bpf_get_prandom_u32_proto; extern const struct bpf_func_proto bpf_get_smp_processor_id_proto; diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h index 7bad4e1947ed..44d9ab4809bd 100644 --- a/include/linux/bpf_types.h +++ b/include/linux/bpf_types.h @@ -69,3 +69,5 @@ BPF_MAP_TYPE(BPF_MAP_TYPE_XSKMAP, xsk_map_ops) BPF_MAP_TYPE(BPF_MAP_TYPE_REUSEPORT_SOCKARRAY, reuseport_array_ops) #endif #endif +BPF_MAP_TYPE(BPF_MAP_TYPE_QUEUE, queue_map_ops) +BPF_MAP_TYPE(BPF_MAP_TYPE_STACK, stack_map_ops) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index af27ca7c6e3e..47a10ec97763 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -128,6 +128,8 @@ enum bpf_map_type { BPF_MAP_TYPE_CGROUP_STORAGE, BPF_MAP_TYPE_REUSEPORT_SOCKARRAY, BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE, + BPF_MAP_TYPE_QUEUE, + BPF_MAP_TYPE_STACK, }; enum bpf_prog_type { @@ -464,6 +466,28 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * + * int bpf_map_push_elem(struct bpf_map *map, const void *value, u64 flags) + * Description + * Push an element *value* in *map*. *flags* is one of: + * + * **BPF_EXIST** + * If the queue/stack is full, the oldest element is removed to + * make room for this. + * Return + * 0 on success, or a negative error in case of failure. + * + * int bpf_map_pop_elem(struct bpf_map *map, void *value) + * Description + * Pop an element from *map*. + * Return + * 0 on success, or a negative error in case of failure. + * + * int bpf_map_peek_elem(struct bpf_map *map, void *value) + * Description + * Get an element from *map* without removing it. + * Return + * 0 on success, or a negative error in case of failure. + * * int bpf_probe_read(void *dst, u32 size, const void *src) * Description * For tracing programs, safely attempt to read *size* bytes from @@ -2312,7 +2336,10 @@ union bpf_attr { FN(skb_ancestor_cgroup_id), \ FN(sk_lookup_tcp), \ FN(sk_lookup_udp), \ - FN(sk_release), + FN(sk_release), \ + FN(map_push_elem), \ + FN(map_pop_elem), \ + FN(map_peek_elem), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index 47bbc691c983..29d781061cd5 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -4,7 +4,7 @@ CFLAGS_core.o += $(call cc-disable-warning, override-init) obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o -obj-$(CONFIG_BPF_SYSCALL) += local_storage.o +obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o obj-$(CONFIG_BPF_SYSCALL) += disasm.o obj-$(CONFIG_BPF_SYSCALL) += btf.o ifeq ($(CONFIG_NET),y) diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 2dc67936b82b..c256fc1fcfe6 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1853,6 +1853,9 @@ BPF_CALL_0(bpf_user_rnd_u32) const struct bpf_func_proto bpf_map_lookup_elem_proto __weak; const struct bpf_func_proto bpf_map_update_elem_proto __weak; const struct bpf_func_proto bpf_map_delete_elem_proto __weak; +const struct bpf_func_proto bpf_map_push_elem_proto __weak; +const struct bpf_func_proto bpf_map_pop_elem_proto __weak; +const struct bpf_func_proto bpf_map_peek_elem_proto __weak; const struct bpf_func_proto bpf_get_prandom_u32_proto __weak; const struct bpf_func_proto bpf_get_smp_processor_id_proto __weak; diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 6502115e8f55..ab0d5e3f9892 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -76,6 +76,49 @@ const struct bpf_func_proto bpf_map_delete_elem_proto = { .arg2_type = ARG_PTR_TO_MAP_KEY, }; +BPF_CALL_3(bpf_map_push_elem, struct bpf_map *, map, void *, value, u64, flags) +{ + return map->ops->map_push_elem(map, value, flags); +} + +const struct bpf_func_proto bpf_map_push_elem_proto = { + .func = bpf_map_push_elem, + .gpl_only = false, + .pkt_access = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_CONST_MAP_PTR, + .arg2_type = ARG_PTR_TO_MAP_VALUE, + .arg3_type = ARG_ANYTHING, +}; + +BPF_CALL_2(bpf_map_pop_elem, struct bpf_map *, map, void *, value) +{ + return map->ops->map_pop_elem(map, value); +} + +const struct bpf_func_proto bpf_map_pop_elem_proto = { + .func = bpf_map_pop_elem, + .gpl_only = false, + .pkt_access = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_CONST_MAP_PTR, + .arg2_type = ARG_PTR_TO_UNINIT_MAP_VALUE, +}; + +BPF_CALL_2(bpf_map_peek_elem, struct bpf_map *, map, void *, value) +{ + return map->ops->map_peek_elem(map, value); +} + +const struct bpf_func_proto bpf_map_peek_elem_proto = { + .func = bpf_map_pop_elem, + .gpl_only = false, + .pkt_access = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_CONST_MAP_PTR, + .arg2_type = ARG_PTR_TO_UNINIT_MAP_VALUE, +}; + const struct bpf_func_proto bpf_get_prandom_u32_proto = { .func = bpf_user_rnd_u32, .gpl_only = false, diff --git a/kernel/bpf/queue_stack_maps.c b/kernel/bpf/queue_stack_maps.c new file mode 100644 index 000000000000..12a93fb37449 --- /dev/null +++ b/kernel/bpf/queue_stack_maps.c @@ -0,0 +1,288 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * queue_stack_maps.c: BPF queue and stack maps + * + * Copyright (c) 2018 Politecnico di Torino + */ +#include +#include +#include +#include "percpu_freelist.h" + +#define QUEUE_STACK_CREATE_FLAG_MASK \ + (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY) + + +struct bpf_queue_stack { + struct bpf_map map; + raw_spinlock_t lock; + u32 head, tail; + u32 size; /* max_entries + 1 */ + + char elements[0] __aligned(8); +}; + +static struct bpf_queue_stack *bpf_queue_stack(struct bpf_map *map) +{ + return container_of(map, struct bpf_queue_stack, map); +} + +static bool queue_stack_map_is_empty(struct bpf_queue_stack *qs) +{ + return qs->head == qs->tail; +} + +static bool queue_stack_map_is_full(struct bpf_queue_stack *qs) +{ + u32 head = qs->head + 1; + + if (unlikely(head >= qs->size)) + head = 0; + + return head == qs->tail; +} + +/* Called from syscall */ +static int queue_stack_map_alloc_check(union bpf_attr *attr) +{ + /* check sanity of attributes */ + if (attr->max_entries == 0 || attr->key_size != 0 || + attr->map_flags & ~QUEUE_STACK_CREATE_FLAG_MASK) + return -EINVAL; + + if (attr->value_size > KMALLOC_MAX_SIZE) + /* if value_size is bigger, the user space won't be able to + * access the elements. + */ + return -E2BIG; + + return 0; +} + +static struct bpf_map *queue_stack_map_alloc(union bpf_attr *attr) +{ + int ret, numa_node = bpf_map_attr_numa_node(attr); + struct bpf_queue_stack *qs; + u32 size, value_size; + u64 queue_size, cost; + + size = attr->max_entries + 1; + value_size = attr->value_size; + + queue_size = sizeof(*qs) + (u64) value_size * size; + + cost = queue_size; + if (cost >= U32_MAX - PAGE_SIZE) + return ERR_PTR(-E2BIG); + + cost = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; + + ret = bpf_map_precharge_memlock(cost); + if (ret < 0) + return ERR_PTR(ret); + + qs = bpf_map_area_alloc(queue_size, numa_node); + if (!qs) + return ERR_PTR(-ENOMEM); + + memset(qs, 0, sizeof(*qs)); + + bpf_map_init_from_attr(&qs->map, attr); + + qs->map.pages = cost; + qs->size = size; + + raw_spin_lock_init(&qs->lock); + + return &qs->map; +} + +/* Called when map->refcnt goes to zero, either from workqueue or from syscall */ +static void queue_stack_map_free(struct bpf_map *map) +{ + struct bpf_queue_stack *qs = bpf_queue_stack(map); + + /* at this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0, + * so the programs (can be more than one that used this map) were + * disconnected from events. Wait for outstanding critical sections in + * these programs to complete + */ + synchronize_rcu(); + + bpf_map_area_free(qs); +} + +static int __queue_map_get(struct bpf_map *map, void *value, bool delete) +{ + struct bpf_queue_stack *qs = bpf_queue_stack(map); + unsigned long flags; + int err = 0; + void *ptr; + + raw_spin_lock_irqsave(&qs->lock, flags); + + if (queue_stack_map_is_empty(qs)) { + err = -ENOENT; + goto out; + } + + ptr = &qs->elements[qs->tail * qs->map.value_size]; + memcpy(value, ptr, qs->map.value_size); + + if (delete) { + if (unlikely(++qs->tail >= qs->size)) + qs->tail = 0; + } + +out: + raw_spin_unlock_irqrestore(&qs->lock, flags); + return err; +} + + +static int __stack_map_get(struct bpf_map *map, void *value, bool delete) +{ + struct bpf_queue_stack *qs = bpf_queue_stack(map); + unsigned long flags; + int err = 0; + void *ptr; + u32 index; + + raw_spin_lock_irqsave(&qs->lock, flags); + + if (queue_stack_map_is_empty(qs)) { + err = -ENOENT; + goto out; + } + + index = qs->head - 1; + if (unlikely(index >= qs->size)) + index = qs->size - 1; + + ptr = &qs->elements[index * qs->map.value_size]; + memcpy(value, ptr, qs->map.value_size); + + if (delete) + qs->head = index; + +out: + raw_spin_unlock_irqrestore(&qs->lock, flags); + return err; +} + +/* Called from syscall or from eBPF program */ +static int queue_map_peek_elem(struct bpf_map *map, void *value) +{ + return __queue_map_get(map, value, false); +} + +/* Called from syscall or from eBPF program */ +static int stack_map_peek_elem(struct bpf_map *map, void *value) +{ + return __stack_map_get(map, value, false); +} + +/* Called from syscall or from eBPF program */ +static int queue_map_pop_elem(struct bpf_map *map, void *value) +{ + return __queue_map_get(map, value, true); +} + +/* Called from syscall or from eBPF program */ +static int stack_map_pop_elem(struct bpf_map *map, void *value) +{ + return __stack_map_get(map, value, true); +} + +/* Called from syscall or from eBPF program */ +static int queue_stack_map_push_elem(struct bpf_map *map, void *value, + u64 flags) +{ + struct bpf_queue_stack *qs = bpf_queue_stack(map); + unsigned long irq_flags; + int err = 0; + void *dst; + + /* BPF_EXIST is used to force making room for a new element in case the + * map is full + */ + bool replace = (flags & BPF_EXIST); + + /* Check supported flags for queue and stack maps */ + if (flags & BPF_NOEXIST || flags > BPF_EXIST) + return -EINVAL; + + raw_spin_lock_irqsave(&qs->lock, irq_flags); + + if (queue_stack_map_is_full(qs)) { + if (!replace) { + err = -E2BIG; + goto out; + } + /* advance tail pointer to overwrite oldest element */ + if (unlikely(++qs->tail >= qs->size)) + qs->tail = 0; + } + + dst = &qs->elements[qs->head * qs->map.value_size]; + memcpy(dst, value, qs->map.value_size); + + if (unlikely(++qs->head >= qs->size)) + qs->head = 0; + +out: + raw_spin_unlock_irqrestore(&qs->lock, irq_flags); + return err; +} + +/* Called from syscall or from eBPF program */ +static void *queue_stack_map_lookup_elem(struct bpf_map *map, void *key) +{ + return NULL; +} + +/* Called from syscall or from eBPF program */ +static int queue_stack_map_update_elem(struct bpf_map *map, void *key, + void *value, u64 flags) +{ + return -EINVAL; +} + +/* Called from syscall or from eBPF program */ +static int queue_stack_map_delete_elem(struct bpf_map *map, void *key) +{ + return -EINVAL; +} + +/* Called from syscall */ +static int queue_stack_map_get_next_key(struct bpf_map *map, void *key, + void *next_key) +{ + return -EINVAL; +} + +const struct bpf_map_ops queue_map_ops = { + .map_alloc_check = queue_stack_map_alloc_check, + .map_alloc = queue_stack_map_alloc, + .map_free = queue_stack_map_free, + .map_lookup_elem = queue_stack_map_lookup_elem, + .map_update_elem = queue_stack_map_update_elem, + .map_delete_elem = queue_stack_map_delete_elem, + .map_push_elem = queue_stack_map_push_elem, + .map_pop_elem = queue_map_pop_elem, + .map_peek_elem = queue_map_peek_elem, + .map_get_next_key = queue_stack_map_get_next_key, +}; + +const struct bpf_map_ops stack_map_ops = { + .map_alloc_check = queue_stack_map_alloc_check, + .map_alloc = queue_stack_map_alloc, + .map_free = queue_stack_map_free, + .map_lookup_elem = queue_stack_map_lookup_elem, + .map_update_elem = queue_stack_map_update_elem, + .map_delete_elem = queue_stack_map_delete_elem, + .map_push_elem = queue_stack_map_push_elem, + .map_pop_elem = stack_map_pop_elem, + .map_peek_elem = stack_map_peek_elem, + .map_get_next_key = queue_stack_map_get_next_key, +}; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 84dd9a8dd583..94dfc14d6074 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -733,6 +733,9 @@ static int map_lookup_elem(union bpf_attr *attr) err = bpf_fd_htab_map_lookup_elem(map, key, value); } else if (map->map_type == BPF_MAP_TYPE_REUSEPORT_SOCKARRAY) { err = bpf_fd_reuseport_array_lookup_elem(map, key, value); + } else if (map->map_type == BPF_MAP_TYPE_QUEUE || + map->map_type == BPF_MAP_TYPE_STACK) { + err = map->ops->map_peek_elem(map, value); } else { rcu_read_lock(); if (map->ops->map_lookup_elem_sys_only) @@ -870,6 +873,9 @@ static int map_update_elem(union bpf_attr *attr) /* rcu_read_lock() is not needed */ err = bpf_fd_reuseport_array_update_elem(map, key, value, attr->flags); + } else if (map->map_type == BPF_MAP_TYPE_QUEUE || + map->map_type == BPF_MAP_TYPE_STACK) { + err = map->ops->map_push_elem(map, value, attr->flags); } else { rcu_read_lock(); err = map->ops->map_update_elem(map, key, value, attr->flags); diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index e187e6b0780e..9b840ff1d1c6 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -2456,6 +2456,13 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, if (func_id != BPF_FUNC_sk_select_reuseport) goto error; break; + case BPF_MAP_TYPE_QUEUE: + case BPF_MAP_TYPE_STACK: + if (func_id != BPF_FUNC_map_peek_elem && + func_id != BPF_FUNC_map_pop_elem && + func_id != BPF_FUNC_map_push_elem) + goto error; + break; default: break; } @@ -2512,6 +2519,13 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, if (map->map_type != BPF_MAP_TYPE_REUSEPORT_SOCKARRAY) goto error; break; + case BPF_FUNC_map_peek_elem: + case BPF_FUNC_map_pop_elem: + case BPF_FUNC_map_push_elem: + if (map->map_type != BPF_MAP_TYPE_QUEUE && + map->map_type != BPF_MAP_TYPE_STACK) + goto error; + break; default: break; } @@ -2828,7 +2842,10 @@ record_func_map(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta, if (func_id != BPF_FUNC_tail_call && func_id != BPF_FUNC_map_lookup_elem && func_id != BPF_FUNC_map_update_elem && - func_id != BPF_FUNC_map_delete_elem) + func_id != BPF_FUNC_map_delete_elem && + func_id != BPF_FUNC_map_push_elem && + func_id != BPF_FUNC_map_pop_elem && + func_id != BPF_FUNC_map_peek_elem) return 0; if (meta->map_ptr == NULL) { diff --git a/net/core/filter.c b/net/core/filter.c index aab7889860c2..12bfcae4920d 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4856,6 +4856,12 @@ bpf_base_func_proto(enum bpf_func_id func_id) return &bpf_map_update_elem_proto; case BPF_FUNC_map_delete_elem: return &bpf_map_delete_elem_proto; + case BPF_FUNC_map_push_elem: + return &bpf_map_push_elem_proto; + case BPF_FUNC_map_pop_elem: + return &bpf_map_pop_elem_proto; + case BPF_FUNC_map_peek_elem: + return &bpf_map_peek_elem_proto; case BPF_FUNC_get_prandom_u32: return &bpf_get_prandom_u32_proto; case BPF_FUNC_get_smp_processor_id: -- Gitee From a9c8162e830a6c690c1613267c5e40ff1d693d6d Mon Sep 17 00:00:00 2001 From: Mauricio Vasquez B Date: Mon, 19 Jun 2023 17:43:45 +0800 Subject: [PATCH 30/39] bpf: add MAP_LOOKUP_AND_DELETE_ELEM syscall ANBZ: #5530 commit bd513cd08f10cbe28856f99ae951e86e86803861 upstream. The previous patch implemented a bpf queue/stack maps that provided the peek/pop/push functions. There is not a direct relationship between those functions and the current maps syscalls, hence a new MAP_LOOKUP_AND_DELETE_ELEM syscall is added, this is mapped to the pop operation in the queue/stack maps and it is still to implement in other kind of maps. Signed-off-by: Mauricio Vasquez B Acked-by: Song Liu Signed-off-by: Alexei Starovoitov Signed-off-by: Xuezhi Zhao Acked-by: Tianchen Ding Reviewed-by: Yuanhe Shu Acked-by: Tony Lu Link: https://gitee.com/anolis/cloud-kernel/pulls/2035 --- include/uapi/linux/bpf.h | 1 + kernel/bpf/syscall.c | 66 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 67 insertions(+) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 47a10ec97763..7509c8f2ae0a 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -103,6 +103,7 @@ enum bpf_cmd { BPF_BTF_LOAD, BPF_BTF_GET_FD_BY_ID, BPF_TASK_FD_QUERY, + BPF_MAP_LOOKUP_AND_DELETE_ELEM, }; enum bpf_map_type { diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 94dfc14d6074..afa4c1d3c3d4 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1012,6 +1012,69 @@ static int map_get_next_key(union bpf_attr *attr) return err; } +#define BPF_MAP_LOOKUP_AND_DELETE_ELEM_LAST_FIELD value + +static int map_lookup_and_delete_elem(union bpf_attr *attr) +{ + void __user *ukey = u64_to_user_ptr(attr->key); + void __user *uvalue = u64_to_user_ptr(attr->value); + int ufd = attr->map_fd; + struct bpf_map *map; + void *key, *value, *ptr; + u32 value_size; + struct fd f; + int err; + + if (CHECK_ATTR(BPF_MAP_LOOKUP_AND_DELETE_ELEM)) + return -EINVAL; + + f = fdget(ufd); + map = __bpf_map_get(f); + if (IS_ERR(map)) + return PTR_ERR(map); + + if (!(f.file->f_mode & FMODE_CAN_WRITE)) { + err = -EPERM; + goto err_put; + } + + key = __bpf_copy_key(ukey, map->key_size); + if (IS_ERR(key)) { + err = PTR_ERR(key); + goto err_put; + } + + value_size = map->value_size; + + err = -ENOMEM; + value = kmalloc(value_size, GFP_USER | __GFP_NOWARN); + if (!value) + goto free_key; + + if (map->map_type == BPF_MAP_TYPE_QUEUE || + map->map_type == BPF_MAP_TYPE_STACK) { + err = map->ops->map_pop_elem(map, value); + } else { + err = -ENOTSUPP; + } + + if (err) + goto free_value; + + if (copy_to_user(uvalue, value, value_size) != 0) + goto free_value; + + err = 0; + +free_value: + kfree(value); +free_key: + kfree(key); +err_put: + fdput(f); + return err; +} + static const struct bpf_prog_ops * const bpf_prog_types[] = { #define BPF_PROG_TYPE(_id, _name) \ [_id] = & _name ## _prog_ops, @@ -2501,6 +2564,9 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz case BPF_TASK_FD_QUERY: err = bpf_task_fd_query(&attr, uattr); break; + case BPF_MAP_LOOKUP_AND_DELETE_ELEM: + err = map_lookup_and_delete_elem(&attr); + break; default: err = -EINVAL; break; -- Gitee From 096e0f201a4d3a9fa59583d98dfe6b161807a849 Mon Sep 17 00:00:00 2001 From: Mauricio Vasquez B Date: Mon, 19 Jun 2023 17:46:33 +0800 Subject: [PATCH 31/39] Sync uapi/bpf.h to tools/include ANBZ: #5530 commit da4e1b15f67613eca7ae998abd996cfee7cff1ba upstream. Sync both files. Signed-off-by: Mauricio Vasquez B Acked-by: Song Liu Signed-off-by: Alexei Starovoitov Signed-off-by: Xuezhi Zhao Acked-by: Tianchen Ding Reviewed-by: Yuanhe Shu Acked-by: Tony Lu Link: https://gitee.com/anolis/cloud-kernel/pulls/2035 --- tools/include/uapi/linux/bpf.h | 30 +++++++++++++++++++++++++++++- 1 file changed, 29 insertions(+), 1 deletion(-) diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 9e5ee7fa9391..e914942334d0 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -103,6 +103,7 @@ enum bpf_cmd { BPF_BTF_LOAD, BPF_BTF_GET_FD_BY_ID, BPF_TASK_FD_QUERY, + BPF_MAP_LOOKUP_AND_DELETE_ELEM, }; enum bpf_map_type { @@ -128,6 +129,8 @@ enum bpf_map_type { BPF_MAP_TYPE_CGROUP_STORAGE, BPF_MAP_TYPE_REUSEPORT_SOCKARRAY, BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE, + BPF_MAP_TYPE_QUEUE, + BPF_MAP_TYPE_STACK, }; enum bpf_prog_type { @@ -462,6 +465,28 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * + * int bpf_map_push_elem(struct bpf_map *map, const void *value, u64 flags) + * Description + * Push an element *value* in *map*. *flags* is one of: + * + * **BPF_EXIST** + * If the queue/stack is full, the oldest element is removed to + * make room for this. + * Return + * 0 on success, or a negative error in case of failure. + * + * int bpf_map_pop_elem(struct bpf_map *map, void *value) + * Description + * Pop an element from *map*. + * Return + * 0 on success, or a negative error in case of failure. + * + * int bpf_map_peek_elem(struct bpf_map *map, void *value) + * Description + * Get an element from *map* without removing it. + * Return + * 0 on success, or a negative error in case of failure. + * * int bpf_probe_read(void *dst, u32 size, const void *src) * Description * For tracing programs, safely attempt to read *size* bytes from @@ -2311,7 +2336,10 @@ union bpf_attr { FN(skb_ancestor_cgroup_id), \ FN(sk_lookup_tcp), \ FN(sk_lookup_udp), \ - FN(sk_release), + FN(sk_release), \ + FN(map_push_elem), \ + FN(map_pop_elem), \ + FN(map_peek_elem), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call -- Gitee From 31745878dab20b89ae1150a463ba5f1d9339c4ed Mon Sep 17 00:00:00 2001 From: Mauricio Vasquez B Date: Thu, 13 Jul 2023 10:12:52 +0800 Subject: [PATCH 32/39] selftests/bpf: add test cases for queue and stack ANBZ: #5530 commit 43b987d23d6bd08db41a9c4a85aacfb3f0b2a94c upstream. test_maps: Tests that queue/stack maps are behaving correctly even in corner cases test_progs: Tests new ebpf helpers Signed-off-by: Mauricio Vasquez B Acked-by: Song Liu Signed-off-by: Alexei Starovoitov Signed-off-by: Xuezhi Zhao Acked-by: Tianchen Ding Reviewed-by: Yuanhe Shu Acked-by: Tony Lu Link: https://gitee.com/anolis/cloud-kernel/pulls/2035 --- tools/lib/bpf/bpf.c | 12 ++ tools/lib/bpf/bpf.h | 2 + tools/testing/selftests/bpf/Makefile | 6 +- tools/testing/selftests/bpf/bpf_helpers.h | 7 + tools/testing/selftests/bpf/test_maps.c | 122 ++++++++++++++++++ tools/testing/selftests/bpf/test_progs.c | 99 ++++++++++++++ tools/testing/selftests/bpf/test_queue_map.c | 4 + .../selftests/bpf/test_queue_stack_map.h | 59 +++++++++ tools/testing/selftests/bpf/test_stack_map.c | 4 + 9 files changed, 314 insertions(+), 1 deletion(-) create mode 100644 tools/testing/selftests/bpf/test_queue_map.c create mode 100644 tools/testing/selftests/bpf/test_queue_stack_map.h create mode 100644 tools/testing/selftests/bpf/test_stack_map.c diff --git a/tools/lib/bpf/bpf.c b/tools/lib/bpf/bpf.c index f1ba022fd659..bd003cc91f94 100644 --- a/tools/lib/bpf/bpf.c +++ b/tools/lib/bpf/bpf.c @@ -300,6 +300,18 @@ int bpf_map_lookup_elem(int fd, const void *key, void *value) return sys_bpf(BPF_MAP_LOOKUP_ELEM, &attr, sizeof(attr)); } +int bpf_map_lookup_and_delete_elem(int fd, const void *key, void *value) +{ + union bpf_attr attr; + + bzero(&attr, sizeof(attr)); + attr.map_fd = fd; + attr.key = ptr_to_u64(key); + attr.value = ptr_to_u64(value); + + return sys_bpf(BPF_MAP_LOOKUP_AND_DELETE_ELEM, &attr, sizeof(attr)); +} + int bpf_map_delete_elem(int fd, const void *key) { union bpf_attr attr; diff --git a/tools/lib/bpf/bpf.h b/tools/lib/bpf/bpf.h index b050c023bfbe..7ddc8a2d1981 100644 --- a/tools/lib/bpf/bpf.h +++ b/tools/lib/bpf/bpf.h @@ -100,6 +100,8 @@ LIBBPF_API int bpf_map_update_elem(int fd, const void *key, const void *value, __u64 flags); LIBBPF_API int bpf_map_lookup_elem(int fd, const void *key, void *value); +LIBBPF_API int bpf_map_lookup_and_delete_elem(int fd, const void *key, + void *value); LIBBPF_API int bpf_map_delete_elem(int fd, const void *key); LIBBPF_API int bpf_map_get_next_key(int fd, const void *key, void *next_key); LIBBPF_API int bpf_obj_pin(int fd, const char *pathname); diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index 1dec3ab2fcde..cb0a9e90b1f8 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -36,7 +36,8 @@ TEST_GEN_FILES = test_pkt_access.o test_xdp.o test_l4lb.o test_tcp_estats.o test test_get_stack_rawtp.o test_sockmap_kern.o test_sockhash_kern.o \ test_lwt_seg6local.o sendmsg4_prog.o sendmsg6_prog.o test_lirc_mode2_kern.o \ get_cgroup_id_kern.o socket_cookie_prog.o test_select_reuseport_kern.o \ - test_skb_cgroup_id_kern.o bpf_flow.o netcnt_prog.o test_sk_lookup_kern.o + test_skb_cgroup_id_kern.o bpf_flow.o netcnt_prog.o test_sk_lookup_kern.o \ + test_queue_map.o test_stack_map.o # Order correspond to 'make run_tests' order TEST_PROGS := test_kmod.sh \ @@ -117,6 +118,9 @@ CLANG_FLAGS = -I. -I./include/uapi -I../../../include/uapi \ $(OUTPUT)/test_l4lb_noinline.o: CLANG_FLAGS += -fno-inline $(OUTPUT)/test_xdp_noinline.o: CLANG_FLAGS += -fno-inline +$(OUTPUT)/test_queue_map.o: test_queue_stack_map.h +$(OUTPUT)/test_stack_map.o: test_queue_stack_map.h + BTF_LLC_PROBE := $(shell $(LLC) -march=bpf -mattr=help 2>&1 | grep dwarfris) BTF_PAHOLE_PROBE := $(shell $(BTF_PAHOLE) --help 2>&1 | grep BTF) BTF_OBJCOPY_PROBE := $(shell $(LLVM_OBJCOPY) --help 2>&1 | grep -i 'usage.*llvm') diff --git a/tools/testing/selftests/bpf/bpf_helpers.h b/tools/testing/selftests/bpf/bpf_helpers.h index 8962331ed05f..f51da393dc22 100644 --- a/tools/testing/selftests/bpf/bpf_helpers.h +++ b/tools/testing/selftests/bpf/bpf_helpers.h @@ -16,6 +16,13 @@ static int (*bpf_map_update_elem)(void *map, void *key, void *value, (void *) BPF_FUNC_map_update_elem; static int (*bpf_map_delete_elem)(void *map, void *key) = (void *) BPF_FUNC_map_delete_elem; +static int (*bpf_map_push_elem)(void *map, void *value, + unsigned long long flags) = + (void *) BPF_FUNC_map_push_elem; +static int (*bpf_map_pop_elem)(void *map, void *value) = + (void *) BPF_FUNC_map_pop_elem; +static int (*bpf_map_peek_elem)(void *map, void *value) = + (void *) BPF_FUNC_map_peek_elem; static int (*bpf_probe_read)(void *dst, int size, void *unsafe_ptr) = (void *) BPF_FUNC_probe_read; static unsigned long long (*bpf_ktime_get_ns)(void) = diff --git a/tools/testing/selftests/bpf/test_maps.c b/tools/testing/selftests/bpf/test_maps.c index e5e62cb57ff7..7cab5d743dfc 100644 --- a/tools/testing/selftests/bpf/test_maps.c +++ b/tools/testing/selftests/bpf/test_maps.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include @@ -471,6 +472,122 @@ static void test_devmap(int task, void *data) close(fd); } +static void test_queuemap(int task, void *data) +{ + const int MAP_SIZE = 32; + __u32 vals[MAP_SIZE + MAP_SIZE/2], val; + int fd, i; + + /* Fill test values to be used */ + for (i = 0; i < MAP_SIZE + MAP_SIZE/2; i++) + vals[i] = rand(); + + /* Invalid key size */ + fd = bpf_create_map(BPF_MAP_TYPE_QUEUE, 4, sizeof(val), MAP_SIZE, + map_flags); + assert(fd < 0 && errno == EINVAL); + + fd = bpf_create_map(BPF_MAP_TYPE_QUEUE, 0, sizeof(val), MAP_SIZE, + map_flags); + /* Queue map does not support BPF_F_NO_PREALLOC */ + if (map_flags & BPF_F_NO_PREALLOC) { + assert(fd < 0 && errno == EINVAL); + return; + } + if (fd < 0) { + printf("Failed to create queuemap '%s'!\n", strerror(errno)); + exit(1); + } + + /* Push MAP_SIZE elements */ + for (i = 0; i < MAP_SIZE; i++) + assert(bpf_map_update_elem(fd, NULL, &vals[i], 0) == 0); + + /* Check that element cannot be pushed due to max_entries limit */ + assert(bpf_map_update_elem(fd, NULL, &val, 0) == -1 && + errno == E2BIG); + + /* Peek element */ + assert(bpf_map_lookup_elem(fd, NULL, &val) == 0 && val == vals[0]); + + /* Replace half elements */ + for (i = MAP_SIZE; i < MAP_SIZE + MAP_SIZE/2; i++) + assert(bpf_map_update_elem(fd, NULL, &vals[i], BPF_EXIST) == 0); + + /* Pop all elements */ + for (i = MAP_SIZE/2; i < MAP_SIZE + MAP_SIZE/2; i++) + assert(bpf_map_lookup_and_delete_elem(fd, NULL, &val) == 0 && + val == vals[i]); + + /* Check that there are not elements left */ + assert(bpf_map_lookup_and_delete_elem(fd, NULL, &val) == -1 && + errno == ENOENT); + + /* Check that non supported functions set errno to EINVAL */ + assert(bpf_map_delete_elem(fd, NULL) == -1 && errno == EINVAL); + assert(bpf_map_get_next_key(fd, NULL, NULL) == -1 && errno == EINVAL); + + close(fd); +} + +static void test_stackmap(int task, void *data) +{ + const int MAP_SIZE = 32; + __u32 vals[MAP_SIZE + MAP_SIZE/2], val; + int fd, i; + + /* Fill test values to be used */ + for (i = 0; i < MAP_SIZE + MAP_SIZE/2; i++) + vals[i] = rand(); + + /* Invalid key size */ + fd = bpf_create_map(BPF_MAP_TYPE_STACK, 4, sizeof(val), MAP_SIZE, + map_flags); + assert(fd < 0 && errno == EINVAL); + + fd = bpf_create_map(BPF_MAP_TYPE_STACK, 0, sizeof(val), MAP_SIZE, + map_flags); + /* Stack map does not support BPF_F_NO_PREALLOC */ + if (map_flags & BPF_F_NO_PREALLOC) { + assert(fd < 0 && errno == EINVAL); + return; + } + if (fd < 0) { + printf("Failed to create stackmap '%s'!\n", strerror(errno)); + exit(1); + } + + /* Push MAP_SIZE elements */ + for (i = 0; i < MAP_SIZE; i++) + assert(bpf_map_update_elem(fd, NULL, &vals[i], 0) == 0); + + /* Check that element cannot be pushed due to max_entries limit */ + assert(bpf_map_update_elem(fd, NULL, &val, 0) == -1 && + errno == E2BIG); + + /* Peek element */ + assert(bpf_map_lookup_elem(fd, NULL, &val) == 0 && val == vals[i - 1]); + + /* Replace half elements */ + for (i = MAP_SIZE; i < MAP_SIZE + MAP_SIZE/2; i++) + assert(bpf_map_update_elem(fd, NULL, &vals[i], BPF_EXIST) == 0); + + /* Pop all elements */ + for (i = MAP_SIZE + MAP_SIZE/2 - 1; i >= MAP_SIZE/2; i--) + assert(bpf_map_lookup_and_delete_elem(fd, NULL, &val) == 0 && + val == vals[i]); + + /* Check that there are not elements left */ + assert(bpf_map_lookup_and_delete_elem(fd, NULL, &val) == -1 && + errno == ENOENT); + + /* Check that non supported functions set errno to EINVAL */ + assert(bpf_map_delete_elem(fd, NULL) == -1 && errno == EINVAL); + assert(bpf_map_get_next_key(fd, NULL, NULL) == -1 && errno == EINVAL); + + close(fd); +} + #include #include #include @@ -1434,10 +1551,15 @@ static void run_all_tests(void) test_map_wronly(); test_reuseport_array(); + + test_queuemap(0, NULL); + test_stackmap(0, NULL); } int main(void) { + srand(time(NULL)); + map_flags = 0; run_all_tests(); diff --git a/tools/testing/selftests/bpf/test_progs.c b/tools/testing/selftests/bpf/test_progs.c index fb540beef324..3ec4ce156074 100644 --- a/tools/testing/selftests/bpf/test_progs.c +++ b/tools/testing/selftests/bpf/test_progs.c @@ -1765,8 +1765,105 @@ static void test_reference_tracking() bpf_object__close(obj); } +enum { + QUEUE, + STACK, +}; + +static void test_queue_stack_map(int type) +{ + const int MAP_SIZE = 32; + __u32 vals[MAP_SIZE], duration, retval, size, val; + int i, err, prog_fd, map_in_fd, map_out_fd; + char file[32], buf[128]; + struct bpf_object *obj; + struct iphdr *iph = (void *)buf + sizeof(struct ethhdr); + + /* Fill test values to be used */ + for (i = 0; i < MAP_SIZE; i++) + vals[i] = rand(); + + if (type == QUEUE) + strncpy(file, "./test_queue_map.o", sizeof(file)); + else if (type == STACK) + strncpy(file, "./test_stack_map.o", sizeof(file)); + else + return; + + err = bpf_prog_load(file, BPF_PROG_TYPE_SCHED_CLS, &obj, &prog_fd); + if (err) { + error_cnt++; + return; + } + + map_in_fd = bpf_find_map(__func__, obj, "map_in"); + if (map_in_fd < 0) + goto out; + + map_out_fd = bpf_find_map(__func__, obj, "map_out"); + if (map_out_fd < 0) + goto out; + + /* Push 32 elements to the input map */ + for (i = 0; i < MAP_SIZE; i++) { + err = bpf_map_update_elem(map_in_fd, NULL, &vals[i], 0); + if (err) { + error_cnt++; + goto out; + } + } + + /* The eBPF program pushes iph.saddr in the output map, + * pops the input map and saves this value in iph.daddr + */ + for (i = 0; i < MAP_SIZE; i++) { + if (type == QUEUE) { + val = vals[i]; + pkt_v4.iph.saddr = vals[i] * 5; + } else if (type == STACK) { + val = vals[MAP_SIZE - 1 - i]; + pkt_v4.iph.saddr = vals[MAP_SIZE - 1 - i] * 5; + } + + err = bpf_prog_test_run(prog_fd, 1, &pkt_v4, sizeof(pkt_v4), + buf, &size, &retval, &duration); + if (err || retval || size != sizeof(pkt_v4) || + iph->daddr != val) + break; + } + + CHECK(err || retval || size != sizeof(pkt_v4) || iph->daddr != val, + "bpf_map_pop_elem", + "err %d errno %d retval %d size %d iph->daddr %u\n", + err, errno, retval, size, iph->daddr); + + /* Queue is empty, program should return TC_ACT_SHOT */ + err = bpf_prog_test_run(prog_fd, 1, &pkt_v4, sizeof(pkt_v4), + buf, &size, &retval, &duration); + CHECK(err || retval != 2 /* TC_ACT_SHOT */|| size != sizeof(pkt_v4), + "check-queue-stack-map-empty", + "err %d errno %d retval %d size %d\n", + err, errno, retval, size); + + /* Check that the program pushed elements correctly */ + for (i = 0; i < MAP_SIZE; i++) { + err = bpf_map_lookup_and_delete_elem(map_out_fd, NULL, &val); + if (err || val != vals[i] * 5) + break; + } + + CHECK(i != MAP_SIZE && (err || val != vals[i] * 5), + "bpf_map_push_elem", "err %d value %u\n", err, val); + +out: + pkt_v4.iph.saddr = 0; + bpf_object__close(obj); +} + int main(void) { + srand(time(NULL)); + jit_enabled = is_jit_enabled(); test_pkt_access(); @@ -1787,6 +1884,8 @@ int main(void) test_task_fd_query_rawtp(); test_task_fd_query_tp(); test_reference_tracking(); + test_queue_stack_map(QUEUE); + test_queue_stack_map(STACK); printf("Summary: %d PASSED, %d FAILED\n", pass_cnt, error_cnt); return error_cnt ? EXIT_FAILURE : EXIT_SUCCESS; diff --git a/tools/testing/selftests/bpf/test_queue_map.c b/tools/testing/selftests/bpf/test_queue_map.c new file mode 100644 index 000000000000..87db1f9da33d --- /dev/null +++ b/tools/testing/selftests/bpf/test_queue_map.c @@ -0,0 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (c) 2018 Politecnico di Torino +#define MAP_TYPE BPF_MAP_TYPE_QUEUE +#include "test_queue_stack_map.h" diff --git a/tools/testing/selftests/bpf/test_queue_stack_map.h b/tools/testing/selftests/bpf/test_queue_stack_map.h new file mode 100644 index 000000000000..295b9b3bc5c7 --- /dev/null +++ b/tools/testing/selftests/bpf/test_queue_stack_map.h @@ -0,0 +1,59 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +// Copyright (c) 2018 Politecnico di Torino +#include +#include +#include +#include +#include +#include +#include "bpf_helpers.h" + +int _version SEC("version") = 1; + +struct bpf_map_def __attribute__ ((section("maps"), used)) map_in = { + .type = MAP_TYPE, + .key_size = 0, + .value_size = sizeof(__u32), + .max_entries = 32, + .map_flags = 0, +}; + +struct bpf_map_def __attribute__ ((section("maps"), used)) map_out = { + .type = MAP_TYPE, + .key_size = 0, + .value_size = sizeof(__u32), + .max_entries = 32, + .map_flags = 0, +}; + +SEC("test") +int _test(struct __sk_buff *skb) +{ + void *data_end = (void *)(long)skb->data_end; + void *data = (void *)(long)skb->data; + struct ethhdr *eth = (struct ethhdr *)(data); + __u32 value; + int err; + + if (eth + 1 > data_end) + return TC_ACT_SHOT; + + struct iphdr *iph = (struct iphdr *)(eth + 1); + + if (iph + 1 > data_end) + return TC_ACT_SHOT; + + err = bpf_map_pop_elem(&map_in, &value); + if (err) + return TC_ACT_SHOT; + + iph->daddr = value; + + err = bpf_map_push_elem(&map_out, &iph->saddr, 0); + if (err) + return TC_ACT_SHOT; + + return TC_ACT_OK; +} + +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/test_stack_map.c b/tools/testing/selftests/bpf/test_stack_map.c new file mode 100644 index 000000000000..31c3880e6da0 --- /dev/null +++ b/tools/testing/selftests/bpf/test_stack_map.c @@ -0,0 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (c) 2018 Politecnico di Torino +#define MAP_TYPE BPF_MAP_TYPE_STACK +#include "test_queue_stack_map.h" -- Gitee From cc6f5354288e49b875046f468bf128ff5beb55f6 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Tue, 20 Jun 2023 10:16:24 +0800 Subject: [PATCH 33/39] bpf: remove unused variable MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ANBZ: #5530 commit 540fefc08f75aedb517acbf525d393b8efddabd9 upstream. fix the following warning ../kernel/bpf/syscall.c: In function ‘map_lookup_and_delete_elem’: ../kernel/bpf/syscall.c:1010:22: warning: unused variable ‘ptr’ [-Wunused-variable] void *key, *value, *ptr; ^~~ Fixes: bd513cd08f10 ("bpf: add MAP_LOOKUP_AND_DELETE_ELEM syscall") Signed-off-by: Alexei Starovoitov Signed-off-by: Xuezhi Zhao Acked-by: Tianchen Ding Reviewed-by: Yuanhe Shu Acked-by: Tony Lu Link: https://gitee.com/anolis/cloud-kernel/pulls/2035 --- kernel/bpf/syscall.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index afa4c1d3c3d4..cd1987c80d91 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1020,7 +1020,7 @@ static int map_lookup_and_delete_elem(union bpf_attr *attr) void __user *uvalue = u64_to_user_ptr(attr->value); int ufd = attr->map_fd; struct bpf_map *map; - void *key, *value, *ptr; + void *key, *value; u32 value_size; struct fd f; int err; -- Gitee From e5dd3492209369cd395757e566150560187669b5 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 20 Jun 2023 10:29:25 +0800 Subject: [PATCH 34/39] bpf: fix direct packet write into pop/peek helpers ANBZ: #5530 commit 80b0d86a176cab6201719b8dfd806902b0c6e046 upstream. Commit f1a2e44a3aec ("bpf: add queue and stack maps") probably just copy-pasted .pkt_access for bpf_map_{pop,peek}_elem() helpers, but this is buggy in this context since it would allow writes into cloned skbs which is invalid. Therefore, disable .pkt_access for the two. Fixes: f1a2e44a3aec ("bpf: add queue and stack maps") Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Cc: Mauricio Vasquez B Acked-by: Mauricio Vasquez B Signed-off-by: Alexei Starovoitov Signed-off-by: Xuezhi Zhao Acked-by: Tianchen Ding Reviewed-by: Yuanhe Shu Acked-by: Tony Lu Link: https://gitee.com/anolis/cloud-kernel/pulls/2035 --- kernel/bpf/helpers.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index ab0d5e3f9892..a74972b07e74 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -99,7 +99,6 @@ BPF_CALL_2(bpf_map_pop_elem, struct bpf_map *, map, void *, value) const struct bpf_func_proto bpf_map_pop_elem_proto = { .func = bpf_map_pop_elem, .gpl_only = false, - .pkt_access = true, .ret_type = RET_INTEGER, .arg1_type = ARG_CONST_MAP_PTR, .arg2_type = ARG_PTR_TO_UNINIT_MAP_VALUE, @@ -113,7 +112,6 @@ BPF_CALL_2(bpf_map_peek_elem, struct bpf_map *, map, void *, value) const struct bpf_func_proto bpf_map_peek_elem_proto = { .func = bpf_map_pop_elem, .gpl_only = false, - .pkt_access = true, .ret_type = RET_INTEGER, .arg1_type = ARG_CONST_MAP_PTR, .arg2_type = ARG_PTR_TO_UNINIT_MAP_VALUE, -- Gitee From df16ee48a2e4757d68733658ea0d6bad1d8497f9 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 20 Jun 2023 10:22:01 +0800 Subject: [PATCH 35/39] bpf: fix leaking uninitialized memory on pop/peek helpers ANBZ: #5530 commit d3f66e4116aff8dd0d5bd4067295b9ddb5e2c29c upstream. Commit f1a2e44a3aec ("bpf: add queue and stack maps") added helpers with ARG_PTR_TO_UNINIT_MAP_VALUE. Meaning, the helper is supposed to fill the map value buffer with data instead of reading from it like in other helpers such as map update. However, given the buffer is allowed to be uninitialized (since we fill it in the helper anyway), it also means that the helper is obliged to wipe the memory in case of an error in order to not allow for leaking uninitialized memory. Given pop/peek is both handled inside __{stack,queue}_map_get(), lets wipe it there on error case, that is, empty stack/queue. Fixes: f1a2e44a3aec ("bpf: add queue and stack maps") Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Cc: Mauricio Vasquez B Acked-by: Mauricio Vasquez B Signed-off-by: Alexei Starovoitov Signed-off-by: Xuezhi Zhao Acked-by: Tianchen Ding Reviewed-by: Yuanhe Shu Acked-by: Tony Lu Link: https://gitee.com/anolis/cloud-kernel/pulls/2035 --- kernel/bpf/queue_stack_maps.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/bpf/queue_stack_maps.c b/kernel/bpf/queue_stack_maps.c index 12a93fb37449..8bbd72d3a121 100644 --- a/kernel/bpf/queue_stack_maps.c +++ b/kernel/bpf/queue_stack_maps.c @@ -122,6 +122,7 @@ static int __queue_map_get(struct bpf_map *map, void *value, bool delete) raw_spin_lock_irqsave(&qs->lock, flags); if (queue_stack_map_is_empty(qs)) { + memset(value, 0, qs->map.value_size); err = -ENOENT; goto out; } @@ -151,6 +152,7 @@ static int __stack_map_get(struct bpf_map *map, void *value, bool delete) raw_spin_lock_irqsave(&qs->lock, flags); if (queue_stack_map_is_empty(qs)) { + memset(value, 0, qs->map.value_size); err = -ENOENT; goto out; } -- Gitee From 2b3fef9bcacfb6b6b94a14918bdf200298386e41 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Tue, 20 Jun 2023 10:40:33 +0800 Subject: [PATCH 36/39] bpf: fix integer overflow in queue_stack_map ANBZ: #5530 commit 813961de3ee6474dd5703e883471fd941d6c8f69 upstream. Fix the following issues: - allow queue_stack_map for root only - fix u32 max_entries overflow - disallow value_size == 0 Fixes: f1a2e44a3aec ("bpf: add queue and stack maps") Reported-by: Wei Wu Signed-off-by: Alexei Starovoitov Cc: Mauricio Vasquez B Signed-off-by: Daniel Borkmann Signed-off-by: Xuezhi Zhao Acked-by: Tianchen Ding Reviewed-by: Yuanhe Shu Acked-by: Tony Lu Link: https://gitee.com/anolis/cloud-kernel/pulls/2035 --- kernel/bpf/queue_stack_maps.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/kernel/bpf/queue_stack_maps.c b/kernel/bpf/queue_stack_maps.c index 8bbd72d3a121..b384ea9f3254 100644 --- a/kernel/bpf/queue_stack_maps.c +++ b/kernel/bpf/queue_stack_maps.c @@ -7,6 +7,7 @@ #include #include #include +#include #include "percpu_freelist.h" #define QUEUE_STACK_CREATE_FLAG_MASK \ @@ -45,8 +46,12 @@ static bool queue_stack_map_is_full(struct bpf_queue_stack *qs) /* Called from syscall */ static int queue_stack_map_alloc_check(union bpf_attr *attr) { + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + /* check sanity of attributes */ if (attr->max_entries == 0 || attr->key_size != 0 || + attr->value_size == 0 || attr->map_flags & ~QUEUE_STACK_CREATE_FLAG_MASK) return -EINVAL; @@ -63,15 +68,10 @@ static struct bpf_map *queue_stack_map_alloc(union bpf_attr *attr) { int ret, numa_node = bpf_map_attr_numa_node(attr); struct bpf_queue_stack *qs; - u32 size, value_size; - u64 queue_size, cost; - - size = attr->max_entries + 1; - value_size = attr->value_size; - - queue_size = sizeof(*qs) + (u64) value_size * size; + u64 size, queue_size, cost; - cost = queue_size; + size = (u64) attr->max_entries + 1; + cost = queue_size = sizeof(*qs) + size * attr->value_size; if (cost >= U32_MAX - PAGE_SIZE) return ERR_PTR(-E2BIG); -- Gitee From a8bca9ed43f23a2e219854b7815e136273c61694 Mon Sep 17 00:00:00 2001 From: Alban Crequy Date: Sun, 14 Apr 2019 18:58:46 +0200 Subject: [PATCH 37/39] bpf: add map helper functions push, pop, peek in more BPF programs ANBZ: #5530 commit 02a8c817a31606b6b37c2b755f6569903f44241e upstream. commit f1a2e44a3aec ("bpf: add queue and stack maps") introduced new BPF helper functions: - BPF_FUNC_map_push_elem - BPF_FUNC_map_pop_elem - BPF_FUNC_map_peek_elem but they were made available only for network BPF programs. This patch makes them available for tracepoint, cgroup and lirc programs. Signed-off-by: Alban Crequy Cc: Mauricio Vasquez B Acked-by: Song Liu Signed-off-by: Daniel Borkmann Signed-off-by: Xuezhi Zhao Acked-by: Tianchen Ding Reviewed-by: Yuanhe Shu Acked-by: Tony Lu Link: https://gitee.com/anolis/cloud-kernel/pulls/2035 --- drivers/media/rc/bpf-lirc.c | 6 ++++++ kernel/bpf/cgroup.c | 6 ++++++ kernel/trace/bpf_trace.c | 6 ++++++ 3 files changed, 18 insertions(+) diff --git a/drivers/media/rc/bpf-lirc.c b/drivers/media/rc/bpf-lirc.c index 8b97fd1f0cea..28be17ce53c1 100644 --- a/drivers/media/rc/bpf-lirc.c +++ b/drivers/media/rc/bpf-lirc.c @@ -73,6 +73,12 @@ lirc_mode2_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_map_update_elem_proto; case BPF_FUNC_map_delete_elem: return &bpf_map_delete_elem_proto; + case BPF_FUNC_map_push_elem: + return &bpf_map_push_elem_proto; + case BPF_FUNC_map_pop_elem: + return &bpf_map_pop_elem_proto; + case BPF_FUNC_map_peek_elem: + return &bpf_map_peek_elem_proto; case BPF_FUNC_ktime_get_ns: return &bpf_ktime_get_ns_proto; case BPF_FUNC_tail_call: diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 897a9505bb76..6f1734e82901 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -703,6 +703,12 @@ cgroup_dev_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_map_update_elem_proto; case BPF_FUNC_map_delete_elem: return &bpf_map_delete_elem_proto; + case BPF_FUNC_map_push_elem: + return &bpf_map_push_elem_proto; + case BPF_FUNC_map_pop_elem: + return &bpf_map_pop_elem_proto; + case BPF_FUNC_map_peek_elem: + return &bpf_map_peek_elem_proto; case BPF_FUNC_get_current_uid_gid: return &bpf_get_current_uid_gid_proto; case BPF_FUNC_get_local_storage: diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 83c4e76f513a..3fe987fae55d 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -557,6 +557,12 @@ tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_map_update_elem_proto; case BPF_FUNC_map_delete_elem: return &bpf_map_delete_elem_proto; + case BPF_FUNC_map_push_elem: + return &bpf_map_push_elem_proto; + case BPF_FUNC_map_pop_elem: + return &bpf_map_pop_elem_proto; + case BPF_FUNC_map_peek_elem: + return &bpf_map_peek_elem_proto; case BPF_FUNC_probe_read: return &bpf_probe_read_proto; case BPF_FUNC_ktime_get_ns: -- Gitee From 4e0be99b0d17011454e98bb299b30e772d53b4e9 Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Tue, 20 Jun 2023 10:48:19 +0800 Subject: [PATCH 38/39] bpf: Fix error return code in map_lookup_and_delete_elem() ANBZ: #5530 commit 7f645462ca01d01abb94d75e6768c8b3ed3a188b upstream. Fix to return negative error code -EFAULT from the copy_to_user() error handling case instead of 0, as done elsewhere in this function. Fixes: bd513cd08f10 ("bpf: add MAP_LOOKUP_AND_DELETE_ELEM syscall") Signed-off-by: Wei Yongjun Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20200430081851.166996-1-weiyongjun1@huawei.com Signed-off-by: Xuezhi Zhao Acked-by: Tianchen Ding Reviewed-by: Yuanhe Shu Acked-by: Tony Lu Link: https://gitee.com/anolis/cloud-kernel/pulls/2035 --- kernel/bpf/syscall.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index cd1987c80d91..87f3b28cc40e 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1061,8 +1061,10 @@ static int map_lookup_and_delete_elem(union bpf_attr *attr) if (err) goto free_value; - if (copy_to_user(uvalue, value, value_size) != 0) + if (copy_to_user(uvalue, value, value_size) != 0) { + err = -EFAULT; goto free_value; + } err = 0; -- Gitee From d58b3a4f1b795019424720517fe614be1299ffa7 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Tue, 20 Jun 2023 10:16:24 +0800 Subject: [PATCH 39/39] bpf: remove unused variable MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ANBZ: #5530 commit 540fefc08f75aedb517acbf525d393b8efddabd9 upstream. fix the following warning ../kernel/bpf/syscall.c: In function ‘map_lookup_and_delete_elem’: ../kernel/bpf/syscall.c:1010:22: warning: unused variable ‘ptr’ [-Wunused-variable] void *key, *value, *ptr; ^~~ Fixes: bd513cd08f10 ("bpf: add MAP_LOOKUP_AND_DELETE_ELEM syscall") Signed-off-by: Alexei Starovoitov Signed-off-by: Xuezhi Zhao Acked-by: Tianchen Ding Reviewed-by: Yuanhe Shu Acked-by: Tony Lu Link: https://gitee.com/anolis/cloud-kernel/pulls/2035 --- kernel/bpf/helpers.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index a74972b07e74..f6f0d6d272e8 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -110,7 +110,7 @@ BPF_CALL_2(bpf_map_peek_elem, struct bpf_map *, map, void *, value) } const struct bpf_func_proto bpf_map_peek_elem_proto = { - .func = bpf_map_pop_elem, + .func = bpf_map_peek_elem, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_CONST_MAP_PTR, -- Gitee