diff --git a/arch/alpha/include/uapi/asm/socket.h b/arch/alpha/include/uapi/asm/socket.h
index de6c4df610826c262fadadfb5133ada318d2c2bd..d033d3f92d6d539baecb702da1cf378e9c0245aa 100644
--- a/arch/alpha/include/uapi/asm/socket.h
+++ b/arch/alpha/include/uapi/asm/socket.h
@@ -124,6 +124,8 @@
 
 #define SO_DETACH_REUSEPORT_BPF	68
 
+#define SO_NETNS_COOKIE		71
+
 #if !defined(__KERNEL__)
 
 #if __BITS_PER_LONG == 64
diff --git a/arch/mips/include/uapi/asm/socket.h b/arch/mips/include/uapi/asm/socket.h
index d0a9ed2ca2d6831856daedb3275e7d0a841209a0..ff3ab771e76970ae84e1cee3a189f273421471aa 100644
--- a/arch/mips/include/uapi/asm/socket.h
+++ b/arch/mips/include/uapi/asm/socket.h
@@ -135,6 +135,8 @@
 
 #define SO_DETACH_REUSEPORT_BPF	68
 
+#define SO_NETNS_COOKIE		71
+
 #if !defined(__KERNEL__)
 
 #if __BITS_PER_LONG == 64
diff --git a/arch/parisc/include/uapi/asm/socket.h b/arch/parisc/include/uapi/asm/socket.h
index 10173c32195e4cdd75b7436ea5ef19ff2cfa0644..1a8ec3838c9b9e009c4e423a0bceb8cd80296173 100644
--- a/arch/parisc/include/uapi/asm/socket.h
+++ b/arch/parisc/include/uapi/asm/socket.h
@@ -116,6 +116,8 @@
 
 #define SO_DETACH_REUSEPORT_BPF	0x4042
 
+#define SO_NETNS_COOKIE		0x4045
+
 #if !defined(__KERNEL__)
 
 #if __BITS_PER_LONG == 64
diff --git a/arch/sparc/include/uapi/asm/socket.h b/arch/sparc/include/uapi/asm/socket.h
index 8029b681fc7ca142155666008eebaaf44262f541..08f9bbbf5bf2b3ddc0f09c7bc4fcdcb6015597ad 100644
--- a/arch/sparc/include/uapi/asm/socket.h
+++ b/arch/sparc/include/uapi/asm/socket.h
@@ -117,6 +117,8 @@
 
 #define SO_DETACH_REUSEPORT_BPF	0x0047
 
+#define SO_NETNS_COOKIE		0x0050
+
 #if !defined(__KERNEL__)
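All the socket.h copies (the asm-generic one appears further down) expose the same read-only option; writing it is not supported. For reference, a minimal userspace reader, assuming the updated uapi headers are installed (not part of this patch):

#include <stdio.h>
#include <stdint.h>
#include <sys/socket.h>

#ifndef SO_NETNS_COOKIE
#define SO_NETNS_COOKIE 71	/* asm-generic value; some arches differ */
#endif

int main(void)
{
	uint64_t cookie;
	socklen_t len = sizeof(cookie);
	int fd = socket(AF_INET, SOCK_STREAM, 0);

	/* sock_getsockopt() rejects any other length with -EINVAL */
	if (fd >= 0 && !getsockopt(fd, SOL_SOCKET, SO_NETNS_COOKIE, &cookie, &len))
		printf("netns cookie: %llu\n", (unsigned long long)cookie);
	return 0;
}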
diff --git a/arch/x86/configs/openeuler_defconfig b/arch/x86/configs/openeuler_defconfig
index 3c9d3d4e3964c9316db1ee4b1dd00dfcd15f28e6..6cc185f7b8e90bb9fa5aa1d147f1ba177f675a93 100644
--- a/arch/x86/configs/openeuler_defconfig
+++ b/arch/x86/configs/openeuler_defconfig
@@ -1927,6 +1927,7 @@ CONFIG_PAGE_POOL=y
 # CONFIG_PAGE_POOL_STATS is not set
 CONFIG_FAILOVER=m
 CONFIG_ETHTOOL_NETLINK=y
+# CONFIG_BPF_NET_GLOBAL_PROG is not set
 CONFIG_HAVE_EBPF_JIT=y
 
 #
diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h
index 5732b485c53991f6c92322cd0a788e83d08b2349..57954e35fd36e970ae3258eb5855643d690e196d 100644
--- a/include/linux/bpf_types.h
+++ b/include/linux/bpf_types.h
@@ -81,6 +81,10 @@ BPF_PROG_TYPE(BPF_PROG_TYPE_LSM, lsm,
 BPF_PROG_TYPE(BPF_PROG_TYPE_SCHED, bpf_sched,
	      void *, void *)
 #endif /* CONFIG_BPF_SCHED */
+#ifdef CONFIG_BPF_NET_GLOBAL_PROG
+BPF_PROG_TYPE(BPF_PROG_TYPE_NET_GLOBAL, bpf_gnet,
+	      struct bpf_gnet_ctx, struct bpf_gnet_ctx)
+#endif
 
 BPF_MAP_TYPE(BPF_MAP_TYPE_ARRAY, array_map_ops)
 BPF_MAP_TYPE(BPF_MAP_TYPE_PERCPU_ARRAY, percpu_array_map_ops)
diff --git a/include/linux/filter.h b/include/linux/filter.h
index 4479a49a4f7ce7a779ae68ca9921bfd672c8561e..2a0c2dd475998e77a463cf3970cf6af59b42605d 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -1474,4 +1474,72 @@ static inline bool bpf_sk_lookup_run_v6(struct net *net, int protocol,
 }
 #endif /* IS_ENABLED(CONFIG_IPV6) */
 
+#ifdef CONFIG_BPF_NET_GLOBAL_PROG
+struct bpf_gnet_ctx_kern {
+	struct sock *sk;
+	int curr_tid;
+	int peer_tid;
+	int numa_node;
+	__u64 rxtx_bytes;
+	int rx_dev_idx;
+	int rx_dev_queue_idx;
+	__u64 rx_dev_netns_cookie;
+};
+
+enum gnet_bpf_attach_type {
+	GNET_BPF_ATTACH_TYPE_INVALID = -1,
+	GNET_TCP_RECVMSG = 0,
+	GNET_SK_DST_SET,
+	GNET_RCV_NIC_NODE,
+	GNET_SEND_NIC_NODE,
+	MAX_GNET_BPF_ATTACH_TYPE
+};
+
+static inline enum gnet_bpf_attach_type
+to_gnet_bpf_attach_type(enum bpf_attach_type attach_type)
+{
+	switch (attach_type) {
+	case BPF_GNET_TCP_RECVMSG:
+		return GNET_TCP_RECVMSG;
+	case BPF_GNET_SK_DST_SET:
+		return GNET_SK_DST_SET;
+	case BPF_GNET_RCV_NIC_NODE:
+		return GNET_RCV_NIC_NODE;
+	case BPF_GNET_SEND_NIC_NODE:
+		return GNET_SEND_NIC_NODE;
+	default:
+		return GNET_BPF_ATTACH_TYPE_INVALID;
+	}
+}
+
+struct gnet_bpf {
+	struct bpf_prog __rcu *progs[MAX_GNET_BPF_ATTACH_TYPE];
+	u32 flags[MAX_GNET_BPF_ATTACH_TYPE];
+};
+
+extern struct static_key_false gnet_bpf_enabled_key[MAX_GNET_BPF_ATTACH_TYPE];
+#define gnet_bpf_enabled(atype) static_branch_unlikely(&gnet_bpf_enabled_key[atype])
+extern struct gnet_bpf gnet_bpf_progs;
+
+int gnet_bpf_prog_attach(const union bpf_attr *attr,
+			 enum bpf_prog_type ptype, struct bpf_prog *prog);
+int gnet_bpf_prog_detach(const union bpf_attr *attr, enum bpf_prog_type ptype);
+
+static inline void run_gnet_bpf(enum gnet_bpf_attach_type atype,
+				struct bpf_gnet_ctx_kern *ctx)
+{
+	struct bpf_prog *prog;
+
+	rcu_read_lock();
+	prog = rcu_dereference(gnet_bpf_progs.progs[atype]);
+	if (unlikely(!prog))
+		goto out;
+
+	bpf_prog_run_pin_on_cpu(prog, ctx);
+out:
+	rcu_read_unlock();
+}
+
+#endif
+
 #endif /* __LINUX_FILTER_H__ */
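The intended calling pattern at a hook site: test the per-type static key first, so a disabled hook costs a single patched branch, then fill a stack context and call run_gnet_bpf() under RCU. A sketch with a hypothetical hook, not taken from this patch:

/* Hypothetical illustration of a GNET_TCP_RECVMSG hook site. */
static inline void example_gnet_hook(struct sock *sk, u64 bytes)
{
	struct bpf_gnet_ctx_kern ctx = {0};

	if (!gnet_bpf_enabled(GNET_TCP_RECVMSG))	/* static key, ~free when off */
		return;

	ctx.sk = sk;
	ctx.curr_tid = task_pid_nr(current);
	ctx.rxtx_bytes = bytes;
	run_gnet_bpf(GNET_TCP_RECVMSG, &ctx);		/* RCU-protected prog run */
}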
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index ce3dfed6b91558a85c7db6541d024a27bedbfe1d..469d7e6a2cec1cd3f1495ddc26c671568a17b8f8 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -924,7 +924,11 @@ struct sk_buff {
	/* public: */
 
	KABI_USE2(1, __u8 scm_io_uring:1, __u8 local_skb:1)
+#if IS_ENABLED(CONFIG_SCHED_TASK_RELATIONSHIP)
+	KABI_USE(2, struct sched_net_rship_skb *net_rship)
+#else
	KABI_RESERVE(2)
+#endif
	KABI_RESERVE(3)
	KABI_RESERVE(4)
diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h
index d415ecbd89585134b69c23e39c214222ceaafa93..e6a8d9dc972f23617cabe3b59ef627576800b54d 100644
--- a/include/net/net_namespace.h
+++ b/include/net/net_namespace.h
@@ -173,7 +173,11 @@ struct net {
	struct netns_xfrm	xfrm;
 #endif
 
-	atomic64_t		net_cookie; /* written once */
+#ifdef __GENKSYMS__
+	atomic64_t		net_cookie; /* written once */
+#else
+	u64			net_cookie; /* written once */
+#endif
 
 #if IS_ENABLED(CONFIG_IP_VS)
	struct netns_ipvs	*ipvs;
@@ -247,8 +251,6 @@ extern struct list_head net_namespace_list;
 struct net *get_net_ns_by_pid(pid_t pid);
 struct net *get_net_ns_by_fd(int fd);
 
-u64 __net_gen_cookie(struct net *net);
-
 #ifdef CONFIG_SYSCTL
 void ipx_register_sysctl(void);
 void ipx_unregister_sysctl(void);
diff --git a/include/net/net_rship.h b/include/net/net_rship.h
new file mode 100644
index 0000000000000000000000000000000000000000..ad8af5a5cb9b0fbd4cc34991bc5a7ad3d92b75df
--- /dev/null
+++ b/include/net/net_rship.h
@@ -0,0 +1,329 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Common code for task relationship awareness
+ *
+ * Copyright (C) 2024 Huawei Technologies Co., Ltd
+ *
+ */
+
+#ifndef __LINUX_NET_RSHIP_H__
+#define __LINUX_NET_RSHIP_H__
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+#include
+
+#ifdef CONFIG_SCHED_TASK_RELATIONSHIP
+
+struct sched_net_rship_skb {
+	/* for loopback traffic */
+	pid_t alloc_tid;
+
+	/* for phy nic */
+	union {
+		u32 rx_dev_idx;		/* rx */
+		int dev_numa_node;	/* tx */
+	};
+	u16 alloc_cpu;
+	u16 rx_queue_idx;
+	u64 rx_dev_net_cookie;
+};
+
+struct sk_buff_fclones_net_rship {
+	struct sk_buff_fclones fclones;
+	struct sched_net_rship_skb ext1;
+	struct sched_net_rship_skb ext2;
+};
+
+struct sk_buff_net_rship {
+	struct sk_buff skb;
+	struct sched_net_rship_skb ext;
+};
+
+struct sched_net_rship_sock {
+	/* for loopback traffic */
+	pid_t sk_peer_tid;
+	u64 tid_rx_bytes;
+	unsigned long last_rx_update;
+
+	/* for recv from phy nic */
+	int rcv_numa_node;
+	u64 rcv_numa_node_bytes;
+	unsigned long last_rcv_numa_node_update;
+
+	/* for send to phy nic */
+	pid_t sk_send_tid;
+	int send_numa_node;
+	u64 send_numa_node_bytes;
+	unsigned long last_send_numa_node_update;
+};
+#endif
+
+#if defined(CONFIG_SCHED_TASK_RELATIONSHIP) && defined(CONFIG_BPF_NET_GLOBAL_PROG)
+
+#define NET_RSHIP_HEAD_RESERVE 40
+extern unsigned long net_numa_rship_jiffies;
+
+static inline void net_rship_sock_init(struct sock *sk, unsigned int offset)
+{
+	sk->net_rship = (void *)(((char *)sk) + offset);
+	memset(sk->net_rship, 0, sizeof(struct sched_net_rship_sock));
+	sk->net_rship->rcv_numa_node = NUMA_NO_NODE;
+	sk->net_rship->send_numa_node = NUMA_NO_NODE;
+}
+
+static inline struct sched_net_rship_skb *__get_skb_net_rship(struct sk_buff *skb)
+{
+	return skb->net_rship;
+}
+
+static inline bool net_rship_refresh_timeout(unsigned long last_update)
+{
+	return time_after(jiffies, net_numa_rship_jiffies + last_update);
+}
+
+static inline void net_rship_sk_dst_set(struct sock *sk, struct dst_entry *dst)
+{
+	if (!gnet_bpf_enabled(GNET_SK_DST_SET))
+		return;
+
+	if (!in_task() || !dst)
+		return;
+
+	if (dev_to_node(&dst->dev->dev) != NUMA_NO_NODE) {
+		struct bpf_gnet_ctx_kern ctx = {0};
+
+		ctx.numa_node = dev_to_node(&dst->dev->dev);
+		if (sk->net_rship->sk_send_tid)
+			ctx.curr_tid = sk->net_rship->sk_send_tid;
+		else
+			ctx.curr_tid = task_pid_nr(current);
+		ctx.sk = sk;
+		run_gnet_bpf(GNET_SK_DST_SET, &ctx);
+	}
+}
+
+static inline void __net_rship_tcp_rcvmsg(struct sock *sk, pid_t tid)
+{
+	struct bpf_gnet_ctx_kern ctx = {0};
+
+	ctx.sk = sk;
+	ctx.curr_tid = task_pid_nr(current);
+	ctx.peer_tid = tid;
+	ctx.rxtx_bytes = sk->net_rship->tid_rx_bytes;
+	sk->net_rship->last_rx_update = jiffies;
+	run_gnet_bpf(GNET_TCP_RECVMSG, &ctx);
+	sk->net_rship->tid_rx_bytes = 0;
+}
+
+static inline void net_rship_tcp_local(struct sock *sk, struct sk_buff *skb)
+{
+	struct sched_net_rship_skb *ext;
+
+	if (!gnet_bpf_enabled(GNET_TCP_RECVMSG))
+		return;
+
+	ext = __get_skb_net_rship(skb);
+	if (!ext->alloc_tid)
+		return;
+
+	if (sk->net_rship->sk_peer_tid != ext->alloc_tid) {
+		sk->net_rship->sk_peer_tid = ext->alloc_tid;
+		sk->net_rship->tid_rx_bytes = skb->len + NET_RSHIP_HEAD_RESERVE;
+		__net_rship_tcp_rcvmsg(sk, ext->alloc_tid);
+	} else {
+		sk->net_rship->tid_rx_bytes += (skb->len + NET_RSHIP_HEAD_RESERVE);
+		if (net_rship_refresh_timeout(sk->net_rship->last_rx_update))
+			__net_rship_tcp_rcvmsg(sk, ext->alloc_tid);
+	}
+}
+static inline void net_rship_recv_nic_node(struct sock *sk, struct sk_buff *skb)
+{
+	struct sched_net_rship_skb *ext;
+
+	if (!gnet_bpf_enabled(GNET_RCV_NIC_NODE))
+		return;
+
+	ext = __get_skb_net_rship(skb);
+	if (ext->alloc_tid || ext->rx_dev_idx == -1)
+		return;
+
+	sk->net_rship->rcv_numa_node_bytes += (skb->len + NET_RSHIP_HEAD_RESERVE);
+	if (net_rship_refresh_timeout(sk->net_rship->last_rcv_numa_node_update)) {
+		struct bpf_gnet_ctx_kern ctx = {0};
+
+		ctx.sk = sk;
+		ctx.curr_tid = task_pid_nr(current);
+		ctx.numa_node = cpu_to_node(ext->alloc_cpu);
+		ctx.rxtx_bytes = sk->net_rship->rcv_numa_node_bytes;
+		ctx.rx_dev_idx = ext->rx_dev_idx;
+		ctx.rx_dev_queue_idx = skb_get_rx_queue(skb);
+		ctx.rx_dev_netns_cookie = ext->rx_dev_net_cookie;
+		run_gnet_bpf(GNET_RCV_NIC_NODE, &ctx);
+		sk->net_rship->last_rcv_numa_node_update = jiffies;
+		sk->net_rship->rcv_numa_node_bytes = 0;
+	}
+}
+
+static inline void net_rship_tcp_recvmsg(struct sock *sk, struct sk_buff *skb)
+{
+	net_rship_tcp_local(sk, skb);
+	net_rship_recv_nic_node(sk, skb);
+}
+
+static inline void net_rship_send_nic_node(struct sock *sk, struct sk_buff *skb)
+{
+	struct sched_net_rship_skb *ext;
+
+	if (!gnet_bpf_enabled(GNET_SEND_NIC_NODE))
+		return;
+
+	ext = __get_skb_net_rship(skb);
+	if ((ext->dev_numa_node != NUMA_NO_NODE) &&
+	    sk->net_rship->sk_send_tid) {
+		sk->net_rship->send_numa_node_bytes += skb->len;
+		if (net_rship_refresh_timeout(sk->net_rship->last_send_numa_node_update)) {
+			struct bpf_gnet_ctx_kern ctx = {0};
+
+			ctx.sk = sk;
+			ctx.curr_tid = sk->net_rship->sk_send_tid;
+			ctx.rxtx_bytes = sk->net_rship->send_numa_node_bytes;
+			ctx.numa_node = ext->dev_numa_node;
+
+			run_gnet_bpf(GNET_SEND_NIC_NODE, &ctx);
+			sk->net_rship->send_numa_node_bytes = 0;
+			sk->net_rship->last_send_numa_node_update = jiffies;
+		}
+	}
+}
+
+static inline void net_rship_skb_record_dev_numa_node(struct sk_buff *skb, struct net_device *dev)
+{
+	if (gnet_bpf_enabled(GNET_SEND_NIC_NODE)) {
+		struct sched_net_rship_skb *ext = __get_skb_net_rship(skb);
+
+		ext->dev_numa_node = dev_to_node(&dev->dev);
+	}
+}
+
+static inline void net_rship_skb_record_dev_rxinfo(struct sk_buff *skb, struct net_device *dev)
+{
+	if (gnet_bpf_enabled(GNET_RCV_NIC_NODE)) {
+		struct sched_net_rship_skb *ext = __get_skb_net_rship(skb);
+
+		ext->rx_dev_idx = dev->ifindex;
+		ext->rx_dev_net_cookie = dev_net(dev)->net_cookie;
+	}
+}
+
+static inline void __net_rship_skb_clear(struct sched_net_rship_skb *ext)
+{
+	ext->alloc_tid = 0;
+	/* dev_numa_node and rx_dev_idx share the union */
+	ext->dev_numa_node = NUMA_NO_NODE;
+}
+
+static inline void net_rship_skb_clear(struct sk_buff *skb)
+{
+	struct sched_net_rship_skb *ext = __get_skb_net_rship(skb);
+
+	__net_rship_skb_clear(ext);
+}
+
+static inline void __net_rship_skb_init(struct sk_buff *skb)
+{
+	__net_rship_skb_clear(skb->net_rship);
+	skb->net_rship->alloc_cpu = raw_smp_processor_id();
+}
+
+static inline void net_rship_skb_init(struct sk_buff *skb)
+{
+	struct sk_buff_net_rship *rskb = (void *)skb;
+
+	skb->net_rship = &rskb->ext;
+	__net_rship_skb_init(skb);
+}
+
+static inline void net_rship_skb_init_flags(struct sk_buff *skb, int flags)
+{
+	if (flags & SKB_ALLOC_FCLONE) {
+		struct sk_buff_fclones_net_rship *rskbs;
+
+		rskbs = (void *)container_of(skb, struct sk_buff_fclones, skb1);
+		skb->net_rship = &rskbs->ext1;
+		rskbs->fclones.skb2.net_rship = &rskbs->ext2;
+
+		__net_rship_skb_init(skb);
+		__net_rship_skb_init(&rskbs->fclones.skb2);
+	} else
+		net_rship_skb_init(skb);
+}
+
+static inline void net_rship_skb_clone(struct sk_buff *n, struct sk_buff *skb)
+{
+	n->net_rship->alloc_tid = skb->net_rship->alloc_tid;
+}
+
+/* Make sure it is a process context */
+static inline void net_rship_record_sendmsginfo(struct sk_buff *skb, struct sock *sk)
+{
+	if (gnet_bpf_enabled(GNET_TCP_RECVMSG) || gnet_bpf_enabled(GNET_RCV_NIC_NODE)) {
+		struct sched_net_rship_skb *ext = __get_skb_net_rship(skb);
+
+		ext->alloc_tid = task_pid_nr(current);
+	}
+	if (gnet_bpf_enabled(GNET_SK_DST_SET) || gnet_bpf_enabled(GNET_SEND_NIC_NODE))
+		sk->net_rship->sk_send_tid = task_pid_nr(current);
+}
+#else
+
+static inline void net_rship_sock_init(struct sock *sk, unsigned int offset)
+{}
+
+static inline void net_rship_sk_dst_set(struct sock *sk, struct dst_entry *dst)
+{}
+
+static inline void net_rship_tcp_recvmsg(struct sock *sk, struct sk_buff *skb)
+{}
+
+static inline void net_rship_send_nic_node(struct sock *sk, struct sk_buff *skb)
+{}
+
+static inline void net_rship_skb_record_rx_queue(struct sk_buff *skb, u16 rx_queue)
+{}
+
+static inline void net_rship_skb_record_dev_numa_node(struct sk_buff *skb, struct net_device *dev)
+{}
+
+static inline void net_rship_skb_record_dev_rxinfo(struct sk_buff *skb, struct net_device *dev)
+{}
+
+static inline void net_rship_skb_clear(struct sk_buff *skb)
+{}
+
+static inline void net_rship_skb_init(struct sk_buff *skb)
+{}
+
+static inline void net_rship_skb_init_flags(struct sk_buff *skb, int flags)
+{}
+
+static inline void net_rship_skb_clone(struct sk_buff *n, struct sk_buff *skb)
+{}
+
+static inline void net_rship_record_sendmsginfo(struct sk_buff *skb, struct sock *sk)
+{}
+#endif
+
+#endif
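Note that none of the net_rship_*_init helpers allocate anything: the skb extensions trail the enlarged slab objects created in skb_init() below, and sk->net_rship points into slack appended to each proto slab. A sketch of the assumed layouts (illustrative comment only):

/* Assumed object layouts (see skb_init() and sk_prot_alloc() below):
 *
 *   skbuff_head_cache:   [ struct sk_buff         ][ sched_net_rship_skb ]
 *   skbuff_fclone_cache: [ struct sk_buff_fclones ][ ext1 ][ ext2 ]
 *   proto slabs:         [ proto->obj_size bytes  ][ sched_net_rship_sock ]
 *
 * net_rship_skb_init()/net_rship_skb_init_flags() and net_rship_sock_init()
 * only compute pointers into that trailing space.
 */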
diff --git a/include/net/sock.h b/include/net/sock.h
index 00051f2558fa351ca89ae5353bd1b664b92a2339..7078c98f972629b8247065052999f66350de52ef 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -533,7 +533,11 @@ struct sock {
 #else
	KABI_RESERVE(1)
 #endif
+#if IS_ENABLED(CONFIG_SCHED_TASK_RELATIONSHIP)
+	KABI_USE(2, struct sched_net_rship_sock *net_rship)
+#else
	KABI_RESERVE(2)
+#endif
	KABI_RESERVE(3)
	KABI_RESERVE(4)
	KABI_RESERVE(5)
diff --git a/include/uapi/asm-generic/socket.h b/include/uapi/asm-generic/socket.h
index 77f7c1638eb1ce7d3e143bbffd60056e472b1122..6456068242588f4ba48c482cdd95bbf7233aa305 100644
--- a/include/uapi/asm-generic/socket.h
+++ b/include/uapi/asm-generic/socket.h
@@ -119,6 +119,8 @@
 
 #define SO_DETACH_REUSEPORT_BPF	68
 
+#define SO_NETNS_COOKIE		71
+
 #if !defined(__KERNEL__)
 
 #if __BITS_PER_LONG == 64 || (defined(__x86_64__) && defined(__ILP32__))
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index b87934003c407563770aa110aa1b7988f3b22cc4..c086cc287b47817cf8b30da00382a1db21e23d82 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -201,6 +201,7 @@ enum bpf_prog_type {
	BPF_PROG_TYPE_SK_LOOKUP,
 #ifndef __GENKSYMS__
	BPF_PROG_TYPE_SCHED,
+	BPF_PROG_TYPE_NET_GLOBAL,
 #endif
 };
 
@@ -245,6 +246,10 @@ enum bpf_attach_type {
	BPF_XDP,
 #ifndef __GENKSYMS__
	BPF_SCHED,
+	BPF_GNET_TCP_RECVMSG,
+	BPF_GNET_SK_DST_SET,
+	BPF_GNET_RCV_NIC_NODE,
+	BPF_GNET_SEND_NIC_NODE,
 #endif
	__MAX_BPF_ATTACH_TYPE
 };
 
@@ -3920,6 +3925,12 @@ union bpf_attr {
  *		get resource statistics of *nid* and store in *ctx*.
  *	Return
  *		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_sched_net_rship_submit(void *buf, size_t sz, u64 flags)
+ *	Description
+ *		Submit network relationship information to the scheduler subsystem.
+ *	Return
+ *		0 on success, or a negative error in case of failure.
  */
 #define __BPF_FUNC_MAPPER(FN)		\
	FN(unspec),			\
@@ -4096,6 +4107,7 @@ union bpf_attr {
	FN(get_task_relationship_stats),\
	FN(sched_set_curr_preferred_node),\
	FN(get_node_stats),		\
+	FN(sched_net_rship_submit),	\
	/* */
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
@@ -5250,4 +5262,15 @@ enum {
	BTF_F_ZERO	=	(1ULL << 3),
 };
 
+struct bpf_gnet_ctx {
+	__bpf_md_ptr(struct bpf_sock *, sk);
+	int curr_tid;
+	int peer_tid;
+	int numa_node;
+	__u64 rxtx_bytes;
+	int rx_dev_idx;
+	int rx_dev_queue_idx;
+	__u64 rx_dev_netns_cookie;
+};
+
 #endif /* _UAPI__LINUX_BPF_H__ */
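Together with the libbpf section names added later in this patch, a minimal program against the new read-only context looks roughly like this (a sketch; requires CAP_PERFMON for bpf_printk, and assumes the updated uapi headers):

// SPDX-License-Identifier: GPL-2.0
#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

SEC("gnet/tcp_recvmsg")
int log_peer_bytes(struct bpf_gnet_ctx *ctx)
{
	/* ctx is read-only; BPF_WRITE is rejected by is_valid_access() */
	if (ctx->peer_tid)
		bpf_printk("tid %d <- tid %d: %llu bytes",
			   ctx->curr_tid, ctx->peer_tid, ctx->rxtx_bytes);
	return 0;
}

char _license[] SEC("license") = "GPL";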
diff --git a/init/Kconfig b/init/Kconfig
index 758b9988d742f69976a787ec97b34031a147266d..c329e031689cc21718f4a81f7ad9c043c3188db4 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1084,6 +1084,7 @@ config QOS_SCHED_DYNAMIC_AFFINITY
 config SCHED_TASK_RELATIONSHIP
	bool "task relationship"
	depends on NUMA_BALANCING
+	select BPF_NET_GLOBAL_PROG
	default n
	help
	  This feature enables the scheduler to identify tasks relationship by
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index ba690c210f57a3f703f355c675265f81b7867014..7ccdb89b08c7c7fbef7843cc7e1589efcfe278ac 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -2107,6 +2107,9 @@ static bool is_net_admin_prog_type(enum bpf_prog_type prog_type)
	case BPF_PROG_TYPE_CGROUP_SYSCTL:
	case BPF_PROG_TYPE_SOCK_OPS:
	case BPF_PROG_TYPE_EXT: /* extends any prog */
+#ifdef CONFIG_BPF_NET_GLOBAL_PROG
+	case BPF_PROG_TYPE_NET_GLOBAL:
+#endif
		return true;
	case BPF_PROG_TYPE_CGROUP_SKB:
		/* always unpriv */
@@ -3017,6 +3020,13 @@ attach_type_to_prog_type(enum bpf_attach_type attach_type)
		return BPF_PROG_TYPE_SK_LOOKUP;
	case BPF_XDP:
		return BPF_PROG_TYPE_XDP;
+#ifdef CONFIG_BPF_NET_GLOBAL_PROG
+	case BPF_GNET_TCP_RECVMSG:
+	case BPF_GNET_SK_DST_SET:
+	case BPF_GNET_RCV_NIC_NODE:
+	case BPF_GNET_SEND_NIC_NODE:
+		return BPF_PROG_TYPE_NET_GLOBAL;
+#endif
	default:
		return BPF_PROG_TYPE_UNSPEC;
	}
@@ -3072,6 +3082,11 @@ static int bpf_prog_attach(const union bpf_attr *attr)
	case BPF_PROG_TYPE_SOCK_OPS:
		ret = cgroup_bpf_prog_attach(attr, ptype, prog);
		break;
+#ifdef CONFIG_BPF_NET_GLOBAL_PROG
+	case BPF_PROG_TYPE_NET_GLOBAL:
+		ret = gnet_bpf_prog_attach(attr, ptype, prog);
+		break;
+#endif
	default:
		ret = -EINVAL;
	}
@@ -3108,6 +3123,10 @@ static int bpf_prog_detach(const union bpf_attr *attr)
	case BPF_PROG_TYPE_CGROUP_SYSCTL:
	case BPF_PROG_TYPE_SOCK_OPS:
		return cgroup_bpf_prog_detach(attr, ptype);
+#ifdef CONFIG_BPF_NET_GLOBAL_PROG
+	case BPF_PROG_TYPE_NET_GLOBAL:
+		return gnet_bpf_prog_detach(attr, ptype);
+#endif
	default:
		return -EINVAL;
	}
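Attachment goes through the generic bpf(BPF_PROG_ATTACH) path with no target object, since the program is global. A userspace sketch using libbpf's low-level wrappers (error handling trimmed; "gnet_prog.o" is a placeholder object name):

#include <bpf/bpf.h>
#include <bpf/libbpf.h>
#include <linux/bpf.h>

int attach_gnet(void)
{
	struct bpf_object *obj = bpf_object__open("gnet_prog.o");
	struct bpf_program *prog;

	if (!obj || bpf_object__load(obj))
		return -1;
	prog = bpf_object__find_program_by_title(obj, "gnet/tcp_recvmsg");
	if (!prog)
		return -1;
	/* target_fd is 0: the attach_type alone selects the global slot */
	return bpf_prog_attach(bpf_program__fd(prog), 0,
			       BPF_GNET_TCP_RECVMSG, 0);
}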
diff --git a/net/Kconfig b/net/Kconfig
index 232075ae15e232084ee33329c75cbc71f5c46e8f..6186e9ad88a34255dd61489620a6c451cbc9a5f6 100644
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -470,6 +470,12 @@ config ETHTOOL_NETLINK
	  netlink. It provides better extensibility and some new features,
	  e.g. notification messages.
 
+config BPF_NET_GLOBAL_PROG
+	bool "Network global bpf prog type"
+	depends on NET
+	depends on BPF_SYSCALL
+	default n
+
 endif   # if NET
 
 # Used by archs to tell that they support BPF JIT compiler plus which flavour.
diff --git a/net/core/dev.c b/net/core/dev.c
index 1f1f93aad71c9225b65f5c742f09cd986de68f0a..8e0f4690e15717ca211a6c769548b70ad26fcd8e 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -146,6 +146,7 @@
 #include
 #include
 #include
+#include <net/net_rship.h>
 
 #include "net-sysfs.h"
 
@@ -3595,6 +3596,8 @@ static int xmit_one(struct sk_buff *skb, struct net_device *dev,
	if (dev_nit_active(dev))
		dev_queue_xmit_nit(skb, dev);
 
+	net_rship_skb_record_dev_numa_node(skb, dev);
+
	len = skb->len;
	PRANDOM_ADD_NOISE(skb, dev, txq, len + jiffies);
	trace_net_dev_start_xmit(skb, dev);
@@ -6197,6 +6200,7 @@ static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
	__vlan_hwaccel_clear_tag(skb);
	skb->dev = napi->dev;
	skb->skb_iif = 0;
+	net_rship_skb_record_dev_rxinfo(skb, napi->dev);
 
	/* eth_type_trans() assumes pkt_type is PACKET_HOST */
	skb->pkt_type = PACKET_HOST;
diff --git a/net/core/filter.c b/net/core/filter.c
index fff5d2d7c6c3c5af0c40017855ccc239efcc56ce..4f4e832f3e9f398b4de64afd7b59a55f0d93cf6d 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -4698,11 +4698,9 @@ static const struct bpf_func_proto bpf_get_socket_cookie_sock_ops_proto = {
 
 static u64 __bpf_get_netns_cookie(struct sock *sk)
 {
-#ifdef CONFIG_NET_NS
-	return __net_gen_cookie(sk ? sk->sk_net.net : &init_net);
-#else
-	return 0;
-#endif
+	const struct net *net = sk ? sock_net(sk) : &init_net;
+
+	return net->net_cookie;
 }
 
 BPF_CALL_1(bpf_get_netns_cookie_sock, struct sock *, ctx)
@@ -10684,3 +10682,213 @@ bpf_sk_base_func_proto(enum bpf_func_id func_id)
 
	return func;
 }
+
+#ifdef CONFIG_BPF_NET_GLOBAL_PROG
+static DEFINE_MUTEX(gnet_bpf_mutex);
+struct gnet_bpf gnet_bpf_progs;
+EXPORT_SYMBOL(gnet_bpf_progs);
+struct static_key_false gnet_bpf_enabled_key[MAX_GNET_BPF_ATTACH_TYPE];
+EXPORT_SYMBOL(gnet_bpf_enabled_key);
+
+int gnet_bpf_prog_attach(const union bpf_attr *attr,
+			 enum bpf_prog_type ptype, struct bpf_prog *prog)
+{
+	enum gnet_bpf_attach_type atype;
+	struct bpf_prog *attached;
+	int ret = 0;
+
+	if (attr->attach_flags || attr->replace_bpf_fd)
+		return -EINVAL;
+
+	atype = to_gnet_bpf_attach_type(attr->attach_type);
+	if (atype < 0)
+		return -EINVAL;
+
+	mutex_lock(&gnet_bpf_mutex);
+	attached = gnet_bpf_progs.progs[atype];
+	if (attached == prog) {
+		ret = -EINVAL;
+		goto out_unlock;
+	}
+
+	rcu_assign_pointer(gnet_bpf_progs.progs[atype], prog);
+	gnet_bpf_progs.flags[atype] = attr->attach_flags;
+	if (attached)
+		bpf_prog_put(attached);
+	else
+		static_branch_inc(&gnet_bpf_enabled_key[atype]);
+
+out_unlock:
+	mutex_unlock(&gnet_bpf_mutex);
+	return ret;
+}
+
+int gnet_bpf_prog_detach(const union bpf_attr *attr, enum bpf_prog_type ptype)
+{
+	enum gnet_bpf_attach_type atype;
+	struct bpf_prog *attached;
+	int ret = 0;
+
+	atype = to_gnet_bpf_attach_type(attr->attach_type);
+	if (atype < 0)
+		return -EINVAL;
+
+	mutex_lock(&gnet_bpf_mutex);
+	attached = gnet_bpf_progs.progs[atype];
+	if (!attached) {
+		ret = -ENOENT;
+		goto out_unlock;
+	}
+
+	static_branch_dec(&gnet_bpf_enabled_key[atype]);
+	gnet_bpf_progs.flags[atype] = 0;
+	rcu_assign_pointer(gnet_bpf_progs.progs[atype], NULL);
+	bpf_prog_put(attached);
+out_unlock:
+	mutex_unlock(&gnet_bpf_mutex);
+	return ret;
+}
+
+static int __init gnet_bpf_init(void)
+{
+	return 0;
+}
+late_initcall(gnet_bpf_init);
+
+#include <linux/sched/relationship.h>
+BPF_CALL_3(bpf_sched_net_rship_submit, void *, reqbuf, size_t, sz, u64, flags)
+{
+#if defined(CONFIG_SCHED_TASK_RELATIONSHIP)
+	struct net_relationship_req *req = reqbuf;
+
+	if (sz != sizeof(struct net_relationship_req))
+		return -EINVAL;
+
+	return sched_net_relationship_submit(req);
+#else
+	return 0;
+#endif
+}
+
+const struct bpf_func_proto bpf_sched_net_rship_submit_proto = {
+	.func		= bpf_sched_net_rship_submit,
+	.gpl_only	= false,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_MEM,
+	.arg2_type	= ARG_CONST_SIZE,
+	.arg3_type	= ARG_ANYTHING,
+};
+
+static const struct bpf_func_proto *
+bpf_gnet_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
+{
+	switch (func_id) {
+	case BPF_FUNC_perf_event_output:
+		return &bpf_skb_event_output_proto;
+	case BPF_FUNC_sk_fullsock:
+		return &bpf_sk_fullsock_proto;
+	case BPF_FUNC_sched_net_rship_submit:
+		return &bpf_sched_net_rship_submit_proto;
+	default:
+		break;
+	}
+
+	return bpf_sk_base_func_proto(func_id);
+}
+
+static bool bpf_gnet_is_valid_access(int off, int size,
+				     enum bpf_access_type type,
+				     const struct bpf_prog *prog,
+				     struct bpf_insn_access_aux *info)
+{
+	if (off < 0 || off >= sizeof(struct bpf_gnet_ctx))
+		return false;
+
+	/* The verifier guarantees that size > 0. */
+	if (off % size != 0)
+		return false;
+
+	if (type == BPF_WRITE)
+		return false;
+
+	switch (off) {
+	case offsetof(struct bpf_gnet_ctx, sk):
+		if (size != sizeof(__u64))
+			return false;
+		info->reg_type = PTR_TO_SOCKET_OR_NULL;
+		break;
+	default:
+		break;
+	}
+	return true;
+}
+
+static u32 bpf_gnet_convert_ctx_access(enum bpf_access_type type,
+				       const struct bpf_insn *si,
+				       struct bpf_insn *insn_buf,
+				       struct bpf_prog *prog, u32 *target_size)
+{
+	struct bpf_insn *insn = insn_buf;
+
+	switch (si->off) {
+	case offsetof(struct bpf_gnet_ctx, sk):
+		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_gnet_ctx_kern, sk),
+				      si->dst_reg, si->src_reg,
+				      offsetof(struct bpf_gnet_ctx_kern, sk));
+		break;
+	case offsetof(struct bpf_gnet_ctx, numa_node):
+		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_gnet_ctx_kern, numa_node),
+				      si->dst_reg, si->src_reg,
+				      offsetof(struct bpf_gnet_ctx_kern, numa_node));
+		break;
+	case offsetof(struct bpf_gnet_ctx, curr_tid):
+		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_gnet_ctx_kern, curr_tid),
+				      si->dst_reg, si->src_reg,
+				      offsetof(struct bpf_gnet_ctx_kern, curr_tid));
+		break;
+	case offsetof(struct bpf_gnet_ctx, peer_tid):
+		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_gnet_ctx_kern, peer_tid),
+				      si->dst_reg, si->src_reg,
+				      offsetof(struct bpf_gnet_ctx_kern, peer_tid));
+		break;
+	case offsetof(struct bpf_gnet_ctx, rxtx_bytes):
+		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_gnet_ctx_kern, rxtx_bytes),
+				      si->dst_reg, si->src_reg,
+				      offsetof(struct bpf_gnet_ctx_kern, rxtx_bytes));
+		break;
+	case offsetof(struct bpf_gnet_ctx, rx_dev_idx):
+		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_gnet_ctx_kern, rx_dev_idx),
+				      si->dst_reg, si->src_reg,
+				      offsetof(struct bpf_gnet_ctx_kern, rx_dev_idx));
+		break;
+	case offsetof(struct bpf_gnet_ctx, rx_dev_queue_idx):
+		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_gnet_ctx_kern, rx_dev_queue_idx),
+				      si->dst_reg, si->src_reg,
+				      offsetof(struct bpf_gnet_ctx_kern, rx_dev_queue_idx));
+		break;
+	case offsetof(struct bpf_gnet_ctx, rx_dev_netns_cookie):
+		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_gnet_ctx_kern,
+						       rx_dev_netns_cookie),
+				      si->dst_reg, si->src_reg,
+				      offsetof(struct bpf_gnet_ctx_kern, rx_dev_netns_cookie));
+		break;
+	}
+	return insn - insn_buf;
+}
+
+static int bpf_gnet_gen_prologue(struct bpf_insn *insn_buf, bool direct_write,
+				 const struct bpf_prog *prog)
+{
+	return 0;
+}
+
+const struct bpf_verifier_ops bpf_gnet_verifier_ops = {
+	.get_func_proto		= bpf_gnet_func_proto,
+	.is_valid_access	= bpf_gnet_is_valid_access,
+	.convert_ctx_access	= bpf_gnet_convert_ctx_access,
+	.gen_prologue		= bpf_gnet_gen_prologue,
+};
+
+const struct bpf_prog_ops bpf_gnet_prog_ops = {
+};
+#endif
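Detach is symmetric and needs no program fd: gnet_bpf_prog_detach() only looks at the attach_type slot. A one-line userspace sketch (assuming libbpf's low-level wrapper):

#include <bpf/bpf.h>
#include <linux/bpf.h>

int detach_gnet(void)
{
	/* the fd argument is ignored for this program type */
	return bpf_prog_detach(0, BPF_GNET_TCP_RECVMSG);
}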
diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
index e05dd4f3279a8c119d53a96bf72f5a6ca3660e3f..20a0fc4d059bf2ae3b2847d9cabe426ed5d6fe4d 100644
--- a/net/core/net_namespace.c
+++ b/net/core/net_namespace.c
@@ -72,18 +72,6 @@ static unsigned int max_gen_ptrs = INITIAL_NET_GEN_PTRS;
 
 DEFINE_COOKIE(net_cookie);
 
-u64 __net_gen_cookie(struct net *net)
-{
-	while (1) {
-		u64 res = atomic64_read(&net->net_cookie);
-
-		if (res)
-			return res;
-		res = gen_cookie_next(&net_cookie);
-		atomic64_cmpxchg(&net->net_cookie, 0, res);
-	}
-}
-
 static struct net_generic *net_alloc_generic(void)
 {
	struct net_generic *ng;
@@ -341,6 +329,9 @@ static __net_init int setup_net(struct net *net, struct user_namespace *user_ns)
	refcount_set(&net->count, 1);
	refcount_set(&net->passive, 1);
	get_random_bytes(&net->hash_mix, sizeof(u32));
+	preempt_disable();
+	net->net_cookie = gen_cookie_next(&net_cookie);
+	preempt_enable();
	net->dev_base_seq = 1;
	net->user_ns = user_ns;
	idr_init(&net->netns_ids);
@@ -1128,10 +1119,6 @@ static int __init net_ns_init(void)
 
	rcu_assign_pointer(init_net.gen, ng);
 
-	preempt_disable();
-	__net_gen_cookie(&init_net);
-	preempt_enable();
-
	down_write(&pernet_ops_rwsem);
	if (setup_net(&init_net, &init_user_ns))
		panic("Could not setup the initial network namespace");
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index b290db716392b973cc874bd4a65486dd0a5c2ea7..779a860e007312fec8f2eefab10dc7859a09b791 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -71,6 +71,7 @@
 #include
 #include
 #include
+#include <net/net_rship.h>
 
 #include
 #include
@@ -254,6 +255,7 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
 
	skb_set_kcov_handle(skb, kcov_common_handle());
 
+	net_rship_skb_init_flags(skb, flags);
 out:
	return skb;
 nodata:
@@ -289,6 +291,7 @@ static struct sk_buff *__build_skb_around(struct sk_buff *skb,
 
	skb_set_kcov_handle(skb, kcov_common_handle());
 
+	net_rship_skb_init(skb);
	return skb;
 }
 
@@ -485,6 +488,7 @@ struct sk_buff *__netdev_alloc_skb(struct net_device *dev, unsigned int len,
 skb_success:
	skb_reserve(skb, NET_SKB_PAD);
	skb->dev = dev;
+	net_rship_skb_record_dev_rxinfo(skb, dev);
 
 skb_fail:
	return skb;
@@ -549,6 +553,7 @@ struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len,
 skb_success:
	skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN);
	skb->dev = napi->dev;
+	net_rship_skb_record_dev_rxinfo(skb, napi->dev);
 
 skb_fail:
	return skb;
@@ -996,7 +1001,7 @@ static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
 #ifdef CONFIG_NET_SCHED
	CHECK_SKB_FIELD(tc_index);
 #endif
-
+	net_rship_skb_clone(new, (void *)old);
 }
 
 /*
@@ -1476,6 +1481,7 @@ struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t gfp_mask)
			return NULL;
 
		n->fclone = SKB_FCLONE_UNAVAILABLE;
+		net_rship_skb_init(n);
	}
 
	return __skb_clone(n, skb);
@@ -3428,6 +3434,7 @@ void skb_split(struct sk_buff *skb, struct sk_buff *skb1, const u32 len)
		skb_split_inside_header(skb, skb1, len, pos);
	else		/* Second chunk has no header, nothing to copy. */
		skb_split_no_header(skb, skb1, len, pos);
+	net_rship_skb_clone(skb1, skb);
 }
 EXPORT_SYMBOL(skb_split);
 
@@ -4438,14 +4445,22 @@ static void skb_extensions_init(void) {}
 void __init skb_init(void)
 {
	skbuff_head_cache = kmem_cache_create_usercopy("skbuff_head_cache",
+#if IS_ENABLED(CONFIG_SCHED_TASK_RELATIONSHIP)
+					      sizeof(struct sk_buff_net_rship),
+#else
					      sizeof(struct sk_buff),
+#endif
					      0,
					      SLAB_HWCACHE_ALIGN|SLAB_PANIC,
					      offsetof(struct sk_buff, cb),
					      sizeof_field(struct sk_buff, cb),
					      NULL);
	skbuff_fclone_cache = kmem_cache_create("skbuff_fclone_cache",
+#if IS_ENABLED(CONFIG_SCHED_TASK_RELATIONSHIP)
+						sizeof(struct sk_buff_fclones_net_rship),
+#else
						sizeof(struct sk_buff_fclones),
+#endif
						0,
						SLAB_HWCACHE_ALIGN|SLAB_PANIC,
						NULL);
diff --git a/net/core/sock.c b/net/core/sock.c
index a15e984bd38544a091021f9bf8bd85c5b21537be..da0c980ad238a616f2237df21e71ee417bdebc78 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -138,6 +138,7 @@
 
 #include
 #include
+#include <net/net_rship.h>
 
 static DEFINE_MUTEX(proto_list_mutex);
 static LIST_HEAD(proto_list);
@@ -1619,6 +1620,13 @@ int sock_getsockopt(struct socket *sock, int level, int optname,
		v.val = sk->sk_bound_dev_if;
		break;
 
+	case SO_NETNS_COOKIE:
+		lv = sizeof(u64);
+		if (len != lv)
+			return -EINVAL;
+		v.val64 = sock_net(sk)->net_cookie;
+		break;
+
	default:
		/* We implement the SO_SNDLOWAT etc to not be settable
		 * (1003.1g 7).
@@ -1669,12 +1677,18 @@ static void sock_copy(struct sock *nsk, const struct sock *osk)
	const struct proto *prot = READ_ONCE(osk->sk_prot);
 #ifdef CONFIG_SECURITY_NETWORK
	void *sptr = nsk->sk_security;
+#endif
+#ifdef CONFIG_SCHED_TASK_RELATIONSHIP
+	void *net_rship = nsk->net_rship;
 #endif
 
	memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));
 
	memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
	       prot->obj_size - offsetof(struct sock, sk_dontcopy_end));
 
+#ifdef CONFIG_SCHED_TASK_RELATIONSHIP
+	nsk->net_rship = net_rship;
+#endif
 #ifdef CONFIG_SECURITY_NETWORK
	nsk->sk_security = sptr;
	security_sk_clone(osk, nsk);
@@ -1695,7 +1709,12 @@ static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
		if (want_init_on_alloc(priority))
			sk_prot_clear_nulls(sk, prot->obj_size);
	} else
+#ifdef CONFIG_SCHED_TASK_RELATIONSHIP
+		sk = kmalloc(prot->obj_size + sizeof(struct sched_net_rship_sock),
+			     priority);
+#else
		sk = kmalloc(prot->obj_size, priority);
+#endif
 
	if (sk != NULL) {
		if (security_sk_alloc(sk, family, priority))
@@ -1704,6 +1723,9 @@ static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
		if (!try_module_get(prot->owner))
			goto out_free_sec;
		sk_tx_queue_clear(sk);
+#ifdef CONFIG_SCHED_TASK_RELATIONSHIP
+		net_rship_sock_init(sk, prot->obj_size);
+#endif
	}
 
	return sk;
@@ -2038,6 +2060,8 @@ void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
	}
	sk->sk_gso_max_segs = max_segs;
	sk_dst_set(sk, dst);
+
+	net_rship_sk_dst_set(sk, dst);
 }
 EXPORT_SYMBOL_GPL(sk_setup_caps);
 
@@ -3505,7 +3529,11 @@ int proto_register(struct proto *prot, int alloc_slab)
 
	if (alloc_slab) {
		prot->slab = kmem_cache_create_usercopy(prot->name,
+#ifdef CONFIG_SCHED_TASK_RELATIONSHIP
+					prot->obj_size + sizeof(struct sched_net_rship_sock), 0,
+#else
					prot->obj_size, 0,
+#endif
					SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT |
					prot->slab_flags,
					prot->useroffset, prot->usersize,
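The save/restore dance in sock_copy() is needed because the wholesale memcpy() would otherwise leave the clone pointing at the parent's extension, making two sockets share one sched_net_rship_sock. An illustrative comment-only view:

/* Why sock_copy() saves and restores nsk->net_rship (illustrative):
 *
 *   nsk->net_rship  --> [ nsk's own trailing sched_net_rship_sock ]
 *   memcpy(nsk, osk)    // clobbers it to point at osk's tail
 *   nsk->net_rship = saved;  // restored, mirroring the sk_security
 *                            // handling on CONFIG_SECURITY_NETWORK
 */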
diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c
index 0dfe9f255ab3af2f49a5652bcf581a557e7de647..ea0ee32f27bbced9b7e75582e172faedf8309650 100644
--- a/net/core/sysctl_net_core.c
+++ b/net/core/sysctl_net_core.c
@@ -21,6 +21,7 @@
 #include
 #include
 #include
+#include <net/net_rship.h>
 
 static int two = 2;
 static int three = 3;
@@ -45,6 +46,12 @@ EXPORT_SYMBOL(sysctl_fb_tunnels_only_for_init_net);
 int sysctl_devconf_inherit_init_net __read_mostly;
 EXPORT_SYMBOL(sysctl_devconf_inherit_init_net);
 
+#ifdef CONFIG_SCHED_TASK_RELATIONSHIP
+unsigned long net_numa_rship_jiffies __read_mostly = HZ / 10;	/* 100ms */
+static unsigned long net_numa_rship_ms_min = HZ / 10;		/* 100ms */
+static unsigned long net_numa_rship_ms_max = 100 * HZ;		/* 100s */
+#endif
+
 #ifdef CONFIG_RPS
 static int rps_sock_flow_sysctl(struct ctl_table *table, int write,
				void *buffer, size_t *lenp, loff_t *ppos)
@@ -575,6 +582,17 @@ static struct ctl_table net_core_table[] = {
		.proc_handler	= proc_dointvec_minmax,
		.extra1		= SYSCTL_ONE,
	},
+#ifdef CONFIG_SCHED_TASK_RELATIONSHIP
+	{
+		.procname	= "numa_rship_ms",
+		.data		= &net_numa_rship_jiffies,
+		.maxlen		= sizeof(unsigned long),
+		.mode		= 0644,
+		.proc_handler	= proc_doulongvec_ms_jiffies_minmax,
+		.extra1		= &net_numa_rship_ms_min,
+		.extra2		= &net_numa_rship_ms_max,
+	},
+#endif
	{ }
 };
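The knob is exposed in milliseconds and clamped to [100ms, 100s] by extra1/extra2; the stored value is in jiffies. A userspace sketch of raising the reporting interval to one second:

#include <fcntl.h>
#include <unistd.h>

int set_numa_rship_ms(void)
{
	int fd = open("/proc/sys/net/core/numa_rship_ms", O_WRONLY);
	int ret = (fd >= 0 && write(fd, "1000", 4) == 4) ? 0 : -1;

	if (fd >= 0)
		close(fd);
	return ret;
}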
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 830d6b2039f57778e1ccf2a4c2c2c53879a87489..0f13dc16773062dc06625a7228bd831e72028ede 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -279,6 +279,7 @@
 #include
 #include
 #include
+#include <net/net_rship.h>
 
 DEFINE_PER_CPU(unsigned int, tcp_orphan_count);
 EXPORT_PER_CPU_SYMBOL_GPL(tcp_orphan_count);
@@ -884,6 +885,7 @@ struct sk_buff *sk_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp,
			INIT_LIST_HEAD(&skb->tcp_tsorted_anchor);
			skb_shinfo(skb)->tx_flags = 0;
			memset(TCP_SKB_CB(skb), 0, sizeof(struct tcp_skb_cb));
+			net_rship_skb_clear(skb);
			return skb;
		}
	}
@@ -1321,6 +1323,8 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size)
			if (!skb)
				goto wait_for_space;
 
+			net_rship_record_sendmsginfo(skb, sk);
+
			process_backlog++;
			skb->ip_summed = CHECKSUM_PARTIAL;
 
@@ -2367,6 +2371,8 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock,
		if (used + offset < skb->len)
			continue;
 
+		net_rship_tcp_recvmsg(sk, skb);
+
		if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
			goto found_fin_ok;
		if (!(flags & MSG_PEEK))
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index d9b50a3addee6f9e4a88d195e36967802a60d032..762f2009d61d14399c65d41f9f9e26ed995c0e83 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -39,6 +39,7 @@
 
 #include
 #include
+#include <net/net_rship.h>
 #include
 #include
 
@@ -1196,6 +1197,8 @@ void tcp_wfree(struct sk_buff *skb)
	 */
	WARN_ON(refcount_sub_and_test(skb->truesize - 1, &sk->sk_wmem_alloc));
 
+	net_rship_send_nic_node(sk, skb);
+
	/* If this softirq is serviced by ksoftirqd, we are likely under stress.
	 * Wait until our queues (qdisc + devices) are drained.
	 * This gives :
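A summary of where the hooks fire along the TCP send and receive paths, condensed from the hunks above (comment only):

/* Hook placement summary:
 *
 *   tcp_sendmsg_locked()         -> net_rship_record_sendmsginfo()   tag skb/sk with tid
 *   xmit_one()                   -> net_rship_skb_record_dev_numa_node()
 *   tcp_wfree()                  -> net_rship_send_nic_node()        GNET_SEND_NIC_NODE
 *   __netdev_alloc_skb()/napi    -> net_rship_skb_record_dev_rxinfo()
 *   tcp_recvmsg()                -> net_rship_tcp_recvmsg()          GNET_TCP_RECVMSG +
 *                                                                    GNET_RCV_NIC_NODE
 *   sk_setup_caps()              -> net_rship_sk_dst_set()           GNET_SK_DST_SET
 */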
diff --git a/tools/bpf/bpftool/prog.c b/tools/bpf/bpftool/prog.c
index 2c85586ec224f1e6a1912898b613cd071342f5ee..22be05e8dbb40eced837960950affe8d3180e265 100644
--- a/tools/bpf/bpftool/prog.c
+++ b/tools/bpf/bpftool/prog.c
@@ -65,6 +65,7 @@ const char * const prog_type_name[] = {
	[BPF_PROG_TYPE_LSM]			= "lsm",
	[BPF_PROG_TYPE_SK_LOOKUP]		= "sk_lookup",
	[BPF_PROG_TYPE_SCHED]			= "sched",
+	[BPF_PROG_TYPE_NET_GLOBAL]		= "gnet",
 };
 
 const size_t prog_type_name_size = ARRAY_SIZE(prog_type_name);
@@ -79,6 +80,10 @@ static const char * const attach_type_strings[] = {
	[BPF_SK_SKB_STREAM_VERDICT]	= "stream_verdict",
	[BPF_SK_MSG_VERDICT]		= "msg_verdict",
	[BPF_FLOW_DISSECTOR]		= "flow_dissector",
+	[BPF_GNET_TCP_RECVMSG]		= "gnet_tcp_recvmsg",
+	[BPF_GNET_SK_DST_SET]		= "gnet_sk_dst_set",
+	[BPF_GNET_RCV_NIC_NODE]		= "gnet_rcv_nic_node",
+	[BPF_GNET_SEND_NIC_NODE]	= "gnet_send_nic_node",
	[__MAX_BPF_ATTACH_TYPE]		= NULL,
 };
 
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 5a153a1a8f18a4758864366630384d84b48b1eb0..254b5118921dcdde5e6cf118c7c7592e0ff26dfe 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -201,6 +201,7 @@ enum bpf_prog_type {
	BPF_PROG_TYPE_SK_LOOKUP,
 #ifndef __GENKSYMS__
	BPF_PROG_TYPE_SCHED,
+	BPF_PROG_TYPE_NET_GLOBAL,
 #endif
 };
 
@@ -245,6 +246,10 @@ enum bpf_attach_type {
	BPF_XDP,
 #ifndef __GENKSYMS__
	BPF_SCHED,
+	BPF_GNET_TCP_RECVMSG,
+	BPF_GNET_SK_DST_SET,
+	BPF_GNET_RCV_NIC_NODE,
+	BPF_GNET_SEND_NIC_NODE,
 #endif
	__MAX_BPF_ATTACH_TYPE
 };
 
@@ -3920,6 +3925,12 @@ union bpf_attr {
  *		get resource statistics of *nid* and store in *ctx*.
  *	Return
  *		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_sched_net_rship_submit(void *buf, size_t sz, u64 flags)
+ *	Description
+ *		Submit network relationship information to the scheduler subsystem.
+ *	Return
+ *		0 on success, or a negative error in case of failure.
  */
 #define __BPF_FUNC_MAPPER(FN)		\
	FN(unspec),			\
@@ -4096,6 +4107,7 @@ union bpf_attr {
	FN(get_task_relationship_stats),\
	FN(sched_set_curr_preferred_node),\
	FN(get_node_stats),		\
+	FN(sched_net_rship_submit),	\
	/* */
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
@@ -4382,6 +4394,10 @@ struct bpf_sock {
	__u32 dst_ip6[4];
	__u32 state;
	__s32 rx_queue_mapping;
+	__s32 sk_send_tid;
+	__s32 sk_peer_tid;
+	__u64 rcv_tid_bytes;
+	__u64 rcv_numa_node_bytes;
 };
 
 struct bpf_tcp_sock {
@@ -5250,4 +5266,15 @@ enum {
	BTF_F_ZERO	=	(1ULL << 3),
 };
 
+struct bpf_gnet_ctx {
+	__bpf_md_ptr(struct bpf_sock *, sk);
+	int curr_tid;
+	int peer_tid;
+	int numa_node;
+	__u64 rxtx_bytes;
+	int rx_dev_idx;
+	int rx_dev_queue_idx;
+	__u64 rx_dev_netns_cookie;
+};
+
 #endif /* _UAPI__LINUX_BPF_H__ */
diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
index 2838812422227ca0d5f9398afa65e6a705a75630..b7f71d2d7d53f1ae4974050df9b8fadb4a927c2b 100644
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -8494,6 +8494,14 @@ static const struct bpf_sec_def section_defs[] = {
	BPF_PROG_SEC("struct_ops",		BPF_PROG_TYPE_STRUCT_OPS),
	BPF_EAPROG_SEC("sk_lookup/",		BPF_PROG_TYPE_SK_LOOKUP,
						BPF_SK_LOOKUP),
+	BPF_EAPROG_SEC("gnet/tcp_recvmsg",	BPF_PROG_TYPE_NET_GLOBAL,
+						BPF_GNET_TCP_RECVMSG),
+	BPF_EAPROG_SEC("gnet/sk_dst_set",	BPF_PROG_TYPE_NET_GLOBAL,
+						BPF_GNET_SK_DST_SET),
+	BPF_EAPROG_SEC("gnet/rcv_nic_node",	BPF_PROG_TYPE_NET_GLOBAL,
+						BPF_GNET_RCV_NIC_NODE),
+	BPF_EAPROG_SEC("gnet/send_nic_node",	BPF_PROG_TYPE_NET_GLOBAL,
+						BPF_GNET_SEND_NIC_NODE),
 };
 
 #undef BPF_PROG_SEC_IMPL
diff --git a/tools/lib/bpf/libbpf_probes.c b/tools/lib/bpf/libbpf_probes.c
index 13393f0eab25c892baef51e61b1e832282d1dd8c..73aef4467823b725f1488c05b88d00cdf90093df 100644
--- a/tools/lib/bpf/libbpf_probes.c
+++ b/tools/lib/bpf/libbpf_probes.c
@@ -111,6 +111,7 @@ probe_load(enum bpf_prog_type prog_type, const struct bpf_insn *insns,
	case BPF_PROG_TYPE_STRUCT_OPS:
	case BPF_PROG_TYPE_EXT:
	case BPF_PROG_TYPE_LSM:
+	case BPF_PROG_TYPE_NET_GLOBAL:
	default:
		break;
	}