diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h index 4085765c337054b5770043d327653aebb88843db..bb7fd376a8c5fe5ec4e6e3f819729165e60ca1c6 100644 --- a/include/net/netfilter/nf_conntrack.h +++ b/include/net/netfilter/nf_conntrack.h @@ -377,4 +377,10 @@ int nf_ct_handle_fragments(struct net *net, struct sk_buff *skb, #define MODULE_ALIAS_NFCT_HELPER(helper) \ MODULE_ALIAS("nfct-helper-" helper) +#if IS_ENABLED(CONFIG_NETACC_TERRACE) +typedef int (*bpf_getorigdst_opt_func)(struct sock *sk, int optname, + void *optval, int *optlen, int dir); +extern bpf_getorigdst_opt_func bpf_getorigdst_opt; +#endif + #endif /* _NF_CONNTRACK_H */ diff --git a/include/net/sock.h b/include/net/sock.h index 7753354d59c0b30d0d482fdae7ae108b89dfdeb4..873b81ceee5f2e4740243ac0e7c0d3b9ab290842 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -300,6 +300,7 @@ struct sk_filter; * @sk_ack_backlog: current listen backlog * @sk_max_ack_backlog: listen backlog set in listen() * @sk_uid: user id of owner + * @sk_gid: group id of owner * @sk_prefer_busy_poll: prefer busypolling over softirq processing * @sk_busy_poll_budget: napi processing budget when busypolling * @sk_priority: %SO_PRIORITY setting @@ -545,6 +546,13 @@ struct sock { struct rcu_head sk_rcu; netns_tracker ns_tracker; struct hlist_node sk_bind2_node; + +#if IS_ENABLED(CONFIG_NETACC_TERRACE) + union { + kgid_t sk_gid; + u64 sk_gid_padding; + }; +#endif }; enum sk_pacing { @@ -2116,6 +2124,9 @@ static inline void sock_graft(struct sock *sk, struct socket *parent) parent->sk = sk; sk_set_socket(sk, parent); sk->sk_uid = SOCK_INODE(parent)->i_uid; +#if IS_ENABLED(CONFIG_NETACC_TERRACE) + sk->sk_gid = SOCK_INODE(parent)->i_gid; +#endif security_sock_graft(sk, parent); write_unlock_bh(&sk->sk_callback_lock); } @@ -2129,6 +2140,13 @@ static inline kuid_t sock_net_uid(const struct net *net, const struct sock *sk) return sk ? sk->sk_uid : make_kuid(net->user_ns, 0); } +#if IS_ENABLED(CONFIG_NETACC_TERRACE) +static inline kgid_t sock_net_gid(const struct net *net, const struct sock *sk) +{ + return sk ? sk->sk_gid : make_kgid(net->user_ns, 0); +} +#endif + static inline u32 net_tx_rndhash(void) { u32 v = get_random_u32(); diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 4924f0cde1bcacd9ee3419aec56dad680094637d..5cd5752e372ada33badab8c585ed82b6d6f482e9 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -5642,6 +5642,17 @@ union bpf_attr { * 0 on success. * * **-ENOENT** if the bpf_local_storage cannot be found. + * u64 bpf_get_sockops_uid_gid(void *sockops) + * Description + * Get sock's uid and gid + * Return + * A 64-bit integer containing the current GID and UID, and + * created as such: *current_gid* **<< 32 \|** *current_uid*. + * int bpf_sk_original_addr(void *bpf_socket, int optname, char *optval, int optlen) + * Description + * Get Ipv4 origdst or replysrc. Works with IPv4. + * Return + * 0 on success, or a negative error in case of failure. */ #define ___BPF_FUNC_MAPPER(FN, ctx...) \ FN(unspec, 0, ##ctx) \ @@ -5856,6 +5867,8 @@ union bpf_attr { FN(user_ringbuf_drain, 209, ##ctx) \ FN(cgrp_storage_get, 210, ##ctx) \ FN(cgrp_storage_delete, 211, ##ctx) \ + FN(get_sockops_uid_gid, 212, ##ctx) \ + FN(sk_original_addr, 213, ##ctx) \ /* */ /* backwards-compatibility macros for users of __BPF_FUNC_MAPPER that don't diff --git a/include/uapi/linux/netfilter_ipv4.h b/include/uapi/linux/netfilter_ipv4.h index 155e77d6a42d0d60f75a4e9bc1fa7e7c2f074fdf..00e78cc2782ba3c3dc0cac222cf9abbbf25eee38 100644 --- a/include/uapi/linux/netfilter_ipv4.h +++ b/include/uapi/linux/netfilter_ipv4.h @@ -50,6 +50,8 @@ enum nf_ip_hook_priorities { /* 2.2 firewalling (+ masq) went from 64 through 76 */ /* 2.4 firewalling went 64 through 67. */ #define SO_ORIGINAL_DST 80 +#define BPF_SO_ORIGINAL_DST 800 +#define BPF_SO_REPLY_SRC 801 #endif /* _UAPI__LINUX_IP_NETFILTER_H */ diff --git a/net/Kconfig b/net/Kconfig index 7fbd17e188a52e3c8c39e272296b2e4ce8ab0aaf..c976c72de26b1123250a5e1c5475f497086f3dcc 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -514,4 +514,11 @@ config NETACC_BPF help Network acceleration in bpf. +config NETACC_TERRACE + bool "Terrace Service Acceleration" + default y + help + Accelerating intra-node communication on the data plane of the + Terrace service. + endif # if NET diff --git a/net/core/filter.c b/net/core/filter.c index d6905153cba26305d9d13f960a3bb30f565c3214..f9cd96b99c0622c3bc9383066e3a63394335fc77 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -5474,6 +5474,78 @@ static const struct bpf_func_proto bpf_sock_addr_setsockopt_proto = { .arg5_type = ARG_CONST_SIZE, }; +#if IS_ENABLED(CONFIG_NETACC_TERRACE) +BPF_CALL_1(bpf_get_sockops_uid_gid, struct bpf_sock_ops_kern *, bpf_sock) +{ + struct sock *sk = bpf_sock->sk; + kuid_t uid; + kgid_t gid; + + if (!sk || !sk_fullsock(sk)) + return -EINVAL; + + uid = sock_net_uid(sock_net(sk), sk); + gid = sock_net_gid(sock_net(sk), sk); + + return ((u64)from_kgid_munged(sock_net(sk)->user_ns, gid)) << 32 | + from_kuid_munged(sock_net(sk)->user_ns, uid); +} + +static const struct bpf_func_proto bpf_get_sockops_uid_gid_proto = { + .func = bpf_get_sockops_uid_gid, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, +}; + +#include +#include + +bpf_getorigdst_opt_func bpf_getorigdst_opt; +EXPORT_SYMBOL(bpf_getorigdst_opt); + +BPF_CALL_4(bpf_sk_original_addr, struct bpf_sock_ops_kern *, bpf_sock, + int, optname, char *, optval, int, optlen) +{ + struct sock *sk = bpf_sock->sk; + int ret = -EINVAL; + + if (!sk_fullsock(sk)) + goto err_clear; + + if (optname != BPF_SO_ORIGINAL_DST && optname != BPF_SO_REPLY_SRC) + goto err_clear; + + if (!bpf_getorigdst_opt) + goto err_clear; +#if IS_ENABLED(CONFIG_NF_CONNTRACK) + if (optname == BPF_SO_ORIGINAL_DST) + ret = bpf_getorigdst_opt(sk, optname, optval, &optlen, + IP_CT_DIR_ORIGINAL); + else if (optname == BPF_SO_REPLY_SRC) + ret = bpf_getorigdst_opt(sk, optname, optval, &optlen, + IP_CT_DIR_REPLY); + if (ret < 0) + goto err_clear; + + return 0; +#endif +err_clear: + memset(optval, 0, optlen); + return ret; +} + +static const struct bpf_func_proto bpf_sk_original_addr_proto = { + .func = bpf_sk_original_addr, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_PTR_TO_UNINIT_MEM, + .arg4_type = ARG_CONST_SIZE, +}; +#endif + BPF_CALL_5(bpf_sock_addr_getsockopt, struct bpf_sock_addr_kern *, ctx, int, level, int, optname, char *, optval, int, optlen) { @@ -8205,6 +8277,12 @@ sock_ops_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_sk_storage_delete_proto; case BPF_FUNC_get_netns_cookie: return &bpf_get_netns_cookie_sock_ops_proto; +#if IS_ENABLED(CONFIG_NETACC_TERRACE) + case BPF_FUNC_get_sockops_uid_gid: + return &bpf_get_sockops_uid_gid_proto; + case BPF_FUNC_sk_original_addr: + return &bpf_sk_original_addr_proto; +#endif #ifdef CONFIG_INET case BPF_FUNC_load_hdr_opt: return &bpf_sock_ops_load_hdr_opt_proto; @@ -8608,6 +8686,9 @@ static bool __sock_filter_check_attach_type(int off, case bpf_ctx_range(struct bpf_sock, src_ip4): switch (attach_type) { case BPF_CGROUP_INET4_POST_BIND: +#if IS_ENABLED(CONFIG_NETACC_TERRACE) + case BPF_CGROUP_INET_SOCK_RELEASE: +#endif goto read_only; default: return false; @@ -8623,6 +8704,9 @@ static bool __sock_filter_check_attach_type(int off, switch (attach_type) { case BPF_CGROUP_INET4_POST_BIND: case BPF_CGROUP_INET6_POST_BIND: +#if IS_ENABLED(CONFIG_NETACC_TERRACE) + case BPF_CGROUP_INET_SOCK_RELEASE: +#endif goto read_only; default: return false; diff --git a/net/core/skmsg.c b/net/core/skmsg.c index 93ecfceac1bc49bd843728518215ade5ced374a5..54a8300b4b3e45efeb326d15ff05250c4af73f76 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -444,8 +444,12 @@ int sk_msg_recvmsg(struct sock *sk, struct sk_psock *psock, struct msghdr *msg, if (likely(!peek)) { sge->offset += copy; sge->length -= copy; - if (!msg_rx->skb) + if (!msg_rx->skb) { +#if IS_ENABLED(CONFIG_NETACC_TERRACE) + atomic_sub(copy, &sk->sk_rmem_alloc); +#endif sk_mem_uncharge(sk, copy); + } msg_rx->sg.size -= copy; if (!sge->length) { @@ -771,6 +775,10 @@ static void __sk_psock_purge_ingress_msg(struct sk_psock *psock) list_for_each_entry_safe(msg, tmp, &psock->ingress_msg, list) { list_del(&msg->list); +#if IS_ENABLED(CONFIG_NETACC_TERRACE) + if (!msg->skb) + atomic_sub(msg->sg.size, &psock->sk->sk_rmem_alloc); +#endif sk_msg_free(psock->sk, msg); kfree(msg); } diff --git a/net/core/sock.c b/net/core/sock.c index bfaf47b3f3c7cdca804a6aa164bfffe857b61b0e..c77326a07906cf695d63328b733c53c0fd9bab73 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -3426,8 +3426,14 @@ void sock_init_data_uid(struct socket *sock, struct sock *sk, kuid_t uid) sk->sk_type = sock->type; RCU_INIT_POINTER(sk->sk_wq, &sock->wq); sock->sk = sk; +#if IS_ENABLED(CONFIG_NETACC_TERRACE) + sk->sk_gid = SOCK_INODE(sock)->i_gid; +#endif } else { RCU_INIT_POINTER(sk->sk_wq, NULL); +#if IS_ENABLED(CONFIG_NETACC_TERRACE) + sk->sk_gid = make_kgid(sock_net(sk)->user_ns, 0); +#endif } sk->sk_uid = uid; diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c index 53b0d62fd2c2dbfcdd3b65bdc9f2b839a6f51687..835e65ae361e9d1494fa9f67859a241899ac1bf5 100644 --- a/net/ipv4/tcp_bpf.c +++ b/net/ipv4/tcp_bpf.c @@ -43,6 +43,13 @@ static int bpf_tcp_ingress(struct sock *sk, struct sk_psock *psock, return -ENOMEM; lock_sock(sk); +#if IS_ENABLED(CONFIG_NETACC_TERRACE) + if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) { + kfree(tmp); + release_sock(sk); + return -EAGAIN; + } +#endif tmp->sg.start = msg->sg.start; i = msg->sg.start; do { @@ -75,6 +82,9 @@ static int bpf_tcp_ingress(struct sock *sk, struct sk_psock *psock, if (!ret) { msg->sg.start = i; sk_psock_queue_msg(psock, tmp); +#if IS_ENABLED(CONFIG_NETACC_TERRACE) + atomic_add(tmp->sg.size, &sk->sk_rmem_alloc); +#endif sk_psock_data_ready(sk, psock); } else { sk_msg_free(sk, tmp); diff --git a/net/netfilter/nf_conntrack_proto.c b/net/netfilter/nf_conntrack_proto.c index c928ff63b10e4755b1fe19c4aad04e2f869a318f..37f1ae00497a5d429d65f352e77316a3d2d253c2 100644 --- a/net/netfilter/nf_conntrack_proto.c +++ b/net/netfilter/nf_conntrack_proto.c @@ -311,6 +311,69 @@ getorigdst(struct sock *sk, int optval, void __user *user, int *len) return -ENOENT; } +#if IS_ENABLED(CONFIG_NETACC_TERRACE) +static int +bpf_getorigdst_impl(struct sock *sk, int optval, void *user, int *len, int dir) +{ + const struct inet_sock *inet = inet_sk(sk); + const struct nf_conntrack_tuple_hash *h; + struct nf_conntrack_tuple tuple; + + memset(&tuple, 0, sizeof(tuple)); + + tuple.src.u3.ip = inet->inet_rcv_saddr; + tuple.src.u.tcp.port = inet->inet_sport; + tuple.dst.u3.ip = inet->inet_daddr; + tuple.dst.u.tcp.port = inet->inet_dport; + tuple.src.l3num = PF_INET; + tuple.dst.protonum = sk->sk_protocol; + + /* We only do TCP and SCTP at the moment: is there a better way? */ + if (tuple.dst.protonum != IPPROTO_TCP && + tuple.dst.protonum != IPPROTO_SCTP) { + pr_debug("SO_ORIGINAL_DST: Not a TCP/SCTP socket\n"); + return -ENOPROTOOPT; + } + + if ((unsigned int)*len < sizeof(struct sockaddr_in)) { + pr_debug("SO_ORIGINAL_DST: len %d not %zu\n", + *len, sizeof(struct sockaddr_in)); + return -EINVAL; + } + + h = nf_conntrack_find_get(sock_net(sk), &nf_ct_zone_dflt, &tuple); + if (h) { + struct sockaddr_in sin; + struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h); + + sin.sin_family = AF_INET; + if (dir == IP_CT_DIR_REPLY) { + sin.sin_port = ct->tuplehash[IP_CT_DIR_REPLY] + .tuple.src.u.tcp.port; + sin.sin_addr.s_addr = ct->tuplehash[IP_CT_DIR_REPLY] + .tuple.src.u3.ip; + } else { + sin.sin_port = ct->tuplehash[IP_CT_DIR_ORIGINAL] + .tuple.dst.u.tcp.port; + sin.sin_addr.s_addr = ct->tuplehash[IP_CT_DIR_ORIGINAL] + .tuple.dst.u3.ip; + } + memset(sin.sin_zero, 0, sizeof(sin.sin_zero)); + + pr_debug("SO_ORIGINAL_DST: %pI4 %u\n", + &sin.sin_addr.s_addr, ntohs(sin.sin_port)); + nf_ct_put(ct); + + memcpy(user, &sin, sizeof(sin)); + return 0; + } + pr_debug("SO_ORIGINAL_DST: Can't find %pI4/%u-%pI4/%u.\n", + &tuple.src.u3.ip, ntohs(tuple.src.u.tcp.port), + &tuple.dst.u3.ip, ntohs(tuple.dst.u.tcp.port)); + return -ENOENT; +} +#endif + static struct nf_sockopt_ops so_getorigdst = { .pf = PF_INET, .get_optmin = SO_ORIGINAL_DST, @@ -655,6 +718,10 @@ int nf_conntrack_proto_init(void) goto cleanup_sockopt; #endif +#if IS_ENABLED(CONFIG_NETACC_TERRACE) + bpf_getorigdst_opt = bpf_getorigdst_impl; +#endif + return ret; #if IS_ENABLED(CONFIG_IPV6) @@ -666,6 +733,10 @@ int nf_conntrack_proto_init(void) void nf_conntrack_proto_fini(void) { +#if IS_ENABLED(CONFIG_NETACC_TERRACE) + bpf_getorigdst_opt = NULL; +#endif + nf_unregister_sockopt(&so_getorigdst); #if IS_ENABLED(CONFIG_IPV6) nf_unregister_sockopt(&so_getorigdst6); diff --git a/net/socket.c b/net/socket.c index c4a6f55329552d9c80a41b1b691ce305c22f09f8..84d42997abaf25272636ea987ffc4b55009694c4 100644 --- a/net/socket.c +++ b/net/socket.c @@ -604,10 +604,14 @@ static int sockfs_setattr(struct mnt_idmap *idmap, if (!err && (iattr->ia_valid & ATTR_UID)) { struct socket *sock = SOCKET_I(d_inode(dentry)); - if (sock->sk) + if (sock->sk) { sock->sk->sk_uid = iattr->ia_uid; - else +#if IS_ENABLED(CONFIG_NETACC_TERRACE) + sock->sk->sk_gid = iattr->ia_gid; +#endif + } else { err = -ENOENT; + } } return err; diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 4924f0cde1bcacd9ee3419aec56dad680094637d..5cd5752e372ada33badab8c585ed82b6d6f482e9 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -5642,6 +5642,17 @@ union bpf_attr { * 0 on success. * * **-ENOENT** if the bpf_local_storage cannot be found. + * u64 bpf_get_sockops_uid_gid(void *sockops) + * Description + * Get sock's uid and gid + * Return + * A 64-bit integer containing the current GID and UID, and + * created as such: *current_gid* **<< 32 \|** *current_uid*. + * int bpf_sk_original_addr(void *bpf_socket, int optname, char *optval, int optlen) + * Description + * Get Ipv4 origdst or replysrc. Works with IPv4. + * Return + * 0 on success, or a negative error in case of failure. */ #define ___BPF_FUNC_MAPPER(FN, ctx...) \ FN(unspec, 0, ##ctx) \ @@ -5856,6 +5867,8 @@ union bpf_attr { FN(user_ringbuf_drain, 209, ##ctx) \ FN(cgrp_storage_get, 210, ##ctx) \ FN(cgrp_storage_delete, 211, ##ctx) \ + FN(get_sockops_uid_gid, 212, ##ctx) \ + FN(sk_original_addr, 213, ##ctx) \ /* */ /* backwards-compatibility macros for users of __BPF_FUNC_MAPPER that don't