From dbd9755aa4fac221e1b0504ffabbef3b3c0bdbac Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 22 Sep 2023 03:42:17 +0000 Subject: [PATCH 1/5] inet: lockless getsockopt(IP_OPTIONS) mainline inclusion from mainline-v6.7-rc1 commit a4725d0d893599253a4bb283fdabdd4a66d9451d category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I9VYQ9 CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/next/linux-next.git/commit/?id=a4725d0d893599253a4bb283fdabdd4a66d9451d -------------------------------- inet->inet_opt being RCU protected, we can use RCU instead of locking the socket. Signed-off-by: Eric Dumazet Reviewed-by: David Ahern Signed-off-by: David S. Miller Reviewed-by: Jackie Liu Signed-off-by: Geliang Tang --- net/ipv4/ip_sockglue.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index 7718ee5c8d5b..0e307fe911fa 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -1600,27 +1600,20 @@ int do_ip_getsockopt(struct sock *sk, int level, int optname, case IP_TOS: val = READ_ONCE(inet->tos); goto copyval; - } - - if (needs_rtnl) - rtnl_lock(); - sockopt_lock_sock(sk); - - switch (optname) { case IP_OPTIONS: { unsigned char optbuf[sizeof(struct ip_options)+40]; struct ip_options *opt = (struct ip_options *)optbuf; struct ip_options_rcu *inet_opt; - inet_opt = rcu_dereference_protected(inet->inet_opt, - lockdep_sock_is_held(sk)); + rcu_read_lock(); + inet_opt = rcu_dereference(inet->inet_opt); opt->optlen = 0; if (inet_opt) memcpy(optbuf, &inet_opt->opt, sizeof(struct ip_options) + inet_opt->opt.optlen); - sockopt_release_sock(sk); + rcu_read_unlock(); if (opt->optlen == 0) { len = 0; @@ -1636,6 +1629,13 @@ int do_ip_getsockopt(struct sock *sk, int level, int optname, return -EFAULT; return 0; } + } + + if (needs_rtnl) + rtnl_lock(); + sockopt_lock_sock(sk); + + switch (optname) { case IP_MTU: { struct dst_entry *dst; -- Gitee From 
a18b2d1ce14b4761f2a04a2cbf390eb104985b62 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 22 Sep 2023 03:42:18 +0000 Subject: [PATCH 2/5] inet: lockless getsockopt(IP_MTU) mainline inclusion from mainline-v6.7-rc1 commit 3523bc91e4b4da39ccf18a0252d13108877ece0a category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I9VYQ9 CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/next/linux-next.git/commit/?id=3523bc91e4b4da39ccf18a0252d13108877ece0a -------------------------------- sk_dst_get() does not require socket lock. Signed-off-by: Eric Dumazet Reviewed-by: David Ahern Signed-off-by: David S. Miller Reviewed-by: Jackie Liu Signed-off-by: Geliang Tang --- net/ipv4/ip_sockglue.c | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index 0e307fe911fa..77739a82cb3d 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -1629,13 +1629,6 @@ int do_ip_getsockopt(struct sock *sk, int level, int optname, return -EFAULT; return 0; } - } - - if (needs_rtnl) - rtnl_lock(); - sockopt_lock_sock(sk); - - switch (optname) { case IP_MTU: { struct dst_entry *dst; @@ -1645,12 +1638,17 @@ int do_ip_getsockopt(struct sock *sk, int level, int optname, val = dst_mtu(dst); dst_release(dst); } - if (!val) { - sockopt_release_sock(sk); + if (!val) return -ENOTCONN; - } - break; + goto copyval; + } } + + if (needs_rtnl) + rtnl_lock(); + sockopt_lock_sock(sk); + + switch (optname) { case IP_UNICAST_IF: val = (__force int)htonl((__u32) inet->uc_index); break; -- Gitee From 38bd05d3f08ae87103ab2a9593fbc5ba15495b51 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 22 Sep 2023 03:42:19 +0000 Subject: [PATCH 3/5] inet: implement lockless getsockopt(IP_UNICAST_IF) mainline inclusion from mainline-v6.7-rc1 commit 959d5c11601b2b337c364b2e3102d392365e3dd3 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I9VYQ9 CVE: NA Reference: 
https://git.kernel.org/pub/scm/linux/kernel/git/next/linux-next.git/commit/?id=959d5c11601b2b337c364b2e3102d392365e3dd3 -------------------------------- Add missing READ_ONCE() annotations when reading inet->uc_index Implementing getsockopt(IP_UNICAST_IF) locklessly seems possible, the setsockopt() part might not be possible at the moment. Signed-off-by: Eric Dumazet Reviewed-by: David Ahern Signed-off-by: David S. Miller Reviewed-by: Jackie Liu Signed-off-by: Geliang Tang --- net/ipv4/datagram.c | 2 +- net/ipv4/ip_sockglue.c | 10 +++++----- net/ipv4/ping.c | 2 +- net/ipv4/raw.c | 13 +++++++------ net/ipv4/udp.c | 12 +++++++----- 5 files changed, 21 insertions(+), 18 deletions(-) diff --git a/net/ipv4/datagram.c b/net/ipv4/datagram.c index cb5dbee9e018..1480e9ebdfef 100644 --- a/net/ipv4/datagram.c +++ b/net/ipv4/datagram.c @@ -43,7 +43,7 @@ int __ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len if (!saddr) saddr = inet->mc_addr; } else if (!oif) { - oif = inet->uc_index; + oif = READ_ONCE(inet->uc_index); } fl4 = &inet->cork.fl.u.ip4; rt = ip_route_connect(fl4, usin->sin_addr.s_addr, saddr, oif, diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index 77739a82cb3d..962d4a254f07 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -1120,7 +1120,7 @@ int do_ip_setsockopt(struct sock *sk, int level, int optname, ifindex = (__force int)ntohl((__force __be32)val); if (ifindex == 0) { - inet->uc_index = 0; + WRITE_ONCE(inet->uc_index, 0); err = 0; break; } @@ -1137,7 +1137,7 @@ int do_ip_setsockopt(struct sock *sk, int level, int optname, if (sk->sk_bound_dev_if && midx != sk->sk_bound_dev_if) break; - inet->uc_index = ifindex; + WRITE_ONCE(inet->uc_index, ifindex); err = 0; break; } @@ -1642,6 +1642,9 @@ int do_ip_getsockopt(struct sock *sk, int level, int optname, return -ENOTCONN; goto copyval; } + case IP_UNICAST_IF: + val = (__force int)htonl((__u32) READ_ONCE(inet->uc_index)); + goto copyval; } if (needs_rtnl) @@ 
-1649,9 +1652,6 @@ int do_ip_getsockopt(struct sock *sk, int level, int optname, sockopt_lock_sock(sk); switch (optname) { - case IP_UNICAST_IF: - val = (__force int)htonl((__u32) inet->uc_index); - break; case IP_MULTICAST_IF: { struct in_addr addr; diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c index d06cdac6b7f9..7066c3e0f410 100644 --- a/net/ipv4/ping.c +++ b/net/ipv4/ping.c @@ -777,7 +777,7 @@ static int ping_v4_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) if (!saddr) saddr = inet->mc_addr; } else if (!ipc.oif) - ipc.oif = inet->uc_index; + ipc.oif = READ_ONCE(inet->uc_index); flowi4_init_output(&fl4, ipc.oif, ipc.sockc.mark, tos, scope, sk->sk_protocol, inet_sk_flowi_flags(sk), faddr, diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index 8fba84f1bab0..632a0f09e9d6 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -483,7 +483,7 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) int free = 0; __be32 daddr; __be32 saddr; - int err; + int uc_index, err; struct ip_options_data opt_copy; struct raw_frag_vec rfv; int hdrincl; @@ -577,24 +577,25 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) tos = get_rttos(&ipc, inet); scope = ip_sendmsg_scope(inet, &ipc, msg); + uc_index = READ_ONCE(inet->uc_index); if (ipv4_is_multicast(daddr)) { if (!ipc.oif || netif_index_is_l3_master(sock_net(sk), ipc.oif)) ipc.oif = inet->mc_index; if (!saddr) saddr = inet->mc_addr; } else if (!ipc.oif) { - ipc.oif = inet->uc_index; - } else if (ipv4_is_lbcast(daddr) && inet->uc_index) { + ipc.oif = uc_index; + } else if (ipv4_is_lbcast(daddr) && uc_index) { /* oif is set, packet is to local broadcast * and uc_index is set. oif is most likely set * by sk_bound_dev_if. If uc_index != oif check if the * oif is an L3 master and uc_index is an L3 slave. * If so, we want to allow the send using the uc_index. 
*/ - if (ipc.oif != inet->uc_index && + if (ipc.oif != uc_index && ipc.oif == l3mdev_master_ifindex_by_index(sock_net(sk), - inet->uc_index)) { - ipc.oif = inet->uc_index; + uc_index)) { + ipc.oif = uc_index; } } diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index ed2566213e35..033ea0a12f0d 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1063,6 +1063,7 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) int (*getfrag)(void *, char *, int, int, int, struct sk_buff *); struct sk_buff *skb; struct ip_options_data opt_copy; + int uc_index; if (len > 0xFFFF) return -EMSGSIZE; @@ -1184,6 +1185,7 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) if (scope == RT_SCOPE_LINK) connected = 0; + uc_index = READ_ONCE(inet->uc_index); if (ipv4_is_multicast(daddr)) { if (!ipc.oif || netif_index_is_l3_master(sock_net(sk), ipc.oif)) ipc.oif = inet->mc_index; @@ -1191,18 +1193,18 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) saddr = inet->mc_addr; connected = 0; } else if (!ipc.oif) { - ipc.oif = inet->uc_index; - } else if (ipv4_is_lbcast(daddr) && inet->uc_index) { + ipc.oif = uc_index; + } else if (ipv4_is_lbcast(daddr) && uc_index) { /* oif is set, packet is to local broadcast and * uc_index is set. oif is most likely set * by sk_bound_dev_if. If uc_index != oif check if the * oif is an L3 master and uc_index is an L3 slave. * If so, we want to allow the send using the uc_index. 
*/ - if (ipc.oif != inet->uc_index && + if (ipc.oif != uc_index && ipc.oif == l3mdev_master_ifindex_by_index(sock_net(sk), - inet->uc_index)) { - ipc.oif = inet->uc_index; + uc_index)) { + ipc.oif = uc_index; } } -- Gitee From 406492d968dd5d0ad6b8334c09888536a6adddd5 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 22 Sep 2023 03:42:20 +0000 Subject: [PATCH 4/5] inet: lockless IP_PKTOPTIONS implementation mainline inclusion from mainline-v6.7-rc1 commit c4480eb5504c9771f935cbca58a3b874bdd36af8 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I9VYQ9 CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/next/linux-next.git/commit/?id=c4480eb5504c9771f935cbca58a3b874bdd36af8 -------------------------------- Current implementation is already lockless, because the socket lock is released before reading socket fields. Add missing READ_ONCE() annotations. Note that corresponding WRITE_ONCE() are needed; the order of the patches does not really matter. Signed-off-by: Eric Dumazet Reviewed-by: David Ahern Signed-off-by: David S. Miller Reviewed-by: Jackie Liu Signed-off-by: Geliang Tang --- net/ipv4/ip_sockglue.c | 76 ++++++++++++++++++++---------------------- 1 file changed, 37 insertions(+), 39 deletions(-) diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index 962d4a254f07..f0ce18f9f1fc 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -1642,6 +1642,43 @@ int do_ip_getsockopt(struct sock *sk, int level, int optname, return -ENOTCONN; goto copyval; } + case IP_PKTOPTIONS: + { + struct msghdr msg; + + if (sk->sk_type != SOCK_STREAM) + return -ENOPROTOOPT; + + if (optval.is_kernel) { + msg.msg_control_is_user = false; + msg.msg_control = optval.kernel; + } else { + msg.msg_control_is_user = true; + msg.msg_control_user = optval.user; + } + msg.msg_controllen = len; + msg.msg_flags = in_compat_syscall() ?
MSG_CMSG_COMPAT : 0; + + if (inet_test_bit(PKTINFO, sk)) { + struct in_pktinfo info; + + info.ipi_addr.s_addr = READ_ONCE(inet->inet_rcv_saddr); + info.ipi_spec_dst.s_addr = READ_ONCE(inet->inet_rcv_saddr); + info.ipi_ifindex = READ_ONCE(inet->mc_index); + put_cmsg(&msg, SOL_IP, IP_PKTINFO, sizeof(info), &info); + } + if (inet_test_bit(TTL, sk)) { + int hlim = READ_ONCE(inet->mc_ttl); + + put_cmsg(&msg, SOL_IP, IP_TTL, sizeof(hlim), &hlim); + } + if (inet_test_bit(TOS, sk)) { + int tos = READ_ONCE(inet->rcv_tos); + put_cmsg(&msg, SOL_IP, IP_TOS, sizeof(tos), &tos); + } + len -= msg.msg_controllen; + return copy_to_sockptr(optlen, &len, sizeof(int)); + } case IP_UNICAST_IF: val = (__force int)htonl((__u32) READ_ONCE(inet->uc_index)); goto copyval; @@ -1687,45 +1724,6 @@ int do_ip_getsockopt(struct sock *sk, int level, int optname, else err = ip_get_mcast_msfilter(sk, optval, optlen, len); goto out; - case IP_PKTOPTIONS: - { - struct msghdr msg; - - sockopt_release_sock(sk); - - if (sk->sk_type != SOCK_STREAM) - return -ENOPROTOOPT; - - if (optval.is_kernel) { - msg.msg_control_is_user = false; - msg.msg_control = optval.kernel; - } else { - msg.msg_control_is_user = true; - msg.msg_control_user = optval.user; - } - msg.msg_controllen = len; - msg.msg_flags = in_compat_syscall() ? 
MSG_CMSG_COMPAT : 0; - - if (inet_test_bit(PKTINFO, sk)) { - struct in_pktinfo info; - - info.ipi_addr.s_addr = inet->inet_rcv_saddr; - info.ipi_spec_dst.s_addr = inet->inet_rcv_saddr; - info.ipi_ifindex = inet->mc_index; - put_cmsg(&msg, SOL_IP, IP_PKTINFO, sizeof(info), &info); - } - if (inet_test_bit(TTL, sk)) { - int hlim = READ_ONCE(inet->mc_ttl); - - put_cmsg(&msg, SOL_IP, IP_TTL, sizeof(hlim), &hlim); - } - if (inet_test_bit(TOS, sk)) { - int tos = inet->rcv_tos; - put_cmsg(&msg, SOL_IP, IP_TOS, sizeof(tos), &tos); - } - len -= msg.msg_controllen; - return copy_to_sockptr(optlen, &len, sizeof(int)); - } case IP_LOCAL_PORT_RANGE: val = inet->local_port_range.hi << 16 | inet->local_port_range.lo; break; -- Gitee From a6bb70f52cd2151ff7fd8d68adf411776ace2c5d Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 22 Sep 2023 03:42:21 +0000 Subject: [PATCH 5/5] inet: implement lockless getsockopt(IP_MULTICAST_IF) mainline inclusion from mainline-v6.7-rc1 commit 02715925222c137f418ecac417b68c7801e8f729 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I9VYQ9 CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/next/linux-next.git/commit/?id=02715925222c137f418ecac417b68c7801e8f729 -------------------------------- Add missing annotations to inet->mc_index and inet->mc_addr to fix data-races. getsockopt(IP_MULTICAST_IF) can be lockless. setsockopt() side is left for later. Signed-off-by: Eric Dumazet Reviewed-by: David Ahern Signed-off-by: David S. 
Miller Reviewed-by: Jackie Liu Signed-off-by: Geliang Tang --- net/ipv4/datagram.c | 4 ++-- net/ipv4/ip_sockglue.c | 25 ++++++++++++------------- net/ipv4/ping.c | 4 ++-- net/ipv4/raw.c | 4 ++-- net/ipv4/udp.c | 4 ++-- 5 files changed, 20 insertions(+), 21 deletions(-) diff --git a/net/ipv4/datagram.c b/net/ipv4/datagram.c index 1480e9ebdfef..2cc50cbfc2a3 100644 --- a/net/ipv4/datagram.c +++ b/net/ipv4/datagram.c @@ -39,9 +39,9 @@ int __ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len saddr = inet->inet_saddr; if (ipv4_is_multicast(usin->sin_addr.s_addr)) { if (!oif || netif_index_is_l3_master(sock_net(sk), oif)) - oif = inet->mc_index; + oif = READ_ONCE(inet->mc_index); if (!saddr) - saddr = inet->mc_addr; + saddr = READ_ONCE(inet->mc_addr); } else if (!oif) { oif = READ_ONCE(inet->uc_index); } diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index f0ce18f9f1fc..8a88e705d827 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -1175,8 +1175,8 @@ int do_ip_setsockopt(struct sock *sk, int level, int optname, if (!mreq.imr_ifindex) { if (mreq.imr_address.s_addr == htonl(INADDR_ANY)) { - inet->mc_index = 0; - inet->mc_addr = 0; + WRITE_ONCE(inet->mc_index, 0); + WRITE_ONCE(inet->mc_addr, 0); err = 0; break; } @@ -1201,8 +1201,8 @@ int do_ip_setsockopt(struct sock *sk, int level, int optname, midx != sk->sk_bound_dev_if) break; - inet->mc_index = mreq.imr_ifindex; - inet->mc_addr = mreq.imr_address.s_addr; + WRITE_ONCE(inet->mc_index, mreq.imr_ifindex); + WRITE_ONCE(inet->mc_addr, mreq.imr_address.s_addr); err = 0; break; } @@ -1682,19 +1682,11 @@ int do_ip_getsockopt(struct sock *sk, int level, int optname, case IP_UNICAST_IF: val = (__force int)htonl((__u32) READ_ONCE(inet->uc_index)); goto copyval; - } - - if (needs_rtnl) - rtnl_lock(); - sockopt_lock_sock(sk); - - switch (optname) { case IP_MULTICAST_IF: { struct in_addr addr; len = min_t(unsigned int, len, sizeof(struct in_addr)); - addr.s_addr = inet->mc_addr; 
- sockopt_release_sock(sk); + addr.s_addr = READ_ONCE(inet->mc_addr); if (copy_to_sockptr(optlen, &len, sizeof(int))) return -EFAULT; @@ -1702,6 +1694,13 @@ int do_ip_getsockopt(struct sock *sk, int level, int optname, return -EFAULT; return 0; } + } + + if (needs_rtnl) + rtnl_lock(); + sockopt_lock_sock(sk); + + switch (optname) { case IP_MSFILTER: { struct ip_msfilter msf; diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c index 7066c3e0f410..8652cdba4848 100644 --- a/net/ipv4/ping.c +++ b/net/ipv4/ping.c @@ -773,9 +773,9 @@ static int ping_v4_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) if (ipv4_is_multicast(daddr)) { if (!ipc.oif || netif_index_is_l3_master(sock_net(sk), ipc.oif)) - ipc.oif = inet->mc_index; + ipc.oif = READ_ONCE(inet->mc_index); if (!saddr) - saddr = inet->mc_addr; + saddr = READ_ONCE(inet->mc_addr); } else if (!ipc.oif) ipc.oif = READ_ONCE(inet->uc_index); diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index 632a0f09e9d6..a1d8218fa1a2 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -580,9 +580,9 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) uc_index = READ_ONCE(inet->uc_index); if (ipv4_is_multicast(daddr)) { if (!ipc.oif || netif_index_is_l3_master(sock_net(sk), ipc.oif)) - ipc.oif = inet->mc_index; + ipc.oif = READ_ONCE(inet->mc_index); if (!saddr) - saddr = inet->mc_addr; + saddr = READ_ONCE(inet->mc_addr); } else if (!ipc.oif) { ipc.oif = uc_index; } else if (ipv4_is_lbcast(daddr) && uc_index) { diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 033ea0a12f0d..ec07d9429ead 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1188,9 +1188,9 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) uc_index = READ_ONCE(inet->uc_index); if (ipv4_is_multicast(daddr)) { if (!ipc.oif || netif_index_is_l3_master(sock_net(sk), ipc.oif)) - ipc.oif = inet->mc_index; + ipc.oif = READ_ONCE(inet->mc_index); if (!saddr) - saddr = inet->mc_addr; + saddr = READ_ONCE(inet->mc_addr); connected = 0; } else if 
(!ipc.oif) { ipc.oif = uc_index; -- Gitee