From fd5c1514ebe73e53d6d1861e10dab8cc370dcd97 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 7 Sep 2023 16:14:38 +0800 Subject: [PATCH 01/29] netlink: fix potential deadlock in netlink_set_err() stable inclusion from stable-v4.19.291 commit 4b9adb8d4a62ff7608d4a7d4eb42036a88f30980 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I7ZELT CVE: NA -------------------------------- [ Upstream commit 8d61f926d42045961e6b65191c09e3678d86a9cf ] syzbot reported a possible deadlock in netlink_set_err() [1] A similar issue was fixed in commit 1d482e666b8e ("netlink: disable IRQs for netlink_lock_table()") in netlink_lock_table() This patch adds IRQ safety to netlink_set_err() and __netlink_diag_dump() which were not covered by cited commit. [1] WARNING: possible irq lock inversion dependency detected 6.4.0-rc6-syzkaller-00240-g4e9f0ec38852 #0 Not tainted syz-executor.2/23011 just changed the state of lock: ffffffff8e1a7a58 (nl_table_lock){.+.?}-{2:2}, at: netlink_set_err+0x2e/0x3a0 net/netlink/af_netlink.c:1612 but this lock was taken by another, SOFTIRQ-safe lock in the past: (&local->queue_stop_reason_lock){..-.}-{2:2} and interrupts could create inverse lock ordering between them. other info that might help us debug this: Possible interrupt unsafe locking scenario: CPU0 CPU1 ---- ---- lock(nl_table_lock); local_irq_disable(); lock(&local->queue_stop_reason_lock); lock(nl_table_lock); lock(&local->queue_stop_reason_lock); *** DEADLOCK *** Fixes: 1d482e666b8e ("netlink: disable IRQs for netlink_lock_table()") Reported-by: syzbot+a7d200a347f912723e5c@syzkaller.appspotmail.com Link: https://syzkaller.appspot.com/bug?extid=a7d200a347f912723e5c Link: https://lore.kernel.org/netdev/000000000000e38d1605fea5747e@google.com/T/#u Signed-off-by: Eric Dumazet Cc: Johannes Berg Link: https://lore.kernel.org/r/20230621154337.1668594-1-edumazet@google.com Signed-off-by: Jakub Kicinski Signed-off-by: Sasha Levin Signed-off-by: Yongqiang Liu --- net/netlink/af_netlink.c | 5 +++-- net/netlink/diag.c | 5 +++-- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 4c4cff538c01..6b5be7a426ac 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -1602,6 +1602,7 @@ static int do_one_set_err(struct sock *sk, struct netlink_set_err_data *p) int netlink_set_err(struct sock *ssk, u32 portid, u32 group, int code) { struct netlink_set_err_data info; + unsigned long flags; struct sock *sk; int ret = 0; @@ -1611,12 +1612,12 @@ int netlink_set_err(struct sock *ssk, u32 portid, u32 group, int code) /* sk->sk_err wants a positive error value */ info.code = -code; - read_lock(&nl_table_lock); + read_lock_irqsave(&nl_table_lock, flags); sk_for_each_bound(sk, &nl_table[ssk->sk_protocol].mc_list) ret += do_one_set_err(sk, &info); - read_unlock(&nl_table_lock); + read_unlock_irqrestore(&nl_table_lock, flags); return ret; } EXPORT_SYMBOL(netlink_set_err); diff --git a/net/netlink/diag.c b/net/netlink/diag.c index 7dda33b9b784..83a0429805e9 100644 --- a/net/netlink/diag.c +++ b/net/netlink/diag.c @@ -93,6 +93,7 @@ static int __netlink_diag_dump(struct sk_buff *skb, struct netlink_callback *cb, struct net *net = sock_net(skb->sk); struct netlink_diag_req *req; struct netlink_sock *nlsk; + unsigned long flags; struct sock *sk; int num = 2; int ret = 0; @@ -151,7 +152,7 @@ static int __netlink_diag_dump(struct sk_buff *skb, struct netlink_callback *cb, num++; mc_list: - read_lock(&nl_table_lock); + read_lock_irqsave(&nl_table_lock, flags); sk_for_each_bound(sk, &tbl->mc_list) { if (sk_hashed(sk)) continue; @@ -172,7 +173,7 @@ static int __netlink_diag_dump(struct sk_buff *skb, struct netlink_callback *cb, } num++; } - read_unlock(&nl_table_lock); + read_unlock_irqrestore(&nl_table_lock, flags); done: cb->args[0] = num; -- Gitee From 0ca2fda64d6899beb0bb385c42f2bf00b69eac7a Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Thu, 7 Sep 2023 20:50:15 +0800 Subject: [PATCH 02/29] netlink: Add __sock_i_ino() for __netlink_diag_dump(). stable inclusion from stable-v4.19.291 commit 43a6b3b4c7563851a7bbf64db3ef5b26f5228f68 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I7ZELT CVE: NA -------------------------------- [ Upstream commit 25a9c8a4431c364f97f75558cb346d2ad3f53fbb ] syzbot reported a warning in __local_bh_enable_ip(). [0] Commit 8d61f926d420 ("netlink: fix potential deadlock in netlink_set_err()") converted read_lock(&nl_table_lock) to read_lock_irqsave() in __netlink_diag_dump() to prevent a deadlock. However, __netlink_diag_dump() calls sock_i_ino() that uses read_lock_bh() and read_unlock_bh(). If CONFIG_TRACE_IRQFLAGS=y, read_unlock_bh() finally enables IRQ even though it should stay disabled until the following read_unlock_irqrestore(). Using read_lock() in sock_i_ino() would trigger a lockdep splat in another place that was fixed in commit f064af1e500a ("net: fix a lockdep splat"), so let's add __sock_i_ino() that would be safe to use under BH disabled. [0]: WARNING: CPU: 0 PID: 5012 at kernel/softirq.c:376 __local_bh_enable_ip+0xbe/0x130 kernel/softirq.c:376 Modules linked in: CPU: 0 PID: 5012 Comm: syz-executor487 Not tainted 6.4.0-rc7-syzkaller-00202-g6f68fc395f49 #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 05/27/2023 RIP: 0010:__local_bh_enable_ip+0xbe/0x130 kernel/softirq.c:376 Code: 45 bf 01 00 00 00 e8 91 5b 0a 00 e8 3c 15 3d 00 fb 65 8b 05 ec e9 b5 7e 85 c0 74 58 5b 5d c3 65 8b 05 b2 b6 b4 7e 85 c0 75 a2 <0f> 0b eb 9e e8 89 15 3d 00 eb 9f 48 89 ef e8 6f 49 18 00 eb a8 0f RSP: 0018:ffffc90003a1f3d0 EFLAGS: 00010046 RAX: 0000000000000000 RBX: 0000000000000201 RCX: 1ffffffff1cf5996 RDX: 0000000000000000 RSI: 0000000000000201 RDI: ffffffff8805c6f3 RBP: ffffffff8805c6f3 R08: 0000000000000001 R09: ffff8880152b03a3 R10: ffffed1002a56074 R11: 0000000000000005 R12: 00000000000073e4 R13: dffffc0000000000 R14: 0000000000000002 R15: 0000000000000000 FS: 0000555556726300(0000) GS:ffff8880b9800000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 000000000045ad50 CR3: 000000007c646000 CR4: 00000000003506f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: sock_i_ino+0x83/0xa0 net/core/sock.c:2559 __netlink_diag_dump+0x45c/0x790 net/netlink/diag.c:171 netlink_diag_dump+0xd6/0x230 net/netlink/diag.c:207 netlink_dump+0x570/0xc50 net/netlink/af_netlink.c:2269 __netlink_dump_start+0x64b/0x910 net/netlink/af_netlink.c:2374 netlink_dump_start include/linux/netlink.h:329 [inline] netlink_diag_handler_dump+0x1ae/0x250 net/netlink/diag.c:238 __sock_diag_cmd net/core/sock_diag.c:238 [inline] sock_diag_rcv_msg+0x31e/0x440 net/core/sock_diag.c:269 netlink_rcv_skb+0x165/0x440 net/netlink/af_netlink.c:2547 sock_diag_rcv+0x2a/0x40 net/core/sock_diag.c:280 netlink_unicast_kernel net/netlink/af_netlink.c:1339 [inline] netlink_unicast+0x547/0x7f0 net/netlink/af_netlink.c:1365 netlink_sendmsg+0x925/0xe30 net/netlink/af_netlink.c:1914 sock_sendmsg_nosec net/socket.c:724 [inline] sock_sendmsg+0xde/0x190 net/socket.c:747 ____sys_sendmsg+0x71c/0x900 net/socket.c:2503 ___sys_sendmsg+0x110/0x1b0 net/socket.c:2557 __sys_sendmsg+0xf7/0x1c0 net/socket.c:2586 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x39/0xb0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x63/0xcd RIP: 0033:0x7f5303aaabb9 Code: 28 c3 e8 2a 14 00 00 66 2e 0f 1f 84 00 00 00 00 00 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 c7 c1 c0 ff ff ff f7 d8 64 89 01 48 RSP: 002b:00007ffc7506e548 EFLAGS: 00000246 ORIG_RAX: 000000000000002e RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007f5303aaabb9 RDX: 0000000000000000 RSI: 0000000020000180 RDI: 0000000000000003 RBP: 00007f5303a6ed60 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000246 R12: 00007f5303a6edf0 R13: 0000000000000000 R14: 0000000000000000 R15: 0000000000000000 Fixes: 8d61f926d420 ("netlink: fix potential deadlock in netlink_set_err()") Reported-by: syzbot+5da61cf6a9bc1902d422@syzkaller.appspotmail.com Link: https://syzkaller.appspot.com/bug?extid=5da61cf6a9bc1902d422 Suggested-by: Eric Dumazet Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Link: https://lore.kernel.org/r/20230626164313.52528-1-kuniyu@amazon.com Signed-off-by: Jakub Kicinski Signed-off-by: Sasha Levin Signed-off-by: Yongqiang Liu --- include/net/sock.h | 1 + net/core/sock.c | 17 ++++++++++++++--- net/netlink/diag.c | 2 +- 3 files changed, 16 insertions(+), 4 deletions(-) diff --git a/include/net/sock.h b/include/net/sock.h index 5c544d1c1cb5..439560049a93 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1887,6 +1887,7 @@ static inline void sock_graft(struct sock *sk, struct socket *parent) } kuid_t sock_i_uid(struct sock *sk); +unsigned long __sock_i_ino(struct sock *sk); unsigned long sock_i_ino(struct sock *sk); static inline kuid_t sock_net_uid(const struct net *net, const struct sock *sk) diff --git a/net/core/sock.c b/net/core/sock.c index b18c54ba80b5..b36f16b020fd 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1961,13 +1961,24 @@ kuid_t sock_i_uid(struct sock *sk) } EXPORT_SYMBOL(sock_i_uid); -unsigned long sock_i_ino(struct sock *sk) +unsigned long __sock_i_ino(struct sock *sk) { unsigned long ino; - read_lock_bh(&sk->sk_callback_lock); + read_lock(&sk->sk_callback_lock); ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0; - read_unlock_bh(&sk->sk_callback_lock); + read_unlock(&sk->sk_callback_lock); + return ino; +} +EXPORT_SYMBOL(__sock_i_ino); + +unsigned long sock_i_ino(struct sock *sk) +{ + unsigned long ino; + + local_bh_disable(); + ino = __sock_i_ino(sk); + local_bh_enable(); return ino; } EXPORT_SYMBOL(sock_i_ino); diff --git a/net/netlink/diag.c b/net/netlink/diag.c index 83a0429805e9..85ee4891c2c7 100644 --- a/net/netlink/diag.c +++ b/net/netlink/diag.c @@ -167,7 +167,7 @@ static int __netlink_diag_dump(struct sk_buff *skb, struct netlink_callback *cb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI, - sock_i_ino(sk)) < 0) { + __sock_i_ino(sk)) < 0) { ret = 1; break; } -- Gitee From b8bd31a494dbe6669e8c1857e472a33445aef285 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 7 Sep 2023 16:14:39 +0800 Subject: [PATCH 03/29] netlink: do not hard code device address lenth in fdb dumps stable inclusion from stable-v4.19.291 commit c3ad49ff5c030cbe719fc4cb0ae081b8255ef4b3 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I7ZELT CVE: NA -------------------------------- [ Upstream commit aa5406950726e336c5c9585b09799a734b6e77bf ] syzbot reports that some netdev devices do not have a six bytes address [1] Replace ETH_ALEN by dev->addr_len. [1] (Case of a device where dev->addr_len = 4) BUG: KMSAN: kernel-infoleak in instrument_copy_to_user include/linux/instrumented.h:114 [inline] BUG: KMSAN: kernel-infoleak in copyout+0xb8/0x100 lib/iov_iter.c:169 instrument_copy_to_user include/linux/instrumented.h:114 [inline] copyout+0xb8/0x100 lib/iov_iter.c:169 _copy_to_iter+0x6d8/0x1d00 lib/iov_iter.c:536 copy_to_iter include/linux/uio.h:206 [inline] simple_copy_to_iter+0x68/0xa0 net/core/datagram.c:513 __skb_datagram_iter+0x123/0xdc0 net/core/datagram.c:419 skb_copy_datagram_iter+0x5c/0x200 net/core/datagram.c:527 skb_copy_datagram_msg include/linux/skbuff.h:3960 [inline] netlink_recvmsg+0x4ae/0x15a0 net/netlink/af_netlink.c:1970 sock_recvmsg_nosec net/socket.c:1019 [inline] sock_recvmsg net/socket.c:1040 [inline] ____sys_recvmsg+0x283/0x7f0 net/socket.c:2722 ___sys_recvmsg+0x223/0x840 net/socket.c:2764 do_recvmmsg+0x4f9/0xfd0 net/socket.c:2858 __sys_recvmmsg net/socket.c:2937 [inline] __do_sys_recvmmsg net/socket.c:2960 [inline] __se_sys_recvmmsg net/socket.c:2953 [inline] __x64_sys_recvmmsg+0x397/0x490 net/socket.c:2953 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x41/0xc0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x63/0xcd Uninit was stored to memory at: __nla_put lib/nlattr.c:1009 [inline] nla_put+0x1c6/0x230 lib/nlattr.c:1067 nlmsg_populate_fdb_fill+0x2b8/0x600 net/core/rtnetlink.c:4071 nlmsg_populate_fdb net/core/rtnetlink.c:4418 [inline] ndo_dflt_fdb_dump+0x616/0x840 net/core/rtnetlink.c:4456 rtnl_fdb_dump+0x14ff/0x1fc0 net/core/rtnetlink.c:4629 netlink_dump+0x9d1/0x1310 net/netlink/af_netlink.c:2268 netlink_recvmsg+0xc5c/0x15a0 net/netlink/af_netlink.c:1995 sock_recvmsg_nosec+0x7a/0x120 net/socket.c:1019 ____sys_recvmsg+0x664/0x7f0 net/socket.c:2720 ___sys_recvmsg+0x223/0x840 net/socket.c:2764 do_recvmmsg+0x4f9/0xfd0 net/socket.c:2858 __sys_recvmmsg net/socket.c:2937 [inline] __do_sys_recvmmsg net/socket.c:2960 [inline] __se_sys_recvmmsg net/socket.c:2953 [inline] __x64_sys_recvmmsg+0x397/0x490 net/socket.c:2953 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x41/0xc0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x63/0xcd Uninit was created at: slab_post_alloc_hook+0x12d/0xb60 mm/slab.h:716 slab_alloc_node mm/slub.c:3451 [inline] __kmem_cache_alloc_node+0x4ff/0x8b0 mm/slub.c:3490 kmalloc_trace+0x51/0x200 mm/slab_common.c:1057 kmalloc include/linux/slab.h:559 [inline] __hw_addr_create net/core/dev_addr_lists.c:60 [inline] __hw_addr_add_ex+0x2e5/0x9e0 net/core/dev_addr_lists.c:118 __dev_mc_add net/core/dev_addr_lists.c:867 [inline] dev_mc_add+0x9a/0x130 net/core/dev_addr_lists.c:885 igmp6_group_added+0x267/0xbc0 net/ipv6/mcast.c:680 ipv6_mc_up+0x296/0x3b0 net/ipv6/mcast.c:2754 ipv6_mc_remap+0x1e/0x30 net/ipv6/mcast.c:2708 addrconf_type_change net/ipv6/addrconf.c:3731 [inline] addrconf_notify+0x4d3/0x1d90 net/ipv6/addrconf.c:3699 notifier_call_chain kernel/notifier.c:93 [inline] raw_notifier_call_chain+0xe4/0x430 kernel/notifier.c:461 call_netdevice_notifiers_info net/core/dev.c:1935 [inline] call_netdevice_notifiers_extack net/core/dev.c:1973 [inline] call_netdevice_notifiers+0x1ee/0x2d0 net/core/dev.c:1987 bond_enslave+0xccd/0x53f0 drivers/net/bonding/bond_main.c:1906 do_set_master net/core/rtnetlink.c:2626 [inline] rtnl_newlink_create net/core/rtnetlink.c:3460 [inline] __rtnl_newlink net/core/rtnetlink.c:3660 [inline] rtnl_newlink+0x378c/0x40e0 net/core/rtnetlink.c:3673 rtnetlink_rcv_msg+0x16a6/0x1840 net/core/rtnetlink.c:6395 netlink_rcv_skb+0x371/0x650 net/netlink/af_netlink.c:2546 rtnetlink_rcv+0x34/0x40 net/core/rtnetlink.c:6413 netlink_unicast_kernel net/netlink/af_netlink.c:1339 [inline] netlink_unicast+0xf28/0x1230 net/netlink/af_netlink.c:1365 netlink_sendmsg+0x122f/0x13d0 net/netlink/af_netlink.c:1913 sock_sendmsg_nosec net/socket.c:724 [inline] sock_sendmsg net/socket.c:747 [inline] ____sys_sendmsg+0x999/0xd50 net/socket.c:2503 ___sys_sendmsg+0x28d/0x3c0 net/socket.c:2557 __sys_sendmsg net/socket.c:2586 [inline] __do_sys_sendmsg net/socket.c:2595 [inline] __se_sys_sendmsg net/socket.c:2593 [inline] __x64_sys_sendmsg+0x304/0x490 net/socket.c:2593 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x41/0xc0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x63/0xcd Bytes 2856-2857 of 3500 are uninitialized Memory access of size 3500 starts at ffff888018d99104 Data copied to user address 0000000020000480 Fixes: d83b06036048 ("net: add fdb generic dump routine") Reported-by: syzbot Signed-off-by: Eric Dumazet Reviewed-by: Jiri Pirko Link: https://lore.kernel.org/r/20230621174720.1845040-1-edumazet@google.com Signed-off-by: Jakub Kicinski Signed-off-by: Sasha Levin Signed-off-by: Yongqiang Liu --- net/core/rtnetlink.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 2837cc03f69e..79f62517e24a 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -3436,7 +3436,7 @@ static int nlmsg_populate_fdb_fill(struct sk_buff *skb, ndm->ndm_ifindex = dev->ifindex; ndm->ndm_state = ndm_state; - if (nla_put(skb, NDA_LLADDR, ETH_ALEN, addr)) + if (nla_put(skb, NDA_LLADDR, dev->addr_len, addr)) goto nla_put_failure; if (vid) if (nla_put(skb, NDA_VLAN, sizeof(u16), &vid)) @@ -3450,10 +3450,10 @@ static int nlmsg_populate_fdb_fill(struct sk_buff *skb, return -EMSGSIZE; } -static inline size_t rtnl_fdb_nlmsg_size(void) +static inline size_t rtnl_fdb_nlmsg_size(const struct net_device *dev) { return NLMSG_ALIGN(sizeof(struct ndmsg)) + - nla_total_size(ETH_ALEN) + /* NDA_LLADDR */ + nla_total_size(dev->addr_len) + /* NDA_LLADDR */ nla_total_size(sizeof(u16)) + /* NDA_VLAN */ 0; } @@ -3465,7 +3465,7 @@ static void rtnl_fdb_notify(struct net_device *dev, u8 *addr, u16 vid, int type, struct sk_buff *skb; int err = -ENOBUFS; - skb = nlmsg_new(rtnl_fdb_nlmsg_size(), GFP_ATOMIC); + skb = nlmsg_new(rtnl_fdb_nlmsg_size(dev), GFP_ATOMIC); if (!skb) goto errout; -- Gitee From f6af13da40f7c71ecf72ee5e04e8f7ef9fc827eb Mon Sep 17 00:00:00 2001 From: Cambda Zhu Date: Thu, 7 Sep 2023 16:14:40 +0800 Subject: [PATCH 04/29] ipvlan: Fix return value of ipvlan_queue_xmit() stable inclusion from stable-v4.19.291 commit ddb11554971aca039c98a36cb0c2cb865f41ae97 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I7ZELT CVE: NA -------------------------------- [ Upstream commit 8a9922e7be6d042fa00f894c376473b17a162b66 ] ipvlan_queue_xmit() should return NET_XMIT_XXX, but ipvlan_xmit_mode_l2/l3() returns rx_handler_result_t or NET_RX_XXX in some cases. ipvlan_rcv_frame() will only return RX_HANDLER_CONSUMED in ipvlan_xmit_mode_l2/l3() because 'local' is true. It's equal to NET_XMIT_SUCCESS. But dev_forward_skb() can return NET_RX_SUCCESS or NET_RX_DROP, and returning NET_RX_DROP(NET_XMIT_DROP) will increase both ipvlan and ipvlan->phy_dev drops counter. The skb to forward can be treated as xmitted successfully. This patch makes ipvlan_queue_xmit() return NET_XMIT_SUCCESS for forward skb. Fixes: 2ad7bf363841 ("ipvlan: Initial check-in of the IPVLAN driver.") Signed-off-by: Cambda Zhu Link: https://lore.kernel.org/r/20230626093347.7492-1-cambda@linux.alibaba.com Signed-off-by: Paolo Abeni Signed-off-by: Sasha Levin Signed-off-by: Yongqiang Liu --- drivers/net/ipvlan/ipvlan_core.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/drivers/net/ipvlan/ipvlan_core.c b/drivers/net/ipvlan/ipvlan_core.c index 32dca624356f..9bb096e222d3 100644 --- a/drivers/net/ipvlan/ipvlan_core.c +++ b/drivers/net/ipvlan/ipvlan_core.c @@ -708,7 +708,8 @@ static int ipvlan_xmit_mode_l3(struct sk_buff *skb, struct net_device *dev) consume_skb(skb); return NET_XMIT_DROP; } - return ipvlan_rcv_frame(addr, &skb, true); + ipvlan_rcv_frame(addr, &skb, true); + return NET_XMIT_SUCCESS; } } out: @@ -734,7 +735,8 @@ static int ipvlan_xmit_mode_l2(struct sk_buff *skb, struct net_device *dev) consume_skb(skb); return NET_XMIT_DROP; } - return ipvlan_rcv_frame(addr, &skb, true); + ipvlan_rcv_frame(addr, &skb, true); + return NET_XMIT_SUCCESS; } } skb = skb_share_check(skb, GFP_ATOMIC); @@ -746,7 +748,8 @@ static int ipvlan_xmit_mode_l2(struct sk_buff *skb, struct net_device *dev) * the skb for the main-dev. At the RX side we just return * RX_PASS for it to be processed further on the stack. */ - return dev_forward_skb(ipvlan->phy_dev, skb); + dev_forward_skb(ipvlan->phy_dev, skb); + return NET_XMIT_SUCCESS; } else if (is_multicast_ether_addr(eth->h_dest)) { skb_reset_mac_header(skb); -- Gitee From c8adf4c0277b582bf3abf52892e66bf9005aa338 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Thu, 7 Sep 2023 16:14:41 +0800 Subject: [PATCH 05/29] net: bridge: keep ports without IFF_UNICAST_FLT in BR_PROMISC mode stable inclusion from stable-v4.19.291 commit 8fa6db24bdc97206bc59fba0b943d1319005d21d category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I7ZELT CVE: NA -------------------------------- [ Upstream commit 6ca3c005d0604e8d2b439366e3923ea58db99641 ] According to the synchronization rules for .ndo_get_stats() as seen in Documentation/networking/netdevices.rst, acquiring a plain spin_lock() should not be illegal, but the bridge driver implementation makes it so. After running these commands, I am being faced with the following lockdep splat: $ ip link add link swp0 name macsec0 type macsec encrypt on && ip link set swp0 up $ ip link add dev br0 type bridge vlan_filtering 1 && ip link set br0 up $ ip link set macsec0 master br0 && ip link set macsec0 up ======================================================== WARNING: possible irq lock inversion dependency detected 6.4.0-04295-g31b577b4bd4a #603 Not tainted -------------------------------------------------------- swapper/1/0 just changed the state of lock: ffff6bd348724cd8 (&br->lock){+.-.}-{3:3}, at: br_forward_delay_timer_expired+0x34/0x198 but this lock took another, SOFTIRQ-unsafe lock in the past: (&ocelot->stats_lock){+.+.}-{3:3} and interrupts could create inverse lock ordering between them. other info that might help us debug this: Chain exists of: &br->lock --> &br->hash_lock --> &ocelot->stats_lock Possible interrupt unsafe locking scenario: CPU0 CPU1 ---- ---- lock(&ocelot->stats_lock); local_irq_disable(); lock(&br->lock); lock(&br->hash_lock); lock(&br->lock); *** DEADLOCK *** (details about the 3 locks skipped) swp0 is instantiated by drivers/net/dsa/ocelot/felix.c, and this only matters to the extent that its .ndo_get_stats64() method calls spin_lock(&ocelot->stats_lock). Documentation/locking/lockdep-design.rst says: | A lock is irq-safe means it was ever used in an irq context, while a lock | is irq-unsafe means it was ever acquired with irq enabled. (...) | Furthermore, the following usage based lock dependencies are not allowed | between any two lock-classes:: | | -> | -> Lockdep marks br->hash_lock as softirq-safe, because it is sometimes taken in softirq context (for example br_fdb_update() which runs in NET_RX softirq), and when it's not in softirq context it blocks softirqs by using spin_lock_bh(). Lockdep marks ocelot->stats_lock as softirq-unsafe, because it never blocks softirqs from running, and it is never taken from softirq context. So it can always be interrupted by softirqs. There is a call path through which a function that holds br->hash_lock: fdb_add_hw_addr() will call a function that acquires ocelot->stats_lock: ocelot_port_get_stats64(). This can be seen below: ocelot_port_get_stats64+0x3c/0x1e0 felix_get_stats64+0x20/0x38 dsa_slave_get_stats64+0x3c/0x60 dev_get_stats+0x74/0x2c8 rtnl_fill_stats+0x4c/0x150 rtnl_fill_ifinfo+0x5cc/0x7b8 rtmsg_ifinfo_build_skb+0xe4/0x150 rtmsg_ifinfo+0x5c/0xb0 __dev_notify_flags+0x58/0x200 __dev_set_promiscuity+0xa0/0x1f8 dev_set_promiscuity+0x30/0x70 macsec_dev_change_rx_flags+0x68/0x88 __dev_set_promiscuity+0x1a8/0x1f8 __dev_set_rx_mode+0x74/0xa8 dev_uc_add+0x74/0xa0 fdb_add_hw_addr+0x68/0xd8 fdb_add_local+0xc4/0x110 br_fdb_add_local+0x54/0x88 br_add_if+0x338/0x4a0 br_add_slave+0x20/0x38 do_setlink+0x3a4/0xcb8 rtnl_newlink+0x758/0x9d0 rtnetlink_rcv_msg+0x2f0/0x550 netlink_rcv_skb+0x128/0x148 rtnetlink_rcv+0x24/0x38 the plain English explanation for it is: The macsec0 bridge port is created without p->flags & BR_PROMISC, because it is what br_manage_promisc() decides for a VLAN filtering bridge with a single auto port. As part of the br_add_if() procedure, br_fdb_add_local() is called for the MAC address of the device, and this results in a call to dev_uc_add() for macsec0 while the softirq-safe br->hash_lock is taken. Because macsec0 does not have IFF_UNICAST_FLT, dev_uc_add() ends up calling __dev_set_promiscuity() for macsec0, which is propagated by its implementation, macsec_dev_change_rx_flags(), to the lower device: swp0. This triggers the call path: dev_set_promiscuity(swp0) -> rtmsg_ifinfo() -> dev_get_stats() -> ocelot_port_get_stats64() with a calling context that lockdep doesn't like (br->hash_lock held). Normally we don't see this, because even though many drivers that can be bridge ports don't support IFF_UNICAST_FLT, we need a driver that (a) doesn't support IFF_UNICAST_FLT, *and* (b) it forwards the IFF_PROMISC flag to another driver, and (c) *that* driver implements ndo_get_stats64() using a softirq-unsafe spinlock. Condition (b) is necessary because the first __dev_set_rx_mode() calls __dev_set_promiscuity() with "bool notify=false", and thus, the rtmsg_ifinfo() code path won't be entered. The same criteria also hold true for DSA switches which don't report IFF_UNICAST_FLT. When the DSA master uses a spin_lock() in its ndo_get_stats64() method, the same lockdep splat can be seen. I think the deadlock possibility is real, even though I didn't reproduce it, and I'm thinking of the following situation to support that claim: fdb_add_hw_addr() runs on a CPU A, in a context with softirqs locally disabled and br->hash_lock held, and may end up attempting to acquire ocelot->stats_lock. In parallel, ocelot->stats_lock is currently held by a thread B (say, ocelot_check_stats_work()), which is interrupted while holding it by a softirq which attempts to lock br->hash_lock. Thread B cannot make progress because br->hash_lock is held by A. Whereas thread A cannot make progress because ocelot->stats_lock is held by B. When taking the issue at face value, the bridge can avoid that problem by simply making the ports promiscuous from a code path with a saner calling context (br->hash_lock not held). A bridge port without IFF_UNICAST_FLT is going to become promiscuous as soon as we call dev_uc_add() on it (which we do unconditionally), so why not be preemptive and make it promiscuous right from the beginning, so as to not be taken by surprise. With this, we've broken the links between code that holds br->hash_lock or br->lock and code that calls into the ndo_change_rx_flags() or ndo_get_stats64() ops of the bridge port. Fixes: 2796d0c648c9 ("bridge: Automatically manage port promiscuous mode.") Signed-off-by: Vladimir Oltean Reviewed-by: Ido Schimmel Signed-off-by: David S. Miller Signed-off-by: Sasha Levin Signed-off-by: Yongqiang Liu --- net/bridge/br_if.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c index b5fb2b682e19..ab539551b7d3 100644 --- a/net/bridge/br_if.c +++ b/net/bridge/br_if.c @@ -161,8 +161,9 @@ void br_manage_promisc(struct net_bridge *br) * This lets us disable promiscuous mode and write * this config to hw. */ - if (br->auto_cnt == 0 || - (br->auto_cnt == 1 && br_auto_port(p))) + if ((p->dev->priv_flags & IFF_UNICAST_FLT) && + (br->auto_cnt == 0 || + (br->auto_cnt == 1 && br_auto_port(p)))) br_port_clear_promisc(p); else br_port_set_promisc(p); -- Gitee From 398cecdc74c02415488c1f37058e449087947222 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 7 Sep 2023 16:14:42 +0800 Subject: [PATCH 06/29] tcp: annotate data races in __tcp_oow_rate_limited() stable inclusion from stable-v4.19.291 commit 8ec0a308091e5411812564f4679c620b5f515fb5 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I7ZELT CVE: NA -------------------------------- [ Upstream commit 998127cdb4699b9d470a9348ffe9f1154346be5f ] request sockets are lockless, __tcp_oow_rate_limited() could be called on the same object from different cpus. This is harmless. Add READ_ONCE()/WRITE_ONCE() annotations to avoid a KCSAN report. Fixes: 4ce7e93cb3fe ("tcp: rate limit ACK sent by SYN_RECV request sockets") Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller Signed-off-by: Sasha Levin Signed-off-by: Yongqiang Liu --- net/ipv4/tcp_input.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 6272d5a44338..eb4a9ee09dbc 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -3432,8 +3432,11 @@ static int tcp_ack_update_window(struct sock *sk, const struct sk_buff *skb, u32 static bool __tcp_oow_rate_limited(struct net *net, int mib_idx, u32 *last_oow_ack_time) { - if (*last_oow_ack_time) { - s32 elapsed = (s32)(tcp_jiffies32 - *last_oow_ack_time); + /* Paired with the WRITE_ONCE() in this function. */ + u32 val = READ_ONCE(*last_oow_ack_time); + + if (val) { + s32 elapsed = (s32)(tcp_jiffies32 - val); if (0 <= elapsed && elapsed < READ_ONCE(net->ipv4.sysctl_tcp_invalid_ratelimit)) { @@ -3442,7 +3445,10 @@ static bool __tcp_oow_rate_limited(struct net *net, int mib_idx, } } - *last_oow_ack_time = tcp_jiffies32; + /* Paired with the prior READ_ONCE() and with itself, + * as we might be lockless. + */ + WRITE_ONCE(*last_oow_ack_time, tcp_jiffies32); return false; /* not rate-limited: go ahead, send dupack now! */ } -- Gitee From df8f351da04ba65ea8c75cd322fabfeb30a9e052 Mon Sep 17 00:00:00 2001 From: Florent Revest Date: Thu, 7 Sep 2023 16:14:43 +0800 Subject: [PATCH 07/29] netfilter: conntrack: Avoid nf_ct_helper_hash uses after free stable inclusion from stable-v4.19.291 commit 00716f25f9697d02a0d9bd622575c7c7321ba3d0 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I7ZELT CVE: NA -------------------------------- commit 6eef7a2b933885a17679eb8ed0796ddf0ee5309b upstream. If nf_conntrack_init_start() fails (for example due to a register_nf_conntrack_bpf() failure), the nf_conntrack_helper_fini() clean-up path frees the nf_ct_helper_hash map. When built with NF_CONNTRACK=y, further netfilter modules (e.g: netfilter_conntrack_ftp) can still be loaded and call nf_conntrack_helpers_register(), independently of whether nf_conntrack initialized correctly. This accesses the nf_ct_helper_hash dangling pointer and causes a uaf, possibly leading to random memory corruption. This patch guards nf_conntrack_helper_register() from accessing a freed or uninitialized nf_ct_helper_hash pointer and fixes possible uses-after-free when loading a conntrack module. Cc: stable@vger.kernel.org Fixes: 12f7a505331e ("netfilter: add user-space connection tracking helper infrastructure") Signed-off-by: Florent Revest Reviewed-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso Signed-off-by: Greg Kroah-Hartman Signed-off-by: Yongqiang Liu --- net/netfilter/nf_conntrack_helper.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/net/netfilter/nf_conntrack_helper.c b/net/netfilter/nf_conntrack_helper.c index e24b762ffa1d..06c70d4584cf 100644 --- a/net/netfilter/nf_conntrack_helper.c +++ b/net/netfilter/nf_conntrack_helper.c @@ -400,6 +400,9 @@ int nf_conntrack_helper_register(struct nf_conntrack_helper *me) BUG_ON(me->expect_class_max >= NF_CT_MAX_EXPECT_CLASSES); BUG_ON(strlen(me->name) > NF_CT_HELPER_NAME_LEN - 1); + if (!nf_ct_helper_hash) + return -ENOENT; + if (me->expect_policy->max_expected > NF_CT_EXPECT_MAX_CNT) return -EINVAL; @@ -570,4 +573,5 @@ void nf_conntrack_helper_fini(void) { nf_ct_extend_unregister(&helper_extend); kvfree(nf_ct_helper_hash); + nf_ct_helper_hash = NULL; } -- Gitee From 5d933ad3d43b559604aa83afbf2d4fb0093e2f93 Mon Sep 17 00:00:00 2001 From: Stephen Suryaputra Date: Thu, 7 Sep 2023 16:14:44 +0800 Subject: [PATCH 08/29] vrf: Increment Icmp6InMsgs on the original netdev stable inclusion from stable-v4.19.291 commit a24963500a4ec921ce9c2597e0a584e44513afae category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I7ZELT CVE: NA -------------------------------- [ Upstream commit e1ae5c2ea4783b1fd87be250f9fcc9d9e1a6ba3f ] Get the ingress interface and increment ICMP counters based on that instead of skb->dev when the the dev is a VRF device. This is a follow up on the following message: https://www.spinics.net/lists/netdev/msg560268.html v2: Avoid changing skb->dev since it has unintended effect for local delivery (David Ahern). Signed-off-by: Stephen Suryaputra Reviewed-by: David Ahern Signed-off-by: David S. Miller Stable-dep-of: 2aaa8a15de73 ("icmp6: Fix null-ptr-deref of ip6_null_entry->rt6i_idev in icmp6_dev().") Signed-off-by: Sasha Levin Signed-off-by: Yongqiang Liu --- include/net/addrconf.h | 16 ++++++++++++++++ net/ipv6/icmp.c | 17 +++++++++++------ net/ipv6/reassembly.c | 4 ++-- 3 files changed, 29 insertions(+), 8 deletions(-) diff --git a/include/net/addrconf.h b/include/net/addrconf.h index db2a87981dd4..9583d3bbab03 100644 --- a/include/net/addrconf.h +++ b/include/net/addrconf.h @@ -340,6 +340,22 @@ static inline struct inet6_dev *__in6_dev_get(const struct net_device *dev) return rcu_dereference_rtnl(dev->ip6_ptr); } +/** + * __in6_dev_stats_get - get inet6_dev pointer for stats + * @dev: network device + * @skb: skb for original incoming interface if neeeded + * + * Caller must hold rcu_read_lock or RTNL, because this function + * does not take a reference on the inet6_dev. + */ +static inline struct inet6_dev *__in6_dev_stats_get(const struct net_device *dev, + const struct sk_buff *skb) +{ + if (netif_is_l3_master(dev)) + dev = dev_get_by_index_rcu(dev_net(dev), inet6_iif(skb)); + return __in6_dev_get(dev); +} + /** * __in6_dev_get_safely - get inet6_dev pointer from netdevice * @dev: network device diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c index fbc8746371b6..1b86a2e03d04 100644 --- a/net/ipv6/icmp.c +++ b/net/ipv6/icmp.c @@ -395,23 +395,28 @@ static struct dst_entry *icmpv6_route_lookup(struct net *net, return ERR_PTR(err); } -static int icmp6_iif(const struct sk_buff *skb) +static struct net_device *icmp6_dev(const struct sk_buff *skb) { - int iif = skb->dev->ifindex; + struct net_device *dev = skb->dev; /* for local traffic to local address, skb dev is the loopback * device. Check if there is a dst attached to the skb and if so * get the real device index. Same is needed for replies to a link * local address on a device enslaved to an L3 master device */ - if (unlikely(iif == LOOPBACK_IFINDEX || netif_is_l3_master(skb->dev))) { + if (unlikely(dev->ifindex == LOOPBACK_IFINDEX || netif_is_l3_master(skb->dev))) { const struct rt6_info *rt6 = skb_rt6_info(skb); if (rt6) - iif = rt6->rt6i_idev->dev->ifindex; + dev = rt6->rt6i_idev->dev; } - return iif; + return dev; +} + +static int icmp6_iif(const struct sk_buff *skb) +{ + return icmp6_dev(skb)->ifindex; } /* @@ -800,7 +805,7 @@ void icmpv6_notify(struct sk_buff *skb, u8 type, u8 code, __be32 info) static int icmpv6_rcv(struct sk_buff *skb) { struct net *net = dev_net(skb->dev); - struct net_device *dev = skb->dev; + struct net_device *dev = icmp6_dev(skb); struct inet6_dev *idev = __in6_dev_get(dev); const struct in6_addr *saddr, *daddr; struct icmp6hdr *hdr; diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c index 60dfd0d11851..b596727f0497 100644 --- a/net/ipv6/reassembly.c +++ b/net/ipv6/reassembly.c @@ -302,7 +302,7 @@ static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *skb, skb_network_header_len(skb)); rcu_read_lock(); - __IP6_INC_STATS(net, __in6_dev_get(dev), IPSTATS_MIB_REASMOKS); + __IP6_INC_STATS(net, __in6_dev_stats_get(dev, skb), IPSTATS_MIB_REASMOKS); rcu_read_unlock(); fq->q.fragments = NULL; fq->q.rb_fragments = RB_ROOT; @@ -317,7 +317,7 @@ static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *skb, net_dbg_ratelimited("ip6_frag_reasm: no memory for reassembly\n"); out_fail: rcu_read_lock(); - __IP6_INC_STATS(net, __in6_dev_get(dev), IPSTATS_MIB_REASMFAILS); + __IP6_INC_STATS(net, __in6_dev_stats_get(dev, skb), IPSTATS_MIB_REASMFAILS); rcu_read_unlock(); inet_frag_kill(&fq->q); return -1; -- Gitee From cb95fae220f11c6bad07f4a3631324e1742bde08 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Thu, 7 Sep 2023 16:14:45 +0800 Subject: [PATCH 09/29] icmp6: Fix null-ptr-deref of ip6_null_entry->rt6i_idev in icmp6_dev(). stable inclusion from stable-v4.19.291 commit 8803c59fde4dd370a627dfbf7183682fa0cabf70 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I7ZELT CVE: NA -------------------------------- [ Upstream commit 2aaa8a15de73874847d62eb595c6683bface80fd ] With some IPv6 Ext Hdr (RPL, SRv6, etc.), we can send a packet that has the link-local address as src and dst IP and will be forwarded to an external IP in the IPv6 Ext Hdr. For example, the script below generates a packet whose src IP is the link-local address and dst is updated to 11::. # for f in $(find /proc/sys/net/ -name *seg6_enabled*); do echo 1 > $f; done # python3 >>> from socket import * >>> from scapy.all import * >>> >>> SRC_ADDR = DST_ADDR = "fe80::5054:ff:fe12:3456" >>> >>> pkt = IPv6(src=SRC_ADDR, dst=DST_ADDR) >>> pkt /= IPv6ExtHdrSegmentRouting(type=4, addresses=["11::", "22::"], segleft=1) >>> >>> sk = socket(AF_INET6, SOCK_RAW, IPPROTO_RAW) >>> sk.sendto(bytes(pkt), (DST_ADDR, 0)) For such a packet, we call ip6_route_input() to look up a route for the next destination in these three functions depending on the header type. * ipv6_rthdr_rcv() * ipv6_rpl_srh_rcv() * ipv6_srh_rcv() If no route is found, ip6_null_entry is set to skb, and the following dst_input(skb) calls ip6_pkt_drop(). Finally, in icmp6_dev(), we dereference skb_rt6_info(skb)->rt6i_idev->dev as the input device is the loopback interface. Then, we have to check if skb_rt6_info(skb)->rt6i_idev is NULL or not to avoid NULL pointer deref for ip6_null_entry. BUG: kernel NULL pointer dereference, address: 0000000000000000 PF: supervisor read access in kernel mode PF: error_code(0x0000) - not-present page PGD 0 P4D 0 Oops: 0000 [#1] PREEMPT SMP PTI CPU: 0 PID: 157 Comm: python3 Not tainted 6.4.0-11996-gb121d614371c #35 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.16.0-0-gd239552ce722-prebuilt.qemu.org 04/01/2014 RIP: 0010:icmp6_send (net/ipv6/icmp.c:436 net/ipv6/icmp.c:503) Code: fe ff ff 48 c7 40 30 c0 86 5d 83 e8 c6 44 1c 00 e9 c8 fc ff ff 49 8b 46 58 48 83 e0 fe 0f 84 4a fb ff ff 48 8b 80 d0 00 00 00 <48> 8b 00 44 8b 88 e0 00 00 00 e9 34 fb ff ff 4d 85 ed 0f 85 69 01 RSP: 0018:ffffc90000003c70 EFLAGS: 00000286 RAX: 0000000000000000 RBX: 0000000000000001 RCX: 00000000000000e0 RDX: 0000000000000021 RSI: 0000000000000000 RDI: ffff888006d72a18 RBP: ffffc90000003d80 R08: 0000000000000000 R09: 0000000000000001 R10: ffffc90000003d98 R11: 0000000000000040 R12: ffff888006d72a10 R13: 0000000000000000 R14: ffff8880057fb800 R15: ffffffff835d86c0 FS: 00007f9dc72ee740(0000) GS:ffff88807dc00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000000 CR3: 00000000057b2000 CR4: 00000000007506f0 PKRU: 55555554 Call Trace: ip6_pkt_drop (net/ipv6/route.c:4513) ipv6_rthdr_rcv (net/ipv6/exthdrs.c:640 net/ipv6/exthdrs.c:686) ip6_protocol_deliver_rcu (net/ipv6/ip6_input.c:437 (discriminator 5)) ip6_input_finish (./include/linux/rcupdate.h:781 net/ipv6/ip6_input.c:483) __netif_receive_skb_one_core (net/core/dev.c:5455) process_backlog (./include/linux/rcupdate.h:781 net/core/dev.c:5895) __napi_poll (net/core/dev.c:6460) net_rx_action (net/core/dev.c:6529 net/core/dev.c:6660) __do_softirq (./arch/x86/include/asm/jump_label.h:27 ./include/linux/jump_label.h:207 ./include/trace/events/irq.h:142 kernel/softirq.c:554) do_softirq (kernel/softirq.c:454 kernel/softirq.c:441) __local_bh_enable_ip (kernel/softirq.c:381) __dev_queue_xmit (net/core/dev.c:4231) ip6_finish_output2 (./include/net/neighbour.h:544 net/ipv6/ip6_output.c:135) rawv6_sendmsg (./include/net/dst.h:458 ./include/linux/netfilter.h:303 net/ipv6/raw.c:656 net/ipv6/raw.c:914) sock_sendmsg (net/socket.c:725 net/socket.c:748) __sys_sendto (net/socket.c:2134) __x64_sys_sendto (net/socket.c:2146 net/socket.c:2142 net/socket.c:2142) do_syscall_64 (arch/x86/entry/common.c:50 arch/x86/entry/common.c:80) entry_SYSCALL_64_after_hwframe (arch/x86/entry/entry_64.S:120) RIP: 0033:0x7f9dc751baea Code: d8 64 89 02 48 c7 c0 ff ff ff ff eb b8 0f 1f 00 f3 0f 1e fa 41 89 ca 64 8b 04 25 18 00 00 00 85 c0 75 15 b8 2c 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 7e c3 0f 1f 44 00 00 41 54 48 83 ec 30 44 89 RSP: 002b:00007ffe98712c38 EFLAGS: 00000246 ORIG_RAX: 000000000000002c RAX: ffffffffffffffda RBX: 00007ffe98712cf8 RCX: 00007f9dc751baea RDX: 0000000000000060 RSI: 00007f9dc6460b90 RDI: 0000000000000003 RBP: 00007f9dc56e8be0 R08: 00007ffe98712d70 R09: 000000000000001c R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000000 R13: ffffffffc4653600 R14: 0000000000000001 R15: 00007f9dc6af5d1b Modules linked in: CR2: 0000000000000000 ---[ end trace 0000000000000000 ]--- RIP: 0010:icmp6_send (net/ipv6/icmp.c:436 net/ipv6/icmp.c:503) Code: fe ff ff 48 c7 40 30 c0 86 5d 83 e8 c6 44 1c 00 e9 c8 fc ff ff 49 8b 46 58 48 83 e0 fe 0f 84 4a fb ff ff 48 8b 80 d0 00 00 00 <48> 8b 00 44 8b 88 e0 00 00 00 e9 34 fb ff ff 4d 85 ed 0f 85 69 01 RSP: 0018:ffffc90000003c70 EFLAGS: 00000286 RAX: 0000000000000000 RBX: 0000000000000001 RCX: 00000000000000e0 RDX: 0000000000000021 RSI: 0000000000000000 RDI: ffff888006d72a18 RBP: ffffc90000003d80 R08: 0000000000000000 R09: 0000000000000001 R10: ffffc90000003d98 R11: 0000000000000040 R12: ffff888006d72a10 R13: 0000000000000000 R14: ffff8880057fb800 R15: ffffffff835d86c0 FS: 00007f9dc72ee740(0000) GS:ffff88807dc00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000000 CR3: 00000000057b2000 CR4: 00000000007506f0 PKRU: 55555554 Kernel panic - not syncing: Fatal exception in interrupt Kernel Offset: disabled Fixes: 4832c30d5458 ("net: ipv6: put host and anycast routes on device with address") Reported-by: Wang Yufen Closes: https://lore.kernel.org/netdev/c41403a9-c2f6-3b7e-0c96-e1901e605cd0@huawei.com/ Signed-off-by: Kuniyuki Iwashima Reviewed-by: David Ahern Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller Signed-off-by: Sasha Levin Signed-off-by: Yongqiang Liu --- net/ipv6/icmp.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c index 1b86a2e03d04..bfafd7649ccb 100644 --- a/net/ipv6/icmp.c +++ b/net/ipv6/icmp.c @@ -407,7 +407,10 @@ static struct net_device *icmp6_dev(const struct sk_buff *skb) if (unlikely(dev->ifindex == LOOPBACK_IFINDEX || netif_is_l3_master(skb->dev))) { const struct rt6_info *rt6 = skb_rt6_info(skb); - if (rt6) + /* The destination could be an external IP in Ext Hdr (SRv6, RPL, etc.), + * and ip6_null_entry could be set to skb if no route is found. + */ + if (rt6 && rt6->rt6i_idev) dev = rt6->rt6i_idev->dev; } -- Gitee From 1514d6f723f0e3d62abc94305e027c06f6c10cbd Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 7 Sep 2023 16:14:46 +0800 Subject: [PATCH 10/29] udp6: fix udp6_ehashfn() typo stable inclusion from stable-v4.19.291 commit 22618980d6a67863b9bfeec0b679a3bdae3da604 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I7ZELT CVE: NA -------------------------------- [ Upstream commit 51d03e2f2203e76ed02d33fb5ffbb5fc85ffaf54 ] Amit Klein reported that udp6_ehash_secret was initialized but never used. Fixes: 1bbdceef1e53 ("inet: convert inet_ehash_secret and ipv6_hash_secret to net_get_random_once") Reported-by: Amit Klein Signed-off-by: Eric Dumazet Cc: Willy Tarreau Cc: Willem de Bruijn Cc: David Ahern Cc: Hannes Frederic Sowa Signed-off-by: David S. Miller Signed-off-by: Sasha Levin Signed-off-by: Yongqiang Liu --- net/ipv6/udp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 4d8a61eac755..a6e332200df2 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -99,7 +99,7 @@ static u32 udp6_ehashfn(const struct net *net, fhash = __ipv6_addr_jhash(faddr, udp_ipv6_hash_secret); return __inet6_ehashfn(lhash, lport, fhash, fport, - udp_ipv6_hash_secret + net_hash_mix(net)); + udp6_ehash_secret + net_hash_mix(net)); } int udp_v6_get_port(struct sock *sk, unsigned short snum) -- Gitee From 141e1576530615a0e6847fc7052b0b156ee778ef Mon Sep 17 00:00:00 2001 From: Pedro Tammela Date: Thu, 7 Sep 2023 16:14:47 +0800 Subject: [PATCH 11/29] net/sched: make psched_mtu() RTNL-less safe stable inclusion from stable-v4.19.291 commit fbbf4dfe5d6a0fcb7a9a0aa2371e083767b060d5 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I7ZELT CVE: NA -------------------------------- [ Upstream commit 150e33e62c1fa4af5aaab02776b6c3812711d478 ] Eric Dumazet says[1]: ------- Speaking of psched_mtu(), I see that net/sched/sch_pie.c is using it without holding RTNL, so dev->mtu can be changed underneath. KCSAN could issue a warning. ------- Annotate dev->mtu with READ_ONCE() so KCSAN don't issue a warning. [1] https://lore.kernel.org/all/CANn89iJoJO5VtaJ-2=_d2aOQhb0Xw8iBT_Cxqp2HyuS-zj6azw@mail.gmail.com/ v1 -> v2: Fix commit message Fixes: d4b36210c2e6 ("net: pkt_sched: PIE AQM scheme") Suggested-by: Eric Dumazet Signed-off-by: Pedro Tammela Reviewed-by: Simon Horman Link: https://lore.kernel.org/r/20230711021634.561598-1-pctammela@mojatatu.com Signed-off-by: Jakub Kicinski Signed-off-by: Sasha Levin Signed-off-by: Yongqiang Liu --- include/net/pkt_sched.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/net/pkt_sched.h b/include/net/pkt_sched.h index 1a6ac924266d..7674cd6899b6 100644 --- a/include/net/pkt_sched.h +++ b/include/net/pkt_sched.h @@ -129,7 +129,7 @@ static inline void qdisc_run(struct Qdisc *q) */ static inline unsigned int psched_mtu(const struct net_device *dev) { - return dev->mtu + dev->hard_header_len; + return READ_ONCE(dev->mtu) + dev->hard_header_len; } static inline struct net *qdisc_net(struct Qdisc *q) -- Gitee From a1e5d841869e7ee025abb9edb483826238d56f75 Mon Sep 17 00:00:00 2001 From: Ding Hui Date: Thu, 7 Sep 2023 16:14:48 +0800 Subject: [PATCH 12/29] SUNRPC: Fix UAF in svc_tcp_listen_data_ready() stable inclusion from stable-v4.19.291 commit dfc896c4a75cb8cd7cb2dfd9b469cf1e3f004254 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I7ZELT CVE: NA -------------------------------- commit fc80fc2d4e39137869da3150ee169b40bf879287 upstream. After the listener svc_sock is freed, and before invoking svc_tcp_accept() for the established child sock, there is a window that the newsock retaining a freed listener svc_sock in sk_user_data which cloning from parent. In the race window, if data is received on the newsock, we will observe use-after-free report in svc_tcp_listen_data_ready(). Reproduce by two tasks: 1. while :; do rpc.nfsd 0 ; rpc.nfsd; done 2. while :; do echo "" | ncat -4 127.0.0.1 2049 ; done KASAN report: ================================================================== BUG: KASAN: slab-use-after-free in svc_tcp_listen_data_ready+0x1cf/0x1f0 [sunrpc] Read of size 8 at addr ffff888139d96228 by task nc/102553 CPU: 7 PID: 102553 Comm: nc Not tainted 6.3.0+ #18 Hardware name: VMware, Inc. VMware Virtual Platform/440BX Desktop Reference Platform, BIOS 6.00 11/12/2020 Call Trace: dump_stack_lvl+0x33/0x50 print_address_description.constprop.0+0x27/0x310 print_report+0x3e/0x70 kasan_report+0xae/0xe0 svc_tcp_listen_data_ready+0x1cf/0x1f0 [sunrpc] tcp_data_queue+0x9f4/0x20e0 tcp_rcv_established+0x666/0x1f60 tcp_v4_do_rcv+0x51c/0x850 tcp_v4_rcv+0x23fc/0x2e80 ip_protocol_deliver_rcu+0x62/0x300 ip_local_deliver_finish+0x267/0x350 ip_local_deliver+0x18b/0x2d0 ip_rcv+0x2fb/0x370 __netif_receive_skb_one_core+0x166/0x1b0 process_backlog+0x24c/0x5e0 __napi_poll+0xa2/0x500 net_rx_action+0x854/0xc90 __do_softirq+0x1bb/0x5de do_softirq+0xcb/0x100 ... Allocated by task 102371: kasan_save_stack+0x1e/0x40 kasan_set_track+0x21/0x30 __kasan_kmalloc+0x7b/0x90 svc_setup_socket+0x52/0x4f0 [sunrpc] svc_addsock+0x20d/0x400 [sunrpc] __write_ports_addfd+0x209/0x390 [nfsd] write_ports+0x239/0x2c0 [nfsd] nfsctl_transaction_write+0xac/0x110 [nfsd] vfs_write+0x1c3/0xae0 ksys_write+0xed/0x1c0 do_syscall_64+0x38/0x90 entry_SYSCALL_64_after_hwframe+0x72/0xdc Freed by task 102551: kasan_save_stack+0x1e/0x40 kasan_set_track+0x21/0x30 kasan_save_free_info+0x2a/0x50 __kasan_slab_free+0x106/0x190 __kmem_cache_free+0x133/0x270 svc_xprt_free+0x1e2/0x350 [sunrpc] svc_xprt_destroy_all+0x25a/0x440 [sunrpc] nfsd_put+0x125/0x240 [nfsd] nfsd_svc+0x2cb/0x3c0 [nfsd] write_threads+0x1ac/0x2a0 [nfsd] nfsctl_transaction_write+0xac/0x110 [nfsd] vfs_write+0x1c3/0xae0 ksys_write+0xed/0x1c0 do_syscall_64+0x38/0x90 entry_SYSCALL_64_after_hwframe+0x72/0xdc Fix the UAF by simply doing nothing in svc_tcp_listen_data_ready() if state != TCP_LISTEN, that will avoid dereferencing svsk for all child socket. Link: https://lore.kernel.org/lkml/20230507091131.23540-1-dinghui@sangfor.com.cn/ Fixes: fa9251afc33c ("SUNRPC: Call the default socket callbacks instead of open coding") Signed-off-by: Ding Hui Cc: Signed-off-by: Chuck Lever Signed-off-by: Greg Kroah-Hartman Signed-off-by: Yongqiang Liu --- net/sunrpc/svcsock.c | 27 +++++++++++++-------------- 1 file changed, 13 insertions(+), 14 deletions(-) diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index d0b5a1c47a32..b5ee21d5d1f3 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -757,12 +757,6 @@ static void svc_tcp_listen_data_ready(struct sock *sk) dprintk("svc: socket %p TCP (listen) state change %d\n", sk, sk->sk_state); - if (svsk) { - /* Refer to svc_setup_socket() for details. */ - rmb(); - svsk->sk_odata(sk); - } - /* * This callback may called twice when a new connection * is established as a child socket inherits everything @@ -771,15 +765,20 @@ static void svc_tcp_listen_data_ready(struct sock *sk) * when one of child sockets become ESTABLISHED. * 2) data_ready method of the child socket may be called * when it receives data before the socket is accepted. - * In case of 2, we should ignore it silently. + * In case of 2, we should ignore it silently and DO NOT + * dereference svsk. */ - if (sk->sk_state == TCP_LISTEN) { - if (svsk) { - set_bit(XPT_CONN, &svsk->sk_xprt.xpt_flags); - svc_xprt_enqueue(&svsk->sk_xprt); - } else - printk("svc: socket %p: no user data\n", sk); - } + if (sk->sk_state != TCP_LISTEN) + return; + + if (svsk) { + /* Refer to svc_setup_socket() for details. */ + rmb(); + svsk->sk_odata(sk); + set_bit(XPT_CONN, &svsk->sk_xprt.xpt_flags); + svc_xprt_enqueue(&svsk->sk_xprt); + } else + printk("svc: socket %p: no user data\n", sk); } /* -- Gitee From cb72576729f4a74ce5b3832f2ca18023fe27e2ef Mon Sep 17 00:00:00 2001 From: Cambda Zhu Date: Thu, 7 Sep 2023 16:14:49 +0800 Subject: [PATCH 13/29] net: Replace the limit of TCP_LINGER2 with TCP_FIN_TIMEOUT_MAX stable inclusion from stable-v4.19.291 commit 457202c473b933dfcee9ea8f45fff18e9f8d448f category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I7ZELT CVE: NA -------------------------------- [ Upstream commit f0628c524fd188c3f9418e12478dfdfadacba815 ] This patch changes the behavior of TCP_LINGER2 about its limit. The sysctl_tcp_fin_timeout used to be the limit of TCP_LINGER2 but now it's only the default value. A new macro named TCP_FIN_TIMEOUT_MAX is added as the limit of TCP_LINGER2, which is 2 minutes. Since TCP_LINGER2 used sysctl_tcp_fin_timeout as the default value and the limit in the past, the system administrator cannot set the default value for most of sockets and let some sockets have a greater timeout. It might be a mistake that let the sysctl to be the limit of the TCP_LINGER2. Maybe we can add a new sysctl to set the max of TCP_LINGER2, but FIN-WAIT-2 timeout is usually no need to be too long and 2 minutes are legal considering TCP specs. Changes in v3: - Remove the new socket option and change the TCP_LINGER2 behavior so that the timeout can be set to value between sysctl_tcp_fin_timeout and 2 minutes. Changes in v2: - Add int overflow check for the new socket option. Changes in v1: - Add a new socket option to set timeout greater than sysctl_tcp_fin_timeout. Signed-off-by: Cambda Zhu Signed-off-by: David S. Miller Stable-dep-of: 9df5335ca974 ("tcp: annotate data-races around tp->linger2") Signed-off-by: Sasha Levin Signed-off-by: Yongqiang Liu --- include/net/tcp.h | 1 + net/ipv4/tcp.c | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/include/net/tcp.h b/include/net/tcp.h index 3abc1e200fc1..5a684d126c31 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -128,6 +128,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo); * to combine FIN-WAIT-2 timeout with * TIME-WAIT timer. */ +#define TCP_FIN_TIMEOUT_MAX (120 * HZ) /* max TCP_LINGER2 value (two minutes) */ #define TCP_DELACK_MAX ((unsigned)(HZ/5)) /* maximal time to delay before sending an ACK */ #if HZ >= 100 diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 9a45970a6de9..77bbc756b9de 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -3004,8 +3004,8 @@ static int do_tcp_setsockopt(struct sock *sk, int level, case TCP_LINGER2: if (val < 0) tp->linger2 = -1; - else if (val > net->ipv4.sysctl_tcp_fin_timeout / HZ) - tp->linger2 = 0; + else if (val > TCP_FIN_TIMEOUT_MAX / HZ) + tp->linger2 = TCP_FIN_TIMEOUT_MAX; else tp->linger2 = val * HZ; break; -- Gitee From ea2e00ea1024990dc35287819f1e75e34fa3efbc Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 7 Sep 2023 16:14:50 +0800 Subject: [PATCH 14/29] tcp: annotate data-races around tp->linger2 stable inclusion from stable-v4.19.291 commit db4a513616f904b75e86cc725cf96d5ddeeb6936 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I7ZELT CVE: NA -------------------------------- [ Upstream commit 9df5335ca974e688389c875546e5819778a80d59 ] do_tcp_getsockopt() reads tp->linger2 while another cpu might change its value. Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Eric Dumazet Link: https://lore.kernel.org/r/20230719212857.3943972-8-edumazet@google.com Signed-off-by: Jakub Kicinski Signed-off-by: Sasha Levin Signed-off-by: Yongqiang Liu --- net/ipv4/tcp.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 77bbc756b9de..b62b3a475252 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -3003,11 +3003,11 @@ static int do_tcp_setsockopt(struct sock *sk, int level, case TCP_LINGER2: if (val < 0) - tp->linger2 = -1; + WRITE_ONCE(tp->linger2, -1); else if (val > TCP_FIN_TIMEOUT_MAX / HZ) - tp->linger2 = TCP_FIN_TIMEOUT_MAX; + WRITE_ONCE(tp->linger2, TCP_FIN_TIMEOUT_MAX); else - tp->linger2 = val * HZ; + WRITE_ONCE(tp->linger2, val * HZ); break; case TCP_DEFER_ACCEPT: @@ -3405,7 +3405,7 @@ static int do_tcp_getsockopt(struct sock *sk, int level, val = icsk->icsk_syn_retries ? : net->ipv4.sysctl_tcp_syn_retries; break; case TCP_LINGER2: - val = tp->linger2; + val = READ_ONCE(tp->linger2); if (val >= 0) val = (val ? : READ_ONCE(net->ipv4.sysctl_tcp_fin_timeout)) / HZ; break; -- Gitee From 64c41f2cc154ff3ff017cd198b6071c6ebdb0228 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 7 Sep 2023 16:14:51 +0800 Subject: [PATCH 15/29] tcp: annotate data-races around rskq_defer_accept stable inclusion from stable-v4.19.291 commit e104ef59c608e449fa0c3aa4bb1ec59e401de5cc category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I7ZELT CVE: NA -------------------------------- [ Upstream commit ae488c74422fb1dcd807c0201804b3b5e8a322a3 ] do_tcp_getsockopt() reads rskq_defer_accept while another cpu might change its value. Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Eric Dumazet Link: https://lore.kernel.org/r/20230719212857.3943972-9-edumazet@google.com Signed-off-by: Jakub Kicinski Signed-off-by: Sasha Levin Signed-off-by: Yongqiang Liu --- net/ipv4/tcp.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index b62b3a475252..294755c2e124 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -3012,9 +3012,9 @@ static int do_tcp_setsockopt(struct sock *sk, int level, case TCP_DEFER_ACCEPT: /* Translate value in seconds to number of retransmits */ - icsk->icsk_accept_queue.rskq_defer_accept = - secs_to_retrans(val, TCP_TIMEOUT_INIT / HZ, - TCP_RTO_MAX / HZ); + WRITE_ONCE(icsk->icsk_accept_queue.rskq_defer_accept, + secs_to_retrans(val, TCP_TIMEOUT_INIT / HZ, + TCP_RTO_MAX / HZ)); break; case TCP_WINDOW_CLAMP: @@ -3410,8 +3410,9 @@ static int do_tcp_getsockopt(struct sock *sk, int level, val = (val ? : READ_ONCE(net->ipv4.sysctl_tcp_fin_timeout)) / HZ; break; case TCP_DEFER_ACCEPT: - val = retrans_to_secs(icsk->icsk_accept_queue.rskq_defer_accept, - TCP_TIMEOUT_INIT / HZ, TCP_RTO_MAX / HZ); + val = READ_ONCE(icsk->icsk_accept_queue.rskq_defer_accept); + val = retrans_to_secs(val, TCP_TIMEOUT_INIT / HZ, + TCP_RTO_MAX / HZ); break; case TCP_WINDOW_CLAMP: val = tp->window_clamp; -- Gitee From 5107f4b930da5d1771ea3dd151366862c08cb028 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 7 Sep 2023 16:14:52 +0800 Subject: [PATCH 16/29] tcp: annotate data-races around tp->notsent_lowat stable inclusion from stable-v4.19.291 commit 73e2b8a15ea42ccb2d4bcd26e55d4092a1a8e4e5 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I7ZELT CVE: NA -------------------------------- [ Upstream commit 1aeb87bc1440c5447a7fa2d6e3c2cca52cbd206b ] tp->notsent_lowat can be read locklessly from do_tcp_getsockopt() and tcp_poll(). Fixes: c9bee3b7fdec ("tcp: TCP_NOTSENT_LOWAT socket option") Signed-off-by: Eric Dumazet Link: https://lore.kernel.org/r/20230719212857.3943972-10-edumazet@google.com Signed-off-by: Jakub Kicinski Signed-off-by: Sasha Levin Signed-off-by: Yongqiang Liu --- include/net/tcp.h | 6 +++++- net/ipv4/tcp.c | 4 ++-- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/include/net/tcp.h b/include/net/tcp.h index 5a684d126c31..90640375e5d4 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1885,7 +1885,11 @@ void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr); static inline u32 tcp_notsent_lowat(const struct tcp_sock *tp) { struct net *net = sock_net((struct sock *)tp); - return tp->notsent_lowat ?: READ_ONCE(net->ipv4.sysctl_tcp_notsent_lowat); + u32 val; + + val = READ_ONCE(tp->notsent_lowat); + + return val ?: READ_ONCE(net->ipv4.sysctl_tcp_notsent_lowat); } static inline bool tcp_stream_memory_free(const struct sock *sk) diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 294755c2e124..5ada244ab8f4 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -3102,7 +3102,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level, err = tcp_repair_set_window(tp, optval, optlen); break; case TCP_NOTSENT_LOWAT: - tp->notsent_lowat = val; + WRITE_ONCE(tp->notsent_lowat, val); sk->sk_write_space(sk); break; case TCP_INQ: @@ -3573,7 +3573,7 @@ static int do_tcp_getsockopt(struct sock *sk, int level, val = tcp_time_stamp_raw() + tp->tsoffset; break; case TCP_NOTSENT_LOWAT: - val = tp->notsent_lowat; + val = READ_ONCE(tp->notsent_lowat); break; case TCP_INQ: val = tp->recvmsg_inq; -- Gitee From ab7d6d7c4551d697cab9d27f9b5bbc95f4e4aaab Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 7 Sep 2023 16:14:53 +0800 Subject: [PATCH 17/29] tcp: annotate data-races around fastopenq.max_qlen stable inclusion from stable-v4.19.291 commit cc4122c81d211307c7cd2ddf7455ee171d867983 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I7ZELT CVE: NA -------------------------------- [ Upstream commit 70f360dd7042cb843635ece9d28335a4addff9eb ] This field can be read locklessly. Fixes: 1536e2857bd3 ("tcp: Add a TCP_FASTOPEN socket option to get a max backlog on its listner") Signed-off-by: Eric Dumazet Link: https://lore.kernel.org/r/20230719212857.3943972-12-edumazet@google.com Signed-off-by: Jakub Kicinski Signed-off-by: Sasha Levin Signed-off-by: Yongqiang Liu --- include/linux/tcp.h | 2 +- net/ipv4/tcp.c | 2 +- net/ipv4/tcp_fastopen.c | 6 ++++-- 3 files changed, 6 insertions(+), 4 deletions(-) diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 38e569967849..5c1e0f39fc44 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -460,7 +460,7 @@ static inline void fastopen_queue_tune(struct sock *sk, int backlog) struct request_sock_queue *queue = &inet_csk(sk)->icsk_accept_queue; int somaxconn = READ_ONCE(sock_net(sk)->core.sysctl_somaxconn); - queue->fastopenq.max_qlen = min_t(unsigned int, backlog, somaxconn); + WRITE_ONCE(queue->fastopenq.max_qlen, min_t(unsigned int, backlog, somaxconn)); } static inline void tcp_move_syn(struct tcp_sock *tp, diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 5ada244ab8f4..01f8c40dc753 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -3558,7 +3558,7 @@ static int do_tcp_getsockopt(struct sock *sk, int level, break; case TCP_FASTOPEN: - val = icsk->icsk_accept_queue.fastopenq.max_qlen; + val = READ_ONCE(icsk->icsk_accept_queue.fastopenq.max_qlen); break; case TCP_FASTOPEN_CONNECT: diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c index 54cc3b9e443c..f645c5397095 100644 --- a/net/ipv4/tcp_fastopen.c +++ b/net/ipv4/tcp_fastopen.c @@ -276,6 +276,7 @@ static struct sock *tcp_fastopen_create_child(struct sock *sk, static bool tcp_fastopen_queue_check(struct sock *sk) { struct fastopen_queue *fastopenq; + int max_qlen; /* Make sure the listener has enabled fastopen, and we don't * exceed the max # of pending TFO requests allowed before trying @@ -288,10 +289,11 @@ static bool tcp_fastopen_queue_check(struct sock *sk) * temporarily vs a server not supporting Fast Open at all. */ fastopenq = &inet_csk(sk)->icsk_accept_queue.fastopenq; - if (fastopenq->max_qlen == 0) + max_qlen = READ_ONCE(fastopenq->max_qlen); + if (max_qlen == 0) return false; - if (fastopenq->qlen >= fastopenq->max_qlen) { + if (fastopenq->qlen >= max_qlen) { struct request_sock *req1; spin_lock(&fastopenq->lock); req1 = fastopenq->rskq_rst_head; -- Gitee From 1a62d5b1b0759cd4b3c8d51650c2d18a3b9b8c8b Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Thu, 7 Sep 2023 16:14:54 +0800 Subject: [PATCH 18/29] bonding: reset bond's flags when down link is P2P device stable inclusion from stable-v4.19.291 commit c8859d5118bf6ec72f2e9326f2174e0b8cfbb1a3 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I7ZELT CVE: NA -------------------------------- [ Upstream commit da19a2b967cf1e2c426f50d28550d1915214a81d ] When adding a point to point downlink to the bond, we neglected to reset the bond's flags, which were still using flags like BROADCAST and MULTICAST. Consequently, this would initiate ARP/DAD for P2P downlink interfaces, such as when adding a GRE device to the bonding. To address this issue, let's reset the bond's flags for P2P interfaces. Before fix: 7: gre0@NONE: mtu 1500 qdisc noqueue master bond0 state UNKNOWN group default qlen 1000 link/gre6 2006:70:10::1 peer 2006:70:10::2 permaddr 167f:18:f188:: 8: bond0: mtu 1500 qdisc noqueue state UP group default qlen 1000 link/gre6 2006:70:10::1 brd 2006:70:10::2 inet6 fe80::200:ff:fe00:0/64 scope link valid_lft forever preferred_lft forever After fix: 7: gre0@NONE: mtu 1500 qdisc noqueue master bond2 state UNKNOWN group default qlen 1000 link/gre6 2006:70:10::1 peer 2006:70:10::2 permaddr c29e:557a:e9d9:: 8: bond0: mtu 1500 qdisc noqueue state UP group default qlen 1000 link/gre6 2006:70:10::1 peer 2006:70:10::2 inet6 fe80::1/64 scope link valid_lft forever preferred_lft forever Reported-by: Liang Li Closes: https://bugzilla.redhat.com/show_bug.cgi?id=2221438 Fixes: 872254dd6b1f ("net/bonding: Enable bonding to enslave non ARPHRD_ETHER") Signed-off-by: Hangbin Liu Signed-off-by: Paolo Abeni Signed-off-by: Sasha Levin Signed-off-by: Yongqiang Liu --- drivers/net/bonding/bond_main.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index 99c792fab45a..56c16b171e35 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -1145,6 +1145,11 @@ static void bond_setup_by_slave(struct net_device *bond_dev, memcpy(bond_dev->broadcast, slave_dev->broadcast, slave_dev->addr_len); + + if (slave_dev->flags & IFF_POINTOPOINT) { + bond_dev->flags &= ~(IFF_BROADCAST | IFF_MULTICAST); + bond_dev->flags |= (IFF_POINTOPOINT | IFF_NOARP); + } } /* On bonding slaves other than the currently active slave, suppress -- Gitee From 6dea22cae8359b7069e5661f1d4db21222cf49c0 Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Thu, 7 Sep 2023 16:14:55 +0800 Subject: [PATCH 19/29] team: reset team's flags when down link is P2P device stable inclusion from stable-v4.19.291 commit bc0898ad9048390a507c74997040870d7a488903 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I7ZELT CVE: NA -------------------------------- [ Upstream commit fa532bee17d15acf8bba4bc8e2062b7a093ba801 ] When adding a point to point downlink to team device, we neglected to reset the team's flags, which were still using flags like BROADCAST and MULTICAST. Consequently, this would initiate ARP/DAD for P2P downlink interfaces, such as when adding a GRE device to team device. Fix this by remove multicast/broadcast flags and add p2p and noarp flags. After removing the none ethernet interface and adding an ethernet interface to team, we need to reset team interface flags. Unlike bonding interface, team do not need restore IFF_MASTER, IFF_SLAVE flags. Reported-by: Liang Li Closes: https://bugzilla.redhat.com/show_bug.cgi?id=2221438 Fixes: 1d76efe1577b ("team: add support for non-ethernet devices") Signed-off-by: Hangbin Liu Signed-off-by: Paolo Abeni Signed-off-by: Sasha Levin Signed-off-by: Yongqiang Liu --- drivers/net/team/team.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/drivers/net/team/team.c b/drivers/net/team/team.c index e1ced8169d65..39819a521642 100644 --- a/drivers/net/team/team.c +++ b/drivers/net/team/team.c @@ -2098,6 +2098,15 @@ static void team_setup_by_port(struct net_device *dev, dev->mtu = port_dev->mtu; memcpy(dev->broadcast, port_dev->broadcast, port_dev->addr_len); eth_hw_addr_inherit(dev, port_dev); + + if (port_dev->flags & IFF_POINTOPOINT) { + dev->flags &= ~(IFF_BROADCAST | IFF_MULTICAST); + dev->flags |= (IFF_POINTOPOINT | IFF_NOARP); + } else if ((port_dev->flags & (IFF_BROADCAST | IFF_MULTICAST)) == + (IFF_BROADCAST | IFF_MULTICAST)) { + dev->flags |= (IFF_BROADCAST | IFF_MULTICAST); + dev->flags &= ~(IFF_POINTOPOINT | IFF_NOARP); + } } static int team_dev_type_check_change(struct net_device *dev, -- Gitee From 9c03a6cf535d7d58725dccdecbaf58e681b4a8d1 Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Thu, 7 Sep 2023 16:14:56 +0800 Subject: [PATCH 20/29] virtio-net: fix race between set queues and probe stable inclusion from stable-v4.19.291 commit 7fd85f3037d0fbabedc25abfea90b9bce881c533 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I7ZELT CVE: NA -------------------------------- commit 25266128fe16d5632d43ada34c847d7b8daba539 upstream. A race were found where set_channels could be called after registering but before virtnet_set_queues() in virtnet_probe(). Fixing this by moving the virtnet_set_queues() before netdevice registering. While at it, use _virtnet_set_queues() to avoid holding rtnl as the device is not even registered at that time. Cc: stable@vger.kernel.org Fixes: a220871be66f ("virtio-net: correctly enable multiqueue") Signed-off-by: Jason Wang Acked-by: Michael S. Tsirkin Reviewed-by: Xuan Zhuo Link: https://lore.kernel.org/r/20230725072049.617289-1-jasowang@redhat.com Signed-off-by: Jakub Kicinski Signed-off-by: Greg Kroah-Hartman Signed-off-by: Yongqiang Liu --- drivers/net/virtio_net.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index ca84311a179f..67ab510bd3dc 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -3103,6 +3103,8 @@ static int virtnet_probe(struct virtio_device *vdev) } } + _virtnet_set_queues(vi, vi->curr_queue_pairs); + /* serialize netdev register + virtio_device_ready() with ndo_open() */ rtnl_lock(); @@ -3123,8 +3125,6 @@ static int virtnet_probe(struct virtio_device *vdev) goto free_unregister_netdev; } - virtnet_set_queues(vi, vi->curr_queue_pairs); - /* Assume link up if device can't report link status, otherwise get link status from config. */ netif_carrier_off(dev); -- Gitee From 2baedca6c7211de61b1dbd32e5dd4ea5cf017a6d Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Thu, 7 Sep 2023 20:50:14 +0800 Subject: [PATCH 21/29] virtio-net: set queues after driver_ok stable inclusion from stable-v4.19.293 commit 834c99d99f7eaf63485ea3c5da8e8293f648b4f7 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I7ZELT CVE: NA -------------------------------- commit 51b813176f098ff61bd2833f627f5319ead098a5 upstream. Commit 25266128fe16 ("virtio-net: fix race between set queues and probe") tries to fix the race between set queues and probe by calling _virtnet_set_queues() before DRIVER_OK is set. This violates virtio spec. Fixing this by setting queues after virtio_device_ready(). Note that rtnl needs to be held for userspace requests to change the number of queues. So we are serialized in this way. Fixes: 25266128fe16 ("virtio-net: fix race between set queues and probe") Reported-by: Dragos Tatulea Acked-by: Michael S. Tsirkin Signed-off-by: Jason Wang Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman Signed-off-by: Yongqiang Liu --- drivers/net/virtio_net.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index 67ab510bd3dc..a34f8ca15d73 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -3103,8 +3103,6 @@ static int virtnet_probe(struct virtio_device *vdev) } } - _virtnet_set_queues(vi, vi->curr_queue_pairs); - /* serialize netdev register + virtio_device_ready() with ndo_open() */ rtnl_lock(); @@ -3117,6 +3115,8 @@ static int virtnet_probe(struct virtio_device *vdev) virtio_device_ready(vdev); + _virtnet_set_queues(vi, vi->curr_queue_pairs); + rtnl_unlock(); err = virtnet_cpu_notif_add(vi); -- Gitee From eb8917b8d417eda46658adfa02f6143d8914272c Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 7 Sep 2023 16:14:57 +0800 Subject: [PATCH 22/29] tcp_metrics: fix addr_same() helper stable inclusion from stable-v4.19.291 commit 685e39a4a00aa58d00f877eb388d9ed24255bf24 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I7ZELT CVE: NA -------------------------------- [ Upstream commit e6638094d7af6c7b9dcca05ad009e79e31b4f670 ] Because v4 and v6 families use separate inetpeer trees (respectively net->ipv4.peers and net->ipv6.peers), inetpeer_addr_cmp(a, b) assumes a & b share the same family. tcp_metrics use a common hash table, where entries can have different families. We must therefore make sure to not call inetpeer_addr_cmp() if the families do not match. Fixes: d39d14ffa24c ("net: Add helper function to compare inetpeer addresses") Signed-off-by: Eric Dumazet Reviewed-by: David Ahern Reviewed-by: Kuniyuki Iwashima Link: https://lore.kernel.org/r/20230802131500.1478140-2-edumazet@google.com Signed-off-by: Jakub Kicinski Signed-off-by: Sasha Levin Signed-off-by: Yongqiang Liu --- net/ipv4/tcp_metrics.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c index 4960e2b6bd7f..c3e133c0510e 100644 --- a/net/ipv4/tcp_metrics.c +++ b/net/ipv4/tcp_metrics.c @@ -78,7 +78,7 @@ static void tcp_metric_set(struct tcp_metrics_block *tm, static bool addr_same(const struct inetpeer_addr *a, const struct inetpeer_addr *b) { - return inetpeer_addr_cmp(a, b) == 0; + return (a->family == b->family) && !inetpeer_addr_cmp(a, b); } struct tcpm_hash_bucket { -- Gitee From baf10b5992fced2f1b1e99b698273e04322b42b5 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 7 Sep 2023 16:14:58 +0800 Subject: [PATCH 23/29] tcp_metrics: annotate data-races around tm->tcpm_stamp stable inclusion from stable-v4.19.291 commit 754bb55456ea339c37d2aabb35d9c2d549f3c30d category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I7ZELT CVE: NA -------------------------------- [ Upstream commit 949ad62a5d5311d36fce2e14fe5fed3f936da51c ] tm->tcpm_stamp can be read or written locklessly. Add needed READ_ONCE()/WRITE_ONCE() to document this. Also constify tcpm_check_stamp() dst argument. Fixes: 51c5d0c4b169 ("tcp: Maintain dynamic metrics in local cache.") Signed-off-by: Eric Dumazet Reviewed-by: David Ahern Reviewed-by: Kuniyuki Iwashima Link: https://lore.kernel.org/r/20230802131500.1478140-3-edumazet@google.com Signed-off-by: Jakub Kicinski Signed-off-by: Sasha Levin Signed-off-by: Yongqiang Liu --- net/ipv4/tcp_metrics.c | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c index c3e133c0510e..2d9d95559f5f 100644 --- a/net/ipv4/tcp_metrics.c +++ b/net/ipv4/tcp_metrics.c @@ -97,7 +97,7 @@ static void tcpm_suck_dst(struct tcp_metrics_block *tm, u32 msval; u32 val; - tm->tcpm_stamp = jiffies; + WRITE_ONCE(tm->tcpm_stamp, jiffies); val = 0; if (dst_metric_locked(dst, RTAX_RTT)) @@ -131,9 +131,15 @@ static void tcpm_suck_dst(struct tcp_metrics_block *tm, #define TCP_METRICS_TIMEOUT (60 * 60 * HZ) -static void tcpm_check_stamp(struct tcp_metrics_block *tm, struct dst_entry *dst) +static void tcpm_check_stamp(struct tcp_metrics_block *tm, + const struct dst_entry *dst) { - if (tm && unlikely(time_after(jiffies, tm->tcpm_stamp + TCP_METRICS_TIMEOUT))) + unsigned long limit; + + if (!tm) + return; + limit = READ_ONCE(tm->tcpm_stamp) + TCP_METRICS_TIMEOUT; + if (unlikely(time_after(jiffies, limit))) tcpm_suck_dst(tm, dst, false); } @@ -174,7 +180,8 @@ static struct tcp_metrics_block *tcpm_new(struct dst_entry *dst, oldest = deref_locked(tcp_metrics_hash[hash].chain); for (tm = deref_locked(oldest->tcpm_next); tm; tm = deref_locked(tm->tcpm_next)) { - if (time_before(tm->tcpm_stamp, oldest->tcpm_stamp)) + if (time_before(READ_ONCE(tm->tcpm_stamp), + READ_ONCE(oldest->tcpm_stamp))) oldest = tm; } tm = oldest; @@ -431,7 +438,7 @@ void tcp_update_metrics(struct sock *sk) tp->reordering); } } - tm->tcpm_stamp = jiffies; + WRITE_ONCE(tm->tcpm_stamp, jiffies); out_unlock: rcu_read_unlock(); } @@ -652,7 +659,7 @@ static int tcp_metrics_fill_info(struct sk_buff *msg, } if (nla_put_msecs(msg, TCP_METRICS_ATTR_AGE, - jiffies - tm->tcpm_stamp, + jiffies - READ_ONCE(tm->tcpm_stamp), TCP_METRICS_ATTR_PAD) < 0) goto nla_put_failure; -- Gitee From 93f586d4c56e56d30667faffb7770e5b9a35e815 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 7 Sep 2023 16:14:59 +0800 Subject: [PATCH 24/29] tcp_metrics: annotate data-races around tm->tcpm_lock stable inclusion from stable-v4.19.291 commit f58d2c5102736c2fe8af9515e31c733d51f0520c category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I7ZELT CVE: NA -------------------------------- [ Upstream commit 285ce119a3c6c4502585936650143e54c8692788 ] tm->tcpm_lock can be read or written locklessly. Add needed READ_ONCE()/WRITE_ONCE() to document this. Fixes: 51c5d0c4b169 ("tcp: Maintain dynamic metrics in local cache.") Signed-off-by: Eric Dumazet Reviewed-by: David Ahern Reviewed-by: Kuniyuki Iwashima Link: https://lore.kernel.org/r/20230802131500.1478140-4-edumazet@google.com Signed-off-by: Jakub Kicinski Signed-off-by: Sasha Levin Signed-off-by: Yongqiang Liu --- net/ipv4/tcp_metrics.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c index 2d9d95559f5f..2529b1e6ded0 100644 --- a/net/ipv4/tcp_metrics.c +++ b/net/ipv4/tcp_metrics.c @@ -59,7 +59,8 @@ static inline struct net *tm_net(struct tcp_metrics_block *tm) static bool tcp_metric_locked(struct tcp_metrics_block *tm, enum tcp_metric_index idx) { - return tm->tcpm_lock & (1 << idx); + /* Paired with WRITE_ONCE() in tcpm_suck_dst() */ + return READ_ONCE(tm->tcpm_lock) & (1 << idx); } static u32 tcp_metric_get(struct tcp_metrics_block *tm, @@ -110,7 +111,8 @@ static void tcpm_suck_dst(struct tcp_metrics_block *tm, val |= 1 << TCP_METRIC_CWND; if (dst_metric_locked(dst, RTAX_REORDERING)) val |= 1 << TCP_METRIC_REORDERING; - tm->tcpm_lock = val; + /* Paired with READ_ONCE() in tcp_metric_locked() */ + WRITE_ONCE(tm->tcpm_lock, val); msval = dst_metric_raw(dst, RTAX_RTT); tm->tcpm_vals[TCP_METRIC_RTT] = msval * USEC_PER_MSEC; -- Gitee From 3a76fa8351e2e508d3e91a22bb7241ac445b42e4 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 7 Sep 2023 16:15:00 +0800 Subject: [PATCH 25/29] tcp_metrics: annotate data-races around tm->tcpm_vals[] stable inclusion from stable-v4.19.291 commit 9267df7f26222c2d5138cf39b4477b4e2669e03c category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I7ZELT CVE: NA -------------------------------- [ Upstream commit 8c4d04f6b443869d25e59822f7cec88d647028a9 ] tm->tcpm_vals[] values can be read or written locklessly. Add needed READ_ONCE()/WRITE_ONCE() to document this, and force use of tcp_metric_get() and tcp_metric_set() Fixes: 51c5d0c4b169 ("tcp: Maintain dynamic metrics in local cache.") Signed-off-by: Eric Dumazet Reviewed-by: David Ahern Reviewed-by: Kuniyuki Iwashima Signed-off-by: Jakub Kicinski Signed-off-by: Sasha Levin Signed-off-by: Yongqiang Liu --- net/ipv4/tcp_metrics.c | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c index 2529b1e6ded0..fa99481abce8 100644 --- a/net/ipv4/tcp_metrics.c +++ b/net/ipv4/tcp_metrics.c @@ -63,17 +63,19 @@ static bool tcp_metric_locked(struct tcp_metrics_block *tm, return READ_ONCE(tm->tcpm_lock) & (1 << idx); } -static u32 tcp_metric_get(struct tcp_metrics_block *tm, +static u32 tcp_metric_get(const struct tcp_metrics_block *tm, enum tcp_metric_index idx) { - return tm->tcpm_vals[idx]; + /* Paired with WRITE_ONCE() in tcp_metric_set() */ + return READ_ONCE(tm->tcpm_vals[idx]); } static void tcp_metric_set(struct tcp_metrics_block *tm, enum tcp_metric_index idx, u32 val) { - tm->tcpm_vals[idx] = val; + /* Paired with READ_ONCE() in tcp_metric_get() */ + WRITE_ONCE(tm->tcpm_vals[idx], val); } static bool addr_same(const struct inetpeer_addr *a, @@ -115,13 +117,16 @@ static void tcpm_suck_dst(struct tcp_metrics_block *tm, WRITE_ONCE(tm->tcpm_lock, val); msval = dst_metric_raw(dst, RTAX_RTT); - tm->tcpm_vals[TCP_METRIC_RTT] = msval * USEC_PER_MSEC; + tcp_metric_set(tm, TCP_METRIC_RTT, msval * USEC_PER_MSEC); msval = dst_metric_raw(dst, RTAX_RTTVAR); - tm->tcpm_vals[TCP_METRIC_RTTVAR] = msval * USEC_PER_MSEC; - tm->tcpm_vals[TCP_METRIC_SSTHRESH] = dst_metric_raw(dst, RTAX_SSTHRESH); - tm->tcpm_vals[TCP_METRIC_CWND] = dst_metric_raw(dst, RTAX_CWND); - tm->tcpm_vals[TCP_METRIC_REORDERING] = dst_metric_raw(dst, RTAX_REORDERING); + tcp_metric_set(tm, TCP_METRIC_RTTVAR, msval * USEC_PER_MSEC); + tcp_metric_set(tm, TCP_METRIC_SSTHRESH, + dst_metric_raw(dst, RTAX_SSTHRESH)); + tcp_metric_set(tm, TCP_METRIC_CWND, + dst_metric_raw(dst, RTAX_CWND)); + tcp_metric_set(tm, TCP_METRIC_REORDERING, + dst_metric_raw(dst, RTAX_REORDERING)); if (fastopen_clear) { tm->tcpm_fastopen.mss = 0; tm->tcpm_fastopen.syn_loss = 0; @@ -672,7 +677,7 @@ static int tcp_metrics_fill_info(struct sk_buff *msg, if (!nest) goto nla_put_failure; for (i = 0; i < TCP_METRIC_MAX_KERNEL + 1; i++) { - u32 val = tm->tcpm_vals[i]; + u32 val = tcp_metric_get(tm, i); if (!val) continue; -- Gitee From b79f130d990ad228eebf6f5c4b6a669572ec7eaf Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 7 Sep 2023 16:15:01 +0800 Subject: [PATCH 26/29] tcp_metrics: annotate data-races around tm->tcpm_net stable inclusion from stable-v4.19.291 commit 0e273d202f84f4fdb2f538f6dcf0dca9a5b0c200 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I7ZELT CVE: NA -------------------------------- [ Upstream commit d5d986ce42c71a7562d32c4e21e026b0f87befec ] tm->tcpm_net can be read or written locklessly. Instead of changing write_pnet() and read_pnet() and potentially hurt performance, add the needed READ_ONCE()/WRITE_ONCE() in tm_net() and tcpm_new(). Fixes: 849e8a0ca8d5 ("tcp_metrics: Add a field tcpm_net and verify it matches on lookup") Signed-off-by: Eric Dumazet Reviewed-by: David Ahern Reviewed-by: Kuniyuki Iwashima Link: https://lore.kernel.org/r/20230802131500.1478140-6-edumazet@google.com Signed-off-by: Jakub Kicinski Signed-off-by: Sasha Levin Signed-off-by: Yongqiang Liu --- net/ipv4/tcp_metrics.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c index fa99481abce8..dfd224979cf6 100644 --- a/net/ipv4/tcp_metrics.c +++ b/net/ipv4/tcp_metrics.c @@ -40,7 +40,7 @@ struct tcp_fastopen_metrics { struct tcp_metrics_block { struct tcp_metrics_block __rcu *tcpm_next; - possible_net_t tcpm_net; + struct net *tcpm_net; struct inetpeer_addr tcpm_saddr; struct inetpeer_addr tcpm_daddr; unsigned long tcpm_stamp; @@ -51,9 +51,10 @@ struct tcp_metrics_block { struct rcu_head rcu_head; }; -static inline struct net *tm_net(struct tcp_metrics_block *tm) +static inline struct net *tm_net(const struct tcp_metrics_block *tm) { - return read_pnet(&tm->tcpm_net); + /* Paired with the WRITE_ONCE() in tcpm_new() */ + return READ_ONCE(tm->tcpm_net); } static bool tcp_metric_locked(struct tcp_metrics_block *tm, @@ -197,7 +198,9 @@ static struct tcp_metrics_block *tcpm_new(struct dst_entry *dst, if (!tm) goto out_unlock; } - write_pnet(&tm->tcpm_net, net); + /* Paired with the READ_ONCE() in tm_net() */ + WRITE_ONCE(tm->tcpm_net, net); + tm->tcpm_saddr = *saddr; tm->tcpm_daddr = *daddr; -- Gitee From ee64d05a5086fa173e1ac3a76c8050f9c0495597 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 7 Sep 2023 16:15:02 +0800 Subject: [PATCH 27/29] tcp_metrics: fix data-race in tcpm_suck_dst() vs fastopen stable inclusion from stable-v4.19.291 commit 48f0ecf5be777bcd6b83dc0193784406364419c7 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I7ZELT CVE: NA -------------------------------- [ Upstream commit ddf251fa2bc1d3699eec0bae6ed0bc373b8fda79 ] Whenever tcpm_new() reclaims an old entry, tcpm_suck_dst() would overwrite data that could be read from tcp_fastopen_cache_get() or tcp_metrics_fill_info(). We need to acquire fastopen_seqlock to maintain consistency. For newly allocated objects, tcpm_new() can switch to kzalloc() to avoid an extra fastopen_seqlock acquisition. Fixes: 1fe4c481ba63 ("net-tcp: Fast Open client - cookie cache") Signed-off-by: Eric Dumazet Cc: Yuchung Cheng Reviewed-by: Kuniyuki Iwashima Link: https://lore.kernel.org/r/20230802131500.1478140-7-edumazet@google.com Signed-off-by: Jakub Kicinski Signed-off-by: Sasha Levin Signed-off-by: Yongqiang Liu --- net/ipv4/tcp_metrics.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c index dfd224979cf6..7bbd9125b500 100644 --- a/net/ipv4/tcp_metrics.c +++ b/net/ipv4/tcp_metrics.c @@ -93,6 +93,7 @@ static struct tcpm_hash_bucket *tcp_metrics_hash __read_mostly; static unsigned int tcp_metrics_hash_log __read_mostly; static DEFINE_SPINLOCK(tcp_metrics_lock); +static DEFINE_SEQLOCK(fastopen_seqlock); static void tcpm_suck_dst(struct tcp_metrics_block *tm, const struct dst_entry *dst, @@ -129,11 +130,13 @@ static void tcpm_suck_dst(struct tcp_metrics_block *tm, tcp_metric_set(tm, TCP_METRIC_REORDERING, dst_metric_raw(dst, RTAX_REORDERING)); if (fastopen_clear) { + write_seqlock(&fastopen_seqlock); tm->tcpm_fastopen.mss = 0; tm->tcpm_fastopen.syn_loss = 0; tm->tcpm_fastopen.try_exp = 0; tm->tcpm_fastopen.cookie.exp = false; tm->tcpm_fastopen.cookie.len = 0; + write_sequnlock(&fastopen_seqlock); } } @@ -194,7 +197,7 @@ static struct tcp_metrics_block *tcpm_new(struct dst_entry *dst, } tm = oldest; } else { - tm = kmalloc(sizeof(*tm), GFP_ATOMIC); + tm = kzalloc(sizeof(*tm), GFP_ATOMIC); if (!tm) goto out_unlock; } @@ -204,7 +207,7 @@ static struct tcp_metrics_block *tcpm_new(struct dst_entry *dst, tm->tcpm_saddr = *saddr; tm->tcpm_daddr = *daddr; - tcpm_suck_dst(tm, dst, true); + tcpm_suck_dst(tm, dst, reclaim); if (likely(!reclaim)) { tm->tcpm_next = tcp_metrics_hash[hash].chain; @@ -561,8 +564,6 @@ bool tcp_peer_is_proven(struct request_sock *req, struct dst_entry *dst) return ret; } -static DEFINE_SEQLOCK(fastopen_seqlock); - void tcp_fastopen_cache_get(struct sock *sk, u16 *mss, struct tcp_fastopen_cookie *cookie) { -- Gitee From e4aa250edf835603adf0c1f5dfca8e998cf0b95b Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 7 Sep 2023 16:15:03 +0800 Subject: [PATCH 28/29] net/packet: annotate data-races around tp->status stable inclusion from stable-v4.19.292 commit 0fb58f7ca08a1c1b08f6a150e6b2844d34e6bf2b category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I7ZELT CVE: NA -------------------------------- commit 8a9896177784063d01068293caea3f74f6830ff6 upstream. Another syzbot report [1] is about tp->status lockless reads from __packet_get_status() [1] BUG: KCSAN: data-race in __packet_rcv_has_room / __packet_set_status write to 0xffff888117d7c080 of 8 bytes by interrupt on cpu 0: __packet_set_status+0x78/0xa0 net/packet/af_packet.c:407 tpacket_rcv+0x18bb/0x1a60 net/packet/af_packet.c:2483 deliver_skb net/core/dev.c:2173 [inline] __netif_receive_skb_core+0x408/0x1e80 net/core/dev.c:5337 __netif_receive_skb_one_core net/core/dev.c:5491 [inline] __netif_receive_skb+0x57/0x1b0 net/core/dev.c:5607 process_backlog+0x21f/0x380 net/core/dev.c:5935 __napi_poll+0x60/0x3b0 net/core/dev.c:6498 napi_poll net/core/dev.c:6565 [inline] net_rx_action+0x32b/0x750 net/core/dev.c:6698 __do_softirq+0xc1/0x265 kernel/softirq.c:571 invoke_softirq kernel/softirq.c:445 [inline] __irq_exit_rcu+0x57/0xa0 kernel/softirq.c:650 sysvec_apic_timer_interrupt+0x6d/0x80 arch/x86/kernel/apic/apic.c:1106 asm_sysvec_apic_timer_interrupt+0x1a/0x20 arch/x86/include/asm/idtentry.h:645 smpboot_thread_fn+0x33c/0x4a0 kernel/smpboot.c:112 kthread+0x1d7/0x210 kernel/kthread.c:379 ret_from_fork+0x1f/0x30 arch/x86/entry/entry_64.S:308 read to 0xffff888117d7c080 of 8 bytes by interrupt on cpu 1: __packet_get_status net/packet/af_packet.c:436 [inline] packet_lookup_frame net/packet/af_packet.c:524 [inline] __tpacket_has_room net/packet/af_packet.c:1255 [inline] __packet_rcv_has_room+0x3f9/0x450 net/packet/af_packet.c:1298 tpacket_rcv+0x275/0x1a60 net/packet/af_packet.c:2285 deliver_skb net/core/dev.c:2173 [inline] dev_queue_xmit_nit+0x38a/0x5e0 net/core/dev.c:2243 xmit_one net/core/dev.c:3574 [inline] dev_hard_start_xmit+0xcf/0x3f0 net/core/dev.c:3594 __dev_queue_xmit+0xefb/0x1d10 net/core/dev.c:4244 dev_queue_xmit include/linux/netdevice.h:3088 [inline] can_send+0x4eb/0x5d0 net/can/af_can.c:276 bcm_can_tx+0x314/0x410 net/can/bcm.c:302 bcm_tx_timeout_handler+0xdb/0x260 __run_hrtimer kernel/time/hrtimer.c:1685 [inline] __hrtimer_run_queues+0x217/0x700 kernel/time/hrtimer.c:1749 hrtimer_run_softirq+0xd6/0x120 kernel/time/hrtimer.c:1766 __do_softirq+0xc1/0x265 kernel/softirq.c:571 run_ksoftirqd+0x17/0x20 kernel/softirq.c:939 smpboot_thread_fn+0x30a/0x4a0 kernel/smpboot.c:164 kthread+0x1d7/0x210 kernel/kthread.c:379 ret_from_fork+0x1f/0x30 arch/x86/entry/entry_64.S:308 value changed: 0x0000000000000000 -> 0x0000000020000081 Reported by Kernel Concurrency Sanitizer on: CPU: 1 PID: 19 Comm: ksoftirqd/1 Not tainted 6.4.0-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 05/27/2023 Fixes: 69e3c75f4d54 ("net: TX_RING and packet mmap") Reported-by: syzbot Signed-off-by: Eric Dumazet Reviewed-by: Willem de Bruijn Link: https://lore.kernel.org/r/20230803145600.2937518-1-edumazet@google.com Signed-off-by: Jakub Kicinski Signed-off-by: Greg Kroah-Hartman Signed-off-by: Yongqiang Liu --- net/packet/af_packet.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 332b1f7a8924..4e3766a72540 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -370,18 +370,20 @@ static void __packet_set_status(struct packet_sock *po, void *frame, int status) { union tpacket_uhdr h; + /* WRITE_ONCE() are paired with READ_ONCE() in __packet_get_status */ + h.raw = frame; switch (po->tp_version) { case TPACKET_V1: - h.h1->tp_status = status; + WRITE_ONCE(h.h1->tp_status, status); flush_dcache_page(pgv_to_page(&h.h1->tp_status)); break; case TPACKET_V2: - h.h2->tp_status = status; + WRITE_ONCE(h.h2->tp_status, status); flush_dcache_page(pgv_to_page(&h.h2->tp_status)); break; case TPACKET_V3: - h.h3->tp_status = status; + WRITE_ONCE(h.h3->tp_status, status); flush_dcache_page(pgv_to_page(&h.h3->tp_status)); break; default: @@ -398,17 +400,19 @@ static int __packet_get_status(struct packet_sock *po, void *frame) smp_rmb(); + /* READ_ONCE() are paired with WRITE_ONCE() in __packet_set_status */ + h.raw = frame; switch (po->tp_version) { case TPACKET_V1: flush_dcache_page(pgv_to_page(&h.h1->tp_status)); - return h.h1->tp_status; + return READ_ONCE(h.h1->tp_status); case TPACKET_V2: flush_dcache_page(pgv_to_page(&h.h2->tp_status)); - return h.h2->tp_status; + return READ_ONCE(h.h2->tp_status); case TPACKET_V3: flush_dcache_page(pgv_to_page(&h.h3->tp_status)); - return h.h3->tp_status; + return READ_ONCE(h.h3->tp_status); default: WARN(1, "TPACKET version not supported.\n"); BUG(); -- Gitee From b029583427abcd44e62ad763cace48fb5a10ff67 Mon Sep 17 00:00:00 2001 From: Andrew Kanner Date: Thu, 7 Sep 2023 16:15:04 +0800 Subject: [PATCH 29/29] drivers: net: prevent tun_build_skb() to exceed the packet size limit stable inclusion from stable-v4.19.292 commit af1f1ce2aa288cf7c553943c7984e1f7ffb73ebf category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I7ZELT CVE: NA -------------------------------- commit 59eeb232940515590de513b997539ef495faca9a upstream. Using the syzkaller repro with reduced packet size it was discovered that XDP_PACKET_HEADROOM is not checked in tun_can_build_skb(), although pad may be incremented in tun_build_skb(). This may end up with exceeding the PAGE_SIZE limit in tun_build_skb(). Jason Wang proposed to count XDP_PACKET_HEADROOM always (e.g. without rcu_access_pointer(tun->xdp_prog)) in tun_can_build_skb() since there's a window during which XDP program might be attached between tun_can_build_skb() and tun_build_skb(). Fixes: 7df13219d757 ("tun: reserve extra headroom only when XDP is set") Link: https://syzkaller.appspot.com/bug?extid=f817490f5bd20541b90a Signed-off-by: Andrew Kanner Link: https://lore.kernel.org/r/20230803185947.2379988-1-andrew.kanner@gmail.com Signed-off-by: Jakub Kicinski Signed-off-by: Greg Kroah-Hartman Signed-off-by: Yongqiang Liu --- drivers/net/tun.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/tun.c b/drivers/net/tun.c index fad852b8dc90..a6b233a01f35 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -1594,7 +1594,7 @@ static bool tun_can_build_skb(struct tun_struct *tun, struct tun_file *tfile, if (zerocopy) return false; - if (SKB_DATA_ALIGN(len + TUN_RX_PAD) + + if (SKB_DATA_ALIGN(len + TUN_RX_PAD + XDP_PACKET_HEADROOM) + SKB_DATA_ALIGN(sizeof(struct skb_shared_info)) > PAGE_SIZE) return false; -- Gitee