From 7bb6d66d86d64387b826e70ace80b52e94f5cf9e Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 17 Jun 2022 20:47:04 -0700 Subject: [PATCH 1/3] raw: use more conventional iterators ANBZ: #1930 commit ba44f8182ec299c5d1c8a72fc0fde4ec127b5a6d upstream. In order to prepare the following patch, I change raw v4 & v6 code to use more conventional iterators. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller Signed-off-by: Heng Qi --- include/net/raw.h | 5 +-- include/net/rawv6.h | 6 +-- net/ipv4/raw.c | 93 ++++++++++++++------------------------- net/ipv4/raw_diag.c | 33 +++++++------- net/ipv6/raw.c | 105 ++++++++++++++++---------------------------- 5 files changed, 92 insertions(+), 150 deletions(-) diff --git a/include/net/raw.h b/include/net/raw.h index c51a635671a7..6324965779ec 100644 --- a/include/net/raw.h +++ b/include/net/raw.h @@ -20,9 +20,8 @@ extern struct proto raw_prot; extern struct raw_hashinfo raw_v4_hashinfo; -struct sock *__raw_v4_lookup(struct net *net, struct sock *sk, - unsigned short num, __be32 raddr, - __be32 laddr, int dif, int sdif); +bool raw_v4_match(struct net *net, struct sock *sk, unsigned short num, + __be32 raddr, __be32 laddr, int dif, int sdif); int raw_abort(struct sock *sk, int err); void raw_icmp_error(struct sk_buff *, int, u32); diff --git a/include/net/rawv6.h b/include/net/rawv6.h index 53d86b6055e8..c48c1298699a 100644 --- a/include/net/rawv6.h +++ b/include/net/rawv6.h @@ -5,9 +5,9 @@ #include extern struct raw_hashinfo raw_v6_hashinfo; -struct sock *__raw_v6_lookup(struct net *net, struct sock *sk, - unsigned short num, const struct in6_addr *loc_addr, - const struct in6_addr *rmt_addr, int dif, int sdif); +bool raw_v6_match(struct net *net, struct sock *sk, unsigned short num, + const struct in6_addr *loc_addr, + const struct in6_addr *rmt_addr, int dif, int sdif); int raw_abort(struct sock *sk, int err); diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index 4899ebe569eb..8d1e8fc28cf9 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -117,24 +117,19 @@ void raw_unhash_sk(struct sock *sk) } EXPORT_SYMBOL_GPL(raw_unhash_sk); -struct sock *__raw_v4_lookup(struct net *net, struct sock *sk, - unsigned short num, __be32 raddr, __be32 laddr, - int dif, int sdif) +bool raw_v4_match(struct net *net, struct sock *sk, unsigned short num, + __be32 raddr, __be32 laddr, int dif, int sdif) { - sk_for_each_from(sk) { - struct inet_sock *inet = inet_sk(sk); - - if (net_eq(sock_net(sk), net) && inet->inet_num == num && - !(inet->inet_daddr && inet->inet_daddr != raddr) && - !(inet->inet_rcv_saddr && inet->inet_rcv_saddr != laddr) && - raw_sk_bound_dev_eq(net, sk->sk_bound_dev_if, dif, sdif)) - goto found; /* gotcha */ - } - sk = NULL; -found: - return sk; + struct inet_sock *inet = inet_sk(sk); + + if (net_eq(sock_net(sk), net) && inet->inet_num == num && + !(inet->inet_daddr && inet->inet_daddr != raddr) && + !(inet->inet_rcv_saddr && inet->inet_rcv_saddr != laddr) && + raw_sk_bound_dev_eq(net, sk->sk_bound_dev_if, dif, sdif)) + return true; + return false; } -EXPORT_SYMBOL_GPL(__raw_v4_lookup); +EXPORT_SYMBOL_GPL(raw_v4_match); /* * 0 - deliver @@ -168,23 +163,21 @@ static int icmp_filter(const struct sock *sk, const struct sk_buff *skb) */ static int raw_v4_input(struct sk_buff *skb, const struct iphdr *iph, int hash) { + struct net *net = dev_net(skb->dev); int sdif = inet_sdif(skb); int dif = inet_iif(skb); - struct sock *sk; struct hlist_head *head; int delivered = 0; - struct net *net; + struct sock *sk; - read_lock(&raw_v4_hashinfo.lock); head = &raw_v4_hashinfo.ht[hash]; if (hlist_empty(head)) - goto out; - - net = dev_net(skb->dev); - sk = __raw_v4_lookup(net, __sk_head(head), iph->protocol, - iph->saddr, iph->daddr, dif, sdif); - - while (sk) { + return 0; + read_lock(&raw_v4_hashinfo.lock); + sk_for_each(sk, head) { + if (!raw_v4_match(net, sk, iph->protocol, + iph->saddr, iph->daddr, dif, sdif)) + continue; delivered = 1; if ((iph->protocol != IPPROTO_ICMP || !icmp_filter(sk, skb)) && ip_mc_sf_allow(sk, iph->daddr, iph->saddr, @@ -195,31 +188,16 @@ static int raw_v4_input(struct sk_buff *skb, const struct iphdr *iph, int hash) if (clone) raw_rcv(sk, clone); } - sk = __raw_v4_lookup(net, sk_next(sk), iph->protocol, - iph->saddr, iph->daddr, - dif, sdif); } -out: read_unlock(&raw_v4_hashinfo.lock); return delivered; } int raw_local_deliver(struct sk_buff *skb, int protocol) { - int hash; - struct sock *raw_sk; - - hash = protocol & (RAW_HTABLE_SIZE - 1); - raw_sk = sk_head(&raw_v4_hashinfo.ht[hash]); - - /* If there maybe a raw socket we must check - if not we - * don't care less - */ - if (raw_sk && !raw_v4_input(skb, ip_hdr(skb), hash)) - raw_sk = NULL; - - return raw_sk != NULL; + int hash = protocol & (RAW_HTABLE_SIZE - 1); + return raw_v4_input(skb, ip_hdr(skb), hash); } static void raw_err(struct sock *sk, struct sk_buff *skb, u32 info) @@ -286,29 +264,24 @@ static void raw_err(struct sock *sk, struct sk_buff *skb, u32 info) void raw_icmp_error(struct sk_buff *skb, int protocol, u32 info) { - int hash; - struct sock *raw_sk; + struct net *net = dev_net(skb->dev);; + int dif = skb->dev->ifindex; + int sdif = inet_sdif(skb); + struct hlist_head *head; const struct iphdr *iph; - struct net *net; + struct sock *sk; + int hash; hash = protocol & (RAW_HTABLE_SIZE - 1); + head = &raw_v4_hashinfo.ht[hash]; read_lock(&raw_v4_hashinfo.lock); - raw_sk = sk_head(&raw_v4_hashinfo.ht[hash]); - if (raw_sk) { - int dif = skb->dev->ifindex; - int sdif = inet_sdif(skb); - + sk_for_each(sk, head) { iph = (const struct iphdr *)skb->data; - net = dev_net(skb->dev); - - while ((raw_sk = __raw_v4_lookup(net, raw_sk, protocol, - iph->daddr, iph->saddr, - dif, sdif)) != NULL) { - raw_err(raw_sk, skb, info); - raw_sk = sk_next(raw_sk); - iph = (const struct iphdr *)skb->data; - } + if (!raw_v4_match(net, sk, iph->protocol, + iph->saddr, iph->daddr, dif, sdif)) + continue; + raw_err(sk, skb, info); } read_unlock(&raw_v4_hashinfo.lock); } diff --git a/net/ipv4/raw_diag.c b/net/ipv4/raw_diag.c index 1b5b8af27aaf..5d811cd02f16 100644 --- a/net/ipv4/raw_diag.c +++ b/net/ipv4/raw_diag.c @@ -34,31 +34,30 @@ raw_get_hashinfo(const struct inet_diag_req_v2 *r) * use helper to figure it out. */ -static struct sock *raw_lookup(struct net *net, struct sock *from, - const struct inet_diag_req_v2 *req) +static bool raw_lookup(struct net *net, struct sock *sk, + const struct inet_diag_req_v2 *req) { struct inet_diag_req_raw *r = (void *)req; - struct sock *sk = NULL; if (r->sdiag_family == AF_INET) - sk = __raw_v4_lookup(net, from, r->sdiag_raw_protocol, - r->id.idiag_dst[0], - r->id.idiag_src[0], - r->id.idiag_if, 0); + return raw_v4_match(net, sk, r->sdiag_raw_protocol, + r->id.idiag_dst[0], + r->id.idiag_src[0], + r->id.idiag_if, 0); #if IS_ENABLED(CONFIG_IPV6) else - sk = __raw_v6_lookup(net, from, r->sdiag_raw_protocol, - (const struct in6_addr *)r->id.idiag_src, - (const struct in6_addr *)r->id.idiag_dst, - r->id.idiag_if, 0); + return raw_v6_match(net, sk, r->sdiag_raw_protocol, + (const struct in6_addr *)r->id.idiag_src, + (const struct in6_addr *)r->id.idiag_dst, + r->id.idiag_if, 0); #endif - return sk; + return false; } static struct sock *raw_sock_get(struct net *net, const struct inet_diag_req_v2 *r) { struct raw_hashinfo *hashinfo = raw_get_hashinfo(r); - struct sock *sk = NULL, *s; + struct sock *sk; int slot; if (IS_ERR(hashinfo)) @@ -66,9 +65,8 @@ static struct sock *raw_sock_get(struct net *net, const struct inet_diag_req_v2 read_lock(&hashinfo->lock); for (slot = 0; slot < RAW_HTABLE_SIZE; slot++) { - sk_for_each(s, &hashinfo->ht[slot]) { - sk = raw_lookup(net, s, r); - if (sk) { + sk_for_each(sk, &hashinfo->ht[slot]) { + if (raw_lookup(net, sk, r)) { /* * Grab it and keep until we fill * diag meaage to be reported, so @@ -81,10 +79,11 @@ static struct sock *raw_sock_get(struct net *net, const struct inet_diag_req_v2 } } } + sk = ERR_PTR(-ENOENT); out_unlock: read_unlock(&hashinfo->lock); - return sk ? sk : ERR_PTR(-ENOENT); + return sk; } static int raw_diag_dump_one(struct netlink_callback *cb, diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index 110254f44a46..88f89bf1d985 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -66,41 +66,27 @@ struct raw_hashinfo raw_v6_hashinfo = { }; EXPORT_SYMBOL_GPL(raw_v6_hashinfo); -struct sock *__raw_v6_lookup(struct net *net, struct sock *sk, - unsigned short num, const struct in6_addr *loc_addr, - const struct in6_addr *rmt_addr, int dif, int sdif) +bool raw_v6_match(struct net *net, struct sock *sk, unsigned short num, + const struct in6_addr *loc_addr, + const struct in6_addr *rmt_addr, int dif, int sdif) { - bool is_multicast = ipv6_addr_is_multicast(loc_addr); - - sk_for_each_from(sk) - if (inet_sk(sk)->inet_num == num) { - - if (!net_eq(sock_net(sk), net)) - continue; - - if (!ipv6_addr_any(&sk->sk_v6_daddr) && - !ipv6_addr_equal(&sk->sk_v6_daddr, rmt_addr)) - continue; - - if (!raw_sk_bound_dev_eq(net, sk->sk_bound_dev_if, - dif, sdif)) - continue; - - if (!ipv6_addr_any(&sk->sk_v6_rcv_saddr)) { - if (ipv6_addr_equal(&sk->sk_v6_rcv_saddr, loc_addr)) - goto found; - if (is_multicast && - inet6_mc_check(sk, loc_addr, rmt_addr)) - goto found; - continue; - } - goto found; - } - sk = NULL; -found: - return sk; + if (inet_sk(sk)->inet_num != num || + !net_eq(sock_net(sk), net) || + (!ipv6_addr_any(&sk->sk_v6_daddr) && + !ipv6_addr_equal(&sk->sk_v6_daddr, rmt_addr)) || + !raw_sk_bound_dev_eq(net, sk->sk_bound_dev_if, + dif, sdif)) + return false; + + if (ipv6_addr_any(&sk->sk_v6_rcv_saddr) || + ipv6_addr_equal(&sk->sk_v6_rcv_saddr, loc_addr) || + (ipv6_addr_is_multicast(loc_addr) && + inet6_mc_check(sk, loc_addr, rmt_addr))) + return true; + + return false; } -EXPORT_SYMBOL_GPL(__raw_v6_lookup); +EXPORT_SYMBOL_GPL(raw_v6_match); /* * 0 - deliver @@ -156,31 +142,28 @@ EXPORT_SYMBOL(rawv6_mh_filter_unregister); */ static bool ipv6_raw_deliver(struct sk_buff *skb, int nexthdr) { + struct net *net = dev_net(skb->dev); const struct in6_addr *saddr; const struct in6_addr *daddr; + struct hlist_head *head; struct sock *sk; bool delivered = false; __u8 hash; - struct net *net; saddr = &ipv6_hdr(skb)->saddr; daddr = saddr + 1; hash = nexthdr & (RAW_HTABLE_SIZE - 1); - + head = &raw_v6_hashinfo.ht[hash]; + if (hlist_empty(head)) + return false; read_lock(&raw_v6_hashinfo.lock); - sk = sk_head(&raw_v6_hashinfo.ht[hash]); - - if (!sk) - goto out; - - net = dev_net(skb->dev); - sk = __raw_v6_lookup(net, sk, nexthdr, daddr, saddr, - inet6_iif(skb), inet6_sdif(skb)); - - while (sk) { + sk_for_each(sk, head) { int filtered; + if (!raw_v6_match(net, sk, nexthdr, daddr, saddr, + inet6_iif(skb), inet6_sdif(skb))) + continue; delivered = true; switch (nexthdr) { case IPPROTO_ICMPV6: @@ -219,23 +202,14 @@ static bool ipv6_raw_deliver(struct sk_buff *skb, int nexthdr) rawv6_rcv(sk, clone); } } - sk = __raw_v6_lookup(net, sk_next(sk), nexthdr, daddr, saddr, - inet6_iif(skb), inet6_sdif(skb)); } -out: read_unlock(&raw_v6_hashinfo.lock); return delivered; } bool raw6_local_deliver(struct sk_buff *skb, int nexthdr) { - struct sock *raw_sk; - - raw_sk = sk_head(&raw_v6_hashinfo.ht[nexthdr & (RAW_HTABLE_SIZE - 1)]); - if (raw_sk && !ipv6_raw_deliver(skb, nexthdr)) - raw_sk = NULL; - - return raw_sk != NULL; + return ipv6_raw_deliver(skb, nexthdr); } /* This cleans up af_inet6 a bit. -DaveM */ @@ -361,28 +335,25 @@ static void rawv6_err(struct sock *sk, struct sk_buff *skb, void raw6_icmp_error(struct sk_buff *skb, int nexthdr, u8 type, u8 code, int inner_offset, __be32 info) { + const struct in6_addr *saddr, *daddr; + struct net *net = dev_net(skb->dev); + struct hlist_head *head; struct sock *sk; int hash; - const struct in6_addr *saddr, *daddr; - struct net *net; hash = nexthdr & (RAW_HTABLE_SIZE - 1); - + head = &raw_v6_hashinfo.ht[hash]; read_lock(&raw_v6_hashinfo.lock); - sk = sk_head(&raw_v6_hashinfo.ht[hash]); - if (sk) { + sk_for_each(sk, head) { /* Note: ipv6_hdr(skb) != skb->data */ const struct ipv6hdr *ip6h = (const struct ipv6hdr *)skb->data; saddr = &ip6h->saddr; daddr = &ip6h->daddr; - net = dev_net(skb->dev); - while ((sk = __raw_v6_lookup(net, sk, nexthdr, saddr, daddr, - inet6_iif(skb), inet6_iif(skb)))) { - rawv6_err(sk, skb, NULL, type, code, - inner_offset, info); - sk = sk_next(sk); - } + if (!raw_v6_match(net, sk, nexthdr, &ip6h->saddr, &ip6h->daddr, + inet6_iif(skb), inet6_iif(skb))) + continue; + rawv6_err(sk, skb, NULL, type, code, inner_offset, info); } read_unlock(&raw_v6_hashinfo.lock); } -- Gitee From db1a1f27875ca68c68a65e5ac2215185ae26108f Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 17 Jun 2022 20:47:05 -0700 Subject: [PATCH 2/3] raw: convert raw sockets to RCU ANBZ: #1930 commit 0daf07e527095e64ee8927ce297ab626643e9f51 upstream. Using rwlock in networking code is extremely risky. writers can starve if enough readers are constantly grabing the rwlock. I thought rwlock were at fault and sent this patch: https://lkml.org/lkml/2022/6/17/272 But Peter and Linus essentially told me rwlock had to be unfair. We need to get rid of rwlock in networking code. Without this fix, following script triggers soft lockups: for i in {1..48} do ping -f -n -q 127.0.0.1 & sleep 0.1 done Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller [Fixes conflicts] Signed-off-by: Heng Qi --- include/net/raw.h | 11 +++++- include/net/rawv6.h | 1 + net/ipv4/af_inet.c | 2 ++ net/ipv4/raw.c | 83 +++++++++++++++++++++------------------------ net/ipv4/raw_diag.c | 22 +++++++----- net/ipv6/af_inet6.c | 3 ++ net/ipv6/raw.c | 28 +++++++-------- 7 files changed, 80 insertions(+), 70 deletions(-) diff --git a/include/net/raw.h b/include/net/raw.h index 6324965779ec..537d9d1df890 100644 --- a/include/net/raw.h +++ b/include/net/raw.h @@ -33,9 +33,18 @@ int raw_rcv(struct sock *, struct sk_buff *); struct raw_hashinfo { rwlock_t lock; - struct hlist_head ht[RAW_HTABLE_SIZE]; + struct hlist_nulls_head ht[RAW_HTABLE_SIZE]; }; +static inline void raw_hashinfo_init(struct raw_hashinfo *hashinfo) +{ + int i; + + rwlock_init(&hashinfo->lock); + for (i = 0; i < RAW_HTABLE_SIZE; i++) + INIT_HLIST_NULLS_HEAD(&hashinfo->ht[i], i); +} + #ifdef CONFIG_PROC_FS int raw_proc_init(void); void raw_proc_exit(void); diff --git a/include/net/rawv6.h b/include/net/rawv6.h index c48c1298699a..bc70909625f6 100644 --- a/include/net/rawv6.h +++ b/include/net/rawv6.h @@ -3,6 +3,7 @@ #define _NET_RAWV6_H #include +#include extern struct raw_hashinfo raw_v6_hashinfo; bool raw_v6_match(struct net *net, struct sock *sk, unsigned short num, diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 27239daf8d4f..b4e5995d4bfd 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -1958,6 +1958,8 @@ static int __init inet_init(void) sock_skb_cb_check_size(sizeof(struct inet_skb_parm)); + raw_hashinfo_init(&raw_v4_hashinfo); + rc = proto_register(&tcp_prot, 1); if (rc) goto out; diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index 8d1e8fc28cf9..d5740cf30d2c 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -85,20 +85,19 @@ struct raw_frag_vec { int hlen; }; -struct raw_hashinfo raw_v4_hashinfo = { - .lock = __RW_LOCK_UNLOCKED(raw_v4_hashinfo.lock), -}; +struct raw_hashinfo raw_v4_hashinfo; EXPORT_SYMBOL_GPL(raw_v4_hashinfo); int raw_hash_sk(struct sock *sk) { struct raw_hashinfo *h = sk->sk_prot->h.raw_hash; - struct hlist_head *head; + struct hlist_nulls_head *hlist; - head = &h->ht[inet_sk(sk)->inet_num & (RAW_HTABLE_SIZE - 1)]; + hlist = &h->ht[inet_sk(sk)->inet_num & (RAW_HTABLE_SIZE - 1)]; write_lock_bh(&h->lock); - sk_add_node(sk, head); + hlist_nulls_add_head_rcu(&sk->sk_nulls_node, hlist); + sock_set_flag(sk, SOCK_RCU_FREE); sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); write_unlock_bh(&h->lock); @@ -111,7 +110,7 @@ void raw_unhash_sk(struct sock *sk) struct raw_hashinfo *h = sk->sk_prot->h.raw_hash; write_lock_bh(&h->lock); - if (sk_del_node_init(sk)) + if (__sk_nulls_del_node_init_rcu(sk)) sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); write_unlock_bh(&h->lock); } @@ -164,17 +163,16 @@ static int icmp_filter(const struct sock *sk, const struct sk_buff *skb) static int raw_v4_input(struct sk_buff *skb, const struct iphdr *iph, int hash) { struct net *net = dev_net(skb->dev); + struct hlist_nulls_head *hlist; + struct hlist_nulls_node *hnode; int sdif = inet_sdif(skb); int dif = inet_iif(skb); - struct hlist_head *head; int delivered = 0; struct sock *sk; - head = &raw_v4_hashinfo.ht[hash]; - if (hlist_empty(head)) - return 0; - read_lock(&raw_v4_hashinfo.lock); - sk_for_each(sk, head) { + hlist = &raw_v4_hashinfo.ht[hash]; + rcu_read_lock(); + hlist_nulls_for_each_entry(sk, hnode, hlist, sk_nulls_node) { if (!raw_v4_match(net, sk, iph->protocol, iph->saddr, iph->daddr, dif, sdif)) continue; @@ -189,7 +187,7 @@ static int raw_v4_input(struct sk_buff *skb, const struct iphdr *iph, int hash) raw_rcv(sk, clone); } } - read_unlock(&raw_v4_hashinfo.lock); + rcu_read_unlock(); return delivered; } @@ -265,25 +263,26 @@ static void raw_err(struct sock *sk, struct sk_buff *skb, u32 info) void raw_icmp_error(struct sk_buff *skb, int protocol, u32 info) { struct net *net = dev_net(skb->dev);; + struct hlist_nulls_head *hlist; + struct hlist_nulls_node *hnode; int dif = skb->dev->ifindex; int sdif = inet_sdif(skb); - struct hlist_head *head; const struct iphdr *iph; struct sock *sk; int hash; hash = protocol & (RAW_HTABLE_SIZE - 1); - head = &raw_v4_hashinfo.ht[hash]; + hlist = &raw_v4_hashinfo.ht[hash]; - read_lock(&raw_v4_hashinfo.lock); - sk_for_each(sk, head) { + rcu_read_lock(); + hlist_nulls_for_each_entry(sk, hnode, hlist, sk_nulls_node) { iph = (const struct iphdr *)skb->data; if (!raw_v4_match(net, sk, iph->protocol, iph->saddr, iph->daddr, dif, sdif)) continue; raw_err(sk, skb, info); } - read_unlock(&raw_v4_hashinfo.lock); + rcu_read_unlock(); } static int raw_rcv_skb(struct sock *sk, struct sk_buff *skb) @@ -943,44 +942,41 @@ struct proto raw_prot = { }; #ifdef CONFIG_PROC_FS -static struct sock *raw_get_first(struct seq_file *seq) +static struct sock *raw_get_first(struct seq_file *seq, int bucket) { - struct sock *sk; struct raw_hashinfo *h = PDE_DATA(file_inode(seq->file)); struct raw_iter_state *state = raw_seq_private(seq); + struct hlist_nulls_head *hlist; + struct hlist_nulls_node *hnode; + struct sock *sk; - for (state->bucket = 0; state->bucket < RAW_HTABLE_SIZE; + for (state->bucket = bucket; state->bucket < RAW_HTABLE_SIZE; ++state->bucket) { - sk_for_each(sk, &h->ht[state->bucket]) + hlist = &h->ht[state->bucket]; + hlist_nulls_for_each_entry(sk, hnode, hlist, sk_nulls_node) { if (sock_net(sk) == seq_file_net(seq)) - goto found; + return sk; + } } - sk = NULL; -found: - return sk; + return NULL; } static struct sock *raw_get_next(struct seq_file *seq, struct sock *sk) { - struct raw_hashinfo *h = PDE_DATA(file_inode(seq->file)); struct raw_iter_state *state = raw_seq_private(seq); do { - sk = sk_next(sk); -try_again: - ; + sk = sk_nulls_next(sk); } while (sk && sock_net(sk) != seq_file_net(seq)); - if (!sk && ++state->bucket < RAW_HTABLE_SIZE) { - sk = sk_head(&h->ht[state->bucket]); - goto try_again; - } + if (!sk) + return raw_get_first(seq, state->bucket + 1); return sk; } static struct sock *raw_get_idx(struct seq_file *seq, loff_t pos) { - struct sock *sk = raw_get_first(seq); + struct sock *sk = raw_get_first(seq, 0); if (sk) while (pos && (sk = raw_get_next(seq, sk)) != NULL) @@ -989,11 +985,9 @@ static struct sock *raw_get_idx(struct seq_file *seq, loff_t pos) } void *raw_seq_start(struct seq_file *seq, loff_t *pos) - __acquires(&h->lock) + __acquires(RCU) { - struct raw_hashinfo *h = PDE_DATA(file_inode(seq->file)); - - read_lock(&h->lock); + rcu_read_lock(); return *pos ? raw_get_idx(seq, *pos - 1) : SEQ_START_TOKEN; } EXPORT_SYMBOL_GPL(raw_seq_start); @@ -1003,7 +997,7 @@ void *raw_seq_next(struct seq_file *seq, void *v, loff_t *pos) struct sock *sk; if (v == SEQ_START_TOKEN) - sk = raw_get_first(seq); + sk = raw_get_first(seq, 0); else sk = raw_get_next(seq, v); ++*pos; @@ -1012,11 +1006,9 @@ void *raw_seq_next(struct seq_file *seq, void *v, loff_t *pos) EXPORT_SYMBOL_GPL(raw_seq_next); void raw_seq_stop(struct seq_file *seq, void *v) - __releases(&h->lock) + __releases(RCU) { - struct raw_hashinfo *h = PDE_DATA(file_inode(seq->file)); - - read_unlock(&h->lock); + rcu_read_unlock(); } EXPORT_SYMBOL_GPL(raw_seq_stop); @@ -1078,6 +1070,7 @@ static __net_initdata struct pernet_operations raw_net_ops = { int __init raw_proc_init(void) { + return register_pernet_subsys(&raw_net_ops); } diff --git a/net/ipv4/raw_diag.c b/net/ipv4/raw_diag.c index 5d811cd02f16..3ea637ce8962 100644 --- a/net/ipv4/raw_diag.c +++ b/net/ipv4/raw_diag.c @@ -57,31 +57,32 @@ static bool raw_lookup(struct net *net, struct sock *sk, static struct sock *raw_sock_get(struct net *net, const struct inet_diag_req_v2 *r) { struct raw_hashinfo *hashinfo = raw_get_hashinfo(r); + struct hlist_nulls_head *hlist; + struct hlist_nulls_node *hnode; struct sock *sk; int slot; if (IS_ERR(hashinfo)) return ERR_CAST(hashinfo); - read_lock(&hashinfo->lock); + rcu_read_lock(); for (slot = 0; slot < RAW_HTABLE_SIZE; slot++) { - sk_for_each(sk, &hashinfo->ht[slot]) { + hlist = &hashinfo->ht[slot]; + hlist_nulls_for_each_entry(sk, hnode, hlist, sk_nulls_node) { if (raw_lookup(net, sk, r)) { /* * Grab it and keep until we fill - * diag meaage to be reported, so + * diag message to be reported, so * caller should call sock_put then. - * We can do that because we're keeping - * hashinfo->lock here. */ - sock_hold(sk); - goto out_unlock; + if (refcount_inc_not_zero(&sk->sk_refcnt)) + goto out_unlock; } } } sk = ERR_PTR(-ENOENT); out_unlock: - read_unlock(&hashinfo->lock); + rcu_read_unlock(); return sk; } @@ -144,6 +145,8 @@ static void raw_diag_dump(struct sk_buff *skb, struct netlink_callback *cb, struct raw_hashinfo *hashinfo = raw_get_hashinfo(r); struct net *net = sock_net(skb->sk); struct inet_diag_dump_data *cb_data; + struct hlist_nulls_head *hlist; + struct hlist_nulls_node *hnode; int num, s_num, slot, s_slot; struct sock *sk = NULL; struct nlattr *bc; @@ -160,7 +163,8 @@ static void raw_diag_dump(struct sk_buff *skb, struct netlink_callback *cb, for (slot = s_slot; slot < RAW_HTABLE_SIZE; s_num = 0, slot++) { num = 0; - sk_for_each(sk, &hashinfo->ht[slot]) { + hlist = &hashinfo->ht[slot]; + hlist_nulls_for_each_entry(sk, hnode, hlist, sk_nulls_node) { struct inet_sock *inet = inet_sk(sk); if (!net_eq(sock_net(sk), net)) diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index 801060be8eaf..ea40e37bbb6b 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -62,6 +62,7 @@ #include #include #include +#include #include #include @@ -1053,6 +1054,8 @@ static int __init inet6_init(void) goto out; } + raw_hashinfo_init(&raw_v6_hashinfo); + err = proto_register(&tcpv6_prot, 1); if (err) goto out; diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index 88f89bf1d985..60b395d7c310 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -61,9 +61,7 @@ #define ICMPV6_HDRLEN 4 /* ICMPv6 header, RFC 4443 Section 2.1 */ -struct raw_hashinfo raw_v6_hashinfo = { - .lock = __RW_LOCK_UNLOCKED(raw_v6_hashinfo.lock), -}; +struct raw_hashinfo raw_v6_hashinfo; EXPORT_SYMBOL_GPL(raw_v6_hashinfo); bool raw_v6_match(struct net *net, struct sock *sk, unsigned short num, @@ -143,9 +141,10 @@ EXPORT_SYMBOL(rawv6_mh_filter_unregister); static bool ipv6_raw_deliver(struct sk_buff *skb, int nexthdr) { struct net *net = dev_net(skb->dev); + struct hlist_nulls_head *hlist; + struct hlist_nulls_node *hnode; const struct in6_addr *saddr; const struct in6_addr *daddr; - struct hlist_head *head; struct sock *sk; bool delivered = false; __u8 hash; @@ -154,11 +153,9 @@ static bool ipv6_raw_deliver(struct sk_buff *skb, int nexthdr) daddr = saddr + 1; hash = nexthdr & (RAW_HTABLE_SIZE - 1); - head = &raw_v6_hashinfo.ht[hash]; - if (hlist_empty(head)) - return false; - read_lock(&raw_v6_hashinfo.lock); - sk_for_each(sk, head) { + hlist = &raw_v6_hashinfo.ht[hash]; + rcu_read_lock(); + hlist_nulls_for_each_entry(sk, hnode, hlist, sk_nulls_node) { int filtered; if (!raw_v6_match(net, sk, nexthdr, daddr, saddr, @@ -203,7 +200,7 @@ static bool ipv6_raw_deliver(struct sk_buff *skb, int nexthdr) } } } - read_unlock(&raw_v6_hashinfo.lock); + rcu_read_unlock(); return delivered; } @@ -337,14 +334,15 @@ void raw6_icmp_error(struct sk_buff *skb, int nexthdr, { const struct in6_addr *saddr, *daddr; struct net *net = dev_net(skb->dev); - struct hlist_head *head; + struct hlist_nulls_head *hlist; + struct hlist_nulls_node *hnode; struct sock *sk; int hash; hash = nexthdr & (RAW_HTABLE_SIZE - 1); - head = &raw_v6_hashinfo.ht[hash]; - read_lock(&raw_v6_hashinfo.lock); - sk_for_each(sk, head) { + hlist = &raw_v6_hashinfo.ht[hash]; + rcu_read_lock(); + hlist_nulls_for_each_entry(sk, hnode, hlist, sk_nulls_node) { /* Note: ipv6_hdr(skb) != skb->data */ const struct ipv6hdr *ip6h = (const struct ipv6hdr *)skb->data; saddr = &ip6h->saddr; @@ -355,7 +353,7 @@ void raw6_icmp_error(struct sk_buff *skb, int nexthdr, continue; rawv6_err(sk, skb, NULL, type, code, inner_offset, info); } - read_unlock(&raw_v6_hashinfo.lock); + rcu_read_unlock(); } static inline int rawv6_rcv_skb(struct sock *sk, struct sk_buff *skb) -- Gitee From e950528f57745532257179330bbedd3c1420f4cc Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Fri, 16 Sep 2022 11:48:21 +0300 Subject: [PATCH 3/3] ipv6: Fix crash when IPv6 is administratively disabled ANBZ: #1930 commit 76dd07281338da6951fdab3432ced843fa87839c upstream. The global 'raw_v6_hashinfo' variable can be accessed even when IPv6 is administratively disabled via the 'ipv6.disable=1' kernel command line option, leading to a crash [1]. Fix by restoring the original behavior and always initializing the variable, regardless of IPv6 support being administratively disabled or not. [1] BUG: unable to handle page fault for address: ffffffffffffffc8 #PF: supervisor read access in kernel mode #PF: error_code(0x0000) - not-present page PGD 173e18067 P4D 173e18067 PUD 173e1a067 PMD 0 Oops: 0000 [#1] PREEMPT SMP KASAN CPU: 3 PID: 271 Comm: ss Not tainted 6.0.0-rc4-custom-00136-g0727a9a5fbc1 #1396 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.16.0-1.fc36 04/01/2014 RIP: 0010:raw_diag_dump+0x310/0x7f0 [...] Call Trace: __inet_diag_dump+0x10f/0x2e0 netlink_dump+0x575/0xfd0 __netlink_dump_start+0x67b/0x940 inet_diag_handler_cmd+0x273/0x2d0 sock_diag_rcv_msg+0x317/0x440 netlink_rcv_skb+0x15e/0x430 sock_diag_rcv+0x2b/0x40 netlink_unicast+0x53b/0x800 netlink_sendmsg+0x945/0xe60 ____sys_sendmsg+0x747/0x960 ___sys_sendmsg+0x13a/0x1e0 __sys_sendmsg+0x118/0x1e0 do_syscall_64+0x34/0x80 entry_SYSCALL_64_after_hwframe+0x63/0xcd Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Fixes: 0daf07e52709 ("raw: convert raw sockets to RCU") Reported-by: Roberto Ricci Tested-by: Roberto Ricci Signed-off-by: Ido Schimmel Reviewed-by: David Ahern Link: https://lore.kernel.org/r/20220916084821.229287-1-idosch@nvidia.com Signed-off-by: Jakub Kicinski Signed-off-by: Heng Qi --- net/ipv6/af_inet6.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index ea40e37bbb6b..2fef46f3cb8e 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -1049,13 +1049,13 @@ static int __init inet6_init(void) for (r = &inetsw6[0]; r < &inetsw6[SOCK_MAX]; ++r) INIT_LIST_HEAD(r); + raw_hashinfo_init(&raw_v6_hashinfo); + if (disable_ipv6_mod) { pr_info("Loaded, but administratively disabled, reboot required to enable\n"); goto out; } - raw_hashinfo_init(&raw_v6_hashinfo); - err = proto_register(&tcpv6_prot, 1); if (err) goto out; -- Gitee