From 4cc79a3eeb0724eeb2384a9ea1f7d71badfd59ac Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 3 Oct 2022 13:59:47 +0100 Subject: [PATCH 01/18] io_uring/af_unix: defer registered files gc to io_uring release mainline inclusion from mainline-6.1-rc1 commit 0091bfc81741b8d3aeb3b7ab8636f911b2de6e80 category: bugfix issue: I5XD8P CVE: CVE-2022-2602 Signed-off-by: gaochao --------------------------------------- Instead of putting io_uring's registered files in unix_gc() we want it to be done by io_uring itself. The trick here is to consider io_uring registered files for cycle detection but not actually putting them down. Because io_uring can't register other ring instances, this will remove all refs to the ring file triggering the ->release path and clean up with io_ring_ctx_free(). Cc: stable@vger.kernel.org Fixes: 6b06314c47e1 ("io_uring: add file set registration") Reported-and-tested-by: David Bouman Signed-off-by: Pavel Begunkov Signed-off-by: Thadeu Lima de Souza Cascardo [axboe: add kerneldoc comment to skb, fold in skb leak fix] Signed-off-by: Jens Axboe --- fs/io_uring.c | 1 + include/linux/skbuff.h | 2 ++ net/unix/garbage.c | 20 ++++++++++++++++++++ 3 files changed, 23 insertions(+) diff --git a/fs/io_uring.c b/fs/io_uring.c index e5eddd008651..e41956f2fc01 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -7286,6 +7286,7 @@ static int __io_sqe_files_scm(struct io_ring_ctx *ctx, int nr, int offset) } skb->sk = sk; + skb->scm_io_uring = 1; nr_files = 0; fpl->user = get_uid(ctx->user); diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index acbf1875ad50..a77b38e738e9 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -681,6 +681,7 @@ typedef unsigned char *sk_buff_data_t; * @csum_level: indicates the number of consecutive checksums found in * the packet minus one that have been verified as * CHECKSUM_UNNECESSARY (max 3) + * @scm_io_uring: SKB holds io_uring registered files * @dst_pending_confirm: need to confirm neighbour * @decrypted: Decrypted SKB * @napi_id: id of the NAPI struct this skb came from @@ -858,6 +859,7 @@ struct sk_buff { #ifdef CONFIG_TLS_DEVICE __u8 decrypted:1; #endif + __u8 scm_io_uring:1; #ifdef CONFIG_NET_SCHED __u16 tc_index; /* traffic control index */ diff --git a/net/unix/garbage.c b/net/unix/garbage.c index 12e2ddaf887f..bad885b2620f 100644 --- a/net/unix/garbage.c +++ b/net/unix/garbage.c @@ -201,6 +201,7 @@ void wait_for_unix_gc(void) /* The external entry point: unix_gc() */ void unix_gc(void) { + struct sk_buff *next_skb, *skb; struct unix_sock *u; struct unix_sock *next; struct sk_buff_head hitlist; @@ -291,12 +292,31 @@ void unix_gc(void) } spin_unlock(&unix_gc_lock); + + /* We need io_uring to clean its registered files, ignore all io_uring + * originated skbs. It's fine as io_uring doesn't keep references to + * other io_uring instances and so killing all other files in the cycle + * will put all io_uring references forcing it to go through normal + * release.path eventually putting registered files. + */ + skb_queue_walk_safe(&hitlist, skb, next_skb) { + if (skb->scm_io_uring) { + __skb_unlink(skb, &hitlist); + skb_queue_tail(&skb->sk->sk_receive_queue, skb); + } + } /* Here we are. Hitlist is filled. Die. */ __skb_queue_purge(&hitlist); spin_lock(&unix_gc_lock); + /* There could be io_uring registered files, just push them back to + * the inflight list + */ + list_for_each_entry_safe(u, next, &gc_candidates, link) + list_move_tail(&u->link, &gc_inflight_list); + /* All candidates should have been detached by now. 
*/ BUG_ON(!list_empty(&gc_candidates)); gc_in_progress = false; -- Gitee From 3529f302e5903879fc61d3ced45120b65662e1b2 Mon Sep 17 00:00:00 2001 From: Dongliang Mu Date: Tue, 16 Aug 2022 12:08:58 +0800 Subject: [PATCH 02/18] fs: fix UAF/GPF bug in nilfs_mdt_destroy MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit mainline inclusion from mainline-6.1-rc1 commit 2e488f13755ffbb60f307e991b27024716a33b29 category: bugfix issue: I5OHXC CVE: CVE-2022-2978 Signed-off-by: gaochao --------------------------------------- In alloc_inode, inode_init_always() could return -ENOMEM if security_inode_alloc() fails, which leaves inode->i_private uninitialized. Then nilfs_is_metadata_file_inode() returns true and nilfs_free_inode() wrongly calls nilfs_mdt_destroy(), which frees the uninitialized inode->i_private and leads to crashes (e.g., UAF/GPF). Fix this by moving security_inode_alloc() just prior to this_cpu_inc(nr_inodes). Link: https://lkml.kernel.org/r/CAFcO6XOcf1Jj2SeGt=jJV59wmhESeSKpfR0omdFRq+J9nD1vfQ@mail.gmail.com Reported-by: butt3rflyh4ck Reported-by: Hao Sun Reported-by: Jiacheng Xu Reviewed-by: Christian Brauner (Microsoft) Signed-off-by: Dongliang Mu Cc: Al Viro Cc: stable@vger.kernel.org Signed-off-by: Al Viro --- fs/inode.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/fs/inode.c b/fs/inode.c index 5eea9912a0b9..27bd9de6c7e0 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -168,8 +168,6 @@ int inode_init_always(struct super_block *sb, struct inode *inode) inode->i_wb_frn_history = 0; #endif - if (security_inode_alloc(inode)) - goto out; spin_lock_init(&inode->i_lock); lockdep_set_class(&inode->i_lock, &sb->s_type->i_lock_key); @@ -202,11 +200,12 @@ int inode_init_always(struct super_block *sb, struct inode *inode) inode->i_fsnotify_mask = 0; #endif inode->i_flctx = NULL; + + if (unlikely(security_inode_alloc(inode))) + return -ENOMEM; this_cpu_inc(nr_inodes); return 0; -out: - return -ENOMEM; } EXPORT_SYMBOL(inode_init_always); -- Gitee From 678925bfd209db7cc58a085eb52e24d8caab2179 Mon Sep 17 00:00:00 2001 From: Jialiang Wang Date: Wed, 10 Aug 2022 15:30:57 +0800 Subject: [PATCH 03/18] nfp: fix use-after-free in area_cache_get() mainline inclusion from mainline-6.0-rc1 commit 02e1a114fdb71e59ee6770294166c30d437bf86a category: bugfix issue: I5XD7Q CVE: CVE-2022-3545 Signed-off-by: gaochao --------------------------------------- area_cache_get() is used to distribute cache->area and set cache->id, and if cache->id is not 0 and cache->area->kref refcount is 0, it will release the cache->area by nfp_cpp_area_release(). area_cache_get() sets cache->id before cpp->op->area_init() and nfp_cpp_area_acquire(). But if area_init() or nfp_cpp_area_acquire() fails, the cache->id is already set but the refcount is not increased as expected. At this time, calling nfp_cpp_area_release() will cause a use-after-free. To avoid the use-after-free, set cache->id after area_init() and nfp_cpp_area_acquire() complete successfully. Note: This vulnerability is triggerable by providing an emulated device equipped with a specified configuration.
BUG: KASAN: use-after-free in nfp6000_area_init (drivers/net/ethernet/netronome/nfp/nfpcore/nfp6000_pcie.c:760) Write of size 4 at addr ffff888005b7f4a0 by task swapper/0/1 Call Trace: nfp6000_area_init (drivers/net/ethernet/netronome/nfp/nfpcore/nfp6000_pcie.c:760) area_cache_get.constprop.8 (drivers/net/ethernet/netronome/nfp/nfpcore/nfp_cppcore.c:884) Allocated by task 1: nfp_cpp_area_alloc_with_name (drivers/net/ethernet/netronome/nfp/nfpcore/nfp_cppcore.c:303) nfp_cpp_area_cache_add (drivers/net/ethernet/netronome/nfp/nfpcore/nfp_cppcore.c:802) nfp6000_init (drivers/net/ethernet/netronome/nfp/nfpcore/nfp6000_pcie.c:1230) nfp_cpp_from_operations (drivers/net/ethernet/netronome/nfp/nfpcore/nfp_cppcore.c:1215) nfp_pci_probe (drivers/net/ethernet/netronome/nfp/nfp_main.c:744) Freed by task 1: kfree (mm/slub.c:4562) area_cache_get.constprop.8 (drivers/net/ethernet/netronome/nfp/nfpcore/nfp_cppcore.c:873) nfp_cpp_read (drivers/net/ethernet/netronome/nfp/nfpcore/nfp_cppcore.c:924 drivers/net/ethernet/netronome/nfp/nfpcore/nfp_cppcore.c:973) nfp_cpp_readl (drivers/net/ethernet/netronome/nfp/nfpcore/nfp_cpplib.c:48) Signed-off-by: Jialiang Wang Reviewed-by: Yinjun Zhang Acked-by: Simon Horman Link: https://lore.kernel.org/r/20220810073057.4032-1-wangjialiang0806@163.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/netronome/nfp/nfpcore/nfp_cppcore.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/netronome/nfp/nfpcore/nfp_cppcore.c b/drivers/net/ethernet/netronome/nfp/nfpcore/nfp_cppcore.c index 94994a939277..1056f277a02a 100644 --- a/drivers/net/ethernet/netronome/nfp/nfpcore/nfp_cppcore.c +++ b/drivers/net/ethernet/netronome/nfp/nfpcore/nfp_cppcore.c @@ -872,7 +872,6 @@ area_cache_get(struct nfp_cpp *cpp, u32 id, } /* Adjust the start address to be cache size aligned */ - cache->id = id; cache->addr = addr & ~(u64)(cache->size - 1); /* Re-init to the new ID and address */ @@ -892,6 +891,8 @@ area_cache_get(struct nfp_cpp *cpp, u32 id, return NULL; } + cache->id = id; + exit: /* Adjust offset */ *offset = addr - cache->addr; -- Gitee From bd9ec43e0ba91e62099916637313edc76818f4bb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Toke=20H=C3=B8iland-J=C3=B8rgensen?= Date: Wed, 31 Aug 2022 23:52:18 +0200 Subject: [PATCH 04/18] sch_sfb: Don't assume the skb is still around after enqueueing to child MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit stable inclusion from stable-v5.10.143 commit 2ee85ac1b29dbd2ebd2d8e5ac1dd5793235d516b category: bugfix issue: I5XEAK CVE: CVE-2022-3586 Signed-off-by: gaochao --------------------------------------- [ Upstream commit 9efd23297cca530bb35e1848665805d3fcdd7889 ] The sch_sfb enqueue() routine assumes the skb is still alive after it has been enqueued into a child qdisc, using the data in the skb cb field in the increment_qlen() routine after enqueue. However, the skb may in fact have been freed, causing a use-after-free in this case. In particular, this happens if sch_cake is used as a child of sfb, and the GSO splitting mode of CAKE is enabled (in which case the skb will be split into segments and the original skb freed). Fix this by copying the sfb cb data to the stack before enqueueing the skb, and using this stack copy in increment_qlen() instead of the skb pointer itself. Reported-by: zdi-disclosures@trendmicro.com # ZDI-CAN-18231 Fixes: e13e02a3c68d ("net_sched: SFB flow scheduler") Signed-off-by: Toke Høiland-Jørgensen Signed-off-by: David S. 
Miller Signed-off-by: Sasha Levin --- net/sched/sch_sfb.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/net/sched/sch_sfb.c b/net/sched/sch_sfb.c index da047a37a3bf..f180cf95cfc9 100644 --- a/net/sched/sch_sfb.c +++ b/net/sched/sch_sfb.c @@ -135,15 +135,15 @@ static void increment_one_qlen(u32 sfbhash, u32 slot, struct sfb_sched_data *q) } } -static void increment_qlen(const struct sk_buff *skb, struct sfb_sched_data *q) +static void increment_qlen(const struct sfb_skb_cb *cb, struct sfb_sched_data *q) { u32 sfbhash; - sfbhash = sfb_hash(skb, 0); + sfbhash = cb->hashes[0]; if (sfbhash) increment_one_qlen(sfbhash, 0, q); - sfbhash = sfb_hash(skb, 1); + sfbhash = cb->hashes[1]; if (sfbhash) increment_one_qlen(sfbhash, 1, q); } @@ -283,6 +283,7 @@ static int sfb_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sfb_sched_data *q = qdisc_priv(sch); struct Qdisc *child = q->qdisc; struct tcf_proto *fl; + struct sfb_skb_cb cb; int i; u32 p_min = ~0; u32 minqlen = ~0; @@ -399,11 +400,12 @@ static int sfb_enqueue(struct sk_buff *skb, struct Qdisc *sch, } enqueue: + memcpy(&cb, sfb_skb_cb(skb), sizeof(cb)); ret = qdisc_enqueue(skb, child, to_free); if (likely(ret == NET_XMIT_SUCCESS)) { qdisc_qstats_backlog_inc(sch, skb); sch->q.qlen++; - increment_qlen(skb, q); + increment_qlen(&cb, q); } else if (net_xmit_drop_count(ret)) { q->stats.childdrop++; qdisc_qstats_drop(sch); -- Gitee From 6ecb8829b63e7e707b2b9be0b5b90dc89473577c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Toke=20H=C3=B8iland-J=C3=B8rgensen?= Date: Mon, 5 Sep 2022 21:21:36 +0200 Subject: [PATCH 05/18] sch_sfb: Also store skb len before calling child enqueue MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit stable inclusion from stable-v5.10.143 commit 2ead78fbe6b523e6232ad286e3c13d2a410de22a category: bugfix issue: I5XEAK CVE: CVE-2022-3586 Signed-off-by: gaochao --------------------------------------- [ Upstream commit 2f09707d0c972120bf794cfe0f0c67e2c2ddb252 ] Cong Wang noticed that the previous fix for sch_sfb accessing the queued skb after enqueueing it to a child qdisc was incomplete: the SFB enqueue function was also calling qdisc_qstats_backlog_inc() after enqueue, which reads the pkt len from the skb cb field. Fix this by also storing the skb len, and using the stored value to increment the backlog after enqueueing. 
Fixes: 9efd23297cca ("sch_sfb: Don't assume the skb is still around after enqueueing to child") Signed-off-by: Toke Høiland-Jørgensen Acked-by: Cong Wang Link: https://lore.kernel.org/r/20220905192137.965549-1-toke@toke.dk Signed-off-by: Paolo Abeni Signed-off-by: Sasha Levin --- net/sched/sch_sfb.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/sched/sch_sfb.c b/net/sched/sch_sfb.c index f180cf95cfc9..b2724057629f 100644 --- a/net/sched/sch_sfb.c +++ b/net/sched/sch_sfb.c @@ -281,6 +281,7 @@ static int sfb_enqueue(struct sk_buff *skb, struct Qdisc *sch, { struct sfb_sched_data *q = qdisc_priv(sch); + unsigned int len = qdisc_pkt_len(skb); struct Qdisc *child = q->qdisc; struct tcf_proto *fl; struct sfb_skb_cb cb; @@ -403,7 +404,7 @@ static int sfb_enqueue(struct sk_buff *skb, struct Qdisc *sch, memcpy(&cb, sfb_skb_cb(skb), sizeof(cb)); ret = qdisc_enqueue(skb, child, to_free); if (likely(ret == NET_XMIT_SUCCESS)) { - qdisc_qstats_backlog_inc(sch, skb); + sch->qstats.backlog += len; sch->q.qlen++; increment_qlen(&cb, q); } else if (net_xmit_drop_count(ret)) { -- Gitee From 3d27bf9fd7d8ae53eaacc544c898ff115c011c4e Mon Sep 17 00:00:00 2001 From: Ryusuke Konishi Date: Sun, 2 Oct 2022 12:08:04 +0900 Subject: [PATCH 06/18] nilfs2: fix NULL pointer dereference at nilfs_bmap_lookup_at_level() mainline inclusion from mainline-6.1-rc1 commit 21a87d88c2253350e115029f14fe2a10a7e6c856 category: bugfix issue: I5YPM3 CVE: CVE-2022-3621 Signed-off-by: gaochao --------------------------------------- If the i_mode field in the inode of metadata files is corrupted on disk, it can cause the initialization of the bmap structure, which should have been called from nilfs_read_inode_common(), not to be called. This causes a lockdep warning followed by a NULL pointer dereference at nilfs_bmap_lookup_at_level(). This patch fixes these issues by adding a missing sanity check for the i_mode field of metadata file's inode. Link: https://lkml.kernel.org/r/20221002030804.29978-1-konishi.ryusuke@gmail.com Signed-off-by: Ryusuke Konishi Reported-by: syzbot+2b32eb36c1a825b7a74c@syzkaller.appspotmail.com Reported-by: Tetsuo Handa Tested-by: Ryusuke Konishi Cc: Signed-off-by: Andrew Morton --- fs/nilfs2/inode.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c index 7068117e36b6..50849307eae6 100644 --- a/fs/nilfs2/inode.c +++ b/fs/nilfs2/inode.c @@ -454,6 +454,8 @@ int nilfs_read_inode_common(struct inode *inode, inode->i_atime.tv_nsec = le32_to_cpu(raw_inode->i_mtime_nsec); inode->i_ctime.tv_nsec = le32_to_cpu(raw_inode->i_ctime_nsec); inode->i_mtime.tv_nsec = le32_to_cpu(raw_inode->i_mtime_nsec); + if (nilfs_is_metadata_file_inode(inode) && !S_ISREG(inode->i_mode)) + return -EIO; /* this inode is for metadata and corrupted */ if (inode->i_nlink == 0) return -ESTALE; /* this inode is deleted */ -- Gitee From e9b5e148f3201191e420e9b7e306de7fc4528425 Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Thu, 1 Sep 2022 18:41:31 +0800 Subject: [PATCH 07/18] mm/hugetlb: fix races when looking up a CONT-PTE/PMD size hugetlb page mainline inclusion from mainline-6.1-rc1 commit fac35ba763ed07ba93154c95ffc0c4a55023707f category: bugfix issue: I5XLH5 CVE: CVE-2022-3623 Signed-off-by: gaochao --------------------------------------- On some architectures (like ARM64), it can support CONT-PTE/PMD size hugetlb, which means it can support not only PMD/PUD size hugetlb (2M and 1G), but also CONT-PTE/PMD size (64K and 32M) if a 4K page size is specified.
So when looking up a CONT-PTE size hugetlb page by follow_page(), it will use pte_offset_map_lock() to get the pte entry lock for the CONT-PTE size hugetlb in follow_page_pte(). However this pte entry lock is incorrect for the CONT-PTE size hugetlb, since we should use huge_pte_lock() to get the correct lock, which is mm->page_table_lock. That means the pte entry of the CONT-PTE size hugetlb under the current pte lock is unstable in follow_page_pte(); we can continue to migrate or poison the pte entry of the CONT-PTE size hugetlb, which can cause some potential race issues, even though they are under the 'pte lock'. For example, suppose thread A is trying to look up a CONT-PTE size hugetlb page by move_pages() syscall under the lock, however another thread B can migrate the CONT-PTE hugetlb page at the same time, which will cause thread A to get an incorrect page; if thread A also wants to do page migration, then a data inconsistency error occurs. Moreover we have the same issue for CONT-PMD size hugetlb in follow_huge_pmd(). To fix the above issues, rename the follow_huge_pmd() as follow_huge_pmd_pte() to handle PMD and PTE level size hugetlb, which uses huge_pte_lock() to get the correct pte entry lock to make the pte entry stable. Mike said: Support for CONT_PMD/_PTE was added with bb9dd3df8ee9 ("arm64: hugetlb: refactor find_num_contig()"). Patch series "Support for contiguous pte hugepages", v4. However, I do not believe these code paths were executed until migration support was added with 5480280d3f2d ("arm64/mm: enable HugeTLB migration for contiguous bit HugeTLB pages"). I would go with 5480280d3f2d for the Fixes: target. Link: https://lkml.kernel.org/r/635f43bdd85ac2615a58405da82b4d33c6e5eb05.1662017562.git.baolin.wang@linux.alibaba.com Fixes: 5480280d3f2d ("arm64/mm: enable HugeTLB migration for contiguous bit HugeTLB pages") Signed-off-by: Baolin Wang Suggested-by: Mike Kravetz Reviewed-by: Mike Kravetz Cc: David Hildenbrand Cc: Muchun Song Cc: Signed-off-by: Andrew Morton --- include/linux/hugetlb.h | 8 ++++---- mm/gup.c | 14 +++++++++++++- mm/hugetlb.c | 27 +++++++++++++-------------- 3 files changed, 30 insertions(+), 19 deletions(-) diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index b9fbb6d4150e..955b19dc28a8 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -174,8 +174,8 @@ struct page *follow_huge_addr(struct mm_struct *mm, unsigned long address, struct page *follow_huge_pd(struct vm_area_struct *vma, unsigned long address, hugepd_t hpd, int flags, int pdshift); -struct page *follow_huge_pmd(struct mm_struct *mm, unsigned long address, - pmd_t *pmd, int flags); +struct page *follow_huge_pmd_pte(struct vm_area_struct *vma, unsigned long address, + int flags); struct page *follow_huge_pud(struct mm_struct *mm, unsigned long address, pud_t *pud, int flags); struct page *follow_huge_pgd(struct mm_struct *mm, unsigned long address, @@ -261,8 +261,8 @@ static inline struct page *follow_huge_pd(struct vm_area_struct *vma, return NULL; } -static inline struct page *follow_huge_pmd(struct mm_struct *mm, - unsigned long address, pmd_t *pmd, int flags) +static inline struct page *follow_huge_pmd_pte(struct vm_area_struct *vma, + unsigned long address, int flags) { return NULL; } diff --git a/mm/gup.c b/mm/gup.c index 6cb7d8ae56f6..fcd3f8ad81e6 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -405,6 +405,18 @@ static struct page *follow_page_pte(struct vm_area_struct *vma, if (WARN_ON_ONCE((flags & (FOLL_PIN | FOLL_GET)) == (FOLL_PIN | FOLL_GET))) return
ERR_PTR(-EINVAL); + + /* + * Considering PTE level hugetlb, like continuous-PTE hugetlb on + * ARM64 architecture. + */ + if (is_vm_hugetlb_page(vma)) { + page = follow_huge_pmd_pte(vma, address, flags); + if (page) + return page; + return no_page_table(vma, flags); + } + retry: if (unlikely(pmd_bad(*pmd))) return no_page_table(vma, flags); @@ -560,7 +572,7 @@ static struct page *follow_pmd_mask(struct vm_area_struct *vma, if (pmd_none(pmdval)) return no_page_table(vma, flags); if (pmd_huge(pmdval) && is_vm_hugetlb_page(vma)) { - page = follow_huge_pmd(mm, address, pmd, flags); + page = follow_huge_pmd_pte(vma, address, flags); if (page) return page; return no_page_table(vma, flags); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 4ecf9a0622df..5d6e590580df 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -5596,12 +5596,13 @@ follow_huge_pd(struct vm_area_struct *vma, } struct page * __weak -follow_huge_pmd(struct mm_struct *mm, unsigned long address, - pmd_t *pmd, int flags) +follow_huge_pmd_pte(struct vm_area_struct *vma, unsigned long address, int flags) { + struct hstate *h = hstate_vma(vma); + struct mm_struct *mm = vma->vm_mm; struct page *page = NULL; spinlock_t *ptl; - pte_t pte; + pte_t *ptep, pte; /* FOLL_GET and FOLL_PIN are mutually exclusive. */ if (WARN_ON_ONCE((flags & (FOLL_PIN | FOLL_GET)) == @@ -5609,17 +5610,15 @@ follow_huge_pmd(struct mm_struct *mm, unsigned long address, return NULL; retry: - ptl = pmd_lockptr(mm, pmd); - spin_lock(ptl); - /* - * make sure that the address range covered by this pmd is not - * unmapped from other threads. - */ - if (!pmd_huge(*pmd)) - goto out; - pte = huge_ptep_get((pte_t *)pmd); + ptep = huge_pte_offset(mm, address, huge_page_size(h)); + if (!ptep) + return NULL; + + ptl = huge_pte_lock(h, mm, ptep); + pte = huge_ptep_get(ptep); if (pte_present(pte)) { - page = pmd_page(*pmd) + ((address & ~PMD_MASK) >> PAGE_SHIFT); + page = pte_page(pte) + + ((address & ~huge_page_mask(h)) >> PAGE_SHIFT); /* * try_grab_page() should always succeed here, because: a) we * hold the pmd (ptl) lock, and b) we've just checked that the @@ -5635,7 +5634,7 @@ follow_huge_pmd(struct mm_struct *mm, unsigned long address, } else { if (is_hugetlb_entry_migration(pte)) { spin_unlock(ptl); - __migration_entry_wait(mm, (pte_t *)pmd, ptl); + __migration_entry_wait(mm, ptep, ptl); goto retry; } /* -- Gitee From e8d555d6aa000aee3d127dbfa9e62cfe9db6d7c9 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Tue, 9 Aug 2022 14:35:06 +0300 Subject: [PATCH 08/18] devlink: Fix use-after-free after a failed reload stable inclusion from stable-5.10.138 commit 0e28678a770df7989108327cfe86f835d8760c33 category: bugfix issue: I5XV72 CVE: CVE-2022-3625 Signed-off-by: gaochao --------------------------------------- commit 6b4db2e528f650c7fb712961aac36455468d5902 upstream. After a failed devlink reload, devlink parameters are still registered, which means user space can set and get their values. In the case of the mlxsw "acl_region_rehash_interval" parameter, these operations will trigger a use-after-free [1]. Fix this by rejecting set and get operations while in the failed state. Return the "-EOPNOTSUPP" error code which does not abort the parameters dump, but instead causes it to skip over the problematic parameter. Another possible fix is to perform these checks in the mlxsw parameter callbacks, but other drivers might be affected by the same problem and I am not aware of scenarios where these stricter checks will cause a regression. 
[1] mlxsw_spectrum3 0000:00:10.0: Port 125: Failed to register netdev mlxsw_spectrum3 0000:00:10.0: Failed to create ports ================================================================== BUG: KASAN: use-after-free in mlxsw_sp_acl_tcam_vregion_rehash_intrvl_get+0xbd/0xd0 drivers/net/ethernet/mellanox/mlxsw/spectrum_acl_tcam.c:904 Read of size 4 at addr ffff8880099dcfd8 by task kworker/u4:4/777 CPU: 1 PID: 777 Comm: kworker/u4:4 Not tainted 5.19.0-rc7-custom-126601-gfe26f28c586d #1 Hardware name: QEMU MSN4700, BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014 Workqueue: netns cleanup_net Call Trace: __dump_stack lib/dump_stack.c:88 [inline] dump_stack_lvl+0x92/0xbd lib/dump_stack.c:106 print_address_description mm/kasan/report.c:313 [inline] print_report.cold+0x5e/0x5cf mm/kasan/report.c:429 kasan_report+0xb9/0xf0 mm/kasan/report.c:491 __asan_report_load4_noabort+0x14/0x20 mm/kasan/report_generic.c:306 mlxsw_sp_acl_tcam_vregion_rehash_intrvl_get+0xbd/0xd0 drivers/net/ethernet/mellanox/mlxsw/spectrum_acl_tcam.c:904 mlxsw_sp_acl_region_rehash_intrvl_get+0x49/0x60 drivers/net/ethernet/mellanox/mlxsw/spectrum_acl.c:1106 mlxsw_sp_params_acl_region_rehash_intrvl_get+0x33/0x80 drivers/net/ethernet/mellanox/mlxsw/spectrum.c:3854 devlink_param_get net/core/devlink.c:4981 [inline] devlink_nl_param_fill+0x238/0x12d0 net/core/devlink.c:5089 devlink_param_notify+0xe5/0x230 net/core/devlink.c:5168 devlink_ns_change_notify net/core/devlink.c:4417 [inline] devlink_ns_change_notify net/core/devlink.c:4396 [inline] devlink_reload+0x15f/0x700 net/core/devlink.c:4507 devlink_pernet_pre_exit+0x112/0x1d0 net/core/devlink.c:12272 ops_pre_exit_list net/core/net_namespace.c:152 [inline] cleanup_net+0x494/0xc00 net/core/net_namespace.c:582 process_one_work+0x9fc/0x1710 kernel/workqueue.c:2289 worker_thread+0x675/0x10b0 kernel/workqueue.c:2436 kthread+0x30c/0x3d0 kernel/kthread.c:376 ret_from_fork+0x1f/0x30 arch/x86/entry/entry_64.S:306 The buggy address belongs to the physical page: page:ffffea0000267700 refcount:0 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0x99dc flags: 0x100000000000000(node=0|zone=1) raw: 0100000000000000 0000000000000000 dead000000000122 0000000000000000 raw: 0000000000000000 0000000000000000 00000000ffffffff 0000000000000000 page dumped because: kasan: bad access detected Memory state around the buggy address: ffff8880099dce80: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ffff8880099dcf00: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff >ffff8880099dcf80: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ^ ffff8880099dd000: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ffff8880099dd080: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ================================================================== Fixes: 98bbf70c1c41 ("mlxsw: spectrum: add "acl_region_rehash_interval" devlink param") Signed-off-by: Ido Schimmel Reviewed-by: Jiri Pirko Signed-off-by: David S. 
Miller Signed-off-by: Greg Kroah-Hartman --- net/core/devlink.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/core/devlink.c b/net/core/devlink.c index 96cf4bc1f958..42c1bf6106e0 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -3620,7 +3620,7 @@ static int devlink_param_get(struct devlink *devlink, const struct devlink_param *param, struct devlink_param_gset_ctx *ctx) { - if (!param->get) + if (!param->get || devlink->reload_failed) return -EOPNOTSUPP; return param->get(devlink, param->id, ctx); } @@ -3629,7 +3629,7 @@ static int devlink_param_set(struct devlink *devlink, const struct devlink_param *param, struct devlink_param_gset_ctx *ctx) { - if (!param->set) + if (!param->set || devlink->reload_failed) return -EOPNOTSUPP; return param->set(devlink, param->id, ctx); } -- Gitee From c34ab10d09a2f3cb48fa0752407c95e7420add9b Mon Sep 17 00:00:00 2001 From: Dokyung Song Date: Thu, 3 Nov 2022 18:49:18 +0800 Subject: [PATCH 09/18] wifi: Fix potential buffer overflow in 'brcmf_fweh_event_worker' maillist inclusion category: bugfix ISSUE:I60AC2 CVE: CVE-2022-3628 Reference: https://patchwork.kernel.org/project/linux-wireless/patch/20221021061359.GA550858@laguna/ Signed-off-by: gaochao -------------------------------- This patch fixes an intra-object buffer overflow in brcmfmac that occurs when the device provides a 'bsscfgidx' equal to or greater than the buffer size. The patch adds a check that leads to a safe failure if that is the case. This fixes CVE-2022-3628. UBSAN: array-index-out-of-bounds in drivers/net/wireless/broadcom/brcm80211/brcmfmac/fweh.c index 52 is out of range for type 'brcmf_if *[16]' CPU: 0 PID: 1898 Comm: kworker/0:2 Tainted: G O 5.14.0+ #132 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.12.1-0-ga5cab58e9a3f-prebuilt.qemu.org 04/01/2014 Workqueue: events brcmf_fweh_event_worker Call Trace: dump_stack_lvl+0x57/0x7d ubsan_epilogue+0x5/0x40 __ubsan_handle_out_of_bounds+0x69/0x80 ? memcpy+0x39/0x60 brcmf_fweh_event_worker+0xae1/0xc00 ? brcmf_fweh_call_event_handler.isra.0+0x100/0x100 ? rcu_read_lock_sched_held+0xa1/0xd0 ? rcu_read_lock_bh_held+0xb0/0xb0 ? lockdep_hardirqs_on_prepare+0x273/0x3e0 process_one_work+0x873/0x13e0 ? lock_release+0x640/0x640 ? pwq_dec_nr_in_flight+0x320/0x320 ? rwlock_bug.part.0+0x90/0x90 worker_thread+0x8b/0xd10 ? __kthread_parkme+0xd9/0x1d0 ? process_one_work+0x13e0/0x13e0 kthread+0x379/0x450 ? _raw_spin_unlock_irq+0x24/0x30 ? 
set_kthread_struct+0x100/0x100 ret_from_fork+0x1f/0x30 ================================================================================ general protection fault, probably for non-canonical address 0xe5601c0020023fff: 0000 [#1] SMP KASAN KASAN: maybe wild-memory-access in range [0x2b0100010011fff8-0x2b0100010011ffff] CPU: 0 PID: 1898 Comm: kworker/0:2 Tainted: G O 5.14.0+ #132 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.12.1-0-ga5cab58e9a3f-prebuilt.qemu.org 04/01/2014 Workqueue: events brcmf_fweh_event_worker RIP: 0010:brcmf_fweh_call_event_handler.isra.0+0x42/0x100 Code: 89 f5 53 48 89 fb 48 83 ec 08 e8 79 0b 38 fe 48 85 ed 74 7e e8 6f 0b 38 fe 48 89 ea 48 b8 00 00 00 00 00 fc ff df 48 c1 ea 03 <80> 3c 02 00 0f 85 8b 00 00 00 4c 8b 7d 00 44 89 e0 48 ba 00 00 00 RSP: 0018:ffffc9000259fbd8 EFLAGS: 00010207 RAX: dffffc0000000000 RBX: ffff888115d8cd50 RCX: 0000000000000000 RDX: 0560200020023fff RSI: ffffffff8304bc91 RDI: ffff888115d8cd50 RBP: 2b0100010011ffff R08: ffff888112340050 R09: ffffed1023549809 R10: ffff88811aa4c047 R11: ffffed1023549808 R12: 0000000000000045 R13: ffffc9000259fca0 R14: ffff888112340050 R15: ffff888112340000 FS: 0000000000000000(0000) GS:ffff88811aa00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 000000004053ccc0 CR3: 0000000112740000 CR4: 0000000000750ef0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 PKRU: 55555554 Call Trace: brcmf_fweh_event_worker+0x117/0xc00 ? brcmf_fweh_call_event_handler.isra.0+0x100/0x100 ? rcu_read_lock_sched_held+0xa1/0xd0 ? rcu_read_lock_bh_held+0xb0/0xb0 ? lockdep_hardirqs_on_prepare+0x273/0x3e0 process_one_work+0x873/0x13e0 ? lock_release+0x640/0x640 ? pwq_dec_nr_in_flight+0x320/0x320 ? rwlock_bug.part.0+0x90/0x90 worker_thread+0x8b/0xd10 ? __kthread_parkme+0xd9/0x1d0 ? process_one_work+0x13e0/0x13e0 kthread+0x379/0x450 ? _raw_spin_unlock_irq+0x24/0x30 ? 
set_kthread_struct+0x100/0x100 ret_from_fork+0x1f/0x30 Modules linked in: 88XXau(O) 88x2bu(O) ---[ end trace 41d302138f3ff55a ]--- RIP: 0010:brcmf_fweh_call_event_handler.isra.0+0x42/0x100 Code: 89 f5 53 48 89 fb 48 83 ec 08 e8 79 0b 38 fe 48 85 ed 74 7e e8 6f 0b 38 fe 48 89 ea 48 b8 00 00 00 00 00 fc ff df 48 c1 ea 03 <80> 3c 02 00 0f 85 8b 00 00 00 4c 8b 7d 00 44 89 e0 48 ba 00 00 00 RSP: 0018:ffffc9000259fbd8 EFLAGS: 00010207 RAX: dffffc0000000000 RBX: ffff888115d8cd50 RCX: 0000000000000000 RDX: 0560200020023fff RSI: ffffffff8304bc91 RDI: ffff888115d8cd50 RBP: 2b0100010011ffff R08: ffff888112340050 R09: ffffed1023549809 R10: ffff88811aa4c047 R11: ffffed1023549808 R12: 0000000000000045 R13: ffffc9000259fca0 R14: ffff888112340050 R15: ffff888112340000 FS: 0000000000000000(0000) GS:ffff88811aa00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 000000004053ccc0 CR3: 0000000112740000 CR4: 0000000000750ef0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 PKRU: 55555554 Kernel panic - not syncing: Fatal exception Reported-by: Dokyung Song Reported-by: Jisoo Jang Reported-by: Minsuk Kang Reviewed-by: Arend van Spriel Signed-off-by: Dokyung Song Signed-off-by: Liu Jian Reviewed-by: Yue Haibing Reviewed-by: Xiu Jianfeng Signed-off-by: Zheng Zengkai --- drivers/net/wireless/broadcom/brcm80211/brcmfmac/fweh.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/fweh.c b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/fweh.c index 430d2cca98b3..1285d3685c4f 100644 --- a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/fweh.c +++ b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/fweh.c @@ -228,6 +228,10 @@ static void brcmf_fweh_event_worker(struct work_struct *work) brcmf_fweh_event_name(event->code), event->code, event->emsg.ifidx, event->emsg.bsscfgidx, event->emsg.addr); + if (event->emsg.bsscfgidx >= BRCMF_MAX_IFS) { + bphy_err(drvr, "invalid bsscfg index: %u\n", event->emsg.bsscfgidx); + goto event_free; + } /* convert event message */ emsg_be = &event->emsg; -- Gitee From 543da0d1f0f467e52f2e775c5163cdfe1a273e6d Mon Sep 17 00:00:00 2001 From: Peilin Ye Date: Mon, 8 Aug 2022 11:04:47 -0700 Subject: [PATCH 10/18] vsock: Fix memory leak in vsock_connect() stable inclusion from stable-5.10.138 commit 38ddccbda5e8b762c8ee06670bb1f64f1be5ee50 category: bugfix issue: I5YPMI CVE: CVE-2022-3629 Signed-off-by: gaochao --------------------------------------- commit 7e97cfed9929eaabc41829c395eb0d1350fccb9d upstream. An O_NONBLOCK vsock_connect() request may try to reschedule @connect_work. Imagine the following sequence of vsock_connect() requests: 1. The 1st, non-blocking request schedules @connect_work, which will expire after 200 jiffies. Socket state is now SS_CONNECTING; 2. Later, the 2nd, blocking request gets interrupted by a signal after a few jiffies while waiting for the connection to be established. Socket state is back to SS_UNCONNECTED, but @connect_work is still pending, and will expire after 100 jiffies. 3. Now, the 3rd, non-blocking request tries to schedule @connect_work again. Since @connect_work is already scheduled, schedule_delayed_work() silently returns. 
sock_hold() is called twice, but sock_put() will only be called once in vsock_connect_timeout(), causing a memory leak reported by syzbot: BUG: memory leak unreferenced object 0xffff88810ea56a40 (size 1232): comm "syz-executor756", pid 3604, jiffies 4294947681 (age 12.350s) hex dump (first 32 bytes): 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ 28 00 07 40 00 00 00 00 00 00 00 00 00 00 00 00 (..@............ backtrace: [] sk_prot_alloc+0x3e/0x1b0 net/core/sock.c:1930 [] sk_alloc+0x32/0x2e0 net/core/sock.c:1989 [] __vsock_create.constprop.0+0x38/0x320 net/vmw_vsock/af_vsock.c:734 [] vsock_create+0xc1/0x2d0 net/vmw_vsock/af_vsock.c:2203 [] __sock_create+0x1ab/0x2b0 net/socket.c:1468 [] sock_create net/socket.c:1519 [inline] [] __sys_socket+0x6f/0x140 net/socket.c:1561 [] __do_sys_socket net/socket.c:1570 [inline] [] __se_sys_socket net/socket.c:1568 [inline] [] __x64_sys_socket+0x1a/0x20 net/socket.c:1568 [] do_syscall_x64 arch/x86/entry/common.c:50 [inline] [] do_syscall_64+0x35/0x80 arch/x86/entry/common.c:80 [] entry_SYSCALL_64_after_hwframe+0x44/0xae <...> Use mod_delayed_work() instead: if @connect_work is already scheduled, reschedule it, and undo sock_hold() to keep the reference count balanced. Reported-and-tested-by: syzbot+b03f55bf128f9a38f064@syzkaller.appspotmail.com Fixes: d021c344051a ("VSOCK: Introduce VM Sockets") Co-developed-by: Stefano Garzarella Signed-off-by: Stefano Garzarella Reviewed-by: Stefano Garzarella Signed-off-by: Peilin Ye Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/vmw_vsock/af_vsock.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c index 326250513570..e97165d9eff8 100644 --- a/net/vmw_vsock/af_vsock.c +++ b/net/vmw_vsock/af_vsock.c @@ -1340,7 +1340,14 @@ static int vsock_stream_connect(struct socket *sock, struct sockaddr *addr, * timeout fires. */ sock_hold(sk); - schedule_delayed_work(&vsk->connect_work, timeout); + + /* If the timeout function is already scheduled, + * reschedule it, then ungrab the socket refcount to + * keep it balanced. + */ + if (mod_delayed_work(system_wq, &vsk->connect_work, + timeout)) + sock_put(sk); /* Skip ahead to preserve error code set above. */ goto out_wait; -- Gitee From 05624f1097427a34a407b16a9dcf62329f5433ef Mon Sep 17 00:00:00 2001 From: Fedor Pchelkin Date: Fri, 5 Aug 2022 18:02:16 +0300 Subject: [PATCH 11/18] can: j1939: j1939_session_destroy(): fix memory leak of skbs stable inclusion from stable-5.10.138 commit a220ff343396bae8d3b6abee72ab51f1f34b3027 category: bugfix issue: I5YPLT CVE: CVE-2022-3633 Signed-off-by: gaochao --------------------------------------- commit 8c21c54a53ab21842f5050fa090f26b03c0313d6 upstream. We need to drop skb references taken in j1939_session_skb_queue() when destroying a session in j1939_session_destroy(). Otherwise those skbs would be lost. Link to Syzkaller info and repro: https://forge.ispras.ru/issues/11743. Found by Linux Verification Center (linuxtesting.org) with Syzkaller. 
V1: https://lore.kernel.org/all/20220708175949.539064-1-pchelkin@ispras.ru Fixes: 9d71dd0c7009 ("can: add support of SAE J1939 protocol") Suggested-by: Oleksij Rempel Signed-off-by: Fedor Pchelkin Signed-off-by: Alexey Khoroshilov Acked-by: Oleksij Rempel Link: https://lore.kernel.org/all/20220805150216.66313-1-pchelkin@ispras.ru Signed-off-by: Marc Kleine-Budde Signed-off-by: Greg Kroah-Hartman --- net/can/j1939/transport.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/net/can/j1939/transport.c b/net/can/j1939/transport.c index e59fbbffa31c..8322f9c23bb8 100644 --- a/net/can/j1939/transport.c +++ b/net/can/j1939/transport.c @@ -260,6 +260,8 @@ static void __j1939_session_drop(struct j1939_session *session) static void j1939_session_destroy(struct j1939_session *session) { + struct sk_buff *skb; + if (session->err) j1939_sk_errqueue(session, J1939_ERRQUEUE_ABORT); else @@ -270,7 +272,11 @@ static void j1939_session_destroy(struct j1939_session *session) WARN_ON_ONCE(!list_empty(&session->sk_session_queue_entry)); WARN_ON_ONCE(!list_empty(&session->active_session_list_entry)); - skb_queue_purge(&session->skb_queue); + while ((skb = skb_dequeue(&session->skb_queue)) != NULL) { + /* drop ref taken in j1939_session_skb_queue() */ + skb_unref(skb); + kfree_skb(skb); + } __j1939_session_drop(session); j1939_priv_put(session->priv); kfree(session); -- Gitee From caaff8f9f4a4760b568bea88528975d49a38f340 Mon Sep 17 00:00:00 2001 From: Duoming Zhou Date: Fri, 5 Aug 2022 15:00:08 +0800 Subject: [PATCH 12/18] atm: idt77252: fix use-after-free bugs caused by tst_timer stable inclusion from stable-5.10.138 commit a0ae122e9aeccbff75014c4d36d11a9d32e7fb5e category: bugfix issue: I5YPMB CVE: CVE-2022-3635 Signed-off-by: gaochao --------------------------------------- commit 3f4093e2bf4673f218c0bf17d8362337c400e77b upstream. There are use-after-free bugs caused by tst_timer. The root cause is that there are no functions to stop tst_timer in idt77252_exit(). One of the possible race conditions is shown below: (thread 1) | (thread 2) | idt77252_init_one | init_card | fill_tst | mod_timer(&card->tst_timer, ...) idt77252_exit | (wait a time) | tst_timer | | ... kfree(card) // FREE | | card->soft_tst[e] // USE The idt77252_dev is deallocated in idt77252_exit() and used in timer handler. This patch adds del_timer_sync() in idt77252_exit() in order that the timer handler could be stopped before the idt77252_dev is deallocated. 
Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Duoming Zhou Link: https://lore.kernel.org/r/20220805070008.18007-1-duoming@zju.edu.cn Signed-off-by: Jakub Kicinski Signed-off-by: Greg Kroah-Hartman --- drivers/atm/idt77252.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/atm/idt77252.c b/drivers/atm/idt77252.c index 5f0472c18bcb..82f6f1fbe9e7 100644 --- a/drivers/atm/idt77252.c +++ b/drivers/atm/idt77252.c @@ -3767,6 +3767,7 @@ static void __exit idt77252_exit(void) card = idt77252_chain; dev = card->atmdev; idt77252_chain = card->next; + del_timer_sync(&card->tst_timer); if (dev->phy->stop) dev->phy->stop(dev); -- Gitee From 2261bca97691b5380283dd3c3db0a3c7d60ff1ff Mon Sep 17 00:00:00 2001 From: Ryusuke Konishi Date: Fri, 7 Oct 2022 17:52:26 +0900 Subject: [PATCH 13/18] nilfs2: fix leak of nilfs_root in case of writer thread creation failure mainline inclusion from mainline-6.1-rc1 commit d0d51a97063db4704a5ef6bc978dddab1636a306 category: bugfix issue: I5YPLN CVE: CVE-2022-3646 Signed-off-by: gaochao --------------------------------------- If nilfs_attach_log_writer() failed to create a log writer thread, it frees a data structure of the log writer without any cleanup. After commit e912a5b66837 ("nilfs2: use root object to get ifile"), this causes a leak of struct nilfs_root, which started to leak an ifile metadata inode and a kobject on that struct. In addition, if the kernel is booted with panic_on_warn, the above ifile metadata inode leak will cause the following panic when the nilfs2 kernel module is removed: kmem_cache_destroy nilfs2_inode_cache: Slab cache still has objects when called from nilfs_destroy_cachep+0x16/0x3a [nilfs2] WARNING: CPU: 8 PID: 1464 at mm/slab_common.c:494 kmem_cache_destroy+0x138/0x140 ... RIP: 0010:kmem_cache_destroy+0x138/0x140 Code: 00 20 00 00 e8 a9 55 d8 ff e9 76 ff ff ff 48 8b 53 60 48 c7 c6 20 70 65 86 48 c7 c7 d8 69 9c 86 48 8b 4c 24 28 e8 ef 71 c7 00 <0f> 0b e9 53 ff ff ff c3 48 81 ff ff 0f 00 00 77 03 31 c0 c3 53 48 ... Call Trace: ? nilfs_palloc_freev.cold.24+0x58/0x58 [nilfs2] nilfs_destroy_cachep+0x16/0x3a [nilfs2] exit_nilfs_fs+0xa/0x1b [nilfs2] __x64_sys_delete_module+0x1d9/0x3a0 ? __sanitizer_cov_trace_pc+0x1a/0x50 ? syscall_trace_enter.isra.19+0x119/0x190 do_syscall_64+0x34/0x80 entry_SYSCALL_64_after_hwframe+0x63/0xcd ... Kernel panic - not syncing: panic_on_warn set ... This patch fixes these issues by calling nilfs_detach_log_writer() cleanup function if spawning the log writer thread fails. 
Link: https://lkml.kernel.org/r/20221007085226.57667-1-konishi.ryusuke@gmail.com Fixes: e912a5b66837 ("nilfs2: use root object to get ifile") Signed-off-by: Ryusuke Konishi Reported-by: syzbot+7381dc4ad60658ca4c05@syzkaller.appspotmail.com Tested-by: Ryusuke Konishi Cc: Signed-off-by: Andrew Morton --- fs/nilfs2/segment.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c index e3726aca28ed..6f5ccfc8819e 100644 --- a/fs/nilfs2/segment.c +++ b/fs/nilfs2/segment.c @@ -2788,10 +2788,9 @@ int nilfs_attach_log_writer(struct super_block *sb, struct nilfs_root *root) inode_attach_wb(nilfs->ns_bdev->bd_inode, NULL); err = nilfs_segctor_start_thread(nilfs->ns_writer); - if (err) { - kfree(nilfs->ns_writer); - nilfs->ns_writer = NULL; - } + if (unlikely(err)) + nilfs_detach_log_writer(sb); + return err; } -- Gitee From bf05f7afd9460f3a28bc0ab1ebcb4a090d30abe7 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 7 Sep 2022 10:26:18 +0200 Subject: [PATCH 14/18] netfilter: nfnetlink_osf: fix possible bogus match in nf_osf_find() stable inclusion from stable-5.10.146 commit 5d75fef3e61e797fab5c3fbba88caa74ab92ad47 category: bugfix issue: I5YPLH CVE: CVE-2022-42432 Signed-off-by: gaochao --------------------------------------- [ Upstream commit 559c36c5a8d730c49ef805a72b213d3bba155cc8 ] nf_osf_find() incorrectly returns true on mismatch, this leads to copying uninitialized memory area in nft_osf which can be used to leak stale kernel stack data to userspace. Fixes: 22c7652cdaa8 ("netfilter: nft_osf: Add version option support") Signed-off-by: Pablo Neira Ayuso Signed-off-by: Florian Westphal Signed-off-by: Sasha Levin --- net/netfilter/nfnetlink_osf.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/net/netfilter/nfnetlink_osf.c b/net/netfilter/nfnetlink_osf.c index 79fbf37291f3..51e3953b414c 100644 --- a/net/netfilter/nfnetlink_osf.c +++ b/net/netfilter/nfnetlink_osf.c @@ -269,6 +269,7 @@ bool nf_osf_find(const struct sk_buff *skb, struct nf_osf_hdr_ctx ctx; const struct tcphdr *tcp; struct tcphdr _tcph; + bool found = false; memset(&ctx, 0, sizeof(ctx)); @@ -283,10 +284,11 @@ bool nf_osf_find(const struct sk_buff *skb, data->genre = f->genre; data->version = f->version; + found = true; break; } - return true; + return found; } EXPORT_SYMBOL_GPL(nf_osf_find); -- Gitee From c1378eef7b6e9fb05a24e7f6462a717967d9cf7d Mon Sep 17 00:00:00 2001 From: Tadeusz Struk Date: Mon, 19 Sep 2022 14:59:57 -0700 Subject: [PATCH 15/18] usb: mon: make mmapped memory read only mainline inclusion from mainline-6.1-rc1 commit a659daf63d16aa883be42f3f34ff84235c302198 category: bugfix issue: I5Z39B CVE: CVE-2022-43750 Signed-off-by: gaochao --------------------------------------- Syzbot found an issue in usbmon module, where the user space client can corrupt the monitor's internal memory, causing the usbmon module to crash the kernel with segfault, UAF, etc. The reproducer mmaps the /dev/usbmon memory to user space, and overwrites it with arbitrary data, which causes all kinds of issues. Return an -EPERM error from mon_bin_mmap() if the flag VM_WRITE is set. Also clear VM_MAYWRITE to make it impossible to change it to writable later.
Cc: "Dmitry Vyukov" Cc: stable Fixes: 6f23ee1fefdc ("USB: add binary API to usbmon") Suggested-by: PaX Team # for the VM_MAYRITE portion Link: https://syzkaller.appspot.com/bug?id=2eb1f35d6525fa4a74d75b4244971e5b1411c95a Reported-by: syzbot+23f57c5ae902429285d7@syzkaller.appspotmail.com Signed-off-by: Tadeusz Struk Link: https://lore.kernel.org/r/20220919215957.205681-1-tadeusz.struk@linaro.org Signed-off-by: Greg Kroah-Hartman --- drivers/usb/mon/mon_bin.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/usb/mon/mon_bin.c b/drivers/usb/mon/mon_bin.c index f48a23adbc35..094e812e9e69 100644 --- a/drivers/usb/mon/mon_bin.c +++ b/drivers/usb/mon/mon_bin.c @@ -1268,6 +1268,11 @@ static int mon_bin_mmap(struct file *filp, struct vm_area_struct *vma) { /* don't do anything here: "fault" will set up page table entries */ vma->vm_ops = &mon_bin_vm_ops; + + if (vma->vm_flags & VM_WRITE) + return -EPERM; + + vma->vm_flags &= ~VM_MAYWRITE; vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; vma->vm_private_data = filp->private_data; mon_bin_vma_open(vma); -- Gitee From 25926da2f9b57a13fde8ed6f586271e0089980c8 Mon Sep 17 00:00:00 2001 From: Alistair Popple Date: Thu, 3 Nov 2022 18:49:10 +0800 Subject: [PATCH 16/18] mm/memory.c: fix race when faulting a device private page MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit mainline inclusion from mainline-v6.1-rc1 commit 16ce101db85db694a91380aa4c89b25530871d33 category: bugfix issue: I5XD82 CVE: CVE-2022-3523 Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=16ce101db85db694a91380aa4c89b25530871d33 Signed-off-by: gaochao -------------------------------- Patch series "Fix several device private page reference counting issues", v2 This series aims to fix a number of page reference counting issues in drivers dealing with device private ZONE_DEVICE pages. These result in use-after-free type bugs, either from accessing a struct page which no longer exists because it has been removed or accessing fields within the struct page which are no longer valid because the page has been freed. During normal usage it is unlikely these will cause any problems. However without these fixes it is possible to crash the kernel from userspace. These crashes can be triggered either by unloading the kernel module or unbinding the device from the driver prior to a userspace task exiting. In modules such as Nouveau it is also possible to trigger some of these issues by explicitly closing the device file-descriptor prior to the task exiting and then accessing device private memory. This involves some minor changes to both PowerPC and AMD GPU code. Unfortunately I lack hardware to test either of those so any help there would be appreciated. The changes mimic what is done in for both Nouveau and hmm-tests though so I doubt they will cause problems. This patch (of 8): When the CPU tries to access a device private page the migrate_to_ram() callback associated with the pgmap for the page is called. However no reference is taken on the faulting page. Therefore a concurrent migration of the device private page can free the page and possibly the underlying pgmap. This results in a race which can crash the kernel due to the migrate_to_ram() function pointer becoming invalid. It also means drivers can't reliably read the zone_device_data field because the page may have been freed with memunmap_pages(). Close the race by getting a reference on the page while holding the ptl to ensure it has not been freed. 
Unfortunately the elevated reference count will cause the migration required to handle the fault to fail. To avoid this failure pass the faulting page into the migrate_vma functions so that if an elevated reference count is found it can be checked to see if it's expected or not. [mpe@ellerman.id.au: fix build] Link: https://lkml.kernel.org/r/87fsgbf3gh.fsf@mpe.ellerman.id.au Link: https://lkml.kernel.org/r/cover.60659b549d8509ddecafad4f498ee7f03bb23c69.1664366292.git-series.apopple@nvidia.com Link: https://lkml.kernel.org/r/d3e813178a59e565e8d78d9b9a4e2562f6494f90.1664366292.git-series.apopple@nvidia.com Signed-off-by: Alistair Popple Acked-by: Felix Kuehling Cc: Jason Gunthorpe Cc: John Hubbard Cc: Ralph Campbell Cc: Michael Ellerman Cc: Lyude Paul Cc: Alex Deucher Cc: Alex Sierra Cc: Ben Skeggs Cc: Christian König Cc: Dan Williams Cc: David Hildenbrand Cc: "Huang, Ying" Cc: Matthew Wilcox Cc: Yang Shi Cc: Zi Yan Signed-off-by: Andrew Morton Conflicts: arch/powerpc/kvm/book3s_hv_uvmem.c include/linux/migrate.h lib/test_hmm.c mm/migrate.c Signed-off-by: Ma Wupeng Reviewed-by: tong tiangen Reviewed-by: Xiu Jianfeng Signed-off-by: Zheng Zengkai --- arch/powerpc/kvm/book3s_hv_uvmem.c | 19 ++++++++------ include/linux/migrate.h | 9 +++++++ lib/test_hmm.c | 5 ++-- mm/memory.c | 16 +++++++++++- mm/migrate.c | 42 ++++++++++++++++++++---------- 5 files changed, 66 insertions(+), 25 deletions(-) diff --git a/arch/powerpc/kvm/book3s_hv_uvmem.c b/arch/powerpc/kvm/book3s_hv_uvmem.c index 84e5a2dc8be5..9168238860b8 100644 --- a/arch/powerpc/kvm/book3s_hv_uvmem.c +++ b/arch/powerpc/kvm/book3s_hv_uvmem.c @@ -504,10 +504,10 @@ unsigned long kvmppc_h_svm_init_start(struct kvm *kvm) static int __kvmppc_svm_page_out(struct vm_area_struct *vma, unsigned long start, unsigned long end, unsigned long page_shift, - struct kvm *kvm, unsigned long gpa) + struct kvm *kvm, unsigned long gpa, struct page *fault_page) { unsigned long src_pfn, dst_pfn = 0; - struct migrate_vma mig; + struct migrate_vma mig = { 0 }; struct page *dpage, *spage; struct kvmppc_uvmem_page_pvt *pvt; unsigned long pfn; @@ -521,6 +521,7 @@ static int __kvmppc_svm_page_out(struct vm_area_struct *vma, mig.dst = &dst_pfn; mig.pgmap_owner = &kvmppc_uvmem_pgmap; mig.flags = MIGRATE_VMA_SELECT_DEVICE_PRIVATE; + mig.fault_page = fault_page; /* The requested page is already paged-out, nothing to do */ if (!kvmppc_gfn_is_uvmem_pfn(gpa >> page_shift, kvm, NULL)) @@ -576,12 +577,14 @@ static int __kvmppc_svm_page_out(struct vm_area_struct *vma, static inline int kvmppc_svm_page_out(struct vm_area_struct *vma, unsigned long start, unsigned long end, unsigned long page_shift, - struct kvm *kvm, unsigned long gpa) + struct kvm *kvm, unsigned long gpa, + struct page *fault_page) { int ret; mutex_lock(&kvm->arch.uvmem_lock); - ret = __kvmppc_svm_page_out(vma, start, end, page_shift, kvm, gpa); + ret = __kvmppc_svm_page_out(vma, start, end, page_shift, kvm, gpa, + fault_page); mutex_unlock(&kvm->arch.uvmem_lock); return ret; @@ -630,7 +633,7 @@ void kvmppc_uvmem_drop_pages(const struct kvm_memory_slot *slot, pvt->remove_gfn = true; if (__kvmppc_svm_page_out(vma, addr, addr + PAGE_SIZE, - PAGE_SHIFT, kvm, pvt->gpa)) + PAGE_SHIFT, kvm, pvt->gpa, NULL)) pr_err("Can't page out gpa:0x%lx addr:0x%lx\n", pvt->gpa, addr); } else { @@ -733,7 +736,7 @@ static int kvmppc_svm_page_in(struct vm_area_struct *vma, bool pagein) { unsigned long src_pfn, dst_pfn = 0; - struct migrate_vma mig; + struct migrate_vma mig = { 0 }; struct page *spage; unsigned long pfn; struct page 
*dpage;

@@ -991,7 +994,7 @@ static vm_fault_t kvmppc_uvmem_migrate_to_ram(struct vm_fault *vmf)
 
     if (kvmppc_svm_page_out(vmf->vma, vmf->address,
                 vmf->address + PAGE_SIZE, PAGE_SHIFT,
-                pvt->kvm, pvt->gpa))
+                pvt->kvm, pvt->gpa, vmf->page))
         return VM_FAULT_SIGBUS;
     else
         return 0;
@@ -1062,7 +1065,7 @@ kvmppc_h_svm_page_out(struct kvm *kvm, unsigned long gpa,
     if (!vma || vma->vm_start > start || vma->vm_end < end)
         goto out;
 
-    if (!kvmppc_svm_page_out(vma, start, end, page_shift, kvm, gpa))
+    if (!kvmppc_svm_page_out(vma, start, end, page_shift, kvm, gpa, NULL))
         ret = H_SUCCESS;
 out:
     mmap_read_unlock(kvm->mm);
diff --git a/include/linux/migrate.h b/include/linux/migrate.h
index 0f8d1583fa8e..a9de6d3ae07d 100644
--- a/include/linux/migrate.h
+++ b/include/linux/migrate.h
@@ -36,6 +36,9 @@ extern const char *migrate_reason_names[MR_TYPES];
 #ifdef CONFIG_MIGRATION
 
 extern void putback_movable_pages(struct list_head *l);
+extern int migrate_page_extra(struct address_space *mapping,
+            struct page *newpage, struct page *page,
+            enum migrate_mode mode, int extra_count);
 extern int migrate_page(struct address_space *mapping,
             struct page *newpage, struct page *page,
             enum migrate_mode mode);
@@ -190,6 +193,12 @@ struct migrate_vma {
      */
     void            *pgmap_owner;
     unsigned long       flags;
+
+    /*
+     * Set to vmf->page if this is being called to migrate a page as part of
+     * a migrate_to_ram() callback.
+     */
+    struct page     *fault_page;
 };
 
 int migrate_vma_setup(struct migrate_vma *args);
diff --git a/lib/test_hmm.c b/lib/test_hmm.c
index 80a78877bd93..a66a7aebef35 100644
--- a/lib/test_hmm.c
+++ b/lib/test_hmm.c
@@ -671,7 +671,7 @@ static int dmirror_migrate(struct dmirror *dmirror,
     unsigned long src_pfns[64];
     unsigned long dst_pfns[64];
     struct dmirror_bounce bounce;
-    struct migrate_vma args;
+    struct migrate_vma args = { 0 };
     unsigned long next;
     int ret;
 
@@ -1024,7 +1024,7 @@ static vm_fault_t dmirror_devmem_fault_alloc_and_copy(struct migrate_vma *args,
 
 static vm_fault_t dmirror_devmem_fault(struct vm_fault *vmf)
 {
-    struct migrate_vma args;
+    struct migrate_vma args = { 0 };
     unsigned long src_pfns;
     unsigned long dst_pfns;
     struct page *rpage;
@@ -1047,6 +1047,7 @@ static vm_fault_t dmirror_devmem_fault(struct vm_fault *vmf)
     args.dst = &dst_pfns;
     args.pgmap_owner = dmirror->mdevice;
     args.flags = MIGRATE_VMA_SELECT_DEVICE_PRIVATE;
+    args.fault_page = vmf->page;
     if (migrate_vma_setup(&args))
         return VM_FAULT_SIGBUS;
 
diff --git a/mm/memory.c b/mm/memory.c
index 4fe24cd865a7..4ad44bd325e5 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3320,7 +3320,21 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
                          vmf->address);
     } else if (is_device_private_entry(entry)) {
         vmf->page = device_private_entry_to_page(entry);
-        ret = vmf->page->pgmap->ops->migrate_to_ram(vmf);
+        vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
+                vmf->address, &vmf->ptl);
+        if (unlikely(!pte_same(*vmf->pte, vmf->orig_pte))) {
+            spin_unlock(vmf->ptl);
+            goto out;
+        }
+
+        /*
+         * Get a page reference while we know the page can't be
+         * freed.
+         */
+        get_page(vmf->page);
+        pte_unmap_unlock(vmf->pte, vmf->ptl);
+        vmf->page->pgmap->ops->migrate_to_ram(vmf);
+        put_page(vmf->page);
     } else if (is_hwpoison_entry(entry)) {
         ret = VM_FAULT_HWPOISON;
     } else {
diff --git a/mm/migrate.c b/mm/migrate.c
index 278e6f3fa62c..0758aa3836f7 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -690,21 +690,15 @@ EXPORT_SYMBOL(migrate_page_copy);
  *                    Migration functions
  ***********************************************************/
 
-/*
- * Common logic to directly migrate a single LRU page suitable for
- * pages that do not use PagePrivate/PagePrivate2.
- *
- * Pages are locked upon entry and exit.
- */
-int migrate_page(struct address_space *mapping,
+int migrate_page_extra(struct address_space *mapping,
         struct page *newpage, struct page *page,
-        enum migrate_mode mode)
+        enum migrate_mode mode, int extra_count)
 {
     int rc;
 
     BUG_ON(PageWriteback(page));    /* Writeback must be complete */
 
-    rc = migrate_page_move_mapping(mapping, newpage, page, 0);
+    rc = migrate_page_move_mapping(mapping, newpage, page, extra_count);
 
     if (rc != MIGRATEPAGE_SUCCESS)
         return rc;
@@ -715,6 +709,19 @@ int migrate_page(struct address_space *mapping,
         migrate_page_states(newpage, page);
     return MIGRATEPAGE_SUCCESS;
 }
+
+/*
+ * Common logic to directly migrate a single LRU page suitable for
+ * pages that do not use PagePrivate/PagePrivate2.
+ *
+ * Pages are locked upon entry and exit.
+ */
+int migrate_page(struct address_space *mapping,
+        struct page *newpage, struct page *page,
+        enum migrate_mode mode)
+{
+    return migrate_page_extra(mapping, newpage, page, mode, 0);
+}
 EXPORT_SYMBOL(migrate_page);
 
 #ifdef CONFIG_BLOCK
@@ -2522,14 +2529,14 @@ static void migrate_vma_collect(struct migrate_vma *migrate)
  * migrate_page_move_mapping(), except that here we allow migration of a
  * ZONE_DEVICE page.
  */
-static bool migrate_vma_check_page(struct page *page)
+static bool migrate_vma_check_page(struct page *page, struct page *fault_page)
 {
     /*
     * One extra ref because caller holds an extra reference, either from
     * isolate_lru_page() for a regular page, or migrate_vma_collect() for
     * a device page.
     */
-    int extra = 1;
+    int extra = 1 + (page == fault_page);
 
     /*
     * FIXME support THP (transparent huge page), it is bit more complex to
@@ -2637,7 +2644,7 @@ static void migrate_vma_prepare(struct migrate_vma *migrate)
             put_page(page);
         }
 
-        if (!migrate_vma_check_page(page)) {
+        if (!migrate_vma_check_page(page, migrate->fault_page)) {
             if (remap) {
                 migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
                 migrate->cpages--;
@@ -2705,7 +2712,7 @@ static void migrate_vma_unmap(struct migrate_vma *migrate)
             goto restore;
         }
 
-        if (migrate_vma_check_page(page))
+        if (migrate_vma_check_page(page, migrate->fault_page))
             continue;
 
 restore:
@@ -2815,6 +2822,8 @@ int migrate_vma_setup(struct migrate_vma *args)
         return -EINVAL;
     if (!args->src || !args->dst)
         return -EINVAL;
+    if (args->fault_page && !is_device_private_page(args->fault_page))
+        return -EINVAL;
 
     memset(args->src, 0, sizeof(*args->src) * nr_pages);
     args->cpages = 0;
@@ -3045,7 +3054,12 @@ void migrate_vma_pages(struct migrate_vma *migrate)
             }
         }
 
-        r = migrate_page(mapping, newpage, page, MIGRATE_SYNC_NO_COPY);
+        if (migrate->fault_page == page)
+            r = migrate_page_extra(mapping, newpage, page,
+                        MIGRATE_SYNC_NO_COPY, 1);
+        else
+            r = migrate_page(mapping, newpage, page,
+                        MIGRATE_SYNC_NO_COPY);
         if (r != MIGRATEPAGE_SUCCESS)
             migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
     }
--
Gitee

From 7bf6d3c18141337e54d4ae19d5a02952eddfbd6e Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Sat, 21 May 2022 12:19:15 +0800
Subject: [PATCH 17/18] ipv6: annotate some data-races around sk->sk_prot

mainline inclusion
from mainline-v5.18-rc1
commit 086d49058cd8471046ae9927524708820f5fd1c7
category: bugfix
issue: I5YPMQ
CVE: NA

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=086d49058cd8471046ae9927524708820f5fd1c7

Signed-off-by: gaochao

--------------------------------

IPv6 has this hack changing sk->sk_prot when an IPv6 socket
is 'converted' to an IPv4 one with IPV6_ADDRFORM option.

This operation is only performed for TCP and UDP, knowing
their 'struct proto' for the two network families are populated
in the same way, and can not disappear while a reader
might use and dereference sk->sk_prot.

If we think about it all reads of sk->sk_prot while
either socket lock or RTNL is not acquired should be using READ_ONCE().

Also note that other layers like MPTCP, XFRM, CHELSIO_TLS
also write over sk->sk_prot.
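For illustration only, here is a small user-space analogue (not kernel code; every name is a simplified stand-in) of the two paths the KCSAN report below flags: one thread rewrites a shared proto pointer the way the IPV6_ADDRFORM conversion does, while another dereferences it the way recvmsg does. READ_ONCE()/WRITE_ONCE() are reduced to their usual volatile-cast definitions so the snippet builds with gcc -pthread.

/*
 * Illustrative analogue only, NOT kernel code: a shared ops pointer is
 * swapped by one thread and dereferenced by another. Each side uses one
 * marked access, so the compiler can neither tear the load/store nor
 * reload the pointer between uses.
 */
#include <pthread.h>
#include <stdio.h>

/* Simplified stand-ins for the kernel's READ_ONCE()/WRITE_ONCE(). */
#define READ_ONCE(x)     (*(const volatile __typeof__(x) *)&(x))
#define WRITE_ONCE(x, v) (*(volatile __typeof__(x) *)&(x) = (v))

struct proto {
	const char *name;
	int (*recvmsg)(void);
};

static int recv_v6(void) { return 6; }
static int recv_v4(void) { return 4; }

static struct proto tcpv6_prot = { "TCPv6", recv_v6 };
static struct proto tcp_prot   = { "TCP",   recv_v4 };

struct sock {
	struct proto *sk_prot;
};

static struct sock sk = { .sk_prot = &tcpv6_prot };

/* Writer side: what the IPV6_ADDRFORM conversion does to sk->sk_prot. */
static void *addrform_thread(void *arg)
{
	(void)arg;
	WRITE_ONCE(sk.sk_prot, &tcp_prot);
	return NULL;
}

/* Reader side: snapshot the pointer once, then use only the snapshot. */
static void *recvmsg_thread(void *arg)
{
	struct proto *prot = READ_ONCE(sk.sk_prot);

	(void)arg;
	printf("%s -> %d\n", prot->name, prot->recvmsg());
	return NULL;
}

int main(void)
{
	pthread_t w, r;

	pthread_create(&w, NULL, addrform_thread, NULL);
	pthread_create(&r, NULL, recvmsg_thread, NULL);
	pthread_join(w, NULL);
	pthread_join(r, NULL);
	return 0;
}

Whichever proto the reader observes, it calls name and recvmsg through the same snapshot; that consistency, not any particular ordering of the two threads, is what the annotations in the hunks below provide.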
BUG: KCSAN: data-race in inet6_recvmsg / ipv6_setsockopt

write to 0xffff8881386f7aa8 of 8 bytes by task 26932 on cpu 0:
 do_ipv6_setsockopt net/ipv6/ipv6_sockglue.c:492 [inline]
 ipv6_setsockopt+0x3758/0x3910 net/ipv6/ipv6_sockglue.c:1019
 udpv6_setsockopt+0x85/0x90 net/ipv6/udp.c:1649
 sock_common_setsockopt+0x5d/0x70 net/core/sock.c:3489
 __sys_setsockopt+0x209/0x2a0 net/socket.c:2180
 __do_sys_setsockopt net/socket.c:2191 [inline]
 __se_sys_setsockopt net/socket.c:2188 [inline]
 __x64_sys_setsockopt+0x62/0x70 net/socket.c:2188
 do_syscall_x64 arch/x86/entry/common.c:50 [inline]
 do_syscall_64+0x44/0xd0 arch/x86/entry/common.c:80
 entry_SYSCALL_64_after_hwframe+0x44/0xae

read to 0xffff8881386f7aa8 of 8 bytes by task 26911 on cpu 1:
 inet6_recvmsg+0x7a/0x210 net/ipv6/af_inet6.c:659
 ____sys_recvmsg+0x16c/0x320
 ___sys_recvmsg net/socket.c:2674 [inline]
 do_recvmmsg+0x3f5/0xae0 net/socket.c:2768
 __sys_recvmmsg net/socket.c:2847 [inline]
 __do_sys_recvmmsg net/socket.c:2870 [inline]
 __se_sys_recvmmsg net/socket.c:2863 [inline]
 __x64_sys_recvmmsg+0xde/0x160 net/socket.c:2863
 do_syscall_x64 arch/x86/entry/common.c:50 [inline]
 do_syscall_64+0x44/0xd0 arch/x86/entry/common.c:80
 entry_SYSCALL_64_after_hwframe+0x44/0xae

value changed: 0xffffffff85e0e980 -> 0xffffffff85e01580

Reported by Kernel Concurrency Sanitizer on:
CPU: 1 PID: 26911 Comm: syz-executor.3 Not tainted 5.17.0-rc2-syzkaller-00316-g0457e5153e0e-dirty #0
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011

Reported-by: syzbot
Signed-off-by: Eric Dumazet
Signed-off-by: David S. Miller

conflicts:
	net/ipv6/af_inet6.c

Signed-off-by: Xu Jia
Reviewed-by: Wei Yongjun
Reviewed-by: Yue Haibing
Signed-off-by: Zheng Zengkai
---
 net/ipv6/af_inet6.c      | 23 +++++++++++++++++------
 net/ipv6/ipv6_sockglue.c |  6 ++++--
 2 files changed, 21 insertions(+), 8 deletions(-)

diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index e648fbebb167..0f54e6843f0a 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -439,11 +439,13 @@ static int __inet6_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len,
 int inet6_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
 {
     struct sock *sk = sock->sk;
+    const struct proto *prot;
     int err = 0;
 
     /* If the socket has its own bind function then use it. */
-    if (sk->sk_prot->bind)
-        return sk->sk_prot->bind(sk, uaddr, addr_len);
+    prot = READ_ONCE(sk->sk_prot);
+    if (prot->bind)
+        return prot->bind(sk, uaddr, addr_len);
 
     if (addr_len < SIN6_LEN_RFC2133)
         return -EINVAL;
@@ -550,6 +552,7 @@ int inet6_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
     void __user *argp = (void __user *)arg;
     struct sock *sk = sock->sk;
     struct net *net = sock_net(sk);
+    const struct proto *prot;
 
     switch (cmd) {
     case SIOCADDRT:
@@ -567,9 +570,11 @@ int inet6_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
     case SIOCSIFDSTADDR:
         return addrconf_set_dstaddr(net, argp);
     default:
-        if (!sk->sk_prot->ioctl)
+        /* IPV6_ADDRFORM can change sk->sk_prot under us. */
+        prot = READ_ONCE(sk->sk_prot);
+        if (!prot->ioctl)
             return -ENOIOCTLCMD;
-        return sk->sk_prot->ioctl(sk, cmd, arg);
+        return prot->ioctl(sk, cmd, arg);
     }
     /*NOTREACHED*/
     return 0;
@@ -631,11 +636,14 @@ INDIRECT_CALLABLE_DECLARE(int udpv6_sendmsg(struct sock *, struct msghdr *,
 int inet6_sendmsg(struct socket *sock, struct msghdr *msg, size_t size)
 {
     struct sock *sk = sock->sk;
+    const struct proto *prot;
 
     if (unlikely(inet_send_prepare(sk)))
         return -EAGAIN;
 
-    return INDIRECT_CALL_2(sk->sk_prot->sendmsg, tcp_sendmsg, udpv6_sendmsg,
+    /* IPV6_ADDRFORM can change sk->sk_prot under us. */
+    prot = READ_ONCE(sk->sk_prot);
+    return INDIRECT_CALL_2(prot->sendmsg, tcp_sendmsg, udpv6_sendmsg,
                    sk, msg, size);
 }
 
@@ -645,13 +653,16 @@ int inet6_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
           int flags)
 {
     struct sock *sk = sock->sk;
+    const struct proto *prot;
     int addr_len = 0;
     int err;
 
     if (likely(!(flags & MSG_ERRQUEUE)))
         sock_rps_record_flow(sk);
 
-    err = INDIRECT_CALL_2(sk->sk_prot->recvmsg, tcp_recvmsg, udpv6_recvmsg,
+    /* IPV6_ADDRFORM can change sk->sk_prot under us. */
+    prot = READ_ONCE(sk->sk_prot);
+    err = INDIRECT_CALL_2(prot->recvmsg, tcp_recvmsg, udpv6_recvmsg,
                   sk, msg, size, flags & MSG_DONTWAIT,
                   flags & ~MSG_DONTWAIT, &addr_len);
     if (err >= 0)
diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c
index b19eedff6bf5..6cfb4042994e 100644
--- a/net/ipv6/ipv6_sockglue.c
+++ b/net/ipv6/ipv6_sockglue.c
@@ -479,7 +479,8 @@ static int do_ipv6_setsockopt(struct sock *sk, int level, int optname,
                 sock_prot_inuse_add(net, sk->sk_prot, -1);
                 sock_prot_inuse_add(net, &tcp_prot, 1);
                 local_bh_enable();
-                sk->sk_prot = &tcp_prot;
+                /* Paired with READ_ONCE(sk->sk_prot) in net/ipv6/af_inet6.c */
+                WRITE_ONCE(sk->sk_prot, &tcp_prot);
                 /* Paired with READ_ONCE() in tcp_(get|set)sockopt() */
                 WRITE_ONCE(icsk->icsk_af_ops, &ipv4_specific);
                 sk->sk_socket->ops = &inet_stream_ops;
@@ -494,7 +495,8 @@ static int do_ipv6_setsockopt(struct sock *sk, int level, int optname,
                 sock_prot_inuse_add(net, sk->sk_prot, -1);
                 sock_prot_inuse_add(net, prot, 1);
                 local_bh_enable();
-                sk->sk_prot = prot;
+                /* Paired with READ_ONCE(sk->sk_prot) in net/ipv6/af_inet6.c */
+                WRITE_ONCE(sk->sk_prot, prot);
                 sk->sk_socket->ops = &inet_dgram_ops;
                 sk->sk_family = PF_INET;
             }
--
Gitee

From 69d16596666831af075c0cd9f702c3813139b39d Mon Sep 17 00:00:00 2001
From: Kuniyuki Iwashima
Date: Thu, 27 Oct 2022 18:36:29 +0800
Subject: [PATCH 18/18] ipv6: Fix data races around sk->sk_prot.

mainline inclusion
from mainline-6.1-rc1
commit 364f997b5cfe1db0d63a390fe7c801fa2b3115f6
category: bugfix
issue: I5YPMQ
CVE: CVE-2022-3567

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=364f997b5cfe1db0d63a390fe7c801fa2b3115f6

Signed-off-by: gaochao

---------------------------

Commit 086d49058cd8 ("ipv6: annotate some data-races around sk->sk_prot")
fixed some data-races around sk->sk_prot but it was not enough.

Some functions in inet6_(stream|dgram)_ops still access sk->sk_prot
without lock_sock() or rtnl_lock(), so they need READ_ONCE() to avoid
load tearing.
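To make the load-tearing point concrete, the sketch below (simplified types, not the kernel's own code) contrasts the call-site shape this patch removes with the one it introduces: the unannotated version reads sk->sk_prot twice, so the compiler may reload or tear the pointer between the check and the indirect call, while the converted version takes a single READ_ONCE() snapshot into a local and uses only that.

/* Sketch with simplified types; READ_ONCE() reduced to its volatile cast. */
#include <sys/types.h>

#define READ_ONCE(x) (*(const volatile __typeof__(x) *)&(x))

struct proto {
	ssize_t (*sendpage)(void *sk, int flags);
};

struct sock {
	struct proto *sk_prot;
};

/* Racy shape (before): two independent loads of sk->sk_prot. */
ssize_t sendpage_before(struct sock *sk, int flags)
{
	if (sk->sk_prot->sendpage)
		return sk->sk_prot->sendpage(sk, flags);
	return -1;
}

/* Annotated shape (after): one marked load, reused for check and call. */
ssize_t sendpage_after(struct sock *sk, int flags)
{
	const struct proto *prot = READ_ONCE(sk->sk_prot);

	if (prot->sendpage)
		return prot->sendpage(sk, flags);
	return -1;
}

The hunks that follow apply this same transformation to sock_common_getsockopt(), sock_common_setsockopt(), inet_dgram_connect(), inet_accept() and inet_sendpage().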
Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2")
Signed-off-by: Kuniyuki Iwashima
Signed-off-by: Jakub Kicinski
Signed-off-by: Liu Jian

Conflicts:
	net/ipv6/ipv6_sockglue.c

Reviewed-by: Yue Haibing
Reviewed-by: Xiu Jianfeng
Signed-off-by: Zheng Zengkai
---
 net/core/sock.c          |  6 ++++--
 net/ipv4/af_inet.c       | 23 ++++++++++++++++-------
 net/ipv6/ipv6_sockglue.c |  4 ++--
 3 files changed, 22 insertions(+), 11 deletions(-)

diff --git a/net/core/sock.c b/net/core/sock.c
index 850925cb7fc0..aeaa3dc50049 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -3226,7 +3226,8 @@ int sock_common_getsockopt(struct socket *sock, int level, int optname,
 {
     struct sock *sk = sock->sk;
 
-    return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
+    /* IPV6_ADDRFORM can change sk->sk_prot under us. */
+    return READ_ONCE(sk->sk_prot)->getsockopt(sk, level, optname, optval, optlen);
 }
 EXPORT_SYMBOL(sock_common_getsockopt);
 
@@ -3253,7 +3254,8 @@ int sock_common_setsockopt(struct socket *sock, int level, int optname,
 {
     struct sock *sk = sock->sk;
 
-    return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
+    /* IPV6_ADDRFORM can change sk->sk_prot under us. */
+    return READ_ONCE(sk->sk_prot)->setsockopt(sk, level, optname, optval, optlen);
 }
 EXPORT_SYMBOL(sock_common_setsockopt);
 
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 8267349afe23..ffa999a5b210 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -559,22 +559,27 @@ int inet_dgram_connect(struct socket *sock, struct sockaddr *uaddr,
                int addr_len, int flags)
 {
     struct sock *sk = sock->sk;
+    const struct proto *prot;
     int err;
 
     if (addr_len < sizeof(uaddr->sa_family))
         return -EINVAL;
+
+    /* IPV6_ADDRFORM can change sk->sk_prot under us. */
+    prot = READ_ONCE(sk->sk_prot);
+
     if (uaddr->sa_family == AF_UNSPEC)
-        return sk->sk_prot->disconnect(sk, flags);
+        return prot->disconnect(sk, flags);
 
     if (BPF_CGROUP_PRE_CONNECT_ENABLED(sk)) {
-        err = sk->sk_prot->pre_connect(sk, uaddr, addr_len);
+        err = prot->pre_connect(sk, uaddr, addr_len);
         if (err)
             return err;
     }
 
     if (data_race(!inet_sk(sk)->inet_num) && inet_autobind(sk))
         return -EAGAIN;
-    return sk->sk_prot->connect(sk, uaddr, addr_len);
+    return prot->connect(sk, uaddr, addr_len);
 }
 EXPORT_SYMBOL(inet_dgram_connect);
 
@@ -735,10 +740,11 @@ EXPORT_SYMBOL(inet_stream_connect);
 int inet_accept(struct socket *sock, struct socket *newsock, int flags,
         bool kern)
 {
-    struct sock *sk1 = sock->sk;
+    struct sock *sk1 = sock->sk, *sk2;
     int err = -EINVAL;
-    struct sock *sk2 = sk1->sk_prot->accept(sk1, flags, &err, kern);
 
+    /* IPV6_ADDRFORM can change sk->sk_prot under us. */
+    sk2 = READ_ONCE(sk1->sk_prot)->accept(sk1, flags, &err, kern);
     if (!sk2)
         goto do_err;
 
@@ -823,12 +829,15 @@ ssize_t inet_sendpage(struct socket *sock, struct page *page, int offset,
               size_t size, int flags)
 {
     struct sock *sk = sock->sk;
+    const struct proto *prot;
 
     if (unlikely(inet_send_prepare(sk)))
         return -EAGAIN;
 
-    if (sk->sk_prot->sendpage)
-        return sk->sk_prot->sendpage(sk, page, offset, size, flags);
+    /* IPV6_ADDRFORM can change sk->sk_prot under us. */
+    prot = READ_ONCE(sk->sk_prot);
+    if (prot->sendpage)
+        return prot->sendpage(sk, page, offset, size, flags);
     return sock_no_sendpage(sock, page, offset, size, flags);
 }
 EXPORT_SYMBOL(inet_sendpage);
 
diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c
index 6cfb4042994e..65f83ce728fa 100644
--- a/net/ipv6/ipv6_sockglue.c
+++ b/net/ipv6/ipv6_sockglue.c
@@ -479,7 +479,7 @@ static int do_ipv6_setsockopt(struct sock *sk, int level, int optname,
                 sock_prot_inuse_add(net, sk->sk_prot, -1);
                 sock_prot_inuse_add(net, &tcp_prot, 1);
                 local_bh_enable();
-                /* Paired with READ_ONCE(sk->sk_prot) in net/ipv6/af_inet6.c */
+                /* Paired with READ_ONCE(sk->sk_prot) in inet6_stream_ops */
                 WRITE_ONCE(sk->sk_prot, &tcp_prot);
                 /* Paired with READ_ONCE() in tcp_(get|set)sockopt() */
                 WRITE_ONCE(icsk->icsk_af_ops, &ipv4_specific);
                 sk->sk_socket->ops = &inet_stream_ops;
@@ -495,7 +495,7 @@ static int do_ipv6_setsockopt(struct sock *sk, int level, int optname,
                 sock_prot_inuse_add(net, sk->sk_prot, -1);
                 sock_prot_inuse_add(net, prot, 1);
                 local_bh_enable();
-                /* Paired with READ_ONCE(sk->sk_prot) in net/ipv6/af_inet6.c */
+                /* Paired with READ_ONCE(sk->sk_prot) in inet6_dgram_ops */
                 WRITE_ONCE(sk->sk_prot, prot);
                 sk->sk_socket->ops = &inet_dgram_ops;
                 sk->sk_family = PF_INET;
--
Gitee
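Circling back to the migrate_vma change at the start of this section: the sketch below shows how a ZONE_DEVICE driver's migrate_to_ram() callback is expected to hand the faulting page to the core once struct migrate_vma has a fault_page field. It mirrors the lib/test_hmm.c hunk earlier in the series; the mydev_* names and the elided copy step are invented for the example, so treat it as a template rather than real driver code.

#include <linux/migrate.h>
#include <linux/mm.h>

static void *mydev_pgmap_owner;	/* hypothetical owner cookie, matching the pgmap's owner */

static vm_fault_t mydev_migrate_to_ram(struct vm_fault *vmf)
{
	unsigned long src_pfn = 0, dst_pfn = 0;
	struct migrate_vma args = { 0 };	/* zero-init, as the series now does */

	args.vma         = vmf->vma;
	args.start       = vmf->address;
	args.end         = vmf->address + PAGE_SIZE;
	args.src         = &src_pfn;
	args.dst         = &dst_pfn;
	args.pgmap_owner = mydev_pgmap_owner;
	args.flags       = MIGRATE_VMA_SELECT_DEVICE_PRIVATE;
	args.fault_page  = vmf->page;	/* new field: lets the core accept the page's extra reference */

	if (migrate_vma_setup(&args))
		return VM_FAULT_SIGBUS;

	/* ... allocate a system page, copy the device data, fill dst_pfn ... */

	migrate_vma_pages(&args);
	migrate_vma_finalize(&args);
	return 0;
}

With do_swap_page() now holding a reference on vmf->page across this callback, migrate_vma_check_page() tolerates exactly one extra reference on the fault page, which is why the driver passes it through fault_page instead of leaving the field zeroed.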