From 5cfe6d260473e239b776ef1584cb39dd96a1dfab Mon Sep 17 00:00:00 2001
From: David Hildenbrand
Date: Sat, 7 Jun 2025 16:41:39 +0800
Subject: [PATCH 1/5] mm/migrate: use folio_likely_mapped_shared() in
 add_page_for_migration()

mainline inclusion
from mainline-v6.10-rc1
commit 31ce0d7ef8412e3f38cf90bb90b7436130c60614
category: bugfix
bugzilla: https://gitee.com/src-openeuler/kernel/issues/IC2YHO
CVE: NA

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=31ce0d7ef8412e3f38cf90bb90b7436130c60614

-------------------------------------------

We want to limit the use of page_mapcount() to the places where it is
absolutely necessary.

In add_page_for_migration(), we actually want to check if the folio is
mapped shared, to reject such folios. So let's use
folio_likely_mapped_shared() instead.

For small folios, fully mapped THP, and hugetlb folios, there is no
change. For partially mapped, shared THP, we should now do a better job
at rejecting such folios.

Link: https://lkml.kernel.org/r/20240409192301.907377-12-david@redhat.com
Signed-off-by: David Hildenbrand
Cc: Chris Zankel
Cc: Hugh Dickins
Cc: John Paul Adrian Glaubitz
Cc: Jonathan Corbet
Cc: Matthew Wilcox (Oracle)
Cc: Max Filippov
Cc: Miaohe Lin
Cc: Muchun Song
Cc: Naoya Horiguchi
Cc: Peter Xu
Cc: Richard Chang
Cc: Rich Felker
Cc: Ryan Roberts
Cc: Yang Shi
Cc: Yin Fengwei
Cc: Yoshinori Sato
Cc: Zi Yan
Signed-off-by: Andrew Morton
Signed-off-by: Jinjiang Tu
---
 mm/migrate.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mm/migrate.c b/mm/migrate.c
index 1462848ce385..710fb5610fd3 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -2178,7 +2178,7 @@ static int add_page_for_migration(struct mm_struct *mm, const void __user *p,
 		goto out_putfolio;
 
 	err = -EACCES;
-	if (page_mapcount(page) > 1 && !migrate_all)
+	if (folio_likely_mapped_shared(folio) && !migrate_all)
 		goto out_putfolio;
 
 	err = -EBUSY;
-- 
Gitee

From 0c29c18aa0f525089c23bec1ea4e8d4d0d84806e Mon Sep 17 00:00:00 2001
From: Peter Xu
Date: Sat, 7 Jun 2025 16:41:40 +0800
Subject: [PATCH 2/5] mm: introduce vma_pgtable_walk_{begin|end}()

mainline inclusion
from mainline-v6.10-rc1
commit 239e9a90c887d33e9339870a650d2bd621b95674
category: bugfix
bugzilla: https://gitee.com/src-openeuler/kernel/issues/IC2YHO
CVE: NA

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=239e9a90c887d33e9339870a650d2bd621b95674

-------------------------------------------

Introduce per-vma begin()/end() helpers for pgtable walks. This is a
preparation work to merge hugetlb pgtable walkers with generic mm.

The helpers need to be called before and after a pgtable walk; they will
start to be needed once the pgtable walker code supports hugetlb pages.
It's a hook point for any type of VMA, but for now only hugetlb uses it
to stabilize the pgtable pages from getting away (due to possible pmd
unsharing).

Link: https://lkml.kernel.org/r/20240327152332.950956-5-peterx@redhat.com
Signed-off-by: Peter Xu
Reviewed-by: Christoph Hellwig
Reviewed-by: Muchun Song
Tested-by: Ryan Roberts
Cc: Andrea Arcangeli
Cc: Andrew Jones
Cc: Aneesh Kumar K.V (IBM)
Cc: Axel Rasmussen
Cc: Christophe Leroy
Cc: David Hildenbrand
Cc: James Houghton
Cc: Jason Gunthorpe
Cc: John Hubbard
Cc: Kirill A. Shutemov
Cc: Lorenzo Stoakes
Cc: Matthew Wilcox (Oracle)
Cc: Michael Ellerman
Cc: "Mike Rapoport (IBM)"
Cc: Rik van Riel
Cc: Vlastimil Babka
Cc: Yang Shi
Signed-off-by: Andrew Morton
Conflicts:
	include/linux/mm.h
	mm/migrate.c
[Context conflicts.]
Signed-off-by: Jinjiang Tu
---
 include/linux/mm.h |  3 +++
 mm/memory.c        | 12 ++++++++++++
 mm/migrate.c       |  1 +
 3 files changed, 16 insertions(+)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index b974880cf283..d24d6115a9bf 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -4205,6 +4205,9 @@ static inline void accept_memory(phys_addr_t start, phys_addr_t end)
 #endif
 
+void vma_pgtable_walk_begin(struct vm_area_struct *vma);
+void vma_pgtable_walk_end(struct vm_area_struct *vma);
+
 /* added to mm.h to avoid every caller adding new header file */
 #include
diff --git a/mm/memory.c b/mm/memory.c
index abc7f1645831..c76a1dae75d9 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -6863,3 +6863,15 @@ void ptlock_free(struct ptdesc *ptdesc)
 	kmem_cache_free(page_ptl_cachep, ptdesc->ptl);
 }
 #endif
+
+void vma_pgtable_walk_begin(struct vm_area_struct *vma)
+{
+	if (is_vm_hugetlb_page(vma))
+		hugetlb_vma_lock_read(vma);
+}
+
+void vma_pgtable_walk_end(struct vm_area_struct *vma)
+{
+	if (is_vm_hugetlb_page(vma))
+		hugetlb_vma_unlock_read(vma);
+}
diff --git a/mm/migrate.c b/mm/migrate.c
index 710fb5610fd3..26fad79380b1 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -51,6 +51,7 @@
 #include
 #include
 #include
+#include
 #include
-- 
Gitee

From eb6d298e49a22a61e4a25377329a81445bed4611 Mon Sep 17 00:00:00 2001
From: David Hildenbrand
Date: Sat, 7 Jun 2025 16:41:41 +0800
Subject: [PATCH 3/5] mm/pagewalk: introduce folio_walk_start() +
 folio_walk_end()

mainline inclusion
from mainline-v6.12-rc1
commit aa39ca6940f1a0540f2984051b3089972f42959b
category: bugfix
bugzilla: https://gitee.com/src-openeuler/kernel/issues/IC2YHO
CVE: NA

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=aa39ca6940f1a0540f2984051b3089972f42959b

-------------------------------------------

We want to get rid of follow_page(), and have a more reasonable way to
just lookup a folio mapped at a certain address, perform some checks
while still under PTL, and then only conditionally grab a folio
reference if really required.

Further, we might want to get rid of some walk_page_range*() users that
really only want to temporarily lookup a single folio at a single
address.

So let's add a new page table walker that does exactly that, similarly
to GUP also being able to walk hugetlb VMAs.

Add folio_walk_end() as a macro for now: the compiler is not easy to
please with the pte_unmap()->kunmap_local().

Note that one difference between follow_page() and get_user_pages(1) is
that follow_page() will not trigger faults to get something mapped. So
folio_walk is at least currently not a replacement for
get_user_pages(1), but could likely be extended/reused to achieve
something similar in the future.
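
For illustration, the intended calling convention looks roughly like
this (a sketch only, not part of this patch; the caller context and the
want_to_keep() predicate are hypothetical):

	struct folio_walk fw;
	struct folio *folio;

	/* folio_walk_start() asserts that the mmap lock is held. */
	folio = folio_walk_start(&fw, vma, addr, 0);
	if (folio) {
		/*
		 * The PTL is held here: fw.page, fw.level and the entry
		 * values stay stable until folio_walk_end().
		 */
		if (want_to_keep(folio))
			folio_get(folio);	/* short-term reference only */
		folio_walk_end(&fw, vma);
	}

Any use of the folio after folio_walk_end() requires such a previously
taken short-term reference.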
Link: https://lkml.kernel.org/r/20240802155524.517137-3-david@redhat.com
Signed-off-by: David Hildenbrand
Cc: Alexander Gordeev
Cc: Christian Borntraeger
Cc: Claudio Imbrenda
Cc: Gerald Schaefer
Cc: Heiko Carstens
Cc: Janosch Frank
Cc: Jonathan Corbet
Cc: Matthew Wilcox
Cc: Sven Schnelle
Cc: Vasily Gorbik
Cc: Ryan Roberts
Cc: Zi Yan
Signed-off-by: Andrew Morton
Signed-off-by: Jinjiang Tu
---
 include/linux/pagewalk.h |  58 +++++++++++
 mm/pagewalk.c            | 202 +++++++++++++++++++++++++++++++++++++++
 2 files changed, 260 insertions(+)

diff --git a/include/linux/pagewalk.h b/include/linux/pagewalk.h
index 27cd1e59ccf7..f5eb5a32aeed 100644
--- a/include/linux/pagewalk.h
+++ b/include/linux/pagewalk.h
@@ -130,4 +130,62 @@ int walk_page_mapping(struct address_space *mapping, pgoff_t first_index,
 		      pgoff_t nr, const struct mm_walk_ops *ops,
 		      void *private);
 
+typedef int __bitwise folio_walk_flags_t;
+
+/*
+ * Walk migration entries as well. Careful: a large folio might get split
+ * concurrently.
+ */
+#define FW_MIGRATION		((__force folio_walk_flags_t)BIT(0))
+
+/* Walk shared zeropages (small + huge) as well. */
+#define FW_ZEROPAGE		((__force folio_walk_flags_t)BIT(1))
+
+enum folio_walk_level {
+	FW_LEVEL_PTE,
+	FW_LEVEL_PMD,
+	FW_LEVEL_PUD,
+};
+
+/**
+ * struct folio_walk - folio_walk_start() / folio_walk_end() data
+ * @page:	exact folio page referenced (if applicable)
+ * @level:	page table level identifying the entry type
+ * @pte:	pointer to the page table entry (FW_LEVEL_PTE).
+ * @pmd:	pointer to the page table entry (FW_LEVEL_PMD).
+ * @pud:	pointer to the page table entry (FW_LEVEL_PUD).
+ * @ptl:	pointer to the page table lock.
+ *
+ * (see folio_walk_start() documentation for more details)
+ */
+struct folio_walk {
+	/* public */
+	struct page *page;
+	enum folio_walk_level level;
+	union {
+		pte_t *ptep;
+		pud_t *pudp;
+		pmd_t *pmdp;
+	};
+	union {
+		pte_t pte;
+		pud_t pud;
+		pmd_t pmd;
+	};
+	/* private */
+	struct vm_area_struct *vma;
+	spinlock_t *ptl;
+};
+
+struct folio *folio_walk_start(struct folio_walk *fw,
+		struct vm_area_struct *vma, unsigned long addr,
+		folio_walk_flags_t flags);
+
+#define folio_walk_end(__fw, __vma) do { \
+	spin_unlock((__fw)->ptl); \
+	if (likely((__fw)->level == FW_LEVEL_PTE)) \
+		pte_unmap((__fw)->ptep); \
+	vma_pgtable_walk_end(__vma); \
+} while (0)
+
 #endif /* _LINUX_PAGEWALK_H */
diff --git a/mm/pagewalk.c b/mm/pagewalk.c
index 144c9fed0fc9..cb6982ec823a 100644
--- a/mm/pagewalk.c
+++ b/mm/pagewalk.c
@@ -3,6 +3,8 @@
 #include
 #include
 #include
+#include
+#include
 
 /*
  * We want to know the real level where a entry is located ignoring any
@@ -677,3 +679,203 @@ int walk_page_mapping(struct address_space *mapping, pgoff_t first_index,
 
 	return err;
 }
+
+/**
+ * folio_walk_start - walk the page tables to a folio
+ * @fw: filled with information on success.
+ * @vma: the VMA.
+ * @addr: the virtual address to use for the page table walk.
+ * @flags: flags modifying which folios to walk to.
+ *
+ * Walk the page tables using @addr in a given @vma to a mapped folio and
+ * return the folio, making sure that the page table entry referenced by
+ * @addr cannot change until folio_walk_end() was called.
+ *
+ * As default, this function returns only folios that are not special (e.g., not
+ * the zeropage) and never returns folios that are supposed to be ignored by the
+ * VM as documented by vm_normal_page(). If requested, zeropages will be
+ * returned as well.
+ *
+ * As default, this function only considers present page table entries.
+ * If requested, it will also consider migration entries.
+ *
+ * If this function returns NULL it might either indicate "there is nothing" or
+ * "there is nothing suitable".
+ *
+ * On success, @fw is filled and the function returns the folio while the PTL
+ * is still held and folio_walk_end() must be called to clean up,
+ * releasing any held locks. The returned folio must *not* be used after the
+ * call to folio_walk_end(), unless a short-term folio reference is taken before
+ * that call.
+ *
+ * @fw->page will correspond to the page that is effectively referenced by
+ * @addr. However, for migration entries and shared zeropages @fw->page is
+ * set to NULL. Note that large folios might be mapped by multiple page table
+ * entries, and this function will always only lookup a single entry as
+ * specified by @addr, which might or might not cover more than a single page of
+ * the returned folio.
+ *
+ * This function must *not* be used as a naive replacement for
+ * get_user_pages() / pin_user_pages(), especially not to perform DMA or
+ * to carelessly modify page content. This function may *only* be used to grab
+ * short-term folio references, never to grab long-term folio references.
+ *
+ * Using the page table entry pointers in @fw for reading or modifying the
+ * entry should be avoided where possible: however, there might be valid
+ * use cases.
+ *
+ * WARNING: Modifying page table entries in hugetlb VMAs requires a lot of care.
+ * For example, PMD page table sharing might require prior unsharing. Also,
+ * logical hugetlb entries might span multiple physical page table entries,
+ * which *must* be modified in a single operation (set_huge_pte_at(),
+ * huge_ptep_set_*, ...). Note that the page table entry stored in @fw might
+ * not correspond to the first physical entry of a logical hugetlb entry.
+ *
+ * The mmap lock must be held in read mode.
+ *
+ * Return: folio pointer on success, otherwise NULL.
+ */
+struct folio *folio_walk_start(struct folio_walk *fw,
+		struct vm_area_struct *vma, unsigned long addr,
+		folio_walk_flags_t flags)
+{
+	unsigned long entry_size;
+	bool expose_page = true;
+	struct page *page;
+	pud_t *pudp, pud;
+	pmd_t *pmdp, pmd;
+	pte_t *ptep, pte;
+	spinlock_t *ptl;
+	pgd_t *pgdp;
+	p4d_t *p4dp;
+
+	mmap_assert_locked(vma->vm_mm);
+	vma_pgtable_walk_begin(vma);
+
+	if (WARN_ON_ONCE(addr < vma->vm_start || addr >= vma->vm_end))
+		goto not_found;
+
+	pgdp = pgd_offset(vma->vm_mm, addr);
+	if (pgd_none_or_clear_bad(pgdp))
+		goto not_found;
+
+	p4dp = p4d_offset(pgdp, addr);
+	if (p4d_none_or_clear_bad(p4dp))
+		goto not_found;
+
+	pudp = pud_offset(p4dp, addr);
+	pud = pudp_get(pudp);
+	if (pud_none(pud))
+		goto not_found;
+	if (IS_ENABLED(CONFIG_PGTABLE_HAS_HUGE_LEAVES) && pud_leaf(pud)) {
+		ptl = pud_lock(vma->vm_mm, pudp);
+		pud = pudp_get(pudp);
+
+		entry_size = PUD_SIZE;
+		fw->level = FW_LEVEL_PUD;
+		fw->pudp = pudp;
+		fw->pud = pud;
+
+		if (!pud_present(pud) || pud_devmap(pud)) {
+			spin_unlock(ptl);
+			goto not_found;
+		} else if (!pud_leaf(pud)) {
+			spin_unlock(ptl);
+			goto pmd_table;
+		}
+		/*
+		 * TODO: vm_normal_page_pud() will be handy once we want to
+		 * support PUD mappings in VM_PFNMAP|VM_MIXEDMAP VMAs.
+		 */
+		page = pud_page(pud);
+		goto found;
+	}
+
+pmd_table:
+	VM_WARN_ON_ONCE(pud_leaf(*pudp));
+	pmdp = pmd_offset(pudp, addr);
+	pmd = pmdp_get_lockless(pmdp);
+	if (pmd_none(pmd))
+		goto not_found;
+	if (IS_ENABLED(CONFIG_PGTABLE_HAS_HUGE_LEAVES) && pmd_leaf(pmd)) {
+		ptl = pmd_lock(vma->vm_mm, pmdp);
+		pmd = pmdp_get(pmdp);
+
+		entry_size = PMD_SIZE;
+		fw->level = FW_LEVEL_PMD;
+		fw->pmdp = pmdp;
+		fw->pmd = pmd;
+
+		if (pmd_none(pmd)) {
+			spin_unlock(ptl);
+			goto not_found;
+		} else if (!pmd_leaf(pmd)) {
+			spin_unlock(ptl);
+			goto pte_table;
+		} else if (pmd_present(pmd)) {
+			page = vm_normal_page_pmd(vma, addr, pmd);
+			if (page) {
+				goto found;
+			} else if ((flags & FW_ZEROPAGE) &&
+				    is_huge_zero_pmd(pmd)) {
+				page = pfn_to_page(pmd_pfn(pmd));
+				expose_page = false;
+				goto found;
+			}
+		} else if ((flags & FW_MIGRATION) &&
+			   is_pmd_migration_entry(pmd)) {
+			swp_entry_t entry = pmd_to_swp_entry(pmd);
+
+			page = pfn_swap_entry_to_page(entry);
+			expose_page = false;
+			goto found;
+		}
+		spin_unlock(ptl);
+		goto not_found;
+	}
+
+pte_table:
+	VM_WARN_ON_ONCE(pmd_leaf(pmdp_get_lockless(pmdp)));
+	ptep = pte_offset_map_lock(vma->vm_mm, pmdp, addr, &ptl);
+	if (!ptep)
+		goto not_found;
+	pte = ptep_get(ptep);
+
+	entry_size = PAGE_SIZE;
+	fw->level = FW_LEVEL_PTE;
+	fw->ptep = ptep;
+	fw->pte = pte;
+
+	if (pte_present(pte)) {
+		page = vm_normal_page(vma, addr, pte);
+		if (page)
+			goto found;
+		if ((flags & FW_ZEROPAGE) &&
+		    is_zero_pfn(pte_pfn(pte))) {
+			page = pfn_to_page(pte_pfn(pte));
+			expose_page = false;
+			goto found;
+		}
+	} else if (!pte_none(pte)) {
+		swp_entry_t entry = pte_to_swp_entry(pte);
+
+		if ((flags & FW_MIGRATION) &&
+		    is_migration_entry(entry)) {
+			page = pfn_swap_entry_to_page(entry);
+			expose_page = false;
+			goto found;
+		}
+	}
+	pte_unmap_unlock(ptep, ptl);
+not_found:
+	vma_pgtable_walk_end(vma);
+	return NULL;
+found:
+	if (expose_page)
+		/* Note: Offset from the mapped page, not the folio start. */
+		fw->page = nth_page(page, (addr & (entry_size - 1)) >> PAGE_SHIFT);
+	else
+		fw->page = NULL;
+	fw->ptl = ptl;
+	return page_folio(page);
+}
-- 
Gitee

From ec3b8029bf8898fc6e09870e4036aeb61f4fef52 Mon Sep 17 00:00:00 2001
From: David Hildenbrand
Date: Sat, 7 Jun 2025 16:41:42 +0800
Subject: [PATCH 4/5] mm/pagewalk: fix usage of pmd_leaf()/pud_leaf() without
 present check

mainline inclusion
from mainline-v6.12-rc6
commit 7c18d4811000945677a8531e89de3e17582e8a36
category: bugfix
bugzilla: https://gitee.com/src-openeuler/kernel/issues/IC2YHO
CVE: NA

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=7c18d4811000945677a8531e89de3e17582e8a36

-------------------------------------------

pmd_leaf()/pud_leaf() only implies a pmd_present()/pud_present() check
on some architectures. We really should check for
pmd_present()/pud_present() first.

This should explain the report we got on ppc64 (which has
CONFIG_PGTABLE_HAS_HUGE_LEAVES set in the config) that triggered:

	VM_WARN_ON_ONCE(pmd_leaf(pmdp_get_lockless(pmdp)));

Likely we had a PMD migration entry for which pmd_leaf() did not
trigger. We raced with restoring the PMD migration entry, and suddenly
saw a pmd_leaf(). In this case, pte_offset_map_lock() saved us from more
trouble, because it rechecks the PMD value, but we would not have
processed the migration entry -- which is not too bad because the only
user of FW_MIGRATION is KSM for unsharing, and KSM only applies to small
folios.
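
In other words, the corrected pattern (an illustrative reduction of the
hunks below, not additional code) is to never trust pmd_leaf() on a
possibly non-present entry, and to send every non-present entry through
the locked re-check instead:

	pmd = pmdp_get_lockless(pmdp);
	if (pmd_none(pmd))
		goto not_found;
	if (IS_ENABLED(CONFIG_PGTABLE_HAS_HUGE_LEAVES) &&
	    (!pmd_present(pmd) || pmd_leaf(pmd))) {
		/* take the PMD lock, re-read and classify under it */
	}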
Further, we shouldn't re-read the PMD/PUD value for our warning: the
primary purpose of the VM_WARN_ON_ONCE() is to find spurious use of
pmd_leaf()/pud_leaf() without CONFIG_PGTABLE_HAS_HUGE_LEAVES.

As a side note, we are currently not implementing FW_MIGRATION support
for PUD migration entries, which likely should exist due to hugetlb.
Add a TODO so this won't fall through the cracks if more FW_MIGRATION
users get added.

Was able to write a quick reproducer and verify that the issue no longer
triggers with this fix.

https://gitlab.com/davidhildenbrand/scratchspace/-/blob/main/reproducers/move-pages-pmd-leaf.c

Without this fix after a couple of seconds in a VM with 2 NUMA nodes:

[   54.333753] ------------[ cut here ]------------
[   54.334901] WARNING: CPU: 20 PID: 1704 at mm/pagewalk.c:815 folio_walk_start+0x48f/0x6e0
[   54.336455] Modules linked in: ...
[   54.345009] CPU: 20 UID: 0 PID: 1704 Comm: move-pages-pmd- Not tainted 6.12.0-rc2+ #81
[   54.346529] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.16.3-2.fc40 04/01/2014
[   54.348191] RIP: 0010:folio_walk_start+0x48f/0x6e0
[   54.349134] Code: b5 ad 48 8d 35 00 00 00 00 e8 6d 59 d7 ff e8 08 74 da ff e9 9c fe ff ff 4c 8b 7c 24 08 4c 89 ff e8 26 2b be 00 e9 8a fe ff ff <0f> 0b e9 ec fe ff ff f7 c2 ff 0f 00 00 0f 85 81 fe ff ff 48 8b 02
[   54.352660] RSP: 0018:ffffb7e4c430bc78 EFLAGS: 00010282
[   54.353679] RAX: 80000002a3e008e7 RBX: ffff9946039aa580 RCX: ffff994380000000
[   54.355056] RDX: ffff994606aec000 RSI: 00007f004b000000 RDI: 0000000000000000
[   54.356440] RBP: 00007f004b000000 R08: 0000000000000591 R09: 0000000000000001
[   54.357820] R10: 0000000000000200 R11: 0000000000000001 R12: ffffb7e4c430bd10
[   54.359198] R13: ffff994606aec2c0 R14: 0000000000000002 R15: ffff994604a89b00
[   54.360564] FS: 00007f004ae006c0(0000) GS:ffff9947f7400000(0000) knlGS:0000000000000000
[   54.362111] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[   54.363242] CR2: 00007f004adffe58 CR3: 0000000281e12005 CR4: 0000000000770ef0
[   54.364615] PKRU: 55555554
[   54.365153] Call Trace:
[   54.365646]  <TASK>
[   54.366073]  ? __warn.cold+0xb7/0x14d
[   54.366796]  ? folio_walk_start+0x48f/0x6e0
[   54.367628]  ? report_bug+0xff/0x140
[   54.368324]  ? handle_bug+0x58/0x90
[   54.369019]  ? exc_invalid_op+0x17/0x70
[   54.369771]  ? asm_exc_invalid_op+0x1a/0x20
[   54.370606]  ? folio_walk_start+0x48f/0x6e0
[   54.371415]  ? folio_walk_start+0x9e/0x6e0
[   54.372227]  do_pages_move+0x1c5/0x680
[   54.372972]  kernel_move_pages+0x1a1/0x2b0
[   54.373804]  __x64_sys_move_pages+0x25/0x30

Link: https://lkml.kernel.org/r/20241015111236.1290921-1-david@redhat.com
Fixes: aa39ca6940f1 ("mm/pagewalk: introduce folio_walk_start() + folio_walk_end()")
Signed-off-by: David Hildenbrand
Reported-by: syzbot+7d917f67c05066cec295@syzkaller.appspotmail.com
Closes: https://lkml.kernel.org/r/670d3248.050a0220.3e960.0064.GAE@google.com
Acked-by: Kirill A. Shutemov
Acked-by: Qi Zheng
Cc: Jann Horn
Signed-off-by: Andrew Morton
Conflicts:
	mm/pagewalk.c
[Context conflicts.]
Signed-off-by: Jinjiang Tu
---
 mm/pagewalk.c | 16 +++++++++++-----
 1 file changed, 11 insertions(+), 5 deletions(-)

diff --git a/mm/pagewalk.c b/mm/pagewalk.c
index cb6982ec823a..e439cf74fb8e 100644
--- a/mm/pagewalk.c
+++ b/mm/pagewalk.c
@@ -767,7 +767,8 @@ struct folio *folio_walk_start(struct folio_walk *fw,
 	pud = pudp_get(pudp);
 	if (pud_none(pud))
 		goto not_found;
-	if (IS_ENABLED(CONFIG_PGTABLE_HAS_HUGE_LEAVES) && pud_leaf(pud)) {
+	if (IS_ENABLED(CONFIG_PGTABLE_HAS_HUGE_LEAVES) &&
+	    (!pud_present(pud) || pud_leaf(pud))) {
 		ptl = pud_lock(vma->vm_mm, pudp);
 		pud = pudp_get(pudp);
 
@@ -776,6 +777,10 @@ struct folio *folio_walk_start(struct folio_walk *fw,
 		fw->pudp = pudp;
 		fw->pud = pud;
 
+		/*
+		 * TODO: FW_MIGRATION support for PUD migration entries
+		 * once there are relevant users.
+		 */
 		if (!pud_present(pud) || pud_devmap(pud)) {
 			spin_unlock(ptl);
 			goto not_found;
@@ -792,12 +797,13 @@ struct folio *folio_walk_start(struct folio_walk *fw,
 	}
 
 pmd_table:
-	VM_WARN_ON_ONCE(pud_leaf(*pudp));
+	VM_WARN_ON_ONCE(!pud_present(pud) || pud_leaf(pud));
 	pmdp = pmd_offset(pudp, addr);
 	pmd = pmdp_get_lockless(pmdp);
 	if (pmd_none(pmd))
 		goto not_found;
-	if (IS_ENABLED(CONFIG_PGTABLE_HAS_HUGE_LEAVES) && pmd_leaf(pmd)) {
+	if (IS_ENABLED(CONFIG_PGTABLE_HAS_HUGE_LEAVES) &&
+	    (!pmd_present(pmd) || pmd_leaf(pmd))) {
 		ptl = pmd_lock(vma->vm_mm, pmdp);
 		pmd = pmdp_get(pmdp);
 
@@ -809,7 +815,7 @@ struct folio *folio_walk_start(struct folio_walk *fw,
 		if (pmd_none(pmd)) {
 			spin_unlock(ptl);
 			goto not_found;
-		} else if (!pmd_leaf(pmd)) {
+		} else if (pmd_present(pmd) && !pmd_leaf(pmd)) {
 			spin_unlock(ptl);
 			goto pte_table;
 		} else if (pmd_present(pmd)) {
@@ -835,7 +841,7 @@ struct folio *folio_walk_start(struct folio_walk *fw,
 	}
 
 pte_table:
-	VM_WARN_ON_ONCE(pmd_leaf(pmdp_get_lockless(pmdp)));
+	VM_WARN_ON_ONCE(!pmd_present(pmd) || pmd_leaf(pmd));
 	ptep = pte_offset_map_lock(vma->vm_mm, pmdp, addr, &ptl);
 	if (!ptep)
 		goto not_found;
-- 
Gitee

From b5d854c9a1bc7c1fc35a6d23b450fa14a7802703 Mon Sep 17 00:00:00 2001
From: David Hildenbrand
Date: Sat, 7 Jun 2025 16:41:43 +0800
Subject: [PATCH 5/5] mm/migrate: convert add_page_for_migration() from
 follow_page() to folio_walk

mainline inclusion
from mainline-v6.12-rc1
commit 7dff875c9436a9df2f93cf59a32630761565af99
category: bugfix
bugzilla: https://gitee.com/src-openeuler/kernel/issues/IC2YHO
CVE: NA

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=7dff875c9436a9df2f93cf59a32630761565af99

-------------------------------------------

Let's use folio_walk instead, so we can avoid taking a folio reference
when we won't even be trying to migrate the folio and to get rid of
another follow_page()/FOLL_DUMP user. Use FW_ZEROPAGE so we can return
"-EFAULT" for it as documented.

We now perform the folio_likely_mapped_shared() check under PTL, which
is what we want: relying on the mapcount and friends after dropping the
PTL does not make too much sense, as the page can get unmapped
concurrently from this process.

Further, we perform the folio isolation under PTL, similar to how we
handle it for MADV_PAGEOUT.

The possible return values for follow_page() were confusing, especially
with FOLL_DUMP set. We'll handle it as documented in the man page:

 * -EFAULT: This is a zero page or the memory area is not mapped by the
   process.
 * -ENOENT: The page is not present.

We'll keep setting -ENOENT for ZONE_DEVICE. Maybe not the right thing
to do, but it likely doesn't really matter (just like for weird devmap,
whereby we fake "not present").
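
For reference, these per-page status values can be observed from
userspace with a sketch along the following lines (illustrative only,
not part of this patch; assumes libnuma's <numaif.h>, error handling
elided):

	#include <numaif.h>		/* link with -lnuma */
	#include <sys/mman.h>
	#include <unistd.h>
	#include <stdio.h>

	int main(void)
	{
		long psz = sysconf(_SC_PAGESIZE);
		char *p = mmap(NULL, 2 * psz, PROT_READ | PROT_WRITE,
			       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
		void *pages[2] = { p, p + psz };
		int nodes[2] = { 0, 0 };
		int status[2];

		p[0] = 1;	/* fault in the first page only */
		move_pages(0, 2, pages, nodes, status, MPOL_MF_MOVE);
		/* Expect status[1] == -ENOENT: the second page was never
		   faulted in. An unmapped address would yield -EFAULT. */
		printf("%d %d\n", status[0], status[1]);
		return 0;
	}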
The other errors are left as is, and match the documentation in the man
page.

While at it, rename add_page_for_migration() to
add_folio_for_migration().

We'll lose the "secretmem" check, but that shouldn't really matter
because these folios cannot ever be migrated. Should vma_migratable()
refuse these VMAs? Maybe.

Link: https://lkml.kernel.org/r/20240802155524.517137-5-david@redhat.com
Signed-off-by: David Hildenbrand
Cc: Alexander Gordeev
Cc: Christian Borntraeger
Cc: Claudio Imbrenda
Cc: Gerald Schaefer
Cc: Heiko Carstens
Cc: Janosch Frank
Cc: Jonathan Corbet
Cc: Matthew Wilcox
Cc: Sven Schnelle
Cc: Vasily Gorbik
Cc: Ryan Roberts
Cc: Zi Yan
Signed-off-by: Andrew Morton
Conflicts:
	mm/migrate.c
[is_huge_zero_folio() isn't introduced.]
Signed-off-by: Jinjiang Tu
---
 mm/migrate.c | 100 +++++++++++++++++++++++----------------------------
 1 file changed, 45 insertions(+), 55 deletions(-)

diff --git a/mm/migrate.c b/mm/migrate.c
index 26fad79380b1..d2f0b03fe935 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -2133,76 +2133,66 @@ static int do_move_pages_to_node(struct mm_struct *mm,
 	return err;
 }
 
+static int __add_folio_for_migration(struct folio *folio, int node,
+		struct list_head *pagelist, bool migrate_all)
+{
+	if (is_zero_folio(folio) || is_huge_zero_page(&folio->page))
+		return -EFAULT;
+
+	if (folio_is_zone_device(folio))
+		return -ENOENT;
+
+	if (folio_nid(folio) == node)
+		return 0;
+
+	if (folio_likely_mapped_shared(folio) && !migrate_all)
+		return -EACCES;
+
+	if (folio_test_hugetlb(folio)) {
+		if (isolate_hugetlb(folio, pagelist))
+			return 1;
+	} else if (folio_isolate_lru(folio)) {
+		list_add_tail(&folio->lru, pagelist);
+		node_stat_mod_folio(folio,
+			NR_ISOLATED_ANON + folio_is_file_lru(folio),
+			folio_nr_pages(folio));
+		return 1;
+	}
+	return -EBUSY;
+}
+
 /*
- * Resolves the given address to a struct page, isolates it from the LRU and
+ * Resolves the given address to a struct folio, isolates it from the LRU and
  * puts it to the given pagelist.
  * Returns:
- *     errno - if the page cannot be found/isolated
+ *     errno - if the folio cannot be found/isolated
  *     0 - when it doesn't have to be migrated because it is already on the
  *         target node
  *     1 - when it has been queued
  */
-static int add_page_for_migration(struct mm_struct *mm, const void __user *p,
+static int add_folio_for_migration(struct mm_struct *mm, const void __user *p,
 		int node, struct list_head *pagelist, bool migrate_all)
 {
 	struct vm_area_struct *vma;
-	unsigned long addr;
-	struct page *page;
+	struct folio_walk fw;
 	struct folio *folio;
-	int err;
+	unsigned long addr;
+	int err = -EFAULT;
 
 	mmap_read_lock(mm);
 	addr = (unsigned long)untagged_addr_remote(mm, p);
 
-	err = -EFAULT;
 	vma = vma_lookup(mm, addr);
-	if (!vma || !vma_migratable(vma))
-		goto out;
-
-	/* FOLL_DUMP to ignore special (like zero) pages */
-	page = follow_page(vma, addr, FOLL_GET | FOLL_DUMP);
-
-	err = PTR_ERR(page);
-	if (IS_ERR(page))
-		goto out;
-
-	err = -ENOENT;
-	if (!page)
-		goto out;
-
-	folio = page_folio(page);
-	if (folio_is_zone_device(folio))
-		goto out_putfolio;
-
-	err = 0;
-	if (folio_nid(folio) == node)
-		goto out_putfolio;
-
-	err = -EACCES;
-	if (folio_likely_mapped_shared(folio) && !migrate_all)
-		goto out_putfolio;
-
-	err = -EBUSY;
-	if (folio_test_hugetlb(folio)) {
-		if (isolate_hugetlb(folio, pagelist))
-			err = 1;
-	} else {
-		if (!folio_isolate_lru(folio))
-			goto out_putfolio;
-
-		err = 1;
-		list_add_tail(&folio->lru, pagelist);
-		node_stat_mod_folio(folio,
-			NR_ISOLATED_ANON + folio_is_file_lru(folio),
-			folio_nr_pages(folio));
+	if (vma && vma_migratable(vma)) {
+		folio = folio_walk_start(&fw, vma, addr, FW_ZEROPAGE);
+		if (folio) {
+			err = __add_folio_for_migration(folio, node, pagelist,
+							migrate_all);
+			folio_walk_end(&fw, vma);
+		} else {
+			err = -ENOENT;
+		}
 	}
-out_putfolio:
-	/*
-	 * Either remove the duplicate refcount from folio_isolate_lru()
-	 * or drop the folio ref if it was not isolated.
-	 */
-	folio_put(folio);
-out:
 	mmap_read_unlock(mm);
 	return err;
 }
@@ -2296,8 +2286,8 @@ static int do_pages_move(struct mm_struct *mm, nodemask_t task_nodes,
 		 * Errors in the page lookup or isolation are not fatal and we simply
 		 * report them via status
 		 */
-		err = add_page_for_migration(mm, p, current_node, &pagelist,
-				flags & MPOL_MF_MOVE_ALL);
+		err = add_folio_for_migration(mm, p, current_node, &pagelist,
+				flags & MPOL_MF_MOVE_ALL);
 
 		if (err > 0) {
 			/* The page is successfully queued for migration */
-- 
Gitee