From d9e1272d34deff30a23129067af94e8fca741001 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Mon, 2 Sep 2024 20:55:11 +0800 Subject: [PATCH 01/22] mm: ksm: use more folio api in ksm_might_need_to_copy() mainline inclusion from mainline-v6.8-rc1 commit 1486fb50136f4799946f5ecfe050094574647153 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/IAJ5MT Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=1486fb50136f4799946f5ecfe050094574647153 -------------------------------- Patch series "mm: cleanup and use more folio in page fault", v3. Rename page_copy_prealloc() to folio_prealloc(), which is used by more functions, and do more folio conversion in the page fault path. This patch (of 5): Since KSM only supports normal pages, and there is no swap-out/in of large KSM folios either, add a large folio check in ksm_might_need_to_copy(); also convert page->index to folio->index, as page->index is going away. Then convert ksm_might_need_to_copy() to use more of the folio API, saving nine compound_head() calls, and shorten 'address' to 'addr' to keep within the maximum line length. Link: https://lkml.kernel.org/r/20231118023232.1409103-1-wangkefeng.wang@huawei.com Link: https://lkml.kernel.org/r/20231118023232.1409103-2-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Cc: David Hildenbrand Cc: Matthew Wilcox (Oracle) Cc: Sidhartha Kumar Cc: Vishal Moola (Oracle) Signed-off-by: Andrew Morton [ Dep-of: 96db66d9c8f3 ] Signed-off-by: Liu Shixin --- include/linux/ksm.h | 4 ++-- mm/ksm.c | 39 +++++++++++++++++++++------------------ 2 files changed, 23 insertions(+), 20 deletions(-) diff --git a/include/linux/ksm.h b/include/linux/ksm.h index 49779f85e1da..2774e5c23500 100644 --- a/include/linux/ksm.h +++ b/include/linux/ksm.h @@ -77,7 +77,7 @@ static inline void ksm_exit(struct mm_struct *mm) * but what if the vma was unmerged while the page was swapped out?
*/ struct page *ksm_might_need_to_copy(struct page *page, - struct vm_area_struct *vma, unsigned long address); + struct vm_area_struct *vma, unsigned long addr); void rmap_walk_ksm(struct folio *folio, struct rmap_walk_control *rwc); void folio_migrate_ksm(struct folio *newfolio, struct folio *folio); @@ -128,7 +128,7 @@ static inline int ksm_madvise(struct vm_area_struct *vma, unsigned long start, } static inline struct page *ksm_might_need_to_copy(struct page *page, - struct vm_area_struct *vma, unsigned long address) + struct vm_area_struct *vma, unsigned long addr) { return page; } diff --git a/mm/ksm.c b/mm/ksm.c index e0a2f9d58db3..8ecbf01041d2 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -2794,48 +2794,51 @@ void __ksm_exit(struct mm_struct *mm) } struct page *ksm_might_need_to_copy(struct page *page, - struct vm_area_struct *vma, unsigned long address) + struct vm_area_struct *vma, unsigned long addr) { struct folio *folio = page_folio(page); struct anon_vma *anon_vma = folio_anon_vma(folio); - struct page *new_page; + struct folio *new_folio; - if (PageKsm(page)) { - if (page_stable_node(page) && + if (folio_test_large(folio)) + return page; + + if (folio_test_ksm(folio)) { + if (folio_stable_node(folio) && !(ksm_run & KSM_RUN_UNMERGE)) return page; /* no need to copy it */ } else if (!anon_vma) { return page; /* no need to copy it */ - } else if (page->index == linear_page_index(vma, address) && + } else if (folio->index == linear_page_index(vma, addr) && anon_vma->root == vma->anon_vma->root) { return page; /* still no need to copy it */ } if (PageHWPoison(page)) return ERR_PTR(-EHWPOISON); - if (!PageUptodate(page)) + if (!folio_test_uptodate(folio)) return page; /* let do_swap_page report the error */ - new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address); - if (new_page && - mem_cgroup_charge(page_folio(new_page), vma->vm_mm, GFP_KERNEL)) { - put_page(new_page); - new_page = NULL; + new_folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, vma, addr, false); + if (new_folio && + mem_cgroup_charge(new_folio, vma->vm_mm, GFP_KERNEL)) { + folio_put(new_folio); + new_folio = NULL; } - if (new_page) { - if (copy_mc_user_highpage(new_page, page, address, vma)) { - put_page(new_page); + if (new_folio) { + if (copy_mc_user_highpage(&new_folio->page, page, addr, vma)) { + folio_put(new_folio); memory_failure_queue(page_to_pfn(page), 0); return ERR_PTR(-EHWPOISON); } - SetPageDirty(new_page); - __SetPageUptodate(new_page); - __SetPageLocked(new_page); + folio_set_dirty(new_folio); + __folio_mark_uptodate(new_folio); + __folio_set_locked(new_folio); #ifdef CONFIG_SWAP count_vm_event(KSM_SWPIN_COPY); #endif } - return new_page; + return new_folio ? &new_folio->page : NULL; } void rmap_walk_ksm(struct folio *folio, struct rmap_walk_control *rwc) -- Gitee From 7eca1ddf9bb362991b486a1697d76e346e6d8566 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Mon, 2 Sep 2024 20:55:12 +0800 Subject: [PATCH 02/22] mm: memory: use a folio in validate_page_before_insert() mainline inclusion from mainline-v6.8-rc1 commit f8b6187d8dd98fd32fe393071f362a7b6beaad0a category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/IAJ5MT Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=f8b6187d8dd98fd32fe393071f362a7b6beaad0a -------------------------------- Use a folio in validate_page_before_insert() to save two compound_head() calls. 
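For reference, the helper reads as follows once the hunk below is applied (reconstructed here from the diff; the old PageAnon() and PageSlab() tests each resolved the head page internally, which the single page_folio() lookup now does once, hence the two compound_head() calls saved):

	static int validate_page_before_insert(struct page *page)
	{
		struct folio *folio = page_folio(page);

		if (folio_test_anon(folio) || folio_test_slab(folio) ||
		    page_has_type(page))
			return -EINVAL;
		flush_dcache_folio(folio);
		return 0;
	}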
Link: https://lkml.kernel.org/r/20231118023232.1409103-3-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Reviewed-by: Sidhartha Kumar Cc: David Hildenbrand Cc: Matthew Wilcox (Oracle) Cc: Vishal Moola (Oracle) Signed-off-by: Andrew Morton Signed-off-by: Liu Shixin --- mm/memory.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index bfc25fa206a2..3bd587be33c1 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1992,9 +1992,12 @@ pte_t *__get_locked_pte(struct mm_struct *mm, unsigned long addr, static int validate_page_before_insert(struct page *page) { - if (PageAnon(page) || PageSlab(page) || page_has_type(page)) + struct folio *folio = page_folio(page); + + if (folio_test_anon(folio) || folio_test_slab(folio) || + page_has_type(page)) return -EINVAL; - flush_dcache_page(page); + flush_dcache_folio(folio); return 0; } -- Gitee From cecce010d7e45d1f7c05a1e1579be70f74bbd818 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 2 Sep 2024 20:55:13 +0800 Subject: [PATCH 03/22] mm: convert ksm_might_need_to_copy() to work on folios mainline inclusion from mainline-v6.8-rc1 commit 96db66d9c8f3c1547325af01b1f328b85d6ee1b9 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/IAJ5MT Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=96db66d9c8f3c1547325af01b1f328b85d6ee1b9 -------------------------------- Patch series "Finish two folio conversions". Most callers of page_add_new_anon_rmap() and lru_cache_add_inactive_or_unevictable() have been converted to their folio equivalents, but there are still a few stragglers. There's a bit of preparatory work in ksm and unuse_pte(), but after that it's pretty mechanical. This patch (of 9): Accept a folio as an argument and return a folio result. Removes a call to compound_head() in do_swap_page(), and prevents folio & page from getting out of sync in unuse_pte(). Reviewed-by: David Hildenbrand [willy@infradead.org: fix smatch warning] Link: https://lkml.kernel.org/r/ZXnPtblC6A1IkyAB@casper.infradead.org [david@redhat.com: only adjust the page if the folio changed] Link: https://lkml.kernel.org/r/6a8f2110-fa91-4c10-9eae-88315309a6e3@redhat.com Link: https://lkml.kernel.org/r/20231211162214.2146080-1-willy@infradead.org Link: https://lkml.kernel.org/r/20231211162214.2146080-2-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: David Hildenbrand Signed-off-by: Andrew Morton [ Dep-of: f00f48436c78 ] Signed-off-by: Liu Shixin --- include/linux/ksm.h | 6 +++--- mm/ksm.c | 21 +++++++++++---------- mm/memory.c | 11 +++++++---- mm/swapfile.c | 8 +++++--- 4 files changed, 26 insertions(+), 20 deletions(-) diff --git a/include/linux/ksm.h b/include/linux/ksm.h index 2774e5c23500..d6225305fb7d 100644 --- a/include/linux/ksm.h +++ b/include/linux/ksm.h @@ -76,7 +76,7 @@ static inline void ksm_exit(struct mm_struct *mm) * We'd like to make this conditional on vma->vm_flags & VM_MERGEABLE, * but what if the vma was unmerged while the page was swapped out? 
*/ -struct page *ksm_might_need_to_copy(struct page *page, +struct folio *ksm_might_need_to_copy(struct folio *folio, struct vm_area_struct *vma, unsigned long addr); void rmap_walk_ksm(struct folio *folio, struct rmap_walk_control *rwc); @@ -127,10 +127,10 @@ static inline int ksm_madvise(struct vm_area_struct *vma, unsigned long start, return 0; } -static inline struct page *ksm_might_need_to_copy(struct page *page, +static inline struct folio *ksm_might_need_to_copy(struct folio *folio, struct vm_area_struct *vma, unsigned long addr) { - return page; + return folio; } static inline void rmap_walk_ksm(struct folio *folio, diff --git a/mm/ksm.c b/mm/ksm.c index 8ecbf01041d2..de0de7ba1d6b 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -2793,30 +2793,30 @@ void __ksm_exit(struct mm_struct *mm) trace_ksm_exit(mm); } -struct page *ksm_might_need_to_copy(struct page *page, +struct folio *ksm_might_need_to_copy(struct folio *folio, struct vm_area_struct *vma, unsigned long addr) { - struct folio *folio = page_folio(page); + struct page *page = folio_page(folio, 0); struct anon_vma *anon_vma = folio_anon_vma(folio); struct folio *new_folio; if (folio_test_large(folio)) - return page; + return folio; if (folio_test_ksm(folio)) { if (folio_stable_node(folio) && !(ksm_run & KSM_RUN_UNMERGE)) - return page; /* no need to copy it */ + return folio; /* no need to copy it */ } else if (!anon_vma) { - return page; /* no need to copy it */ + return folio; /* no need to copy it */ } else if (folio->index == linear_page_index(vma, addr) && anon_vma->root == vma->anon_vma->root) { - return page; /* still no need to copy it */ + return folio; /* still no need to copy it */ } if (PageHWPoison(page)) return ERR_PTR(-EHWPOISON); if (!folio_test_uptodate(folio)) - return page; /* let do_swap_page report the error */ + return folio; /* let do_swap_page report the error */ new_folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, vma, addr, false); if (new_folio && @@ -2825,9 +2825,10 @@ struct page *ksm_might_need_to_copy(struct page *page, new_folio = NULL; } if (new_folio) { - if (copy_mc_user_highpage(&new_folio->page, page, addr, vma)) { + if (copy_mc_user_highpage(folio_page(new_folio, 0), page, + addr, vma)) { folio_put(new_folio); - memory_failure_queue(page_to_pfn(page), 0); + memory_failure_queue(folio_pfn(folio), 0); return ERR_PTR(-EHWPOISON); } folio_set_dirty(new_folio); @@ -2838,7 +2839,7 @@ struct page *ksm_might_need_to_copy(struct page *page, #endif } - return new_folio ? &new_folio->page : NULL; + return new_folio; } void rmap_walk_ksm(struct folio *folio, struct rmap_walk_control *rwc) diff --git a/mm/memory.c b/mm/memory.c index 3bd587be33c1..e212cfdb5c5b 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -4122,15 +4122,18 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) * page->index of !PageKSM() pages would be nonlinear inside the * anon VMA -- PageKSM() is lost on actual swapout. 
*/ - page = ksm_might_need_to_copy(page, vma, vmf->address); - if (unlikely(!page)) { + folio = ksm_might_need_to_copy(folio, vma, vmf->address); + if (unlikely(!folio)) { ret = VM_FAULT_OOM; + folio = swapcache; goto out_page; - } else if (unlikely(PTR_ERR(page) == -EHWPOISON)) { + } else if (unlikely(folio == ERR_PTR(-EHWPOISON))) { ret = VM_FAULT_HWPOISON; + folio = swapcache; goto out_page; } - folio = page_folio(page); + if (folio != swapcache) + page = folio_page(folio, 0); /* * If we want to map a page that's in the swapcache writable, we diff --git a/mm/swapfile.c b/mm/swapfile.c index 941a98e7ed39..a677c16c48f4 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1897,11 +1897,13 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, int ret = 1; swapcache = page; - page = ksm_might_need_to_copy(page, vma, addr); - if (unlikely(!page)) + folio = ksm_might_need_to_copy(folio, vma, addr); + if (unlikely(!folio)) return -ENOMEM; - else if (unlikely(PTR_ERR(page) == -EHWPOISON)) + else if (unlikely(folio == ERR_PTR(-EHWPOISON))) hwpoisoned = true; + else + page = folio_file_page(folio, swp_offset(entry)); pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); if (unlikely(!pte || !pte_same_as_swp(ptep_get(pte), -- Gitee From 9361d4bf34a51fa3254e8816aa61be6d34078f56 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 2 Sep 2024 20:55:14 +0800 Subject: [PATCH 04/22] mm: convert unuse_pte() to use a folio throughout mainline inclusion from mainline-v6.8-rc1 commit f00f48436c789af659047d3c5d6f6d17e640634e category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/IAJ5MT Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=f00f48436c789af659047d3c5d6f6d17e640634e -------------------------------- Saves about eight calls to compound_head(). 
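The saving comes from the usual conversion pattern, sketched here (illustrative only, not a hunk from this patch): legacy accessors such as PageUptodate() and get_page() each resolve the head page internally, so caching the folio once up front avoids the repeated lookups.

	/* before: every accessor hides its own compound_head(page) */
	if (!PageUptodate(page))
		return;
	get_page(page);

	/* after: one page_folio(), then folio-native accessors */
	struct folio *folio = page_folio(page);

	if (!folio_test_uptodate(folio))
		return;
	folio_get(folio);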
Link: https://lkml.kernel.org/r/20231211162214.2146080-4-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton Conflicts: mm/swapfile.c [ Context conflicts with commit 28c18a4c489e and 8fc2546f8508 Dep-of: 15bde4abab73 ] Signed-off-by: Liu Shixin --- mm/swapfile.c | 47 +++++++++++++++++++++++++---------------------- 1 file changed, 25 insertions(+), 22 deletions(-) diff --git a/mm/swapfile.c b/mm/swapfile.c index a677c16c48f4..6fa8fd11e911 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1889,21 +1889,25 @@ static inline int pte_same_as_swp(pte_t pte, pte_t swp_pte) static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr, swp_entry_t entry, struct folio *folio) { - struct page *page = folio_file_page(folio, swp_offset(entry)); - struct page *swapcache; + struct page *page; + struct folio *swapcache; spinlock_t *ptl; pte_t *pte, new_pte, old_pte; - bool hwpoisoned = PageHWPoison(page); + bool hwpoisoned = false; int ret = 1; - swapcache = page; + swapcache = folio; folio = ksm_might_need_to_copy(folio, vma, addr); if (unlikely(!folio)) return -ENOMEM; - else if (unlikely(folio == ERR_PTR(-EHWPOISON))) + else if (unlikely(folio == ERR_PTR(-EHWPOISON))) { + hwpoisoned = true; + folio = swapcache; + } + + page = folio_file_page(folio, swp_offset(entry)); + if (PageHWPoison(page)) hwpoisoned = true; - else - page = folio_file_page(folio, swp_offset(entry)); pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); if (unlikely(!pte || !pte_same_as_swp(ptep_get(pte), @@ -1914,13 +1918,12 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, old_pte = ptep_get(pte); - if (unlikely(hwpoisoned || !PageUptodate(page))) { + if (unlikely(hwpoisoned || !folio_test_uptodate(folio))) { swp_entry_t swp_entry; dec_mm_counter(vma->vm_mm, MM_SWAPENTS); if (hwpoisoned) { - swp_entry = make_hwpoison_entry(swapcache); - page = swapcache; + swp_entry = make_hwpoison_entry(page); } else { swp_entry = make_poisoned_swp_entry(); } @@ -1934,28 +1937,28 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, * when reading from swap. This metadata may be indexed by swap entry * so this must be called before swap_free(). */ - arch_swap_restore(folio_swap(entry, folio), page_folio(page)); + arch_swap_restore(folio_swap(entry, folio), folio); dec_mm_counter(vma->vm_mm, MM_SWAPENTS); inc_mm_counter(vma->vm_mm, MM_ANONPAGES); add_reliable_page_counter(page, vma->vm_mm, 1); - get_page(page); - if (page == swapcache) { + folio_get(folio); + if (folio == swapcache) { rmap_t rmap_flags = RMAP_NONE; /* - * See do_swap_page(): PageWriteback() would be problematic. - * However, we do a wait_on_page_writeback() just before this - * call and have the page locked. + * See do_swap_page(): writeback would be problematic. + * However, we do a folio_wait_writeback() just before this + * call and have the folio locked. 
*/ - VM_BUG_ON_PAGE(PageWriteback(page), page); + VM_BUG_ON_FOLIO(folio_test_writeback(folio), folio); if (pte_swp_exclusive(old_pte)) rmap_flags |= RMAP_EXCLUSIVE; folio_add_anon_rmap_pte(folio, page, vma, addr, rmap_flags); } else { /* ksm created a completely new copy */ - page_add_new_anon_rmap(page, vma, addr); - lru_cache_add_inactive_or_unevictable(page, vma); + folio_add_new_anon_rmap(folio, vma, addr); + folio_add_lru_vma(folio, vma); } new_pte = pte_mkold(mk_pte(page, vma->vm_page_prot)); if (pte_swp_soft_dirty(old_pte)) @@ -1968,9 +1971,9 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, out: if (pte) pte_unmap_unlock(pte, ptl); - if (page != swapcache) { - unlock_page(page); - put_page(page); + if (folio != swapcache) { + folio_unlock(folio); + folio_put(folio); } return ret; } -- Gitee From 084161c7458b7aa0326aaaaaec9da31c51e866c4 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 2 Sep 2024 20:55:15 +0800 Subject: [PATCH 05/22] mm: remove some calls to page_add_new_anon_rmap() mainline inclusion from mainline-v6.8-rc1 commit 2853b66b601a265306be709b4d86aaff7d92a0fc category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/IAJ5MT Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=2853b66b601a265306be709b4d86aaff7d92a0fc -------------------------------- We already have the folio in these functions, we just need to use it. folio_add_new_anon_rmap() didn't exist at the time they were converted to folios. Link: https://lkml.kernel.org/r/20231211162214.2146080-5-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: David Hildenbrand Signed-off-by: Andrew Morton Conflicts: kernel/events/uprobes.c mm/memory.c mm/userfaultfd.c [ Context conflicts with commit 3a5a643c852a 5aeb1c95ccb1 51f12f7d1ec2 Dep-of: 508758960b8d ] Signed-off-by: Liu Shixin --- kernel/events/uprobes.c | 2 +- mm/memory.c | 2 +- mm/userfaultfd.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index a80072c3f888..88a6ad10dff0 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -182,7 +182,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, if (new_page) { folio_get(new_folio); add_reliable_folio_counter(new_folio, mm, folio_nr_pages(new_folio)); - page_add_new_anon_rmap(new_page, vma, addr); + folio_add_new_anon_rmap(new_folio, vma, addr); folio_add_lru_vma(new_folio, vma); } else /* no new page, just dec_mm_counter for old_page */ diff --git a/mm/memory.c b/mm/memory.c index e212cfdb5c5b..e84e962994a0 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -4252,7 +4252,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) /* ksm created a completely new copy */ if (unlikely(folio != swapcache && swapcache)) { - page_add_new_anon_rmap(page, vma, vmf->address); + folio_add_new_anon_rmap(folio, vma, vmf->address); folio_add_lru_vma(folio, vma); } else { folio_add_anon_rmap_pte(folio, page, vma, vmf->address, diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index 315e59583bd6..588b2c4262f1 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -117,7 +117,7 @@ int mfill_atomic_install_pte(pmd_t *dst_pmd, folio_add_lru(folio); folio_add_file_rmap_pte(folio, page, dst_vma); } else { - page_add_new_anon_rmap(page, dst_vma, dst_addr); + folio_add_new_anon_rmap(folio, dst_vma, dst_addr); folio_add_lru_vma(folio, dst_vma); } -- Gitee From ebfd1d33a768c1e551574c3bc920d85a76f4bd02 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox 
(Oracle)" Date: Mon, 2 Sep 2024 20:55:16 +0800 Subject: [PATCH 06/22] mm: remove stale example from comment mainline inclusion from mainline-v6.8-rc1 commit b2926ac8178bf5c88ada4285f413f56c1cafc592 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/IAJ5MT Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=b2926ac8178bf5c88ada4285f413f56c1cafc592 -------------------------------- folio_add_new_anon_rmap() no longer works this way, so just remove the entire example. Link: https://lkml.kernel.org/r/20231211162214.2146080-6-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: David Hildenbrand Cc: Ralph Campbell Signed-off-by: Andrew Morton Conflicts: mm/memremap.c [ Context conflicts with commit d5ae3c96ff01 ] Signed-off-by: Liu Shixin --- mm/memremap.c | 18 ++++-------------- 1 file changed, 4 insertions(+), 14 deletions(-) diff --git a/mm/memremap.c b/mm/memremap.c index 7b7e59841250..8c5ebe5b0fc7 100644 --- a/mm/memremap.c +++ b/mm/memremap.c @@ -487,21 +487,11 @@ void free_zone_device_folio(struct folio *folio) } /* - * When a device managed page is freed, the page->mapping field + * When a device managed page is freed, the folio->mapping field * may still contain a (stale) mapping value. For example, the - * lower bits of page->mapping may still identify the page as an - * anonymous page. Ultimately, this entire field is just stale - * and wrong, and it will cause errors if not cleared. One - * example is: - * - * migrate_vma_pages() - * migrate_vma_insert_page() - * page_add_new_anon_rmap() - * __page_set_anon_rmap() - * ...checks page->mapping, via PageAnon(page) call, - * and incorrectly concludes that the page is an - * anonymous page. Therefore, it incorrectly, - * silently fails to set up the new anon rmap. + * lower bits of folio->mapping may still identify the folio as an + * anonymous folio. Ultimately, this entire field is just stale + * and wrong, and it will cause errors if not cleared. * * For other types of ZONE_DEVICE pages, migration is either * handled differently or not done at all, so there is no need -- Gitee From 05b1a70a911508470383b2c07a3a90d381482400 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 2 Sep 2024 20:55:17 +0800 Subject: [PATCH 07/22] mm: remove references to page_add_new_anon_rmap in comments mainline inclusion from mainline-v6.8-rc1 commit cb9089babc91f7ffc785d51a0fa567365b0e7751 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/IAJ5MT Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=cb9089babc91f7ffc785d51a0fa567365b0e7751 -------------------------------- Refer to folio_add_new_anon_rmap() instead. Link: https://lkml.kernel.org/r/20231211162214.2146080-7-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: David Hildenbrand Signed-off-by: Andrew Morton Conflicts: mm/rmap.c [ Context conflicts with commit 758b2d1d5c4e ] Signed-off-by: Liu Shixin --- mm/rmap.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/rmap.c b/mm/rmap.c index d3bd172275f2..7679c272f3dd 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1245,9 +1245,9 @@ static void __page_check_anon_rmap(struct folio *folio, struct page *page, * We have exclusion against folio_add_anon_rmap_*() because the caller * always holds the page locked. 
* - * We have exclusion against page_add_new_anon_rmap because those pages + * We have exclusion against folio_add_new_anon_rmap because those pages * are initially only visible via the pagetables, and the pte is locked - * over the call to page_add_new_anon_rmap. + * over the call to folio_add_new_anon_rmap. */ VM_BUG_ON_FOLIO(folio_anon_vma(folio)->root != vma->anon_vma->root, folio); -- Gitee From fc17425e30b553919f3b2bc8d5686b2ad6933492 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 2 Sep 2024 20:55:18 +0800 Subject: [PATCH 08/22] mm: convert migrate_vma_insert_page() to use a folio mainline inclusion from mainline-v6.8-rc1 commit d3b082736518562f4eed185e1a67f28d20635fef category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/IAJ5MT Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=d3b082736518562f4eed185e1a67f28d20635fef -------------------------------- Replaces five calls to compound_head() with one. Link: https://lkml.kernel.org/r/20231211162214.2146080-8-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: David Hildenbrand Reviewed-by: Alistair Popple Signed-off-by: Andrew Morton Conflicts: mm/migrate_device.c [ Context conflicts with commit 8fc2546f8508. Dep-of: 15bde4abab73 ] Signed-off-by: Liu Shixin --- mm/migrate_device.c | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/mm/migrate_device.c b/mm/migrate_device.c index 40032b85ab4b..5c9400931b74 100644 --- a/mm/migrate_device.c +++ b/mm/migrate_device.c @@ -567,6 +567,7 @@ static void migrate_vma_insert_page(struct migrate_vma *migrate, struct page *page, unsigned long *src) { + struct folio *folio = page_folio(page); struct vm_area_struct *vma = migrate->vma; struct mm_struct *mm = vma->vm_mm; bool flush = false; @@ -599,17 +600,17 @@ static void migrate_vma_insert_page(struct migrate_vma *migrate, goto abort; if (unlikely(anon_vma_prepare(vma))) goto abort; - if (mem_cgroup_charge(page_folio(page), vma->vm_mm, GFP_KERNEL)) + if (mem_cgroup_charge(folio, vma->vm_mm, GFP_KERNEL)) goto abort; /* - * The memory barrier inside __SetPageUptodate makes sure that - * preceding stores to the page contents become visible before + * The memory barrier inside __folio_mark_uptodate makes sure that + * preceding stores to the folio contents become visible before * the set_pte_at() write. 
*/ - __SetPageUptodate(page); + __folio_mark_uptodate(folio); - if (is_device_private_page(page)) { + if (folio_is_device_private(folio)) { swp_entry_t swp_entry; if (vma->vm_flags & VM_WRITE) @@ -620,8 +621,8 @@ static void migrate_vma_insert_page(struct migrate_vma *migrate, page_to_pfn(page)); entry = swp_entry_to_pte(swp_entry); } else { - if (is_zone_device_page(page) && - !is_device_coherent_page(page)) { + if (folio_is_zone_device(folio) && + !folio_is_device_coherent(folio)) { pr_warn_once("Unsupported ZONE_DEVICE page type.\n"); goto abort; } @@ -656,10 +657,10 @@ static void migrate_vma_insert_page(struct migrate_vma *migrate, inc_mm_counter(mm, MM_ANONPAGES); add_reliable_page_counter(page, mm, 1); - page_add_new_anon_rmap(page, vma, addr); - if (!is_zone_device_page(page)) - lru_cache_add_inactive_or_unevictable(page, vma); - get_page(page); + folio_add_new_anon_rmap(folio, vma, addr); + if (!folio_is_zone_device(folio)) + folio_add_lru_vma(folio, vma); + folio_get(folio); if (flush) { flush_cache_page(vma, addr, pte_pfn(orig_pte)); -- Gitee From 6263691ac90f6c3d695d4ef0d5894f104b91f655 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 2 Sep 2024 20:55:19 +0800 Subject: [PATCH 09/22] mm: convert collapse_huge_page() to use a folio mainline inclusion from mainline-v6.8-rc1 commit 5432726848bb27a01badcbc93b596f39ee6c5ffb category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/IAJ5MT Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=5432726848bb27a01badcbc93b596f39ee6c5ffb -------------------------------- Replace three calls to compound_head() with one. Link: https://lkml.kernel.org/r/20231211162214.2146080-9-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: David Hildenbrand Signed-off-by: Andrew Morton Conflicts: mm/khugepaged.c [ Context conflicts with commit 8fc2546f8508. Dep-of: 15bde4abab73 ] Signed-off-by: Liu Shixin --- mm/khugepaged.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/mm/khugepaged.c b/mm/khugepaged.c index d13033eb7eaa..13c5935e3a41 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -1123,6 +1123,7 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address, pmd_t *pmd, _pmd; pte_t *pte; pgtable_t pgtable; + struct folio *folio; struct page *hpage; spinlock_t *pmd_ptl, *pte_ptl; int result = SCAN_FAIL; @@ -1242,13 +1243,13 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address, if (unlikely(result != SCAN_SUCCEED)) goto out_up_write; + folio = page_folio(hpage); /* - * spin_lock() below is not the equivalent of smp_wmb(), but - * the smp_wmb() inside __SetPageUptodate() can be reused to - * avoid the copy_huge_page writes to become visible after - * the set_pmd_at() write. + * The smp_wmb() inside __folio_mark_uptodate() ensures the + * copy_huge_page writes become visible before the set_pmd_at() + * write. 
*/ - __SetPageUptodate(hpage); + __folio_mark_uptodate(folio); pgtable = pmd_pgtable(_pmd); _pmd = mk_huge_pmd(hpage, vma->vm_page_prot); @@ -1257,8 +1258,8 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address, spin_lock(pmd_ptl); BUG_ON(!pmd_none(*pmd)); add_reliable_page_counter(hpage, vma->vm_mm, HPAGE_PMD_NR); - page_add_new_anon_rmap(hpage, vma, address); - lru_cache_add_inactive_or_unevictable(hpage, vma); + folio_add_new_anon_rmap(folio, vma, address); + folio_add_lru_vma(folio, vma); pgtable_trans_huge_deposit(mm, pmd, pgtable); set_pmd_at(mm, address, pmd, _pmd); update_mmu_cache_pmd(vma, address, pmd); -- Gitee From a3a2a23f9d04fc75d2ea81ed2ff0e963f80f884f Mon Sep 17 00:00:00 2001 From: Liu Shixin Date: Mon, 2 Sep 2024 20:55:20 +0800 Subject: [PATCH 10/22] mm: userswap: page_add_new_anon_rmap() -> folio_add_new_anon_rmap() hulk inclusion category: cleanup bugzilla: https://gitee.com/openeuler/kernel/issues/IAJ5MT -------------------------------- page_add_new_anon_rmap() is only used by userswap for now. Convert it to folio_add_new_anon_rmap(). Signed-off-by: Liu Shixin --- mm/userswap.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mm/userswap.c b/mm/userswap.c index 20ee53dc5eea..4951a3f66582 100644 --- a/mm/userswap.c +++ b/mm/userswap.c @@ -194,7 +194,7 @@ static unsigned long vm_insert_anon_page(struct vm_area_struct *vma, inc_mm_counter(mm, MM_ANONPAGES); add_reliable_page_counter(page, mm, 1); - page_add_new_anon_rmap(page, vma, addr); + folio_add_new_anon_rmap(page_folio(page), vma, addr); dst_pte = mk_pte(page, vma->vm_page_prot); if (vma->vm_flags & VM_WRITE) dst_pte = pte_mkwrite_novma(pte_mkdirty(dst_pte)); @@ -220,7 +220,7 @@ static void uswap_map_anon_page(struct mm_struct *mm, set_pte_at(mm, addr, pte, old_pte); inc_mm_counter(mm, MM_ANONPAGES); add_reliable_page_counter(page, mm, 1); - page_add_new_anon_rmap(page, vma, addr); + folio_add_new_anon_rmap(page_folio(page), vma, addr); pte_unmap_unlock(pte, ptl); } @@ -535,7 +535,7 @@ int mfill_atomic_pte_nocopy(struct mm_struct *mm, pmd_t *dst_pmd, inc_mm_counter(mm, MM_ANONPAGES); add_reliable_page_counter(page, mm, 1); - page_add_new_anon_rmap(page, dst_vma, dst_addr); + folio_add_new_anon_rmap(page_folio(page), dst_vma, dst_addr); set_pte_at(mm, dst_addr, pte, dst_pte); /* No need to invalidate - it was non-present before */ -- Gitee From 55782a54e8d5374f3a99de4891e3169973f3f74e Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 2 Sep 2024 20:55:21 +0800 Subject: [PATCH 11/22] mm: remove page_add_new_anon_rmap and lru_cache_add_inactive_or_unevictable mainline inclusion from mainline-v6.8-rc1 commit cafa8e37a2ebd344ae0774324c21f46640bbaab3 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/IAJ5MT Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=cafa8e37a2ebd344ae0774324c21f46640bbaab3 -------------------------------- All callers have now been converted to folio_add_new_anon_rmap() and folio_add_lru_vma() so we can remove the wrapper. 
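The preceding patches converted every remaining caller along these lines (sketch; 'folio' is the folio the caller already holds for 'page'):

	/* before */
	page_add_new_anon_rmap(page, vma, addr);
	lru_cache_add_inactive_or_unevictable(page, vma);

	/* after */
	folio_add_new_anon_rmap(folio, vma, addr);
	folio_add_lru_vma(folio, vma);

With no users left, the compat wrappers removed below are dead code.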
Link: https://lkml.kernel.org/r/20231211162214.2146080-10-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: David Hildenbrand Signed-off-by: Andrew Morton Conflicts: include/linux/rmap.h [ Context conflicts with commit 758b2d1d5c4e ] Signed-off-by: Liu Shixin --- include/linux/rmap.h | 2 -- include/linux/swap.h | 3 --- mm/folio-compat.c | 16 ---------------- 3 files changed, 21 deletions(-) diff --git a/include/linux/rmap.h b/include/linux/rmap.h index 43cdb662ce06..c4092c494cd1 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -235,8 +235,6 @@ void folio_add_anon_rmap_ptes(struct folio *, struct page *, int nr_pages, folio_add_anon_rmap_ptes(folio, page, 1, vma, address, flags) void folio_add_anon_rmap_pmd(struct folio *, struct page *, struct vm_area_struct *, unsigned long address, rmap_t flags); -void page_add_new_anon_rmap(struct page *, struct vm_area_struct *, - unsigned long address); void folio_add_new_anon_rmap(struct folio *, struct vm_area_struct *, unsigned long address); void folio_add_file_rmap_ptes(struct folio *, struct page *, int nr_pages, diff --git a/include/linux/swap.h b/include/linux/swap.h index 54fa8f4558c7..eb1a190ec521 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -424,9 +424,6 @@ void folio_deactivate(struct folio *folio); void folio_mark_lazyfree(struct folio *folio); extern void swap_setup(void); -extern void lru_cache_add_inactive_or_unevictable(struct page *page, - struct vm_area_struct *vma); - /* linux/mm/vmscan.c */ extern unsigned long zone_reclaimable_pages(struct zone *zone); extern unsigned long try_to_free_pages(struct zonelist *zonelist, int order, diff --git a/mm/folio-compat.c b/mm/folio-compat.c index 10c3247542cb..a546271db69b 100644 --- a/mm/folio-compat.c +++ b/mm/folio-compat.c @@ -77,12 +77,6 @@ bool redirty_page_for_writepage(struct writeback_control *wbc, } EXPORT_SYMBOL(redirty_page_for_writepage); -void lru_cache_add_inactive_or_unevictable(struct page *page, - struct vm_area_struct *vma) -{ - folio_add_lru_vma(page_folio(page), vma); -} - int add_to_page_cache_lru(struct page *page, struct address_space *mapping, pgoff_t index, gfp_t gfp) { @@ -122,13 +116,3 @@ void putback_lru_page(struct page *page) { folio_putback_lru(page_folio(page)); } - -#ifdef CONFIG_MMU -void page_add_new_anon_rmap(struct page *page, struct vm_area_struct *vma, - unsigned long address) -{ - VM_BUG_ON_PAGE(PageTail(page), page); - - return folio_add_new_anon_rmap((struct folio *)page, vma, address); -} -#endif -- Gitee From c0fc100d0267ce28f717f1b5f2dcd2cda1e77ca6 Mon Sep 17 00:00:00 2001 From: Chuanhua Han Date: Mon, 2 Sep 2024 20:55:22 +0800 Subject: [PATCH 12/22] mm: swap: introduce swap_free_nr() for batched swap_free() mainline inclusion from mainline-v6.11-rc1 commit ebfba0045176cb013f49cb3e5bd9f0b16eba203c category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/IAJ5MT Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=ebfba0045176cb013f49cb3e5bd9f0b16eba203c -------------------------------- Patch series "large folios swap-in: handle refault cases first", v5. This patchset is extracted from the large folio swapin series[1], primarily addressing the handling of scenarios involving large folios in the swap cache. Currently, it is particularly focused on addressing the refaulting of mTHP, which is still undergoing reclamation. This approach aims to streamline code review and expedite the integration of this segment into the MM tree. 
It relies on Ryan's swap-out series[2], leveraging the helper function swap_pte_batch() introduced by that series. Presently, do_swap_page only encounters a large folio in the swap cache before the large folio is released by vmscan. However, the code should remain equally useful once we support large folio swap-in via swapin_readahead(). This approach can effectively reduce page faults and eliminate most redundant checks and early exits for MTE restoration in the recent MTE patchset[3]. The large folio swap-in for SWP_SYNCHRONOUS_IO and swapin_readahead() will be split into separate patch sets and sent at a later time. [1] https://lore.kernel.org/linux-mm/20240304081348.197341-1-21cnbao@gmail.com/ [2] https://lore.kernel.org/linux-mm/20240408183946.2991168-1-ryan.roberts@arm.com/ [3] https://lore.kernel.org/linux-mm/20240322114136.61386-1-21cnbao@gmail.com/ This patch (of 6): While swapping in a large folio, we need to free the swap entries for the whole folio. To avoid frequently acquiring and releasing swap locks, it is better to introduce an API for batched free. Furthermore, this new function, swap_free_nr(), is designed to efficiently handle various scenarios for releasing a specified number, nr, of swap entries. Link: https://lkml.kernel.org/r/20240529082824.150954-1-21cnbao@gmail.com Link: https://lkml.kernel.org/r/20240529082824.150954-2-21cnbao@gmail.com Signed-off-by: Chuanhua Han Co-developed-by: Barry Song Signed-off-by: Barry Song Reviewed-by: Ryan Roberts Acked-by: Chris Li Reviewed-by: "Huang, Ying" Cc: Baolin Wang Cc: David Hildenbrand Cc: Gao Xiang Cc: Hugh Dickins Cc: Johannes Weiner Cc: Kairui Song Cc: Matthew Wilcox (Oracle) Cc: Suren Baghdasaryan Cc: Yosry Ahmed Cc: Yu Zhao Cc: Zi Yan Cc: Andreas Larsson Cc: Christoph Hellwig Cc: "David S. Miller" Cc: Khalid Aziz Cc: Len Brown Cc: Pavel Machek Cc: "Rafael J.
Wysocki" Signed-off-by: Andrew Morton Signed-off-by: Liu Shixin --- include/linux/swap.h | 5 +++++ mm/swapfile.c | 47 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 52 insertions(+) diff --git a/include/linux/swap.h b/include/linux/swap.h index eb1a190ec521..b8f2ff0fd9e1 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -511,6 +511,7 @@ extern void swap_shmem_alloc(swp_entry_t); extern int swap_duplicate(swp_entry_t); extern int swapcache_prepare(swp_entry_t); extern void swap_free(swp_entry_t); +extern void swap_free_nr(swp_entry_t entry, int nr_pages); extern void swapcache_free_entries(swp_entry_t *entries, int n); extern void free_swap_and_cache_nr(swp_entry_t entry, int nr); int swap_type_of(dev_t device, sector_t offset); @@ -601,6 +602,10 @@ static inline void swap_free(swp_entry_t swp) { } +static inline void swap_free_nr(swp_entry_t entry, int nr_pages) +{ +} + static inline void put_swap_folio(struct folio *folio, swp_entry_t swp) { } diff --git a/mm/swapfile.c b/mm/swapfile.c index 6fa8fd11e911..5d508f808b0a 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1439,6 +1439,53 @@ void swap_free(swp_entry_t entry) __swap_entry_free(p, entry); } +static void cluster_swap_free_nr(struct swap_info_struct *sis, + unsigned long offset, int nr_pages) +{ + struct swap_cluster_info *ci; + DECLARE_BITMAP(to_free, BITS_PER_LONG) = { 0 }; + int i, nr; + + ci = lock_cluster_or_swap_info(sis, offset); + while (nr_pages) { + nr = min(BITS_PER_LONG, nr_pages); + for (i = 0; i < nr; i++) { + if (!__swap_entry_free_locked(sis, offset + i, 1)) + bitmap_set(to_free, i, 1); + } + if (!bitmap_empty(to_free, BITS_PER_LONG)) { + unlock_cluster_or_swap_info(sis, ci); + for_each_set_bit(i, to_free, BITS_PER_LONG) + free_swap_slot(swp_entry(sis->type, offset + i)); + if (nr == nr_pages) + return; + bitmap_clear(to_free, 0, BITS_PER_LONG); + ci = lock_cluster_or_swap_info(sis, offset); + } + offset += nr; + nr_pages -= nr; + } + unlock_cluster_or_swap_info(sis, ci); +} + +void swap_free_nr(swp_entry_t entry, int nr_pages) +{ + int nr; + struct swap_info_struct *sis; + unsigned long offset = swp_offset(entry); + + sis = _swap_info_get(entry); + if (!sis) + return; + + while (nr_pages) { + nr = min_t(int, nr_pages, SWAPFILE_CLUSTER - offset % SWAPFILE_CLUSTER); + cluster_swap_free_nr(sis, offset, nr); + offset += nr; + nr_pages -= nr; + } +} + /* * Called after dropping swapcache to decrease refcnt to swap entries. */ -- Gitee From fecd342b4d48192bbd53e20dba715bc6df43713a Mon Sep 17 00:00:00 2001 From: Barry Song Date: Mon, 2 Sep 2024 20:55:23 +0800 Subject: [PATCH 13/22] mm: remove the implementation of swap_free() and always use swap_free_nr() mainline inclusion from mainline-v6.11-rc1 commit 54f7a49c20ebb5189980c53e6e66709d22bee572 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/IAJ5MT Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=54f7a49c20ebb5189980c53e6e66709d22bee572 -------------------------------- To streamline maintenance efforts, we propose removing the implementation of swap_free(). Instead, we can simply invoke swap_free_nr() with nr set to 1. swap_free_nr() is designed with a bitmap consisting of only one long, resulting in overhead that can be ignored for cases where nr equals 1. A prime candidate for leveraging swap_free_nr() lies within kernel/power/swap.c. Implementing this change facilitates the adoption of batch processing for hibernation. 
Link: https://lkml.kernel.org/r/20240529082824.150954-3-21cnbao@gmail.com Signed-off-by: Barry Song Suggested-by: "Huang, Ying" Reviewed-by: "Huang, Ying" Acked-by: Chris Li Reviewed-by: Ryan Roberts Cc: "Rafael J. Wysocki" Cc: Pavel Machek Cc: Len Brown Cc: Hugh Dickins Cc: Christoph Hellwig Cc: Andreas Larsson Cc: Baolin Wang Cc: Chuanhua Han Cc: David Hildenbrand Cc: "David S. Miller" Cc: Gao Xiang Cc: Johannes Weiner Cc: Kairui Song Cc: Khalid Aziz Cc: Matthew Wilcox (Oracle) Cc: Suren Baghdasaryan Cc: Yosry Ahmed Cc: Yu Zhao Cc: Zi Yan Signed-off-by: Andrew Morton Signed-off-by: Liu Shixin --- include/linux/swap.h | 10 +++++----- kernel/power/swap.c | 5 ++--- mm/swapfile.c | 17 ++++------------- 3 files changed, 11 insertions(+), 21 deletions(-) diff --git a/include/linux/swap.h b/include/linux/swap.h index b8f2ff0fd9e1..9ac07b26b472 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -510,7 +510,6 @@ extern int add_swap_count_continuation(swp_entry_t, gfp_t); extern void swap_shmem_alloc(swp_entry_t); extern int swap_duplicate(swp_entry_t); extern int swapcache_prepare(swp_entry_t); -extern void swap_free(swp_entry_t); extern void swap_free_nr(swp_entry_t entry, int nr_pages); extern void swapcache_free_entries(swp_entry_t *entries, int n); extern void free_swap_and_cache_nr(swp_entry_t entry, int nr); @@ -598,10 +597,6 @@ static inline int swapcache_prepare(swp_entry_t swp) return 0; } -static inline void swap_free(swp_entry_t swp) -{ -} - static inline void swap_free_nr(swp_entry_t entry, int nr_pages) { } @@ -650,6 +645,11 @@ static inline void free_swap_and_cache(swp_entry_t entry) free_swap_and_cache_nr(entry, 1); } +static inline void swap_free(swp_entry_t entry) +{ + swap_free_nr(entry, 1); +} + #ifdef CONFIG_MEMCG static inline int mem_cgroup_swappiness(struct mem_cgroup *memcg) { diff --git a/kernel/power/swap.c b/kernel/power/swap.c index 37e4b43abc5c..1ade08e72e82 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -201,12 +201,11 @@ void free_all_swap_pages(int swap) while ((node = swsusp_extents.rb_node)) { struct swsusp_extent *ext; - unsigned long offset; ext = rb_entry(node, struct swsusp_extent, node); rb_erase(node, &swsusp_extents); - for (offset = ext->start; offset <= ext->end; offset++) - swap_free(swp_entry(swap, offset)); + swap_free_nr(swp_entry(swap, ext->start), + ext->end - ext->start + 1); kfree(ext); } diff --git a/mm/swapfile.c b/mm/swapfile.c index 5d508f808b0a..10c867004488 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1426,19 +1426,6 @@ static void swap_entry_free(struct swap_info_struct *p, swp_entry_t entry) swap_range_free(p, offset, 1); } -/* - * Caller has made sure that the swap device corresponding to entry - * is still around or has not been recycled. - */ -void swap_free(swp_entry_t entry) -{ - struct swap_info_struct *p; - - p = _swap_info_get(entry); - if (p) - __swap_entry_free(p, entry); -} - static void cluster_swap_free_nr(struct swap_info_struct *sis, unsigned long offset, int nr_pages) { @@ -1468,6 +1455,10 @@ static void cluster_swap_free_nr(struct swap_info_struct *sis, unlock_cluster_or_swap_info(sis, ci); } +/* + * Caller has made sure that the swap device corresponding to entry + * is still around or has not been recycled. 
+ */ void swap_free_nr(swp_entry_t entry, int nr_pages) { int nr; -- Gitee From b29841c2d6286ff53ea87ce056cec80b4d6c1b59 Mon Sep 17 00:00:00 2001 From: Barry Song Date: Mon, 2 Sep 2024 20:55:24 +0800 Subject: [PATCH 14/22] mm: introduce pte_move_swp_offset() helper which can move offset bidirectionally mainline inclusion from mainline-v6.11-rc1 commit 3f9abcaa3e9c3910893ccbe6085aa0452e72896d category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/IAJ5MT Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=3f9abcaa3e9c3910893ccbe6085aa0452e72896d -------------------------------- There could arise a necessity to obtain the first pte_t from a swap pte_t located in the middle. For instance, this may occur within the context of do_swap_page(), where a page fault can potentially occur in any PTE of a large folio. To address this, the following patch introduces pte_move_swp_offset(), a function capable of bidirectional movement by a specified delta argument. Consequently, pte_next_swp_offset() will directly invoke it with delta = 1. Link: https://lkml.kernel.org/r/20240529082824.150954-4-21cnbao@gmail.com Signed-off-by: Barry Song Suggested-by: "Huang, Ying" Reviewed-by: Ryan Roberts Reviewed-by: "Huang, Ying" Cc: Andreas Larsson Cc: Baolin Wang Cc: Chris Li Cc: Christoph Hellwig Cc: Chuanhua Han Cc: David Hildenbrand Cc: "David S. Miller" Cc: Gao Xiang Cc: Hugh Dickins Cc: Johannes Weiner Cc: Kairui Song Cc: Khalid Aziz Cc: Len Brown Cc: Matthew Wilcox (Oracle) Cc: Pavel Machek Cc: "Rafael J. Wysocki" Cc: Suren Baghdasaryan Cc: Yosry Ahmed Cc: Yu Zhao Cc: Zi Yan Signed-off-by: Andrew Morton Signed-off-by: Liu Shixin --- mm/internal.h | 25 +++++++++++++++++++++---- 1 file changed, 21 insertions(+), 4 deletions(-) diff --git a/mm/internal.h b/mm/internal.h index 6479c376ffdf..e5b541e3f67e 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -208,18 +208,21 @@ static inline int folio_pte_batch(struct folio *folio, unsigned long addr, } /** - * pte_next_swp_offset - Increment the swap entry offset field of a swap pte. + * pte_move_swp_offset - Move the swap entry offset field of a swap pte + * forward or backward by delta * @pte: The initial pte state; is_swap_pte(pte) must be true and * non_swap_entry() must be false. + * @delta: The direction and the offset we are moving; forward if delta + * is positive; backward if delta is negative * - * Increments the swap offset, while maintaining all other fields, including + * Moves the swap offset, while maintaining all other fields, including * swap type, and any swp pte bits. The resulting pte is returned. */ -static inline pte_t pte_next_swp_offset(pte_t pte) +static inline pte_t pte_move_swp_offset(pte_t pte, long delta) { swp_entry_t entry = pte_to_swp_entry(pte); pte_t new = __swp_entry_to_pte(__swp_entry(swp_type(entry), - (swp_offset(entry) + 1))); + (swp_offset(entry) + delta))); if (pte_swp_soft_dirty(pte)) new = pte_swp_mksoft_dirty(new); @@ -231,6 +234,20 @@ static inline pte_t pte_next_swp_offset(pte_t pte) return new; } + +/** + * pte_next_swp_offset - Increment the swap entry offset field of a swap pte. + * @pte: The initial pte state; is_swap_pte(pte) must be true and + * non_swap_entry() must be false. + * + * Increments the swap offset, while maintaining all other fields, including + * swap type, and any swp pte bits. The resulting pte is returned. 
+ */ +static inline pte_t pte_next_swp_offset(pte_t pte) +{ + return pte_move_swp_offset(pte, 1); +} + /** * swap_pte_batch - detect a PTE batch for a set of contiguous swap entries * @start_ptep: Page table pointer for the first entry. -- Gitee From c722512542a056b1c3a77b6f45d72843b4915ca0 Mon Sep 17 00:00:00 2001 From: Barry Song Date: Mon, 2 Sep 2024 20:55:25 +0800 Subject: [PATCH 15/22] mm: introduce arch_do_swap_page_nr() which allows restore metadata for nr pages mainline inclusion from mainline-v6.11-rc1 commit 29f252cdc293f4a50b5d3dcbed53701d8444614d category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/IAJ5MT Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=29f252cdc293f4a50b5d3dcbed53701d8444614d -------------------------------- Should do_swap_page() have the capability to directly map a large folio, metadata restoration becomes necessary for a specified number of pages denoted as nr. It's important to highlight that metadata restoration is solely required by the SPARC platform, which, however, does not enable THP_SWAP. Consequently, in the present kernel configuration, there exists no practical scenario where users necessitate the restoration of nr metadata. Platforms implementing THP_SWAP might invoke this function with nr values exceeding 1, subsequent to do_swap_page() successfully mapping an entire large folio. Nonetheless, their arch_do_swap_page_nr() functions remain empty. Link: https://lkml.kernel.org/r/20240529082824.150954-5-21cnbao@gmail.com Signed-off-by: Barry Song Reviewed-by: Ryan Roberts Reviewed-by: Khalid Aziz Cc: "David S. Miller" Cc: Andreas Larsson Cc: Baolin Wang Cc: Chris Li Cc: Christoph Hellwig Cc: Chuanhua Han Cc: David Hildenbrand Cc: Gao Xiang Cc: "Huang, Ying" Cc: Hugh Dickins Cc: Johannes Weiner Cc: Kairui Song Cc: Len Brown Cc: Matthew Wilcox (Oracle) Cc: Pavel Machek Cc: "Rafael J. Wysocki" Cc: Suren Baghdasaryan Cc: Yosry Ahmed Cc: Yu Zhao Cc: Zi Yan Signed-off-by: Andrew Morton Signed-off-by: Liu Shixin --- include/linux/pgtable.h | 26 ++++++++++++++++++++------ mm/memory.c | 3 ++- 2 files changed, 22 insertions(+), 7 deletions(-) diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index db4faa88865d..91956dc5fea0 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -1075,6 +1075,15 @@ static inline int pgd_same(pgd_t pgd_a, pgd_t pgd_b) }) #ifndef __HAVE_ARCH_DO_SWAP_PAGE +static inline void arch_do_swap_page_nr(struct mm_struct *mm, + struct vm_area_struct *vma, + unsigned long addr, + pte_t pte, pte_t oldpte, + int nr) +{ + +} +#else /* * Some architectures support metadata associated with a page. When a * page is being swapped out, this metadata must be saved so it can be @@ -1083,12 +1092,17 @@ static inline int pgd_same(pgd_t pgd_a, pgd_t pgd_b) * page as metadata for the page. arch_do_swap_page() can restore this * metadata when a page is swapped back in. 
*/ -static inline void arch_do_swap_page(struct mm_struct *mm, - struct vm_area_struct *vma, - unsigned long addr, - pte_t pte, pte_t oldpte) -{ - +static inline void arch_do_swap_page_nr(struct mm_struct *mm, + struct vm_area_struct *vma, + unsigned long addr, + pte_t pte, pte_t oldpte, + int nr) +{ + for (int i = 0; i < nr; i++) { + arch_do_swap_page(vma->vm_mm, vma, addr + i * PAGE_SIZE, + pte_advance_pfn(pte, i), + pte_advance_pfn(oldpte, i)); + } } #endif diff --git a/mm/memory.c b/mm/memory.c index e84e962994a0..2582df618460 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -4262,7 +4262,8 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) VM_BUG_ON(!folio_test_anon(folio) || (pte_write(pte) && !PageAnonExclusive(page))); set_pte_at(vma->vm_mm, vmf->address, vmf->pte, pte); - arch_do_swap_page(vma->vm_mm, vma, vmf->address, pte, vmf->orig_pte); + arch_do_swap_page_nr(vma->vm_mm, vma, vmf->address, + pte, vmf->orig_pte, 1); folio_unlock(folio); if (folio != swapcache && swapcache) { -- Gitee From f77604c6c254ad03a0c79fb32dd3e41fec4eeca2 Mon Sep 17 00:00:00 2001 From: Chuanhua Han Date: Mon, 2 Sep 2024 20:55:26 +0800 Subject: [PATCH 16/22] mm: swap: make should_try_to_free_swap() support large-folio mainline inclusion from mainline-v6.11-rc1 commit 4c3f966436873435600b00e5c2c6c8933607e236 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/IAJ5MT Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=4c3f966436873435600b00e5c2c6c8933607e236 -------------------------------- The function should_try_to_free_swap() operates under the assumption that swap-in always occurs at the normal page granularity, i.e., folio_nr_pages() = 1. However, in reality, for large folios, add_to_swap_cache() will invoke folio_ref_add(folio, nr). To accommodate large folio swap-in, this patch eliminates this assumption. Link: https://lkml.kernel.org/r/20240529082824.150954-6-21cnbao@gmail.com Signed-off-by: Chuanhua Han Co-developed-by: Barry Song Signed-off-by: Barry Song Acked-by: Chris Li Reviewed-by: Ryan Roberts Reviewed-by: "Huang, Ying" Reviewed-by: David Hildenbrand Cc: Andreas Larsson Cc: Baolin Wang Cc: Christoph Hellwig Cc: "David S. Miller" Cc: Gao Xiang Cc: Hugh Dickins Cc: Johannes Weiner Cc: Kairui Song Cc: Khalid Aziz Cc: Len Brown Cc: Matthew Wilcox (Oracle) Cc: Pavel Machek Cc: "Rafael J. Wysocki" Cc: Suren Baghdasaryan Cc: Yosry Ahmed Cc: Yu Zhao Cc: Zi Yan Signed-off-by: Andrew Morton Signed-off-by: Liu Shixin --- mm/memory.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/memory.c b/mm/memory.c index 2582df618460..2be5c3ed8857 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3868,7 +3868,7 @@ static inline bool should_try_to_free_swap(struct folio *folio, * reference only in case it's likely that we'll be the exlusive user. 
*/ return (fault_flags & FAULT_FLAG_WRITE) && !folio_test_ksm(folio) && - folio_ref_count(folio) == 2; + folio_ref_count(folio) == (1 + folio_nr_pages(folio)); } static vm_fault_t pte_marker_clear(struct vm_fault *vmf) -- Gitee From 505f9fcf99494bf3daeb2176ff1994c1ee261420 Mon Sep 17 00:00:00 2001 From: Chuanhua Han Date: Mon, 2 Sep 2024 20:55:27 +0800 Subject: [PATCH 17/22] mm: swap: entirely map large folios found in swapcache mainline inclusion from mainline-v6.11-rc1 commit 508758960b8d89fa464abce2f9897973c8e8d4f0 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/IAJ5MT Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=508758960b8d89fa464abce2f9897973c8e8d4f0 -------------------------------- When a large folio is found in the swapcache, the current implementation requires calling do_swap_page() nr_pages times, resulting in nr_pages page faults. This patch opts to map the entire large folio at once to minimize page faults. Additionally, redundant checks and early exits for ARM64 MTE restoring are removed. Link: https://lkml.kernel.org/r/20240529082824.150954-7-21cnbao@gmail.com Signed-off-by: Chuanhua Han Co-developed-by: Barry Song Signed-off-by: Barry Song Reviewed-by: Ryan Roberts Reviewed-by: "Huang, Ying" Cc: Andreas Larsson Cc: Baolin Wang Cc: Chris Li Cc: Christoph Hellwig Cc: David Hildenbrand Cc: "David S. Miller" Cc: Gao Xiang Cc: Hugh Dickins Cc: Johannes Weiner Cc: Kairui Song Cc: Khalid Aziz Cc: Len Brown Cc: Matthew Wilcox (Oracle) Cc: Pavel Machek Cc: "Rafael J. Wysocki" Cc: Suren Baghdasaryan Cc: Yosry Ahmed Cc: Yu Zhao Cc: Zi Yan Signed-off-by: Andrew Morton Conflicts: mm/memory.c [ Conflicts with commit 8fc2546f8508 to fix counter ] Signed-off-by: Liu Shixin --- mm/memory.c | 61 ++++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 49 insertions(+), 12 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index 2be5c3ed8857..8221ad95022d 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3959,6 +3959,10 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) pte_t pte; vm_fault_t ret = 0; void *shadow = NULL; + int nr_pages; + unsigned long page_idx; + unsigned long address; + pte_t *ptep; if (!pte_unmap_same(vmf)) goto out; @@ -4161,6 +4165,38 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) goto out_nomap; } + nr_pages = 1; + page_idx = 0; + address = vmf->address; + ptep = vmf->pte; + if (folio_test_large(folio) && folio_test_swapcache(folio)) { + int nr = folio_nr_pages(folio); + unsigned long idx = folio_page_idx(folio, page); + unsigned long folio_start = address - idx * PAGE_SIZE; + unsigned long folio_end = folio_start + nr * PAGE_SIZE; + pte_t *folio_ptep; + pte_t folio_pte; + + if (unlikely(folio_start < max(address & PMD_MASK, vma->vm_start))) + goto check_folio; + if (unlikely(folio_end > pmd_addr_end(address, vma->vm_end))) + goto check_folio; + + folio_ptep = vmf->pte - idx; + folio_pte = ptep_get(folio_ptep); + if (!pte_same(folio_pte, pte_move_swp_offset(vmf->orig_pte, -idx)) || + swap_pte_batch(folio_ptep, nr, folio_pte) != nr) + goto check_folio; + + page_idx = idx; + address = folio_start; + ptep = folio_ptep; + nr_pages = nr; + entry = folio->swap; + page = &folio->page; + } + +check_folio: /* * PG_anon_exclusive reuses PG_mappedtodisk for anon pages. A swap pte * must never point at an anonymous page in the swapcache that is @@ -4220,13 +4256,13 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) * We're already holding a reference on the page but haven't mapped it * yet. 
*/ - swap_free(entry); + swap_free_nr(entry, nr_pages); if (should_try_to_free_swap(folio, vma, vmf->flags)) folio_free_swap(folio); - inc_mm_counter(vma->vm_mm, MM_ANONPAGES); - add_reliable_folio_counter(folio, vma->vm_mm, 1); - dec_mm_counter(vma->vm_mm, MM_SWAPENTS); + add_mm_counter(vma->vm_mm, MM_ANONPAGES, nr_pages); + add_reliable_folio_counter(folio, vma->vm_mm, nr_pages); + add_mm_counter(vma->vm_mm, MM_SWAPENTS, -nr_pages); pte = mk_pte(page, vma->vm_page_prot); /* @@ -4243,27 +4279,28 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) } rmap_flags |= RMAP_EXCLUSIVE; } - flush_icache_page(vma, page); + folio_ref_add(folio, nr_pages - 1); + flush_icache_pages(vma, page, nr_pages); if (pte_swp_soft_dirty(vmf->orig_pte)) pte = pte_mksoft_dirty(pte); if (pte_swp_uffd_wp(vmf->orig_pte)) pte = pte_mkuffd_wp(pte); - vmf->orig_pte = pte; + vmf->orig_pte = pte_advance_pfn(pte, page_idx); /* ksm created a completely new copy */ if (unlikely(folio != swapcache && swapcache)) { - folio_add_new_anon_rmap(folio, vma, vmf->address); + folio_add_new_anon_rmap(folio, vma, address); folio_add_lru_vma(folio, vma); } else { - folio_add_anon_rmap_pte(folio, page, vma, vmf->address, + folio_add_anon_rmap_ptes(folio, page, nr_pages, vma, address, rmap_flags); } VM_BUG_ON(!folio_test_anon(folio) || (pte_write(pte) && !PageAnonExclusive(page))); - set_pte_at(vma->vm_mm, vmf->address, vmf->pte, pte); - arch_do_swap_page_nr(vma->vm_mm, vma, vmf->address, - pte, vmf->orig_pte, 1); + set_ptes(vma->vm_mm, address, ptep, pte, nr_pages); + arch_do_swap_page_nr(vma->vm_mm, vma, address, + pte, pte, nr_pages); folio_unlock(folio); if (folio != swapcache && swapcache) { @@ -4287,7 +4324,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) } /* No need to invalidate - it was non-present before */ - update_mmu_cache_range(vmf, vma, vmf->address, vmf->pte, 1); + update_mmu_cache_range(vmf, vma, address, ptep, nr_pages); unlock: if (vmf->pte) pte_unmap_unlock(vmf->pte, vmf->ptl); -- Gitee From d081bee4a6ea2900aac1ab85295ae52b8f89172e Mon Sep 17 00:00:00 2001 From: Barry Song Date: Mon, 2 Sep 2024 20:55:28 +0800 Subject: [PATCH 18/22] mm: swap: reuse exclusive folio directly instead of wp page faults mainline inclusion from mainline-v6.11-rc1 commit c18160dba5ff633f7ccd779f23ee97603eda0094 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/IAJ5MT Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=c18160dba5ff633f7ccd779f23ee97603eda0094 -------------------------------- After swapping out, we perform a swap-in operation. If we first read and then write, we encounter a major fault in do_swap_page for reading, along with additional minor faults in do_wp_page for writing. However, the latter appears to be unnecessary and inefficient. Instead, we can directly reuse the folio in do_swap_page and completely eliminate the need for do_wp_page. This patch achieves that optimization specifically for exclusive folios. The following microbenchmark demonstrates the significant reduction in minor faults.
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/resource.h>

#define DATA_SIZE (2UL * 1024 * 1024)
#define PAGE_SIZE (4UL * 1024)

static void read_write_data(char *addr)
{
	char tmp;

	for (int i = 0; i < DATA_SIZE; i += PAGE_SIZE) {
		tmp = *(volatile char *)(addr + i);
		*(volatile char *)(addr + i) = tmp;
	}
}

int main(int argc, char **argv)
{
	struct rusage ru;
	char *addr = mmap(NULL, DATA_SIZE, PROT_READ | PROT_WRITE,
			  MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);

	memset(addr, 0x11, DATA_SIZE);

	do {
		long old_ru_minflt, old_ru_majflt;
		long new_ru_minflt, new_ru_majflt;

		madvise(addr, DATA_SIZE, MADV_PAGEOUT);

		getrusage(RUSAGE_SELF, &ru);
		old_ru_minflt = ru.ru_minflt;
		old_ru_majflt = ru.ru_majflt;

		read_write_data(addr);

		getrusage(RUSAGE_SELF, &ru);
		new_ru_minflt = ru.ru_minflt;
		new_ru_majflt = ru.ru_majflt;

		printf("minor faults:%ld major faults:%ld\n",
			new_ru_minflt - old_ru_minflt,
			new_ru_majflt - old_ru_majflt);
	} while(0);

	return 0;
}

w/o patch,
/ # ~/a.out
minor faults:512 major faults:512

w/ patch,
/ # ~/a.out
minor faults:0 major faults:512

Minor faults decrease to 0!

Link: https://lkml.kernel.org/r/20240602004502.26895-1-21cnbao@gmail.com Signed-off-by: Barry Song Acked-by: David Hildenbrand Cc: Chris Li Cc: Kairui Song Cc: Matthew Wilcox (Oracle) Cc: Minchan Kim Cc: Ryan Roberts Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton Conflicts: mm/memory.c [ Context conflicts with commit 8fc2546f8508 ] Signed-off-by: Liu Shixin --- mm/memory.c | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index 8221ad95022d..72f575909f7f 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -4264,6 +4264,10 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) add_reliable_folio_counter(folio, vma->vm_mm, nr_pages); add_mm_counter(vma->vm_mm, MM_SWAPENTS, -nr_pages); pte = mk_pte(page, vma->vm_page_prot); + if (pte_swp_soft_dirty(vmf->orig_pte)) + pte = pte_mksoft_dirty(pte); + if (pte_swp_uffd_wp(vmf->orig_pte)) + pte = pte_mkuffd_wp(pte); /* * Same logic as in do_wp_page(); however, optimize for pages that are @@ -4273,18 +4277,18 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) */ if (!folio_test_ksm(folio) && (exclusive || folio_ref_count(folio) == 1)) { - if (vmf->flags & FAULT_FLAG_WRITE) { - pte = maybe_mkwrite(pte_mkdirty(pte), vma); - vmf->flags &= ~FAULT_FLAG_WRITE; + if ((vma->vm_flags & VM_WRITE) && !userfaultfd_pte_wp(vma, pte) && + !vma_soft_dirty_enabled(vma)) { + pte = pte_mkwrite(pte, vma); + if (vmf->flags & FAULT_FLAG_WRITE) { + pte = pte_mkdirty(pte); + vmf->flags &= ~FAULT_FLAG_WRITE; + } } rmap_flags |= RMAP_EXCLUSIVE; } folio_ref_add(folio, nr_pages - 1); flush_icache_pages(vma, page, nr_pages); - if (pte_swp_soft_dirty(vmf->orig_pte)) - pte = pte_mksoft_dirty(pte); - if (pte_swp_uffd_wp(vmf->orig_pte)) - pte = pte_mkuffd_wp(pte); vmf->orig_pte = pte_advance_pfn(pte, page_idx); /* ksm created a completely new copy */ -- Gitee
From 286201c8acfaa70ea279b99222ce43ea0d32ffc1 Mon Sep 17 00:00:00 2001 From: Yosry Ahmed Date: Mon, 2 Sep 2024 20:55:29 +0800 Subject: [PATCH 19/22] mm: rmap: abstract updating per-node and per-memcg stats mainline inclusion from mainline-v6.11-rc1 commit 15c0536fb57fd989e24335020a443486bac01dac category: cleanup bugzilla: https://gitee.com/openeuler/kernel/issues/IAJ5MT Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=15c0536fb57fd989e24335020a443486bac01dac --------------------------------
A lot of intricacies go into updating the stats when adding or removing mappings: which stat index to use and which function.
Abstract this away into a new static helper in rmap.c, __folio_mod_stat(). This adds an unnecessary call to folio_test_anon() in __folio_add_anon_rmap() and __folio_add_file_rmap(). However, the folio struct should already be in the cache at this point, so it shouldn't cause any noticeable overhead. No functional change intended. [hughd@google.com: fix /proc/meminfo] Link: https://lkml.kernel.org/r/49914517-dfc7-e784-fde0-0e08fafbecc2@google.com Link: https://lkml.kernel.org/r/20240506211333.346605-1-yosryahmed@google.com Signed-off-by: Yosry Ahmed Signed-off-by: Hugh Dickins Reviewed-by: David Hildenbrand Signed-off-by: Andrew Morton Conflicts: mm/rmap.c [ Conflicts due to miss commit 4f687281012e which is bugfix for another feature ] Signed-off-by: Liu Shixin --- mm/rmap.c | 52 ++++++++++++++++++++++++++++------------------------ 1 file changed, 28 insertions(+), 24 deletions(-) diff --git a/mm/rmap.c b/mm/rmap.c index 7679c272f3dd..b1baaacd9f59 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1255,6 +1255,27 @@ static void __page_check_anon_rmap(struct folio *folio, struct page *page, page); } +static void __folio_mod_stat(struct folio *folio, int nr, int nr_pmdmapped) +{ + int idx; + + if (nr) { + idx = folio_test_anon(folio) ? NR_ANON_MAPPED : NR_FILE_MAPPED; + __lruvec_stat_mod_folio(folio, idx, nr); + } + if (nr_pmdmapped) { + if (folio_test_anon(folio)) { + idx = NR_ANON_THPS; + __lruvec_stat_mod_folio(folio, idx, nr_pmdmapped); + } else { + /* NR_*_PMDMAPPED are not maintained per-memcg */ + idx = folio_test_swapbacked(folio) ? + NR_SHMEM_PMDMAPPED : NR_FILE_PMDMAPPED; + __lruvec_stat_mod_folio(folio, idx, nr_pmdmapped); + } + } +} + static __always_inline void __folio_add_anon_rmap(struct folio *folio, struct page *page, int nr_pages, struct vm_area_struct *vma, unsigned long address, rmap_t flags, enum rmap_level level) @@ -1262,10 +1283,6 @@ static __always_inline void __folio_add_anon_rmap(struct folio *folio, int i, nr, nr_pmdmapped = 0; nr = __folio_add_rmap(folio, page, nr_pages, level, &nr_pmdmapped); - if (nr_pmdmapped) - __lruvec_stat_mod_folio(folio, NR_ANON_THPS, nr_pmdmapped); - if (nr) - __lruvec_stat_mod_folio(folio, NR_ANON_MAPPED, nr); if (unlikely(!folio_test_anon(folio))) { VM_WARN_ON_FOLIO(!folio_test_locked(folio), folio); @@ -1283,6 +1300,8 @@ static __always_inline void __folio_add_anon_rmap(struct folio *folio, __page_check_anon_rmap(folio, page, vma, address); } + __folio_mod_stat(folio, nr, nr_pmdmapped); + if (flags & RMAP_EXCLUSIVE) { switch (level) { case RMAP_LEVEL_PTE: @@ -1379,6 +1398,7 @@ void folio_add_new_anon_rmap(struct folio *folio, struct vm_area_struct *vma, unsigned long address) { int nr = folio_nr_pages(folio); + int nr_pmdmapped = 0; VM_WARN_ON_FOLIO(folio_test_hugetlb(folio), folio); VM_BUG_ON_VMA(address < vma->vm_start || @@ -1407,10 +1427,10 @@ void folio_add_new_anon_rmap(struct folio *folio, struct vm_area_struct *vma, atomic_set(&folio->_entire_mapcount, 0); atomic_set(&folio->_nr_pages_mapped, ENTIRELY_MAPPED); SetPageAnonExclusive(&folio->page); - __lruvec_stat_mod_folio(folio, NR_ANON_THPS, nr); + nr_pmdmapped = nr; } - __lruvec_stat_mod_folio(folio, NR_ANON_MAPPED, nr); + __folio_mod_stat(folio, nr, nr_pmdmapped); } static __always_inline void __folio_add_file_rmap(struct folio *folio, @@ -1422,11 +1442,7 @@ static __always_inline void __folio_add_file_rmap(struct folio *folio, VM_WARN_ON_FOLIO(folio_test_anon(folio), folio); nr = __folio_add_rmap(folio, page, nr_pages, level, &nr_pmdmapped); - if (nr_pmdmapped) - 
__lruvec_stat_mod_folio(folio, folio_test_swapbacked(folio) ? - NR_SHMEM_PMDMAPPED : NR_FILE_PMDMAPPED, nr_pmdmapped); - if (nr) - __lruvec_stat_mod_folio(folio, NR_FILE_MAPPED, nr); + __folio_mod_stat(folio, nr, nr_pmdmapped); /* See comments in folio_add_anon_rmap_*() */ if (!folio_test_large(folio)) @@ -1477,7 +1493,6 @@ static __always_inline void __folio_remove_rmap(struct folio *folio, atomic_t *mapped = &folio->_nr_pages_mapped; int last, nr = 0, nr_pmdmapped = 0; bool partially_mapped = false; - enum node_stat_item idx; __folio_rmap_sanity_checks(folio, page, nr_pages, level); @@ -1519,19 +1534,7 @@ static __always_inline void __folio_remove_rmap(struct folio *folio, break; } - if (nr_pmdmapped) { - if (folio_test_anon(folio)) - idx = NR_ANON_THPS; - else if (folio_test_swapbacked(folio)) - idx = NR_SHMEM_PMDMAPPED; - else - idx = NR_FILE_PMDMAPPED; - __lruvec_stat_mod_folio(folio, idx, -nr_pmdmapped); - } if (nr) { - idx = folio_test_anon(folio) ? NR_ANON_MAPPED : NR_FILE_MAPPED; - __lruvec_stat_mod_folio(folio, idx, -nr); - /* * Queue anon large folio for deferred split if at least one * page of the folio is unmapped and at least one page @@ -1543,6 +1546,7 @@ static __always_inline void __folio_remove_rmap(struct folio *folio, list_empty(&folio->_deferred_list)) deferred_split_folio(folio); } + __folio_mod_stat(folio, -nr, -nr_pmdmapped); /* * It would be tidy to reset folio_test_anon mapping when fully -- Gitee From e9e44c87715bd71893c900d182d9a5f991f36e5b Mon Sep 17 00:00:00 2001 From: Barry Song Date: Mon, 2 Sep 2024 20:55:30 +0800 Subject: [PATCH 20/22] mm: extend rmap flags arguments for folio_add_new_anon_rmap MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit mainline inclusion from mainline-v6.11-rc1 commit 15bde4abab734c687c1f81704886aba3a70c268e category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/IAJ5MT Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=15bde4abab734c687c1f81704886aba3a70c268e -------------------------------- Patch series "mm: clarify folio_add_new_anon_rmap() and __folio_add_anon_rmap()", v2. This patchset is preparatory work for mTHP swapin. folio_add_new_anon_rmap() assumes that new anon rmaps are always exclusive. However, this assumption doesn’t hold true for cases like do_swap_page(), where a new anon might be added to the swapcache and is not necessarily exclusive. The patchset extends the rmap flags to allow folio_add_new_anon_rmap() to handle both exclusive and non-exclusive new anon folios. The do_swap_page() function is updated to use this extended API with rmap flags. Consequently, all new anon folios now consistently use folio_add_new_anon_rmap(). The special case for !folio_test_anon() in __folio_add_anon_rmap() can be safely removed. In conclusion, new anon folios always use folio_add_new_anon_rmap(), regardless of exclusivity. Old anon folios continue to use __folio_add_anon_rmap() via folio_add_anon_rmap_pmd() and folio_add_anon_rmap_ptes(). This patch (of 3): In the case of a swap-in, a new anonymous folio is not necessarily exclusive. This patch updates the rmap flags to allow a new anonymous folio to be treated as either exclusive or non-exclusive. To maintain the existing behavior, we always use EXCLUSIVE as the default setting. 
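As a rough illustration of the new calling convention (a sketch only, not part of the diff below; the helper name is made up, while pte_swp_exclusive(), RMAP_NONE and RMAP_EXCLUSIVE already exist in the tree), a swap-in caller can derive the flag straight from the swap pte, which is what unuse_pte() ends up doing later in this series:

static rmap_t anon_rmap_flags_for_swapin(pte_t old_pte)
{
	/* An exclusive swap pte: no other process can hold this folio mapped. */
	return pte_swp_exclusive(old_pte) ? RMAP_EXCLUSIVE : RMAP_NONE;
}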
[akpm@linux-foundation.org: cleanup and constifications per David and akpm] [v-songbaohua@oppo.com: fix missing doc for flags of folio_add_new_anon_rmap()] Link: https://lkml.kernel.org/r/20240619210641.62542-1-21cnbao@gmail.com [v-songbaohua@oppo.com: enhance doc for extend rmap flags arguments for folio_add_new_anon_rmap] Link: https://lkml.kernel.org/r/20240622030256.43775-1-21cnbao@gmail.com Link: https://lkml.kernel.org/r/20240617231137.80726-1-21cnbao@gmail.com Link: https://lkml.kernel.org/r/20240617231137.80726-2-21cnbao@gmail.com Signed-off-by: Barry Song Suggested-by: David Hildenbrand Tested-by: Shuai Yuan Acked-by: David Hildenbrand Cc: Baolin Wang Cc: Chris Li Cc: "Huang, Ying" Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Ryan Roberts Cc: Suren Baghdasaryan Cc: Yang Shi Cc: Yosry Ahmed Cc: Yu Zhao Signed-off-by: Andrew Morton Conflicts: kernel/events/uprobes.c mm/khugepaged.c mm/memory.c mm/migrate_device.c mm/rmap.c mm/userswap.c [ Context conflicts in uprobes.c, khugepaged.c, memory.c migrate_device.c with commit 3a5a643c852a Context conflicts in memory.c due to miss commit f7842747d13d Context conflicts in rmap.c due to miss commit 05c5323b2a34 Fix folio_add_new_anon_rmap() in userswap.c ] Signed-off-by: Liu Shixin --- include/linux/rmap.h | 2 +- kernel/events/uprobes.c | 2 +- mm/huge_memory.c | 2 +- mm/khugepaged.c | 2 +- mm/memory.c | 10 +++++----- mm/migrate_device.c | 2 +- mm/rmap.c | 25 ++++++++++++++++--------- mm/swapfile.c | 2 +- mm/userfaultfd.c | 2 +- mm/userswap.c | 6 +++--- 10 files changed, 31 insertions(+), 24 deletions(-) diff --git a/include/linux/rmap.h b/include/linux/rmap.h index c4092c494cd1..c422af4855cf 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -236,7 +236,7 @@ void folio_add_anon_rmap_ptes(struct folio *, struct page *, int nr_pages, void folio_add_anon_rmap_pmd(struct folio *, struct page *, struct vm_area_struct *, unsigned long address, rmap_t flags); void folio_add_new_anon_rmap(struct folio *, struct vm_area_struct *, - unsigned long address); + unsigned long address, rmap_t flags); void folio_add_file_rmap_ptes(struct folio *, struct page *, int nr_pages, struct vm_area_struct *); #define folio_add_file_rmap_pte(folio, page, vma) \ diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 88a6ad10dff0..8986c452ff07 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -182,7 +182,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, if (new_page) { folio_get(new_folio); add_reliable_folio_counter(new_folio, mm, folio_nr_pages(new_folio)); - folio_add_new_anon_rmap(new_folio, vma, addr); + folio_add_new_anon_rmap(new_folio, vma, addr, RMAP_EXCLUSIVE); folio_add_lru_vma(new_folio, vma); } else /* no new page, just dec_mm_counter for old_page */ diff --git a/mm/huge_memory.c b/mm/huge_memory.c index fec3ee2c020b..af6a5c840e27 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1187,7 +1187,7 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf, entry = mk_huge_pmd(page, vma->vm_page_prot); entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); - folio_add_new_anon_rmap(folio, vma, haddr); + folio_add_new_anon_rmap(folio, vma, haddr, RMAP_EXCLUSIVE); folio_add_lru_vma(folio, vma); pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable); set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry); diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 13c5935e3a41..8006b13304de 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -1258,7 +1258,7 @@ static int 
collapse_huge_page(struct mm_struct *mm, unsigned long address, spin_lock(pmd_ptl); BUG_ON(!pmd_none(*pmd)); add_reliable_page_counter(hpage, vma->vm_mm, HPAGE_PMD_NR); - folio_add_new_anon_rmap(folio, vma, address); + folio_add_new_anon_rmap(folio, vma, address, RMAP_EXCLUSIVE); folio_add_lru_vma(folio, vma); pgtable_trans_huge_deposit(mm, pmd, pgtable); set_pmd_at(mm, address, pmd, _pmd); diff --git a/mm/memory.c b/mm/memory.c index 72f575909f7f..5c0b6d08b68f 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -923,7 +923,7 @@ copy_present_page(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma *prealloc = NULL; copy_user_highpage(&new_folio->page, page, addr, src_vma); __folio_mark_uptodate(new_folio); - folio_add_new_anon_rmap(new_folio, dst_vma, addr); + folio_add_new_anon_rmap(new_folio, dst_vma, addr, RMAP_EXCLUSIVE); folio_add_lru_vma(new_folio, dst_vma); rss[MM_ANONPAGES]++; @@ -3363,7 +3363,7 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf) * some TLBs while the old PTE remains in others. */ ptep_clear_flush(vma, vmf->address, vmf->pte); - folio_add_new_anon_rmap(new_folio, vma, vmf->address); + folio_add_new_anon_rmap(new_folio, vma, vmf->address, RMAP_EXCLUSIVE); folio_add_lru_vma(new_folio, vma); /* * We call the notify macro here because, when using secondary @@ -4293,7 +4293,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) /* ksm created a completely new copy */ if (unlikely(folio != swapcache && swapcache)) { - folio_add_new_anon_rmap(folio, vma, address); + folio_add_new_anon_rmap(folio, vma, address, RMAP_EXCLUSIVE); folio_add_lru_vma(folio, vma); } else { folio_add_anon_rmap_ptes(folio, page, nr_pages, vma, address, @@ -4549,7 +4549,7 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf) count_mthp_stat(folio_order(folio), MTHP_STAT_ANON_FAULT_ALLOC); #endif add_reliable_folio_counter(folio, vma->vm_mm, nr_pages); - folio_add_new_anon_rmap(folio, vma, addr); + folio_add_new_anon_rmap(folio, vma, addr, RMAP_EXCLUSIVE); folio_add_lru_vma(folio, vma); setpte: if (vmf_orig_pte_uffd_wp(vmf)) @@ -4749,7 +4749,7 @@ void set_pte_range(struct vm_fault *vmf, struct folio *folio, add_reliable_folio_counter(folio, vma->vm_mm, nr); if (write && !(vma->vm_flags & VM_SHARED)) { VM_BUG_ON_FOLIO(nr != 1, folio); - folio_add_new_anon_rmap(folio, vma, addr); + folio_add_new_anon_rmap(folio, vma, addr, RMAP_EXCLUSIVE); folio_add_lru_vma(folio, vma); } else { folio_add_file_rmap_ptes(folio, page, nr, vma); diff --git a/mm/migrate_device.c b/mm/migrate_device.c index 5c9400931b74..6998768a7297 100644 --- a/mm/migrate_device.c +++ b/mm/migrate_device.c @@ -657,7 +657,7 @@ static void migrate_vma_insert_page(struct migrate_vma *migrate, inc_mm_counter(mm, MM_ANONPAGES); add_reliable_page_counter(page, mm, 1); - folio_add_new_anon_rmap(folio, vma, addr); + folio_add_new_anon_rmap(folio, vma, addr, RMAP_EXCLUSIVE); if (!folio_is_zone_device(folio)) folio_add_lru_vma(folio, vma); folio_get(folio); diff --git a/mm/rmap.c b/mm/rmap.c index b1baaacd9f59..32ac6796113a 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1386,30 +1386,35 @@ void folio_add_anon_rmap_pmd(struct folio *folio, struct page *page, * @folio: The folio to add the mapping to. * @vma: the vm area in which the mapping is added * @address: the user virtual address mapped + * @flags: The rmap flags * * Like folio_add_anon_rmap_*() but must only be called on *new* folios. * This means the inc-and-test can be bypassed. - * The folio does not have to be locked. 
+ * The folio doesn't necessarily need to be locked while it's exclusive + * unless two threads map it concurrently. However, the folio must be + * locked if it's shared. * - * If the folio is pmd-mappable, it is accounted as a THP. As the folio - * is new, it's assumed to be mapped exclusively by a single process. + * If the folio is pmd-mappable, it is accounted as a THP. */ void folio_add_new_anon_rmap(struct folio *folio, struct vm_area_struct *vma, - unsigned long address) + unsigned long address, rmap_t flags) { - int nr = folio_nr_pages(folio); + const int nr = folio_nr_pages(folio); + const bool exclusive = flags & RMAP_EXCLUSIVE; int nr_pmdmapped = 0; VM_WARN_ON_FOLIO(folio_test_hugetlb(folio), folio); + VM_WARN_ON_FOLIO(!exclusive && !folio_test_locked(folio), folio); VM_BUG_ON_VMA(address < vma->vm_start || address + (nr << PAGE_SHIFT) > vma->vm_end, vma); __folio_set_swapbacked(folio); - __folio_set_anon(folio, vma, address, true); + __folio_set_anon(folio, vma, address, exclusive); if (likely(!folio_test_large(folio))) { /* increment count (starts at -1) */ atomic_set(&folio->_mapcount, 0); - SetPageAnonExclusive(&folio->page); + if (exclusive) + SetPageAnonExclusive(&folio->page); } else if (!folio_test_pmd_mappable(folio)) { int i; @@ -1418,7 +1423,8 @@ void folio_add_new_anon_rmap(struct folio *folio, struct vm_area_struct *vma, /* increment count (starts at -1) */ atomic_set(&page->_mapcount, 0); - SetPageAnonExclusive(page); + if (exclusive) + SetPageAnonExclusive(page); } atomic_set(&folio->_nr_pages_mapped, nr); @@ -1426,7 +1432,8 @@ void folio_add_new_anon_rmap(struct folio *folio, struct vm_area_struct *vma, /* increment count (starts at -1) */ atomic_set(&folio->_entire_mapcount, 0); atomic_set(&folio->_nr_pages_mapped, ENTIRELY_MAPPED); - SetPageAnonExclusive(&folio->page); + if (exclusive) + SetPageAnonExclusive(&folio->page); nr_pmdmapped = nr; } diff --git a/mm/swapfile.c b/mm/swapfile.c index 10c867004488..1b2b3bea06c8 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1995,7 +1995,7 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, folio_add_anon_rmap_pte(folio, page, vma, addr, rmap_flags); } else { /* ksm created a completely new copy */ - folio_add_new_anon_rmap(folio, vma, addr); + folio_add_new_anon_rmap(folio, vma, addr, RMAP_EXCLUSIVE); folio_add_lru_vma(folio, vma); } new_pte = pte_mkold(mk_pte(page, vma->vm_page_prot)); diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index 588b2c4262f1..4ab24c56f660 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -117,7 +117,7 @@ int mfill_atomic_install_pte(pmd_t *dst_pmd, folio_add_lru(folio); folio_add_file_rmap_pte(folio, page, dst_vma); } else { - folio_add_new_anon_rmap(folio, dst_vma, dst_addr); + folio_add_new_anon_rmap(folio, dst_vma, dst_addr, RMAP_EXCLUSIVE); folio_add_lru_vma(folio, dst_vma); } diff --git a/mm/userswap.c b/mm/userswap.c index 4951a3f66582..22e3f147ce5f 100644 --- a/mm/userswap.c +++ b/mm/userswap.c @@ -194,7 +194,7 @@ static unsigned long vm_insert_anon_page(struct vm_area_struct *vma, inc_mm_counter(mm, MM_ANONPAGES); add_reliable_page_counter(page, mm, 1); - folio_add_new_anon_rmap(page_folio(page), vma, addr); + folio_add_new_anon_rmap(page_folio(page), vma, addr, RMAP_EXCLUSIVE); dst_pte = mk_pte(page, vma->vm_page_prot); if (vma->vm_flags & VM_WRITE) dst_pte = pte_mkwrite_novma(pte_mkdirty(dst_pte)); @@ -220,7 +220,7 @@ static void uswap_map_anon_page(struct mm_struct *mm, set_pte_at(mm, addr, pte, old_pte); inc_mm_counter(mm, MM_ANONPAGES); 
add_reliable_page_counter(page, mm, 1); - folio_add_new_anon_rmap(page_folio(page), vma, addr); + folio_add_new_anon_rmap(page_folio(page), vma, addr, RMAP_EXCLUSIVE); pte_unmap_unlock(pte, ptl); } @@ -535,7 +535,7 @@ int mfill_atomic_pte_nocopy(struct mm_struct *mm, pmd_t *dst_pmd, inc_mm_counter(mm, MM_ANONPAGES); add_reliable_page_counter(page, mm, 1); - folio_add_new_anon_rmap(page_folio(page), dst_vma, dst_addr); + folio_add_new_anon_rmap(page_folio(page), dst_vma, dst_addr, RMAP_EXCLUSIVE); set_pte_at(mm, dst_addr, pte, dst_pte); /* No need to invalidate - it was non-present before */ -- Gitee From b14604b227df1eff51ec7ec0afa6f6f8fe17d418 Mon Sep 17 00:00:00 2001 From: Barry Song Date: Mon, 2 Sep 2024 20:55:31 +0800 Subject: [PATCH 21/22] mm: use folio_add_new_anon_rmap() if folio_test_anon(folio)==false mainline inclusion from mainline-v6.11-rc1 commit 9ae2feacedde16067014f11414675f385c68eedc category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/IAJ5MT Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=9ae2feacedde16067014f11414675f385c68eedc -------------------------------- For the !folio_test_anon(folio) case, we can now invoke folio_add_new_anon_rmap() with the rmap flags set to either EXCLUSIVE or non-EXCLUSIVE. This action will suppress the VM_WARN_ON_FOLIO check within __folio_add_anon_rmap() while initiating the process of bringing up mTHP swapin. static __always_inline void __folio_add_anon_rmap(struct folio *folio, struct page *page, int nr_pages, struct vm_area_struct *vma, unsigned long address, rmap_t flags, enum rmap_level level) { ... if (unlikely(!folio_test_anon(folio))) { VM_WARN_ON_FOLIO(folio_test_large(folio) && level != RMAP_LEVEL_PMD, folio); } ... } It also improves the code's readability. Currently, all new anonymous folios calling folio_add_anon_rmap_ptes() are order-0. This ensures that new folios cannot be partially exclusive; they are either entirely exclusive or entirely shared. A useful comment from Hugh's fix: : Commit "mm: use folio_add_new_anon_rmap() if folio_test_anon(folio)== : false" has extended folio_add_new_anon_rmap() to use on non-exclusive : folios, already visible to others in swap cache and on LRU. : : That renders its non-atomic __folio_set_swapbacked() unsafe: it risks : overwriting concurrent atomic operations on folio->flags, losing bits : added or restoring bits cleared. Since it's only used in this risky way : when folio_test_locked and !folio_test_anon, many such races are excluded; : but, for example, isolations by folio_test_clear_lru() are vulnerable, and : setting or clearing active. : : It could just use the atomic folio_set_swapbacked(); but this function : does try to avoid atomics where it can, so use a branch instead: just : avoid setting swapbacked when it is already set, that is good enough. : (Swapbacked is normally stable once set: lazyfree can undo it, but only : later, when found anon in a page table.) : : This fixes a lot of instability under compaction and swapping loads: : assorted "Bad page"s, VM_BUG_ON_FOLIO()s, apparently even page double : frees - though I've not worked out what races could lead to the latter. 
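To make Hugh's race concrete, here is a sketch (an illustrative interleaving, not code from this patch) of how the plain read-modify-write in __folio_set_swapbacked() can lose a concurrent atomic bit operation, and the branch the fix below uses instead of an unconditional non-atomic set:

/*
 *  CPU0 (__folio_set_swapbacked)      CPU1 (folio_test_clear_lru)
 *  old = folio->flags;
 *                                     clears PG_lru atomically
 *  folio->flags = old | swapbacked;   <- stale PG_lru written back, clear lost
 *
 * Testing the bit first skips the non-atomic RMW in the common case where
 * a swapcache folio already has PG_swapbacked set:
 */
if (!folio_test_swapbacked(folio))
	__folio_set_swapbacked(folio);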
[akpm@linux-foundation.org: comment fixes, per David and akpm] [v-songbaohua@oppo.com: lock the folio to avoid race] Link: https://lkml.kernel.org/r/20240622032002.53033-1-21cnbao@gmail.com [hughd@google.com: folio_add_new_anon_rmap() careful __folio_set_swapbacked()] Link: https://lkml.kernel.org/r/f3599b1d-8323-0dc5-e9e0-fdb3cfc3dd5a@google.com Link: https://lkml.kernel.org/r/20240617231137.80726-3-21cnbao@gmail.com Signed-off-by: Barry Song Signed-off-by: Hugh Dickins Suggested-by: David Hildenbrand Tested-by: Shuai Yuan Acked-by: David Hildenbrand Cc: Baolin Wang Cc: Chris Li Cc: "Huang, Ying" Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Ryan Roberts Cc: Suren Baghdasaryan Cc: Yang Shi Cc: Yosry Ahmed Cc: Yu Zhao Signed-off-by: Andrew Morton Signed-off-by: Liu Shixin --- mm/memory.c | 9 +++++++++ mm/rmap.c | 4 +++- mm/swapfile.c | 14 ++++++++++++-- 3 files changed, 24 insertions(+), 3 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index 5c0b6d08b68f..2771c10454e1 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -4295,6 +4295,15 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) if (unlikely(folio != swapcache && swapcache)) { folio_add_new_anon_rmap(folio, vma, address, RMAP_EXCLUSIVE); folio_add_lru_vma(folio, vma); + } else if (!folio_test_anon(folio)) { + /* + * We currently only expect small !anon folios, which are either + * fully exclusive or fully shared. If we ever get large folios + * here, we have to be careful. + */ + VM_WARN_ON_ONCE(folio_test_large(folio)); + VM_WARN_ON_FOLIO(!folio_test_locked(folio), folio); + folio_add_new_anon_rmap(folio, vma, address, rmap_flags); } else { folio_add_anon_rmap_ptes(folio, page, nr_pages, vma, address, rmap_flags); diff --git a/mm/rmap.c b/mm/rmap.c index 32ac6796113a..42621dbffcac 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1407,7 +1407,9 @@ void folio_add_new_anon_rmap(struct folio *folio, struct vm_area_struct *vma, VM_WARN_ON_FOLIO(!exclusive && !folio_test_locked(folio), folio); VM_BUG_ON_VMA(address < vma->vm_start || address + (nr << PAGE_SHIFT) > vma->vm_end, vma); - __folio_set_swapbacked(folio); + + if (!folio_test_swapbacked(folio)) + __folio_set_swapbacked(folio); __folio_set_anon(folio, vma, address, exclusive); if (likely(!folio_test_large(folio))) { diff --git a/mm/swapfile.c b/mm/swapfile.c index 1b2b3bea06c8..cfb768d3ed73 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1992,8 +1992,18 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, VM_BUG_ON_FOLIO(folio_test_writeback(folio), folio); if (pte_swp_exclusive(old_pte)) rmap_flags |= RMAP_EXCLUSIVE; - - folio_add_anon_rmap_pte(folio, page, vma, addr, rmap_flags); + /* + * We currently only expect small !anon folios, which are either + * fully exclusive or fully shared. If we ever get large folios + * here, we have to be careful. 
+ */ + if (!folio_test_anon(folio)) { + VM_WARN_ON_ONCE(folio_test_large(folio)); + VM_WARN_ON_FOLIO(!folio_test_locked(folio), folio); + folio_add_new_anon_rmap(folio, vma, addr, rmap_flags); + } else { + folio_add_anon_rmap_pte(folio, page, vma, addr, rmap_flags); + } } else { /* ksm created a completely new copy */ folio_add_new_anon_rmap(folio, vma, addr, RMAP_EXCLUSIVE); folio_add_lru_vma(folio, vma); -- Gitee
From a0839471de11cbc55e3b8b5fac0821267ad6f3a2 Mon Sep 17 00:00:00 2001 From: Barry Song Date: Mon, 2 Sep 2024 20:55:32 +0800 Subject: [PATCH 22/22] mm: remove folio_test_anon(folio)==false path in __folio_add_anon_rmap() mainline inclusion from mainline-v6.11-rc1 commit 4c1171f1d22484f2419b07ab688548350db521cb category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/IAJ5MT Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=4c1171f1d22484f2419b07ab688548350db521cb --------------------------------
The folio_test_anon(folio)==false case has been relocated to folio_add_new_anon_rmap(). Additionally, four other callers consistently pass anonymous folios.

stack 1: remove_migration_pmd -> folio_add_anon_rmap_pmd -> __folio_add_anon_rmap
stack 2: __split_huge_pmd_locked -> folio_add_anon_rmap_ptes -> __folio_add_anon_rmap
stack 3: remove_migration_pmd -> folio_add_anon_rmap_pmd -> __folio_add_anon_rmap (RMAP_LEVEL_PMD)
stack 4: try_to_merge_one_page -> replace_page -> folio_add_anon_rmap_pte -> __folio_add_anon_rmap

__folio_add_anon_rmap() only needs to handle the case folio_test_anon(folio)==true now, so we can remove the !folio_test_anon(folio) path within it.
Link: https://lkml.kernel.org/r/20240617231137.80726-4-21cnbao@gmail.com Signed-off-by: Barry Song Suggested-by: David Hildenbrand Tested-by: Shuai Yuan Acked-by: David Hildenbrand Cc: Baolin Wang Cc: Chris Li Cc: "Huang, Ying" Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Ryan Roberts Cc: Suren Baghdasaryan Cc: Yang Shi Cc: Yosry Ahmed Cc: Yu Zhao Signed-off-by: Andrew Morton Signed-off-by: Liu Shixin --- mm/rmap.c | 17 +++-------------- 1 file changed, 3 insertions(+), 14 deletions(-) diff --git a/mm/rmap.c b/mm/rmap.c index 42621dbffcac..de385e29916b 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1282,23 +1282,12 @@ static __always_inline void __folio_add_anon_rmap(struct folio *folio, { int i, nr, nr_pmdmapped = 0; + VM_WARN_ON_FOLIO(!folio_test_anon(folio), folio); + nr = __folio_add_rmap(folio, page, nr_pages, level, &nr_pmdmapped); - if (unlikely(!folio_test_anon(folio))) { - VM_WARN_ON_FOLIO(!folio_test_locked(folio), folio); - /* - * For a PTE-mapped large folio, we only know that the single - * PTE is exclusive. Further, __folio_set_anon() might not get - * folio->index right when not given the address of the head - * page. - */ - VM_WARN_ON_FOLIO(folio_test_large(folio) && - level != RMAP_LEVEL_PMD, folio); - __folio_set_anon(folio, vma, address, - !!(flags & RMAP_EXCLUSIVE)); - } else if (likely(!folio_test_ksm(folio))) { + if (likely(!folio_test_ksm(folio))) __page_check_anon_rmap(folio, page, vma, address); - } __folio_mod_stat(folio, nr, nr_pmdmapped); -- Gitee
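Taken together, the last three patches of this series leave one caller-side rule for mapping anonymous pages, recapped below (a summary of the merged do_swap_page()/unuse_pte() logic under the small-folio assumption stated in their commit messages, not an additional change):

	if (!folio_test_anon(folio))
		/* First anon mapping: exclusivity must be passed in explicitly. */
		folio_add_new_anon_rmap(folio, vma, addr, rmap_flags);
	else
		/* Folio is already anon: take the inc-and-test rmap path. */
		folio_add_anon_rmap_pte(folio, page, vma, addr, rmap_flags);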