From 29a398092836a6faa75bedffe53ed7551ae0ac2d Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Fri, 9 Aug 2024 14:48:47 +0300 Subject: [PATCH 1/3] mm: fix endless reclaim on machines with unaccepted memory stable inclusion from stable-v6.6.48 commit b77471c6760a22dcfd72a3528f9a8a19d786c7cb category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/IAWEBV Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=b77471c6760a22dcfd72a3528f9a8a19d786c7cb -------------------------------- [ Upstream commit 807174a93d24c456503692dc3f5af322ee0b640a ] Unaccepted memory is considered unusable free memory, which is not counted as free on the zone watermark check. This causes get_page_from_freelist() to accept more memory to hit the high watermark, but it creates problems in the reclaim path. The reclaim path encounters a failed zone watermark check and attempts to reclaim memory. This is usually successful, but if there is little or no reclaimable memory, it can result in endless reclaim with little to no progress. This can occur early in the boot process, just after start of the init process when the only reclaimable memory is the page cache of the init executable and its libraries. Make unaccepted memory free from watermark check point of view. This way unaccepted memory will never be the trigger of memory reclaim. Accept more memory in the get_page_from_freelist() if needed. Link: https://lkml.kernel.org/r/20240809114854.3745464-2-kirill.shutemov@linux.intel.com Fixes: dcdfdd40fa82 ("mm: Add support for unaccepted memory") Signed-off-by: Kirill A. Shutemov Reported-by: Jianxiong Gao Acked-by: David Hildenbrand Tested-by: Jianxiong Gao Cc: Borislav Petkov Cc: Johannes Weiner Cc: Kirill A. Shutemov Cc: Matthew Wilcox Cc: Mel Gorman Cc: Mike Rapoport (Microsoft) Cc: Tom Lendacky Cc: Vlastimil Babka Cc: [6.5+] Signed-off-by: Andrew Morton Signed-off-by: Sasha Levin Conflicts: mm/page_alloc.c Signed-off-by: Wen Zhiwei --- mm/page_alloc.c | 42 ++++++++++++++++++++---------------------- 1 file changed, 20 insertions(+), 22 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 2a666ff21f60..00c9050a65ff 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -311,7 +311,7 @@ EXPORT_SYMBOL(nr_online_nodes); static bool page_contains_unaccepted(struct page *page, unsigned int order); static void accept_page(struct page *page, unsigned int order); -static bool try_to_accept_memory(struct zone *zone, unsigned int order); +static bool cond_accept_memory(struct zone *zone, unsigned int order); static inline bool has_unaccepted_memory(void); static bool __free_unaccepted(struct page *page); @@ -2983,9 +2983,6 @@ static inline long __zone_watermark_unusable_free(struct zone *z, if (!(alloc_flags & ALLOC_CMA)) unusable_free += zone_page_state(z, NR_FREE_CMA_PAGES); #endif -#ifdef CONFIG_UNACCEPTED_MEMORY - unusable_free += zone_page_state(z, NR_UNACCEPTED); -#endif return unusable_free; } @@ -3283,6 +3280,8 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags, } } + cond_accept_memory(zone, order); + /* * Detect whether the number of free pages is below high * watermark. If so, we will decrease pcp->high and free @@ -3308,10 +3307,8 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags, gfp_mask)) { int ret; - if (has_unaccepted_memory()) { - if (try_to_accept_memory(zone, order)) - goto try_this_zone; - } + if (cond_accept_memory(zone, order)) + goto try_this_zone; #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT /* @@ -3365,10 +3362,8 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags, return page; } else { - if (has_unaccepted_memory()) { - if (try_to_accept_memory(zone, order)) - goto try_this_zone; - } + if (cond_accept_memory(zone, order)) + goto try_this_zone; #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT /* Try again if zone has deferred pages */ @@ -7084,9 +7079,6 @@ static bool try_to_accept_memory_one(struct zone *zone) struct page *page; bool last; - if (list_empty(&zone->unaccepted_pages)) - return false; - spin_lock_irqsave(&zone->lock, flags); page = list_first_entry_or_null(&zone->unaccepted_pages, struct page, lru); @@ -7112,23 +7104,29 @@ static bool try_to_accept_memory_one(struct zone *zone) return true; } -static bool try_to_accept_memory(struct zone *zone, unsigned int order) +static bool cond_accept_memory(struct zone *zone, unsigned int order) { long to_accept; - int ret = false; + bool ret = false; + + if (!has_unaccepted_memory()) + return false; + + if (list_empty(&zone->unaccepted_pages)) + return false; /* How much to accept to get to high watermark? */ to_accept = high_wmark_pages(zone) - (zone_page_state(zone, NR_FREE_PAGES) - - __zone_watermark_unusable_free(zone, order, 0)); + __zone_watermark_unusable_free(zone, order, 0) - + zone_page_state(zone, NR_UNACCEPTED)); - /* Accept at least one page */ - do { + while (to_accept > 0) { if (!try_to_accept_memory_one(zone)) break; ret = true; to_accept -= MAX_ORDER_NR_PAGES; - } while (to_accept > 0); + } return ret; } @@ -7171,7 +7169,7 @@ static void accept_page(struct page *page, unsigned int order) { } -static bool try_to_accept_memory(struct zone *zone, unsigned int order) +static bool cond_accept_memory(struct zone *zone, unsigned int order) { return false; } -- Gitee From 1d21ce275ce2dae9f9d63156f58e1998155cf3cc Mon Sep 17 00:00:00 2001 From: Zi Yan Date: Fri, 9 Aug 2024 10:59:05 -0400 Subject: [PATCH 2/3] mm/numa: no task_numa_fault() call if PMD is changed stable inclusion from stable-v6.6.48 commit c789a78151c113d06d84a3e8db93e156a222d6db category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/IAWEBV Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=c789a78151c113d06d84a3e8db93e156a222d6db -------------------------------- commit fd8c35a92910f4829b7c99841f39b1b952c259d5 upstream. When handling a numa page fault, task_numa_fault() should be called by a process that restores the page table of the faulted folio to avoid duplicated stats counting. Commit c5b5a3dd2c1f ("mm: thp: refactor NUMA fault handling") restructured do_huge_pmd_numa_page() and did not avoid task_numa_fault() call in the second page table check after a numa migration failure. Fix it by making all !pmd_same() return immediately. This issue can cause task_numa_fault() being called more than necessary and lead to unexpected numa balancing results (It is hard to tell whether the issue will cause positive or negative performance impact due to duplicated numa fault counting). Link: https://lkml.kernel.org/r/20240809145906.1513458-3-ziy@nvidia.com Fixes: c5b5a3dd2c1f ("mm: thp: refactor NUMA fault handling") Reported-by: "Huang, Ying" Closes: https://lore.kernel.org/linux-mm/87zfqfw0yw.fsf@yhuang6-desk2.ccr.corp.intel.com/ Signed-off-by: Zi Yan Acked-by: David Hildenbrand Cc: Baolin Wang Cc: "Huang, Ying" Cc: Kefeng Wang Cc: Mel Gorman Cc: Yang Shi Cc: Signed-off-by: Andrew Morton Signed-off-by: Greg Kroah-Hartman Conflicts: mm/huge_memory.c Signed-off-by: Wen Zhiwei --- mm/huge_memory.c | 29 +++++++++++++---------------- 1 file changed, 13 insertions(+), 16 deletions(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index a10f3a6349ea..6bb149f9e617 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2006,7 +2006,7 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf) vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd); if (unlikely(!pmd_same(oldpmd, *vmf->pmd))) { spin_unlock(vmf->ptl); - goto out; + return 0; } pmd = pmd_modify(oldpmd, vma->vm_page_prot); @@ -2048,22 +2048,16 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf) if (migrated) { flags |= TNF_MIGRATED; nid = target_nid; - } else { - flags |= TNF_MIGRATE_FAIL; - vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd); - if (unlikely(!pmd_same(oldpmd, *vmf->pmd))) { - spin_unlock(vmf->ptl); - goto out; - } - goto out_map; - } - -out: - if (nid != NUMA_NO_NODE) task_numa_fault(last_cpupid, nid, HPAGE_PMD_NR, flags); + return 0; + } - return 0; - + flags |= TNF_MIGRATE_FAIL; + vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd); + if (unlikely(!pmd_same(oldpmd, *vmf->pmd))) { + spin_unlock(vmf->ptl); + return 0; + } out_map: /* Restore the PMD */ pmd = pmd_modify(oldpmd, vma->vm_page_prot); @@ -2073,7 +2067,10 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf) set_pmd_at(vma->vm_mm, haddr, vmf->pmd, pmd); update_mmu_cache_pmd(vma, vmf->address, vmf->pmd); spin_unlock(vmf->ptl); - goto out; + + if (nid != NUMA_NO_NODE) + task_numa_fault(last_cpupid, nid, HPAGE_PMD_NR, flags); + return 0; } /* -- Gitee From 3bfa35ccde9e52ca74fed047eb4d89be32dd9e5e Mon Sep 17 00:00:00 2001 From: Zi Yan Date: Fri, 9 Aug 2024 10:59:04 -0400 Subject: [PATCH 3/3] mm/numa: no task_numa_fault() call if PTE is changed stable inclusion from stable-v6.6.48 commit 19b4397c4a15093b8f50ead50164641392f16f77 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/IAWEBV Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=19b4397c4a15093b8f50ead50164641392f16f77 -------------------------------- commit 40b760cfd44566bca791c80e0720d70d75382b84 upstream. When handling a numa page fault, task_numa_fault() should be called by a process that restores the page table of the faulted folio to avoid duplicated stats counting. Commit b99a342d4f11 ("NUMA balancing: reduce TLB flush via delaying mapping on hint page fault") restructured do_numa_page() and did not avoid task_numa_fault() call in the second page table check after a numa migration failure. Fix it by making all !pte_same() return immediately. This issue can cause task_numa_fault() being called more than necessary and lead to unexpected numa balancing results (It is hard to tell whether the issue will cause positive or negative performance impact due to duplicated numa fault counting). Link: https://lkml.kernel.org/r/20240809145906.1513458-2-ziy@nvidia.com Fixes: b99a342d4f11 ("NUMA balancing: reduce TLB flush via delaying mapping on hint page fault") Signed-off-by: Zi Yan Reported-by: "Huang, Ying" Closes: https://lore.kernel.org/linux-mm/87zfqfw0yw.fsf@yhuang6-desk2.ccr.corp.intel.com/ Acked-by: David Hildenbrand Cc: Baolin Wang Cc: Kefeng Wang Cc: Mel Gorman Cc: Yang Shi Cc: Signed-off-by: Andrew Morton Signed-off-by: Greg Kroah-Hartman Conflicts: mm/memory.c Signed-off-by: Wen Zhiwei --- mm/memory.c | 33 ++++++++++++++++----------------- 1 file changed, 16 insertions(+), 17 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index e499695ce5db..379d436c8198 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -5286,7 +5286,7 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf) if (unlikely(!pte_same(old_pte, vmf->orig_pte))) { pte_unmap_unlock(vmf->pte, vmf->ptl); - goto out; + return 0; } pte = pte_modify(old_pte, vma->vm_page_prot); @@ -5346,23 +5346,19 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf) if (migrate_misplaced_folio(folio, vma, target_nid)) { nid = target_nid; flags |= TNF_MIGRATED; - } else { - flags |= TNF_MIGRATE_FAIL; - vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, - vmf->address, &vmf->ptl); - if (unlikely(!vmf->pte)) - goto out; - if (unlikely(!pte_same(ptep_get(vmf->pte), vmf->orig_pte))) { - pte_unmap_unlock(vmf->pte, vmf->ptl); - goto out; - } - goto out_map; + task_numa_fault(last_cpupid, nid, 1, flags); + return 0; } -out: - if (nid != NUMA_NO_NODE) - task_numa_fault(last_cpupid, nid, nr_pages, flags); - return 0; + flags |= TNF_MIGRATE_FAIL; + vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, + vmf->address, &vmf->ptl); + if (unlikely(!vmf->pte)) + return 0; + if (unlikely(!pte_same(ptep_get(vmf->pte), vmf->orig_pte))) { + pte_unmap_unlock(vmf->pte, vmf->ptl); + return 0; + } out_map: /* * Make it present again, depending on how arch implements @@ -5375,7 +5371,10 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf) numa_rebuild_single_mapping(vmf, vma, vmf->address, vmf->pte, writable); pte_unmap_unlock(vmf->pte, vmf->ptl); - goto out; + + if (nid != NUMA_NO_NODE) + task_numa_fault(last_cpupid, nid, 1, flags); + return 0; } static inline vm_fault_t create_huge_pmd(struct vm_fault *vmf) -- Gitee