From b8d29659a1fb14f8f47fe630ca745826b1392a7a Mon Sep 17 00:00:00 2001 From: Wei Xu Date: Thu, 17 Oct 2024 18:15:28 +0000 Subject: [PATCH 1/3] mm/mglru: reset page lru tier bits when activating ANBZ: #27526 commit f1001f3d3b6868998cab73d10fda1a5c99ddf963 upstream. When a folio is activated, lru_gen_add_folio() moves the folio to the youngest generation. But unlike folio_update_gen()/folio_inc_gen(), lru_gen_add_folio() doesn't reset the folio lru tier bits (LRU_REFS_MASK | LRU_REFS_FLAGS). This inconsistency can affect how pages are aged via folio_mark_accessed() (e.g. fd accesses), though no user visible impact related to this has been detected yet. Note that lru_gen_add_folio() cannot clear PG_workingset if the activation is due to workingset refault, otherwise PSI accounting will be skipped. So fix lru_gen_add_folio() to clear the lru tier bits other than PG_workingset when activating a folio, and also clear all the lru tier bits when a folio is activated via folio_activate() in lru_gen_look_around(). 
Link: https://lkml.kernel.org/r/20241017181528.3358821-1-weixugc@google.com Fixes: 018ee47f1489 ("mm: multi-gen LRU: exploit locality in rmap") Signed-off-by: Wei Xu Cc: Axel Rasmussen Cc: Brian Geffon Cc: Jan Alexander Steffens Cc: Suleiman Souhlal Cc: Yu Zhao Signed-off-by: Andrew Morton Signed-off-by: Yuanhe Shu --- include/linux/mm_inline.h | 15 ++++++++++++++- include/linux/mmzone.h | 2 ++ mm/vmscan.c | 8 ++++---- 3 files changed, 20 insertions(+), 5 deletions(-) diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h index 17c2e2c8100c..a14a61652267 100644 --- a/include/linux/mm_inline.h +++ b/include/linux/mm_inline.h @@ -157,6 +157,11 @@ static inline int folio_lru_refs(struct folio *folio) return ((flags & LRU_REFS_MASK) >> LRU_REFS_PGOFF) + workingset; } +static inline void folio_clear_lru_refs(struct folio *folio) +{ + set_mask_bits(&folio->flags, LRU_REFS_MASK | LRU_REFS_FLAGS, 0); +} + static inline int folio_lru_gen(struct folio *folio) { unsigned long flags = READ_ONCE(folio->flags); @@ -224,6 +229,7 @@ static inline bool lru_gen_add_folio(struct lruvec *lruvec, struct folio *folio, { unsigned long seq; unsigned long flags; + unsigned long mask; int gen = folio_lru_gen(folio); int type = folio_is_file_lru(folio); int zone = folio_zonenum(folio); @@ -259,7 +265,14 @@ static inline bool lru_gen_add_folio(struct lruvec *lruvec, struct folio *folio, gen = lru_gen_from_seq(seq); flags = (gen + 1UL) << LRU_GEN_PGOFF; /* see the comment on MIN_NR_GENS about PG_active */ - set_mask_bits(&folio->flags, LRU_GEN_MASK | BIT(PG_active), flags); + mask = LRU_GEN_MASK; + /* + * Don't clear PG_workingset here because it can affect PSI accounting + * if the activation is due to workingset refault. 
+ */ + if (folio_test_active(folio)) + mask |= LRU_REFS_MASK | BIT(PG_referenced) | BIT(PG_active); + set_mask_bits(&folio->flags, mask, flags); lru_gen_update_size(lruvec, folio, -1, gen); /* for folio_rotate_reclaimable() */ diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index e25871b60087..9fc61047ddf8 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -406,6 +406,8 @@ enum { NR_LRU_GEN_CAPS }; +#define LRU_REFS_FLAGS (BIT(PG_referenced) | BIT(PG_workingset)) + #define MIN_LRU_BATCH BITS_PER_LONG #define MAX_LRU_BATCH (MIN_LRU_BATCH * 64) diff --git a/mm/vmscan.c b/mm/vmscan.c index 48f595ce6a89..5c1360ad92f7 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2654,8 +2654,6 @@ static bool should_clear_pmd_young(void) * shorthand helpers ******************************************************************************/ -#define LRU_REFS_FLAGS (BIT(PG_referenced) | BIT(PG_workingset)) - #define DEFINE_MAX_SEQ(lruvec) \ unsigned long max_seq = READ_ONCE((lruvec)->lrugen.max_seq) @@ -4153,8 +4151,10 @@ void lru_gen_look_around(struct page_vma_mapped_walk *pvmw) old_gen = folio_lru_gen(folio); if (old_gen < 0) folio_set_referenced(folio); - else if (old_gen != new_gen) + else if (old_gen != new_gen) { + folio_clear_lru_refs(folio); folio_activate(folio); + } } arch_leave_lazy_mmu_mode(); @@ -4415,7 +4415,7 @@ static bool isolate_folio(struct lruvec *lruvec, struct folio *folio, struct sca /* see the comment on MAX_NR_TIERS */ if (!folio_test_referenced(folio)) - set_mask_bits(&folio->flags, LRU_REFS_MASK | LRU_REFS_FLAGS, 0); + folio_clear_lru_refs(folio); /* for shrink_folio_list() */ folio_clear_reclaim(folio); -- Gitee From 574662e70f88a09706a230a474998a745b514521 Mon Sep 17 00:00:00 2001 From: Yu Zhao Date: Sat, 19 Oct 2024 01:29:38 +0000 Subject: [PATCH 2/3] mm: multi-gen LRU: remove MM_LEAF_OLD and MM_NONLEAF_TOTAL stats ANBZ: #27526 commit ddd6d8e975b171ea3f63a011a75820883ff0d479 upstream. 
Patch series "mm: multi-gen LRU: Have secondary MMUs participate in MM_WALK". Today, the MM_WALK capability causes MGLRU to clear the young bit from PMDs and PTEs during the page table walk before eviction, but MGLRU does not call the clear_young() MMU notifier in this case. By not calling this notifier, the MM walk takes less time/CPU, but it causes pages that are accessed mostly through KVM / secondary MMUs to appear younger than they should be. We do call the clear_young() notifier today, but only when attempting to evict the page, so we end up clearing young/accessed information less frequently for secondary MMUs than for mm PTEs, and therefore they appear younger and are less likely to be evicted. Therefore, memory that is *not* being accessed mostly by KVM will be evicted *more* frequently, worsening performance. ChromeOS observed a tab-open latency regression when enabling MGLRU with a setup that involved running a VM:

    Tab-open latency histogram (ms)
    Version p50     mean    p95     p99     max
    base    1315    1198    2347    3454    10319
    mglru   2559    1311    7399    12060   43758
    fix     1119    926     2470    4211    6947

This series replaces the final non-selftest patches from this series[1], which introduced a similar change (and a new MMU notifier) with KVM optimizations. I'll send a separate series (to Sean and Paolo) for the KVM optimizations. This series also makes proactive reclaim with MGLRU possible for KVM memory. I have verified that this functions correctly with the selftest from [1], but given that that test is a KVM selftest, I'll send it with the rest of the KVM optimizations later. Andrew, let me know if you'd like to take the test now anyway. [1]: https://lore.kernel.org/linux-mm/20240926013506.860253-18-jthoughton@google.com/ This patch (of 2): The removed stats, MM_LEAF_OLD and MM_NONLEAF_TOTAL, are not very helpful and become more complicated to properly compute when adding test/clear_young() notifiers in MGLRU's mm walk. 
Link: https://lkml.kernel.org/r/20241019012940.3656292-1-jthoughton@google.com Link: https://lkml.kernel.org/r/20241019012940.3656292-2-jthoughton@google.com Fixes: bd74fdaea146 ("mm: multi-gen LRU: support page table walks") Signed-off-by: Yu Zhao Signed-off-by: James Houghton Cc: Axel Rasmussen Cc: David Matlack Cc: David Rientjes Cc: David Stevens Cc: Oliver Upton Cc: Paolo Bonzini Cc: Sean Christopherson Cc: Wei Xu Cc: Signed-off-by: Andrew Morton Signed-off-by: Yuanhe Shu --- include/linux/mmzone.h | 2 -- mm/vmscan.c | 13 +++++-------- 2 files changed, 5 insertions(+), 10 deletions(-) diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 9fc61047ddf8..650aef3df2bc 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -465,9 +465,7 @@ struct lru_gen_folio { enum { MM_LEAF_TOTAL, /* total leaf entries */ - MM_LEAF_OLD, /* old leaf entries */ MM_LEAF_YOUNG, /* young leaf entries */ - MM_NONLEAF_TOTAL, /* total non-leaf entries */ MM_NONLEAF_FOUND, /* non-leaf entries found in Bloom filters */ MM_NONLEAF_ADDED, /* non-leaf entries added to Bloom filters */ NR_MM_STATS diff --git a/mm/vmscan.c b/mm/vmscan.c index 5c1360ad92f7..e904a7a17c32 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -3431,7 +3431,6 @@ static bool walk_pte_range(pmd_t *pmd, unsigned long start, unsigned long end, continue; if (!pte_young(ptent)) { - walk->mm_stats[MM_LEAF_OLD]++; continue; } @@ -3590,7 +3589,6 @@ static void walk_pmd_range(pud_t *pud, unsigned long start, unsigned long end, walk->mm_stats[MM_LEAF_TOTAL]++; if (!pmd_young(val)) { - walk->mm_stats[MM_LEAF_OLD]++; continue; } @@ -3602,7 +3600,6 @@ static void walk_pmd_range(pud_t *pud, unsigned long start, unsigned long end, continue; } #endif - walk->mm_stats[MM_NONLEAF_TOTAL]++; if (should_clear_pmd_young()) { if (!pmd_young(val)) @@ -5352,11 +5349,11 @@ static void lru_gen_seq_show_full(struct seq_file *m, struct lruvec *lruvec, for (tier = 0; tier < MAX_NR_TIERS; tier++) { seq_printf(m, " %10d", tier); 
for (type = 0; type < ANON_AND_FILE; type++) { - const char *s = " "; + const char *s = "xxx"; unsigned long n[3] = {}; if (seq == max_seq) { - s = "RT "; + s = "RTx"; n[0] = READ_ONCE(lrugen->avg_refaulted[type][tier]); n[1] = READ_ONCE(lrugen->avg_total[type][tier]); } else if (seq == min_seq[type] || NR_HIST_GENS > 1) { @@ -5375,14 +5372,14 @@ static void lru_gen_seq_show_full(struct seq_file *m, struct lruvec *lruvec, seq_puts(m, " "); for (i = 0; i < NR_MM_STATS; i++) { - const char *s = " "; + const char *s = "xxxx"; unsigned long n = 0; if (seq == max_seq && NR_HIST_GENS == 1) { - s = "LOYNFA"; + s = "TYFA"; n = READ_ONCE(lruvec->mm_state.stats[hist][i]); } else if (seq != max_seq && NR_HIST_GENS > 1) { - s = "loynfa"; + s = "tyfa"; n = READ_ONCE(lruvec->mm_state.stats[hist][i]); } -- Gitee From 3452cbd55c2e16563292edb757abdad619230abe Mon Sep 17 00:00:00 2001 From: Yu Zhao Date: Sat, 19 Oct 2024 01:29:39 +0000 Subject: [PATCH 3/3] mm: multi-gen LRU: use {ptep,pmdp}_clear_young_notify() ANBZ: #27526 commit 1d4832becdc2cdb2cffe2a6050c9d9fd8ff1c58c upstream. When the MM_WALK capability is enabled, memory that is mostly accessed by a VM appears younger than it really is, therefore this memory will be less likely to be evicted. Therefore, the presence of a running VM can significantly increase swap-outs for non-VM memory, regressing the performance for the rest of the system. Fix this regression by always calling {ptep,pmdp}_clear_young_notify() whenever we clear the young bits on PMDs/PTEs. 
[jthoughton@google.com: fix link-time error] Link: https://lkml.kernel.org/r/20241019012940.3656292-3-jthoughton@google.com Fixes: bd74fdaea146 ("mm: multi-gen LRU: support page table walks") Signed-off-by: Yu Zhao Signed-off-by: James Houghton Reported-by: David Stevens Cc: Axel Rasmussen Cc: David Matlack Cc: David Rientjes Cc: Oliver Upton Cc: Paolo Bonzini Cc: Sean Christopherson Cc: Wei Xu Cc: Cc: kernel test robot Signed-off-by: Andrew Morton Signed-off-by: Yuanhe Shu --- include/linux/mmzone.h | 5 ++- mm/rmap.c | 9 ++--- mm/vmscan.c | 86 +++++++++++++++++++++++------------------- 3 files changed, 53 insertions(+), 47 deletions(-) diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 650aef3df2bc..3cade7475a52 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -505,7 +505,7 @@ struct lru_gen_mm_walk { }; void lru_gen_init_lruvec(struct lruvec *lruvec); -void lru_gen_look_around(struct page_vma_mapped_walk *pvmw); +bool lru_gen_look_around(struct page_vma_mapped_walk *pvmw); #ifdef CONFIG_MEMCG @@ -597,8 +597,9 @@ static inline void lru_gen_init_lruvec(struct lruvec *lruvec) { } -static inline void lru_gen_look_around(struct page_vma_mapped_walk *pvmw) +static inline bool lru_gen_look_around(struct page_vma_mapped_walk *pvmw) { + return false; } #ifdef CONFIG_MEMCG diff --git a/mm/rmap.c b/mm/rmap.c index 911b474aa29f..ff6b58031a82 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -837,13 +837,10 @@ static bool folio_referenced_one(struct folio *folio, return false; /* To break the loop */ } - if (pvmw.pte) { - if (lru_gen_enabled() && - pte_young(ptep_get(pvmw.pte))) { - lru_gen_look_around(&pvmw); + if (lru_gen_enabled() && pvmw.pte) { + if (lru_gen_look_around(&pvmw)) referenced++; - } - + } else if (pvmw.pte) { if (ptep_clear_flush_young_notify(vma, address, pvmw.pte)) referenced++; diff --git a/mm/vmscan.c b/mm/vmscan.c index e904a7a17c32..f5e10dbae697 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -61,6 +61,7 @@ #include #endif 
#include +#include #include #include @@ -3325,7 +3326,8 @@ static bool get_next_vma(unsigned long mask, unsigned long size, struct mm_walk return false; } -static unsigned long get_pte_pfn(pte_t pte, struct vm_area_struct *vma, unsigned long addr) +static unsigned long get_pte_pfn(pte_t pte, struct vm_area_struct *vma, unsigned long addr, + struct pglist_data *pgdat) { unsigned long pfn = pte_pfn(pte); @@ -3337,14 +3339,21 @@ static unsigned long get_pte_pfn(pte_t pte, struct vm_area_struct *vma, unsigned if (WARN_ON_ONCE(pte_devmap(pte) || pte_special(pte))) return -1; + if (!pte_young(pte) && !mm_has_notifiers(vma->vm_mm)) + return -1; + if (WARN_ON_ONCE(!pfn_valid(pfn))) return -1; + if (pfn < pgdat->node_start_pfn || pfn >= pgdat_end_pfn(pgdat)) + return -1; + return pfn; } #if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG) -static unsigned long get_pmd_pfn(pmd_t pmd, struct vm_area_struct *vma, unsigned long addr) +static unsigned long get_pmd_pfn(pmd_t pmd, struct vm_area_struct *vma, unsigned long addr, + struct pglist_data *pgdat) { unsigned long pfn = pmd_pfn(pmd); @@ -3356,9 +3365,15 @@ static unsigned long get_pmd_pfn(pmd_t pmd, struct vm_area_struct *vma, unsigned if (WARN_ON_ONCE(pmd_devmap(pmd))) return -1; + if (!pmd_young(pmd) && !mm_has_notifiers(vma->vm_mm)) + return -1; + if (WARN_ON_ONCE(!pfn_valid(pfn))) return -1; + if (pfn < pgdat->node_start_pfn || pfn >= pgdat_end_pfn(pgdat)) + return -1; + return pfn; } #endif @@ -3368,10 +3383,6 @@ static struct folio *get_pfn_folio(unsigned long pfn, struct mem_cgroup *memcg, { struct folio *folio; - /* try to avoid unnecessary memory loads */ - if (pfn < pgdat->node_start_pfn || pfn >= pgdat_end_pfn(pgdat)) - return NULL; - folio = pfn_folio(pfn); if (folio_nid(folio) != pgdat->node_id) return NULL; @@ -3426,20 +3437,16 @@ static bool walk_pte_range(pmd_t *pmd, unsigned long start, unsigned long end, total++; walk->mm_stats[MM_LEAF_TOTAL]++; - pfn = get_pte_pfn(ptent, 
args->vma, addr); + pfn = get_pte_pfn(ptent, args->vma, addr, pgdat); if (pfn == -1) continue; - if (!pte_young(ptent)) { - continue; - } - folio = get_pfn_folio(pfn, memcg, pgdat, walk->can_swap); if (!folio) continue; - if (!ptep_test_and_clear_young(args->vma, addr, pte + i)) - VM_WARN_ON_ONCE(true); + if (!ptep_clear_young_notify(args->vma, addr, pte + i)) + continue; young++; walk->mm_stats[MM_LEAF_YOUNG]++; @@ -3505,21 +3512,24 @@ static void walk_pmd_range_locked(pud_t *pud, unsigned long addr, struct vm_area /* don't round down the first address */ addr = i ? (*first & PMD_MASK) + i * PMD_SIZE : *first; - pfn = get_pmd_pfn(pmd[i], vma, addr); - if (pfn == -1) + if (!pmd_present(pmd[i])) goto next; if (!pmd_trans_huge(pmd[i])) { - if (should_clear_pmd_young()) + if (should_clear_pmd_young() && !mm_has_notifiers(args->mm)) pmdp_test_and_clear_young(vma, addr, pmd + i); goto next; } + pfn = get_pmd_pfn(pmd[i], vma, addr, pgdat); + if (pfn == -1) + goto next; + folio = get_pfn_folio(pfn, memcg, pgdat, walk->can_swap); if (!folio) goto next; - if (!pmdp_test_and_clear_young(vma, addr, pmd + i)) + if (!pmdp_clear_young_notify(vma, addr, pmd + i)) goto next; walk->mm_stats[MM_LEAF_YOUNG]++; @@ -3583,25 +3593,18 @@ static void walk_pmd_range(pud_t *pud, unsigned long start, unsigned long end, #ifdef CONFIG_TRANSPARENT_HUGEPAGE if (pmd_trans_huge(val)) { - unsigned long pfn = pmd_pfn(val); struct pglist_data *pgdat = lruvec_pgdat(walk->lruvec); + unsigned long pfn = get_pmd_pfn(val, vma, addr, pgdat); walk->mm_stats[MM_LEAF_TOTAL]++; - if (!pmd_young(val)) { - continue; - } - - /* try to avoid unnecessary memory loads */ - if (pfn < pgdat->node_start_pfn || pfn >= pgdat_end_pfn(pgdat)) - continue; - - walk_pmd_range_locked(pud, addr, vma, args, bitmap, &first); + if (pfn != -1) + walk_pmd_range_locked(pud, addr, vma, args, bitmap, &first); continue; } #endif - if (should_clear_pmd_young()) { + if (should_clear_pmd_young() && !mm_has_notifiers(args->mm)) { if 
(!pmd_young(val)) continue; @@ -4059,13 +4062,13 @@ static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc) * the PTE table to the Bloom filter. This forms a feedback loop between the * eviction and the aging. */ -void lru_gen_look_around(struct page_vma_mapped_walk *pvmw) +bool lru_gen_look_around(struct page_vma_mapped_walk *pvmw) { int i; unsigned long start; unsigned long end; struct lru_gen_mm_walk *walk; - int young = 0; + int young = 1; pte_t *pte = pvmw->pte; unsigned long addr = pvmw->address; struct vm_area_struct *vma = pvmw->vma; @@ -4080,12 +4083,15 @@ void lru_gen_look_around(struct page_vma_mapped_walk *pvmw) lockdep_assert_held(pvmw->ptl); VM_WARN_ON_ONCE_FOLIO(folio_test_lru(folio), folio); + if (!ptep_clear_young_notify(vma, addr, pte)) + return false; + if (spin_is_contended(pvmw->ptl)) - return; + return true; /* exclude special VMAs containing anon pages from COW */ if (vma->vm_flags & VM_SPECIAL) - return; + return true; /* avoid taking the LRU lock under the PTL when possible */ walk = current->reclaim_state ? 
current->reclaim_state->mm_walk : NULL; @@ -4093,6 +4099,9 @@ void lru_gen_look_around(struct page_vma_mapped_walk *pvmw) start = max(addr & PMD_MASK, vma->vm_start); end = min(addr | ~PMD_MASK, vma->vm_end - 1) + 1; + if (end - start == PAGE_SIZE) + return true; + if (end - start > MIN_LRU_BATCH * PAGE_SIZE) { if (addr - start < MIN_LRU_BATCH * PAGE_SIZE / 2) end = start + MIN_LRU_BATCH * PAGE_SIZE; @@ -4106,7 +4115,7 @@ void lru_gen_look_around(struct page_vma_mapped_walk *pvmw) /* folio_update_gen() requires stable folio_memcg() */ if (!mem_cgroup_trylock_pages(memcg)) - return; + return true; arch_enter_lazy_mmu_mode(); @@ -4116,19 +4125,16 @@ void lru_gen_look_around(struct page_vma_mapped_walk *pvmw) unsigned long pfn; pte_t ptent = ptep_get(pte + i); - pfn = get_pte_pfn(ptent, vma, addr); + pfn = get_pte_pfn(ptent, vma, addr, pgdat); if (pfn == -1) continue; - if (!pte_young(ptent)) - continue; - folio = get_pfn_folio(pfn, memcg, pgdat, can_swap); if (!folio) continue; - if (!ptep_test_and_clear_young(vma, addr, pte + i)) - VM_WARN_ON_ONCE(true); + if (!ptep_clear_young_notify(vma, addr, pte + i)) + continue; young++; @@ -4160,6 +4166,8 @@ void lru_gen_look_around(struct page_vma_mapped_walk *pvmw) /* feedback from rmap walkers to page table walkers */ if (suitable_to_scan(i, young)) update_bloom_filter(lruvec, max_seq, pvmw->pmd); + + return true; } /****************************************************************************** -- Gitee