diff --git a/mm/internal.h b/mm/internal.h index a266a08e0831e1df82aa4ec539ae1c84b2305181..d02db3e9eda902ca59ce1be942bf07a175029f86 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -592,6 +592,56 @@ extern long faultin_vma_page_range(struct vm_area_struct *vma, bool write, int *locked); extern bool mlock_future_ok(struct mm_struct *mm, unsigned long flags, unsigned long bytes); + +/* + * NOTE: This function can't tell whether the folio is "fully mapped" in the + * range. + * "fully mapped" means all the pages of folio is associated with the page + * table of range while this function just check whether the folio range is + * within the range [start, end). Funcation caller nees to do page table + * check if it cares about the page table association. + * + * Typical usage (like mlock or madvise) is: + * Caller knows at least 1 page of folio is associated with page table of VMA + * and the range [start, end) is intersect with the VMA range. Caller wants + * to know whether the folio is fully associated with the range. It calls + * this function to check whether the folio is in the range first. Then checks + * the page table to know whether the folio is fully mapped to the range. + */ +static inline bool +folio_within_range(struct folio *folio, struct vm_area_struct *vma, + unsigned long start, unsigned long end) +{ + pgoff_t pgoff, addr; + unsigned long vma_pglen = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT; + + VM_WARN_ON_FOLIO(folio_test_ksm(folio), folio); + if (start > end) + return false; + + if (start < vma->vm_start) + start = vma->vm_start; + + if (end > vma->vm_end) + end = vma->vm_end; + + pgoff = folio_pgoff(folio); + + /* if folio start address is not in vma range */ + if (!in_range(pgoff, vma->vm_pgoff, vma_pglen)) + return false; + + addr = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); + + return !(addr < start || end - addr < folio_size(folio)); +} + +static inline bool +folio_within_vma(struct folio *folio, struct vm_area_struct *vma) +{ + return folio_within_range(folio, vma, vma->vm_start, vma->vm_end); +} + /* * mlock_vma_folio() and munlock_vma_folio(): * should be called with vma's mmap_lock held for read or write, @@ -600,14 +650,10 @@ extern bool mlock_future_ok(struct mm_struct *mm, unsigned long flags, * mlock is usually called at the end of page_add_*_rmap(), munlock at * the end of page_remove_rmap(); but new anon folios are managed by * folio_add_lru_vma() calling mlock_new_folio(). - * - * @compound is used to include pmd mappings of THPs, but filter out - * pte mappings of THPs, which cannot be consistently counted: a pte - * mapping of the THP head cannot be distinguished by the page alone. */ void mlock_folio(struct folio *folio); static inline void mlock_vma_folio(struct folio *folio, - struct vm_area_struct *vma, bool compound) + struct vm_area_struct *vma) { /* * The VM_SPECIAL check here serves two purposes. @@ -617,17 +663,24 @@ static inline void mlock_vma_folio(struct folio *folio, * file->f_op->mmap() is using vm_insert_page(s), when VM_LOCKED may * still be set while VM_SPECIAL bits are added: so ignore it then. */ - if (unlikely((vma->vm_flags & (VM_LOCKED|VM_SPECIAL)) == VM_LOCKED) && - (compound || !folio_test_large(folio))) + if (unlikely((vma->vm_flags & (VM_LOCKED|VM_SPECIAL)) == VM_LOCKED)) mlock_folio(folio); } void munlock_folio(struct folio *folio); static inline void munlock_vma_folio(struct folio *folio, - struct vm_area_struct *vma, bool compound) + struct vm_area_struct *vma) { - if (unlikely(vma->vm_flags & VM_LOCKED) && - (compound || !folio_test_large(folio))) + /* + * munlock if the function is called. Ideally, we should only + * do munlock if any page of folio is unmapped from VMA and + * cause folio not fully mapped to VMA. + * + * But it's not easy to confirm that's the situation. So we + * always munlock the folio and page reclaim will correct it + * if it's wrong. + */ + if (unlikely(vma->vm_flags & VM_LOCKED)) munlock_folio(folio); } diff --git a/mm/mlock.c b/mm/mlock.c index 06bdfab83b58af92f0abd43d068567de52d7f57d..f79d8262c1a0f2ddac75f17c607c17e40111ed9b 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -305,6 +305,62 @@ void munlock_folio(struct folio *folio) local_unlock(&mlock_fbatch.lock); } +static inline unsigned int folio_mlock_step(struct folio *folio, + pte_t *pte, unsigned long addr, unsigned long end) +{ + unsigned int count, i, nr = folio_nr_pages(folio); + unsigned long pfn = folio_pfn(folio); + pte_t ptent = ptep_get(pte); + + if (!folio_test_large(folio)) + return 1; + + count = pfn + nr - pte_pfn(ptent); + count = min_t(unsigned int, count, (end - addr) >> PAGE_SHIFT); + + for (i = 0; i < count; i++, pte++) { + pte_t entry = ptep_get(pte); + + if (!pte_present(entry)) + break; + if (pte_pfn(entry) - pfn >= nr) + break; + } + + return i; +} + +static inline bool allow_mlock_munlock(struct folio *folio, + struct vm_area_struct *vma, unsigned long start, + unsigned long end, unsigned int step) +{ + /* + * For unlock, allow munlock large folio which is partially + * mapped to VMA. As it's possible that large folio is + * mlocked and VMA is split later. + * + * During memory pressure, such kind of large folio can + * be split. And the pages are not in VM_LOCKed VMA + * can be reclaimed. + */ + if (!(vma->vm_flags & VM_LOCKED)) + return true; + + /* folio_within_range() cannot take KSM, but any small folio is OK */ + if (!folio_test_large(folio)) + return true; + + /* folio not in range [start, end), skip mlock */ + if (!folio_within_range(folio, vma, start, end)) + return false; + + /* folio is not fully mapped, skip mlock */ + if (step != folio_nr_pages(folio)) + return false; + + return true; +} + static int mlock_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, struct mm_walk *walk) @@ -314,6 +370,8 @@ static int mlock_pte_range(pmd_t *pmd, unsigned long addr, pte_t *start_pte, *pte; pte_t ptent; struct folio *folio; + unsigned int step = 1; + unsigned long start = addr; ptl = pmd_trans_huge_lock(pmd, vma); if (ptl) { @@ -334,6 +392,7 @@ static int mlock_pte_range(pmd_t *pmd, unsigned long addr, walk->action = ACTION_AGAIN; return 0; } + for (pte = start_pte; addr != end; pte++, addr += PAGE_SIZE) { ptent = ptep_get(pte); if (!pte_present(ptent)) @@ -341,12 +400,19 @@ static int mlock_pte_range(pmd_t *pmd, unsigned long addr, folio = vm_normal_folio(vma, addr, ptent); if (!folio || folio_is_zone_device(folio)) continue; - if (folio_test_large(folio)) - continue; + + step = folio_mlock_step(folio, pte, addr, end); + if (!allow_mlock_munlock(folio, vma, start, end, step)) + goto next_entry; + if (vma->vm_flags & VM_LOCKED) mlock_folio(folio); else munlock_folio(folio); + +next_entry: + pte += step - 1; + addr += (step - 1) << PAGE_SHIFT; } pte_unmap(start_pte); out: diff --git a/mm/rmap.c b/mm/rmap.c index 93ea81fe51800e494861d1c7cc41ed521faf98d1..770f5d25046a199c949502f008d14ca39aa456c0 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -798,6 +798,7 @@ struct folio_referenced_arg { unsigned long vm_flags; struct mem_cgroup *memcg; }; + /* * arg: folio_referenced_arg will be passed */ @@ -807,17 +808,33 @@ static bool folio_referenced_one(struct folio *folio, struct folio_referenced_arg *pra = arg; DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address, 0); int referenced = 0; + unsigned long start = address, ptes = 0; while (page_vma_mapped_walk(&pvmw)) { address = pvmw.address; - if ((vma->vm_flags & VM_LOCKED) && - (!folio_test_large(folio) || !pvmw.pte)) { - /* Restore the mlock which got missed */ - mlock_vma_folio(folio, vma, !pvmw.pte); - page_vma_mapped_walk_done(&pvmw); - pra->vm_flags |= VM_LOCKED; - return false; /* To break the loop */ + if (vma->vm_flags & VM_LOCKED) { + if (!folio_test_large(folio) || !pvmw.pte) { + /* Restore the mlock which got missed */ + mlock_vma_folio(folio, vma); + page_vma_mapped_walk_done(&pvmw); + pra->vm_flags |= VM_LOCKED; + return false; /* To break the loop */ + } + /* + * For large folio fully mapped to VMA, will + * be handled after the pvmw loop. + * + * For large folio cross VMA boundaries, it's + * expected to be picked by page reclaim. But + * should skip reference of pages which are in + * the range of VM_LOCKED vma. As page reclaim + * should just count the reference of pages out + * the range of VM_LOCKED vma. + */ + ptes++; + pra->mapcount--; + continue; } if (pvmw.pte) { @@ -842,6 +859,23 @@ static bool folio_referenced_one(struct folio *folio, pra->mapcount--; } + if ((vma->vm_flags & VM_LOCKED) && + folio_test_large(folio) && + folio_within_vma(folio, vma)) { + unsigned long s_align, e_align; + + s_align = ALIGN_DOWN(start, PMD_SIZE); + e_align = ALIGN_DOWN(start + folio_size(folio) - 1, PMD_SIZE); + + /* folio doesn't cross page table boundary and fully mapped */ + if ((s_align == e_align) && (ptes == folio_nr_pages(folio))) { + /* Restore the mlock which got missed */ + mlock_vma_folio(folio, vma); + pra->vm_flags |= VM_LOCKED; + return false; /* To break the loop */ + } + } + if (referenced) folio_clear_idle(folio); if (folio_test_clear_young(folio)) @@ -1253,7 +1287,14 @@ void page_add_anon_rmap(struct page *page, struct vm_area_struct *vma, __page_check_anon_rmap(folio, page, vma, address); } - mlock_vma_folio(folio, vma, compound); + /* + * For large folio, only mlock it if it's fully mapped to VMA. It's + * not easy to check whether the large folio is fully mapped to VMA + * here. Only mlock normal 4K folio and leave page reclaim to handle + * large folio. + */ + if (!folio_test_large(folio)) + mlock_vma_folio(folio, vma); } /** @@ -1352,7 +1393,9 @@ void folio_add_file_rmap_range(struct folio *folio, struct page *page, if (nr) __lruvec_stat_mod_folio(folio, NR_FILE_MAPPED, nr); - mlock_vma_folio(folio, vma, compound); + /* See comments in page_add_anon_rmap() */ + if (!folio_test_large(folio)) + mlock_vma_folio(folio, vma); } /** @@ -1463,7 +1506,7 @@ void page_remove_rmap(struct page *page, struct vm_area_struct *vma, * it's only reliable while mapped. */ - munlock_vma_folio(folio, vma, compound); + munlock_vma_folio(folio, vma); } /* @@ -1528,7 +1571,8 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma, if (!(flags & TTU_IGNORE_MLOCK) && (vma->vm_flags & VM_LOCKED)) { /* Restore the mlock which got missed */ - mlock_vma_folio(folio, vma, false); + if (!folio_test_large(folio)) + mlock_vma_folio(folio, vma); page_vma_mapped_walk_done(&pvmw); ret = false; break;