diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 26a60ff9cfed85e2ee09119fbcec0ca023d886b6..d2dceb45746ca16cd07dfe4e46b11ad92ab103bb 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -364,6 +364,10 @@ static inline void mapping_clear_large_folios(struct address_space *mapping) */ static inline bool mapping_large_folio_support(struct address_space *mapping) { + /* AS_LARGE_FOLIO_SUPPORT is only reasonable for pagecache folios */ + VM_WARN_ONCE((unsigned long)mapping & PAGE_MAPPING_ANON, + "Anonymous mapping always supports large folio"); + return IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && test_bit(AS_LARGE_FOLIO_SUPPORT, &mapping->flags); } diff --git a/mm/huge_memory.c b/mm/huge_memory.c index eddb7984610de624f9cb031dd265692eaecfff80..a6f08718663031422bda8231fc23fbbfa36ffa45 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -3169,30 +3169,36 @@ int split_huge_page_to_list_to_order(struct page *page, struct list_head *list, if (new_order >= folio_order(folio)) return -EINVAL; - /* Cannot split anonymous THP to order-1 */ - if (new_order == 1 && folio_test_anon(folio)) { - VM_WARN_ONCE(1, "Cannot split to order-1 folio"); - return -EINVAL; - } - - if (new_order) { - /* Only swapping a whole PMD-mapped folio is supported */ - if (folio_test_swapcache(folio)) + if (folio_test_anon(folio)) { + /* order-1 is not supported for anonymous THP. */ + if (new_order == 1) { + VM_WARN_ONCE(1, "Cannot split to order-1 folio"); return -EINVAL; + } + } else if (new_order) { /* Split shmem folio to non-zero order not supported */ if (shmem_mapping(folio->mapping)) { VM_WARN_ONCE(1, "Cannot split shmem folio to non-0 order"); return -EINVAL; } - /* No split if the file system does not support large folio */ - if (!mapping_large_folio_support(folio->mapping)) { + /* + * No split if the file system does not support large folio. + * Note that we might still have THPs in such mappings due to + * CONFIG_READ_ONLY_THP_FOR_FS. But in that case, the mapping + * does not actually support large folios properly. + */ + if (IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) && + !mapping_large_folio_support(folio->mapping)) { VM_WARN_ONCE(1, "Cannot split file folio to non-0 order"); return -EINVAL; } } + /* Only swapping a whole PMD-mapped folio is supported */ + if (folio_test_swapcache(folio) && new_order) + return -EINVAL; is_hzp = is_huge_zero_page(&folio->page); if (is_hzp) { diff --git a/mm/memcontrol.c b/mm/memcontrol.c index d97410e3ec0e5e3a0ded231202b691cc49eb449d..fcf08f3dc53fe1abcb86a94a4ce719159c27ed07 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -8606,8 +8606,7 @@ void __mem_cgroup_uncharge_folios(struct folio_batch *folios) * @new: Replacement folio. * * Charge @new as a replacement folio for @old. @old will - * be uncharged upon free. This is only used by the page cache - * (in replace_page_cache_folio()). + * be uncharged upon free. * * Both folios must be locked, @new->mapping must be set up. */ diff --git a/mm/memory.c b/mm/memory.c index a8f0df59aca1eb6e7d33a5e27a697332adb730f0..fe10342687d028b7cc6f55e0cf48611e0e88aabc 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -4609,8 +4609,9 @@ vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page) if (!thp_vma_suitable_order(vma, haddr, PMD_ORDER)) return ret; - if (page != &folio->page || folio_order(folio) != HPAGE_PMD_ORDER) + if (folio_order(folio) != HPAGE_PMD_ORDER) return ret; + page = &folio->page; /* * Just backoff if any subpage of a THP is corrupted otherwise @@ -5082,17 +5083,57 @@ int numa_migrate_prep(struct folio *folio, struct vm_area_struct *vma, } static void numa_rebuild_single_mapping(struct vm_fault *vmf, struct vm_area_struct *vma, + unsigned long fault_addr, pte_t *fault_pte, bool writable) { pte_t pte, old_pte; - old_pte = ptep_modify_prot_start(vma, vmf->address, vmf->pte); + old_pte = ptep_modify_prot_start(vma, fault_addr, fault_pte); pte = pte_modify(old_pte, vma->vm_page_prot); pte = pte_mkyoung(pte); if (writable) pte = pte_mkwrite(pte, vma); - ptep_modify_prot_commit(vma, vmf->address, vmf->pte, old_pte, pte); - update_mmu_cache_range(vmf, vma, vmf->address, vmf->pte, 1); + ptep_modify_prot_commit(vma, fault_addr, fault_pte, old_pte, pte); + update_mmu_cache_range(vmf, vma, fault_addr, fault_pte, 1); +} + +static void numa_rebuild_large_mapping(struct vm_fault *vmf, struct vm_area_struct *vma, + struct folio *folio, pte_t fault_pte, + bool ignore_writable, bool pte_write_upgrade) +{ + int nr = pte_pfn(fault_pte) - folio_pfn(folio); + unsigned long start, end, addr = vmf->address; + unsigned long addr_start = addr - (nr << PAGE_SHIFT); + unsigned long pt_start = ALIGN_DOWN(addr, PMD_SIZE); + pte_t *start_ptep; + + /* Stay within the VMA and within the page table. */ + start = max3(addr_start, pt_start, vma->vm_start); + end = min3(addr_start + folio_size(folio), pt_start + PMD_SIZE, + vma->vm_end); + start_ptep = vmf->pte - ((addr - start) >> PAGE_SHIFT); + + /* Restore all PTEs' mapping of the large folio */ + for (addr = start; addr != end; start_ptep++, addr += PAGE_SIZE) { + pte_t ptent = ptep_get(start_ptep); + bool writable = false; + + if (!pte_present(ptent) || !pte_protnone(ptent)) + continue; + + if (pfn_folio(pte_pfn(ptent)) != folio) + continue; + + if (!ignore_writable) { + ptent = pte_modify(ptent, vma->vm_page_prot); + writable = pte_write(ptent); + if (!writable && pte_write_upgrade && + can_change_pte_writable(vma, addr, ptent)) + writable = true; + } + + numa_rebuild_single_mapping(vmf, vma, addr, start_ptep, writable); + } } static vm_fault_t do_numa_page(struct vm_fault *vmf) @@ -5100,11 +5141,12 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf) struct vm_area_struct *vma = vmf->vma; struct folio *folio = NULL; int nid = NUMA_NO_NODE; - bool writable = false; + bool writable = false, ignore_writable = false; + bool pte_write_upgrade = vma_wants_manual_pte_write_upgrade(vma); int last_cpupid; int target_nid; pte_t pte, old_pte; - int flags = 0; + int flags = 0, nr_pages; /* * The pte cannot be used safely until we verify, while holding the page @@ -5126,7 +5168,7 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf) * is only valid while holding the PT lock. */ writable = pte_write(pte); - if (!writable && vma_wants_manual_pte_write_upgrade(vma) && + if (!writable && pte_write_upgrade && can_change_pte_writable(vma, vmf->address, pte)) writable = true; @@ -5134,10 +5176,6 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf) if (!folio || folio_is_zone_device(folio)) goto out_map; - /* TODO: handle PTE-mapped THP */ - if (folio_test_large(folio)) - goto out_map; - /* * Avoid grouping on RO pages in general. RO pages shouldn't hurt as * much anyway since they can be in shared cache state. This misses @@ -5157,6 +5195,7 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf) flags |= TNF_SHARED; nid = folio_nid(folio); + nr_pages = folio_nr_pages(folio); /* * For memory tiering mode, cpupid of slow memory page is used * to record page access time. So use default value. @@ -5173,6 +5212,7 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf) } pte_unmap_unlock(vmf->pte, vmf->ptl); writable = false; + ignore_writable = true; /* Migrate to the requested node */ if (migrate_misplaced_folio(folio, vma, target_nid)) { @@ -5193,14 +5233,19 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf) out: if (nid != NUMA_NO_NODE) - task_numa_fault(last_cpupid, nid, 1, flags); + task_numa_fault(last_cpupid, nid, nr_pages, flags); return 0; out_map: /* * Make it present again, depending on how arch implements * non-accessible ptes, some can allow access by kernel mode. */ - numa_rebuild_single_mapping(vmf, vma, writable); + if (folio && folio_test_large(folio)) + numa_rebuild_large_mapping(vmf, vma, folio, pte, ignore_writable, + pte_write_upgrade); + else + numa_rebuild_single_mapping(vmf, vma, vmf->address, vmf->pte, + writable); pte_unmap_unlock(vmf->pte, vmf->ptl); goto out; } diff --git a/mm/migrate.c b/mm/migrate.c index 141509ec9a485a2785f51005043c08db579cfa6e..78c5b4aaf60d355375399c8f26a7e66234eab119 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1655,7 +1655,12 @@ static int migrate_pages_batch(struct list_head *from, /* * The rare folio on the deferred split list should - * be split now. It should not count as a failure. + * be split now. It should not count as a failure: + * but increment nr_failed because, without doing so, + * migrate_pages() may report success with (split but + * unmigrated) pages still on its fromlist; whereas it + * always reports success when its fromlist is empty. + * * Only check it without removing it from the list. * Since the folio can be on deferred_split_scan() * local list and removing it can cause the local list @@ -1670,6 +1675,7 @@ static int migrate_pages_batch(struct list_head *from, if (nr_pages > 2 && !list_empty(&folio->_deferred_list)) { if (try_split_folio(folio, split_folios) == 0) { + nr_failed++; stats->nr_thp_split += is_thp; stats->nr_split++; continue; diff --git a/mm/mprotect.c b/mm/mprotect.c index f121c46f6e4c43aa131e72a2b37d63eff7d7c37c..b360577be4f8819637af6a13a4a8da3989d30dff 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -129,7 +129,8 @@ static long change_pte_range(struct mmu_gather *tlb, /* Also skip shared copy-on-write pages */ if (is_cow_mapping(vma->vm_flags) && - folio_ref_count(folio) != 1) + (folio_maybe_dma_pinned(folio) || + folio_likely_mapped_shared(folio))) continue; /* diff --git a/mm/rmap.c b/mm/rmap.c index 27f8881be2ad416b5b8bbfb89052f791a38d3690..d13003244e6af1f3bff3ee5727155d660accbe95 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1473,6 +1473,7 @@ static __always_inline void __folio_remove_rmap(struct folio *folio, { atomic_t *mapped = &folio->_nr_pages_mapped; int last, nr = 0, nr_pmdmapped = 0; + bool partially_mapped = false; enum node_stat_item idx; __folio_rmap_sanity_checks(folio, page, nr_pages, level); @@ -1489,6 +1490,8 @@ static __always_inline void __folio_remove_rmap(struct folio *folio, if (last) nr++; } while (page++, --nr_pages > 0); + + partially_mapped = nr && atomic_read(mapped); break; case RMAP_LEVEL_PMD: last = atomic_add_negative(-1, &folio->_entire_mapcount); @@ -1505,6 +1508,8 @@ static __always_inline void __folio_remove_rmap(struct folio *folio, nr = 0; } } + + partially_mapped = nr < nr_pmdmapped; break; } @@ -1525,10 +1530,12 @@ static __always_inline void __folio_remove_rmap(struct folio *folio, * Queue anon large folio for deferred split if at least one * page of the folio is unmapped and at least one page * is still mapped. + * + * Check partially_mapped first to ensure it is a large folio. */ - if (folio_test_large(folio) && folio_test_anon(folio)) - if (level == RMAP_LEVEL_PTE || nr < nr_pmdmapped) - deferred_split_folio(folio); + if (folio_test_anon(folio) && partially_mapped && + list_empty(&folio->_deferred_list)) + deferred_split_folio(folio); } /* diff --git a/mm/shmem.c b/mm/shmem.c index cf27e1785f8024b7ce5fca0026c4b8b7aab658b0..079f47192bdb4f04ee6555dcf74bad9be76cbbac 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1769,7 +1769,7 @@ static int shmem_replace_folio(struct folio **foliop, gfp_t gfp, xa_lock_irq(&swap_mapping->i_pages); error = shmem_replace_entry(swap_mapping, swap_index, old, new); if (!error) { - mem_cgroup_migrate(old, new); + mem_cgroup_replace_folio(old, new); __lruvec_stat_mod_folio(new, NR_FILE_PAGES, 1); __lruvec_stat_mod_folio(new, NR_SHMEM, 1); shmem_reliable_folio_add(new, 1);