diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index b72196ed5a76ffb7c3cb65251043495c3f4ac845..07948fe59b9d98fb7051818178d583c2d17d3fca 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -45,12 +45,6 @@ __flush_tlb_range(vma, addr, end, PUD_SIZE, false, 1) #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ -static inline bool arch_thp_swp_supported(void) -{ - return !system_supports_mte(); -} -#define arch_thp_swp_supported arch_thp_swp_supported - /* * Outside of a few very special situations (e.g. hibernation), we always * use broadcast TLB invalidation instructions, therefore a spurious page @@ -1095,12 +1089,7 @@ static inline pmd_t pmdp_establish(struct vm_area_struct *vma, #ifdef CONFIG_ARM64_MTE #define __HAVE_ARCH_PREPARE_TO_SWAP -static inline int arch_prepare_to_swap(struct page *page) -{ - if (system_supports_mte()) - return mte_save_tags(page); - return 0; -} +extern int arch_prepare_to_swap(struct folio *folio); #define __HAVE_ARCH_SWAP_INVALIDATE static inline void arch_swap_invalidate_page(int type, pgoff_t offset) @@ -1116,11 +1105,7 @@ static inline void arch_swap_invalidate_area(int type) } #define __HAVE_ARCH_SWAP_RESTORE -static inline void arch_swap_restore(swp_entry_t entry, struct folio *folio) -{ - if (system_supports_mte()) - mte_restore_tags(entry, &folio->page); -} +extern void arch_swap_restore(swp_entry_t entry, struct folio *folio); #endif /* CONFIG_ARM64_MTE */ diff --git a/arch/arm64/mm/contpte.c b/arch/arm64/mm/contpte.c index 16788f07716d58a1b92221c1ce781c77d025bc33..1b64b4c3f8bf8af49d53d1d835ad9eaa3dda5407 100644 --- a/arch/arm64/mm/contpte.c +++ b/arch/arm64/mm/contpte.c @@ -135,7 +135,7 @@ void __contpte_try_fold(struct mm_struct *mm, unsigned long addr, pte = pte_mkcont(pte); contpte_convert(mm, addr, orig_ptep, pte); } -EXPORT_SYMBOL(__contpte_try_fold); +EXPORT_SYMBOL_GPL(__contpte_try_fold); void __contpte_try_unfold(struct mm_struct *mm, unsigned long addr, pte_t 
*ptep, pte_t pte) @@ -150,7 +150,7 @@ void __contpte_try_unfold(struct mm_struct *mm, unsigned long addr, pte = pte_mknoncont(pte); contpte_convert(mm, addr, ptep, pte); } -EXPORT_SYMBOL(__contpte_try_unfold); +EXPORT_SYMBOL_GPL(__contpte_try_unfold); pte_t contpte_ptep_get(pte_t *ptep, pte_t orig_pte) { @@ -178,21 +178,25 @@ pte_t contpte_ptep_get(pte_t *ptep, pte_t orig_pte) return orig_pte; } -EXPORT_SYMBOL(contpte_ptep_get); +EXPORT_SYMBOL_GPL(contpte_ptep_get); pte_t contpte_ptep_get_lockless(pte_t *orig_ptep) { /* - * Gather access/dirty bits, which may be populated in any of the ptes - * of the contig range. We may not be holding the PTL, so any contiguous - * range may be unfolded/modified/refolded under our feet. Therefore we - * ensure we read a _consistent_ contpte range by checking that all ptes - * in the range are valid and have CONT_PTE set, that all pfns are - * contiguous and that all pgprots are the same (ignoring access/dirty). - * If we find a pte that is not consistent, then we must be racing with - * an update so start again. If the target pte does not have CONT_PTE - * set then that is considered consistent on its own because it is not - * part of a contpte range. + * The ptep_get_lockless() API requires us to read and return *orig_ptep + * so that it is self-consistent, without the PTL held, so we may be + * racing with other threads modifying the pte. Usually a READ_ONCE() + * would suffice, but for the contpte case, we also need to gather the + * access and dirty bits from across all ptes in the contiguous block, + * and we can't read all of those neighbouring ptes atomically, so any + * contiguous range may be unfolded/modified/refolded under our feet. + * Therefore we ensure we read a _consistent_ contpte range by checking + * that all ptes in the range are valid and have CONT_PTE set, that all + * pfns are contiguous and that all pgprots are the same (ignoring + * access/dirty). 
If we find a pte that is not consistent, then we must + * be racing with an update so start again. If the target pte does not + * have CONT_PTE set then that is considered consistent on its own + * because it is not part of a contpte range. */ pgprot_t orig_prot; @@ -231,7 +235,7 @@ pte_t contpte_ptep_get_lockless(pte_t *orig_ptep) return orig_pte; } -EXPORT_SYMBOL(contpte_ptep_get_lockless); +EXPORT_SYMBOL_GPL(contpte_ptep_get_lockless); void contpte_set_ptes(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte, unsigned int nr) @@ -274,7 +278,7 @@ void contpte_set_ptes(struct mm_struct *mm, unsigned long addr, } while (addr != end); } -EXPORT_SYMBOL(contpte_set_ptes); +EXPORT_SYMBOL_GPL(contpte_set_ptes); void contpte_clear_full_ptes(struct mm_struct *mm, unsigned long addr, pte_t *ptep, unsigned int nr, int full) @@ -282,7 +286,7 @@ void contpte_clear_full_ptes(struct mm_struct *mm, unsigned long addr, contpte_try_unfold_partial(mm, addr, ptep, nr); __clear_full_ptes(mm, addr, ptep, nr, full); } -EXPORT_SYMBOL(contpte_clear_full_ptes); +EXPORT_SYMBOL_GPL(contpte_clear_full_ptes); pte_t contpte_get_and_clear_full_ptes(struct mm_struct *mm, unsigned long addr, pte_t *ptep, @@ -291,7 +295,7 @@ pte_t contpte_get_and_clear_full_ptes(struct mm_struct *mm, contpte_try_unfold_partial(mm, addr, ptep, nr); return __get_and_clear_full_ptes(mm, addr, ptep, nr, full); } -EXPORT_SYMBOL(contpte_get_and_clear_full_ptes); +EXPORT_SYMBOL_GPL(contpte_get_and_clear_full_ptes); int contpte_ptep_test_and_clear_young(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) @@ -316,7 +320,7 @@ int contpte_ptep_test_and_clear_young(struct vm_area_struct *vma, return young; } -EXPORT_SYMBOL(contpte_ptep_test_and_clear_young); +EXPORT_SYMBOL_GPL(contpte_ptep_test_and_clear_young); int contpte_ptep_clear_flush_young(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) @@ -337,7 +341,7 @@ int contpte_ptep_clear_flush_young(struct vm_area_struct *vma, return young; } 
-EXPORT_SYMBOL(contpte_ptep_clear_flush_young); +EXPORT_SYMBOL_GPL(contpte_ptep_clear_flush_young); void contpte_wrprotect_ptes(struct mm_struct *mm, unsigned long addr, pte_t *ptep, unsigned int nr) @@ -355,7 +359,7 @@ void contpte_wrprotect_ptes(struct mm_struct *mm, unsigned long addr, contpte_try_unfold_partial(mm, addr, ptep, nr); __wrprotect_ptes(mm, addr, ptep, nr); } -EXPORT_SYMBOL(contpte_wrprotect_ptes); +EXPORT_SYMBOL_GPL(contpte_wrprotect_ptes); int contpte_ptep_set_access_flags(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, @@ -401,4 +405,4 @@ int contpte_ptep_set_access_flags(struct vm_area_struct *vma, return 1; } -EXPORT_SYMBOL(contpte_ptep_set_access_flags); +EXPORT_SYMBOL_GPL(contpte_ptep_set_access_flags); diff --git a/arch/arm64/mm/mteswap.c b/arch/arm64/mm/mteswap.c index a31833e3ddc544c88abe1ef4a8ef6d292cd2bd8d..63e8d72f202a3b2848e4cf1e57d424e30d65f768 100644 --- a/arch/arm64/mm/mteswap.c +++ b/arch/arm64/mm/mteswap.c @@ -68,6 +68,13 @@ void mte_invalidate_tags(int type, pgoff_t offset) mte_free_tag_storage(tags); } +static inline void __mte_invalidate_tags(struct page *page) +{ + swp_entry_t entry = page_swap_entry(page); + + mte_invalidate_tags(swp_type(entry), swp_offset(entry)); +} + void mte_invalidate_tags_area(int type) { swp_entry_t entry = swp_entry(type, 0); @@ -83,3 +90,41 @@ void mte_invalidate_tags_area(int type) } xa_unlock(&mte_pages); } + +int arch_prepare_to_swap(struct folio *folio) +{ + long i, nr; + int err; + + if (!system_supports_mte()) + return 0; + + nr = folio_nr_pages(folio); + + for (i = 0; i < nr; i++) { + err = mte_save_tags(folio_page(folio, i)); + if (err) + goto out; + } + return 0; + +out: + while (i--) + __mte_invalidate_tags(folio_page(folio, i)); + return err; +} + +void arch_swap_restore(swp_entry_t entry, struct folio *folio) +{ + long i, nr; + + if (!system_supports_mte()) + return; + + nr = folio_nr_pages(folio); + + for (i = 0; i < nr; i++) { + mte_restore_tags(entry, folio_page(folio, 
i)); + entry.val++; + } +} diff --git a/fs/proc/etmem_swap.c b/fs/proc/etmem_swap.c index b4a35da9ac3d1851c782e72272150830790b83d8..20ac09d67d332185767dccef5025342136b158df 100644 --- a/fs/proc/etmem_swap.c +++ b/fs/proc/etmem_swap.c @@ -72,7 +72,7 @@ static ssize_t swap_pages_write(struct file *file, const char __user *buf, } if (!list_empty(&pagelist)) - reclaim_pages(&pagelist); + reclaim_pages(&pagelist, false); ret = count; kfree(data_ptr_res); diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 885888e38e26954f6a8acf8522551cdecafbe2c4..abf2340a2d18a646fa37c14c86d63a9fb74b1d71 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -266,10 +266,11 @@ unsigned long thp_get_unmapped_area(struct file *filp, unsigned long addr, void folio_prep_large_rmappable(struct folio *folio); bool can_split_folio(struct folio *folio, int *pextra_pins); -int split_huge_page_to_list(struct page *page, struct list_head *list); +int split_huge_page_to_list_to_order(struct page *page, struct list_head *list, + unsigned int new_order); static inline int split_huge_page(struct page *page) { - return split_huge_page_to_list(page, NULL); + return split_huge_page_to_list_to_order(page, NULL, 0); } void deferred_split_folio(struct folio *folio); @@ -423,7 +424,8 @@ can_split_folio(struct folio *folio, int *pextra_pins) return false; } static inline int -split_huge_page_to_list(struct page *page, struct list_head *list) +split_huge_page_to_list_to_order(struct page *page, struct list_head *list, + unsigned int new_order) { return 0; } @@ -520,27 +522,18 @@ static inline bool thp_migration_supported(void) } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ -static inline int split_folio_to_list(struct folio *folio, - struct list_head *list) +static inline int split_folio_to_list_to_order(struct folio *folio, + struct list_head *list, int new_order) { - return split_huge_page_to_list(&folio->page, list); + return split_huge_page_to_list_to_order(&folio->page, list, 
new_order); } -static inline int split_folio(struct folio *folio) +static inline int split_folio_to_order(struct folio *folio, int new_order) { - return split_folio_to_list(folio, NULL); + return split_folio_to_list_to_order(folio, NULL, new_order); } -/* - * archs that select ARCH_WANTS_THP_SWAP but don't support THP_SWP due to - * limitations in the implementation like arm64 MTE can override this to - * false - */ -#ifndef arch_thp_swp_supported -static inline bool arch_thp_swp_supported(void) -{ - return true; -} -#endif +#define split_folio_to_list(f, l) split_folio_to_list_to_order(f, l, 0) +#define split_folio(f) split_folio_to_order(f, 0) #endif /* _LINUX_HUGE_MM_H */ diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 53adaf4e39be9c2e066c8b6868404d645e3fcf96..8c199fe368c2e7849e061d2872173161cdce0232 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -1278,7 +1278,7 @@ static inline void memcg_memory_event_mm(struct mm_struct *mm, rcu_read_unlock(); } -void split_page_memcg(struct page *head, unsigned int nr); +void split_page_memcg(struct page *head, int old_order, int new_order); unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order, gfp_t gfp_mask, @@ -1742,7 +1742,7 @@ void count_memcg_event_mm(struct mm_struct *mm, enum vm_event_item idx) { } -static inline void split_page_memcg(struct page *head, unsigned int nr) +static inline void split_page_memcg(struct page *head, int old_order, int new_order) { } diff --git a/include/linux/mm.h b/include/linux/mm.h index 83519e1cfc956013731f74f51e13d23393c741c4..f86fd573a4a144596535673eaf694fbd4bd8a478 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2160,21 +2160,49 @@ static inline size_t folio_size(struct folio *folio) } /** - * folio_estimated_sharers - Estimate the number of sharers of a folio. + * folio_likely_mapped_shared - Estimate if the folio is mapped into the page + * tables of more than one MM * @folio: The folio. 
* - * folio_estimated_sharers() aims to serve as a function to efficiently - * estimate the number of processes sharing a folio. This is done by - * looking at the precise mapcount of the first subpage in the folio, and - * assuming the other subpages are the same. This may not be true for large - * folios. If you want exact mapcounts for exact calculations, look at - * page_mapcount() or folio_total_mapcount(). + * This function checks if the folio is currently mapped into more than one + * MM ("mapped shared"), or if the folio is only mapped into a single MM + * ("mapped exclusively"). * - * Return: The estimated number of processes sharing a folio. + * As precise information is not easily available for all folios, this function + * estimates the number of MMs ("sharers") that are currently mapping a folio + * using the number of times the first page of the folio is currently mapped + * into page tables. + * + * For small anonymous folios (except KSM folios) and anonymous hugetlb folios, + * the return value will be exactly correct, because they can only be mapped + * at most once into an MM, and they cannot be partially mapped. + * + * For other folios, the result can be fuzzy: + * #. For partially-mappable large folios (THP), the return value can wrongly + * indicate "mapped exclusively" (false negative) when the folio is + * only partially mapped into at least one MM. + * #. For pagecache folios (including hugetlb), the return value can wrongly + * indicate "mapped shared" (false positive) when two VMAs in the same MM + * cover the same file range. + * #. For (small) KSM folios, the return value can wrongly indicate "mapped + * shared" (false positive), when the folio is mapped multiple times into + * the same MM. + * + * Further, this function only considers current page table mappings that + * are tracked using the folio mapcount(s). + * + * This function does not consider: + * #. 
If the folio might get mapped in the (near) future (e.g., swapcache, + * pagecache, temporary unmapping for migration). + * #. If the folio is mapped differently (VM_PFNMAP). + * #. If hugetlb page table sharing applies. Callers might want to check + * hugetlb_pmd_shared(). + * + * Return: Whether the folio is estimated to be mapped into more than one MM. */ -static inline int folio_estimated_sharers(struct folio *folio) +static inline bool folio_likely_mapped_shared(struct folio *folio) { - return page_mapcount(folio_page(folio, 0)); + return page_mapcount(folio_page(folio, 0)) > 1; } #ifndef HAVE_ARCH_MAKE_PAGE_ACCESSIBLE diff --git a/include/linux/page_owner.h b/include/linux/page_owner.h index 119a0c9d2a8b50a97db218d880d4ac869c08de7d..debdc25f08b93559600e5e0b9e4afd5161ad365b 100644 --- a/include/linux/page_owner.h +++ b/include/linux/page_owner.h @@ -11,7 +11,8 @@ extern struct page_ext_operations page_owner_ops; extern void __reset_page_owner(struct page *page, unsigned short order); extern void __set_page_owner(struct page *page, unsigned short order, gfp_t gfp_mask); -extern void __split_page_owner(struct page *page, unsigned int nr); +extern void __split_page_owner(struct page *page, int old_order, + int new_order); extern void __folio_copy_owner(struct folio *newfolio, struct folio *old); extern void __set_page_owner_migrate_reason(struct page *page, int reason); extern void __dump_page_owner(const struct page *page); @@ -31,10 +32,11 @@ static inline void set_page_owner(struct page *page, __set_page_owner(page, order, gfp_mask); } -static inline void split_page_owner(struct page *page, unsigned int nr) +static inline void split_page_owner(struct page *page, int old_order, + int new_order) { if (static_branch_unlikely(&page_owner_inited)) - __split_page_owner(page, nr); + __split_page_owner(page, old_order, new_order); } static inline void folio_copy_owner(struct folio *newfolio, struct folio *old) { @@ -56,11 +58,11 @@ static inline void 
reset_page_owner(struct page *page, unsigned short order) { } static inline void set_page_owner(struct page *page, - unsigned int order, gfp_t gfp_mask) + unsigned short order, gfp_t gfp_mask) { } -static inline void split_page_owner(struct page *page, - unsigned short order) +static inline void split_page_owner(struct page *page, int old_order, + int new_order) { } static inline void folio_copy_owner(struct folio *newfolio, struct folio *folio) diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index 87cd26a480383619545c579f09b4c410d490bfac..ecc561d49d5b000219609454c1f969df9ceaec24 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -333,6 +333,36 @@ static inline int ptep_test_and_clear_young(struct vm_area_struct *vma, } #endif +#ifndef mkold_ptes +/** + * mkold_ptes - Mark PTEs that map consecutive pages of the same folio as old. + * @vma: VMA the pages are mapped into. + * @addr: Address the first page is mapped at. + * @ptep: Page table pointer for the first entry. + * @nr: Number of entries to mark old. + * + * May be overridden by the architecture; otherwise, implemented as a simple + * loop over ptep_test_and_clear_young(). + * + * Note that PTE bits in the PTE range besides the PFN can differ. For example, + * some PTEs might be write-protected. + * + * Context: The caller holds the page table lock. The PTEs map consecutive + * pages that belong to the same folio. The PTEs are all in the same PMD. 
+ */ +static inline void mkold_ptes(struct vm_area_struct *vma, unsigned long addr, + pte_t *ptep, unsigned int nr) +{ + for (;;) { + ptep_test_and_clear_young(vma, addr, ptep); + if (--nr == 0) + break; + ptep++; + addr += PAGE_SIZE; + } +} +#endif + #ifndef __HAVE_ARCH_PMDP_TEST_AND_CLEAR_YOUNG #if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG) static inline int pmdp_test_and_clear_young(struct vm_area_struct *vma, @@ -680,6 +710,35 @@ static inline void pte_clear_not_present_full(struct mm_struct *mm, } #endif +#ifndef clear_not_present_full_ptes +/** + * clear_not_present_full_ptes - Clear multiple not present PTEs which are + * consecutive in the pgtable. + * @mm: Address space the ptes represent. + * @addr: Address of the first pte. + * @ptep: Page table pointer for the first entry. + * @nr: Number of entries to clear. + * @full: Whether we are clearing a full mm. + * + * May be overridden by the architecture; otherwise, implemented as a simple + * loop over pte_clear_not_present_full(). + * + * Context: The caller holds the page table lock. The PTEs are all not present. + * The PTEs are all in the same PMD. + */ +static inline void clear_not_present_full_ptes(struct mm_struct *mm, + unsigned long addr, pte_t *ptep, unsigned int nr, int full) +{ + for (;;) { + pte_clear_not_present_full(mm, addr, ptep, full); + if (--nr == 0) + break; + ptep++; + addr += PAGE_SIZE; + } +} +#endif + #ifndef __HAVE_ARCH_PTEP_CLEAR_FLUSH extern pte_t ptep_clear_flush(struct vm_area_struct *vma, unsigned long address, @@ -1024,7 +1083,7 @@ static inline int arch_unmap_one(struct mm_struct *mm, * prototypes must be defined in the arch-specific asm/pgtable.h file. 
*/ #ifndef __HAVE_ARCH_PREPARE_TO_SWAP -static inline int arch_prepare_to_swap(struct page *page) +static inline int arch_prepare_to_swap(struct folio *folio) { return 0; } diff --git a/include/linux/swap.h b/include/linux/swap.h index e818f53cbc31e6a98c8198a99ed3ac6f77c92e15..13cd68b5f5e26e76eba0c01c3d9373cecfe8b944 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -271,7 +271,20 @@ struct swap_cluster_info { }; #define CLUSTER_FLAG_FREE 1 /* This cluster is free */ #define CLUSTER_FLAG_NEXT_NULL 2 /* This cluster has no next cluster */ -#define CLUSTER_FLAG_HUGE 4 /* This cluster is backing a transparent huge page */ + +/* + * The first page in the swap file is the swap header, which is always marked + * bad to prevent it from being allocated as an entry. This also prevents the + * cluster to which it belongs being marked free. Therefore 0 is safe to use as + * a sentinel to indicate next is not valid in percpu_cluster. + */ +#define SWAP_NEXT_INVALID 0 + +#ifdef CONFIG_THP_SWAP +#define SWAP_NR_ORDERS (PMD_ORDER + 1) +#else +#define SWAP_NR_ORDERS 1 +#endif /* * We assign a cluster to each CPU, so each CPU can allocate swap entry from @@ -279,8 +292,7 @@ struct swap_cluster_info { * throughput. 
*/ struct percpu_cluster { - struct swap_cluster_info index; /* Current cluster index */ - unsigned int next; /* Likely next allocation offset */ + unsigned int next[SWAP_NR_ORDERS]; /* Likely next allocation offset */ }; struct swap_cluster_list { @@ -420,8 +432,9 @@ extern unsigned long zone_reclaimable_pages(struct zone *zone); extern unsigned long try_to_free_pages(struct zonelist *zonelist, int order, gfp_t gfp_mask, nodemask_t *mask); extern unsigned int reclaim_folio_list(struct list_head *folio_list, - struct pglist_data *pgdat); -extern unsigned long reclaim_pages(struct list_head *folio_list); + struct pglist_data *pgdat, + bool ignore_references); +extern unsigned long reclaim_pages(struct list_head *folio_list, bool ignore_references); #define MEMCG_RECLAIM_MAY_SWAP (1 << 1) #define MEMCG_RECLAIM_PROACTIVE (1 << 2) @@ -494,7 +507,7 @@ swp_entry_t folio_alloc_swap(struct folio *folio); bool folio_free_swap(struct folio *folio); void put_swap_folio(struct folio *folio, swp_entry_t entry); extern swp_entry_t get_swap_page_of_type(int); -extern int get_swap_pages(int n, swp_entry_t swp_entries[], int entry_size, +extern int get_swap_pages(int n, swp_entry_t swp_entries[], int order, int type); extern int add_swap_count_continuation(swp_entry_t, gfp_t); extern void swap_shmem_alloc(swp_entry_t); @@ -502,7 +515,7 @@ extern int swap_duplicate(swp_entry_t); extern int swapcache_prepare(swp_entry_t); extern void swap_free(swp_entry_t); extern void swapcache_free_entries(swp_entry_t *entries, int n); -extern int free_swap_and_cache(swp_entry_t); +extern void free_swap_and_cache_nr(swp_entry_t entry, int nr); int swap_type_of(dev_t device, sector_t offset); int find_first_swap(dev_t *device); extern unsigned int count_swap_pages(int, int); @@ -560,8 +573,9 @@ static inline void put_swap_device(struct swap_info_struct *si) #define free_pages_and_swap_cache(pages, nr) \ release_pages((pages), (nr)); -/* used to sanity check ptes in zap_pte_range when CONFIG_SWAP=0 
*/ -#define free_swap_and_cache(e) is_pfn_swap_entry(e) +static inline void free_swap_and_cache_nr(swp_entry_t entry, int nr) +{ +} static inline void free_swap_cache(struct page *page) { @@ -629,14 +643,10 @@ static inline int add_swap_extent(struct swap_info_struct *sis, } #endif /* CONFIG_SWAP */ -#ifdef CONFIG_THP_SWAP -extern int split_swap_cluster(swp_entry_t entry); -#else -static inline int split_swap_cluster(swp_entry_t entry) +static inline void free_swap_and_cache(swp_entry_t entry) { - return 0; + free_swap_and_cache_nr(entry, 1); } -#endif #ifdef CONFIG_MEMCG static inline int mem_cgroup_swappiness(struct mem_cgroup *memcg) diff --git a/lib/test_xarray.c b/lib/test_xarray.c index 542926da61a3ed68a4635031d6543f33c9187ffc..5c6953c4822f63ba3467f83fbe432bdb46048a60 100644 --- a/lib/test_xarray.c +++ b/lib/test_xarray.c @@ -1555,9 +1555,11 @@ static void check_split_1(struct xarray *xa, unsigned long index, unsigned int order, unsigned int new_order) { XA_STATE_ORDER(xas, xa, index, new_order); - unsigned int i; + unsigned int i, found; + void *entry; xa_store_order(xa, index, order, xa, GFP_KERNEL); + xa_set_mark(xa, index, XA_MARK_1); xas_split_alloc(&xas, xa, order, GFP_KERNEL); xas_lock(&xas); @@ -1574,6 +1576,16 @@ static void check_split_1(struct xarray *xa, unsigned long index, xa_set_mark(xa, index, XA_MARK_0); XA_BUG_ON(xa, !xa_get_mark(xa, index, XA_MARK_0)); + xas_set_order(&xas, index, 0); + found = 0; + rcu_read_lock(); + xas_for_each_marked(&xas, entry, ULONG_MAX, XA_MARK_1) { + found++; + XA_BUG_ON(xa, xa_is_internal(entry)); + } + rcu_read_unlock(); + XA_BUG_ON(xa, found != 1 << (order - new_order)); + xa_destroy(xa); } diff --git a/lib/xarray.c b/lib/xarray.c index 1c87d871cacfa0c99eb0f512324bd365c30c0666..32d4bac8c94ca13e11f350c6bcfcacc2040d0359 100644 --- a/lib/xarray.c +++ b/lib/xarray.c @@ -970,8 +970,22 @@ static unsigned int node_get_marks(struct xa_node *node, unsigned int offset) return marks; } +static inline void 
node_mark_slots(struct xa_node *node, unsigned int sibs, + xa_mark_t mark) +{ + int i; + + if (sibs == 0) + node_mark_all(node, mark); + else { + for (i = 0; i < XA_CHUNK_SIZE; i += sibs + 1) + node_set_mark(node, i, mark); + } +} + static void node_set_marks(struct xa_node *node, unsigned int offset, - struct xa_node *child, unsigned int marks) + struct xa_node *child, unsigned int sibs, + unsigned int marks) { xa_mark_t mark = XA_MARK_0; @@ -979,7 +993,7 @@ static void node_set_marks(struct xa_node *node, unsigned int offset, if (marks & (1 << (__force unsigned int)mark)) { node_set_mark(node, offset, mark); if (child) - node_mark_all(child, mark); + node_mark_slots(child, sibs, mark); } if (mark == XA_MARK_MAX) break; @@ -1078,7 +1092,8 @@ void xas_split(struct xa_state *xas, void *entry, unsigned int order) child->nr_values = xa_is_value(entry) ? XA_CHUNK_SIZE : 0; RCU_INIT_POINTER(child->parent, node); - node_set_marks(node, offset, child, marks); + node_set_marks(node, offset, child, xas->xa_sibs, + marks); rcu_assign_pointer(node->slots[offset], xa_mk_node(child)); if (xa_is_value(curr)) @@ -1087,7 +1102,7 @@ void xas_split(struct xa_state *xas, void *entry, unsigned int order) } else { unsigned int canon = offset - xas->xa_sibs; - node_set_marks(node, canon, NULL, marks); + node_set_marks(node, canon, NULL, 0, marks); rcu_assign_pointer(node->slots[canon], entry); while (offset > canon) rcu_assign_pointer(node->slots[offset--], diff --git a/mm/damon/paddr.c b/mm/damon/paddr.c index 909db25efb35ee316819048ef9252b21a9989304..21d31580d1a4fd89484d48d96d12730958aea52e 100644 --- a/mm/damon/paddr.c +++ b/mm/damon/paddr.c @@ -250,7 +250,7 @@ static unsigned long damon_pa_pageout(struct damon_region *r, struct damos *s) put_folio: folio_put(folio); } - applied = reclaim_pages(&folio_list); + applied = reclaim_pages(&folio_list, false); cond_resched(); return applied * PAGE_SIZE; } diff --git a/mm/etmem.c b/mm/etmem.c index 
5accf8e0bbdffaa3c41e3aec35a1e4cb62febb1e..a1b2db374fdbe5709740a632ea6b7cc5644ecef2 100644 --- a/mm/etmem.c +++ b/mm/etmem.c @@ -248,7 +248,7 @@ int do_swapcache_reclaim(unsigned long *swapcache_watermark, /* Reclaim all the swapcache we have scanned */ for_each_node_state(nid, N_MEMORY) { cond_resched(); - reclaim_folio_list(&swapcache_list[nid], NODE_DATA(nid)); + reclaim_folio_list(&swapcache_list[nid], NODE_DATA(nid), false); } /* Put pack all the pages that are not reclaimed by shrink_folio_list */ diff --git a/mm/filemap.c b/mm/filemap.c index 058d79840bc7bf38072d71d52e5a3357ac741dce..a274d2c5e232f97671a78af6135737675546e33b 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1940,8 +1940,6 @@ struct folio *__filemap_get_folio(struct address_space *mapping, pgoff_t index, gfp_t alloc_gfp = gfp; err = -ENOMEM; - if (order == 1) - order = 0; if (order > 0) alloc_gfp |= __GFP_NORETRY | __GFP_NOWARN; folio = filemap_alloc_folio(alloc_gfp, order); @@ -3529,7 +3527,15 @@ static vm_fault_t filemap_map_folio_range(struct vm_fault *vmf, if (PageHWPoison(page + count)) goto skip; - (*mmap_miss)++; + /* + * If there are too many folios that are recently evicted + * in a file, they will probably continue to be evicted. + * In such a situation, read-ahead is only a waste of IO. + * Don't decrease mmap_miss in this scenario to make sure + * we can stop read-ahead. 
+ */ + if (!folio_test_workingset(folio)) + (*mmap_miss)++; /* * NOTE: If there're PTE markers, we'll leave them to be @@ -3580,7 +3586,9 @@ static vm_fault_t filemap_map_order0_folio(struct vm_fault *vmf, if (PageHWPoison(page)) return ret; - (*mmap_miss)++; + /* See comment of filemap_map_folio_range() */ + if (!folio_test_workingset(folio)) + (*mmap_miss)++; /* * NOTE: If there're PTE markers, we'll leave them to be diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 6c277b55544ca8dd30732960f88207580faa1c5b..0c61e7c7c2c13cc881455463dbdf35e0711f90e3 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -792,8 +792,10 @@ struct deferred_split *get_deferred_split_queue(struct folio *folio) void folio_prep_large_rmappable(struct folio *folio) { - VM_BUG_ON_FOLIO(folio_order(folio) < 2, folio); - INIT_LIST_HEAD(&folio->_deferred_list); + if (!folio || !folio_test_large(folio)) + return; + if (folio_order(folio) > 1) + INIT_LIST_HEAD(&folio->_deferred_list); folio_set_large_rmappable(folio); } @@ -1831,7 +1833,7 @@ bool madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, * If other processes are mapping this folio, we couldn't discard * the folio unless they all do MADV_FREE so let's skip the folio. */ - if (folio_estimated_sharers(folio) != 1) + if (folio_likely_mapped_shared(folio)) goto out; if (!folio_trylock(folio)) @@ -2594,11 +2596,14 @@ void vma_adjust_trans_huge(struct vm_area_struct *vma, static void unmap_folio(struct folio *folio) { - enum ttu_flags ttu_flags = TTU_RMAP_LOCKED | TTU_SPLIT_HUGE_PMD | - TTU_SYNC | TTU_BATCH_FLUSH; + enum ttu_flags ttu_flags = TTU_RMAP_LOCKED | TTU_SYNC | + TTU_BATCH_FLUSH; VM_BUG_ON_FOLIO(!folio_test_large(folio), folio); + if (folio_test_pmd_mappable(folio)) + ttu_flags |= TTU_SPLIT_HUGE_PMD; + /* * Anon pages need migration entries to preserve them, but file * pages can simply be left unmapped, then faulted back on demand. 
@@ -2632,7 +2637,6 @@ static void lru_add_page_tail(struct page *head, struct page *tail, struct lruvec *lruvec, struct list_head *list) { VM_BUG_ON_PAGE(!PageHead(head), head); - VM_BUG_ON_PAGE(PageCompound(tail), head); VM_BUG_ON_PAGE(PageLRU(tail), head); lockdep_assert_held(&lruvec->lru_lock); @@ -2653,7 +2657,8 @@ static void lru_add_page_tail(struct page *head, struct page *tail, } static void __split_huge_page_tail(struct folio *folio, int tail, - struct lruvec *lruvec, struct list_head *list) + struct lruvec *lruvec, struct list_head *list, + unsigned int new_order) { struct page *head = &folio->page; struct page *page_tail = head + tail; @@ -2723,15 +2728,20 @@ static void __split_huge_page_tail(struct folio *folio, int tail, * which needs correct compound_head(). */ clear_compound_head(page_tail); + if (new_order) { + prep_compound_page(page_tail, new_order); + folio_prep_large_rmappable(new_folio); + } /* Finally unfreeze refcount. Additional reference from page cache. */ - page_ref_unfreeze(page_tail, 1 + (!PageAnon(head) || - PageSwapCache(head))); + page_ref_unfreeze(page_tail, + 1 + ((!folio_test_anon(folio) || folio_test_swapcache(folio)) ? 
+ folio_nr_pages(new_folio) : 0)); - if (page_is_young(head)) - set_page_young(page_tail); - if (page_is_idle(head)) - set_page_idle(page_tail); + if (folio_test_young(folio)) + folio_set_young(new_folio); + if (folio_test_idle(folio)) + folio_set_idle(new_folio); folio_xchg_last_cpupid(new_folio, folio_last_cpupid(folio)); @@ -2744,18 +2754,20 @@ static void __split_huge_page_tail(struct folio *folio, int tail, } static void __split_huge_page(struct page *page, struct list_head *list, - pgoff_t end) + pgoff_t end, unsigned int new_order) { struct folio *folio = page_folio(page); struct page *head = &folio->page; struct lruvec *lruvec; struct address_space *swap_cache = NULL; unsigned long offset = 0; - unsigned int nr = thp_nr_pages(head); int i, nr_dropped = 0; + unsigned int new_nr = 1 << new_order; + int order = folio_order(folio); + unsigned int nr = 1 << order; /* complete memcg works before add pages to LRU */ - split_page_memcg(head, nr); + split_page_memcg(head, order, new_order); if (folio_test_anon(folio) && folio_test_swapcache(folio)) { offset = swp_offset(folio->swap); @@ -2768,8 +2780,8 @@ static void __split_huge_page(struct page *page, struct list_head *list, ClearPageHasHWPoisoned(head); - for (i = nr - 1; i >= 1; i--) { - __split_huge_page_tail(folio, i, lruvec, list); + for (i = nr - new_nr; i >= new_nr; i -= new_nr) { + __split_huge_page_tail(folio, i, lruvec, list, new_order); /* Some pages can be beyond EOF: drop them from page cache */ if (head[i].index >= end) { struct folio *tail = page_folio(head + i); @@ -2790,24 +2802,30 @@ static void __split_huge_page(struct page *page, struct list_head *list, } } - ClearPageCompound(head); + if (!new_order) + ClearPageCompound(head); + else { + struct folio *new_folio = (struct folio *)head; + + folio_set_order(new_folio, new_order); + } unlock_page_lruvec(lruvec); /* Caller disabled irqs, so they are still disabled here */ - split_page_owner(head, nr); + split_page_owner(head, order, new_order); /* 
See comment in __split_huge_page_tail() */ if (PageAnon(head)) { /* Additional pin to swap cache */ if (PageSwapCache(head)) { - page_ref_add(head, 2); + page_ref_add(head, 1 + new_nr); xa_unlock(&swap_cache->i_pages); } else { page_ref_inc(head); } } else { /* Additional pin to page cache */ - page_ref_add(head, 2); + page_ref_add(head, 1 + new_nr); xa_unlock(&head->mapping->i_pages); } local_irq_enable(); @@ -2816,10 +2834,15 @@ static void __split_huge_page(struct page *page, struct list_head *list, shmem_uncharge(head->mapping->host, nr_dropped); remap_page(folio, nr); - if (folio_test_swapcache(folio)) - split_swap_cluster(folio->swap); + /* + * set page to its compound_head when split to non order-0 pages, so + * we can skip unlocking it below, since PG_locked is transferred to + * the compound_head of the page and the caller will unlock it. + */ + if (new_order) + page = compound_head(page); - for (i = 0; i < nr; i++) { + for (i = 0; i < nr; i += new_nr) { struct page *subpage = head + i; if (subpage == page) continue; @@ -2853,29 +2876,48 @@ bool can_split_folio(struct folio *folio, int *pextra_pins) } /* - * This function splits huge page into normal pages. @page can point to any - * subpage of huge page to split. Split doesn't change the position of @page. + * This function splits a large folio into smaller folios of order @new_order. + * @page can point to any page of the large folio to split. The split operation + * does not change the position of @page. + * + * Prerequisites: * - * Only caller must hold pin on the @page, otherwise split fails with -EBUSY. - * The huge page must be locked. + * 1) The caller must hold a reference on the @page's owning folio, also known + * as the large folio. + * + * 2) The large folio must be locked. + * + * 3) The folio must not be pinned. Any unexpected folio references, including + * GUP pins, will result in the folio not getting split; instead, the caller + * will receive an -EBUSY. 
+ * + * 4) @new_order > 1, usually. Splitting to order-1 anonymous folios is not + * supported for non-file-backed folios, because folio->_deferred_list, which + * is used by partially mapped folios, is stored in subpage 2, but an order-1 + * folio only has subpages 0 and 1. File-backed order-1 folios are supported, + * since they do not use _deferred_list. + * + * After splitting, the caller's folio reference will be transferred to @page, + * resulting in a raised refcount of @page after this call. The other pages may + * be freed if they are not mapped. * * If @list is null, tail pages will be added to LRU list, otherwise, to @list. * - * Both head page and tail pages will inherit mapping, flags, and so on from - * the hugepage. + * Pages in @new_order will inherit the mapping, flags, and so on from the + * huge page. * - * GUP pin and PG_locked transferred to @page. Rest subpages can be freed if - * they are not mapped. + * Returns 0 if the huge page was split successfully. * - * Returns 0 if the hugepage is split successfully. - * Returns -EBUSY if the page is pinned or if anon_vma disappeared from under - * us. + * Returns -EBUSY if @page's folio is pinned, or if the anon_vma disappeared + * from under us. 
*/ -int split_huge_page_to_list(struct page *page, struct list_head *list) +int split_huge_page_to_list_to_order(struct page *page, struct list_head *list, + unsigned int new_order) { struct folio *folio = page_folio(page); struct deferred_split *ds_queue = get_deferred_split_queue(folio); - XA_STATE(xas, &folio->mapping->i_pages, folio->index); + /* reset xarray order to new order after split */ + XA_STATE_ORDER(xas, &folio->mapping->i_pages, folio->index, new_order); struct anon_vma *anon_vma = NULL; struct address_space *mapping = NULL; int extra_pins, ret; @@ -2885,6 +2927,34 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); VM_BUG_ON_FOLIO(!folio_test_large(folio), folio); + if (new_order >= folio_order(folio)) + return -EINVAL; + + /* Cannot split anonymous THP to order-1 */ + if (new_order == 1 && folio_test_anon(folio)) { + VM_WARN_ONCE(1, "Cannot split to order-1 folio"); + return -EINVAL; + } + + if (new_order) { + /* Only swapping a whole PMD-mapped folio is supported */ + if (folio_test_swapcache(folio)) + return -EINVAL; + /* Split shmem folio to non-zero order not supported */ + if (shmem_mapping(folio->mapping)) { + VM_WARN_ONCE(1, + "Cannot split shmem folio to non-0 order"); + return -EINVAL; + } + /* No split if the file system does not support large folio */ + if (!mapping_large_folio_support(folio->mapping)) { + VM_WARN_ONCE(1, + "Cannot split file folio to non-0 order"); + return -EINVAL; + } + } + + is_hzp = is_huge_zero_page(&folio->page); if (is_hzp) { pr_warn_ratelimited("Called split_huge_page for huge zero page\n"); @@ -2978,16 +3048,24 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) /* Prevent deferred_split_scan() touching ->_refcount */ spin_lock(&ds_queue->split_queue_lock); if (folio_ref_freeze(folio, 1 + extra_pins)) { - if (!list_empty(&folio->_deferred_list)) { + if (folio_order(folio) > 1 && + !list_empty(&folio->_deferred_list)) 
{ ds_queue->split_queue_len--; - list_del(&folio->_deferred_list); + /* + * Reinitialize page_deferred_list after removing the + * page from the split_queue, otherwise a subsequent + * split will see list corruption when checking the + * page_deferred_list. + */ + list_del_init(&folio->_deferred_list); } spin_unlock(&ds_queue->split_queue_lock); if (mapping) { int nr = folio_nr_pages(folio); xas_split(&xas, folio, folio_order(folio)); - if (folio_test_pmd_mappable(folio)) { + if (folio_test_pmd_mappable(folio) && + new_order < HPAGE_PMD_ORDER) { if (folio_test_swapbacked(folio)) { __lruvec_stat_mod_folio(folio, NR_SHMEM_THPS, -nr); @@ -2999,7 +3077,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) } } - __split_huge_page(page, list, end); + __split_huge_page(page, list, end, new_order); ret = 0; } else { spin_unlock(&ds_queue->split_queue_lock); @@ -3029,6 +3107,9 @@ void folio_undo_large_rmappable(struct folio *folio) struct deferred_split *ds_queue; unsigned long flags; + if (folio_order(folio) <= 1) + return; + /* * At this point, there is no one trying to add the folio to * deferred_list. If folio is not in deferred_list, it's safe @@ -3054,7 +3135,12 @@ void deferred_split_folio(struct folio *folio) #endif unsigned long flags; - VM_BUG_ON_FOLIO(folio_order(folio) < 2, folio); + /* + * Order 1 folios have no space for a deferred list, but we also + * won't waste much memory by not adding them to the deferred list. 
+ */ + if (folio_order(folio) <= 1) + return; /* * The try_to_unmap() in page reclaim path might reach here too, @@ -3220,7 +3306,7 @@ static inline bool vma_not_suitable_for_thp_split(struct vm_area_struct *vma) } static int split_huge_pages_pid(int pid, unsigned long vaddr_start, - unsigned long vaddr_end) + unsigned long vaddr_end, unsigned int new_order) { int ret = 0; struct task_struct *task; @@ -3284,13 +3370,19 @@ static int split_huge_pages_pid(int pid, unsigned long vaddr_start, goto next; total++; - if (!can_split_folio(folio, NULL)) + /* + * For folios with private, split_huge_page_to_list_to_order() + * will try to drop it before split and then check if the folio + * can be split or not. So skip the check here. + */ + if (!folio_test_private(folio) && + !can_split_folio(folio, NULL)) goto next; if (!folio_trylock(folio)) goto next; - if (!split_folio(folio)) + if (!split_folio_to_order(folio, new_order)) split++; folio_unlock(folio); @@ -3308,7 +3400,7 @@ static int split_huge_pages_pid(int pid, unsigned long vaddr_start, } static int split_huge_pages_in_file(const char *file_path, pgoff_t off_start, - pgoff_t off_end) + pgoff_t off_end, unsigned int new_order) { struct filename *file; struct file *candidate; @@ -3347,7 +3439,7 @@ static int split_huge_pages_in_file(const char *file_path, pgoff_t off_start, if (!folio_trylock(folio)) goto next; - if (!split_folio(folio)) + if (!split_folio_to_order(folio, new_order)) split++; folio_unlock(folio); @@ -3372,10 +3464,14 @@ static ssize_t split_huge_pages_write(struct file *file, const char __user *buf, { static DEFINE_MUTEX(split_debug_mutex); ssize_t ret; - /* hold pid, start_vaddr, end_vaddr or file_path, off_start, off_end */ + /* + * hold pid, start_vaddr, end_vaddr, new_order or + * file_path, off_start, off_end, new_order + */ char input_buf[MAX_INPUT_BUF_SZ]; int pid; unsigned long vaddr_start, vaddr_end; + unsigned int new_order = 0; ret = mutex_lock_interruptible(&split_debug_mutex); if (ret) @@ 
-3404,29 +3500,29 @@ static ssize_t split_huge_pages_write(struct file *file, const char __user *buf, goto out; } - ret = sscanf(buf, "0x%lx,0x%lx", &off_start, &off_end); - if (ret != 2) { + ret = sscanf(buf, "0x%lx,0x%lx,%d", &off_start, &off_end, &new_order); + if (ret != 2 && ret != 3) { ret = -EINVAL; goto out; } - ret = split_huge_pages_in_file(file_path, off_start, off_end); + ret = split_huge_pages_in_file(file_path, off_start, off_end, new_order); if (!ret) ret = input_len; goto out; } - ret = sscanf(input_buf, "%d,0x%lx,0x%lx", &pid, &vaddr_start, &vaddr_end); + ret = sscanf(input_buf, "%d,0x%lx,0x%lx,%d", &pid, &vaddr_start, &vaddr_end, &new_order); if (ret == 1 && pid == 1) { split_huge_pages_all(); ret = strlen(input_buf); goto out; - } else if (ret != 3) { + } else if (ret != 3 && ret != 4) { ret = -EINVAL; goto out; } - ret = split_huge_pages_pid(pid, vaddr_start, vaddr_end); + ret = split_huge_pages_pid(pid, vaddr_start, vaddr_end, new_order); if (!ret) ret = strlen(input_buf); out: diff --git a/mm/internal.h b/mm/internal.h index 28085201b863d66c994eba60734621d576d7d0aa..6451747b7160566367dd9a10c218f40ca041bc07 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -11,6 +11,8 @@ #include #include #include +#include +#include #include struct folio_batch; @@ -76,6 +78,20 @@ static inline int folio_nr_pages_mapped(struct folio *folio) return atomic_read(&folio->_nr_pages_mapped) & FOLIO_PAGES_MAPPED; } +/* + * Retrieve the first entry of a folio based on a provided entry within the + * folio. We cannot rely on folio->swap as there is no guarantee that it has + * been initialized. 
Used for calling arch_swap_restore() + */ +static inline swp_entry_t folio_swap(swp_entry_t entry, struct folio *folio) +{ + swp_entry_t swap = { + .val = ALIGN_DOWN(entry.val, folio_nr_pages(folio)), + }; + + return swap; +} + static inline void *folio_raw_mapping(struct folio *folio) { unsigned long mapping = (unsigned long)folio->mapping; @@ -113,6 +129,8 @@ static inline pte_t __pte_batch_clear_ignored(pte_t pte, fpb_t flags) * @flags: Flags to modify the PTE batch semantics. * @any_writable: Optional pointer to indicate whether any entry except the * first one is writable. + * @any_young: Optional pointer to indicate whether any entry except the + * first one is young. * * Detect a PTE batch: consecutive (present) PTEs that map consecutive * pages of the same large folio. @@ -128,16 +146,18 @@ static inline pte_t __pte_batch_clear_ignored(pte_t pte, fpb_t flags) */ static inline int folio_pte_batch(struct folio *folio, unsigned long addr, pte_t *start_ptep, pte_t pte, int max_nr, fpb_t flags, - bool *any_writable) + bool *any_writable, bool *any_young) { unsigned long folio_end_pfn = folio_pfn(folio) + folio_nr_pages(folio); const pte_t *end_ptep = start_ptep + max_nr; pte_t expected_pte, *ptep; - bool writable; + bool writable, young; int nr; if (any_writable) *any_writable = false; + if (any_young) + *any_young = false; VM_WARN_ON_FOLIO(!pte_present(pte), folio); VM_WARN_ON_FOLIO(!folio_test_large(folio) || max_nr < 1, folio); @@ -151,6 +171,8 @@ static inline int folio_pte_batch(struct folio *folio, unsigned long addr, pte = ptep_get(ptep); if (any_writable) writable = !!pte_write(pte); + if (any_young) + young = !!pte_young(pte); pte = __pte_batch_clear_ignored(pte, flags); if (!pte_same(pte, expected_pte)) @@ -166,6 +188,8 @@ static inline int folio_pte_batch(struct folio *folio, unsigned long addr, if (any_writable) *any_writable |= writable; + if (any_young) + *any_young |= young; nr = pte_batch_hint(ptep, pte); expected_pte = 
pte_advance_pfn(expected_pte, nr); @@ -174,6 +198,68 @@ static inline int folio_pte_batch(struct folio *folio, unsigned long addr, return min(ptep - start_ptep, max_nr); } + +/** + * pte_next_swp_offset - Increment the swap entry offset field of a swap pte. + * @pte: The initial pte state; is_swap_pte(pte) must be true and + * non_swap_entry() must be false. + * + * Increments the swap offset, while maintaining all other fields, including + * swap type, and any swp pte bits. The resulting pte is returned. + */ +static inline pte_t pte_next_swp_offset(pte_t pte) +{ + swp_entry_t entry = pte_to_swp_entry(pte); + pte_t new = __swp_entry_to_pte(__swp_entry(swp_type(entry), + (swp_offset(entry) + 1))); + + if (pte_swp_soft_dirty(pte)) + new = pte_swp_mksoft_dirty(new); + if (pte_swp_exclusive(pte)) + new = pte_swp_mkexclusive(new); + if (pte_swp_uffd_wp(pte)) + new = pte_swp_mkuffd_wp(new); + + return new; +} + +/** + * swap_pte_batch - detect a PTE batch for a set of contiguous swap entries + * @start_ptep: Page table pointer for the first entry. + * @max_nr: The maximum number of table entries to consider. + * @pte: Page table entry for the first entry. + * + * Detect a batch of contiguous swap entries: consecutive (non-present) PTEs + * containing swap entries all with consecutive offsets and targeting the same + * swap type, all with matching swp pte bits. + * + * max_nr must be at least one and must be limited by the caller so scanning + * cannot exceed a single page table. + * + * Return: the number of table entries in the batch. 
+ */ +static inline int swap_pte_batch(pte_t *start_ptep, int max_nr, pte_t pte) +{ + pte_t expected_pte = pte_next_swp_offset(pte); + const pte_t *end_ptep = start_ptep + max_nr; + pte_t *ptep = start_ptep + 1; + + VM_WARN_ON(max_nr < 1); + VM_WARN_ON(!is_swap_pte(pte)); + VM_WARN_ON(non_swap_entry(pte_to_swp_entry(pte))); + + while (ptep < end_ptep) { + pte = ptep_get(ptep); + + if (!pte_same(pte, expected_pte)) + break; + + expected_pte = pte_next_swp_offset(expected_pte); + ptep++; + } + + return ptep - start_ptep; +} #endif /* CONFIG_MMU */ void __acct_reclaim_writeback(pg_data_t *pgdat, struct folio *folio, @@ -512,8 +598,7 @@ static inline struct folio *page_rmappable_folio(struct page *page) { struct folio *folio = (struct folio *)page; - if (folio && folio_order(folio) > 1) - folio_prep_large_rmappable(folio); + folio_prep_large_rmappable(folio); return folio; } diff --git a/mm/madvise.c b/mm/madvise.c index a3c509cf2bc93b5644a952ffdd3b4a80a651ef7b..57ad2c766aa6a7ac80f2781563c6fc4b5b29c3d7 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -379,6 +379,8 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd, struct folio *folio = NULL; LIST_HEAD(folio_list); bool pageout_anon_only_filter; + unsigned int batch_count = 0; + int nr; if (fatal_signal_pending(current)) return -EINTR; @@ -409,7 +411,7 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd, folio = pfn_folio(pmd_pfn(orig_pmd)); /* Do not interfere with other mappings of this folio */ - if (folio_estimated_sharers(folio) != 1) + if (folio_likely_mapped_shared(folio)) goto huge_unlock; if (pageout_anon_only_filter && !folio_test_anon(folio)) @@ -429,7 +431,7 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd, return 0; } - if (pmd_young(orig_pmd)) { + if (!pageout && pmd_young(orig_pmd)) { pmdp_invalidate(vma, addr, pmd); orig_pmd = pmd_mkold(orig_pmd); @@ -453,21 +455,33 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd, huge_unlock: spin_unlock(ptl); if (pageout) - 
reclaim_pages(&folio_list); + reclaim_pages(&folio_list, true); return 0; } regular_folio: #endif tlb_change_page_size(tlb, PAGE_SIZE); +restart: start_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); if (!start_pte) return 0; flush_tlb_batched_pending(mm); arch_enter_lazy_mmu_mode(); - for (; addr < end; pte++, addr += PAGE_SIZE) { + for (; addr < end; pte += nr, addr += nr * PAGE_SIZE) { + nr = 1; ptent = ptep_get(pte); + if (++batch_count == SWAP_CLUSTER_MAX) { + batch_count = 0; + if (need_resched()) { + arch_leave_lazy_mmu_mode(); + pte_unmap_unlock(start_pte, ptl); + cond_resched(); + goto restart; + } + } + if (pte_none(ptent)) continue; @@ -479,55 +493,66 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd, continue; /* - * Creating a THP page is expensive so split it only if we - * are sure it's worth. Split it if we are only owner. + * If we encounter a large folio, only split it if it is not + * fully mapped within the range we are operating on. Otherwise + * leave it as is so that it can be swapped out whole. If we + * fail to split a folio, leave it in place and advance to the + * next pte in the range. 
*/ if (folio_test_large(folio)) { - int err; - - if (folio_estimated_sharers(folio) != 1) - break; - if (pageout_anon_only_filter && !folio_test_anon(folio)) - break; - if (!folio_trylock(folio)) - break; - folio_get(folio); - arch_leave_lazy_mmu_mode(); - pte_unmap_unlock(start_pte, ptl); - start_pte = NULL; - err = split_folio(folio); - folio_unlock(folio); - folio_put(folio); - if (err) - break; - start_pte = pte = - pte_offset_map_lock(mm, pmd, addr, &ptl); - if (!start_pte) - break; - arch_enter_lazy_mmu_mode(); - pte--; - addr -= PAGE_SIZE; - continue; + const fpb_t fpb_flags = FPB_IGNORE_DIRTY | + FPB_IGNORE_SOFT_DIRTY; + int max_nr = (end - addr) / PAGE_SIZE; + bool any_young; + + nr = folio_pte_batch(folio, addr, pte, ptent, max_nr, + fpb_flags, NULL, &any_young); + if (any_young) + ptent = pte_mkyoung(ptent); + + if (nr < folio_nr_pages(folio)) { + int err; + + if (folio_likely_mapped_shared(folio)) + continue; + if (pageout_anon_only_filter && !folio_test_anon(folio)) + continue; + if (!folio_trylock(folio)) + continue; + folio_get(folio); + arch_leave_lazy_mmu_mode(); + pte_unmap_unlock(start_pte, ptl); + start_pte = NULL; + err = split_folio(folio); + folio_unlock(folio); + folio_put(folio); + start_pte = pte = + pte_offset_map_lock(mm, pmd, addr, &ptl); + if (!start_pte) + break; + arch_enter_lazy_mmu_mode(); + if (!err) + nr = 0; + continue; + } } /* * Do not interfere with other mappings of this folio and - * non-LRU folio. + * non-LRU folio. If we have a large folio at this point, we + * know it is fully mapped so if its mapcount is the same as its + * number of pages, it must be exclusive. 
*/ - if (!folio_test_lru(folio) || folio_mapcount(folio) != 1) + if (!folio_test_lru(folio) || + folio_mapcount(folio) != folio_nr_pages(folio)) continue; if (pageout_anon_only_filter && !folio_test_anon(folio)) continue; - VM_BUG_ON_FOLIO(folio_test_large(folio), folio); - - if (pte_young(ptent)) { - ptent = ptep_get_and_clear_full(mm, addr, pte, - tlb->fullmm); - ptent = pte_mkold(ptent); - set_pte_at(mm, addr, pte, ptent); - tlb_remove_tlb_entry(tlb, pte, addr); + if (!pageout && pte_young(ptent)) { + mkold_ptes(vma, addr, pte, nr); + tlb_remove_tlb_entries(tlb, pte, nr, addr); } /* @@ -556,7 +581,7 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd, pte_unmap_unlock(start_pte, ptl); } if (pageout) - reclaim_pages(&folio_list); + reclaim_pages(&folio_list, true); cond_resched(); return 0; @@ -655,6 +680,7 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr, struct folio *folio; int nr_swap = 0; unsigned long next; + int nr, max_nr; next = pmd_addr_end(addr, end); if (pmd_trans_huge(*pmd)) @@ -667,7 +693,8 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr, return 0; flush_tlb_batched_pending(mm); arch_enter_lazy_mmu_mode(); - for (; addr != end; pte++, addr += PAGE_SIZE) { + for (; addr != end; pte += nr, addr += PAGE_SIZE * nr) { + nr = 1; ptent = ptep_get(pte); if (pte_none(ptent)) @@ -682,9 +709,11 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr, entry = pte_to_swp_entry(ptent); if (!non_swap_entry(entry)) { - nr_swap--; - free_swap_and_cache(entry); - pte_clear_not_present_full(mm, addr, pte, tlb->fullmm); + max_nr = (end - addr) / PAGE_SIZE; + nr = swap_pte_batch(pte, max_nr, ptent); + nr_swap -= nr; + free_swap_and_cache_nr(entry, nr); + clear_not_present_full_ptes(mm, addr, pte, nr, tlb->fullmm); } else if (is_hwpoison_entry(entry) || is_poisoned_swp_entry(entry)) { pte_clear_not_present_full(mm, addr, pte, tlb->fullmm); @@ -704,7 +733,7 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned 
long addr, if (folio_test_large(folio)) { int err; - if (folio_estimated_sharers(folio) != 1) + if (folio_likely_mapped_shared(folio)) break; if (!folio_trylock(folio)) break; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 70223fc704c0a6685dbe206bdbb6a0c777dea05a..f1cf73835cba6ef592492915f0dfd5403fa8f078 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -3657,22 +3657,24 @@ void obj_cgroup_uncharge(struct obj_cgroup *objcg, size_t size) /* * Because page_memcg(head) is not set on tails, set it now. */ -void split_page_memcg(struct page *head, unsigned int nr) +void split_page_memcg(struct page *head, int old_order, int new_order) { struct folio *folio = page_folio(head); struct mem_cgroup *memcg = folio_memcg(folio); int i; + unsigned int old_nr = 1 << old_order; + unsigned int new_nr = 1 << new_order; if (mem_cgroup_disabled() || !memcg) return; - for (i = 1; i < nr; i++) + for (i = new_nr; i < old_nr; i += new_nr) folio_page(folio, i)->memcg_data = folio->memcg_data; if (folio_memcg_kmem(folio)) - obj_cgroup_get_many(__folio_objcg(folio), nr - 1); + obj_cgroup_get_many(__folio_objcg(folio), old_nr / new_nr - 1); else - css_get_many(&memcg->css, nr - 1); + css_get_many(&memcg->css, old_nr / new_nr - 1); } #ifdef CONFIG_SWAP diff --git a/mm/memory.c b/mm/memory.c index 1bcbc3697a2fe5b145ed257c1f4c23cd59489df0..64e1fd144d9380c02613962e41a42ed22423ed85 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -996,7 +996,7 @@ copy_present_ptes(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma flags |= FPB_IGNORE_SOFT_DIRTY; nr = folio_pte_batch(folio, addr, src_pte, pte, max_nr, flags, - &any_writable); + &any_writable, NULL); folio_ref_add(folio, nr); if (folio_test_anon(folio)) { if (unlikely(folio_try_dup_anon_rmap_ptes(folio, page, @@ -1563,7 +1563,7 @@ static inline int zap_present_ptes(struct mmu_gather *tlb, */ if (unlikely(folio_test_large(folio) && max_nr != 1)) { nr = folio_pte_batch(folio, addr, pte, ptent, max_nr, fpb_flags, - NULL); + NULL, 
NULL); zap_present_folio_ptes(tlb, vma, folio, page, pte, ptent, nr, addr, details, rss, force_flush, @@ -1642,12 +1642,13 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, folio_remove_rmap_pte(folio, page, vma); folio_put(folio); } else if (!non_swap_entry(entry)) { - /* Genuine swap entry, hence a private anon page */ + max_nr = (end - addr) / PAGE_SIZE; + nr = swap_pte_batch(pte, max_nr, ptent); + /* Genuine swap entries, hence private anon pages */ if (!should_zap_cows(details)) continue; - rss[MM_SWAPENTS]--; - if (unlikely(!free_swap_and_cache(entry))) - print_bad_pte(vma, addr, ptent, NULL); + rss[MM_SWAPENTS] -= nr; + free_swap_and_cache_nr(entry, nr); } else if (is_migration_entry(entry)) { folio = pfn_swap_entry_folio(entry); if (!should_zap_folio(details, folio)) @@ -1672,8 +1673,8 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, /* We should have covered all the swap entry types */ WARN_ON_ONCE(1); } - pte_clear_not_present_full(mm, addr, pte, tlb->fullmm); - zap_install_uffd_wp_if_needed(vma, addr, pte, 1, details, ptent); + clear_not_present_full_ptes(mm, addr, pte, nr, tlb->fullmm); + zap_install_uffd_wp_if_needed(vma, addr, pte, nr, details, ptent); } while (pte += nr, addr += PAGE_SIZE * nr, addr != end); add_mm_rss_vec(mm, rss); @@ -4175,7 +4176,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) * when reading from swap. This metadata may be indexed by swap entry * so this must be called before swap_free(). */ - arch_swap_restore(entry, folio); + arch_swap_restore(folio_swap(entry, folio), folio); /* * Remove the swap entry and conditionally try to free up the swapcache. @@ -4338,6 +4339,9 @@ static struct folio *alloc_anon_folio(struct vm_fault *vmf) pte_unmap(pte); + if (!orders) + goto fallback; + /* Try allocating the highest of the remaining orders. 
*/ gfp = vma_thp_gfp_mask(vma); while (orders) { @@ -5040,16 +5044,65 @@ int numa_migrate_prep(struct folio *folio, struct vm_area_struct *vma, return mpol_misplaced(folio, vma, addr); } +static void numa_rebuild_single_mapping(struct vm_fault *vmf, struct vm_area_struct *vma, + unsigned long fault_addr, pte_t *fault_pte, + bool writable) +{ + pte_t pte, old_pte; + + old_pte = ptep_modify_prot_start(vma, fault_addr, fault_pte); + pte = pte_modify(old_pte, vma->vm_page_prot); + pte = pte_mkyoung(pte); + if (writable) + pte = pte_mkwrite(pte, vma); + ptep_modify_prot_commit(vma, fault_addr, fault_pte, old_pte, pte); + update_mmu_cache_range(vmf, vma, fault_addr, fault_pte, 1); +} + +static void numa_rebuild_large_mapping(struct vm_fault *vmf, struct vm_area_struct *vma, + struct folio *folio, pte_t fault_pte, + bool ignore_writable, bool pte_write_upgrade) +{ + int nr = pte_pfn(fault_pte) - folio_pfn(folio); + unsigned long start = max(vmf->address - nr * PAGE_SIZE, vma->vm_start); + unsigned long end = min(vmf->address + (folio_nr_pages(folio) - nr) * PAGE_SIZE, vma->vm_end); + pte_t *start_ptep = vmf->pte - (vmf->address - start) / PAGE_SIZE; + unsigned long addr; + + /* Restore all PTEs' mapping of the large folio */ + for (addr = start; addr != end; start_ptep++, addr += PAGE_SIZE) { + pte_t ptent = ptep_get(start_ptep); + bool writable = false; + + if (!pte_present(ptent) || !pte_protnone(ptent)) + continue; + + if (pfn_folio(pte_pfn(ptent)) != folio) + continue; + + if (!ignore_writable) { + ptent = pte_modify(ptent, vma->vm_page_prot); + writable = pte_write(ptent); + if (!writable && pte_write_upgrade && + can_change_pte_writable(vma, addr, ptent)) + writable = true; + } + + numa_rebuild_single_mapping(vmf, vma, addr, start_ptep, writable); + } +} + static vm_fault_t do_numa_page(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; struct folio *folio = NULL; int nid = NUMA_NO_NODE; - bool writable = false; + bool writable = false, ignore_writable = 
false; + bool pte_write_upgrade = vma_wants_manual_pte_write_upgrade(vma); int last_cpupid; int target_nid; pte_t pte, old_pte; - int flags = 0; + int flags = 0, nr_pages; /* * The "pte" at this point cannot be used safely without @@ -5071,7 +5124,7 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf) * is only valid while holding the PT lock. */ writable = pte_write(pte); - if (!writable && vma_wants_manual_pte_write_upgrade(vma) && + if (!writable && pte_write_upgrade && can_change_pte_writable(vma, vmf->address, pte)) writable = true; @@ -5079,10 +5132,6 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf) if (!folio || folio_is_zone_device(folio)) goto out_map; - /* TODO: handle PTE-mapped THP */ - if (folio_test_large(folio)) - goto out_map; - /* * Avoid grouping on RO pages in general. RO pages shouldn't hurt as * much anyway since they can be in shared cache state. This misses @@ -5098,10 +5147,11 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf) * Flag if the folio is shared between multiple address spaces. This * is later used when determining whether to group tasks together */ - if (folio_estimated_sharers(folio) > 1 && (vma->vm_flags & VM_SHARED)) + if (folio_likely_mapped_shared(folio) && (vma->vm_flags & VM_SHARED)) flags |= TNF_SHARED; nid = folio_nid(folio); + nr_pages = folio_nr_pages(folio); /* * For memory tiering mode, cpupid of slow memory page is used * to record page access time. So use default value. 
@@ -5118,6 +5168,7 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf) } pte_unmap_unlock(vmf->pte, vmf->ptl); writable = false; + ignore_writable = true; /* Migrate to the requested node */ if (migrate_misplaced_folio(folio, vma, target_nid)) { @@ -5138,20 +5189,19 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf) out: if (nid != NUMA_NO_NODE) - task_numa_fault(last_cpupid, nid, 1, flags); + task_numa_fault(last_cpupid, nid, nr_pages, flags); return 0; out_map: /* * Make it present again, depending on how arch implements * non-accessible ptes, some can allow access by kernel mode. */ - old_pte = ptep_modify_prot_start(vma, vmf->address, vmf->pte); - pte = pte_modify(old_pte, vma->vm_page_prot); - pte = pte_mkyoung(pte); - if (writable) - pte = pte_mkwrite(pte, vma); - ptep_modify_prot_commit(vma, vmf->address, vmf->pte, old_pte, pte); - update_mmu_cache_range(vmf, vma, vmf->address, vmf->pte, 1); + if (folio && folio_test_large(folio)) + numa_rebuild_large_mapping(vmf, vma, folio, pte, ignore_writable, + pte_write_upgrade); + else + numa_rebuild_single_mapping(vmf, vma, vmf->address, vmf->pte, + writable); pte_unmap_unlock(vmf->pte, vmf->ptl); goto out; } diff --git a/mm/mempolicy.c b/mm/mempolicy.c index f28f4c277099764612f9ba23c6abbcc3289c38d0..a80f9975190473b49a25441916919d697ecee902 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -608,12 +608,11 @@ static int queue_folios_hugetlb(pte_t *pte, unsigned long hmask, * With MPOL_MF_MOVE, we try to migrate only unshared folios. If it * is shared it is likely not worth migrating. * - * To check if the folio is shared, ideally we want to make sure - * every page is mapped to the same process. Doing that is very - * expensive, so check the estimated mapcount of the folio instead. + * See folio_likely_mapped_shared() on possible imprecision when we + * cannot easily detect if a folio is shared. 
*/ if (flags & (MPOL_MF_MOVE_ALL) || - (flags & MPOL_MF_MOVE && folio_estimated_sharers(folio) == 1 && + (flags & MPOL_MF_MOVE && !folio_likely_mapped_shared(folio) && !hugetlb_pmd_shared(pte))) { if (!isolate_hugetlb(folio, qp->pagelist) && (flags & MPOL_MF_STRICT)) @@ -1040,11 +1039,10 @@ static int migrate_folio_add(struct folio *folio, struct list_head *foliolist, * We try to migrate only unshared folios. If it is shared it * is likely not worth migrating. * - * To check if the folio is shared, ideally we want to make sure - * every page is mapped to the same process. Doing that is very - * expensive, so check the estimated mapcount of the folio instead. + * See folio_likely_mapped_shared() on possible imprecision when we + * cannot easily detect if a folio is shared. */ - if ((flags & MPOL_MF_MOVE_ALL) || folio_estimated_sharers(folio) == 1) { + if ((flags & MPOL_MF_MOVE_ALL) || !folio_likely_mapped_shared(folio)) { if (folio_isolate_lru(folio)) { list_add_tail(&folio->lru, foliolist); node_stat_mod_folio(folio, diff --git a/mm/migrate.c b/mm/migrate.c index cd2a24c6a745591f3217498a21a91fffbfd44d63..141509ec9a485a2785f51005043c08db579cfa6e 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1653,6 +1653,29 @@ static int migrate_pages_batch(struct list_head *from, cond_resched(); + /* + * The rare folio on the deferred split list should + * be split now. It should not count as a failure. + * Only check it without removing it from the list, + * since the folio can be on deferred_split_scan()'s + * local list and removing it can corrupt that local + * list. The folio split process below can handle it + * with the help of folio_ref_freeze(). + * + * nr_pages > 2 is needed to avoid checking order-1 + * page cache folios. They exist, in contrast to + * non-existent order-1 anonymous folios, and do not + * use _deferred_list.
+ */ + if (nr_pages > 2 && + !list_empty(&folio->_deferred_list)) { + if (try_split_folio(folio, split_folios) == 0) { + stats->nr_thp_split += is_thp; + stats->nr_split++; + continue; + } + } + /* * Large folio migration might be unsupported or * the allocation might be failed so we should retry @@ -2576,11 +2599,11 @@ int migrate_misplaced_folio(struct folio *folio, struct vm_area_struct *vma, /* * Don't migrate file folios that are mapped in multiple processes * with execute permissions as they are probably shared libraries. - * To check if the folio is shared, ideally we want to make sure - * every page is mapped to the same process. Doing that is very - * expensive, so check the estimated mapcount of the folio instead. + * + * See folio_likely_mapped_shared() on possible imprecision when we + * cannot easily detect if a folio is shared. */ - if (folio_estimated_sharers(folio) != 1 && folio_is_file_lru(folio) && + if (folio_likely_mapped_shared(folio) && folio_is_file_lru(folio) && (vma->vm_flags & VM_EXEC)) goto out; diff --git a/mm/mprotect.c b/mm/mprotect.c index f121c46f6e4c43aa131e72a2b37d63eff7d7c37c..b360577be4f8819637af6a13a4a8da3989d30dff 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -129,7 +129,8 @@ static long change_pte_range(struct mmu_gather *tlb, /* Also skip shared copy-on-write pages */ if (is_cow_mapping(vma->vm_flags) && - folio_ref_count(folio) != 1) + (folio_maybe_dma_pinned(folio) || + folio_likely_mapped_shared(folio))) continue; /* diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 3da29ddfea1aaa6d045e9f41d24e9e1fdf6ea962..dc932c5837479a147eb1cf9604ea75ac30bb52d4 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -450,10 +450,11 @@ static int dump_task(struct task_struct *p, void *arg) return 0; } - pr_info("[%7d] %5d %5d %8lu %8lu %8ld %8lu %5hd %s\n", + pr_info("[%7d] %5d %5d %8lu %8lu %8lu %8lu %9lu %8ld %8lu %5hd %s\n", task->pid, from_kuid(&init_user_ns, task_uid(task)), task->tgid, task->mm->total_vm, get_mm_rss(task->mm), - 
mm_pgtables_bytes(task->mm), + get_mm_counter(task->mm, MM_ANONPAGES), get_mm_counter(task->mm, MM_FILEPAGES), + get_mm_counter(task->mm, MM_SHMEMPAGES), mm_pgtables_bytes(task->mm), get_mm_counter(task->mm, MM_SWAPENTS), task->signal->oom_score_adj, task->comm); task_unlock(task); @@ -474,7 +475,7 @@ static int dump_task(struct task_struct *p, void *arg) static void dump_tasks(struct oom_control *oc) { pr_info("Tasks state (memory values in pages):\n"); - pr_info("[ pid ] uid tgid total_vm rss pgtables_bytes swapents oom_score_adj name\n"); + pr_info("[ pid ] uid tgid total_vm rss rss_anon rss_file rss_shmem pgtables_bytes swapents oom_score_adj name\n"); if (is_memcg_oom(oc)) { mem_cgroup_scan_tasks(oc->memcg, dump_task, oc); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index e92509d4a5d64f21e61ac52594fb22952479b95c..95bd8f6f7889a2265a753ff911962dfa6bed425e 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1912,10 +1912,14 @@ static void reserve_highatomic_pageblock(struct page *page, struct zone *zone) unsigned long max_managed, flags; /* - * Limit the number reserved to 1 pageblock or roughly 1% of a zone. + * Cap the reservation at roughly 1% of a zone, rounded up to + * whole pageblocks. But if 1% of a zone falls below a + * pageblock size, then don't reserve any pageblocks. * Check is race-prone but harmless.
*/ - max_managed = (zone_managed_pages(zone) / 100) + pageblock_nr_pages; + if ((zone_managed_pages(zone) / 100) < pageblock_nr_pages) + return; + max_managed = ALIGN((zone_managed_pages(zone) / 100), pageblock_nr_pages); if (zone->nr_reserved_highatomic >= max_managed) return; @@ -2656,8 +2660,8 @@ void split_page(struct page *page, unsigned int order) for (i = 1; i < (1 << order); i++) set_page_refcounted(page + i); - split_page_owner(page, 1 << order); - split_page_memcg(page, 1 << order); + split_page_owner(page, order, 0); + split_page_memcg(page, order, 0); } EXPORT_SYMBOL_GPL(split_page); @@ -5050,8 +5054,8 @@ static void *make_alloc_exact(unsigned long addr, unsigned int order, struct page *page = virt_to_page((void *)addr); struct page *last = page + nr; - split_page_owner(page, 1 << order); - split_page_memcg(page, 1 << order); + split_page_owner(page, order, 0); + split_page_memcg(page, order, 0); while (page < --last) set_page_refcounted(last); diff --git a/mm/page_io.c b/mm/page_io.c index af65db0be7310faffa1ab39aa7d599583f0a34c0..ea8d57b9b3ae52507c4ba519ce90833ce75b6970 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -189,7 +189,7 @@ int swap_writepage(struct page *page, struct writeback_control *wbc) * Arch code may have to preserve more data than just the page * contents, e.g. memory tags. 
*/ - ret = arch_prepare_to_swap(&folio->page); + ret = arch_prepare_to_swap(folio); if (ret) { folio_mark_dirty(folio); folio_unlock(folio); diff --git a/mm/page_owner.c b/mm/page_owner.c index e7eba7688881d5abc5b88f4ef71eaa5b9dff7dbc..41993f140f1b35d534acf1e893a538b6be390f66 100644 --- a/mm/page_owner.c +++ b/mm/page_owner.c @@ -215,7 +215,7 @@ void __set_page_owner_migrate_reason(struct page *page, int reason) page_ext_put(page_ext); } -void __split_page_owner(struct page *page, unsigned int nr) +void __split_page_owner(struct page *page, int old_order, int new_order) { int i; struct page_ext *page_ext = page_ext_get(page); @@ -224,9 +224,9 @@ void __split_page_owner(struct page *page, unsigned int nr) if (unlikely(!page_ext)) return; - for (i = 0; i < nr; i++) { + for (i = 0; i < (1 << old_order); i++) { page_owner = get_page_owner(page_ext); - page_owner->order = 0; + page_owner->order = new_order; page_ext = page_ext_next(page_ext); } page_ext_put(page_ext); diff --git a/mm/readahead.c b/mm/readahead.c index 63c7320ba464f50ba5e9571f40e1a650ec2a77cf..689e003951fe72fff2c2230912a8814de78af10a 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -231,6 +231,7 @@ void page_cache_ra_unbounded(struct readahead_control *ractl, */ for (i = 0; i < nr_to_read; i++) { struct folio *folio = xa_load(&mapping->i_pages, index + i); + int ret; if (folio && !xa_is_value(folio)) { /* @@ -250,9 +251,12 @@ void page_cache_ra_unbounded(struct readahead_control *ractl, folio = filemap_alloc_folio(gfp_mask, 0); if (!folio) break; - if (filemap_add_folio(mapping, folio, index + i, - gfp_mask) < 0) { + + ret = filemap_add_folio(mapping, folio, index + i, gfp_mask); + if (ret < 0) { folio_put(folio); + if (ret == -ENOMEM) + break; read_pages(ractl); ractl->_index++; i = ractl->_index + ractl->_nr_pages - index - 1; @@ -493,6 +497,7 @@ void page_cache_ra_order(struct readahead_control *ractl, pgoff_t index = readahead_index(ractl); pgoff_t limit = (i_size_read(mapping->host) - 1) >> 
PAGE_SHIFT; pgoff_t mark = index + ra->size - ra->async_size; + unsigned int nofs; int err = 0; gfp_t gfp = readahead_gfp_mask(mapping); @@ -509,6 +514,8 @@ void page_cache_ra_order(struct readahead_control *ractl, new_order--; } + /* See comment in page_cache_ra_unbounded() */ + nofs = memalloc_nofs_save(); filemap_invalidate_lock_shared(mapping); if (unlikely(!mapping_large_folio_support(mapping))) { @@ -525,9 +532,6 @@ void page_cache_ra_order(struct readahead_control *ractl, /* Don't allocate pages past EOF */ while (index + (1UL << order) - 1 > limit) order--; - /* THP machinery does not support order-1 */ - if (order == 1) - order = 0; err = ra_alloc_folio(ractl, index, mark, order, gfp); if (err) break; @@ -541,6 +545,7 @@ void page_cache_ra_order(struct readahead_control *ractl, read_pages(ractl); filemap_invalidate_unlock_shared(mapping); + memalloc_nofs_restore(nofs); /* * If there were already pages in the page cache, then we may have diff --git a/mm/shmem.c b/mm/shmem.c index 0a39dbdcfb2eda1929794a658ca310a0422a26c6..a7550982a13dd0e3356113390bd60c0e8b3144e3 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1893,7 +1893,7 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index, * Some architectures may have to restore extra metadata to the * folio after reading from swap. 
*/ - arch_swap_restore(swap, folio); + arch_swap_restore(folio_swap(swap, folio), folio); if (shmem_should_replace_folio(folio, gfp)) { error = shmem_replace_folio(&folio, gfp, info, index); diff --git a/mm/swap_slots.c b/mm/swap_slots.c index c7781364fa50ff41b392e6af26913f3e60eaf65e..7af3b93d4c8c8a15bcfbf3adce5270830c2ab9d9 100644 --- a/mm/swap_slots.c +++ b/mm/swap_slots.c @@ -385,7 +385,7 @@ static int refill_swap_slots_cache(struct swap_slots_cache *cache, int type) cache->cur = 0; if (swap_slot_cache_active) cache->nr = get_swap_pages(SWAP_SLOTS_CACHE_SIZE, - cache->slots, 1, type); + cache->slots, 0, type); return cache->nr; } @@ -434,8 +434,8 @@ swp_entry_t folio_alloc_swap(struct folio *folio) if (folio_test_large(folio)) { - if (IS_ENABLED(CONFIG_THP_SWAP) && arch_thp_swp_supported()) - get_swap_pages(1, &entry, folio_nr_pages(folio), type); + if (IS_ENABLED(CONFIG_THP_SWAP)) + get_swap_pages(1, &entry, folio_order(folio), type); goto out; } @@ -467,7 +467,7 @@ swp_entry_t folio_alloc_swap(struct folio *folio) goto out; } - get_swap_pages(1, &entry, 1, type); + get_swap_pages(1, &entry, 0, type); out: if (mem_cgroup_try_charge_swap(folio, entry)) { put_swap_folio(folio, entry); diff --git a/mm/swap_state.c b/mm/swap_state.c index d0636532d1ab7b3ffa65aa7846ddf408578b2e3b..40b84dc47974fc1a03b01529fccd6157e8dc355c 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -72,11 +72,11 @@ void *get_shadow_from_swap_cache(swp_entry_t entry) { struct address_space *address_space = swap_address_space(entry); pgoff_t idx = swp_offset(entry); - struct page *page; + void *shadow; - page = xa_load(&address_space->i_pages, idx); - if (xa_is_value(page)) - return page; + shadow = xa_load(&address_space->i_pages, idx); + if (xa_is_value(shadow)) + return shadow; return NULL; } diff --git a/mm/swapfile.c b/mm/swapfile.c index dff84e2e219fd1b0544fc81487e81aa546f9cb75..ddb50283f2f1d4f76e42744d0859b3067f1728c5 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -129,7 +129,11 @@ 
static inline unsigned char swap_count(unsigned char ent) /* Reclaim the swap entry if swap is getting full*/ #define TTRS_FULL 0x4 -/* returns 1 if swap entry is freed */ +/* + * returns number of pages in the folio that backs the swap entry. If positive, + * the folio was reclaimed. If negative, the folio was not reclaimed. If 0, no + * folio was associated with the swap entry. + */ static int __try_to_reclaim_swap(struct swap_info_struct *si, unsigned long offset, unsigned long flags) { @@ -154,6 +158,7 @@ static int __try_to_reclaim_swap(struct swap_info_struct *si, ret = folio_free_swap(folio); folio_unlock(folio); } + ret = ret ? folio_nr_pages(folio) : -folio_nr_pages(folio); folio_put(folio); return ret; } @@ -272,15 +277,15 @@ static void discard_swap_cluster(struct swap_info_struct *si, #ifdef CONFIG_THP_SWAP #define SWAPFILE_CLUSTER HPAGE_PMD_NR -#define swap_entry_size(size) (size) +#define swap_entry_order(order) (order) #else #define SWAPFILE_CLUSTER 256 /* - * Define swap_entry_size() as constant to let compiler to optimize + * Define swap_entry_order() as constant to let compiler to optimize * out some code if !CONFIG_THP_SWAP */ -#define swap_entry_size(size) 1 +#define swap_entry_order(order) 0 #endif #define LATENCY_LIMIT 256 @@ -342,18 +347,6 @@ static inline void cluster_set_null(struct swap_cluster_info *info) info->data = 0; } -static inline bool cluster_is_huge(struct swap_cluster_info *info) -{ - if (IS_ENABLED(CONFIG_THP_SWAP)) - return info->flags & CLUSTER_FLAG_HUGE; - return false; -} - -static inline void cluster_clear_huge(struct swap_cluster_info *info) -{ - info->flags &= ~CLUSTER_FLAG_HUGE; -} - static inline struct swap_cluster_info *lock_cluster(struct swap_info_struct *si, unsigned long offset) { @@ -557,10 +550,12 @@ static void free_cluster(struct swap_info_struct *si, unsigned long idx) /* * The cluster corresponding to page_nr will be used. 
The cluster will be - * removed from free cluster list and its usage counter will be increased. + * removed from free cluster list and its usage counter will be increased by + * count. */ -static void inc_cluster_info_page(struct swap_info_struct *p, - struct swap_cluster_info *cluster_info, unsigned long page_nr) +static void add_cluster_info_page(struct swap_info_struct *p, + struct swap_cluster_info *cluster_info, unsigned long page_nr, + unsigned long count) { unsigned long idx = page_nr / SWAPFILE_CLUSTER; @@ -569,9 +564,19 @@ static void inc_cluster_info_page(struct swap_info_struct *p, if (cluster_is_free(&cluster_info[idx])) alloc_cluster(p, idx); - VM_BUG_ON(cluster_count(&cluster_info[idx]) >= SWAPFILE_CLUSTER); + VM_BUG_ON(cluster_count(&cluster_info[idx]) + count > SWAPFILE_CLUSTER); cluster_set_count(&cluster_info[idx], - cluster_count(&cluster_info[idx]) + 1); + cluster_count(&cluster_info[idx]) + count); +} + +/* + * The cluster corresponding to page_nr will be used. The cluster will be + * removed from free cluster list and its usage counter will be increased by 1. 
+ */ +static void inc_cluster_info_page(struct swap_info_struct *p, + struct swap_cluster_info *cluster_info, unsigned long page_nr) +{ + add_cluster_info_page(p, cluster_info, page_nr, 1); } /* @@ -601,7 +606,7 @@ static void dec_cluster_info_page(struct swap_info_struct *p, */ static bool scan_swap_map_ssd_cluster_conflict(struct swap_info_struct *si, - unsigned long offset) + unsigned long offset, int order) { struct percpu_cluster *percpu_cluster; bool conflict; @@ -615,27 +620,42 @@ scan_swap_map_ssd_cluster_conflict(struct swap_info_struct *si, return false; percpu_cluster = this_cpu_ptr(si->percpu_cluster); - cluster_set_null(&percpu_cluster->index); + percpu_cluster->next[order] = SWAP_NEXT_INVALID; + return true; +} + +static inline bool swap_range_empty(char *swap_map, unsigned int start, + unsigned int nr_pages) +{ + unsigned int i; + + for (i = 0; i < nr_pages; i++) { + if (swap_map[start + i]) + return false; + } + return true; } /* - * Try to get a swap entry from current cpu's swap entry pool (a cluster). This - * might involve allocating a new cluster for current CPU too. + * Try to get swap entries with specified order from current cpu's swap entry + * pool (a cluster). This might involve allocating a new cluster for current CPU + * too. 
*/ static bool scan_swap_map_try_ssd_cluster(struct swap_info_struct *si, - unsigned long *offset, unsigned long *scan_base) + unsigned long *offset, unsigned long *scan_base, int order) { + unsigned int nr_pages = 1 << order; struct percpu_cluster *cluster; struct swap_cluster_info *ci; - unsigned long tmp, max; + unsigned int tmp, max; new_cluster: cluster = this_cpu_ptr(si->percpu_cluster); - if (cluster_is_null(&cluster->index)) { + tmp = cluster->next[order]; + if (tmp == SWAP_NEXT_INVALID) { if (!cluster_list_empty(&si->free_clusters)) { - cluster->index = si->free_clusters.head; - cluster->next = cluster_next(&cluster->index) * + tmp = cluster_next(&si->free_clusters.head) * SWAPFILE_CLUSTER; } else if (!cluster_list_empty(&si->discard_clusters)) { /* @@ -653,27 +673,27 @@ static bool scan_swap_map_try_ssd_cluster(struct swap_info_struct *si, /* * Other CPUs can use our cluster if they can't find a free cluster, - * check if there is still free entry in the cluster + * check if there is still free entry in the cluster, maintaining + * natural alignment. */ - tmp = cluster->next; - max = min_t(unsigned long, si->max, - (cluster_next(&cluster->index) + 1) * SWAPFILE_CLUSTER); + max = min_t(unsigned long, si->max, ALIGN(tmp + 1, SWAPFILE_CLUSTER)); if (tmp < max) { ci = lock_cluster(si, tmp); while (tmp < max) { - if (!si->swap_map[tmp]) + if (swap_range_empty(si->swap_map, tmp, nr_pages)) break; - tmp++; + tmp += nr_pages; } unlock_cluster(ci); } if (tmp >= max) { - cluster_set_null(&cluster->index); + cluster->next[order] = SWAP_NEXT_INVALID; goto new_cluster; } - cluster->next = tmp + 1; *offset = tmp; *scan_base = tmp; + tmp += nr_pages; + cluster->next[order] = tmp < max ? 
tmp : SWAP_NEXT_INVALID; return true; } @@ -798,13 +818,14 @@ static bool swap_offset_available_and_locked(struct swap_info_struct *si, static int scan_swap_map_slots(struct swap_info_struct *si, unsigned char usage, int nr, - swp_entry_t slots[]) + swp_entry_t slots[], int order) { struct swap_cluster_info *ci; unsigned long offset; unsigned long scan_base; unsigned long last_in_cluster = 0; int latency_ration = LATENCY_LIMIT; + unsigned int nr_pages = 1 << order; int n_ret = 0; bool scanned_many = false; @@ -819,6 +840,25 @@ static int scan_swap_map_slots(struct swap_info_struct *si, * And we let swap pages go all over an SSD partition. Hugh */ + if (order > 0) { + /* + * Should not even be attempting large allocations when huge + * page swap is disabled. Warn and fail the allocation. + */ + if (!IS_ENABLED(CONFIG_THP_SWAP) || + nr_pages > SWAPFILE_CLUSTER) { + VM_WARN_ON_ONCE(1); + return 0; + } + + /* + * Swapfile is not block device or not using clusters so unable + * to allocate large entries. 
+ */ + if (!(si->flags & SWP_BLKDEV) || !si->cluster_info) + return 0; + } + si->flags += SWP_SCANNING; /* * Use percpu scan base for SSD to reduce lock contention on @@ -833,8 +873,11 @@ static int scan_swap_map_slots(struct swap_info_struct *si, /* SSD algorithm */ if (si->cluster_info) { - if (!scan_swap_map_try_ssd_cluster(si, &offset, &scan_base)) + if (!scan_swap_map_try_ssd_cluster(si, &offset, &scan_base, order)) { + if (order > 0) + goto no_page; goto scan; + } } else if (unlikely(!si->cluster_nr--)) { if (si->pages - si->inuse_pages < SWAPFILE_CLUSTER) { si->cluster_nr = SWAPFILE_CLUSTER - 1; @@ -876,13 +919,16 @@ static int scan_swap_map_slots(struct swap_info_struct *si, checks: if (si->cluster_info) { - while (scan_swap_map_ssd_cluster_conflict(si, offset)) { + while (scan_swap_map_ssd_cluster_conflict(si, offset, order)) { /* take a break if we already got some slots */ if (n_ret) goto done; if (!scan_swap_map_try_ssd_cluster(si, &offset, - &scan_base)) + &scan_base, order)) { + if (order > 0) + goto no_page; goto scan; + } } } if (!(si->flags & SWP_WRITEOK)) @@ -901,7 +947,7 @@ static int scan_swap_map_slots(struct swap_info_struct *si, swap_was_freed = __try_to_reclaim_swap(si, offset, TTRS_ANYWAY); spin_lock(&si->lock); /* entry was freed successfully, try to use this again */ - if (swap_was_freed) + if (swap_was_freed > 0) goto checks; goto scan; /* check next one */ } @@ -913,11 +959,11 @@ static int scan_swap_map_slots(struct swap_info_struct *si, else goto done; } - WRITE_ONCE(si->swap_map[offset], usage); - inc_cluster_info_page(si, si->cluster_info, offset); + memset(si->swap_map + offset, usage, nr_pages); + add_cluster_info_page(si, si->cluster_info, offset, nr_pages); unlock_cluster(ci); - swap_range_alloc(si, offset, 1); + swap_range_alloc(si, offset, nr_pages); slots[n_ret++] = swp_entry(si->type, offset); /* got enough slots or reach max slots? 
*/ @@ -938,8 +984,10 @@ static int scan_swap_map_slots(struct swap_info_struct *si, /* try to get more slots in cluster */ if (si->cluster_info) { - if (scan_swap_map_try_ssd_cluster(si, &offset, &scan_base)) + if (scan_swap_map_try_ssd_cluster(si, &offset, &scan_base, order)) goto checks; + if (order > 0) + goto done; } else if (si->cluster_nr && !si->swap_map[++offset]) { /* non-ssd case, still more slots in cluster? */ --si->cluster_nr; @@ -966,11 +1014,13 @@ static int scan_swap_map_slots(struct swap_info_struct *si, } done: - set_cluster_next(si, offset + 1); + if (order == 0) + set_cluster_next(si, offset + 1); si->flags -= SWP_SCANNING; return n_ret; scan: + VM_WARN_ON(order > 0); spin_unlock(&si->lock); while (++offset <= READ_ONCE(si->highest_bit)) { if (unlikely(--latency_ration < 0)) { @@ -999,38 +1049,6 @@ static int scan_swap_map_slots(struct swap_info_struct *si, return n_ret; } -static int swap_alloc_cluster(struct swap_info_struct *si, swp_entry_t *slot) -{ - unsigned long idx; - struct swap_cluster_info *ci; - unsigned long offset; - - /* - * Should not even be attempting cluster allocations when huge - * page swap is disabled. Warn and fail the allocation. 
- */ - if (!IS_ENABLED(CONFIG_THP_SWAP)) { - VM_WARN_ON_ONCE(1); - return 0; - } - - if (cluster_list_empty(&si->free_clusters)) - return 0; - - idx = cluster_list_first(&si->free_clusters); - offset = idx * SWAPFILE_CLUSTER; - ci = lock_cluster(si, offset); - alloc_cluster(si, idx); - cluster_set_count_flag(ci, SWAPFILE_CLUSTER, CLUSTER_FLAG_HUGE); - - memset(si->swap_map + offset, SWAP_HAS_CACHE, SWAPFILE_CLUSTER); - unlock_cluster(ci); - swap_range_alloc(si, offset, SWAPFILE_CLUSTER); - *slot = swp_entry(si->type, offset); - - return 1; -} - static void swap_free_cluster(struct swap_info_struct *si, unsigned long idx) { unsigned long offset = idx * SWAPFILE_CLUSTER; @@ -1128,18 +1146,16 @@ static inline bool should_skip_swap_type(int swap_type, int type) } #endif -int get_swap_pages(int n_goal, swp_entry_t swp_entries[], int entry_size, +int get_swap_pages(int n_goal, swp_entry_t swp_entries[], int entry_order, int type) { - unsigned long size = swap_entry_size(entry_size); + int order = swap_entry_order(entry_order); + unsigned long size = 1 << order; struct swap_info_struct *si, *next; long avail_pgs; int n_ret = 0; int node; - /* Only single cluster request supported */ - WARN_ON_ONCE(n_goal > 1 && size == SWAPFILE_CLUSTER); - spin_lock(&swap_avail_lock); avail_pgs = get_avail_pages(size, type); @@ -1180,14 +1196,10 @@ int get_swap_pages(int n_goal, swp_entry_t swp_entries[], int entry_size, spin_unlock(&si->lock); goto nextsi; } - if (size == SWAPFILE_CLUSTER) { - if (si->flags & SWP_BLKDEV) - n_ret = swap_alloc_cluster(si, swp_entries); - } else - n_ret = scan_swap_map_slots(si, SWAP_HAS_CACHE, - n_goal, swp_entries); + n_ret = scan_swap_map_slots(si, SWAP_HAS_CACHE, + n_goal, swp_entries, order); spin_unlock(&si->lock); - if (n_ret || size == SWAPFILE_CLUSTER) + if (n_ret || size > 1) goto check_out; cond_resched(); @@ -1441,7 +1453,7 @@ void put_swap_folio(struct folio *folio, swp_entry_t entry) unsigned char *map; unsigned int i, free_entries = 0; 
unsigned char val; - int size = swap_entry_size(folio_nr_pages(folio)); + int size = 1 << swap_entry_order(folio_order(folio)); si = _swap_info_get(entry); if (!si) @@ -1449,7 +1461,6 @@ void put_swap_folio(struct folio *folio, swp_entry_t entry) ci = lock_cluster_or_swap_info(si, offset); if (size == SWAPFILE_CLUSTER) { - VM_BUG_ON(!cluster_is_huge(ci)); map = si->swap_map + offset; for (i = 0; i < SWAPFILE_CLUSTER; i++) { val = map[i]; @@ -1457,7 +1468,6 @@ void put_swap_folio(struct folio *folio, swp_entry_t entry) if (val == SWAP_HAS_CACHE) free_entries++; } - cluster_clear_huge(ci); if (free_entries == SWAPFILE_CLUSTER) { unlock_cluster_or_swap_info(si, ci); spin_lock(&si->lock); @@ -1479,23 +1489,6 @@ void put_swap_folio(struct folio *folio, swp_entry_t entry) unlock_cluster_or_swap_info(si, ci); } -#ifdef CONFIG_THP_SWAP -int split_swap_cluster(swp_entry_t entry) -{ - struct swap_info_struct *si; - struct swap_cluster_info *ci; - unsigned long offset = swp_offset(entry); - - si = _swap_info_get(entry); - if (!si) - return -EBUSY; - ci = lock_cluster(si, offset); - cluster_clear_huge(ci); - unlock_cluster(ci); - return 0; -} -#endif - static int swp_entry_cmp(const void *ent1, const void *ent2) { const swp_entry_t *e1 = ent1, *e2 = ent2; @@ -1603,22 +1596,23 @@ int swp_swapcount(swp_entry_t entry) } static bool swap_page_trans_huge_swapped(struct swap_info_struct *si, - swp_entry_t entry) + swp_entry_t entry, int order) { struct swap_cluster_info *ci; unsigned char *map = si->swap_map; + unsigned int nr_pages = 1 << order; unsigned long roffset = swp_offset(entry); - unsigned long offset = round_down(roffset, SWAPFILE_CLUSTER); + unsigned long offset = round_down(roffset, nr_pages); int i; bool ret = false; ci = lock_cluster_or_swap_info(si, offset); - if (!ci || !cluster_is_huge(ci)) { + if (!ci || nr_pages == 1) { if (swap_count(map[roffset])) ret = true; goto unlock_out; } - for (i = 0; i < SWAPFILE_CLUSTER; i++) { + for (i = 0; i < nr_pages; i++) { if 
(swap_count(map[offset + i])) { ret = true; break; @@ -1640,7 +1634,7 @@ static bool folio_swapped(struct folio *folio) if (!IS_ENABLED(CONFIG_THP_SWAP) || likely(!folio_test_large(folio))) return swap_swapcount(si, entry) != 0; - return swap_page_trans_huge_swapped(si, entry); + return swap_page_trans_huge_swapped(si, entry, folio_order(folio)); } /** @@ -1686,33 +1680,88 @@ bool folio_free_swap(struct folio *folio) return true; } -/* - * Free the swap entry like above, but also try to - * free the page cache entry if it is the last user. +/** + * free_swap_and_cache_nr() - Release reference on range of swap entries and + * reclaim their cache if no more references remain. + * @entry: First entry of range. + * @nr: Number of entries in range. + * + * For each swap entry in the contiguous range, release a reference. If any swap + * entries become free, try to reclaim their underlying folios, if present. The + * offset range is defined by [entry.offset, entry.offset + nr). */ -int free_swap_and_cache(swp_entry_t entry) +void free_swap_and_cache_nr(swp_entry_t entry, int nr) { - struct swap_info_struct *p; + const unsigned long start_offset = swp_offset(entry); + const unsigned long end_offset = start_offset + nr; + unsigned int type = swp_type(entry); + struct swap_info_struct *si; + bool any_only_cache = false; + unsigned long offset; unsigned char count; if (non_swap_entry(entry)) - return 1; + return; - p = get_swap_device(entry); - if (p) { - if (WARN_ON(data_race(!p->swap_map[swp_offset(entry)]))) { - put_swap_device(p); - return 0; + si = get_swap_device(entry); + if (!si) + return; + + if (WARN_ON(end_offset > si->max)) + goto out; + + /* + * First free all entries in the range. 
+ */ + for (offset = start_offset; offset < end_offset; offset++) { + if (data_race(si->swap_map[offset])) { + count = __swap_entry_free(si, swp_entry(type, offset)); + if (count == SWAP_HAS_CACHE) + any_only_cache = true; + } else { + WARN_ON_ONCE(1); } + } + + /* + * Short-circuit the below loop if none of the entries had their + * reference drop to zero. + */ + if (!any_only_cache) + goto out; - count = __swap_entry_free(p, entry); - if (count == SWAP_HAS_CACHE && - !swap_page_trans_huge_swapped(p, entry)) - __try_to_reclaim_swap(p, swp_offset(entry), + /* + * Now go back over the range trying to reclaim the swap cache. This is + * more efficient for large folios because we will only try to reclaim + * the swap once per folio in the common case. If we do + * __swap_entry_free() and __try_to_reclaim_swap() in the same loop, the + * latter will get a reference and lock the folio for every individual + * page but will only succeed once the swap slot for every subpage is + * zero. + */ + for (offset = start_offset; offset < end_offset; offset += nr) { + nr = 1; + if (READ_ONCE(si->swap_map[offset]) == SWAP_HAS_CACHE) { + /* + * Folios are always naturally aligned in swap so + * advance forward to the next boundary. Zero means no + * folio was found for the swap entry, so advance by 1 + * in this case. Negative value means folio was found + * but could not be reclaimed. Here we can still advance + * to the next boundary. 
+ */ + nr = __try_to_reclaim_swap(si, offset, TTRS_UNMAPPED | TTRS_FULL); - put_swap_device(p); + if (nr == 0) + nr = 1; + else if (nr < 0) + nr = -nr; + nr = ALIGN(offset + 1, nr) - offset; + } } - return p != NULL; + +out: + put_swap_device(si); } #ifdef CONFIG_HIBERNATION @@ -1727,7 +1776,7 @@ swp_entry_t get_swap_page_of_type(int type) /* This is called for allocating swap entry, not cache */ spin_lock(&si->lock); - if ((si->flags & SWP_WRITEOK) && scan_swap_map_slots(si, 1, 1, &entry)) + if ((si->flags & SWP_WRITEOK) && scan_swap_map_slots(si, 1, 1, &entry, 0)) atomic_long_dec(&nr_swap_pages); spin_unlock(&si->lock); fail: @@ -1885,11 +1934,7 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, * when reading from swap. This metadata may be indexed by swap entry * so this must be called before swap_free(). */ - arch_swap_restore(entry, page_folio(page)); - - /* See do_swap_page() */ - BUG_ON(!PageAnon(page) && PageMappedToDisk(page)); - BUG_ON(PageAnon(page) && PageAnonExclusive(page)); + arch_swap_restore(folio_swap(entry, folio), page_folio(page)); dec_mm_counter(vma->vm_mm, MM_SWAPENTS); inc_mm_counter(vma->vm_mm, MM_ANONPAGES); @@ -3182,7 +3227,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) p->flags |= SWP_SYNCHRONOUS_IO; if (p->bdev && bdev_nonrot(p->bdev)) { - int cpu; + int cpu, i; unsigned long ci, nr_cluster; p->flags |= SWP_SOLIDSTATE; @@ -3218,8 +3263,10 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) } for_each_possible_cpu(cpu) { struct percpu_cluster *cluster; + cluster = per_cpu_ptr(p->percpu_cluster, cpu); - cluster_set_null(&cluster->index); + for (i = 0; i < SWAP_NR_ORDERS; i++) + cluster->next[i] = SWAP_NEXT_INVALID; } } else { atomic_inc(&nr_rotate_swap); diff --git a/mm/vmalloc.c b/mm/vmalloc.c index e8a66a0dcb577b5518391f67eea3740b514c16d4..e6058942a0843872fdde61cc1cf21e6e2a88c931 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -2694,7 +2694,7 @@ static void 
*vb_alloc(unsigned long size, gfp_t gfp_mask) * get_order(0) returns funny result. Just warn and terminate * early. */ - return NULL; + return ERR_PTR(-EINVAL); } order = get_order(size); diff --git a/mm/vmscan.c b/mm/vmscan.c index 95a8459056246283d0cc82a493efe235f116872d..7e9cbb2bc454df47adadbe987c54487f2f675291 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1903,25 +1903,25 @@ static unsigned int shrink_folio_list(struct list_head *folio_list, if (!can_split_folio(folio, NULL)) goto activate_locked; /* - * Split folios without a PMD map right - * away. Chances are some or all of the - * tail pages can be freed without IO. + * Split partially mapped folios right away. + * We can free the unmapped pages without IO. */ - if (!folio_entire_mapcount(folio) && - split_folio_to_list(folio, - folio_list)) + if (data_race(!list_empty(&folio->_deferred_list)) && + split_folio_to_list(folio, folio_list)) goto activate_locked; } if (!add_to_swap(folio)) { if (!folio_test_large(folio)) goto activate_locked_split; /* Fallback to swap normal pages */ - if (split_folio_to_list(folio, - folio_list)) + if (split_folio_to_list(folio, folio_list)) goto activate_locked; #ifdef CONFIG_TRANSPARENT_HUGEPAGE - count_memcg_folio_events(folio, THP_SWPOUT_FALLBACK, 1); - count_vm_event(THP_SWPOUT_FALLBACK); + if (nr_pages >= HPAGE_PMD_NR) { + count_memcg_folio_events(folio, + THP_SWPOUT_FALLBACK, 1); + count_vm_event(THP_SWPOUT_FALLBACK); + } #endif if (!add_to_swap(folio)) goto activate_locked_split; @@ -1954,6 +1954,20 @@ static unsigned int shrink_folio_list(struct list_head *folio_list, if (folio_test_pmd_mappable(folio)) flags |= TTU_SPLIT_HUGE_PMD; + /* + * Without TTU_SYNC, try_to_unmap will only begin to + * hold PTL from the first present PTE within a large + * folio. Some initial PTEs might be skipped due to + * races with parallel PTE writes in which PTEs can be + * cleared temporarily before being written new present + * values. 
This will lead to a large folio is still + * mapped while some subpages have been partially + * unmapped after try_to_unmap; TTU_SYNC helps + * try_to_unmap acquire PTL from the first PTE, + * eliminating the influence of temporary PTE values. + */ + if (folio_test_large(folio) && list_empty(&folio->_deferred_list)) + flags |= TTU_SYNC; try_to_unmap(folio, flags); if (folio_mapped(folio)) { @@ -2792,7 +2806,8 @@ static void shrink_active_list(unsigned long nr_to_scan, } unsigned int reclaim_folio_list(struct list_head *folio_list, - struct pglist_data *pgdat) + struct pglist_data *pgdat, + bool ignore_references) { struct reclaim_stat dummy_stat; unsigned int nr_reclaimed; @@ -2805,7 +2820,7 @@ unsigned int reclaim_folio_list(struct list_head *folio_list, .no_demotion = 1, }; - nr_reclaimed = shrink_folio_list(folio_list, pgdat, &sc, &dummy_stat, false); + nr_reclaimed = shrink_folio_list(folio_list, pgdat, &sc, &dummy_stat, ignore_references); while (!list_empty(folio_list)) { folio = lru_to_folio(folio_list); list_del(&folio->lru); @@ -2815,7 +2830,7 @@ unsigned int reclaim_folio_list(struct list_head *folio_list, return nr_reclaimed; } -unsigned long reclaim_pages(struct list_head *folio_list) +unsigned long reclaim_pages(struct list_head *folio_list, bool ignore_references) { int nid; unsigned int nr_reclaimed = 0; @@ -2837,11 +2852,12 @@ unsigned long reclaim_pages(struct list_head *folio_list) continue; } - nr_reclaimed += reclaim_folio_list(&node_folio_list, NODE_DATA(nid)); + nr_reclaimed += reclaim_folio_list(&node_folio_list, NODE_DATA(nid), + ignore_references); nid = folio_nid(lru_to_folio(folio_list)); } while (!list_empty(folio_list)); - nr_reclaimed += reclaim_folio_list(&node_folio_list, NODE_DATA(nid)); + nr_reclaimed += reclaim_folio_list(&node_folio_list, NODE_DATA(nid), ignore_references); memalloc_noreclaim_restore(noreclaim_flag); diff --git a/mm/vmstat.c b/mm/vmstat.c index 
6bed5bcb8208042f7d325bf7dff62f512955b142..9a0c40d804e60414cf4916b2b2b362c006734385 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -855,8 +855,10 @@ static int refresh_cpu_vm_stats(bool do_pagesets) continue; } - if (__this_cpu_dec_return(pcp->expire)) + if (__this_cpu_dec_return(pcp->expire)) { + changes++; continue; + } if (__this_cpu_read(pcp->count)) { drain_zone_pages(zone, this_cpu_ptr(pcp)); diff --git a/mm/workingset.c b/mm/workingset.c index 2559a1f2fc1cfa70352e75b3963ff762e2629d03..9110957bec5b30ec41b51b722c1d63f452c50dc3 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -664,7 +664,7 @@ static unsigned long count_shadow_nodes(struct shrinker *shrinker, struct lruvec *lruvec; int i; - mem_cgroup_flush_stats(); + mem_cgroup_flush_stats_ratelimited(); lruvec = mem_cgroup_lruvec(sc->memcg, NODE_DATA(sc->nid)); for (pages = 0, i = 0; i < NR_LRU_LISTS; i++) pages += lruvec_page_state_local(lruvec, diff --git a/tools/testing/selftests/mm/run_vmtests.sh b/tools/testing/selftests/mm/run_vmtests.sh index 171c929826d55956c2ab2671a8b2b103a5ef5c21..198c4db57c9edfebc98e3a97c81f3434709a6e9a 100755 --- a/tools/testing/selftests/mm/run_vmtests.sh +++ b/tools/testing/selftests/mm/run_vmtests.sh @@ -351,7 +351,27 @@ CATEGORY="thp" run_test ./khugepaged -s 2 CATEGORY="thp" run_test ./transhuge-stress -d 20 -CATEGORY="thp" run_test ./split_huge_page_test +# Try to create XFS if not provided +if [ -z "${SPLIT_HUGE_PAGE_TEST_XFS_PATH}" ]; then + if test_selected "thp"; then + if grep xfs /proc/filesystems &>/dev/null; then + XFS_IMG=$(mktemp /tmp/xfs_img_XXXXXX) + SPLIT_HUGE_PAGE_TEST_XFS_PATH=$(mktemp -d /tmp/xfs_dir_XXXXXX) + truncate -s 314572800 ${XFS_IMG} + mkfs.xfs -q ${XFS_IMG} + mount -o loop ${XFS_IMG} ${SPLIT_HUGE_PAGE_TEST_XFS_PATH} + MOUNTED_XFS=1 + fi + fi +fi + +CATEGORY="thp" run_test ./split_huge_page_test ${SPLIT_HUGE_PAGE_TEST_XFS_PATH} + +if [ -n "${MOUNTED_XFS}" ]; then + umount ${SPLIT_HUGE_PAGE_TEST_XFS_PATH} + rmdir ${SPLIT_HUGE_PAGE_TEST_XFS_PATH} + 
rm -f ${XFS_IMG} +fi CATEGORY="migration" run_test ./migration diff --git a/tools/testing/selftests/mm/split_huge_page_test.c b/tools/testing/selftests/mm/split_huge_page_test.c index dff3be23488b425bd63d6841591cbf6047b6f382..d3c7f5fb3e7b778dc5e0e36eb71da09372f9acef 100644 --- a/tools/testing/selftests/mm/split_huge_page_test.c +++ b/tools/testing/selftests/mm/split_huge_page_test.c @@ -16,17 +16,20 @@ #include #include #include +#include #include "vm_util.h" +#include "../kselftest.h" uint64_t pagesize; unsigned int pageshift; uint64_t pmd_pagesize; #define SPLIT_DEBUGFS "/sys/kernel/debug/split_huge_pages" +#define SMAP_PATH "/proc/self/smaps" #define INPUT_MAX 80 -#define PID_FMT "%d,0x%lx,0x%lx" -#define PATH_FMT "%s,0x%lx,0x%lx" +#define PID_FMT "%d,0x%lx,0x%lx,%d" +#define PATH_FMT "%s,0x%lx,0x%lx,%d" #define PFN_MASK ((1UL<<55)-1) #define KPF_THP (1UL<<22) @@ -50,21 +53,19 @@ int is_backed_by_thp(char *vaddr, int pagemap_file, int kpageflags_file) return 0; } -static int write_file(const char *path, const char *buf, size_t buflen) +static void write_file(const char *path, const char *buf, size_t buflen) { int fd; ssize_t numwritten; fd = open(path, O_WRONLY); if (fd == -1) - return 0; + ksft_exit_fail_msg("%s open failed: %s\n", path, strerror(errno)); numwritten = write(fd, buf, buflen - 1); close(fd); if (numwritten < 1) - return 0; - - return (unsigned int) numwritten; + ksft_exit_fail_msg("Write failed\n"); } static void write_debugfs(const char *fmt, ...) @@ -77,15 +78,10 @@ static void write_debugfs(const char *fmt, ...) 
ret = vsnprintf(input, INPUT_MAX, fmt, argp); va_end(argp); - if (ret >= INPUT_MAX) { - printf("%s: Debugfs input is too long\n", __func__); - exit(EXIT_FAILURE); - } + if (ret >= INPUT_MAX) + ksft_exit_fail_msg("%s: Debugfs input is too long\n", __func__); - if (!write_file(SPLIT_DEBUGFS, input, ret + 1)) { - perror(SPLIT_DEBUGFS); - exit(EXIT_FAILURE); - } + write_file(SPLIT_DEBUGFS, input, ret + 1); } void split_pmd_thp(void) @@ -95,39 +91,30 @@ void split_pmd_thp(void) size_t i; one_page = memalign(pmd_pagesize, len); - - if (!one_page) { - printf("Fail to allocate memory\n"); - exit(EXIT_FAILURE); - } + if (!one_page) + ksft_exit_fail_msg("Fail to allocate memory: %s\n", strerror(errno)); madvise(one_page, len, MADV_HUGEPAGE); for (i = 0; i < len; i++) one_page[i] = (char)i; - if (!check_huge_anon(one_page, 4, pmd_pagesize)) { - printf("No THP is allocated\n"); - exit(EXIT_FAILURE); - } + if (!check_huge_anon(one_page, 4, pmd_pagesize)) + ksft_exit_fail_msg("No THP is allocated\n"); /* split all THPs */ write_debugfs(PID_FMT, getpid(), (uint64_t)one_page, - (uint64_t)one_page + len); + (uint64_t)one_page + len, 0); for (i = 0; i < len; i++) - if (one_page[i] != (char)i) { - printf("%ld byte corrupted\n", i); - exit(EXIT_FAILURE); - } + if (one_page[i] != (char)i) + ksft_exit_fail_msg("%ld byte corrupted\n", i); - if (!check_huge_anon(one_page, 0, pmd_pagesize)) { - printf("Still AnonHugePages not split\n"); - exit(EXIT_FAILURE); - } + if (!check_huge_anon(one_page, 0, pmd_pagesize)) + ksft_exit_fail_msg("Still AnonHugePages not split\n"); - printf("Split huge pages successful\n"); + ksft_test_result_pass("Split huge pages successful\n"); free(one_page); } @@ -143,36 +130,29 @@ void split_pte_mapped_thp(void) int pagemap_fd; int kpageflags_fd; - if (snprintf(pagemap_proc, 255, pagemap_template, getpid()) < 0) { - perror("get pagemap proc error"); - exit(EXIT_FAILURE); - } - pagemap_fd = open(pagemap_proc, O_RDONLY); + if (snprintf(pagemap_proc, 255, 
pagemap_template, getpid()) < 0) + ksft_exit_fail_msg("get pagemap proc error: %s\n", strerror(errno)); - if (pagemap_fd == -1) { - perror("read pagemap:"); - exit(EXIT_FAILURE); - } + pagemap_fd = open(pagemap_proc, O_RDONLY); + if (pagemap_fd == -1) + ksft_exit_fail_msg("read pagemap: %s\n", strerror(errno)); kpageflags_fd = open(kpageflags_proc, O_RDONLY); - - if (kpageflags_fd == -1) { - perror("read kpageflags:"); - exit(EXIT_FAILURE); - } + if (kpageflags_fd == -1) + ksft_exit_fail_msg("read kpageflags: %s\n", strerror(errno)); one_page = mmap((void *)(1UL << 30), len, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); + if (one_page == MAP_FAILED) + ksft_exit_fail_msg("Fail to allocate memory: %s\n", strerror(errno)); madvise(one_page, len, MADV_HUGEPAGE); for (i = 0; i < len; i++) one_page[i] = (char)i; - if (!check_huge_anon(one_page, 4, pmd_pagesize)) { - printf("No THP is allocated\n"); - exit(EXIT_FAILURE); - } + if (!check_huge_anon(one_page, 4, pmd_pagesize)) + ksft_exit_fail_msg("No THP is allocated\n"); /* remap the first pagesize of first THP */ pte_mapped = mremap(one_page, pagesize, pagesize, MREMAP_MAYMOVE); @@ -183,10 +163,8 @@ void split_pte_mapped_thp(void) pagesize, pagesize, MREMAP_MAYMOVE|MREMAP_FIXED, pte_mapped + pagesize * i); - if (pte_mapped2 == (char *)-1) { - perror("mremap failed"); - exit(EXIT_FAILURE); - } + if (pte_mapped2 == MAP_FAILED) + ksft_exit_fail_msg("mremap failed: %s\n", strerror(errno)); } /* smap does not show THPs after mremap, use kpageflags instead */ @@ -196,33 +174,28 @@ void split_pte_mapped_thp(void) is_backed_by_thp(&pte_mapped[i], pagemap_fd, kpageflags_fd)) thp_size++; - if (thp_size != 4) { - printf("Some THPs are missing during mremap\n"); - exit(EXIT_FAILURE); - } + if (thp_size != 4) + ksft_exit_fail_msg("Some THPs are missing during mremap\n"); /* split all remapped THPs */ write_debugfs(PID_FMT, getpid(), (uint64_t)pte_mapped, - (uint64_t)pte_mapped + pagesize * 4); + (uint64_t)pte_mapped + 
pagesize * 4, 0); /* smap does not show THPs after mremap, use kpageflags instead */ thp_size = 0; for (i = 0; i < pagesize * 4; i++) { - if (pte_mapped[i] != (char)i) { - printf("%ld byte corrupted\n", i); - exit(EXIT_FAILURE); - } + if (pte_mapped[i] != (char)i) + ksft_exit_fail_msg("%ld byte corrupted\n", i); + if (i % pagesize == 0 && is_backed_by_thp(&pte_mapped[i], pagemap_fd, kpageflags_fd)) thp_size++; } - if (thp_size) { - printf("Still %ld THPs not split\n", thp_size); - exit(EXIT_FAILURE); - } + if (thp_size) + ksft_exit_fail_msg("Still %ld THPs not split\n", thp_size); - printf("Split PTE-mapped huge pages successful\n"); + ksft_test_result_pass("Split PTE-mapped huge pages successful\n"); munmap(one_page, len); close(pagemap_fd); close(kpageflags_fd); @@ -238,24 +211,21 @@ void split_file_backed_thp(void) char testfile[INPUT_MAX]; uint64_t pgoff_start = 0, pgoff_end = 1024; - printf("Please enable pr_debug in split_huge_pages_in_file() if you need more info.\n"); + ksft_print_msg("Please enable pr_debug in split_huge_pages_in_file() for more info.\n"); status = mount("tmpfs", tmpfs_loc, "tmpfs", 0, "huge=always,size=4m"); - if (status) { - printf("Unable to create a tmpfs for testing\n"); - exit(EXIT_FAILURE); - } + if (status) + ksft_exit_fail_msg("Unable to create a tmpfs for testing\n"); status = snprintf(testfile, INPUT_MAX, "%s/thp_file", tmpfs_loc); if (status >= INPUT_MAX) { - printf("Fail to create file-backed THP split testing file\n"); - goto cleanup; + ksft_exit_fail_msg("Fail to create file-backed THP split testing file\n"); } fd = open(testfile, O_CREAT|O_WRONLY, 0664); if (fd == -1) { - perror("Cannot open testing file\n"); + ksft_perror("Cannot open testing file"); goto cleanup; } @@ -264,50 +234,213 @@ void split_file_backed_thp(void) close(fd); if (num_written < 1) { - printf("Fail to write data to testing file\n"); + ksft_perror("Fail to write data to testing file"); goto cleanup; } /* split the file-backed THP */ - 
write_debugfs(PATH_FMT, testfile, pgoff_start, pgoff_end); + write_debugfs(PATH_FMT, testfile, pgoff_start, pgoff_end, 0); status = unlink(testfile); - if (status) - perror("Cannot remove testing file\n"); + if (status) { + ksft_perror("Cannot remove testing file"); + goto cleanup; + } -cleanup: status = umount(tmpfs_loc); if (status) { - printf("Unable to umount %s\n", tmpfs_loc); - exit(EXIT_FAILURE); + rmdir(tmpfs_loc); + ksft_exit_fail_msg("Unable to umount %s\n", tmpfs_loc); } + status = rmdir(tmpfs_loc); - if (status) { - perror("cannot remove tmp dir"); - exit(EXIT_FAILURE); + if (status) + ksft_exit_fail_msg("cannot remove tmp dir: %s\n", strerror(errno)); + + ksft_print_msg("Please check dmesg for more information\n"); + ksft_test_result_pass("File-backed THP split test done\n"); + return; + +cleanup: + umount(tmpfs_loc); + rmdir(tmpfs_loc); + ksft_exit_fail_msg("Error occurred\n"); +} + +bool prepare_thp_fs(const char *xfs_path, char *thp_fs_template, + const char **thp_fs_loc) +{ + if (xfs_path) { + *thp_fs_loc = xfs_path; + return false; + } + + *thp_fs_loc = mkdtemp(thp_fs_template); + + if (!*thp_fs_loc) + ksft_exit_fail_msg("cannot create temp folder\n"); + + return true; +} + +void cleanup_thp_fs(const char *thp_fs_loc, bool created_tmp) +{ + int status; + + if (!created_tmp) + return; + + status = rmdir(thp_fs_loc); + if (status) + ksft_exit_fail_msg("cannot remove tmp dir: %s\n", + strerror(errno)); +} + +int create_pagecache_thp_and_fd(const char *testfile, size_t fd_size, int *fd, + char **addr) +{ + size_t i; + int __attribute__((unused)) dummy = 0; + + srand(time(NULL)); + + *fd = open(testfile, O_CREAT | O_RDWR, 0664); + if (*fd == -1) + ksft_exit_fail_msg("Failed to create a file at %s\n", testfile); + + for (i = 0; i < fd_size; i++) { + unsigned char byte = (unsigned char)i; + + write(*fd, &byte, sizeof(byte)); + } + close(*fd); + sync(); + *fd = open("/proc/sys/vm/drop_caches", O_WRONLY); + if (*fd == -1) { + ksft_perror("open 
drop_caches"); + goto err_out_unlink; + } + if (write(*fd, "3", 1) != 1) { + ksft_perror("write to drop_caches"); + goto err_out_unlink; + } + close(*fd); + + *fd = open(testfile, O_RDWR); + if (*fd == -1) { + ksft_perror("Failed to open testfile\n"); + goto err_out_unlink; + } + + *addr = mmap(NULL, fd_size, PROT_READ|PROT_WRITE, MAP_SHARED, *fd, 0); + if (*addr == (char *)-1) { + ksft_perror("cannot mmap"); + goto err_out_close; + } + madvise(*addr, fd_size, MADV_HUGEPAGE); + + for (size_t i = 0; i < fd_size; i++) + dummy += *(*addr + i); + + if (!check_huge_file(*addr, fd_size / pmd_pagesize, pmd_pagesize)) { + ksft_print_msg("No large pagecache folio generated, please provide a filesystem supporting large folio\n"); + munmap(*addr, fd_size); + close(*fd); + unlink(testfile); + ksft_test_result_skip("Pagecache folio split skipped\n"); + return -2; + } + return 0; +err_out_close: + close(*fd); +err_out_unlink: + unlink(testfile); + ksft_exit_fail_msg("Failed to create large pagecache folios\n"); + return -1; +} + +void split_thp_in_pagecache_to_order(size_t fd_size, int order, const char *fs_loc) +{ + int fd; + char *addr; + size_t i; + char testfile[INPUT_MAX]; + int err = 0; + + err = snprintf(testfile, INPUT_MAX, "%s/test", fs_loc); + + if (err < 0) + ksft_exit_fail_msg("cannot generate right test file name\n"); + + err = create_pagecache_thp_and_fd(testfile, fd_size, &fd, &addr); + if (err) + return; + err = 0; + + write_debugfs(PID_FMT, getpid(), (uint64_t)addr, (uint64_t)addr + fd_size, order); + + for (i = 0; i < fd_size; i++) + if (*(addr + i) != (char)i) { + ksft_print_msg("%lu byte corrupted in the file\n", i); + err = EXIT_FAILURE; + goto out; + } + + if (!check_huge_file(addr, 0, pmd_pagesize)) { + ksft_print_msg("Still FilePmdMapped not split\n"); + err = EXIT_FAILURE; + goto out; } - printf("file-backed THP split test done, please check dmesg for more information\n"); +out: + munmap(addr, fd_size); + close(fd); + unlink(testfile); + if (err) + 
ksft_exit_fail_msg("Split PMD-mapped pagecache folio to order %d failed\n", order); + ksft_test_result_pass("Split PMD-mapped pagecache folio to order %d passed\n", order); } int main(int argc, char **argv) { + int i; + size_t fd_size; + char *optional_xfs_path = NULL; + char fs_loc_template[] = "/tmp/thp_fs_XXXXXX"; + const char *fs_loc; + bool created_tmp; + + ksft_print_header(); + if (geteuid() != 0) { - printf("Please run the benchmark as root\n"); - exit(EXIT_FAILURE); + ksft_print_msg("Please run the benchmark as root\n"); + ksft_finished(); } + if (argc > 1) + optional_xfs_path = argv[1]; + + ksft_set_plan(3+9); + pagesize = getpagesize(); pageshift = ffs(pagesize) - 1; pmd_pagesize = read_pmd_pagesize(); - if (!pmd_pagesize) { - printf("Reading PMD pagesize failed\n"); - exit(EXIT_FAILURE); - } + if (!pmd_pagesize) + ksft_exit_fail_msg("Reading PMD pagesize failed\n"); + + fd_size = 2 * pmd_pagesize; split_pmd_thp(); split_pte_mapped_thp(); split_file_backed_thp(); + created_tmp = prepare_thp_fs(optional_xfs_path, fs_loc_template, + &fs_loc); + for (i = 8; i >= 0; i--) + split_thp_in_pagecache_to_order(fd_size, i, fs_loc); + cleanup_thp_fs(fs_loc, created_tmp); + + ksft_finished(); + return 0; }