diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 8c199fe368c2e7849e061d2872173161cdce0232..b2a80e089a0acf9fd1b73b84da93a7b8c8e0f8c6 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -829,18 +829,16 @@ static inline void mem_cgroup_uncharge(struct folio *folio) __mem_cgroup_uncharge(folio); } -void __mem_cgroup_uncharge_list(struct list_head *page_list); -static inline void mem_cgroup_uncharge_list(struct list_head *page_list) +void __mem_cgroup_uncharge_folios(struct folio_batch *folios); +static inline void mem_cgroup_uncharge_folios(struct folio_batch *folios) { if (mem_cgroup_disabled()) return; - __mem_cgroup_uncharge_list(page_list); + __mem_cgroup_uncharge_folios(folios); } void mem_cgroup_cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages); - void mem_cgroup_replace_folio(struct folio *old, struct folio *new); - void mem_cgroup_migrate(struct folio *old, struct folio *new); /** @@ -1421,7 +1419,7 @@ static inline void mem_cgroup_uncharge(struct folio *folio) { } -static inline void mem_cgroup_uncharge_list(struct list_head *page_list) +static inline void mem_cgroup_uncharge_folios(struct folio_batch *folios) { } @@ -1829,18 +1827,18 @@ static inline struct lruvec *folio_lruvec_relock_irq(struct folio *folio, return folio_lruvec_lock_irq(folio); } -/* Don't lock again iff page's lruvec locked */ -static inline struct lruvec *folio_lruvec_relock_irqsave(struct folio *folio, - struct lruvec *locked_lruvec, unsigned long *flags) +/* Don't lock again iff folio's lruvec locked */ +static inline void folio_lruvec_relock_irqsave(struct folio *folio, + struct lruvec **lruvecp, unsigned long *flags) { - if (locked_lruvec) { - if (folio_matches_lruvec(folio, locked_lruvec)) - return locked_lruvec; + if (*lruvecp) { + if (folio_matches_lruvec(folio, *lruvecp)) + return; - unlock_page_lruvec_irqrestore(locked_lruvec, *flags); + unlock_page_lruvec_irqrestore(*lruvecp, *flags); } - return folio_lruvec_lock_irqsave(folio, flags); + *lruvecp = folio_lruvec_lock_irqsave(folio, flags); } #ifdef CONFIG_CGROUP_WRITEBACK diff --git a/include/linux/mm.h b/include/linux/mm.h index 49f4fac2dcf76ff8e129618968910dcb27d936cd..9b71b877c8d3eab18a90ba6c56f3033a4d4dbb4c 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -37,6 +37,7 @@ struct anon_vma; struct anon_vma_chain; struct user_struct; struct pt_regs; +struct folio_batch; extern int sysctl_page_lock_unfairness; @@ -227,7 +228,6 @@ int overcommit_policy_handler(struct ctl_table *, int, void *, size_t *, /* test whether an address (unsigned long or pointer) is aligned to PAGE_SIZE */ #define PAGE_ALIGNED(addr) IS_ALIGNED((unsigned long)(addr), PAGE_SIZE) -#define lru_to_page(head) (list_entry((head)->prev, struct page, lru)) static inline struct folio *lru_to_folio(struct list_head *head) { return list_entry((head)->prev, struct folio, lru); @@ -1321,8 +1321,6 @@ void folio_copy(struct folio *dst, struct folio *src); unsigned long nr_free_buffer_pages(void); -void destroy_large_folio(struct folio *folio); - /* Returns the number of bytes in this potentially compound page. */ static inline unsigned long page_size(struct page *page) { @@ -1532,6 +1530,8 @@ static inline void folio_put_refs(struct folio *folio, int refs) __folio_put(folio); } +void folios_put_refs(struct folio_batch *folios, unsigned int *refs); + /* * union release_pages_arg - an array of pages or folios * @@ -1554,18 +1554,19 @@ void release_pages(release_pages_arg, int nr); /** * folios_put - Decrement the reference count on an array of folios. * @folios: The folios. - * @nr: How many folios there are. * - * Like folio_put(), but for an array of folios. This is more efficient - * than writing the loop yourself as it will optimise the locks which - * need to be taken if the folios are freed. + * Like folio_put(), but for a batch of folios. This is more efficient + * than writing the loop yourself as it will optimise the locks which need + * to be taken if the folios are freed. The folios batch is returned + * empty and ready to be reused for another batch; there is no need to + * reinitialise it. * * Context: May be called in process or interrupt context, but not in NMI * context. May be called while holding a spinlock. */ -static inline void folios_put(struct folio **folios, unsigned int nr) +static inline void folios_put(struct folio_batch *folios) { - release_pages(folios, nr); + folios_put_refs(folios, NULL); } static inline void put_page(struct page *page) diff --git a/include/linux/pagevec.h b/include/linux/pagevec.h index 87cc678adc850b12f144052fc7b4cf50d32ad107..67f10b8810a8a973b61f0a269ff5a8a09c1e30ed 100644 --- a/include/linux/pagevec.h +++ b/include/linux/pagevec.h @@ -11,8 +11,8 @@ #include -/* 15 pointers + header align the folio_batch structure to a power of two */ -#define PAGEVEC_SIZE 15 +/* 31 pointers + header align the folio_batch structure to a power of two */ +#define PAGEVEC_SIZE 31 struct folio; diff --git a/include/linux/swap.h b/include/linux/swap.h index 13cd68b5f5e26e76eba0c01c3d9373cecfe8b944..54fa8f4558c75650a05ed70d15af409c3ffb2619 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -482,9 +482,9 @@ static inline unsigned long total_swapcache_pages(void) return global_node_page_state(NR_SWAPCACHE); } -extern void free_swap_cache(struct page *page); -extern void free_page_and_swap_cache(struct page *); -extern void free_pages_and_swap_cache(struct encoded_page **, int); +void free_swap_cache(struct folio *folio); +void free_page_and_swap_cache(struct page *); +void free_pages_and_swap_cache(struct encoded_page **, int); /* linux/mm/swapfile.c */ extern atomic_long_t nr_swap_pages; extern long total_swap_pages; @@ -577,7 +577,7 @@ static inline void free_swap_and_cache_nr(swp_entry_t entry, int nr) { } -static inline void free_swap_cache(struct page *page) +static inline void free_swap_cache(struct folio *folio) { } diff --git a/include/trace/events/kmem.h b/include/trace/events/kmem.h index be39ca5af0ba003e4f0fa46322d8561bee875e4f..a4e40ae6a8c8fd2e51cdbf877cf3bb485690cc9a 100644 --- a/include/trace/events/kmem.h +++ b/include/trace/events/kmem.h @@ -304,6 +304,44 @@ TRACE_EVENT(mm_page_alloc_extfrag, __entry->change_ownership) ); +TRACE_EVENT(mm_alloc_contig_migrate_range_info, + + TP_PROTO(unsigned long start, + unsigned long end, + unsigned long nr_migrated, + unsigned long nr_reclaimed, + unsigned long nr_mapped, + int migratetype), + + TP_ARGS(start, end, nr_migrated, nr_reclaimed, nr_mapped, migratetype), + + TP_STRUCT__entry( + __field(unsigned long, start) + __field(unsigned long, end) + __field(unsigned long, nr_migrated) + __field(unsigned long, nr_reclaimed) + __field(unsigned long, nr_mapped) + __field(int, migratetype) + ), + + TP_fast_assign( + __entry->start = start; + __entry->end = end; + __entry->nr_migrated = nr_migrated; + __entry->nr_reclaimed = nr_reclaimed; + __entry->nr_mapped = nr_mapped; + __entry->migratetype = migratetype; + ), + + TP_printk("start=0x%lx end=0x%lx migratetype=%d nr_migrated=%lu nr_reclaimed=%lu nr_mapped=%lu", + __entry->start, + __entry->end, + __entry->migratetype, + __entry->nr_migrated, + __entry->nr_reclaimed, + __entry->nr_mapped) +); + /* * Required for uniquely and securely identifying mm in rss_stat tracepoint. */ diff --git a/mm/dynamic_pool.c b/mm/dynamic_pool.c index b1590362c2c99c9174bb0e728d3ab3b34c03b025..41a627431ea6956b96f3dcfc80d510917bb2b6b5 100644 --- a/mm/dynamic_pool.c +++ b/mm/dynamic_pool.c @@ -269,6 +269,7 @@ static int dpool_demote_huge_page(struct pages_pool *src_pool, __ClearPageDpool(page); src_pool->free_pages--; + __folio_clear_hugetlb(page_folio(page)); clear_compound_page(page_folio(page), PMD_ORDER); for (i = 0; i < nr_pages; i++) { subpage = folio_page(folio, i); diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 763bb25e4f9934a2399bee190c8698b341221121..18ec0e138b031fa11ebeb5d0e7ae0b44b83ab712 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -952,8 +952,6 @@ void folio_prep_large_rmappable(struct folio *folio) { if (!folio || !folio_test_large(folio)) return; - if (folio_order(folio) > 1) - INIT_LIST_HEAD(&folio->_deferred_list); folio_set_large_rmappable(folio); } @@ -1227,11 +1225,13 @@ vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf) gfp_t gfp; struct folio *folio; unsigned long haddr = vmf->address & HPAGE_PMD_MASK; + vm_fault_t ret; if (!thp_vma_suitable_order(vma, haddr, PMD_ORDER)) return VM_FAULT_FALLBACK; - if (unlikely(anon_vma_prepare(vma))) - return VM_FAULT_OOM; + ret = vmf_anon_prepare(vmf); + if (ret) + return ret; khugepaged_enter_vma(vma, vma->vm_flags); if (!(vmf->flags & FAULT_FLAG_WRITE) && diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 6f90d0845c43ae496d009d46e021668ceca02dd6..7cfe80bb2cb7532d6c755ce2355d58b0db0e0679 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1761,7 +1761,12 @@ static void __update_and_free_hugetlb_folio(struct hstate *h, if (folio_test_hugetlb_raw_hwp_unreliable(folio)) return; - if (hugetlb_vmemmap_restore(h, &folio->page)) { + /* + * If folio is not vmemmap optimized (!clear_dtor), then the folio + * is no longer identified as a hugetlb page. hugetlb_vmemmap_restore + * can only be passed hugetlb pages and will BUG otherwise. + */ + if (clear_dtor && hugetlb_vmemmap_restore(h, &folio->page)) { spin_lock_irq(&hugetlb_lock); /* * If we cannot allocate vmemmap pages, just refuse to free the @@ -1784,7 +1789,7 @@ static void __update_and_free_hugetlb_folio(struct hstate *h, * If vmemmap pages were allocated above, then we need to clear the * hugetlb destructor under the hugetlb lock. */ - if (clear_dtor) { + if (folio_test_hugetlb(folio)) { spin_lock_irq(&hugetlb_lock); __clear_hugetlb_destructor(h, folio); spin_unlock_irq(&hugetlb_lock); @@ -1799,7 +1804,8 @@ static void __update_and_free_hugetlb_folio(struct hstate *h, destroy_compound_gigantic_folio(folio, huge_page_order(h)); free_gigantic_folio(folio, huge_page_order(h)); } else { - __free_pages(&folio->page, huge_page_order(h)); + INIT_LIST_HEAD(&folio->_deferred_list); + folio_put(folio); } } @@ -1979,9 +1985,9 @@ static void __prep_account_new_huge_page(struct hstate *h, int nid) void __prep_new_hugetlb_folio(struct hstate *h, struct folio *folio) { + __folio_set_hugetlb(folio); hugetlb_vmemmap_optimize(h, &folio->page); INIT_LIST_HEAD(&folio->lru); - __folio_set_hugetlb(folio); hugetlb_set_folio_subpool(folio, NULL); set_hugetlb_cgroup(folio, NULL); set_hugetlb_cgroup_rsvd(folio, NULL); @@ -2406,17 +2412,23 @@ int dissolve_free_huge_page(struct page *page) * need to adjust max_huge_pages if the page is not freed. * Attempt to allocate vmemmmap here so that we can take * appropriate action on failure. + * + * The folio_test_hugetlb check here is because + * remove_hugetlb_folio will clear hugetlb folio flag for + * non-vmemmap optimized hugetlb folios. */ - rc = hugetlb_vmemmap_restore(h, &folio->page); - if (!rc) { - update_and_free_hugetlb_folio(h, folio, false); - } else { - spin_lock_irq(&hugetlb_lock); - add_hugetlb_folio(h, folio, false); - h->max_huge_pages++; - spin_unlock_irq(&hugetlb_lock); - } + if (folio_test_hugetlb(folio)) { + rc = hugetlb_vmemmap_restore(h, &folio->page); + if (rc) { + spin_lock_irq(&hugetlb_lock); + add_hugetlb_folio(h, folio, false); + h->max_huge_pages++; + goto out; + } + } else + rc = 0; + update_and_free_hugetlb_folio(h, folio, false); return rc; } out: @@ -3786,13 +3798,21 @@ static int demote_free_hugetlb_folio(struct hstate *h, struct folio *folio) remove_hugetlb_folio_for_demote(h, folio, false); spin_unlock_irq(&hugetlb_lock); - rc = hugetlb_vmemmap_restore(h, &folio->page); - if (rc) { - /* Allocation of vmemmmap failed, we can not demote folio */ - spin_lock_irq(&hugetlb_lock); - folio_ref_unfreeze(folio, 1); - add_hugetlb_folio(h, folio, false); - return rc; + /* + * If vmemmap already existed for folio, the remove routine above would + * have cleared the hugetlb folio flag. Hence the folio is technically + * no longer a hugetlb folio. hugetlb_vmemmap_restore can only be + * passed hugetlb folios and will BUG otherwise. + */ + if (folio_test_hugetlb(folio)) { + rc = hugetlb_vmemmap_restore(h, &folio->page); + if (rc) { + /* Allocation of vmemmmap failed, we can not demote folio */ + spin_lock_irq(&hugetlb_lock); + folio_ref_unfreeze(folio, 1); + add_hugetlb_folio(h, folio, false); + return rc; + } } /* diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c index fb0b05d4659ab68b4d5d0f2692302c7a0e7eb0f1..149ab629855c11ebec2da97d1ce3c998820ea708 100644 --- a/mm/hugetlb_vmemmap.c +++ b/mm/hugetlb_vmemmap.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include "hugetlb_vmemmap.h" @@ -487,6 +488,7 @@ int hugetlb_vmemmap_restore(const struct hstate *h, struct page *head) unsigned long vmemmap_start = (unsigned long)head, vmemmap_end; unsigned long vmemmap_reuse; + VM_WARN_ON_ONCE(!PageHuge(head)); if (!HPageVmemmapOptimized(head)) return 0; @@ -584,6 +586,7 @@ void hugetlb_vmemmap_optimize(const struct hstate *h, struct page *head) unsigned long vmemmap_start = (unsigned long)head, vmemmap_end; unsigned long vmemmap_reuse; + VM_WARN_ON_ONCE(!PageHuge(head)); if (!vmemmap_should_optimize(h, head)) return; diff --git a/mm/internal.h b/mm/internal.h index 6451747b7160566367dd9a10c218f40ca041bc07..0ecbaa392054e0f57035445051bbb80e60edfbe5 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -282,6 +282,7 @@ static inline void wake_throttle_isolated(pg_data_t *pgdat) wake_up(wqh); } +vm_fault_t vmf_anon_prepare(struct vm_fault *vmf); vm_fault_t do_swap_page(struct vm_fault *vmf); void folio_rotate_reclaimable(struct folio *folio); bool __folio_end_writeback(struct folio *folio); @@ -610,6 +611,8 @@ static inline void prep_compound_head(struct page *page, unsigned int order) atomic_set(&folio->_entire_mapcount, -1); atomic_set(&folio->_nr_pages_mapped, 0); atomic_set(&folio->_pincount, 0); + if (order > 1) + INIT_LIST_HEAD(&folio->_deferred_list); } static inline void prep_compound_tail(struct page *head, int tail_idx) @@ -635,8 +638,8 @@ extern void dpool_prep_new_page(struct page *page, unsigned int order, #endif extern int user_min_free_kbytes; -extern void free_unref_page(struct page *page, unsigned int order); -extern void free_unref_page_list(struct list_head *list); +void free_unref_page(struct page *page, unsigned int order); +void free_unref_folios(struct folio_batch *fbatch); extern void zone_pcp_reset(struct zone *zone); extern void zone_pcp_disable(struct zone *zone); @@ -725,7 +728,8 @@ isolate_migratepages_range(struct compact_control *cc, unsigned long low_pfn, unsigned long end_pfn); int __alloc_contig_migrate_range(struct compact_control *cc, - unsigned long start, unsigned long end); + unsigned long start, unsigned long end, + int migratetype); /* Free whole pageblock and set its migration type to MIGRATE_CMA. */ void init_cma_reserved_pageblock(struct page *page); @@ -1181,7 +1185,7 @@ void __vunmap_range_noflush(unsigned long start, unsigned long end); int numa_migrate_prep(struct folio *folio, struct vm_area_struct *vma, unsigned long addr, int page_nid, int *flags); -void free_zone_device_page(struct page *page); +void free_zone_device_folio(struct folio *folio); int migrate_device_coherent_page(struct page *page); /* diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 7d329e9eeec85b6a5186d0c792d4f894255ff98b..5f999528ec30811364721530c639a294c8d4e6f8 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -695,9 +695,7 @@ static void __collapse_huge_page_copy_succeeded(pte_t *pte, spinlock_t *ptl, struct list_head *compound_pagelist) { - struct folio *src_folio; - struct page *src_page; - struct page *tmp; + struct folio *src, *tmp; pte_t *_pte; pte_t pteval; @@ -716,10 +714,11 @@ static void __collapse_huge_page_copy_succeeded(pte_t *pte, ksm_might_unmap_zero_page(vma->vm_mm, pteval); } } else { - src_page = pte_page(pteval); - src_folio = page_folio(src_page); - if (!folio_test_large(src_folio)) - release_pte_folio(src_folio); + struct page *src_page = pte_page(pteval); + + src = page_folio(src_page); + if (!folio_test_large(src)) + release_pte_folio(src); /* * ptl mostly unnecessary, but preempt has to * be disabled to update the per-cpu stats @@ -728,20 +727,19 @@ static void __collapse_huge_page_copy_succeeded(pte_t *pte, spin_lock(ptl); ptep_clear(vma->vm_mm, address, _pte); add_reliable_page_counter(src_page, vma->vm_mm, 1); - folio_remove_rmap_pte(src_folio, src_page, vma); + folio_remove_rmap_pte(src, src_page, vma); spin_unlock(ptl); free_page_and_swap_cache(src_page); } } - list_for_each_entry_safe(src_page, tmp, compound_pagelist, lru) { - list_del(&src_page->lru); - mod_node_page_state(page_pgdat(src_page), - NR_ISOLATED_ANON + page_is_file_lru(src_page), - -compound_nr(src_page)); - unlock_page(src_page); - free_swap_cache(src_page); - putback_lru_page(src_page); + list_for_each_entry_safe(src, tmp, compound_pagelist, lru) { + list_del(&src->lru); + node_stat_sub_folio(src, NR_ISOLATED_ANON + + folio_is_file_lru(src)); + folio_unlock(src); + free_swap_cache(src); + folio_putback_lru(src); } } diff --git a/mm/memcontrol.c b/mm/memcontrol.c index fff8b93225219c6b5e99e6b114516f0aa3e02e02..f903714eacb10130334619bae32b02700c4efc9b 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -33,6 +33,7 @@ #include #include #include +#include #include #include #include @@ -8517,6 +8518,9 @@ static void uncharge_folio(struct folio *folio, struct uncharge_gather *ug) struct obj_cgroup *objcg; VM_BUG_ON_FOLIO(folio_test_lru(folio), folio); + VM_BUG_ON_FOLIO(folio_order(folio) > 1 && + !folio_test_hugetlb(folio) && + !list_empty(&folio->_deferred_list), folio); /* * Nobody should be changing or seriously looking at @@ -8582,21 +8586,14 @@ void __mem_cgroup_uncharge(struct folio *folio) uncharge_batch(&ug); } -/** - * __mem_cgroup_uncharge_list - uncharge a list of page - * @page_list: list of pages to uncharge - * - * Uncharge a list of pages previously charged with - * __mem_cgroup_charge(). - */ -void __mem_cgroup_uncharge_list(struct list_head *page_list) +void __mem_cgroup_uncharge_folios(struct folio_batch *folios) { struct uncharge_gather ug; - struct folio *folio; + unsigned int i; uncharge_gather_clear(&ug); - list_for_each_entry(folio, page_list, lru) - uncharge_folio(folio, &ug); + for (i = 0; i < folios->nr; i++) + uncharge_folio(folios->folios[i], &ug); if (ug.memcg) uncharge_batch(&ug); } diff --git a/mm/memory.c b/mm/memory.c index 4ef917a182f9f49c41d24a0b4df2c2a94b538b02..fa4d1b499511043f10284ccdea10de68b58bbc17 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3215,19 +3215,39 @@ static inline vm_fault_t vmf_can_call_fault(const struct vm_fault *vmf) return VM_FAULT_RETRY; } -static vm_fault_t vmf_anon_prepare(struct vm_fault *vmf) +/** + * vmf_anon_prepare - Prepare to handle an anonymous fault. + * @vmf: The vm_fault descriptor passed from the fault handler. + * + * When preparing to insert an anonymous page into a VMA from a + * fault handler, call this function rather than anon_vma_prepare(). + * If this vma does not already have an associated anon_vma and we are + * only protected by the per-VMA lock, the caller must retry with the + * mmap_lock held. __anon_vma_prepare() will look at adjacent VMAs to + * determine if this VMA can share its anon_vma, and that's not safe to + * do with only the per-VMA lock held for this VMA. + * + * Return: 0 if fault handling can proceed. Any other value should be + * returned to the caller. + */ +vm_fault_t vmf_anon_prepare(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; + vm_fault_t ret = 0; if (likely(vma->anon_vma)) return 0; if (vmf->flags & FAULT_FLAG_VMA_LOCK) { - vma_end_read(vma); - return VM_FAULT_RETRY; + if (!mmap_read_trylock(vma->vm_mm)) { + vma_end_read(vma); + return VM_FAULT_RETRY; + } } if (__anon_vma_prepare(vma)) - return VM_FAULT_OOM; - return 0; + ret = VM_FAULT_OOM; + if (vmf->flags & FAULT_FLAG_VMA_LOCK) + mmap_read_unlock(vma->vm_mm); + return ret; } /* @@ -3390,7 +3410,7 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf) folio_put(new_folio); if (old_folio) { if (page_copied) - free_swap_cache(&old_folio->page); + free_swap_cache(old_folio); folio_put(old_folio); } @@ -4418,8 +4438,9 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf) } /* Allocate our own private page. */ - if (unlikely(anon_vma_prepare(vma))) - goto oom; + ret = vmf_anon_prepare(vmf); + if (ret) + return ret; /* Returns NULL on OOM or ERR_PTR(-EAGAIN) if we must retry the fault */ folio = alloc_anon_folio(vmf); if (IS_ERR(folio)) @@ -5803,15 +5824,6 @@ struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm, if (!vma_start_read(vma)) goto inval; - /* - * find_mergeable_anon_vma uses adjacent vmas which are not locked. - * This check must happen after vma_start_read(); otherwise, a - * concurrent mremap() with MREMAP_DONTUNMAP could dissociate the VMA - * from its anon_vma. - */ - if (unlikely(vma_is_anonymous(vma) && !vma->anon_vma)) - goto inval_end_read; - /* Check since vm_start/vm_end might change before we lock the VMA */ if (unlikely(address < vma->vm_start || address >= vma->vm_end)) goto inval_end_read; diff --git a/mm/memremap.c b/mm/memremap.c index bee85560a243403006cddaa5759dc5156f746de8..7b7e59841250276154659a20442a67a739bf6f35 100644 --- a/mm/memremap.c +++ b/mm/memremap.c @@ -468,21 +468,23 @@ struct dev_pagemap *get_dev_pagemap(unsigned long pfn, } EXPORT_SYMBOL_GPL(get_dev_pagemap); -void free_zone_device_page(struct page *page) +void free_zone_device_folio(struct folio *folio) { - if (WARN_ON_ONCE(!page->pgmap->ops || !page->pgmap->ops->page_free)) + if (WARN_ON_ONCE(!folio->page.pgmap->ops || + !folio->page.pgmap->ops->page_free)) return; - mem_cgroup_uncharge(page_folio(page)); + mem_cgroup_uncharge(folio); /* * Note: we don't expect anonymous compound pages yet. Once supported * and we could PTE-map them similar to THP, we'd have to clear * PG_anon_exclusive on all tail pages. */ - VM_BUG_ON_PAGE(PageAnon(page) && PageCompound(page), page); - if (PageAnon(page)) - __ClearPageAnonExclusive(page); + if (folio_test_anon(folio)) { + VM_BUG_ON_FOLIO(folio_test_large(folio), folio); + __ClearPageAnonExclusive(folio_page(folio, 0)); + } /* * When a device managed page is freed, the page->mapping field @@ -503,20 +505,20 @@ void free_zone_device_page(struct page *page) * * For other types of ZONE_DEVICE pages, migration is either * handled differently or not done at all, so there is no need - * to clear page->mapping. + * to clear folio->mapping. */ - page->mapping = NULL; - page->pgmap->ops->page_free(page); + folio->mapping = NULL; + folio->page.pgmap->ops->page_free(folio_page(folio, 0)); - if (page->pgmap->type != MEMORY_DEVICE_PRIVATE && - page->pgmap->type != MEMORY_DEVICE_COHERENT) + if (folio->page.pgmap->type != MEMORY_DEVICE_PRIVATE && + folio->page.pgmap->type != MEMORY_DEVICE_COHERENT) /* - * Reset the page count to 1 to prepare for handing out the page + * Reset the refcount to 1 to prepare for handing out the page * again. */ - set_page_count(page, 1); + folio_set_count(folio, 1); else - put_dev_pagemap(page->pgmap); + put_dev_pagemap(folio->page.pgmap); } void zone_device_page_init(struct page *page) diff --git a/mm/mlock.c b/mm/mlock.c index f79d8262c1a0f2ddac75f17c607c17e40111ed9b..d0b06ea3b7214f4717d68f1fa44882d0857fb60f 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -206,8 +206,7 @@ static void mlock_folio_batch(struct folio_batch *fbatch) if (lruvec) unlock_page_lruvec_irq(lruvec); - folios_put(fbatch->folios, folio_batch_count(fbatch)); - folio_batch_reinit(fbatch); + folios_put(fbatch); } void mlock_drain_local(void) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index f225f412e71d2de5531628a7c912b13c57f8cda8..5adb0fd951318369994e1de810258a2fb36001dc 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -32,6 +32,7 @@ #include #include #include +#include #include #include #include @@ -566,14 +567,6 @@ static inline bool pcp_allowed_order(unsigned int order) return false; } -static inline void free_the_page(struct page *page, unsigned int order) -{ - if (pcp_allowed_order(order)) /* Via pcp? */ - free_unref_page(page, order); - else - __free_pages_ok(page, order, FPI_NONE); -} - /* * Higher-order pages are called "compound pages". They are structured thusly: * @@ -598,20 +591,6 @@ void prep_compound_page(struct page *page, unsigned int order) prep_compound_head(page, order); } -void destroy_large_folio(struct folio *folio) -{ - if (folio_test_hugetlb(folio)) { - free_huge_folio(folio); - return; - } - - if (folio_test_large_rmappable(folio)) - folio_undo_large_rmappable(folio); - - mem_cgroup_uncharge(folio); - free_the_page(&folio->page, folio_order(folio)); -} - static inline void set_buddy_order(struct page *page, unsigned int order) { set_page_private(page, order); @@ -1007,10 +986,11 @@ static int free_tail_page_prepare(struct page *head_page, struct page *page) } break; case 2: - /* - * the second tail page: ->mapping is - * deferred_list.next -- ignore value. - */ + /* the second tail page: deferred_list overlaps ->mapping */ + if (unlikely(!list_empty(&folio->_deferred_list))) { + bad_page(page, "on deferred list"); + goto out; + } break; default: if (page->mapping != TAIL_MAPPING) { @@ -2249,12 +2229,15 @@ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp) */ static void drain_pages_zone(unsigned int cpu, struct zone *zone) { - struct per_cpu_pages *pcp; + struct per_cpu_pages *pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu); + int count = READ_ONCE(pcp->count); + + while (count) { + int to_drain = min(count, pcp->batch << CONFIG_PCP_BATCH_SCALE_MAX); + count -= to_drain; - pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu); - if (pcp->count) { spin_lock(&pcp->lock); - free_pcppages_bulk(zone, pcp->count, pcp, 0); + free_pcppages_bulk(zone, to_drain, pcp, 0); spin_unlock(&pcp->lock); } } @@ -2522,6 +2505,11 @@ void free_unref_page(struct page *page, unsigned int order) return; } + if (!pcp_allowed_order(order)) { + __free_pages_ok(page, order, FPI_NONE); + return; + } + if (!free_unref_page_prepare(page, pfn, order)) return; @@ -2554,73 +2542,75 @@ void free_unref_page(struct page *page, unsigned int order) } /* - * Free a list of 0-order pages + * Free a batch of folios */ -void free_unref_page_list(struct list_head *list) +void free_unref_folios(struct folio_batch *folios) { unsigned long __maybe_unused UP_flags; - struct page *page, *next; struct per_cpu_pages *pcp = NULL; struct zone *locked_zone = NULL; - int batch_count = 0; - int migratetype; + int i, j, migratetype; - /* Prepare pages for freeing */ - list_for_each_entry_safe(page, next, list, lru) { - unsigned long pfn = page_to_pfn(page); + /* Prepare folios for freeing */ + for (i = 0, j = 0; i < folios->nr; i++) { + struct folio *folio = folios->folios[i]; + unsigned long pfn = folio_pfn(folio); + unsigned int order = folio_order(folio); - if (page_from_dynamic_pool(page)) { - list_del(&page->lru); - dynamic_pool_free_page(page); + if (page_from_dynamic_pool(&folio->page)) { + dynamic_pool_free_page(&folio->page); continue; } - if (!free_unref_page_prepare(page, pfn, 0)) { - list_del(&page->lru); + if (order > 0 && folio_test_large_rmappable(folio)) + folio_undo_large_rmappable(folio); + if (!free_unref_page_prepare(&folio->page, pfn, order)) continue; - } /* - * Free isolated pages directly to the allocator, see - * comment in free_unref_page. + * Free isolated folios and orders not handled on the PCP + * directly to the allocator, see comment in free_unref_page. */ - migratetype = get_pcppage_migratetype(page); - if (unlikely(is_migrate_isolate(migratetype))) { - list_del(&page->lru); - free_one_page(page_zone(page), page, pfn, 0, migratetype, FPI_NONE); + migratetype = get_pcppage_migratetype(&folio->page); + if (!pcp_allowed_order(order) || + is_migrate_isolate(migratetype)) { + free_one_page(folio_zone(folio), &folio->page, pfn, + order, migratetype, FPI_NONE); continue; } + folio->private = (void *)(unsigned long)order; + if (j != i) + folios->folios[j] = folio; + j++; } + folios->nr = j; - list_for_each_entry_safe(page, next, list, lru) { - struct zone *zone = page_zone(page); + for (i = 0; i < folios->nr; i++) { + struct folio *folio = folios->folios[i]; + struct zone *zone = folio_zone(folio); + unsigned int order = (unsigned long)folio->private; - list_del(&page->lru); - migratetype = get_pcppage_migratetype(page); + folio->private = NULL; + migratetype = get_pcppage_migratetype(&folio->page); - /* - * Either different zone requiring a different pcp lock or - * excessive lock hold times when freeing a large list of - * pages. - */ - if (zone != locked_zone || batch_count == SWAP_CLUSTER_MAX) { + /* Different zone requires a different pcp lock */ + if (zone != locked_zone) { if (pcp) { pcp_spin_unlock(pcp); pcp_trylock_finish(UP_flags); } - batch_count = 0; - /* - * trylock is necessary as pages may be getting freed + * trylock is necessary as folios may be getting freed * from IRQ or SoftIRQ context after an IO completion. */ pcp_trylock_prepare(UP_flags); pcp = pcp_spin_trylock(zone->per_cpu_pageset); if (unlikely(!pcp)) { pcp_trylock_finish(UP_flags); - free_one_page(zone, page, page_to_pfn(page), - 0, migratetype, FPI_NONE); + free_one_page(zone, &folio->page, + folio_pfn(folio), order, + migratetype, FPI_NONE); locked_zone = NULL; continue; } @@ -2634,15 +2624,16 @@ void free_unref_page_list(struct list_head *list) if (unlikely(migratetype >= MIGRATE_PCPTYPES)) migratetype = MIGRATE_MOVABLE; - trace_mm_page_free_batched(page); - free_unref_page_commit(zone, pcp, page, migratetype, 0); - batch_count++; + trace_mm_page_free_batched(&folio->page); + free_unref_page_commit(zone, pcp, &folio->page, migratetype, + order); } if (pcp) { pcp_spin_unlock(pcp); pcp_trylock_finish(UP_flags); } + folio_batch_reinit(folios); } /* @@ -4905,10 +4896,10 @@ void __free_pages(struct page *page, unsigned int order) int head = PageHead(page); if (put_page_testzero(page)) - free_the_page(page, order); + free_unref_page(page, order); else if (!head) while (order-- > 0) - free_the_page(page + (1 << order), order); + free_unref_page(page + (1 << order), order); } EXPORT_SYMBOL(__free_pages); @@ -4959,7 +4950,7 @@ void __page_frag_cache_drain(struct page *page, unsigned int count) VM_BUG_ON_PAGE(page_ref_count(page) == 0, page); if (page_ref_sub_and_test(page, count)) - free_the_page(page, compound_order(page)); + free_unref_page(page, compound_order(page)); } EXPORT_SYMBOL(__page_frag_cache_drain); @@ -5000,7 +4991,7 @@ void *page_frag_alloc_align(struct page_frag_cache *nc, goto refill; if (unlikely(nc->pfmemalloc)) { - free_the_page(page, compound_order(page)); + free_unref_page(page, compound_order(page)); goto refill; } @@ -5044,7 +5035,7 @@ void page_frag_free(void *addr) struct page *page = virt_to_head_page(addr); if (unlikely(put_page_testzero(page))) - free_the_page(page, compound_order(page)); + free_unref_page(page, compound_order(page)); } EXPORT_SYMBOL(page_frag_free); @@ -6471,9 +6462,14 @@ static void alloc_contig_dump_pages(struct list_head *page_list) } } -/* [start, end) must belong to a single zone. */ +/* + * [start, end) must belong to a single zone. + * @migratetype: using migratetype to filter the type of migration in + * trace_mm_alloc_contig_migrate_range_info. + */ int __alloc_contig_migrate_range(struct compact_control *cc, - unsigned long start, unsigned long end) + unsigned long start, unsigned long end, + int migratetype) { /* This function is based on compact_zone() from compaction.c. */ unsigned int nr_reclaimed; @@ -6484,6 +6480,10 @@ int __alloc_contig_migrate_range(struct compact_control *cc, .nid = zone_to_nid(cc->zone), .gfp_mask = GFP_USER | __GFP_MOVABLE | __GFP_RETRY_MAYFAIL, }; + struct page *page; + unsigned long total_mapped = 0; + unsigned long total_migrated = 0; + unsigned long total_reclaimed = 0; lru_cache_disable(); @@ -6509,9 +6509,18 @@ int __alloc_contig_migrate_range(struct compact_control *cc, &cc->migratepages); cc->nr_migratepages -= nr_reclaimed; + if (trace_mm_alloc_contig_migrate_range_info_enabled()) { + total_reclaimed += nr_reclaimed; + list_for_each_entry(page, &cc->migratepages, lru) + total_mapped += page_mapcount(page); + } + ret = migrate_pages(&cc->migratepages, alloc_migration_target, NULL, (unsigned long)&mtc, cc->mode, MR_CONTIG_RANGE, NULL); + if (trace_mm_alloc_contig_migrate_range_info_enabled() && !ret) + total_migrated += cc->nr_migratepages; + /* * On -ENOMEM, migrate_pages() bails out right away. It is pointless * to retry again over this error, so do the same here. @@ -6525,9 +6534,13 @@ int __alloc_contig_migrate_range(struct compact_control *cc, if (!(cc->gfp_mask & __GFP_NOWARN) && ret == -EBUSY) alloc_contig_dump_pages(&cc->migratepages); putback_movable_pages(&cc->migratepages); - return ret; } - return 0; + + trace_mm_alloc_contig_migrate_range_info(start, end, migratetype, + total_migrated, + total_reclaimed, + total_mapped); + return (ret < 0) ? ret : 0; } /** @@ -6607,7 +6620,7 @@ int alloc_contig_range(unsigned long start, unsigned long end, * allocated. So, if we fall through be sure to clear ret so that * -EBUSY is not accidentally used or returned to caller. */ - ret = __alloc_contig_migrate_range(&cc, start, end); + ret = __alloc_contig_migrate_range(&cc, start, end, migratetype); if (ret && ret != -EBUSY) goto done; ret = 0; diff --git a/mm/page_isolation.c b/mm/page_isolation.c index fefc8a9269447a077c6cf2b2aab01a1fbb046503..09eb445cfde9ce921d3999f658fb3921c8b1c6cd 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c @@ -437,7 +437,7 @@ static int isolate_single_pageblock(unsigned long boundary_pfn, int flags, } ret = __alloc_contig_migrate_range(&cc, head_pfn, - head_pfn + nr_pages); + head_pfn + nr_pages, page_mt); /* * restore the page's migratetype so that it can diff --git a/mm/rmap.c b/mm/rmap.c index 88345e743c4f62e2fe6a03e0aec5b9bba5ace6fa..27f8881be2ad416b5b8bbfb89052f791a38d3690 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -182,8 +182,6 @@ static void anon_vma_chain_link(struct vm_area_struct *vma, * for the new allocation. At the same time, we do not want * to do any locking for the common case of already having * an anon_vma. - * - * This must be called with the mmap_lock held for reading. */ int __anon_vma_prepare(struct vm_area_struct *vma) { @@ -191,6 +189,7 @@ int __anon_vma_prepare(struct vm_area_struct *vma) struct anon_vma *anon_vma, *allocated; struct anon_vma_chain *avc; + mmap_assert_locked(mm); might_sleep(); avc = anon_vma_chain_alloc(GFP_KERNEL); diff --git a/mm/swap.c b/mm/swap.c index e5380d732c0dfbbb1bf684e3f3052b7891565289..1c9e8f70d6b5c823c95ad20a4345787dbb32449a 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -74,22 +74,21 @@ static DEFINE_PER_CPU(struct cpu_fbatches, cpu_fbatches) = { .lock = INIT_LOCAL_LOCK(lock), }; -/* - * This path almost never happens for VM activity - pages are normally freed - * in batches. But it gets used by networking - and for compound pages. - */ -static void __page_cache_release(struct folio *folio) +static void __page_cache_release(struct folio *folio, struct lruvec **lruvecp, + unsigned long *flagsp) { if (folio_test_lru(folio)) { - struct lruvec *lruvec; - unsigned long flags; - - lruvec = folio_lruvec_lock_irqsave(folio, &flags); - lruvec_del_folio(lruvec, folio); + folio_lruvec_relock_irqsave(folio, lruvecp, flagsp); + lruvec_del_folio(*lruvecp, folio); __folio_clear_lru_flags(folio); - unlock_page_lruvec_irqrestore(lruvec, flags); } - /* See comment on folio_test_mlocked in release_pages() */ + + /* + * In rare cases, when truncation or holepunching raced with + * munlock after VM_LOCKED was cleared, Mlocked may still be + * found set here. This does not indicate a problem, unless + * "unevictable_pgs_cleared" appears worryingly large. + */ if (unlikely(folio_test_mlocked(folio))) { long nr_pages = folio_nr_pages(folio); @@ -99,34 +98,35 @@ static void __page_cache_release(struct folio *folio) } } -static void __folio_put_small(struct folio *folio) +/* + * This path almost never happens for VM activity - pages are normally freed + * in batches. But it gets used by networking - and for compound pages. + */ +static void page_cache_release(struct folio *folio) { - __page_cache_release(folio); - mem_cgroup_uncharge(folio); - free_unref_page(&folio->page, 0); -} + struct lruvec *lruvec = NULL; + unsigned long flags; -static void __folio_put_large(struct folio *folio) -{ - /* - * __page_cache_release() is supposed to be called for thp, not for - * hugetlb. This is because hugetlb page does never have PageLRU set - * (it's never listed to any LRU lists) and no memcg routines should - * be called for hugetlb (it has a separate hugetlb_cgroup.) - */ - if (!folio_test_hugetlb(folio)) - __page_cache_release(folio); - destroy_large_folio(folio); + __page_cache_release(folio, &lruvec, &flags); + if (lruvec) + unlock_page_lruvec_irqrestore(lruvec, flags); } void __folio_put(struct folio *folio) { - if (unlikely(folio_is_zone_device(folio))) - free_zone_device_page(&folio->page); - else if (unlikely(folio_test_large(folio))) - __folio_put_large(folio); - else - __folio_put_small(folio); + if (unlikely(folio_is_zone_device(folio))) { + free_zone_device_folio(folio); + return; + } else if (folio_test_hugetlb(folio)) { + free_huge_folio(folio); + return; + } + + page_cache_release(folio); + if (folio_test_large(folio) && folio_test_large_rmappable(folio)) + folio_undo_large_rmappable(folio); + mem_cgroup_uncharge(folio); + free_unref_page(&folio->page, folio_order(folio)); } EXPORT_SYMBOL(__folio_put); @@ -138,22 +138,25 @@ EXPORT_SYMBOL(__folio_put); */ void put_pages_list(struct list_head *pages) { + struct folio_batch fbatch; struct folio *folio, *next; + folio_batch_init(&fbatch); list_for_each_entry_safe(folio, next, pages, lru) { - if (!folio_put_testzero(folio)) { - list_del(&folio->lru); + if (!folio_put_testzero(folio)) continue; - } - if (folio_test_large(folio)) { - list_del(&folio->lru); - __folio_put_large(folio); + if (folio_test_hugetlb(folio)) { + free_huge_folio(folio); continue; } /* LRU flag must be clear because it's passed using the lru */ + if (folio_batch_add(&fbatch, folio) > 0) + continue; + free_unref_folios(&fbatch); } - free_unref_page_list(pages); + if (fbatch.nr) + free_unref_folios(&fbatch); INIT_LIST_HEAD(pages); } EXPORT_SYMBOL(put_pages_list); @@ -175,7 +178,7 @@ static void lru_add_fn(struct lruvec *lruvec, struct folio *folio) * while the LRU lock is held. * * (That is not true of __page_cache_release(), and not necessarily - * true of release_pages(): but those only clear the mlocked flag after + * true of folios_put(): but those only clear the mlocked flag after * folio_put_testzero() has excluded any other users of the folio.) */ if (folio_evictable(folio)) { @@ -213,7 +216,7 @@ static void folio_batch_move_lru(struct folio_batch *fbatch, move_fn_t move_fn) if (move_fn != lru_add_fn && !folio_test_clear_lru(folio)) continue; - lruvec = folio_lruvec_relock_irqsave(folio, lruvec, &flags); + folio_lruvec_relock_irqsave(folio, &lruvec, &flags); move_fn(lruvec, folio); folio_set_lru(folio); @@ -221,8 +224,7 @@ static void folio_batch_move_lru(struct folio_batch *fbatch, move_fn_t move_fn) if (lruvec) unlock_page_lruvec_irqrestore(lruvec, flags); - folios_put(fbatch->folios, folio_batch_count(fbatch)); - folio_batch_reinit(fbatch); + folios_put(fbatch); } static void folio_batch_add_and_move(struct folio_batch *fbatch, @@ -946,47 +948,29 @@ void lru_cache_disable(void) } /** - * release_pages - batched put_page() - * @arg: array of pages to release - * @nr: number of pages + * folios_put_refs - Reduce the reference count on a batch of folios. + * @folios: The folios. + * @refs: The number of refs to subtract from each folio. * - * Decrement the reference count on all the pages in @arg. If it - * fell to zero, remove the page from the LRU and free it. + * Like folio_put(), but for a batch of folios. This is more efficient + * than writing the loop yourself as it will optimise the locks which need + * to be taken if the folios are freed. The folios batch is returned + * empty and ready to be reused for another batch; there is no need + * to reinitialise it. If @refs is NULL, we subtract one from each + * folio refcount. * - * Note that the argument can be an array of pages, encoded pages, - * or folio pointers. We ignore any encoded bits, and turn any of - * them into just a folio that gets free'd. + * Context: May be called in process or interrupt context, but not in NMI + * context. May be called while holding a spinlock. */ -void release_pages(release_pages_arg arg, int nr) +void folios_put_refs(struct folio_batch *folios, unsigned int *refs) { - int i; - struct encoded_page **encoded = arg.encoded_pages; - LIST_HEAD(pages_to_free); + int i, j; struct lruvec *lruvec = NULL; unsigned long flags = 0; - unsigned int lock_batch; - for (i = 0; i < nr; i++) { - unsigned int nr_refs = 1; - struct folio *folio; - - /* Turn any of the argument types into a folio */ - folio = page_folio(encoded_page_ptr(encoded[i])); - - /* Is our next entry actually "nr_pages" -> "nr_refs" ? */ - if (unlikely(encoded_page_flags(encoded[i]) & - ENCODED_PAGE_BIT_NR_PAGES_NEXT)) - nr_refs = encoded_nr_pages(encoded[++i]); - - /* - * Make sure the IRQ-safe lock-holding time does not get - * excessive with a continuous string of pages from the - * same lruvec. The lock is held only if lruvec != NULL. - */ - if (lruvec && ++lock_batch == SWAP_CLUSTER_MAX) { - unlock_page_lruvec_irqrestore(lruvec, flags); - lruvec = NULL; - } + for (i = 0, j = 0; i < folios->nr; i++) { + struct folio *folio = folios->folios[i]; + unsigned int nr_refs = refs ? refs[i] : 1; if (is_huge_zero_page(&folio->page)) continue; @@ -999,53 +983,82 @@ void release_pages(release_pages_arg arg, int nr) if (put_devmap_managed_page_refs(&folio->page, nr_refs)) continue; if (folio_ref_sub_and_test(folio, nr_refs)) - free_zone_device_page(&folio->page); + free_zone_device_folio(folio); continue; } if (!folio_ref_sub_and_test(folio, nr_refs)) continue; - if (folio_test_large(folio)) { + /* hugetlb has its own memcg */ + if (folio_test_hugetlb(folio)) { if (lruvec) { unlock_page_lruvec_irqrestore(lruvec, flags); lruvec = NULL; } - __folio_put_large(folio); + free_huge_folio(folio); continue; } + if (folio_test_large(folio) && + folio_test_large_rmappable(folio)) + folio_undo_large_rmappable(folio); - if (folio_test_lru(folio)) { - struct lruvec *prev_lruvec = lruvec; + __page_cache_release(folio, &lruvec, &flags); - lruvec = folio_lruvec_relock_irqsave(folio, lruvec, - &flags); - if (prev_lruvec != lruvec) - lock_batch = 0; + if (j != i) + folios->folios[j] = folio; + j++; + } + if (lruvec) + unlock_page_lruvec_irqrestore(lruvec, flags); + if (!j) { + folio_batch_reinit(folios); + return; + } - lruvec_del_folio(lruvec, folio); - __folio_clear_lru_flags(folio); - } + folios->nr = j; + mem_cgroup_uncharge_folios(folios); + free_unref_folios(folios); +} +EXPORT_SYMBOL(folios_put_refs); - /* - * In rare cases, when truncation or holepunching raced with - * munlock after VM_LOCKED was cleared, Mlocked may still be - * found set here. This does not indicate a problem, unless - * "unevictable_pgs_cleared" appears worryingly large. - */ - if (unlikely(folio_test_mlocked(folio))) { - __folio_clear_mlocked(folio); - zone_stat_sub_folio(folio, NR_MLOCK); - count_vm_event(UNEVICTABLE_PGCLEARED); - } +/** + * release_pages - batched put_page() + * @arg: array of pages to release + * @nr: number of pages + * + * Decrement the reference count on all the pages in @arg. If it + * fell to zero, remove the page from the LRU and free it. + * + * Note that the argument can be an array of pages, encoded pages, + * or folio pointers. We ignore any encoded bits, and turn any of + * them into just a folio that gets free'd. + */ +void release_pages(release_pages_arg arg, int nr) +{ + struct folio_batch fbatch; + int refs[PAGEVEC_SIZE]; + struct encoded_page **encoded = arg.encoded_pages; + int i; + + folio_batch_init(&fbatch); + for (i = 0; i < nr; i++) { + /* Turn any of the argument types into a folio */ + struct folio *folio = page_folio(encoded_page_ptr(encoded[i])); - list_add(&folio->lru, &pages_to_free); + /* Is our next entry actually "nr_pages" -> "nr_refs" ? */ + refs[fbatch.nr] = 1; + if (unlikely(encoded_page_flags(encoded[i]) & + ENCODED_PAGE_BIT_NR_PAGES_NEXT)) + refs[fbatch.nr] = encoded_nr_pages(encoded[++i]); + + if (folio_batch_add(&fbatch, folio) > 0) + continue; + folios_put_refs(&fbatch, refs); } - if (lruvec) - unlock_page_lruvec_irqrestore(lruvec, flags); - mem_cgroup_uncharge_list(&pages_to_free); - free_unref_page_list(&pages_to_free); + if (fbatch.nr) + folios_put_refs(&fbatch, refs); } EXPORT_SYMBOL(release_pages); @@ -1065,8 +1078,7 @@ void __folio_batch_release(struct folio_batch *fbatch) lru_add_drain(); fbatch->percpu_pvec_drained = true; } - release_pages(fbatch->folios, folio_batch_count(fbatch)); - folio_batch_reinit(fbatch); + folios_put(fbatch); } EXPORT_SYMBOL(__folio_batch_release); diff --git a/mm/swap_state.c b/mm/swap_state.c index 40b84dc47974fc1a03b01529fccd6157e8dc355c..94c9f171e94da3ba52e00c8e66906d1320f7a1ab 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include @@ -282,10 +283,8 @@ void clear_shadow_from_swap_cache(int type, unsigned long begin, * folio_free_swap() _with_ the lock. * - Marcelo */ -void free_swap_cache(struct page *page) +void free_swap_cache(struct folio *folio) { - struct folio *folio = page_folio(page); - if (folio_test_swapcache(folio) && !folio_mapped(folio) && folio_trylock(folio)) { folio_free_swap(folio); @@ -299,9 +298,11 @@ void free_swap_cache(struct page *page) */ void free_page_and_swap_cache(struct page *page) { - free_swap_cache(page); + struct folio *folio = page_folio(page); + + free_swap_cache(folio); if (!is_huge_zero_page(page)) - put_page(page); + folio_put(folio); } /* @@ -310,21 +311,25 @@ void free_page_and_swap_cache(struct page *page) */ void free_pages_and_swap_cache(struct encoded_page **pages, int nr) { + struct folio_batch folios; + unsigned int refs[PAGEVEC_SIZE]; + lru_add_drain(); + folio_batch_init(&folios); for (int i = 0; i < nr; i++) { - struct page *page = encoded_page_ptr(pages[i]); + struct folio *folio = page_folio(encoded_page_ptr(pages[i])); - /* - * Skip over the "nr_pages" entry. It's sufficient to call - * free_swap_cache() only once per folio. - */ + free_swap_cache(folio); + refs[folios.nr] = 1; if (unlikely(encoded_page_flags(pages[i]) & ENCODED_PAGE_BIT_NR_PAGES_NEXT)) - i++; + refs[folios.nr] = encoded_nr_pages(pages[++i]); - free_swap_cache(page); + if (folio_batch_add(&folios, folio) == 0) + folios_put_refs(&folios, refs); } - release_pages(pages, nr); + if (folios.nr) + folios_put_refs(&folios, refs); } static inline bool swap_use_vma_readahead(void) diff --git a/mm/vmscan.c b/mm/vmscan.c index 34614bb7062dbf5df61248085b5bdef2cd6c78ff..44154c63ec6c5d1655286fd99453ac549a608c9e 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1711,14 +1711,15 @@ static unsigned int shrink_folio_list(struct list_head *folio_list, struct pglist_data *pgdat, struct scan_control *sc, struct reclaim_stat *stat, bool ignore_references) { + struct folio_batch free_folios; LIST_HEAD(ret_folios); - LIST_HEAD(free_folios); LIST_HEAD(demote_folios); unsigned int nr_reclaimed = 0; unsigned int pgactivate = 0; bool do_demote_pass; struct swap_iocb *plug = NULL; + folio_batch_init(&free_folios); memset(stat, 0, sizeof(*stat)); cond_resched(); do_demote_pass = can_demote(pgdat->node_id, sc); @@ -2134,14 +2135,14 @@ static unsigned int shrink_folio_list(struct list_head *folio_list, */ nr_reclaimed += nr_pages; - /* - * Is there need to periodically free_folio_list? It would - * appear not as the counts should be low - */ - if (unlikely(folio_test_large(folio))) - destroy_large_folio(folio); - else - list_add(&folio->lru, &free_folios); + if (folio_test_large(folio) && + folio_test_large_rmappable(folio)) + folio_undo_large_rmappable(folio); + if (folio_batch_add(&free_folios, folio) == 0) { + mem_cgroup_uncharge_folios(&free_folios); + try_to_unmap_flush(); + free_unref_folios(&free_folios); + } continue; activate_locked_split: @@ -2205,9 +2206,9 @@ static unsigned int shrink_folio_list(struct list_head *folio_list, pgactivate = stat->nr_activate[0] + stat->nr_activate[1]; - mem_cgroup_uncharge_list(&free_folios); + mem_cgroup_uncharge_folios(&free_folios); try_to_unmap_flush(); - free_unref_page_list(&free_folios); + free_unref_folios(&free_folios); list_splice(&ret_folios, folio_list); count_vm_events(PGACTIVATE, pgactivate); @@ -2506,7 +2507,6 @@ static int too_many_isolated(struct pglist_data *pgdat, int file, /* * move_folios_to_lru() moves folios from private @list to appropriate LRU list. - * On return, @list is reused as a list of folios to be freed by the caller. * * Returns the number of pages moved to the given lruvec. */ @@ -2514,8 +2514,9 @@ static unsigned int move_folios_to_lru(struct lruvec *lruvec, struct list_head *list) { int nr_pages, nr_moved = 0; - LIST_HEAD(folios_to_free); + struct folio_batch free_folios; + folio_batch_init(&free_folios); while (!list_empty(list)) { struct folio *folio = lru_to_folio(list); @@ -2544,12 +2545,15 @@ static unsigned int move_folios_to_lru(struct lruvec *lruvec, if (unlikely(folio_put_testzero(folio))) { __folio_clear_lru_flags(folio); - if (unlikely(folio_test_large(folio))) { + if (folio_test_large(folio) && + folio_test_large_rmappable(folio)) + folio_undo_large_rmappable(folio); + if (folio_batch_add(&free_folios, folio) == 0) { spin_unlock_irq(&lruvec->lru_lock); - destroy_large_folio(folio); + mem_cgroup_uncharge_folios(&free_folios); + free_unref_folios(&free_folios); spin_lock_irq(&lruvec->lru_lock); - } else - list_add(&folio->lru, &folios_to_free); + } continue; } @@ -2566,10 +2570,12 @@ static unsigned int move_folios_to_lru(struct lruvec *lruvec, workingset_age_nonresident(lruvec, nr_pages); } - /* - * To save our caller's stack, now use input list for pages to free. - */ - list_splice(&folios_to_free, list); + if (free_folios.nr) { + spin_unlock_irq(&lruvec->lru_lock); + mem_cgroup_uncharge_folios(&free_folios); + free_unref_folios(&free_folios); + spin_lock_irq(&lruvec->lru_lock); + } return nr_moved; } @@ -2648,8 +2654,6 @@ static unsigned long shrink_inactive_list(unsigned long nr_to_scan, spin_unlock_irq(&lruvec->lru_lock); lru_note_cost(lruvec, file, stat.nr_pageout, nr_scanned - nr_reclaimed); - mem_cgroup_uncharge_list(&folio_list); - free_unref_page_list(&folio_list); /* * If dirty folios are scanned that are not queued for IO, it @@ -2790,8 +2794,6 @@ static void shrink_active_list(unsigned long nr_to_scan, nr_activate = move_folios_to_lru(lruvec, &l_active); nr_deactivate = move_folios_to_lru(lruvec, &l_inactive); - /* Keep all free folios in l_active list */ - list_splice(&l_inactive, &l_active); __count_vm_events(PGDEACTIVATE, nr_deactivate); __count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE, nr_deactivate); @@ -2801,8 +2803,6 @@ static void shrink_active_list(unsigned long nr_to_scan, if (nr_rotated) lru_note_cost(lruvec, file, 0, nr_rotated); - mem_cgroup_uncharge_list(&l_active); - free_unref_page_list(&l_active); trace_mm_vmscan_lru_shrink_active(pgdat->node_id, nr_taken, nr_activate, nr_deactivate, nr_rotated, sc->priority, file); } @@ -5291,10 +5291,6 @@ static int evict_folios(struct lruvec *lruvec, struct scan_control *sc, int swap spin_unlock_irq(&lruvec->lru_lock); - mem_cgroup_uncharge_list(&list); - free_unref_page_list(&list); - - INIT_LIST_HEAD(&list); list_splice_init(&clean, &list); if (!list_empty(&list)) {