diff --git a/Documentation/ABI/testing/sysfs-kernel-mm-transparent-hugepage b/Documentation/ABI/testing/sysfs-kernel-mm-transparent-hugepage
new file mode 100644
index 0000000000000000000000000000000000000000..7bfbb9cc2c11301b1c163a80c819669039ba2b32
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-kernel-mm-transparent-hugepage
@@ -0,0 +1,18 @@
+What:		/sys/kernel/mm/transparent_hugepage/
+Date:		April 2024
+Contact:	Linux memory management mailing list
+Description:
+	/sys/kernel/mm/transparent_hugepage/ contains a number of files and
+	subdirectories,
+
+	- defrag
+	- enabled
+	- hpage_pmd_size
+	- khugepaged
+	- shmem_enabled
+	- use_zero_page
+	- subdirectories of the form hugepages-<size>kB, where <size>
+	  is the page size of the hugepages supported by the kernel/CPU
+	  combination.
+
+	See Documentation/admin-guide/mm/transhuge.rst for details.
diff --git a/Documentation/admin-guide/mm/transhuge.rst b/Documentation/admin-guide/mm/transhuge.rst
index 936da10c526055c29ff7affd91ce5f83a8c28873..54046f487f155827c48c41542c53fcf3428715b8 100644
--- a/Documentation/admin-guide/mm/transhuge.rst
+++ b/Documentation/admin-guide/mm/transhuge.rst
@@ -203,11 +203,31 @@ PMD-mappable transparent hugepage::
 	cat /sys/kernel/mm/transparent_hugepage/hpage_pmd_size
 
 The kernel tries to use huge, PMD-mappable page on read page fault for
-file exec mapping if CONFIG_READ_ONLY_THP_FOR_FS enabled. It's possible
-to enabled the feature by writing 1 or disablt by writing 0::
+file exec mappings if CONFIG_READ_ONLY_THP_FOR_FS is enabled, or a
+non-PMD-sized page (e.g. 64K on arm64) otherwise; BIT0 controls PMD THP,
+BIT1 controls mTHP. Enable or disable each by setting or clearing its bit::
 
-	echo 0x0 >/sys/kernel/mm/transparent_hugepage/thp_exec_enabled
 	echo 0x1 >/sys/kernel/mm/transparent_hugepage/thp_exec_enabled
+	echo 0x2 >/sys/kernel/mm/transparent_hugepage/thp_exec_enabled
+	echo 0x3 >/sys/kernel/mm/transparent_hugepage/thp_exec_enabled
+
+The kernel can also align mappings to a larger size other than the PMD
+THP size, e.g. 64K on arm64; BIT0 controls file mappings and BIT1
+controls anonymous mappings. This is disabled by default and can be
+enabled by setting the corresponding bit to 1::
+
+	echo 0x1 >/sys/kernel/mm/transparent_hugepage/thp_mapping_align
+	echo 0x2 >/sys/kernel/mm/transparent_hugepage/thp_mapping_align
+	echo 0x3 >/sys/kernel/mm/transparent_hugepage/thp_mapping_align
+
+The kernel can allow high-order pages (greater than PAGE_ALLOC_COSTLY_ORDER;
+only order 4 is supported for now), other than the PMD order, to be stored
+on the PCP lists, which reduces zone lock contention when high-order pages
+are allocated frequently. Enable storing order-4 pages on the PCP lists by
+writing 4, or disable it again by writing 0::
+
+	echo 0 >/sys/kernel/mm/transparent_hugepage/pcp_allow_high_order
+	echo 4 >/sys/kernel/mm/transparent_hugepage/pcp_allow_high_order
 
 khugepaged will be automatically started when one or more hugepage
 sizes are enabled (either by directly setting "always" or "madvise",
@@ -376,7 +396,7 @@ monitor how successfully the system is providing huge pages for use.
 
 thp_fault_alloc
 	is incremented every time a huge page is successfully
-	allocated to handle a page fault.
+	allocated and charged to handle a page fault.
 
 thp_collapse_alloc
 	is incremented by khugepaged when it has found
@@ -384,7 +404,7 @@ thp_collapse_alloc
 	successfully allocated a new huge page to store the data.
 
 thp_fault_fallback
-	is incremented if a page fault fails to allocate
+	is incremented if a page fault fails to allocate or charge
 	a huge page and instead falls back to using small pages.
 
 thp_fault_fallback_charge
@@ -454,6 +474,34 @@ thp_swpout_fallback
 	Usually because failed to allocate some continuous swap space
 	for the huge page.
 
+In /sys/kernel/mm/transparent_hugepage/hugepages-<size>kB/stats, there are
+also individual counters for each huge page size, which can be utilized to
+monitor the system's effectiveness in providing huge pages for usage. Each
+counter has its own corresponding file.
+
+anon_fault_alloc
+	is incremented every time a huge page is successfully
+	allocated and charged to handle a page fault.
+
+anon_fault_fallback
+	is incremented if a page fault fails to allocate or charge
+	a huge page and instead falls back to using huge pages with
+	lower orders or small pages.
+
+anon_fault_fallback_charge
+	is incremented if a page fault fails to charge a huge page and
+	instead falls back to using huge pages with lower orders or
+	small pages even though the allocation was successful.
+
+anon_swpout
+	is incremented every time a huge page is swapped out in one
+	piece without splitting.
+
+anon_swpout_fallback
+	is incremented if a huge page has to be split before swapout.
+	Usually because failed to allocate some continuous swap space
+	for the huge page.
+
 As the system ages, allocating huge pages may be expensive as the system
 uses memory compaction to copy data around memory to free a huge
 page for use. There are some counters in ``/proc/vmstat`` to help
diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
index 07948fe59b9d98fb7051818178d583c2d17d3fca..8d68d00de0a4dc903f74e10ee373a686951cd98c 100644
--- a/arch/arm64/include/asm/pgtable.h
+++ b/arch/arm64/include/asm/pgtable.h
@@ -1147,6 +1147,18 @@ static inline void update_mmu_cache_range(struct vm_fault *vmf,
  */
 #define arch_wants_old_prefaulted_pte	cpu_has_hw_af
 
+/*
+ * Request exec memory is read into pagecache in at least 64K folios. The
+ * trade-off here is performance improvement due to storing translations more
+ * efficiently in the iTLB vs the potential for read amplification due to reading
+ * data from disk that won't be used. The latter is independent of base page
+ * size, so we set a page-size independent block size of 64K. This size can be
+ * contpte-mapped when 4K base pages are in use (16 pages into 1 iTLB entry),
+ * and HPA can coalesce it (4 pages into 1 TLB entry) when 16K base pages are in
+ * use.
+ */ +#define arch_wants_exec_folio_order() ilog2(SZ_64K >> PAGE_SHIFT) + static inline bool pud_sect_supported(void) { return PAGE_SIZE == SZ_4K; diff --git a/include/linux/gfp.h b/include/linux/gfp.h index b18b7e3758be485ad9d9b53df6337253b093e179..b2d4f45a866b96b19aff3ae5800f0a482b2adc50 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -335,6 +335,7 @@ extern void page_frag_free(void *addr); void page_alloc_init_cpuhp(void); int decay_pcp_high(struct zone *zone, struct per_cpu_pages *pcp); +void drain_all_zone_pages(void); void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp); void drain_all_pages(struct zone *zone); void drain_local_pages(struct zone *zone); diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index abf2340a2d18a646fa37c14c86d63a9fb74b1d71..3e5f7064e2de9a9f95c4e05a47c7938a2dde0033 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -51,6 +51,9 @@ enum transparent_hugepage_flag { TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG, TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG, TRANSPARENT_HUGEPAGE_FILE_EXEC_THP_FLAG, + TRANSPARENT_HUGEPAGE_FILE_EXEC_MTHP_FLAG, + TRANSPARENT_HUGEPAGE_FILE_MAPPING_ALIGN_FLAG, + TRANSPARENT_HUGEPAGE_ANON_MAPPING_ALIGN_FLAG, }; struct kobject; @@ -101,6 +104,7 @@ extern unsigned long transparent_hugepage_flags; extern unsigned long huge_anon_orders_always; extern unsigned long huge_anon_orders_madvise; extern unsigned long huge_anon_orders_inherit; +extern unsigned long huge_pcp_allow_orders; static inline bool hugepage_global_enabled(void) { @@ -257,6 +261,29 @@ unsigned long thp_vma_allowable_orders(struct vm_area_struct *vma, enforce_sysfs, orders); } +enum mthp_stat_item { + MTHP_STAT_ANON_FAULT_ALLOC, + MTHP_STAT_ANON_FAULT_FALLBACK, + MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE, + MTHP_STAT_ANON_SWPOUT, + MTHP_STAT_ANON_SWPOUT_FALLBACK, + __MTHP_STAT_COUNT +}; + +struct mthp_stat { + unsigned long stats[ilog2(MAX_PTRS_PER_PTE) + 1][__MTHP_STAT_COUNT]; +}; + +DECLARE_PER_CPU(struct mthp_stat, mthp_stats); + +static inline void count_mthp_stat(int order, enum mthp_stat_item item) +{ + if (order <= 0 || order > PMD_ORDER) + return; + + this_cpu_inc(mthp_stats.stats[order][item]); +} + #define transparent_hugepage_use_zero_page() \ (transparent_hugepage_flags & \ (1<page, idx, val); -} +#endif /* CONFIG_MEMCG */ static inline void __lruvec_stat_add_folio(struct folio *folio, enum node_stat_item idx) @@ -627,24 +621,6 @@ static inline void __lruvec_stat_sub_folio(struct folio *folio, __lruvec_stat_mod_folio(folio, idx, -folio_nr_pages(folio)); } -static inline void inc_lruvec_page_state(struct page *page, - enum node_stat_item idx) -{ - mod_lruvec_page_state(page, idx, 1); -} - -static inline void dec_lruvec_page_state(struct page *page, - enum node_stat_item idx) -{ - mod_lruvec_page_state(page, idx, -1); -} - -static inline void lruvec_stat_mod_folio(struct folio *folio, - enum node_stat_item idx, int val) -{ - mod_lruvec_page_state(&folio->page, idx, val); -} - static inline void lruvec_stat_add_folio(struct folio *folio, enum node_stat_item idx) { diff --git a/mm/filemap.c b/mm/filemap.c index a274d2c5e232f97671a78af6135737675546e33b..d3c813429bf212487d30f74f62729fe192784b6b 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -46,6 +46,7 @@ #include #include #include +#include #include #include #include "internal.h" @@ -3141,6 +3142,10 @@ static int lock_folio_maybe_drop_mmap(struct vm_fault *vmf, struct folio *folio, (transparent_hugepage_flags & \ (1<= 0) { + fpin = maybe_unlock_mmap_for_io(vmf, fpin); + 
ra->size = 1UL << order; + ra->async_size = 0; + ractl._index &= ~((unsigned long)ra->size - 1); + page_cache_ra_order(&ractl, ra, order); + return fpin; + } + } #endif /* If we don't want any read-ahead, don't bother */ diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 0c61e7c7c2c13cc881455463dbdf35e0711f90e3..763bb25e4f9934a2399bee190c8698b341221121 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -74,6 +74,7 @@ unsigned long huge_zero_pfn __read_mostly = ~0UL; unsigned long huge_anon_orders_always __read_mostly; unsigned long huge_anon_orders_madvise __read_mostly; unsigned long huge_anon_orders_inherit __read_mostly; +unsigned long huge_pcp_allow_orders __read_mostly; unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma, unsigned long vm_flags, bool smaps, @@ -417,6 +418,35 @@ static ssize_t use_zero_page_store(struct kobject *kobj, } static struct kobj_attribute use_zero_page_attr = __ATTR_RW(use_zero_page); +static ssize_t pcp_allow_high_order_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sysfs_emit(buf, "%lu\n", READ_ONCE(huge_pcp_allow_orders)); +} +static ssize_t pcp_allow_high_order_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + unsigned long value; + int ret; + + ret = kstrtoul(buf, 10, &value); + if (ret < 0) + return ret; + + /* Only enable order 4 now, 0 is to disable it */ + if (value != 0 && value != (PAGE_ALLOC_COSTLY_ORDER + 1)) + return -EINVAL; + + if (value == 0) + drain_all_zone_pages(); + + WRITE_ONCE(huge_pcp_allow_orders, value); + + return count; +} +static struct kobj_attribute pcp_allow_high_order_attr = + __ATTR_RW(pcp_allow_high_order); + static ssize_t hpage_pmd_size_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { @@ -426,42 +456,118 @@ static struct kobj_attribute hpage_pmd_size_attr = __ATTR_RO(hpage_pmd_size); #ifdef CONFIG_READ_ONLY_THP_FOR_FS +#define FILE_EXEC_THP_ENABLE BIT(0) +#else +#define FILE_EXEC_THP_ENABLE 0 +#endif + +#define FILE_EXEC_MTHP_ENABLE BIT(1) +#define FILE_EXEC_THP_ALL (FILE_EXEC_THP_ENABLE | FILE_EXEC_MTHP_ENABLE) + +static void thp_flag_set(enum transparent_hugepage_flag flag, bool enable) +{ + if (enable) + set_bit(flag, &transparent_hugepage_flags); + else + clear_bit(flag, &transparent_hugepage_flags); +} + static ssize_t thp_exec_enabled_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { - return single_hugepage_flag_show(kobj, attr, buf, - TRANSPARENT_HUGEPAGE_FILE_EXEC_THP_FLAG); + unsigned long val = 0; + +#ifdef CONFIG_READ_ONLY_THP_FOR_FS + if (test_bit(TRANSPARENT_HUGEPAGE_FILE_EXEC_THP_FLAG, + &transparent_hugepage_flags)) + val |= FILE_EXEC_THP_ENABLE; +#endif + + if (test_bit(TRANSPARENT_HUGEPAGE_FILE_EXEC_MTHP_FLAG, + &transparent_hugepage_flags)) + val |= FILE_EXEC_MTHP_ENABLE; + + return sysfs_emit(buf, "0x%lx\n", val); } static ssize_t thp_exec_enabled_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { - size_t ret = single_hugepage_flag_store(kobj, attr, buf, count, - TRANSPARENT_HUGEPAGE_FILE_EXEC_THP_FLAG); - if (ret > 0) { - int err = start_stop_khugepaged(); + unsigned long val; + int ret; - if (err) - ret = err; - } + ret = kstrtoul(buf, 16, &val); + if (ret < 0) + return ret; + if (val & ~FILE_EXEC_THP_ALL) + return -EINVAL; - return ret; +#ifdef CONFIG_READ_ONLY_THP_FOR_FS + thp_flag_set(TRANSPARENT_HUGEPAGE_FILE_EXEC_THP_FLAG, + val & FILE_EXEC_THP_ENABLE); + ret = start_stop_khugepaged(); + if (ret) + return ret; 
+#endif + thp_flag_set(TRANSPARENT_HUGEPAGE_FILE_EXEC_MTHP_FLAG, + val & FILE_EXEC_MTHP_ENABLE); + + return count; } static struct kobj_attribute thp_exec_enabled_attr = __ATTR_RW(thp_exec_enabled); -#endif +#define FILE_MAPPING_ALIGN BIT(0) +#define ANON_MAPPING_ALIGN BIT(1) +#define THP_MAPPING_ALIGN_ALL (FILE_MAPPING_ALIGN | ANON_MAPPING_ALIGN) + +static ssize_t thp_mapping_align_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + unsigned long val = 0; + + if (test_bit(TRANSPARENT_HUGEPAGE_FILE_MAPPING_ALIGN_FLAG, + &transparent_hugepage_flags)) + val |= FILE_MAPPING_ALIGN; + + if (test_bit(TRANSPARENT_HUGEPAGE_ANON_MAPPING_ALIGN_FLAG, + &transparent_hugepage_flags)) + val |= ANON_MAPPING_ALIGN; + + return sysfs_emit(buf, "0x%lx\n", val); +} +static ssize_t thp_mapping_align_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + unsigned long val; + int ret; + + ret = kstrtoul(buf, 16, &val); + if (ret < 0) + return ret; + if (val & ~THP_MAPPING_ALIGN_ALL) + return -EINVAL; + + thp_flag_set(TRANSPARENT_HUGEPAGE_FILE_MAPPING_ALIGN_FLAG, + val & FILE_MAPPING_ALIGN); + thp_flag_set(TRANSPARENT_HUGEPAGE_ANON_MAPPING_ALIGN_FLAG, + val & ANON_MAPPING_ALIGN); + + return count; +} +static struct kobj_attribute thp_mapping_align_attr = + __ATTR_RW(thp_mapping_align); static struct attribute *hugepage_attr[] = { &enabled_attr.attr, &defrag_attr.attr, &use_zero_page_attr.attr, + &pcp_allow_high_order_attr.attr, &hpage_pmd_size_attr.attr, #ifdef CONFIG_SHMEM &shmem_enabled_attr.attr, #endif -#ifdef CONFIG_READ_ONLY_THP_FOR_FS &thp_exec_enabled_attr.attr, -#endif + &thp_mapping_align_attr.attr, NULL, }; @@ -554,6 +660,52 @@ static const struct kobj_type thpsize_ktype = { .sysfs_ops = &kobj_sysfs_ops, }; +DEFINE_PER_CPU(struct mthp_stat, mthp_stats) = {{{0}}}; + +static unsigned long sum_mthp_stat(int order, enum mthp_stat_item item) +{ + unsigned long sum = 0; + int cpu; + + for_each_possible_cpu(cpu) { + struct mthp_stat *this = &per_cpu(mthp_stats, cpu); + + sum += this->stats[order][item]; + } + + return sum; +} + +#define DEFINE_MTHP_STAT_ATTR(_name, _index) \ +static ssize_t _name##_show(struct kobject *kobj, \ + struct kobj_attribute *attr, char *buf) \ +{ \ + int order = to_thpsize(kobj)->order; \ + \ + return sysfs_emit(buf, "%lu\n", sum_mthp_stat(order, _index)); \ +} \ +static struct kobj_attribute _name##_attr = __ATTR_RO(_name) + +DEFINE_MTHP_STAT_ATTR(anon_fault_alloc, MTHP_STAT_ANON_FAULT_ALLOC); +DEFINE_MTHP_STAT_ATTR(anon_fault_fallback, MTHP_STAT_ANON_FAULT_FALLBACK); +DEFINE_MTHP_STAT_ATTR(anon_fault_fallback_charge, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE); +DEFINE_MTHP_STAT_ATTR(anon_swpout, MTHP_STAT_ANON_SWPOUT); +DEFINE_MTHP_STAT_ATTR(anon_swpout_fallback, MTHP_STAT_ANON_SWPOUT_FALLBACK); + +static struct attribute *stats_attrs[] = { + &anon_fault_alloc_attr.attr, + &anon_fault_fallback_attr.attr, + &anon_fault_fallback_charge_attr.attr, + &anon_swpout_attr.attr, + &anon_swpout_fallback_attr.attr, + NULL, +}; + +static struct attribute_group stats_attr_group = { + .name = "stats", + .attrs = stats_attrs, +}; + static struct thpsize *thpsize_create(int order, struct kobject *parent) { unsigned long size = (PAGE_SIZE << order) / SZ_1K; @@ -577,6 +729,12 @@ static struct thpsize *thpsize_create(int order, struct kobject *parent) return ERR_PTR(ret); } + ret = sysfs_create_group(&thpsize->kobj, &stats_attr_group); + if (ret) { + kobject_put(&thpsize->kobj); + return ERR_PTR(ret); + } + thpsize->order = order; return 
thpsize; } @@ -853,6 +1011,66 @@ static unsigned long __thp_get_unmapped_area(struct file *filp, return ret; } +#define thp_file_mapping_align_enabled() \ + (transparent_hugepage_flags & \ + (1<f_mapping; + if (!mapping || !mapping_large_folio_support(mapping)) + return false; + + return true; +} + +static bool anon_mapping_align_enabled(int order) +{ + unsigned long mask; + + if (!thp_anon_mapping_align_enabled()) + return 0; + + mask = READ_ONCE(huge_anon_orders_always) | + READ_ONCE(huge_anon_orders_madvise); + + if (hugepage_global_enabled()) + mask |= READ_ONCE(huge_anon_orders_inherit); + + mask = BIT(order) & mask; + if (!mask) + return false; + + return true; +} + +static unsigned long folio_get_unmapped_area(struct file *filp, unsigned long addr, + unsigned long len, unsigned long pgoff, unsigned long flags) +{ + int order = arch_wants_exec_folio_order(); + + if (order < 0) + return 0; + + if (file_mapping_align_enabled(filp) || + (!filp && anon_mapping_align_enabled(order))) + return __thp_get_unmapped_area(filp, addr, len, pgoff, flags, + PAGE_SIZE << order); + return 0; +} + unsigned long thp_get_unmapped_area(struct file *filp, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags) { @@ -863,6 +1081,10 @@ unsigned long thp_get_unmapped_area(struct file *filp, unsigned long addr, if (ret) return ret; + ret = folio_get_unmapped_area(filp, addr, len, off, flags); + if (ret) + return ret; + return current->mm->get_unmapped_area(filp, addr, len, pgoff, flags); } EXPORT_SYMBOL_GPL(thp_get_unmapped_area); @@ -882,6 +1104,8 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf, folio_put(folio); count_vm_event(THP_FAULT_FALLBACK); count_vm_event(THP_FAULT_FALLBACK_CHARGE); + count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_FALLBACK); + count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE); return VM_FAULT_FALLBACK; } folio_throttle_swaprate(folio, gfp); @@ -932,6 +1156,7 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf, mm_inc_nr_ptes(vma->vm_mm); spin_unlock(vmf->ptl); count_vm_event(THP_FAULT_ALLOC); + count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_ALLOC); count_memcg_event_mm(vma->vm_mm, THP_FAULT_ALLOC); } @@ -1052,6 +1277,7 @@ vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf) folio = vma_alloc_folio(gfp, HPAGE_PMD_ORDER, vma, haddr, true); if (unlikely(!folio)) { count_vm_event(THP_FAULT_FALLBACK); + count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_FALLBACK); return VM_FAULT_FALLBACK; } return __do_huge_pmd_anonymous_page(vmf, &folio->page, gfp); diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 9a9f1ebe6dd7c3aced7e7c3ddd2e17984e9d4938..7d329e9eeec85b6a5186d0c792d4f894255ff98b 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -2147,23 +2147,23 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr, xas_lock_irq(&xas); } - nr = thp_nr_pages(hpage); + folio = page_folio(hpage); + nr = folio_nr_pages(folio); if (is_shmem) - __mod_lruvec_page_state(hpage, NR_SHMEM_THPS, nr); + __lruvec_stat_mod_folio(folio, NR_SHMEM_THPS, nr); else - __mod_lruvec_page_state(hpage, NR_FILE_THPS, nr); + __lruvec_stat_mod_folio(folio, NR_FILE_THPS, nr); if (nr_none) { - __mod_lruvec_page_state(hpage, NR_FILE_PAGES, nr_none); + __lruvec_stat_mod_folio(folio, NR_FILE_PAGES, nr_none); /* nr_none is always 0 for non-shmem. 
*/ - __mod_lruvec_page_state(hpage, NR_SHMEM, nr_none); + __lruvec_stat_mod_folio(folio, NR_SHMEM, nr_none); } /* * Mark hpage as uptodate before inserting it into the page cache so * that it isn't mistaken for an fallocated but unwritten page. */ - folio = page_folio(hpage); folio_mark_uptodate(folio); folio_ref_add(folio, HPAGE_PMD_NR - 1); @@ -2173,7 +2173,7 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr, /* Join all the small entries into a single multi-index entry. */ xas_set_order(&xas, start, HPAGE_PMD_ORDER); - xas_store(&xas, hpage); + xas_store(&xas, folio); WARN_ON_ONCE(xas_error(&xas)); xas_unlock_irq(&xas); @@ -2184,7 +2184,7 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr, retract_page_tables(mapping, start); if (cc && !cc->is_khugepaged) result = SCAN_PTE_MAPPED_HUGEPAGE; - unlock_page(hpage); + folio_unlock(folio); /* * The collapse has succeeded, so free the old pages. diff --git a/mm/memcontrol.c b/mm/memcontrol.c index f1cf73835cba6ef592492915f0dfd5403fa8f078..ff64d2c36749edbd0daa6639397271747b8eab08 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -897,16 +897,15 @@ void __mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, __mod_memcg_lruvec_state(lruvec, idx, val); } -void __mod_lruvec_page_state(struct page *page, enum node_stat_item idx, +void __lruvec_stat_mod_folio(struct folio *folio, enum node_stat_item idx, int val) { - struct page *head = compound_head(page); /* rmap on tail pages */ struct mem_cgroup *memcg; - pg_data_t *pgdat = page_pgdat(page); + pg_data_t *pgdat = folio_pgdat(folio); struct lruvec *lruvec; rcu_read_lock(); - memcg = page_memcg(head); + memcg = folio_memcg(folio); /* Untracked pages have no memcg, no lruvec. Update only the node */ if (!memcg) { rcu_read_unlock(); @@ -918,7 +917,7 @@ void __mod_lruvec_page_state(struct page *page, enum node_stat_item idx, __mod_lruvec_state(lruvec, idx, val); rcu_read_unlock(); } -EXPORT_SYMBOL(__mod_lruvec_page_state); +EXPORT_SYMBOL(__lruvec_stat_mod_folio); void __mod_lruvec_kmem_state(void *p, enum node_stat_item idx, int val) { diff --git a/mm/memory.c b/mm/memory.c index 64e1fd144d9380c02613962e41a42ed22423ed85..4ef917a182f9f49c41d24a0b4df2c2a94b538b02 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -4349,6 +4349,7 @@ static struct folio *alloc_anon_folio(struct vm_fault *vmf) folio = vma_alloc_folio(gfp, order, vma, addr, true); if (folio) { if (mem_cgroup_charge(folio, vma->vm_mm, gfp)) { + count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE); folio_put(folio); goto next; } @@ -4357,6 +4358,7 @@ static struct folio *alloc_anon_folio(struct vm_fault *vmf) return folio; } next: + count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK); order = next_order(&orders, order); } @@ -4465,6 +4467,9 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf) folio_ref_add(folio, nr_pages - 1); add_mm_counter(vma->vm_mm, MM_ANONPAGES, nr_pages); +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + count_mthp_stat(folio_order(folio), MTHP_STAT_ANON_FAULT_ALLOC); +#endif add_reliable_folio_counter(folio, vma->vm_mm, nr_pages); folio_add_new_anon_rmap(folio, vma, addr); folio_add_lru_vma(folio, vma); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 4652dc4539646edb2bcf57c29c7b4d24907e8c0f..f225f412e71d2de5531628a7c912b13c57f8cda8 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -528,7 +528,7 @@ static void bad_page(struct page *page, const char *reason) static inline unsigned int order_to_pindex(int migratetype, int order) { #ifdef 
CONFIG_TRANSPARENT_HUGEPAGE - if (order > PAGE_ALLOC_COSTLY_ORDER) { + if (order > PAGE_ALLOC_COSTLY_ORDER + 1) { VM_BUG_ON(order != HPAGE_PMD_ORDER); return NR_LOWORDER_PCP_LISTS; } @@ -560,6 +560,8 @@ static inline bool pcp_allowed_order(unsigned int order) #ifdef CONFIG_TRANSPARENT_HUGEPAGE if (order == HPAGE_PMD_ORDER) return true; + if (order == READ_ONCE(huge_pcp_allow_orders)) + return true; #endif return false; } @@ -6829,6 +6831,20 @@ void zone_pcp_reset(struct zone *zone) } } +void drain_all_zone_pages(void) +{ + struct zone *zone; + + mutex_lock(&pcp_batch_high_lock); + for_each_populated_zone(zone) + __zone_set_pageset_high_and_batch(zone, 0, 0, 1); + __drain_all_pages(NULL, true); + for_each_populated_zone(zone) + __zone_set_pageset_high_and_batch(zone, zone->pageset_high_min, + zone->pageset_high_max, zone->pageset_batch); + mutex_unlock(&pcp_batch_high_lock); +} + #ifdef CONFIG_MEMORY_HOTREMOVE /* * All pages in the range must be in a single zone, must not contain holes, diff --git a/mm/page_io.c b/mm/page_io.c index ea8d57b9b3ae52507c4ba519ce90833ce75b6970..80e49e536d37831968abf4d3b69455884832aa00 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -212,6 +212,7 @@ static inline void count_swpout_vm_event(struct folio *folio) count_memcg_folio_events(folio, THP_SWPOUT, 1); count_vm_event(THP_SWPOUT); } + count_mthp_stat(folio_order(folio), MTHP_STAT_ANON_SWPOUT); #endif count_vm_events(PSWPOUT, folio_nr_pages(folio)); } diff --git a/mm/vmscan.c b/mm/vmscan.c index 7e9cbb2bc454df47adadbe987c54487f2f675291..6cf8dd04d13ad29f8804a8ed0d907950def264eb 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1911,6 +1911,8 @@ static unsigned int shrink_folio_list(struct list_head *folio_list, goto activate_locked; } if (!add_to_swap(folio)) { + int __maybe_unused order = folio_order(folio); + if (!folio_test_large(folio)) goto activate_locked_split; /* Fallback to swap normal pages */ @@ -1922,6 +1924,7 @@ static unsigned int shrink_folio_list(struct list_head *folio_list, THP_SWPOUT_FALLBACK, 1); count_vm_event(THP_SWPOUT_FALLBACK); } + count_mthp_stat(order, MTHP_STAT_ANON_SWPOUT_FALLBACK); #endif if (!add_to_swap(folio)) goto activate_locked_split;
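---

Note (not part of the patch): below is a minimal userspace sketch of how the new sysfs ABI added above could be exercised. The paths, the bitmask semantics of thp_exec_enabled, the pcp_allow_high_order values, and the per-size counter names are taken from the documentation hunks in this patch; the hugepages-64kB directory is an assumption (it only exists on kernel/CPU combinations that offer a 64K mTHP size), and writing the knobs requires root.

/*
 * Sketch only: read the new per-size mTHP counters and poke the new knobs.
 * Build with: cc -Wall -o mthp-demo mthp-demo.c
 */
#include <stdio.h>

#define THP_SYSFS "/sys/kernel/mm/transparent_hugepage/"

/* Read one counter file, e.g. hugepages-64kB/stats/anon_fault_alloc. */
static long read_counter(const char *relpath)
{
	char path[256];
	long val = -1;
	FILE *f;

	snprintf(path, sizeof(path), THP_SYSFS "%s", relpath);
	f = fopen(path, "r");
	if (!f)
		return -1;
	if (fscanf(f, "%ld", &val) != 1)
		val = -1;
	fclose(f);
	return val;
}

/* Write a value string to a top-level THP knob; returns 0 on success. */
static int write_knob(const char *name, const char *value)
{
	char path[256];
	FILE *f;
	int ret;

	snprintf(path, sizeof(path), THP_SYSFS "%s", name);
	f = fopen(path, "w");
	if (!f)
		return -1;
	ret = (fputs(value, f) < 0) ? -1 : 0;
	if (fclose(f) != 0)
		ret = -1;
	return ret;
}

int main(void)
{
	static const char * const stats[] = {
		"anon_fault_alloc", "anon_fault_fallback",
		"anon_fault_fallback_charge", "anon_swpout",
		"anon_swpout_fallback",
	};
	char rel[128];
	unsigned int i;

	/* Enable exec THP (BIT0) and exec mTHP (BIT1), as documented above. */
	if (write_knob("thp_exec_enabled", "0x3"))
		perror("thp_exec_enabled");

	/* Allow order-4 pages on the PCP lists; writing 0 disables it again. */
	if (write_knob("pcp_allow_high_order", "4"))
		perror("pcp_allow_high_order");

	/* Dump the per-size anon counters for the (assumed) 64kB mTHP size. */
	for (i = 0; i < sizeof(stats) / sizeof(stats[0]); i++) {
		snprintf(rel, sizeof(rel), "hugepages-64kB/stats/%s", stats[i]);
		printf("%-28s %ld\n", stats[i], read_counter(rel));
	}
	return 0;
}

Per the sysfs store handlers in the patch, thp_exec_enabled and thp_mapping_align are parsed as hexadecimal bitmasks (kstrtoul with base 16), while pcp_allow_high_order is decimal and accepts only 0 or PAGE_ALLOC_COSTLY_ORDER + 1 (i.e. 4).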