From 634ee64237e4c2888e75f3ee016712352b1678c9 Mon Sep 17 00:00:00 2001
From: Pankaj Raghav
Date: Mon, 15 Jan 2024 11:25:22 +0100
Subject: [PATCH 1/7] readahead: use ilog2 instead of a while loop in page_cache_ra_order()

ANBZ: #9728

commit e03c16fb4af1dfc615a4e1f51be0d5fe5840b904 upstream

A while loop is used to adjust the new_order to be lower than the ra->size.
ilog2 could be used to do the same instead of using a loop. ilog2 typically
resolves to a bit scan reverse instruction. This is particularly useful when
ra->size is smaller than 2^new_order, as it resolves in one instruction
instead of looping to find the new_order.

No functional changes.

Link: https://lkml.kernel.org/r/20240115102523.2336742-1-kernel@pankajraghav.com
Signed-off-by: Pankaj Raghav
Cc: Matthew Wilcox (Oracle)
Signed-off-by: Andrew Morton
Signed-off-by: Baolin Wang
---
 mm/readahead.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/mm/readahead.c b/mm/readahead.c
index fb5281d0c9ac..9f8216ade368 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -505,10 +505,8 @@ void page_cache_ra_order(struct readahead_control *ractl,
 if (new_order < MAX_PAGECACHE_ORDER) new_order += 2; - if (new_order > MAX_PAGECACHE_ORDER) - new_order = MAX_PAGECACHE_ORDER; - while ((1 << new_order) > ra->size) - new_order--; + new_order = min_t(unsigned int, MAX_PAGECACHE_ORDER, new_order); + new_order = min_t(unsigned int, new_order, ilog2(ra->size)); /* See comment in page_cache_ra_unbounded() */ nofs = memalloc_nofs_save();
--
Gitee

From bce49792b45a67ced9950bec6b63d54ca2193304 Mon Sep 17 00:00:00 2001
From: Ryan Roberts
Date: Wed, 25 Dec 2024 15:05:46 +0800
Subject: [PATCH 2/7] mm: mTHP user controls to configure pagecache large folio sizes

ANBZ: #9728

cherry-picked from: https://lore.kernel.org/lkml/20240717071257.4141363-1-ryan.roberts@arm.com/T/#m25b51aa890b123202cda93fa0e67340b3e4b26b6

Add mTHP controls to sysfs to allow user space to configure the folio sizes
that can be considered for allocation of file-backed memory:

  /sys/kernel/mm/transparent_hugepage/hugepages-*kB/file_enabled

For now, the control can be set to either `always` or `never` to enable or
disable that size. More options may be added in future.

By default, at boot, all folio sizes are enabled, and the algorithm used to
select a folio size remains conceptually unchanged: increase by 2 enabled
orders each time a readahead marker is hit, then reduce to the closest enabled
order to fit within the bounds of ra size, index alignment and EOF. So when
all folio sizes are enabled, behavior should be unchanged. When folio sizes
are disabled, the algorithm will never select them.

Systems such as Android are always under extreme memory pressure, and as a
result fragmentation often causes attempts to allocate large folios to fail
and fall back to smaller folios. By fixing the pagecache to one large folio
size (e.g. 64K) plus a fallback to small folios, a large source of this
fragmentation can be removed and 64K mTHP allocations succeed more often,
allowing the system to benefit from improved performance on arm64 and other
arches that support "contpte".
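As an illustration of the intended usage (the sizes shown are assumptions for
this example; which hugepages-*kB directories exist varies by system and
architecture), the page cache could be restricted to 64K large folios plus the
implicit order-0 fallback with something like:

  # illustrative sketch only; adjust the paths to the sizes your system exposes
  echo always > /sys/kernel/mm/transparent_hugepage/hugepages-64kB/file_enabled
  echo never  > /sys/kernel/mm/transparent_hugepage/hugepages-128kB/file_enabled
  echo never  > /sys/kernel/mm/transparent_hugepage/hugepages-256kB/file_enabled

Order-0 folios always remain available as a fallback, so only the large folio
sizes need to be configured.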
Signed-off-by: Ryan Roberts
Signed-off-by: Baolin Wang
---
 Documentation/admin-guide/mm/transhuge.rst | 21 +++++++++++
 include/linux/huge_mm.h | 42 ++++++++++++---------
 mm/filemap.c | 15 +++++---
 mm/huge_memory.c | 43 ++++++++++++++++++++++
 mm/readahead.c | 43 ++++++++++++++++++----
 5 files changed, 134 insertions(+), 30 deletions(-)

diff --git a/Documentation/admin-guide/mm/transhuge.rst b/Documentation/admin-guide/mm/transhuge.rst
index c05cbfe1318c..f6b3aa8e2df1 100644
--- a/Documentation/admin-guide/mm/transhuge.rst
+++ b/Documentation/admin-guide/mm/transhuge.rst
@@ -283,6 +283,27 @@ processes. Exceeding the number would block the collapse::
 A higher value may increase memory footprint for some workloads. +File-Backed Hugepages +--------------------- + +The kernel will automatically select an appropriate THP size for file-backed +memory from a set of allowed sizes. By default all THP sizes that the page cache +supports are allowed, but this set can be modified with one of:: + + echo always >/sys/kernel/mm/transparent_hugepage/hugepages-<size>kB/file_enabled + echo never >/sys/kernel/mm/transparent_hugepage/hugepages-<size>kB/file_enabled + +where <size> is the hugepage size being addressed, the available sizes for which +vary by system. ``always`` adds the hugepage size to the set of allowed sizes, +and ``never`` removes the hugepage size from the set of allowed sizes. + +In some situations, constraining the allowed sizes can reduce memory +fragmentation, resulting in fewer allocation fallbacks and improved system +performance. + +Note that any changes to the allowed set of sizes only apply to future +file-backed THP allocations. + Boot parameters ===============
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index f6c139d2edf9..11eaca1a5d90 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -95,6 +95,24 @@ extern struct kobj_attribute thpsize_shmem_enabled_attr;
 #define thp_vma_allowable_order(vma, vm_flags, smaps, in_pf, enforce_sysfs, order) \ (!!thp_vma_allowable_orders(vma, vm_flags, smaps, in_pf, enforce_sysfs, BIT(order))) +static inline int lowest_order(unsigned long orders) +{ + if (orders) + return __ffs(orders); + return -1; +} + +static inline int highest_order(unsigned long orders) +{ + return fls_long(orders) - 1; +} + +static inline int next_order(unsigned long *orders, int prev) +{ + *orders &= ~BIT(prev); + return highest_order(*orders); +} + enum mthp_stat_item { MTHP_STAT_ANON_FAULT_ALLOC, MTHP_STAT_ANON_FAULT_FALLBACK,
@@ -155,6 +173,12 @@ extern unsigned long transparent_hugepage_flags;
 extern unsigned long huge_anon_orders_always; extern unsigned long huge_anon_orders_madvise; extern unsigned long huge_anon_orders_inherit; +extern unsigned long huge_file_orders_always; + +static inline unsigned long file_orders_always(void) +{ + return READ_ONCE(huge_file_orders_always); +} static inline bool hugepage_global_enabled(void) {
@@ -169,17 +193,6 @@ static inline bool hugepage_global_always(void)
 (1< MAX_PAGECACHE_ORDER) - order = MAX_PAGECACHE_ORDER; + + orders = file_orders_always() | BIT(0); + orders &= BIT(order + 1) - 1; /* If we're not aligned, allocate a smaller folio */ if (index & ((1UL << order) - 1)) - order = __ffs(index); + orders &= BIT(__ffs(index) + 1) - 1; + order = highest_order(orders); - do { + while (orders) { gfp_t alloc_gfp = gfp; err = -ENOMEM;
@@ -2000,7 +2003,9 @@ struct folio *__filemap_get_folio(struct address_space *mapping, pgoff_t index,
 break; folio_put(folio); folio = NULL; - } while (order-- > 0); + + order =
next_order(&orders, order); + }; if (err == -EEXIST) goto repeat; diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 61f18d588806..902e33499660 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -75,6 +75,7 @@ unsigned long huge_zero_pfn __read_mostly = ~0UL; unsigned long huge_anon_orders_always __read_mostly; unsigned long huge_anon_orders_madvise __read_mostly; unsigned long huge_anon_orders_inherit __read_mostly; +unsigned long huge_file_orders_always __read_mostly; static bool anon_orders_configured __initdata; unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma, @@ -506,6 +507,37 @@ static ssize_t anon_enabled_store(struct kobject *kobj, return ret; } +static ssize_t file_enabled_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + int order = to_thpsize(kobj)->order; + const char *output; + + if (test_bit(order, &huge_file_orders_always)) + output = "[always] never"; + else + output = "always [never]"; + + return sysfs_emit(buf, "%s\n", output); +} + +static ssize_t file_enabled_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + int order = to_thpsize(kobj)->order; + ssize_t ret = count; + + if (sysfs_streq(buf, "always")) + set_bit(order, &huge_file_orders_always); + else if (sysfs_streq(buf, "never")) + clear_bit(order, &huge_file_orders_always); + else + ret = -EINVAL; + + return ret; +} + static struct kobj_attribute anon_enabled_attr = __ATTR(enabled, 0644, anon_enabled_show, anon_enabled_store); @@ -518,7 +550,11 @@ static const struct attribute_group anon_ctrl_attr_grp = { .attrs = anon_ctrl_attrs, }; +static struct kobj_attribute file_enabled_attr = + __ATTR(file_enabled, 0644, file_enabled_show, file_enabled_store); + static struct attribute *file_ctrl_attrs[] = { + &file_enabled_attr.attr, #ifdef CONFIG_SHMEM &thpsize_shmem_enabled_attr.attr, #endif @@ -725,6 +761,13 @@ static int __init hugepage_init_sysfs(struct kobject **hugepage_kobj) if (!anon_orders_configured) huge_anon_orders_inherit = BIT(PMD_ORDER); + /* + * For pagecache, default to enabling all orders. powerpc's PMD_ORDER + * (and therefore THP_ORDERS_ALL_FILE_DEFAULT) isn't a compile-time + * constant so we have to do this here. + */ + huge_file_orders_always = THP_ORDERS_ALL_FILE_DEFAULT; + *hugepage_kobj = kobject_create_and_add("transparent_hugepage", mm_kobj); if (unlikely(!*hugepage_kobj)) { pr_err("failed to create transparent hugepage kobject\n"); diff --git a/mm/readahead.c b/mm/readahead.c index 9f8216ade368..5d300fbd1eba 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -486,6 +486,34 @@ static inline int ra_alloc_folio(struct readahead_control *ractl, pgoff_t index, return 0; } +static int select_new_order(int old_order, int max_order, unsigned long orders) +{ + unsigned long hi_orders, lo_orders; + + /* + * Select the next order to use from the set in `orders`, while ensuring + * we don't go above max_order. Prefer the next + 1 highest allowed + * order after old_order, unless there isn't one, in which case return + * the closest allowed order, which is either the next highest allowed + * order or less than or equal to old_order. The "next + 1" skip + * behaviour is intended to allow ramping up to large folios quickly. 
+ */ + + orders &= BIT(max_order + 1) - 1; + VM_WARN_ON(!orders); + hi_orders = orders & ~(BIT(old_order + 1) - 1); + + if (hi_orders) { + old_order = lowest_order(hi_orders); + hi_orders &= ~BIT(old_order); + if (hi_orders) + return lowest_order(hi_orders); + } + + lo_orders = orders & (BIT(old_order + 1) - 1); + return highest_order(lo_orders); +} + void page_cache_ra_order(struct readahead_control *ractl, struct file_ra_state *ra, unsigned int new_order) {
@@ -496,17 +524,15 @@ void page_cache_ra_order(struct readahead_control *ractl,
 unsigned int nofs; int err = 0; gfp_t gfp = readahead_gfp_mask(mapping); + unsigned long orders; - if (!mapping_large_folio_support(mapping) || ra->size < 4) + if (!mapping_large_folio_support(mapping)) goto fallback; limit = min(limit, index + ra->size - 1); - if (new_order < MAX_PAGECACHE_ORDER) - new_order += 2; - - new_order = min_t(unsigned int, MAX_PAGECACHE_ORDER, new_order); - new_order = min_t(unsigned int, new_order, ilog2(ra->size)); + orders = file_orders_always() | BIT(0); + new_order = select_new_order(new_order, ilog2(ra->size), orders); /* See comment in page_cache_ra_unbounded() */ nofs = memalloc_nofs_save();
@@ -516,9 +542,10 @@ void page_cache_ra_order(struct readahead_control *ractl,
 /* Align with smaller pages if needed */ if (index & ((1UL << order) - 1)) - order = __ffs(index); + order = select_new_order(order, __ffs(index), orders); /* Don't allocate pages past EOF */ - while (index + (1UL << order) - 1 > limit) + while (index + (1UL << order) - 1 > limit && + (BIT(order) & orders) == 0) order--; err = ra_alloc_folio(ractl, index, mark, order, gfp); if (err)
--
Gitee

From ae6393305df6bf21e4ec9491f3705c071adebfd9 Mon Sep 17 00:00:00 2001
From: Ryan Roberts
Date: Wed, 25 Dec 2024 15:56:21 +0800
Subject: [PATCH 3/7] mm: Introduce "always+exec" for mTHP file_enabled control

ANBZ: #9728

cherry-picked from: https://lore.kernel.org/lkml/20240717071257.4141363-1-ryan.roberts@arm.com/T/#md06a4a7a606cb90824f322fec868ee0d7620a876

In addition to `always` and `never`, add `always+exec` as an option for:

  /sys/kernel/mm/transparent_hugepage/hugepages-*kB/file_enabled

`always+exec` acts like `always` but additionally marks the hugepage size as
the preferred hugepage size for sections of any file mapped with execute
permission. A maximum of one hugepage size can be marked as `exec` at a time,
so applying it to a new size implicitly removes it from any size it was
previously set for.

Change readahead to use this flagged exec size; when a request is made for an
executable mapping, do a synchronous read of the size in a naturally aligned
manner.

On arm64, if memory is physically contiguous and naturally aligned to the
"contpte" size, we can use contpte mappings, which improves utilization of the
TLB. When paired with the "multi-size THP" changes, this works well to reduce
dTLB pressure. However, iTLB pressure is still high due to executable mappings
having a low likelihood of being in the required folio size and mapping
alignment, even when the filesystem supports readahead into large folios
(e.g. XFS). The reason for the low likelihood is that the current readahead
algorithm starts with an order-2 folio and increases the folio order by 2
every time the readahead mark is hit. But most executable memory is faulted in
fairly randomly and so the readahead mark is rarely hit and most executable
folios remain order-2.
This is observed empirically and confirmed from discussion with a GNU linker
expert; in general, the linker does nothing to group temporally accessed text
together spatially. Additionally, with the current read-around approach there
are no alignment guarantees between the file and folio. This is insufficient
for arm64's contpte mapping requirement (order-4 for 4K base pages).

So it seems reasonable to special-case the read(ahead) logic for executable
mappings. The trade-off is performance improvement (due to more efficient
storage of the translations in iTLB) vs potential read amplification (due to
reading too much data around the fault which won't be used), and the latter is
independent of base page size. Of course, if no hugepage size is marked as
`always+exec`, the old behaviour is maintained.

Performance Benchmarking
------------------------

The below shows kernel compilation and Speedometer JavaScript benchmarks on an
Ampere Altra arm64 system. When the patch is applied, `always+exec` is set for
64K folios.

First, confirmation that this patch causes more memory to be contained in 64K
folios (this is for all file-backed memory so includes non-executable too):

| File-backed folios       |   Speedometer   | Kernel Compile  |
| by size as percentage    |-----------------|-----------------|
| of all mapped file mem   | before | after  | before | after  |
|==========================|========|========|========|========|
| file-thp-aligned-16kB    |    45% |     9% |    46% |     7% |
| file-thp-aligned-32kB    |     2% |     0% |     3% |     1% |
| file-thp-aligned-64kB    |     3% |    63% |     5% |    80% |
| file-thp-aligned-128kB   |    11% |    11% |     0% |     0% |
| file-thp-unaligned-16kB  |     1% |     0% |     3% |     1% |
| file-thp-unaligned-128kB |     1% |     0% |     0% |     0% |
| file-thp-partial         |     0% |     0% |     0% |     0% |
|--------------------------|--------|--------|--------|--------|
| file-cont-aligned-64kB   |    16% |    75% |     5% |    80% |

The above shows that for both use cases, the amount of file memory backed by
16K folios reduces and the amount backed by 64K folios increases
significantly. And the amount of memory that is contpte-mapped significantly
increases (last line).

And this is reflected in performance improvement:

Kernel Compilation (smaller is faster):

| kernel | real-time | kern-time | user-time | peak memory |
|--------|-----------|-----------|-----------|-------------|
| before |      0.0% |      0.0% |      0.0% |        0.0% |
| after  |     -1.6% |     -2.1% |     -1.7% |        0.0% |

Speedometer (bigger is faster):

| kernel | runs_per_min | peak memory |
|--------|--------------|-------------|
| before |         0.0% |        0.0% |
| after  |         1.3% |        1.0% |

Both benchmarks show a ~1.5% improvement once the patch is applied.

Signed-off-by: Ryan Roberts
Signed-off-by: Baolin Wang
---
 Documentation/admin-guide/mm/transhuge.rst | 6 +++++
 include/linux/huge_mm.h | 11 ++++++++
 mm/filemap.c | 11 ++++++++
 mm/huge_memory.c | 31 +++++++++++++++++-----
 4 files changed, 52 insertions(+), 7 deletions(-)

diff --git a/Documentation/admin-guide/mm/transhuge.rst b/Documentation/admin-guide/mm/transhuge.rst
index f6b3aa8e2df1..25021a6ec9d5 100644
--- a/Documentation/admin-guide/mm/transhuge.rst
+++ b/Documentation/admin-guide/mm/transhuge.rst
@@ -291,12 +291,18 @@ memory from a set of allowed sizes.
 By default all THP sizes that the page cache supports are allowed, but this set can be modified with one of:: echo always >/sys/kernel/mm/transparent_hugepage/hugepages-<size>kB/file_enabled + echo always+exec >/sys/kernel/mm/transparent_hugepage/hugepages-<size>kB/file_enabled echo never >/sys/kernel/mm/transparent_hugepage/hugepages-<size>kB/file_enabled where <size> is the hugepage size being addressed, the available sizes for which vary by system. ``always`` adds the hugepage size to the set of allowed sizes, and ``never`` removes the hugepage size from the set of allowed sizes. +``always+exec`` acts like ``always`` but additionally marks the hugepage size as +the preferred hugepage size for sections of any file mapped executable. A +maximum of one hugepage size can be marked as ``exec`` at a time, so applying it +to a new size implicitly removes it from any size it was previously set for. + In some situations, constraining the allowed sizes can reduce memory fragmentation, resulting in fewer allocation fallbacks and improved system performance.
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index 11eaca1a5d90..9633047656e9 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -174,12 +174,18 @@ extern unsigned long huge_anon_orders_always;
 extern unsigned long huge_anon_orders_madvise; extern unsigned long huge_anon_orders_inherit; extern unsigned long huge_file_orders_always; +extern int huge_file_exec_order; static inline unsigned long file_orders_always(void) { return READ_ONCE(huge_file_orders_always); } +static inline int file_exec_order(void) +{ + return READ_ONCE(huge_file_exec_order); +} + static inline bool hugepage_global_enabled(void) { return transparent_hugepage_flags &
@@ -605,6 +611,11 @@ static inline unsigned long file_orders_always(void)
 { return 0; } + +static inline int file_exec_order(void) +{ + return -1; +} #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ static inline int split_folio_to_list(struct folio *folio,
diff --git a/mm/filemap.c b/mm/filemap.c
index e89c36f2ca06..c4b6715b1bd0 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -3214,6 +3214,7 @@ static struct file *do_sync_mmap_readahead(struct vm_fault *vmf)
 struct file *fpin = NULL; unsigned long vm_flags = vmf->vma->vm_flags; unsigned int mmap_miss; + int exec_order = file_exec_order(); #ifdef CONFIG_TRANSPARENT_HUGEPAGE /* Use the readahead code, even if readahead is disabled */
@@ -3233,6 +3234,16 @@ static struct file *do_sync_mmap_readahead(struct vm_fault *vmf)
 } #endif + /* If explicit order is set for exec mappings, use it.
*/ + if ((vm_flags & VM_EXEC) && exec_order >= 0) { + fpin = maybe_unlock_mmap_for_io(vmf, fpin); + ra->size = 1UL << exec_order; + ra->async_size = 0; + ractl._index &= ~((unsigned long)ra->size - 1); + page_cache_ra_order(&ractl, ra, exec_order); + return fpin; + } + /* If we don't want any read-ahead, don't bother */ if (vm_flags & VM_RAND_READ) return fpin;
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 902e33499660..cdfa964bfbbd 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -76,6 +76,7 @@ unsigned long huge_anon_orders_always __read_mostly;
 unsigned long huge_anon_orders_madvise __read_mostly; unsigned long huge_anon_orders_inherit __read_mostly; unsigned long huge_file_orders_always __read_mostly; +int huge_file_exec_order __read_mostly = -1; static bool anon_orders_configured __initdata; unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma,
@@ -443,6 +444,7 @@ static const struct attribute_group hugepage_attr_group = {
 static void hugepage_exit_sysfs(struct kobject *hugepage_kobj); static void thpsize_release(struct kobject *kobj); static DEFINE_SPINLOCK(huge_anon_orders_lock); +static DEFINE_SPINLOCK(huge_file_orders_lock); static LIST_HEAD(thpsize_list); static ssize_t anon_enabled_show(struct kobject *kobj,
@@ -512,11 +514,15 @@ static ssize_t file_enabled_show(struct kobject *kobj,
 { int order = to_thpsize(kobj)->order; const char *output; + bool exec; - if (test_bit(order, &huge_file_orders_always)) - output = "[always] never"; - else - output = "always [never]"; + if (test_bit(order, &huge_file_orders_always)) { + exec = READ_ONCE(huge_file_exec_order) == order; + output = exec ? "always [always+exec] never" : + "[always] always+exec never"; + } else { + output = "always always+exec [never]"; + } return sysfs_emit(buf, "%s\n", output); }
@@ -528,13 +534,24 @@ static ssize_t file_enabled_store(struct kobject *kobj,
 int order = to_thpsize(kobj)->order; ssize_t ret = count; - if (sysfs_streq(buf, "always")) + spin_lock(&huge_file_orders_lock); + + if (sysfs_streq(buf, "always")) { set_bit(order, &huge_file_orders_always); - else if (sysfs_streq(buf, "never")) + if (huge_file_exec_order == order) + huge_file_exec_order = -1; + } else if (sysfs_streq(buf, "always+exec")) { + set_bit(order, &huge_file_orders_always); + huge_file_exec_order = order; + } else if (sysfs_streq(buf, "never")) { clear_bit(order, &huge_file_orders_always); - else + if (huge_file_exec_order == order) + huge_file_exec_order = -1; + } else { ret = -EINVAL; + } + spin_unlock(&huge_file_orders_lock); return ret; }
--
Gitee

From d060030c014cbb5d358015641a4b9c9029491ae5 Mon Sep 17 00:00:00 2001
From: Ryan Roberts
Date: Wed, 25 Dec 2024 16:55:37 +0800
Subject: [PATCH 4/7] mm: Override mTHP "file_enabled" defaults at kernel cmdline

ANBZ: #9728

cherry-picked from: https://lore.kernel.org/lkml/20240717071257.4141363-1-ryan.roberts@arm.com/T/#mb70537979115e89c8398c6f2b3d3e70ec438c8d0

Add thp_file= cmdline parameter to allow specifying the default enablement of
each supported file-backed THP size. The parameter accepts the following
format and can be provided multiple times to configure each size:

  thp_file=<size>[KMG]:<policy>

See Documentation/admin-guide/mm/transhuge.rst for more details.

Configuring the defaults at boot time is often necessary because it's not
always possible to drop active executable pages from the page cache,
especially if they are well used, like libc. The command line parameter
allows configuring the values before the first page is installed in the page
cache.
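For example (sizes here are illustrative; which sizes are accepted depends on
the base page size and on THP_ORDERS_ALL_FILE_DEFAULT for the running kernel),
booting with:

  thp_file=64K:always+exec thp_file=128K:always

would allow 64K and 128K file-backed THP and mark 64K as the preferred size
for executable file mappings; since thp_file= was specified at least once, all
other sizes are implicitly left at `never`.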
Signed-off-by: Ryan Roberts
Signed-off-by: Baolin Wang
---
 .../admin-guide/kernel-parameters.txt | 8 ++++
 Documentation/admin-guide/mm/transhuge.rst | 13 ++++++
 mm/huge_memory.c | 45 ++++++++++++++++++-
 3 files changed, 65 insertions(+), 1 deletion(-)

diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 977b201f52b0..503a55b1d9a7 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -6436,6 +6436,14 @@
 See Documentation/admin-guide/mm/transhuge.rst for more details. + thp_file= [KNL] + Format: <size>[KMG]:always|always+exec|never + Can be used to control the default behavior of the + system with respect to file-backed transparent hugepages. + Can be used multiple times for multiple file-backed THP + sizes. See Documentation/admin-guide/mm/transhuge.rst + for more details. + threadirqs [KNL] Force threading of all interrupt handlers except those marked explicitly IRQF_NO_THREAD.
diff --git a/Documentation/admin-guide/mm/transhuge.rst b/Documentation/admin-guide/mm/transhuge.rst
index 25021a6ec9d5..9e4375981d91 100644
--- a/Documentation/admin-guide/mm/transhuge.rst
+++ b/Documentation/admin-guide/mm/transhuge.rst
@@ -365,6 +365,19 @@
 user, the PMD_ORDER hugepage policy will be overridden. If the policy for PMD_ORDER is not defined within a valid ``thp_shmem``, its policy will default to ``never``. +Each supported file-backed THP size can be controlled by passing +``thp_file=<size>[KMG]:<policy>``, where ``<size>`` is the THP size and +``<policy>`` is one of ``always``, ``always+exec`` or ``never``. + +For example, the following will set 64K THP to ``always+exec``:: + + thp_file=64K:always+exec + +``thp_file=`` may be specified multiple times to configure all THP sizes as +required. If ``thp_file=`` is specified at least once, any file-backed THP +sizes not explicitly configured on the command line are implicitly set to +``never``. + Hugepages in tmpfs/shmem ========================
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index cdfa964bfbbd..32d6b68c87ec 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -78,6 +78,7 @@ unsigned long huge_anon_orders_inherit __read_mostly;
 unsigned long huge_file_orders_always __read_mostly; int huge_file_exec_order __read_mostly = -1; static bool anon_orders_configured __initdata; +static bool file_orders_configured; unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma, unsigned long vm_flags, bool smaps,
@@ -783,7 +784,10 @@ static int __init hugepage_init_sysfs(struct kobject **hugepage_kobj)
 * (and therefore THP_ORDERS_ALL_FILE_DEFAULT) isn't a compile-time * constant so we have to do this here.
*/ - huge_file_orders_always = THP_ORDERS_ALL_FILE_DEFAULT; + if (!file_orders_configured) { + huge_file_orders_always = THP_ORDERS_ALL_FILE_DEFAULT; + file_orders_configured = true; + } *hugepage_kobj = kobject_create_and_add("transparent_hugepage", mm_kobj); if (unlikely(!*hugepage_kobj)) { @@ -1032,6 +1036,45 @@ static int __init setup_thp_anon(char *str) } __setup("thp_anon=", setup_thp_anon); +static int __init setup_thp_file(char *str) +{ + unsigned long size; + char *state; + int order; + int ret = 0; + + if (!str) + goto out; + + size = (unsigned long)memparse(str, &state); + order = ilog2(size >> PAGE_SHIFT); + if (*state != ':' || !is_power_of_2(size) || size <= PAGE_SIZE || + !(BIT(order) & THP_ORDERS_ALL_FILE_DEFAULT)) + goto out; + + state++; + + if (!strcmp(state, "always")) { + set_bit(order, &huge_file_orders_always); + ret = 1; + } else if (!strcmp(state, "always+exec")) { + set_bit(order, &huge_file_orders_always); + huge_file_exec_order = order; + ret = 1; + } else if (!strcmp(state, "never")) { + clear_bit(order, &huge_file_orders_always); + ret = 1; + } + + if (ret) + file_orders_configured = true; +out: + if (!ret) + pr_warn("thp_file=%s: cannot parse, ignored\n", str); + return ret; +} +__setup("thp_file=", setup_thp_file); + pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma) { if (likely(vma->vm_flags & VM_WRITE)) -- Gitee From a84939ce46f4eb83e1e21e15efe39c3972660090 Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Thu, 26 Dec 2024 15:49:48 +0800 Subject: [PATCH 5/7] anolis: mm: optimize the 'thp_file' cmdline format MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ANBZ: #9728 Similar to the ‘thp_anon’ parameter, change the 'thp_file' to support the setting of policies with multiple sizes. 
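A hypothetical invocation under the extended format (sizes are illustrative
and, assuming a 4K base page size, must fall within the file THP sizes the
kernel supports) could look like:

  thp_file=16K,32K:always;64K:always+exec;128K-2M:never

i.e. semicolon-separated <sizes>:<policy> groups, where <sizes> is a
comma-separated list of sizes or <start>-<end> ranges and <policy> is
`always`, `always+exec` or `never`; `always+exec` must name a single size
rather than a range.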
Signed-off-by: Baolin Wang --- mm/huge_memory.c | 98 ++++++++++++++++++++++++++++++++++-------------- 1 file changed, 70 insertions(+), 28 deletions(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 32d6b68c87ec..bf770b231664 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1038,40 +1038,82 @@ __setup("thp_anon=", setup_thp_anon); static int __init setup_thp_file(char *str) { - unsigned long size; - char *state; - int order; - int ret = 0; + char *token, *range, *policy, *subtoken; + unsigned long always; + char *start_size, *end_size; + int start, end, nr, exec; + char *p; - if (!str) - goto out; + if (!str || strlen(str) + 1 > PAGE_SIZE) + goto err; + strcpy(str_dup, str); - size = (unsigned long)memparse(str, &state); - order = ilog2(size >> PAGE_SHIFT); - if (*state != ':' || !is_power_of_2(size) || size <= PAGE_SIZE || - !(BIT(order) & THP_ORDERS_ALL_FILE_DEFAULT)) - goto out; + always = huge_file_orders_always; + exec = huge_file_exec_order; + p = str_dup; + while ((token = strsep(&p, ";")) != NULL) { + range = strsep(&token, ":"); + policy = token; + + if (!policy) + goto err; - state++; + while ((subtoken = strsep(&range, ",")) != NULL) { + if (strchr(subtoken, '-')) { + start_size = strsep(&subtoken, "-"); + end_size = subtoken; - if (!strcmp(state, "always")) { - set_bit(order, &huge_file_orders_always); - ret = 1; - } else if (!strcmp(state, "always+exec")) { - set_bit(order, &huge_file_orders_always); - huge_file_exec_order = order; - ret = 1; - } else if (!strcmp(state, "never")) { - clear_bit(order, &huge_file_orders_always); - ret = 1; + start = get_order_from_str(start_size, + THP_ORDERS_ALL_FILE_DEFAULT); + end = get_order_from_str(end_size, + THP_ORDERS_ALL_FILE_DEFAULT); + } else { + start_size = end_size = subtoken; + start = end = get_order_from_str(subtoken, + THP_ORDERS_ALL_FILE_DEFAULT); + } + + if (start == -EINVAL) { + pr_err("invalid size %s in thp_shmem boot parameter\n", + start_size); + goto err; + } + + if (end == -EINVAL) { + pr_err("invalid size %s in thp_shmem boot parameter\n", + end_size); + goto err; + } + + if (start < 0 || end < 0 || start > end) + goto err; + + nr = end - start + 1; + if (!strcmp(policy, "always")) { + bitmap_set(&always, start, nr); + } else if (!strcmp(policy, "always+exec")) { + if (nr != 1) + goto err; + bitmap_set(&always, start, nr); + exec = start; + } else if (!strcmp(policy, "never")) { + bitmap_clear(&always, start, nr); + if (exec != -1 && !test_bit(exec, &always)) + exec = -1; + } else { + pr_err("invalid policy %s in thp_file boot parameter\n", policy); + goto err; + } + } } - if (ret) - file_orders_configured = true; -out: - if (!ret) - pr_warn("thp_file=%s: cannot parse, ignored\n", str); - return ret; + huge_file_orders_always = always; + huge_file_exec_order = exec; + file_orders_configured = true; + return 1; +err: + pr_warn("thp_file=%s: cannot parse, ignored\n", str); + return 0; } __setup("thp_file=", setup_thp_file); -- Gitee From a8616833240f422a572f142ff85f93fef897af10 Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Wed, 25 Dec 2024 17:13:51 +0800 Subject: [PATCH 6/7] anolis: mm: add mTHP counters for file folios ANBZ: #9728 Add mTHP counters for file folios. 
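As a usage sketch (hugepages-64kB is just an example directory; one exists per
supported mTHP size), the new counter can be read from the per-size stats
directory:

  cat /sys/kernel/mm/transparent_hugepage/hugepages-64kB/stats/file_alloc

which reports how many file-backed folios of that size have been successfully
allocated since boot.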
Signed-off-by: Baolin Wang
---
 Documentation/admin-guide/mm/transhuge.rst | 4 ++++
 include/linux/huge_mm.h | 1 +
 mm/filemap.c | 8 +++++++-
 mm/huge_memory.c | 2 ++
 4 files changed, 14 insertions(+), 1 deletion(-)

diff --git a/Documentation/admin-guide/mm/transhuge.rst b/Documentation/admin-guide/mm/transhuge.rst
index 9e4375981d91..a44131c4765e 100644
--- a/Documentation/admin-guide/mm/transhuge.rst
+++ b/Documentation/admin-guide/mm/transhuge.rst
@@ -625,6 +625,10 @@ nr_anon_partially_mapped
 an anonymous THP as "partially mapped" and count it here, even though it is not actually partially mapped anymore. +file_alloc + is incremented every time a file huge page is successfully + allocated. + As the system ages, allocating huge pages may be expensive as the system uses memory compaction to copy data around memory to free a huge page for use. There are some counters in ``/proc/vmstat`` to help
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index 9633047656e9..5a7100db2956 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -127,6 +127,7 @@ enum mthp_stat_item {
 MTHP_STAT_SPLIT_DEFERRED, MTHP_STAT_NR_ANON, MTHP_STAT_NR_ANON_PARTIALLY_MAPPED, + MTHP_STAT_FILE_ALLOC, __MTHP_STAT_COUNT };
diff --git a/mm/filemap.c b/mm/filemap.c
index c4b6715b1bd0..f53e5732083b 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1003,9 +1003,15 @@ struct folio *filemap_alloc_folio(gfp_t gfp, unsigned int order)
 folio = __folio_alloc_node(gfp, order, n); } while (!folio && read_mems_allowed_retry(cpuset_mems_cookie)); + if (folio) + count_mthp_stat(order, MTHP_STAT_FILE_ALLOC); return folio; } - return folio_alloc(gfp, order); + + folio = folio_alloc(gfp, order); + if (folio) + count_mthp_stat(order, MTHP_STAT_FILE_ALLOC); + return folio; } EXPORT_SYMBOL(filemap_alloc_folio); #endif
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index bf770b231664..7611126e04a1 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -637,6 +637,7 @@ DEFINE_MTHP_STAT_ATTR(split_failed, MTHP_STAT_SPLIT_FAILED);
 DEFINE_MTHP_STAT_ATTR(split_deferred, MTHP_STAT_SPLIT_DEFERRED); DEFINE_MTHP_STAT_ATTR(nr_anon, MTHP_STAT_NR_ANON); DEFINE_MTHP_STAT_ATTR(nr_anon_partially_mapped, MTHP_STAT_NR_ANON_PARTIALLY_MAPPED); +DEFINE_MTHP_STAT_ATTR(file_alloc, MTHP_STAT_FILE_ALLOC); static struct attribute *anon_stats_attrs[] = { &anon_fault_alloc_attr.attr,
@@ -658,6 +659,7 @@ static struct attribute_group anon_stats_attr_grp = {
 }; static struct attribute *file_stats_attrs[] = { + &file_alloc_attr.attr, #ifdef CONFIG_SHMEM &shmem_alloc_attr.attr, &shmem_fallback_attr.attr,
--
Gitee

From 86f8f91c0f6da00297d8dea04640bdffa9e397ad Mon Sep 17 00:00:00 2001
From: Rongwei Wang
Date: Mon, 30 Dec 2024 14:21:39 +0800
Subject: [PATCH 7/7] anolis: mm, thp: hugetext: make PIC binary mapping address THP align

ANBZ: #9728

This patch mainly makes the mmap address of a PIC binary aligned with
HPAGE_PMD_SIZE. Otherwise, an ELF binary generated with the -fPIC compile
option cannot use hugepages, because the mapping address is randomly selected
by the kernel.

Note: Baolin Wang changed the code to make it suitable for file mTHP.
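As a worked illustration (the numbers assume a 4K base page size and a 64K
exec size, i.e. exec_order == 4, which is an assumption for this example):
PAGE_SIZE << exec_order is 0x10000, so the new statement clears the low 16
bits of load_bias, and the interpreter-loaded PIC binary is then mapped at a
64K-aligned address. File offsets and virtual addresses stay congruent modulo
64K, which is what lets naturally aligned 64K pagecache folios be mapped with
the required alignment.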
Signed-off-by: Rongwei Wang Signed-off-by: Baolin Wang --- fs/binfmt_elf.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index fb2c8d14327a..9016f46f98ab 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -1037,6 +1037,7 @@ static int load_elf_binary(struct linux_binprm *bprm) unsigned long k, vaddr; unsigned long total_size = 0; unsigned long alignment; + int exec_order = file_exec_order(); if (elf_ppnt->p_type != PT_LOAD) continue; @@ -1161,6 +1162,10 @@ static int load_elf_binary(struct linux_binprm *bprm) retval = -EINVAL; goto out_free_dentry; } + + if (exec_order > 0 && interpreter && + total_size >= (PAGE_SIZE << exec_order)) + load_bias &= ~((PAGE_SIZE << exec_order) - 1); } error = elf_map(bprm->file, load_bias + vaddr, elf_ppnt, -- Gitee