diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 977b201f52b0be83c3ab4dd4648b2a7ed01aad0c..503a55b1d9a7e0987c19a2c6114e3e14fa8e1a0e 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -6436,6 +6436,14 @@ See Documentation/admin-guide/mm/transhuge.rst for more details. + thp_file= [KNL] + Format: <size>[KMG]:always|always+exec|never + Can be used to control the default behavior of the + system with respect to file-backed transparent hugepages. + Can be used multiple times for multiple file-backed THP + sizes. See Documentation/admin-guide/mm/transhuge.rst + for more details. + threadirqs [KNL] Force threading of all interrupt handlers except those marked explicitly IRQF_NO_THREAD. diff --git a/Documentation/admin-guide/mm/transhuge.rst b/Documentation/admin-guide/mm/transhuge.rst index c05cbfe1318c7360bb1ac20b7ea40b010ebba89e..a44131c4765e0f3ddf74dcfd1fd9e7c4b7e8f5a8 100644 --- a/Documentation/admin-guide/mm/transhuge.rst +++ b/Documentation/admin-guide/mm/transhuge.rst @@ -283,6 +283,33 @@ processes. Exceeding the number would block the collapse:: A higher value may increase memory footprint for some workloads. +File-Backed Hugepages +--------------------- + +The kernel will automatically select an appropriate THP size for file-backed +memory from a set of allowed sizes. By default all THP sizes that the page cache +supports are allowed, but this set can be modified with one of:: + + echo always >/sys/kernel/mm/transparent_hugepage/hugepages-<size>kB/file_enabled + echo always+exec >/sys/kernel/mm/transparent_hugepage/hugepages-<size>kB/file_enabled + echo never >/sys/kernel/mm/transparent_hugepage/hugepages-<size>kB/file_enabled + +where <size> is the hugepage size being addressed, the available sizes for which +vary by system. ``always`` adds the hugepage size to the set of allowed sizes, +and ``never`` removes the hugepage size from the set of allowed sizes. + +``always+exec`` acts like ``always`` but additionally marks the hugepage size as +the preferred hugepage size for sections of any file that are mapped executable. A +maximum of one hugepage size can be marked as ``exec`` at a time, so applying it +to a new size implicitly removes it from any size it was previously set for. + +In some situations, constraining the allowed sizes can reduce memory +fragmentation, resulting in fewer allocation fallbacks and improved system +performance. + +Note that any changes to the allowed set of sizes apply only to future +file-backed THP allocations. + Boot parameters =============== @@ -338,6 +365,19 @@ user, the PMD_ORDER hugepage policy will be overridden. If the policy for PMD_ORDER is not defined within a valid ``thp_shmem``, its policy will default to ``never``. +Each supported file-backed THP size can be controlled by passing +``thp_file=<size>[KMG]:<policy>``, where ``<size>`` is the THP size and +``<policy>`` is one of ``always``, ``always+exec`` or ``never``. + +For example, the following will set 64K THP to ``always+exec``:: + + thp_file=64K:always+exec + +``thp_file=`` may be specified multiple times to configure all THP sizes as +required. If ``thp_file=`` is specified at least once, any file-backed THP +sizes not explicitly configured on the command line are implicitly set to +``never``. + Hugepages in tmpfs/shmem ======================== @@ -585,6 +625,10 @@ nr_anon_partially_mapped an anonymous THP as "partially mapped" and count it here, even though it is not actually partially mapped anymore. 
+file_alloc + is incremented every time a file huge page is successfully + allocated. + As the system ages, allocating huge pages may be expensive as the system uses memory compaction to copy data around memory to free a huge page for use. There are some counters in ``/proc/vmstat`` to help diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index fb2c8d14327ae160d0ac97b49010a7e889671363..9016f46f98ab23a1fc7b89369f2311650011ac46 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -1037,6 +1037,7 @@ static int load_elf_binary(struct linux_binprm *bprm) unsigned long k, vaddr; unsigned long total_size = 0; unsigned long alignment; + int exec_order = file_exec_order(); if (elf_ppnt->p_type != PT_LOAD) continue; @@ -1161,6 +1162,10 @@ static int load_elf_binary(struct linux_binprm *bprm) retval = -EINVAL; goto out_free_dentry; } + + if (exec_order > 0 && interpreter && + total_size >= (PAGE_SIZE << exec_order)) + load_bias &= ~((PAGE_SIZE << exec_order) - 1); } error = elf_map(bprm->file, load_bias + vaddr, elf_ppnt, diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index f6c139d2edf9769957c6354214f047ae14a9878a..5a7100db2956927bd8b4a8aa93a8a609a9da79e9 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -95,6 +95,24 @@ extern struct kobj_attribute thpsize_shmem_enabled_attr; #define thp_vma_allowable_order(vma, vm_flags, smaps, in_pf, enforce_sysfs, order) \ (!!thp_vma_allowable_orders(vma, vm_flags, smaps, in_pf, enforce_sysfs, BIT(order))) +static inline int lowest_order(unsigned long orders) +{ + if (orders) + return __ffs(orders); + return -1; +} + +static inline int highest_order(unsigned long orders) +{ + return fls_long(orders) - 1; +} + +static inline int next_order(unsigned long *orders, int prev) +{ + *orders &= ~BIT(prev); + return highest_order(*orders); +} + enum mthp_stat_item { MTHP_STAT_ANON_FAULT_ALLOC, MTHP_STAT_ANON_FAULT_FALLBACK, @@ -109,6 +127,7 @@ enum mthp_stat_item { MTHP_STAT_SPLIT_DEFERRED, MTHP_STAT_NR_ANON, MTHP_STAT_NR_ANON_PARTIALLY_MAPPED, + MTHP_STAT_FILE_ALLOC, __MTHP_STAT_COUNT }; @@ -155,6 +174,18 @@ extern unsigned long transparent_hugepage_flags; extern unsigned long huge_anon_orders_always; extern unsigned long huge_anon_orders_madvise; extern unsigned long huge_anon_orders_inherit; +extern unsigned long huge_file_orders_always; +extern int huge_file_exec_order; + +static inline unsigned long file_orders_always(void) +{ + return READ_ONCE(huge_file_orders_always); +} + +static inline int file_exec_order(void) +{ + return READ_ONCE(huge_file_exec_order); +} static inline bool hugepage_global_enabled(void) { @@ -169,17 +200,6 @@ static inline bool hugepage_global_always(void) (1< MAX_PAGECACHE_ORDER) - order = MAX_PAGECACHE_ORDER; + + orders = file_orders_always() | BIT(0); + orders &= BIT(order + 1) - 1; /* If we're not aligned, allocate a smaller folio */ if (index & ((1UL << order) - 1)) - order = __ffs(index); + orders &= BIT(__ffs(index) + 1) - 1; + order = highest_order(orders); - do { + while (orders) { gfp_t alloc_gfp = gfp; err = -ENOMEM; @@ -2000,7 +2009,9 @@ struct folio *__filemap_get_folio(struct address_space *mapping, pgoff_t index, break; folio_put(folio); folio = NULL; - } while (order-- > 0); + + order = next_order(&orders, order); + }; if (err == -EEXIST) goto repeat; @@ -3209,6 +3220,7 @@ static struct file *do_sync_mmap_readahead(struct vm_fault *vmf) struct file *fpin = NULL; unsigned long vm_flags = vmf->vma->vm_flags; unsigned int mmap_miss; + int exec_order = file_exec_order(); #ifdef 
CONFIG_TRANSPARENT_HUGEPAGE /* Use the readahead code, even if readahead is disabled */ @@ -3228,6 +3240,16 @@ static struct file *do_sync_mmap_readahead(struct vm_fault *vmf) } #endif + /* If explicit order is set for exec mappings, use it. */ + if ((vm_flags & VM_EXEC) && exec_order >= 0) { + fpin = maybe_unlock_mmap_for_io(vmf, fpin); + ra->size = 1UL << exec_order; + ra->async_size = 0; + ractl._index &= ~((unsigned long)ra->size - 1); + page_cache_ra_order(&ractl, ra, exec_order); + return fpin; + } + /* If we don't want any read-ahead, don't bother */ if (vm_flags & VM_RAND_READ) return fpin; diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 61f18d58880602b4ae49f9e468ef855980d43e5e..7611126e04a1c62ecbf88a21199baf9bb1665f72 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -75,7 +75,10 @@ unsigned long huge_zero_pfn __read_mostly = ~0UL; unsigned long huge_anon_orders_always __read_mostly; unsigned long huge_anon_orders_madvise __read_mostly; unsigned long huge_anon_orders_inherit __read_mostly; +unsigned long huge_file_orders_always __read_mostly; +int huge_file_exec_order __read_mostly = -1; static bool anon_orders_configured __initdata; +static bool file_orders_configured; unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma, unsigned long vm_flags, bool smaps, @@ -442,6 +445,7 @@ static const struct attribute_group hugepage_attr_group = { static void hugepage_exit_sysfs(struct kobject *hugepage_kobj); static void thpsize_release(struct kobject *kobj); static DEFINE_SPINLOCK(huge_anon_orders_lock); +static DEFINE_SPINLOCK(huge_file_orders_lock); static LIST_HEAD(thpsize_list); static ssize_t anon_enabled_show(struct kobject *kobj, @@ -506,6 +510,52 @@ static ssize_t anon_enabled_store(struct kobject *kobj, return ret; } +static ssize_t file_enabled_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + int order = to_thpsize(kobj)->order; + const char *output; + bool exec; + + if (test_bit(order, &huge_file_orders_always)) { + exec = READ_ONCE(huge_file_exec_order) == order; + output = exec ? 
"always [always+exec] never" : + "[always] always+exec never"; + } else { + output = "always always+exec [never]"; + } + + return sysfs_emit(buf, "%s\n", output); +} + +static ssize_t file_enabled_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + int order = to_thpsize(kobj)->order; + ssize_t ret = count; + + spin_lock(&huge_file_orders_lock); + + if (sysfs_streq(buf, "always")) { + set_bit(order, &huge_file_orders_always); + if (huge_file_exec_order == order) + huge_file_exec_order = -1; + } else if (sysfs_streq(buf, "always+exec")) { + set_bit(order, &huge_file_orders_always); + huge_file_exec_order = order; + } else if (sysfs_streq(buf, "never")) { + clear_bit(order, &huge_file_orders_always); + if (huge_file_exec_order == order) + huge_file_exec_order = -1; + } else { + ret = -EINVAL; + } + + spin_unlock(&huge_file_orders_lock); + return ret; +} + static struct kobj_attribute anon_enabled_attr = __ATTR(enabled, 0644, anon_enabled_show, anon_enabled_store); @@ -518,7 +568,11 @@ static const struct attribute_group anon_ctrl_attr_grp = { .attrs = anon_ctrl_attrs, }; +static struct kobj_attribute file_enabled_attr = + __ATTR(file_enabled, 0644, file_enabled_show, file_enabled_store); + static struct attribute *file_ctrl_attrs[] = { + &file_enabled_attr.attr, #ifdef CONFIG_SHMEM &thpsize_shmem_enabled_attr.attr, #endif @@ -583,6 +637,7 @@ DEFINE_MTHP_STAT_ATTR(split_failed, MTHP_STAT_SPLIT_FAILED); DEFINE_MTHP_STAT_ATTR(split_deferred, MTHP_STAT_SPLIT_DEFERRED); DEFINE_MTHP_STAT_ATTR(nr_anon, MTHP_STAT_NR_ANON); DEFINE_MTHP_STAT_ATTR(nr_anon_partially_mapped, MTHP_STAT_NR_ANON_PARTIALLY_MAPPED); +DEFINE_MTHP_STAT_ATTR(file_alloc, MTHP_STAT_FILE_ALLOC); static struct attribute *anon_stats_attrs[] = { &anon_fault_alloc_attr.attr, @@ -604,6 +659,7 @@ static struct attribute_group anon_stats_attr_grp = { }; static struct attribute *file_stats_attrs[] = { + &file_alloc_attr.attr, #ifdef CONFIG_SHMEM &shmem_alloc_attr.attr, &shmem_fallback_attr.attr, @@ -725,6 +781,16 @@ static int __init hugepage_init_sysfs(struct kobject **hugepage_kobj) if (!anon_orders_configured) huge_anon_orders_inherit = BIT(PMD_ORDER); + /* + * For pagecache, default to enabling all orders. powerpc's PMD_ORDER + * (and therefore THP_ORDERS_ALL_FILE_DEFAULT) isn't a compile-time + * constant so we have to do this here. 
+ */ + if (!file_orders_configured) { + huge_file_orders_always = THP_ORDERS_ALL_FILE_DEFAULT; + file_orders_configured = true; + } + *hugepage_kobj = kobject_create_and_add("transparent_hugepage", mm_kobj); if (unlikely(!*hugepage_kobj)) { pr_err("failed to create transparent hugepage kobject\n"); @@ -972,6 +1038,87 @@ static int __init setup_thp_anon(char *str) } __setup("thp_anon=", setup_thp_anon); +static int __init setup_thp_file(char *str) +{ + char *token, *range, *policy, *subtoken; + unsigned long always; + char *start_size, *end_size; + int start, end, nr, exec; + char *p; + + if (!str || strlen(str) + 1 > PAGE_SIZE) + goto err; + strcpy(str_dup, str); + + always = huge_file_orders_always; + exec = huge_file_exec_order; + p = str_dup; + while ((token = strsep(&p, ";")) != NULL) { + range = strsep(&token, ":"); + policy = token; + + if (!policy) + goto err; + + while ((subtoken = strsep(&range, ",")) != NULL) { + if (strchr(subtoken, '-')) { + start_size = strsep(&subtoken, "-"); + end_size = subtoken; + + start = get_order_from_str(start_size, + THP_ORDERS_ALL_FILE_DEFAULT); + end = get_order_from_str(end_size, + THP_ORDERS_ALL_FILE_DEFAULT); + } else { + start_size = end_size = subtoken; + start = end = get_order_from_str(subtoken, + THP_ORDERS_ALL_FILE_DEFAULT); + } + + if (start == -EINVAL) { + pr_err("invalid size %s in thp_file boot parameter\n", + start_size); + goto err; + } + + if (end == -EINVAL) { + pr_err("invalid size %s in thp_file boot parameter\n", + end_size); + goto err; + } + + if (start < 0 || end < 0 || start > end) + goto err; + + nr = end - start + 1; + if (!strcmp(policy, "always")) { + bitmap_set(&always, start, nr); + } else if (!strcmp(policy, "always+exec")) { + if (nr != 1) + goto err; + bitmap_set(&always, start, nr); + exec = start; + } else if (!strcmp(policy, "never")) { + bitmap_clear(&always, start, nr); + if (exec != -1 && !test_bit(exec, &always)) + exec = -1; + } else { + pr_err("invalid policy %s in thp_file boot parameter\n", policy); + goto err; + } + } + } + + huge_file_orders_always = always; + huge_file_exec_order = exec; + file_orders_configured = true; + return 1; +err: + pr_warn("thp_file=%s: cannot parse, ignored\n", str); + return 0; +} +__setup("thp_file=", setup_thp_file); + pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma) { if (likely(vma->vm_flags & VM_WRITE)) diff --git a/mm/readahead.c b/mm/readahead.c index fb5281d0c9acdd0353f0e9b643dcf4d26843471c..5d300fbd1eba6bef97327a5d339adc087054bc9b 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -486,6 +486,34 @@ static inline int ra_alloc_folio(struct readahead_control *ractl, pgoff_t index, return 0; } +static int select_new_order(int old_order, int max_order, unsigned long orders) +{ + unsigned long hi_orders, lo_orders; + + /* + * Select the next order to use from the set in `orders`, while ensuring + * we don't go above max_order. Prefer the second-lowest allowed order + * above old_order (i.e. skip one step) so that readahead ramps up to + * large folios quickly. If only one allowed order is above old_order, + * return that one; if none are, return the highest allowed order less + * than or equal to old_order.
+ */ + + orders &= BIT(max_order + 1) - 1; + VM_WARN_ON(!orders); + hi_orders = orders & ~(BIT(old_order + 1) - 1); + + if (hi_orders) { + old_order = lowest_order(hi_orders); + hi_orders &= ~BIT(old_order); + if (hi_orders) + return lowest_order(hi_orders); + } + + lo_orders = orders & (BIT(old_order + 1) - 1); + return highest_order(lo_orders); +} + void page_cache_ra_order(struct readahead_control *ractl, struct file_ra_state *ra, unsigned int new_order) { @@ -496,19 +524,15 @@ void page_cache_ra_order(struct readahead_control *ractl, unsigned int nofs; int err = 0; gfp_t gfp = readahead_gfp_mask(mapping); + unsigned long orders; - if (!mapping_large_folio_support(mapping) || ra->size < 4) + if (!mapping_large_folio_support(mapping)) goto fallback; limit = min(limit, index + ra->size - 1); - if (new_order < MAX_PAGECACHE_ORDER) - new_order += 2; - - if (new_order > MAX_PAGECACHE_ORDER) - new_order = MAX_PAGECACHE_ORDER; - while ((1 << new_order) > ra->size) - new_order--; + orders = file_orders_always() | BIT(0); + new_order = select_new_order(new_order, ilog2(ra->size), orders); /* See comment in page_cache_ra_unbounded() */ nofs = memalloc_nofs_save(); @@ -518,9 +542,10 @@ void page_cache_ra_order(struct readahead_control *ractl, /* Align with smaller pages if needed */ if (index & ((1UL << order) - 1)) - order = __ffs(index); + order = select_new_order(order, __ffs(index), orders); /* Don't allocate pages past EOF */ - while (index + (1UL << order) - 1 > limit) + while (index + (1UL << order) - 1 > limit && + (BIT(order) & orders) == 0) order--; err = ra_alloc_folio(ractl, index, mark, order, gfp); if (err)
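
For reviewers who want to experiment with the order-selection policy outside the kernel, here is a minimal userspace sketch (not part of the patch) of the select_new_order() ramp-up logic added to mm/readahead.c above. BIT(), lowest_order() and highest_order() are re-implemented with compiler builtins (assuming a 64-bit unsigned long), the kernel's VM_WARN_ON() is omitted, and the allowed-order set used in main() (orders 0, 2 and 4, i.e. 4K/16K/64K folios with a 4K base page) is purely hypothetical.

/*
 * Standalone userspace model of select_new_order() from the patch above.
 * BIT()/lowest_order()/highest_order() are re-implemented for illustration;
 * assumes a 64-bit unsigned long and omits the kernel's VM_WARN_ON().
 */
#include <stdio.h>

#define BIT(n)	(1UL << (n))

static int lowest_order(unsigned long orders)
{
	return orders ? __builtin_ctzl(orders) : -1;
}

static int highest_order(unsigned long orders)
{
	return orders ? 63 - __builtin_clzl(orders) : -1;
}

/* Mirror of the patch's ramp-up policy: skip one allowed order upwards. */
static int select_new_order(int old_order, int max_order, unsigned long orders)
{
	unsigned long hi_orders, lo_orders;

	orders &= BIT(max_order + 1) - 1;
	hi_orders = orders & ~(BIT(old_order + 1) - 1);
	if (hi_orders) {
		old_order = lowest_order(hi_orders);
		hi_orders &= ~BIT(old_order);
		if (hi_orders)
			return lowest_order(hi_orders);
	}
	lo_orders = orders & (BIT(old_order + 1) - 1);
	return highest_order(lo_orders);
}

int main(void)
{
	/* Hypothetical allowed set: orders 0, 2 and 4 (4K, 16K, 64K folios). */
	unsigned long orders = BIT(0) | BIT(2) | BIT(4);
	int order = 0;

	/* Model successive readahead rounds capped at order 4. */
	for (int round = 0; round < 3; round++) {
		order = select_new_order(order, 4, orders);
		printf("round %d -> order %d\n", round, order);
	}
	return 0;
}

With that set and an initial order of 0, the first call returns order 4, skipping the intermediate allowed order 2 as described in the comment; subsequent calls stay at the max_order cap of 4.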