diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 639eb70634f4443b7c60cc642ac21ac3f0ff3d7b..900ed13ba87d897b3527e6f01eb8cde6c31cc2d8 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -869,8 +869,8 @@ static int show_smap(struct seq_file *m, void *v)
         __show_smap(m, &mss, false);
 
         seq_printf(m, "THPeligible: %8u\n",
-                   !!thp_vma_allowable_orders(vma, vma->vm_flags, true, false,
-                                              true, THP_ORDERS_ALL));
+                   !!thp_vma_allowable_orders(vma, vma->vm_flags,
+                           TVA_SMAPS | TVA_ENFORCE_SYSFS, THP_ORDERS_ALL));
 
         if (arch_pkeys_enabled())
                 seq_printf(m, "ProtectionKey: %8u\n", vma_pkey(vma));
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index 58ce4efb2019c4864df4f7605c59c079bf89f414..18f7cfb7fca4b0626fb4fce5d577cb349bbfd06e 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -89,8 +89,12 @@ extern struct kobj_attribute shmem_enabled_attr;
  */
 #define THP_ORDERS_ALL  (THP_ORDERS_ALL_ANON | THP_ORDERS_ALL_FILE)
 
-#define thp_vma_allowable_order(vma, vm_flags, smaps, in_pf, enforce_sysfs, order) \
-        (!!thp_vma_allowable_orders(vma, vm_flags, smaps, in_pf, enforce_sysfs, BIT(order)))
+#define TVA_SMAPS               (1 << 0)        /* Will be used for procfs */
+#define TVA_IN_PF               (1 << 1)        /* Page fault handler */
+#define TVA_ENFORCE_SYSFS       (1 << 2)        /* Obey sysfs configuration */
+
+#define thp_vma_allowable_order(vma, vm_flags, tva_flags, order) \
+        (!!thp_vma_allowable_orders(vma, vm_flags, tva_flags, BIT(order)))
 
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 #define HPAGE_PMD_SHIFT PMD_SHIFT
@@ -216,17 +220,15 @@ static inline bool file_thp_enabled(struct vm_area_struct *vma)
 }
 
 unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma,
-                                         unsigned long vm_flags, bool smaps,
-                                         bool in_pf, bool enforce_sysfs,
+                                         unsigned long vm_flags,
+                                         unsigned long tva_flags,
                                          unsigned long orders);
 
 /**
  * thp_vma_allowable_orders - determine hugepage orders that are allowed for vma
  * @vma: the vm area to check
  * @vm_flags: use these vm_flags instead of vma->vm_flags
- * @smaps: whether answer will be used for smaps file
- * @in_pf: whether answer will be used by page fault handler
- * @enforce_sysfs: whether sysfs config should be taken into account
+ * @tva_flags: Which TVA flags to honour
  * @orders: bitfield of all orders to consider
  *
  * Calculates the intersection of the requested hugepage orders and the allowed
@@ -239,12 +241,12 @@ unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma,
  */
 static inline
 unsigned long thp_vma_allowable_orders(struct vm_area_struct *vma,
-                                       unsigned long vm_flags, bool smaps,
-                                       bool in_pf, bool enforce_sysfs,
+                                       unsigned long vm_flags,
+                                       unsigned long tva_flags,
                                        unsigned long orders)
 {
         /* Optimization to check if required orders are enabled early. */
-        if (enforce_sysfs && vma_is_anonymous(vma)) {
+        if ((tva_flags & TVA_ENFORCE_SYSFS) && vma_is_anonymous(vma)) {
                 unsigned long mask = READ_ONCE(huge_anon_orders_always);
 
                 if (vm_flags & VM_HUGEPAGE)
@@ -258,8 +260,7 @@ unsigned long thp_vma_allowable_orders(struct vm_area_struct *vma,
                         return 0;
         }
 
-        return __thp_vma_allowable_orders(vma, vm_flags, smaps, in_pf,
-                                          enforce_sysfs, orders);
+        return __thp_vma_allowable_orders(vma, vm_flags, tva_flags, orders);
 }
 
 enum mthp_stat_item {
@@ -437,8 +438,8 @@ static inline unsigned long thp_vma_suitable_orders(struct vm_area_struct *vma,
 }
 
 static inline unsigned long thp_vma_allowable_orders(struct vm_area_struct *vma,
-                                        unsigned long vm_flags, bool smaps,
-                                        bool in_pf, bool enforce_sysfs,
+                                        unsigned long vm_flags,
+                                        unsigned long tva_flags,
                                         unsigned long orders)
 {
         return 0;
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index b1eda738509c67b29bcf50ec218a861b1160458d..eddb7984610de624f9cb031dd265692eaecfff80 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -77,10 +77,13 @@ unsigned long huge_anon_orders_inherit __read_mostly;
 unsigned long huge_pcp_allow_orders __read_mostly;
 
 unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma,
-                                         unsigned long vm_flags, bool smaps,
-                                         bool in_pf, bool enforce_sysfs,
+                                         unsigned long vm_flags,
+                                         unsigned long tva_flags,
                                          unsigned long orders)
 {
+        bool smaps = tva_flags & TVA_SMAPS;
+        bool in_pf = tva_flags & TVA_IN_PF;
+        bool enforce_sysfs = tva_flags & TVA_ENFORCE_SYSFS;
         /* Check the intersection of requested and supported orders. */
         orders &= vma_is_anonymous(vma) ?
                         THP_ORDERS_ALL_ANON : THP_ORDERS_ALL_FILE;
@@ -3155,6 +3158,7 @@ int split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
         XA_STATE_ORDER(xas, &folio->mapping->i_pages, folio->index, new_order);
         struct anon_vma *anon_vma = NULL;
         struct address_space *mapping = NULL;
+        bool is_thp = folio_test_pmd_mappable(folio);
         int extra_pins, ret;
         pgoff_t end;
         bool is_hzp;
@@ -3333,7 +3337,8 @@ int split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
                 i_mmap_unlock_read(mapping);
 out:
         xas_destroy(&xas);
-        count_vm_event(!ret ? THP_SPLIT_PAGE : THP_SPLIT_PAGE_FAILED);
+        if (is_thp)
+                count_vm_event(!ret ? THP_SPLIT_PAGE : THP_SPLIT_PAGE_FAILED);
         return ret;
 }
 
@@ -3395,7 +3400,8 @@ void deferred_split_folio(struct folio *folio)
 
         spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
         if (list_empty(&folio->_deferred_list)) {
-                count_vm_event(THP_DEFERRED_SPLIT_PAGE);
+                if (folio_test_pmd_mappable(folio))
+                        count_vm_event(THP_DEFERRED_SPLIT_PAGE);
                 list_add_tail(&folio->_deferred_list, &ds_queue->split_queue);
                 ds_queue->split_queue_len++;
 #ifdef CONFIG_MEMCG
@@ -3604,6 +3610,9 @@ static int split_huge_pages_pid(int pid, unsigned long vaddr_start,
                 if (!is_transparent_hugepage(folio))
                         goto next;
 
+                if (new_order >= folio_order(folio))
+                        goto next;
+
                 total++;
                 /*
                  * For folios with private, split_huge_page_to_list_to_order()
@@ -3671,6 +3680,9 @@ static int split_huge_pages_in_file(const char *file_path, pgoff_t off_start,
                 total++;
                 nr_pages = folio_nr_pages(folio);
 
+                if (new_order >= folio_order(folio))
+                        goto next;
+
                 if (!folio_trylock(folio))
                         goto next;
 
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 5f999528ec30811364721530c639a294c8d4e6f8..fa787464662f64d3078eed306a1b1db237ec5d74 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -459,7 +459,7 @@ void khugepaged_enter_vma(struct vm_area_struct *vma,
 {
         if (!test_bit(MMF_VM_HUGEPAGE, &vma->vm_mm->flags) &&
             hugepage_flags_enabled()) {
-                if (thp_vma_allowable_order(vma, vm_flags, false, false, true,
+                if (thp_vma_allowable_order(vma, vm_flags, TVA_ENFORCE_SYSFS,
                                             PMD_ORDER))
                         __khugepaged_enter(vma->vm_mm);
         }
@@ -925,6 +925,7 @@ static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address,
                                    struct collapse_control *cc)
 {
         struct vm_area_struct *vma;
+        unsigned long tva_flags = cc->is_khugepaged ? TVA_ENFORCE_SYSFS : 0;
 
         if (unlikely(hpage_collapse_test_exit_or_disable(mm)))
                 return SCAN_ANY_PROCESS;
@@ -935,8 +936,7 @@ static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address,
 
         if (!thp_vma_suitable_order(vma, address, PMD_ORDER))
                 return SCAN_ADDRESS_RANGE;
-        if (!thp_vma_allowable_order(vma, vma->vm_flags, false, false,
-                                     cc->is_khugepaged, PMD_ORDER))
+        if (!thp_vma_allowable_order(vma, vma->vm_flags, tva_flags, PMD_ORDER))
                 return SCAN_VMA_CHECK;
         /*
          * Anon VMA expected, the address may be unmapped then
@@ -1527,8 +1527,7 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
          * and map it by a PMD, regardless of sysfs THP settings. As such, let's
          * analogously elide sysfs THP settings here.
          */
-        if (!thp_vma_allowable_order(vma, vma->vm_flags, false, false, false,
-                                     PMD_ORDER))
+        if (!thp_vma_allowable_order(vma, vma->vm_flags, 0, PMD_ORDER))
                 return SCAN_VMA_CHECK;
 
         /* Keep pmd pgtable for uffd-wp; see comment in retract_page_tables() */
@@ -2403,8 +2402,8 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result,
                         progress++;
                         break;
                 }
-                if (!thp_vma_allowable_order(vma, vma->vm_flags, false, false,
-                                             true, PMD_ORDER)) {
+                if (!thp_vma_allowable_order(vma, vma->vm_flags,
+                                             TVA_ENFORCE_SYSFS, PMD_ORDER)) {
 skip:
                         progress++;
                         continue;
@@ -2741,8 +2740,7 @@ int madvise_collapse(struct vm_area_struct *vma, struct vm_area_struct **prev,
 
         *prev = vma;
 
-        if (!thp_vma_allowable_order(vma, vma->vm_flags, false, false, false,
-                                     PMD_ORDER))
+        if (!thp_vma_allowable_order(vma, vma->vm_flags, 0, PMD_ORDER))
                 return -EINVAL;
 
         if (task_in_dynamic_pool(current))
diff --git a/mm/memory.c b/mm/memory.c
index fa4d1b499511043f10284ccdea10de68b58bbc17..a8f0df59aca1eb6e7d33a5e27a697332adb730f0 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3532,6 +3532,16 @@ static vm_fault_t wp_page_shared(struct vm_fault *vmf, struct folio *folio)
 static bool wp_can_reuse_anon_folio(struct folio *folio,
                                     struct vm_area_struct *vma)
 {
+        /*
+         * We could currently only reuse a subpage of a large folio if no
+         * other subpages of the large folios are still mapped. However,
+         * let's just consistently not reuse subpages even if we could
+         * reuse in that scenario, and give back a large folio a bit
+         * sooner.
+         */
+        if (folio_test_large(folio))
+                return false;
+
         /*
          * We have to verify under folio lock: these early checks are
          * just an optimization to avoid locking the folio and freeing
@@ -4333,8 +4343,8 @@ static struct folio *alloc_anon_folio(struct vm_fault *vmf)
          * for this vma. Then filter out the orders that can't be allocated over
          * the faulting address and still be fully contained in the vma.
          */
-        orders = thp_vma_allowable_orders(vma, vma->vm_flags, false, true, true,
-                                          BIT(PMD_ORDER) - 1);
+        orders = thp_vma_allowable_orders(vma, vma->vm_flags,
+                        TVA_IN_PF | TVA_ENFORCE_SYSFS, BIT(PMD_ORDER) - 1);
         orders = thp_vma_suitable_orders(vma, vmf->address, orders);
 
         if (!orders)
@@ -4807,7 +4817,8 @@ static int fault_around_bytes_set(void *data, u64 val)
          * The minimum value is 1 page, however this results in no fault-around
          * at all. See should_fault_around().
          */
-        fault_around_pages = max(rounddown_pow_of_two(val) >> PAGE_SHIFT, 1UL);
+        val = max(val, PAGE_SIZE);
+        fault_around_pages = rounddown_pow_of_two(val) >> PAGE_SHIFT;
         return 0;
 }
 
@@ -5071,51 +5082,17 @@ int numa_migrate_prep(struct folio *folio, struct vm_area_struct *vma,
 }
 
 static void numa_rebuild_single_mapping(struct vm_fault *vmf, struct vm_area_struct *vma,
-                                        unsigned long fault_addr, pte_t *fault_pte,
                                         bool writable)
 {
         pte_t pte, old_pte;
 
-        old_pte = ptep_modify_prot_start(vma, fault_addr, fault_pte);
+        old_pte = ptep_modify_prot_start(vma, vmf->address, vmf->pte);
         pte = pte_modify(old_pte, vma->vm_page_prot);
         pte = pte_mkyoung(pte);
         if (writable)
                 pte = pte_mkwrite(pte, vma);
-        ptep_modify_prot_commit(vma, fault_addr, fault_pte, old_pte, pte);
-        update_mmu_cache_range(vmf, vma, fault_addr, fault_pte, 1);
-}
-
-static void numa_rebuild_large_mapping(struct vm_fault *vmf, struct vm_area_struct *vma,
-                                       struct folio *folio, pte_t fault_pte,
-                                       bool ignore_writable, bool pte_write_upgrade)
-{
-        int nr = pte_pfn(fault_pte) - folio_pfn(folio);
-        unsigned long start = max(vmf->address - nr * PAGE_SIZE, vma->vm_start);
-        unsigned long end = min(vmf->address + (folio_nr_pages(folio) - nr) * PAGE_SIZE, vma->vm_end);
-        pte_t *start_ptep = vmf->pte - (vmf->address - start) / PAGE_SIZE;
-        unsigned long addr;
-
-        /* Restore all PTEs' mapping of the large folio */
-        for (addr = start; addr != end; start_ptep++, addr += PAGE_SIZE) {
-                pte_t ptent = ptep_get(start_ptep);
-                bool writable = false;
-
-                if (!pte_present(ptent) || !pte_protnone(ptent))
-                        continue;
-
-                if (pfn_folio(pte_pfn(ptent)) != folio)
-                        continue;
-
-                if (!ignore_writable) {
-                        ptent = pte_modify(ptent, vma->vm_page_prot);
-                        writable = pte_write(ptent);
-                        if (!writable && pte_write_upgrade &&
-                            can_change_pte_writable(vma, addr, ptent))
-                                writable = true;
-                }
-
-                numa_rebuild_single_mapping(vmf, vma, addr, start_ptep, writable);
-        }
+        ptep_modify_prot_commit(vma, vmf->address, vmf->pte, old_pte, pte);
+        update_mmu_cache_range(vmf, vma, vmf->address, vmf->pte, 1);
 }
 
 static vm_fault_t do_numa_page(struct vm_fault *vmf)
@@ -5123,26 +5100,25 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf)
         struct vm_area_struct *vma = vmf->vma;
         struct folio *folio = NULL;
         int nid = NUMA_NO_NODE;
-        bool writable = false, ignore_writable = false;
-        bool pte_write_upgrade = vma_wants_manual_pte_write_upgrade(vma);
+        bool writable = false;
         int last_cpupid;
         int target_nid;
         pte_t pte, old_pte;
-        int flags = 0, nr_pages;
+        int flags = 0;
 
         /*
-         * The "pte" at this point cannot be used safely without
-         * validation through pte_unmap_same(). It's of NUMA type but
-         * the pfn may be screwed if the read is non atomic.
+         * The pte cannot be used safely until we verify, while holding the page
+         * table lock, that its contents have not changed during fault handling.
          */
         spin_lock(vmf->ptl);
-        if (unlikely(!pte_same(ptep_get(vmf->pte), vmf->orig_pte))) {
+        /* Read the live PTE from the page tables: */
+        old_pte = ptep_get(vmf->pte);
+
+        if (unlikely(!pte_same(old_pte, vmf->orig_pte))) {
                 pte_unmap_unlock(vmf->pte, vmf->ptl);
                 goto out;
         }
 
-        /* Get the normal PTE */
-        old_pte = ptep_get(vmf->pte);
         pte = pte_modify(old_pte, vma->vm_page_prot);
 
         /*
@@ -5150,7 +5126,7 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf)
          * is only valid while holding the PT lock.
          */
         writable = pte_write(pte);
-        if (!writable && pte_write_upgrade &&
+        if (!writable && vma_wants_manual_pte_write_upgrade(vma) &&
             can_change_pte_writable(vma, vmf->address, pte))
                 writable = true;
 
@@ -5158,6 +5134,10 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf)
         if (!folio || folio_is_zone_device(folio))
                 goto out_map;
 
+        /* TODO: handle PTE-mapped THP */
+        if (folio_test_large(folio))
+                goto out_map;
+
         /*
          * Avoid grouping on RO pages in general. RO pages shouldn't hurt as
          * much anyway since they can be in shared cache state. This misses
@@ -5177,7 +5157,6 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf)
                 flags |= TNF_SHARED;
 
         nid = folio_nid(folio);
-        nr_pages = folio_nr_pages(folio);
         /*
          * For memory tiering mode, cpupid of slow memory page is used
          * to record page access time. So use default value.
@@ -5194,7 +5173,6 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf)
         }
         pte_unmap_unlock(vmf->pte, vmf->ptl);
         writable = false;
-        ignore_writable = true;
 
         /* Migrate to the requested node */
         if (migrate_misplaced_folio(folio, vma, target_nid)) {
@@ -5215,19 +5193,14 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf)
 
 out:
         if (nid != NUMA_NO_NODE)
-                task_numa_fault(last_cpupid, nid, nr_pages, flags);
+                task_numa_fault(last_cpupid, nid, 1, flags);
         return 0;
 out_map:
         /*
          * Make it present again, depending on how arch implements
          * non-accessible ptes, some can allow access by kernel mode.
          */
-        if (folio && folio_test_large(folio))
-                numa_rebuild_large_mapping(vmf, vma, folio, pte, ignore_writable,
-                                           pte_write_upgrade);
-        else
-                numa_rebuild_single_mapping(vmf, vma, vmf->address, vmf->pte,
-                                            writable);
+        numa_rebuild_single_mapping(vmf, vma, writable);
         pte_unmap_unlock(vmf->pte, vmf->ptl);
         goto out;
 }
@@ -5434,7 +5407,8 @@ static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma,
                 return VM_FAULT_OOM;
 retry_pud:
         if (pud_none(*vmf.pud) &&
-            thp_vma_allowable_order(vma, vm_flags, false, true, true, PUD_ORDER) &&
+            thp_vma_allowable_order(vma, vm_flags,
+                                    TVA_IN_PF | TVA_ENFORCE_SYSFS, PUD_ORDER) &&
             !task_in_dynamic_pool(current)) {
                 ret = create_huge_pud(&vmf);
                 if (!(ret & VM_FAULT_FALLBACK))
@@ -5469,7 +5443,8 @@ static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma,
                 goto retry_pud;
 
         if (pmd_none(*vmf.pmd) &&
-            thp_vma_allowable_order(vma, vm_flags, false, true, true, PMD_ORDER) &&
+            thp_vma_allowable_order(vma, vm_flags,
+                                    TVA_IN_PF | TVA_ENFORCE_SYSFS, PMD_ORDER) &&
             !task_in_dynamic_pool(current)) {
                 ret = create_huge_pmd(&vmf);
                 if (!(ret & VM_FAULT_FALLBACK))
diff --git a/mm/mprotect.c b/mm/mprotect.c
index b360577be4f8819637af6a13a4a8da3989d30dff..f121c46f6e4c43aa131e72a2b37d63eff7d7c37c 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -129,8 +129,7 @@ static long change_pte_range(struct mmu_gather *tlb,
 
                                 /* Also skip shared copy-on-write pages */
                                 if (is_cow_mapping(vma->vm_flags) &&
-                                    (folio_maybe_dma_pinned(folio) ||
-                                     folio_likely_mapped_shared(folio)))
+                                    folio_ref_count(folio) != 1)
                                         continue;
 
                                 /*
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index e6058942a0843872fdde61cc1cf21e6e2a88c931..6320c0dfba0d4e3c7b4843e8882044949a20d822 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -2343,6 +2343,9 @@ struct vmap_area *find_vmap_area(unsigned long addr)
         struct vmap_area *va;
         int i, j;
 
+        if (unlikely(!vmap_initialized))
+                return NULL;
+
         /*
          * An addr_to_node_id(addr) converts an address to a node index
          * where a VA is located. If VA spans several zones and passed
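
Not part of the patch: below is a minimal, standalone sketch of the TVA_* bitfield pattern this patch introduces in place of the smaps/in_pf/enforce_sysfs booleans previously passed to thp_vma_allowable_orders(). It is plain userspace C for illustration only; the helper name allowable_orders() and the assumption PMD_ORDER == 9 are not from the patch.

#include <stdbool.h>
#include <stdio.h>

/* Flag bits as defined by the patch in include/linux/huge_mm.h. */
#define TVA_SMAPS               (1 << 0)        /* Will be used for procfs */
#define TVA_IN_PF               (1 << 1)        /* Page fault handler */
#define TVA_ENFORCE_SYSFS       (1 << 2)        /* Obey sysfs configuration */

/*
 * Hypothetical stand-in for __thp_vma_allowable_orders(): decode the
 * bitfield into booleans once at the top, as the patched function does.
 */
static unsigned long allowable_orders(unsigned long tva_flags, unsigned long orders)
{
        bool smaps = tva_flags & TVA_SMAPS;
        bool in_pf = tva_flags & TVA_IN_PF;
        bool enforce_sysfs = tva_flags & TVA_ENFORCE_SYSFS;

        printf("smaps=%d in_pf=%d enforce_sysfs=%d\n", smaps, in_pf, enforce_sysfs);
        return orders;  /* the real function filters the order bitmask here */
}

int main(void)
{
        /* The fault path passes TVA_IN_PF | TVA_ENFORCE_SYSFS (PMD_ORDER assumed 9). */
        allowable_orders(TVA_IN_PF | TVA_ENFORCE_SYSFS, (1UL << 9) - 1);
        /* khugepaged enforces sysfs settings only when cc->is_khugepaged. */
        allowable_orders(TVA_ENFORCE_SYSFS, 1UL << 9);
        return 0;
}

Callers simply OR together the bits they need: smaps uses TVA_SMAPS | TVA_ENFORCE_SYSFS, the page fault path uses TVA_IN_PF | TVA_ENFORCE_SYSFS, and collapse_pte_mapped_thp()/madvise_collapse() pass 0 to deliberately ignore the sysfs configuration.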