diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index b993ae44111c007b54cee4ba3a530062062af2a1..efb370e79ac3a7a215d5722e93bbbc35ce08d7eb 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -11,7 +11,7 @@ vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf);
 int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
		  pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr,
		  struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma);
-void huge_pmd_set_accessed(struct vm_fault *vmf, pmd_t orig_pmd);
+void huge_pmd_set_accessed(struct vm_fault *vmf);
 int copy_huge_pud(struct mm_struct *dst_mm, struct mm_struct *src_mm,
		  pud_t *dst_pud, pud_t *src_pud, unsigned long addr,
		  struct vm_area_struct *vma);
@@ -24,7 +24,7 @@ static inline void huge_pud_set_accessed(struct vm_fault *vmf, pud_t orig_pud)
 }
 #endif

-vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf, pmd_t orig_pmd);
+vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf);
 struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
				   unsigned long addr,
				   pmd_t *pmd,
				   unsigned int flags);
@@ -291,7 +291,7 @@ struct page *follow_devmap_pmd(struct vm_area_struct *vma, unsigned long addr,
 struct page *follow_devmap_pud(struct vm_area_struct *vma, unsigned long addr,
		pud_t *pud, int flags, struct dev_pagemap **pgmap);

-vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t orig_pmd);
+vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf);

 extern struct page *huge_zero_page;
 extern unsigned long huge_zero_pfn;
@@ -444,8 +444,7 @@ static inline spinlock_t *pud_trans_huge_lock(pud_t *pud,
	return NULL;
 }

-static inline vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf,
-					       pmd_t orig_pmd)
+static inline vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf)
 {
	return 0;
 }
diff --git a/include/linux/migrate.h b/include/linux/migrate.h
index ade4993f5fab4d84facfa9876842cfc692dc1b7f..188f4b414bf892b21ace99a8e2f439da7f6b962d 100644
--- a/include/linux/migrate.h
+++ b/include/linux/migrate.h
@@ -105,14 +105,9 @@ static inline void __ClearPageMovable(struct page *page)
 #endif

 #ifdef CONFIG_NUMA_BALANCING
-extern bool pmd_trans_migrating(pmd_t pmd);
 extern int migrate_misplaced_page(struct page *page,
				  struct vm_area_struct *vma, int node);
 #else
-static inline bool pmd_trans_migrating(pmd_t pmd)
-{
-	return false;
-}
 static inline int migrate_misplaced_page(struct page *page,
					 struct vm_area_struct *vma, int node)
 {
@@ -120,24 +115,6 @@ static inline int migrate_misplaced_page(struct page *page,
 }
 #endif /* CONFIG_NUMA_BALANCING */

-#if defined(CONFIG_NUMA_BALANCING) && defined(CONFIG_TRANSPARENT_HUGEPAGE)
-extern int migrate_misplaced_transhuge_page(struct mm_struct *mm,
-			struct vm_area_struct *vma,
-			pmd_t *pmd, pmd_t entry,
-			unsigned long address,
-			struct page *page, int node);
-#else
-static inline int migrate_misplaced_transhuge_page(struct mm_struct *mm,
-			struct vm_area_struct *vma,
-			pmd_t *pmd, pmd_t entry,
-			unsigned long address,
-			struct page *page, int node)
-{
-	return -EAGAIN;
-}
-#endif /* CONFIG_NUMA_BALANCING && CONFIG_TRANSPARENT_HUGEPAGE*/
-
-
 #ifdef CONFIG_MIGRATION

 /*
diff --git a/include/linux/mm.h b/include/linux/mm.h
index e0d269b83c8f208752fc3cf8ee43dd28f9401c75..627f997bc5476fb42cd508aeafb72119348519a0 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -579,7 +579,13 @@ struct vm_fault {
	pud_t *pud;			/* Pointer to pud entry matching
					 * the 'address'
					 */
-	pte_t orig_pte;			/* Value of PTE at the time of fault */
+	KABI_REPLACE(pte_t orig_pte,
+	union {
+		pte_t orig_pte;		/* Value of PTE at the time of fault */
+		pmd_t orig_pmd;		/* Value of PMD at the time of fault,
+					 * used by PMD fault only.
+					 */
+	})

	struct page *cow_page;		/* Page handler may use for COW fault */
	struct page *page;		/* ->fault handlers should return a
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 20d548da46609f7a15e3e62e213bdd333b3c0562..eb293d17a1049f37c7286e93e822aec5911a531a 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1262,11 +1262,12 @@ void huge_pud_set_accessed(struct vm_fault *vmf, pud_t orig_pud)
 }
 #endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */

-void huge_pmd_set_accessed(struct vm_fault *vmf, pmd_t orig_pmd)
+void huge_pmd_set_accessed(struct vm_fault *vmf)
 {
	pmd_t entry;
	unsigned long haddr;
	bool write = vmf->flags & FAULT_FLAG_WRITE;
+	pmd_t orig_pmd = vmf->orig_pmd;

	vmf->ptl = pmd_lock(vmf->vma->vm_mm, vmf->pmd);
	if (unlikely(!pmd_same(*vmf->pmd, orig_pmd)))
@@ -1283,11 +1284,12 @@ void huge_pmd_set_accessed(struct vm_fault *vmf, pmd_t orig_pmd)
	spin_unlock(vmf->ptl);
 }

-vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf, pmd_t orig_pmd)
+vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf)
 {
	struct vm_area_struct *vma = vmf->vma;
	struct page *page;
	unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
+	pmd_t orig_pmd = vmf->orig_pmd;

	vmf->ptl = pmd_lockptr(vma->vm_mm, vmf->pmd);
	VM_BUG_ON_VMA(!vma->anon_vma, vma);
@@ -1423,160 +1425,77 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
 }

 /* NUMA hinting page fault entry point for trans huge pmds */
-vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t pmd)
+vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf)
 {
	struct vm_area_struct *vma = vmf->vma;
-	struct anon_vma *anon_vma = NULL;
+	pmd_t oldpmd = vmf->orig_pmd;
+	pmd_t pmd;
	struct page *page;
	unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
-	int page_nid = NUMA_NO_NODE, this_nid = numa_node_id();
+	int page_nid = NUMA_NO_NODE;
	int target_nid, last_cpupid = -1;
-	bool page_locked;
	bool migrated = false;
-	bool was_writable;
+	bool was_writable = pmd_savedwrite(oldpmd);
	int flags = 0;

	vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
-	if (unlikely(!pmd_same(pmd, *vmf->pmd)))
-		goto out_unlock;
-
-	/*
-	 * If there are potential migrations, wait for completion and retry
-	 * without disrupting NUMA hinting information. Do not relock and
-	 * check_same as the page may no longer be mapped.
-	 */
-	if (unlikely(pmd_trans_migrating(*vmf->pmd))) {
-		page = pmd_page(*vmf->pmd);
-		if (!get_page_unless_zero(page))
-			goto out_unlock;
+	if (unlikely(!pmd_same(oldpmd, *vmf->pmd))) {
		spin_unlock(vmf->ptl);
-		put_and_wait_on_page_locked(page);
		goto out;
	}

-	page = pmd_page(pmd);
-	BUG_ON(is_huge_zero_page(page));
-	page_nid = page_to_nid(page);
-	last_cpupid = page_cpupid_last(page);
-	count_vm_numa_event(NUMA_HINT_FAULTS);
-	if (page_nid == this_nid) {
-		count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL);
-		flags |= TNF_FAULT_LOCAL;
-	}
+	pmd = pmd_modify(oldpmd, vma->vm_page_prot);
+	page = vm_normal_page_pmd(vma, haddr, pmd);
+	if (!page)
+		goto out_map;

	/* See similar comment in do_numa_page for explanation */
-	if (!pmd_savedwrite(pmd))
+	if (!was_writable)
		flags |= TNF_NO_GROUP;

-	/*
-	 * Acquire the page lock to serialise THP migrations but avoid dropping
-	 * page_table_lock if at all possible
-	 */
-	page_locked = trylock_page(page);
-	target_nid = mpol_misplaced(page, vma, haddr);
-	if (target_nid == NUMA_NO_NODE) {
-		/* If the page was locked, there are no parallel migrations */
-		if (page_locked)
-			goto clear_pmdnuma;
-	}
-
-	/* Migration could have started since the pmd_trans_migrating check */
-	if (!page_locked) {
-		page_nid = NUMA_NO_NODE;
-		if (!get_page_unless_zero(page))
-			goto out_unlock;
-		spin_unlock(vmf->ptl);
-		put_and_wait_on_page_locked(page);
-		goto out;
-	}
-
-	/*
-	 * Page is misplaced. Page lock serialises migrations. Acquire anon_vma
-	 * to serialises splits
-	 */
-	get_page(page);
-	spin_unlock(vmf->ptl);
-	anon_vma = page_lock_anon_vma_read(page);
-
-	/* Confirm the PMD did not change while page_table_lock was released */
-	spin_lock(vmf->ptl);
-	if (unlikely(!pmd_same(pmd, *vmf->pmd))) {
-		unlock_page(page);
-		put_page(page);
-		page_nid = NUMA_NO_NODE;
-		goto out_unlock;
-	}
+	page_nid = page_to_nid(page);
+	last_cpupid = page_cpupid_last(page);
+	target_nid = numa_migrate_prep(page, vma, haddr, page_nid,
+				       &flags);

-	/* Bail if we fail to protect against THP splits for any reason */
-	if (unlikely(!anon_vma)) {
+	if (target_nid == NUMA_NO_NODE) {
		put_page(page);
-		page_nid = NUMA_NO_NODE;
-		goto clear_pmdnuma;
+		goto out_map;
	}

-	/*
-	 * Since we took the NUMA fault, we must have observed the !accessible
-	 * bit. Make sure all other CPUs agree with that, to avoid them
-	 * modifying the page we're about to migrate.
-	 *
-	 * Must be done under PTL such that we'll observe the relevant
-	 * inc_tlb_flush_pending().
-	 *
-	 * We are not sure a pending tlb flush here is for a huge page
-	 * mapping or not. Hence use the tlb range variant
-	 */
-	if (mm_tlb_flush_pending(vma->vm_mm)) {
-		flush_tlb_range(vma, haddr, haddr + HPAGE_PMD_SIZE);
-		/*
-		 * change_huge_pmd() released the pmd lock before
-		 * invalidating the secondary MMUs sharing the primary
-		 * MMU pagetables (with ->invalidate_range()). The
-		 * mmu_notifier_invalidate_range_end() (which
-		 * internally calls ->invalidate_range()) in
-		 * change_pmd_range() will run after us, so we can't
-		 * rely on it here and we need an explicit invalidate.
-		 */
-		mmu_notifier_invalidate_range(vma->vm_mm, haddr,
-					      haddr + HPAGE_PMD_SIZE);
-	}
-
-	/*
-	 * Migrate the THP to the requested node, returns with page unlocked
-	 * and access rights restored.
-	 */
	spin_unlock(vmf->ptl);
-	migrated = migrate_misplaced_transhuge_page(vma->vm_mm, vma,
-				vmf->pmd, pmd, vmf->address, page, target_nid);
+
+	migrated = migrate_misplaced_page(page, vma, target_nid);
	if (migrated) {
		flags |= TNF_MIGRATED;
		page_nid = target_nid;
-	} else
+	} else {
		flags |= TNF_MIGRATE_FAIL;
-
-	goto out;
-clear_pmdnuma:
-	BUG_ON(!PageLocked(page));
-	was_writable = pmd_savedwrite(pmd);
-	pmd = pmd_modify(pmd, vma->vm_page_prot);
-	pmd = pmd_mkyoung(pmd);
-	if (was_writable)
-		pmd = pmd_mkwrite(pmd);
-	set_pmd_at(vma->vm_mm, haddr, vmf->pmd, pmd);
-	update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
-	unlock_page(page);
-out_unlock:
-	spin_unlock(vmf->ptl);
+		vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
+		if (unlikely(!pmd_same(oldpmd, *vmf->pmd))) {
+			spin_unlock(vmf->ptl);
+			goto out;
+		}
+		goto out_map;
+	}

 out:
-	if (anon_vma)
-		page_unlock_anon_vma_read(anon_vma);
-
	if (page_nid != NUMA_NO_NODE)
		task_numa_fault(last_cpupid, page_nid, HPAGE_PMD_NR,
				flags);

	return 0;
+
+out_map:
+	/* Restore the PMD */
+	pmd = pmd_modify(oldpmd, vma->vm_page_prot);
+	pmd = pmd_mkyoung(pmd);
+	if (was_writable)
+		pmd = pmd_mkwrite(pmd);
+	set_pmd_at(vma->vm_mm, haddr, vmf->pmd, pmd);
+	update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
+	spin_unlock(vmf->ptl);
+	goto out;
 }

 /*
@@ -1806,6 +1725,7 @@ bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr,
 * Returns
 *  - 0 if PMD could not be locked
 *  - 1 if PMD was locked but protections unchange and TLB flush unnecessary
+ *      or if prot_numa but THP migration is not supported
 *  - HPAGE_PMD_NR is protections changed and TLB flush necessary
 */
 int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
@@ -1820,6 +1740,9 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
	bool uffd_wp = cp_flags & MM_CP_UFFD_WP;
	bool uffd_wp_resolve = cp_flags & MM_CP_UFFD_WP_RESOLVE;

+	if (prot_numa && !thp_migration_supported())
+		return 1;
+
	ptl = __pmd_trans_huge_lock(pmd, vma);
	if (!ptl)
		return 0;
diff --git a/mm/internal.h b/mm/internal.h
index a9fc47f3677f09d0b5e6ac74be7f7808f9507f07..0c4e6959b83cc1d6c97caf0f8147ec193e00199e 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -384,23 +384,6 @@ extern unsigned int munlock_vma_page(struct page *page);
 */
 extern void clear_page_mlock(struct page *page);

-/*
- * mlock_migrate_page - called only from migrate_misplaced_transhuge_page()
- * (because that does not go through the full procedure of migration ptes):
- * to migrate the Mlocked page flag; update statistics.
- */
-static inline void mlock_migrate_page(struct page *newpage, struct page *page)
-{
-	if (TestClearPageMlocked(page)) {
-		int nr_pages = thp_nr_pages(page);
-
-		/* Holding pmd lock, no change in irq context: __mod is safe */
-		__mod_zone_page_state(page_zone(page), NR_MLOCK, -nr_pages);
-		SetPageMlocked(newpage);
-		__mod_zone_page_state(page_zone(newpage), NR_MLOCK, nr_pages);
-	}
-}
-
 extern pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma);

 /*
@@ -476,7 +459,6 @@ static inline struct file *maybe_unlock_mmap_for_io(struct vm_fault *vmf,
 #else /* !CONFIG_MMU */
 static inline void clear_page_mlock(struct page *page) { }
 static inline void mlock_vma_page(struct page *page) { }
-static inline void mlock_migrate_page(struct page *new, struct page *old) { }
 #endif /* !CONFIG_MMU */

@@ -673,4 +655,7 @@ struct migration_target_control {

 DECLARE_PER_CPU(struct per_cpu_nodestat, boot_nodestats);

+int numa_migrate_prep(struct page *page, struct vm_area_struct *vma,
+		      unsigned long addr, int page_nid, int *flags);
+
 #endif /* __MM_INTERNAL_H */
diff --git a/mm/mem_sampling.c b/mm/mem_sampling.c
index 0eaea2680d83192663e0c669fc18035452f3ebf2..1d8a831be531d473317db6fc1c05b494bdd541d6 100644
--- a/mm/mem_sampling.c
+++ b/mm/mem_sampling.c
@@ -23,6 +23,7 @@
 #include
 #include
 #include
+#include "internal.h"

 struct mem_sampling_ops_struct mem_sampling_ops;

@@ -128,20 +129,77 @@ static void mem_sampling_process(struct mem_sampling_record *record_base, int nr
 }

 #ifdef CONFIG_NUMABALANCING_MEM_SAMPLING
-
-static int numa_migrate_prep(struct page *page, struct vm_area_struct *vma,
-			     unsigned long addr, int page_nid,
-			     int *flags)
+static inline void do_thp_numa_access(struct mm_struct *mm,
+				      struct vm_area_struct *vma,
+				      u64 vaddr, struct page *page)
 {
-	get_page(page);
+	int page_nid = NUMA_NO_NODE;
+	int target_nid, last_cpupid = -1;
+	bool migrated = false;
+	int flags = 0;
+	struct page *hpage = NULL;
+	u64 haddr = vaddr & HPAGE_PMD_MASK;
+	pgd_t *pgd;
+	p4d_t *p4d;
+	pud_t *pud;
+	pmd_t *pmd, pmde;
+	spinlock_t *ptl;
+
+	pgd = pgd_offset(mm, vaddr);
+	if (!pgd_present(*pgd))
+		return;
+
+	p4d = p4d_offset(pgd, vaddr);
+	if (!p4d_present(*p4d))
+		return;
+
+	pud = pud_offset(p4d, vaddr);
+	if (!pud_present(*pud))
+		return;
+
+	pmd = pmd_offset(pud, vaddr);
+	pmde = READ_ONCE(*pmd);
+	/* TODO: handle PTE-mapped THP */
+	if (!pmd_trans_huge(pmde))
+		return;
+
+	ptl = pmd_lock(mm, pmd);
+	pmde = READ_ONCE(*pmd);
+	if (unlikely(!pmd_trans_huge(pmde)))
+		goto out_unlock;
+
+	hpage = vm_normal_page_pmd(vma, haddr, pmde);
+	if (!hpage || hpage != compound_head(page))
+		goto out_unlock;
+
+	page_nid = page_to_nid(hpage);
+	last_cpupid = page_cpupid_last(hpage);
+	target_nid = numa_migrate_prep(hpage, vma, haddr, page_nid,
+				       &flags);
+	spin_unlock(ptl);
+	if (target_nid == NUMA_NO_NODE) {
+		put_page(hpage);
+		goto out;
+	}

-	count_vm_numa_event(NUMA_HINT_FAULTS);
-	if (page_nid == numa_node_id()) {
-		count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL);
-		*flags |= TNF_FAULT_LOCAL;
+	migrated = migrate_misplaced_page(hpage, vma, target_nid);
+	if (migrated) {
+		flags |= TNF_MIGRATED;
+		page_nid = target_nid;
+	} else {
+		flags |= TNF_MIGRATE_FAIL;
	}

-	return mpol_misplaced(page, vma, addr);
+out:
+	trace_mm_numa_migrating(haddr, page_nid, target_nid, flags&TNF_MIGRATED);
+	if (page_nid != NUMA_NO_NODE)
+		task_numa_fault(last_cpupid, page_nid, HPAGE_PMD_NR,
+				flags);
+
+	return;
+
+out_unlock:
+	spin_unlock(ptl);
 }

 /*
@@ -184,15 +242,16 @@ static void do_numa_access(struct task_struct *p, u64 vaddr, u64 paddr)
		goto out_unlock;

	page = pfn_to_online_page(PHYS_PFN(paddr));
-	if (!page || is_zone_device_page(page))
+	if (!page || is_zone_device_page(page) || PageKsm(page))
		goto out_unlock;

	if (unlikely(!PageLRU(page)))
		goto out_unlock;

-	/* TODO: handle PTE-mapped THP or PMD-mapped THP*/
-	if (PageCompound(page))
+	if (PageCompound(page)) {
+		do_thp_numa_access(mm, vma, vaddr, page);
		goto out_unlock;
+	}

	/*
	 * Flag if the page is shared between multiple address spaces. This
@@ -201,6 +260,10 @@ static void do_numa_access(struct task_struct *p, u64 vaddr, u64 paddr)
	if (page_mapcount(page) > 1 && (vma->vm_flags & VM_SHARED))
		flags |= TNF_SHARED;

+	/* Also skip shared copy-on-write pages */
+	if (is_cow_mapping(vma->vm_flags) && page_count(page) != 1)
+		goto out_unlock;
+
	last_cpupid = page_cpupid_last(page);
	page_nid = page_to_nid(page);

diff --git a/mm/memory.c b/mm/memory.c
index cd72c3d63e60eb6aef3adab400e7085ca9fb3bb9..0c4da925e8ad5f29a11c12ebda21f69b2f7935aa 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -4375,9 +4375,8 @@ static vm_fault_t do_fault(struct vm_fault *vmf)
	return ret;
 }

-static int numa_migrate_prep(struct page *page, struct vm_area_struct *vma,
-				unsigned long addr, int page_nid,
-				int *flags)
+int numa_migrate_prep(struct page *page, struct vm_area_struct *vma,
+		      unsigned long addr, int page_nid, int *flags)
 {
	get_page(page);

@@ -4490,12 +4489,12 @@ static inline vm_fault_t create_huge_pmd(struct vm_fault *vmf)
 }

 /* `inline' is required to avoid gcc 4.1.2 build error */
-static inline vm_fault_t wp_huge_pmd(struct vm_fault *vmf, pmd_t orig_pmd)
+static inline vm_fault_t wp_huge_pmd(struct vm_fault *vmf)
 {
	if (vma_is_anonymous(vmf->vma)) {
-		if (userfaultfd_huge_pmd_wp(vmf->vma, orig_pmd))
+		if (userfaultfd_huge_pmd_wp(vmf->vma, vmf->orig_pmd))
			return handle_userfault(vmf, VM_UFFD_WP);
-		return do_huge_pmd_wp_page(vmf, orig_pmd);
+		return do_huge_pmd_wp_page(vmf);
	}
	if (vmf->vma->vm_ops->huge_fault) {
		vm_fault_t ret = vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PMD);
@@ -4716,26 +4715,26 @@ static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma,
		if (!(ret & VM_FAULT_FALLBACK))
			return ret;
	} else {
-		pmd_t orig_pmd = *vmf.pmd;
+		vmf.orig_pmd = *vmf.pmd;

		barrier();
-		if (unlikely(is_swap_pmd(orig_pmd))) {
+		if (unlikely(is_swap_pmd(vmf.orig_pmd))) {
			VM_BUG_ON(thp_migration_supported() &&
-				  !is_pmd_migration_entry(orig_pmd));
-			if (is_pmd_migration_entry(orig_pmd))
+				  !is_pmd_migration_entry(vmf.orig_pmd));
+			if (is_pmd_migration_entry(vmf.orig_pmd))
				pmd_migration_entry_wait(mm, vmf.pmd);
			return 0;
		}
-		if (pmd_trans_huge(orig_pmd) || pmd_devmap(orig_pmd)) {
-			if (pmd_protnone(orig_pmd) && vma_is_accessible(vma))
-				return do_huge_pmd_numa_page(&vmf, orig_pmd);
+		if (pmd_trans_huge(vmf.orig_pmd) || pmd_devmap(vmf.orig_pmd)) {
+			if (pmd_protnone(vmf.orig_pmd) && vma_is_accessible(vma))
+				return do_huge_pmd_numa_page(&vmf);

-			if (dirty && !pmd_write(orig_pmd)) {
-				ret = wp_huge_pmd(&vmf, orig_pmd);
+			if (dirty && !pmd_write(vmf.orig_pmd)) {
+				ret = wp_huge_pmd(&vmf);
				if (!(ret & VM_FAULT_FALLBACK))
					return ret;
			} else {
-				huge_pmd_set_accessed(&vmf, orig_pmd);
+				huge_pmd_set_accessed(&vmf);
				return 0;
			}
		}
diff --git a/mm/migrate.c b/mm/migrate.c
index c8491a744e8c3ce755f289eadda527194eb3bd36..3f5b217d5af14b6f9af61af9b8608b2d0c11637e 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -1487,6 +1487,7 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page,
	struct page *page2;
	int swapwrite = current->flags & PF_SWAPWRITE;
	int rc, nr_subpages;
+	bool nosplit = (reason == MR_NUMA_MISPLACED);

	if (!swapwrite)
		current->flags |= PF_SWAPWRITE;
@@ -1527,8 +1528,9 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page,
				 * pages are added to the tail of the list so
				 * we encounter them after the rest of the list
				 * is processed.
+				 * THP NUMA faulting doesn't split THP to retry.
				 */
-				if (is_thp) {
+				if (is_thp && !nosplit) {
					lock_page(page);
					rc = split_huge_page_to_list(page, from);
					unlock_page(page);
@@ -2082,12 +2084,33 @@ static struct page *alloc_misplaced_dst_page(struct page *page,
	return newpage;
 }

+static struct page *alloc_misplaced_dst_page_thp(struct page *page,
+						 unsigned long data)
+{
+	int nid = (int) data;
+	struct page *newpage;
+
+	newpage = alloc_pages_node(nid, (GFP_TRANSHUGE_LIGHT | __GFP_THISNODE),
+				   HPAGE_PMD_ORDER);
+	if (!newpage)
+		goto out;
+
+	prep_transhuge_page(newpage);
+
+out:
+	return newpage;
+}
+
 static int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page)
 {
	int page_lru;

	VM_BUG_ON_PAGE(compound_order(page) && !PageTransHuge(page), page);

+	/* Do not migrate THP mapped by multiple processes */
+	if (PageTransHuge(page) && total_mapcount(page) > 1)
+		return 0;
+
	/* Avoid migrating to a node that is nearly full */
	if (!migrate_balanced_pgdat(pgdat, compound_nr(page)))
		return 0;
@@ -2098,18 +2121,6 @@ static int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page)
	if (isolate_lru_page(page))
		return 0;

-	/*
-	 * migrate_misplaced_transhuge_page() skips page migration's usual
-	 * check on page_count(), so we must do it here, now that the page
-	 * has been isolated: a GUP pin, or any other pin, prevents migration.
-	 * The expected page count is 3: 1 for page's mapcount and 1 for the
-	 * caller's pin and 1 for the reference taken by isolate_lru_page().
-	 */
-	if (PageTransHuge(page) && page_count(page) != 3) {
-		putback_lru_page(page);
-		return 0;
-	}
-
	page_lru = page_is_file_lru(page);
	mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON + page_lru,
				thp_nr_pages(page));
@@ -2123,12 +2134,6 @@ static int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page)
	return 1;
 }

-bool pmd_trans_migrating(pmd_t pmd)
-{
-	struct page *page = pmd_page(pmd);
-	return PageLocked(page);
-}
-
 /*
 * Attempt to migrate a misplaced page to the specified destination
 * node. Caller is expected to have an elevated reference count on
@@ -2141,6 +2146,21 @@ int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma,
	int isolated;
	int nr_remaining;
	LIST_HEAD(migratepages);
+	new_page_t *new;
+	bool compound;
+	int nr_pages = thp_nr_pages(page);
+
+	/*
+	 * PTE mapped THP or HugeTLB page can't reach here so the page could
+	 * be either base page or THP. And it must be head page if it is
+	 * THP.
+	 */
+	compound = PageTransHuge(page);
+
+	if (compound)
+		new = alloc_misplaced_dst_page_thp;
+	else
+		new = alloc_misplaced_dst_page;

	/*
	 * Don't migrate file pages that are mapped in multiple processes
@@ -2162,19 +2182,18 @@ int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma,
		goto out;

	list_add(&page->lru, &migratepages);
-	nr_remaining = migrate_pages(&migratepages, alloc_misplaced_dst_page,
-				     NULL, node, MIGRATE_ASYNC,
-				     MR_NUMA_MISPLACED);
+	nr_remaining = migrate_pages(&migratepages, *new, NULL, node,
+				     MIGRATE_ASYNC, MR_NUMA_MISPLACED);
	if (nr_remaining) {
		if (!list_empty(&migratepages)) {
			list_del(&page->lru);
-			dec_node_page_state(page, NR_ISOLATED_ANON +
-					page_is_file_lru(page));
+			mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON +
+					page_is_file_lru(page), -nr_pages);
			putback_lru_page(page);
		}
		isolated = 0;
	} else
-		count_vm_numa_event(NUMA_PAGE_MIGRATE);
+		count_vm_numa_events(NUMA_PAGE_MIGRATE, nr_pages);
	BUG_ON(!list_empty(&migratepages));

	return isolated;
@@ -2184,142 +2203,6 @@ int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma,
 }
 #endif /* CONFIG_NUMA_BALANCING */

-#if defined(CONFIG_NUMA_BALANCING) && defined(CONFIG_TRANSPARENT_HUGEPAGE)
-/*
- * Migrates a THP to a given target node. page must be locked and is unlocked
- * before returning.
- */
-int migrate_misplaced_transhuge_page(struct mm_struct *mm,
-				struct vm_area_struct *vma,
-				pmd_t *pmd, pmd_t entry,
-				unsigned long address,
-				struct page *page, int node)
-{
-	spinlock_t *ptl;
-	pg_data_t *pgdat = NODE_DATA(node);
-	int isolated = 0;
-	struct page *new_page = NULL;
-	int page_lru = page_is_file_lru(page);
-	unsigned long start = address & HPAGE_PMD_MASK;
-
-	new_page = alloc_pages_node(node,
-		(GFP_TRANSHUGE_LIGHT | __GFP_THISNODE),
-		HPAGE_PMD_ORDER);
-	if (!new_page)
-		goto out_fail;
-	prep_transhuge_page(new_page);
-
-	isolated = numamigrate_isolate_page(pgdat, page);
-	if (!isolated) {
-		put_page(new_page);
-		goto out_fail;
-	}
-
-	/* Prepare a page as a migration target */
-	__SetPageLocked(new_page);
-	if (PageSwapBacked(page))
-		__SetPageSwapBacked(new_page);
-
-	/* anon mapping, we can simply copy page->mapping to the new page: */
-	new_page->mapping = page->mapping;
-	new_page->index = page->index;
-	/* flush the cache before copying using the kernel virtual address */
-	flush_cache_range(vma, start, start + HPAGE_PMD_SIZE);
-	migrate_page_copy(new_page, page);
-	WARN_ON(PageLRU(new_page));
-
-	/* Recheck the target PMD */
-	ptl = pmd_lock(mm, pmd);
-	if (unlikely(!pmd_same(*pmd, entry) || !page_ref_freeze(page, 2))) {
-		spin_unlock(ptl);
-
-		/* Reverse changes made by migrate_page_copy() */
-		if (TestClearPageActive(new_page))
-			SetPageActive(page);
-		if (TestClearPageUnevictable(new_page))
-			SetPageUnevictable(page);
-
-		unlock_page(new_page);
-		put_page(new_page);		/* Free it */
-
-		/* Retake the callers reference and putback on LRU */
-		get_page(page);
-		putback_lru_page(page);
-		mod_node_page_state(page_pgdat(page),
-			 NR_ISOLATED_ANON + page_lru, -HPAGE_PMD_NR);
-
-		goto out_unlock;
-	}
-
-	entry = mk_huge_pmd(new_page, vma->vm_page_prot);
-	entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
-
-	/*
-	 * Overwrite the old entry under pagetable lock and establish
-	 * the new PTE. Any parallel GUP will either observe the old
-	 * page blocking on the page lock, block on the page table
-	 * lock or observe the new page. The SetPageUptodate on the
-	 * new page and page_add_new_anon_rmap guarantee the copy is
-	 * visible before the pagetable update.
-	 */
-	reliable_page_counter(new_page, vma->vm_mm, HPAGE_PMD_NR);
-	page_add_anon_rmap(new_page, vma, start, true);
-	/*
-	 * At this point the pmd is numa/protnone (i.e. non present) and the TLB
-	 * has already been flushed globally. So no TLB can be currently
-	 * caching this non present pmd mapping. There's no need to clear the
-	 * pmd before doing set_pmd_at(), nor to flush the TLB after
-	 * set_pmd_at(). Clearing the pmd here would introduce a race
-	 * condition against MADV_DONTNEED, because MADV_DONTNEED only holds the
-	 * mmap_lock for reading. If the pmd is set to NULL at any given time,
-	 * MADV_DONTNEED won't wait on the pmd lock and it'll skip clearing this
-	 * pmd.
-	 */
-	set_pmd_at(mm, start, pmd, entry);
-	update_mmu_cache_pmd(vma, address, &entry);
-
-	page_ref_unfreeze(page, 2);
-	mlock_migrate_page(new_page, page);
-	reliable_page_counter(page, vma->vm_mm, -HPAGE_PMD_NR);
-	page_remove_rmap(page, true);
-	set_page_owner_migrate_reason(new_page, MR_NUMA_MISPLACED);
-
-	spin_unlock(ptl);
-
-	/* Take an "isolate" reference and put new page on the LRU. */
-	get_page(new_page);
-	putback_lru_page(new_page);
-
-	unlock_page(new_page);
-	unlock_page(page);
-	put_page(page);			/* Drop the rmap reference */
-	put_page(page);			/* Drop the LRU isolation reference */
-
-	count_vm_events(PGMIGRATE_SUCCESS, HPAGE_PMD_NR);
-	count_vm_numa_events(NUMA_PAGE_MIGRATE, HPAGE_PMD_NR);
-
-	mod_node_page_state(page_pgdat(page),
-			NR_ISOLATED_ANON + page_lru,
-			-HPAGE_PMD_NR);
-	return isolated;
-
-out_fail:
-	count_vm_events(PGMIGRATE_FAIL, HPAGE_PMD_NR);
-	ptl = pmd_lock(mm, pmd);
-	if (pmd_same(*pmd, entry)) {
-		entry = pmd_modify(entry, vma->vm_page_prot);
-		set_pmd_at(mm, start, pmd, entry);
-		update_mmu_cache_pmd(vma, address, &entry);
-	}
-	spin_unlock(ptl);
-
-out_unlock:
-	unlock_page(page);
-	put_page(page);
-	return 0;
-}
-#endif /* CONFIG_NUMA_BALANCING */
-
 #endif /* CONFIG_NUMA */

 #ifdef CONFIG_DEVICE_PRIVATE
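---

Editor's note (commentary, not part of the patch): with this refactor the THP NUMA fault path follows the same optimistic scheme as do_numa_page(): __handle_mm_fault() snapshots the entry into vmf->orig_pmd, do_huge_pmd_numa_page() drops the page-table lock around the common migrate_misplaced_page() call, then retakes the lock, revalidates with pmd_same(), and restores the PMD at the out_map label when migration is skipped or fails. The same ordering appears in mem_sampling.c's do_thp_numa_access(): read under pmd_lock(), unlock for the migration, report via task_numa_fault() afterwards. The standalone C sketch below illustrates only that snapshot-and-revalidate pattern; it is hypothetical user-space code (the names entry, snapshot, and try_expensive_update are invented for illustration, not kernel API), assuming POSIX threads:

	#include <pthread.h>
	#include <stdbool.h>
	#include <stdio.h>

	static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
	static unsigned long entry;	/* stand-in for the PMD entry */

	static bool try_expensive_update(unsigned long snapshot)
	{
		/* Expensive work done without the lock (cf. the page migration). */
		unsigned long new_entry = snapshot + 1;

		pthread_mutex_lock(&lock);
		if (entry != snapshot) {	/* cf. !pmd_same(oldpmd, *vmf->pmd) */
			pthread_mutex_unlock(&lock);
			return false;		/* lost the race; caller re-faults */
		}
		entry = new_entry;		/* cf. set_pmd_at() under out_map */
		pthread_mutex_unlock(&lock);
		return true;
	}

	int main(void)
	{
		pthread_mutex_lock(&lock);
		unsigned long snapshot = entry;	/* cf. vmf.orig_pmd = *vmf.pmd */
		pthread_mutex_unlock(&lock);

		printf("update %s\n",
		       try_expensive_update(snapshot) ? "applied" : "lost race");
		return 0;
	}

Because the stale-entry case simply returns (the fault is retaken), no state needs to be unwound on failure; that is also why the patch can drop migrate_misplaced_transhuge_page() and its hand-rolled copy/rollback sequence.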