diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 95974b69e20207840b0f4dad0ec927301b512330..c3b38c890b45809de5f5067e9da71f6f1e60d5a9 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -109,6 +109,7 @@ config ARM64
 	select ARCH_SUPPORTS_SCHED_SOFT_QUOTA
 	select ARCH_SUPPORTS_PAGE_TABLE_CHECK
 	select ARCH_SUPPORTS_PER_VMA_LOCK
+	select ARCH_SUPPORTS_HUGE_PFNMAP if TRANSPARENT_HUGEPAGE
 	select ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
 	select ARCH_WANT_COMPAT_IPC_PARSE_VERSION if COMPAT
 	select ARCH_WANT_DEFAULT_BPF_JIT
diff --git a/arch/arm64/configs/openeuler_defconfig b/arch/arm64/configs/openeuler_defconfig
index f3ba33f1f521acea884eec2e66f9cdbebced0f56..409890bad06eb924b32f9b9950a7a5725b4ab41b 100644
--- a/arch/arm64/configs/openeuler_defconfig
+++ b/arch/arm64/configs/openeuler_defconfig
@@ -1197,6 +1197,8 @@ CONFIG_TRANSPARENT_HUGEPAGE_ALWAYS=y
 # CONFIG_TRANSPARENT_HUGEPAGE_MADVISE is not set
 CONFIG_THP_SWAP=y
 CONFIG_READ_ONLY_THP_FOR_FS=y
+CONFIG_ARCH_SUPPORTS_HUGE_PFNMAP=y
+CONFIG_ARCH_SUPPORTS_PMD_PFNMAP=y
 CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK=y
 CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK=y
 CONFIG_USE_PERCPU_NUMA_NODE_ID=y
diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
index 2b66aab73dbc42b7eda8c8833beccd6410c32d8e..f8afbcacac8fea529469719d76f3f9adda3eaa45 100644
--- a/arch/arm64/include/asm/pgtable.h
+++ b/arch/arm64/include/asm/pgtable.h
@@ -249,6 +249,15 @@ static inline pte_t pte_mkpresent(pte_t pte)
 	return set_pte_bit(pte, __pgprot(PTE_VALID));
 }
 
+static inline pte_t pte_clrhuge(pte_t pte)
+{
+	pteval_t mask = PTE_TYPE_MASK & ~PTE_VALID;
+	pteval_t val = PTE_TYPE_PAGE & ~PTE_VALID;
+
+	return __pte((pte_val(pte) & ~mask) | val);
+}
+#define pte_clrhuge pte_clrhuge
+
 static inline pmd_t pmd_mkcont(pmd_t pmd)
 {
 	return __pmd(pmd_val(pmd) | PMD_SECT_CONT);
@@ -347,6 +356,7 @@ static inline void __sync_cache_and_tags(pte_t pte, unsigned int nr_pages)
 /*
  * Select all bits except the pfn
  */
+#define pte_pgprot pte_pgprot
 static inline pgprot_t pte_pgprot(pte_t pte)
 {
 	unsigned long pfn = pte_pfn(pte);
@@ -514,6 +524,19 @@ static inline pmd_t pmd_mkdevmap(pmd_t pmd)
 	return pte_pmd(set_pte_bit(pmd_pte(pmd), __pgprot(PTE_DEVMAP)));
 }
 
+#ifdef CONFIG_ARCH_SUPPORTS_PMD_PFNMAP
+#define pmd_special(pte)	(!!((pmd_val(pte) & PTE_SPECIAL)))
+static inline pmd_t pmd_mkspecial(pmd_t pmd)
+{
+	return set_pmd_bit(pmd, __pgprot(PTE_SPECIAL));
+}
+
+extern bool nohugepfnmap;
+#define arch_needs_pgtable_deposit(vma)	\
+	(nohugepfnmap ? false : (!vma_is_dax(vma) && vma_is_special_huge(vma)))
+
+#endif
+
 #define __pmd_to_phys(pmd)	__pte_to_phys(pmd_pte(pmd))
 #define __phys_to_pmd_val(phys)	__phys_to_pte_val(phys)
 #define pmd_pfn(pmd)		((__pmd_to_phys(pmd) & PMD_MASK) >> PAGE_SHIFT)
@@ -531,6 +554,22 @@ static inline pmd_t pmd_mkdevmap(pmd_t pmd)
 #define pud_pfn(pud)		((__pud_to_phys(pud) & PUD_MASK) >> PAGE_SHIFT)
 #define pfn_pud(pfn,prot)	__pud(__phys_to_pud_val((phys_addr_t)(pfn) << PAGE_SHIFT) | pgprot_val(prot))
 
+#define pmd_pgprot pmd_pgprot
+static inline pgprot_t pmd_pgprot(pmd_t pmd)
+{
+	unsigned long pfn = pmd_pfn(pmd);
+
+	return __pgprot(pmd_val(pfn_pmd(pfn, __pgprot(0))) ^ pmd_val(pmd));
+}
+
+#define pud_pgprot pud_pgprot
+static inline pgprot_t pud_pgprot(pud_t pud)
+{
+	unsigned long pfn = pud_pfn(pud);
+
+	return __pgprot(pud_val(pfn_pud(pfn, __pgprot(0))) ^ pud_val(pud));
+}
+
 static inline void __set_pte_at(struct mm_struct *mm,
 				unsigned long __always_unused addr,
 				pte_t *ptep, pte_t pte, unsigned int nr)
diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h
index 8a6e6b6daa9065be56ab1d8d5b6907f3f55da5ce..03498ce61f5c06a014d908df55acf95ec377a936 100644
--- a/arch/powerpc/include/asm/book3s/64/pgtable.h
+++ b/arch/powerpc/include/asm/book3s/64/pgtable.h
@@ -1394,7 +1394,7 @@ extern int pmd_move_must_withdraw(struct spinlock *new_pmd_ptl,
  * slot information.
  */
 #define arch_needs_pgtable_deposit arch_needs_pgtable_deposit
-static inline bool arch_needs_pgtable_deposit(void)
+static inline bool arch_needs_pgtable_deposit(struct vm_area_struct *vma)
 {
 	if (radix_enabled())
 		return false;
diff --git a/arch/powerpc/include/asm/pgtable.h b/arch/powerpc/include/asm/pgtable.h
index db2fe941e4c8b92e300b0274609fb9706b7ba72f..c395ca4e769629a664476e01926926be04b894fc 100644
--- a/arch/powerpc/include/asm/pgtable.h
+++ b/arch/powerpc/include/asm/pgtable.h
@@ -65,6 +65,7 @@ static inline unsigned long pte_pfn(pte_t pte)
 /*
  * Select all bits except the pfn
  */
+#define pte_pgprot pte_pgprot
 static inline pgprot_t pte_pgprot(pte_t pte)
 {
 	unsigned long pte_flags;
diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h
index 93b90757d226dd34f0192ecbd52648bf5f1e9b56..1f4a630f22b5ddaeb226c725f40b645319bb9887 100644
--- a/arch/s390/include/asm/pgtable.h
+++ b/arch/s390/include/asm/pgtable.h
@@ -912,6 +912,7 @@ static inline int pte_unused(pte_t pte)
  * young/old accounting is not supported, i.e _PAGE_PROTECT and _PAGE_INVALID
  * must not be set.
  */
+#define pte_pgprot pte_pgprot
 static inline pgprot_t pte_pgprot(pte_t pte)
 {
 	unsigned long pte_flags = pte_val(pte) & _PAGE_CHG_MASK;
diff --git a/arch/sparc/include/asm/pgtable_64.h b/arch/sparc/include/asm/pgtable_64.h
index be9bcc50e4cbf2efdc1c145550ce7410d355fb43..5877be1af301d758f9dacabd53045ed40c5f2a60 100644
--- a/arch/sparc/include/asm/pgtable_64.h
+++ b/arch/sparc/include/asm/pgtable_64.h
@@ -782,6 +782,7 @@ static inline pmd_t pmd_mkwrite_novma(pmd_t pmd)
 	return __pmd(pte_val(pte));
 }
 
+#define pmd_pgprot pmd_pgprot
 static inline pgprot_t pmd_pgprot(pmd_t entry)
 {
 	unsigned long val = pmd_val(entry);
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 05225ce10c34601bf3c6c71f39ba164fdd783286..d97109f9d0343613641453a6613fe649e4c0ebc4 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -28,6 +28,7 @@ config X86_64
 	select ARCH_HAS_GIGANTIC_PAGE
 	select ARCH_SUPPORTS_INT128 if CC_HAS_INT128
 	select ARCH_SUPPORTS_PER_VMA_LOCK
+	select ARCH_SUPPORTS_HUGE_PFNMAP if TRANSPARENT_HUGEPAGE
 	select ARCH_USE_CMPXCHG_LOCKREF
 	select HAVE_ARCH_SOFT_DIRTY
 	select MODULES_USE_ELF_RELA
diff --git a/arch/x86/configs/openeuler_defconfig b/arch/x86/configs/openeuler_defconfig
index 6b9fa19c873a30175f64d515d33c53b1a1e08743..d6fbe9d76e51fd441e3e3e6c4ed151fce996ace8 100644
--- a/arch/x86/configs/openeuler_defconfig
+++ b/arch/x86/configs/openeuler_defconfig
@@ -1162,6 +1162,8 @@ CONFIG_TRANSPARENT_HUGEPAGE_ALWAYS=y
 CONFIG_THP_SWAP=y
 CONFIG_READ_ONLY_THP_FOR_FS=y
 CONFIG_PGTABLE_HAS_HUGE_LEAVES=y
+CONFIG_ARCH_SUPPORTS_HUGE_PFNMAP=y
+CONFIG_ARCH_SUPPORTS_PMD_PFNMAP=y
 CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK=y
 CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK=y
 CONFIG_USE_PERCPU_NUMA_NODE_ID=y
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index 993d49cd379a2c8007ee61b155375de89f1c62d4..c170ca166e7848afdf96ffca38daa50942aa5c03 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -121,6 +121,34 @@ extern pmdval_t early_pmd_flags;
 #define arch_end_context_switch(prev)	do {} while(0)
 #endif	/* CONFIG_PARAVIRT_XXL */
 
+static inline pmd_t pmd_set_flags(pmd_t pmd, pmdval_t set)
+{
+	pmdval_t v = native_pmd_val(pmd);
+
+	return native_make_pmd(v | set);
+}
+
+static inline pmd_t pmd_clear_flags(pmd_t pmd, pmdval_t clear)
+{
+	pmdval_t v = native_pmd_val(pmd);
+
+	return native_make_pmd(v & ~clear);
+}
+
+static inline pud_t pud_set_flags(pud_t pud, pudval_t set)
+{
+	pudval_t v = native_pud_val(pud);
+
+	return native_make_pud(v | set);
+}
+
+static inline pud_t pud_clear_flags(pud_t pud, pudval_t clear)
+{
+	pudval_t v = native_pud_val(pud);
+
+	return native_make_pud(v & ~clear);
+}
+
 /*
  * The following only work if pte_present() is true.
  * Undefined behaviour if not..
@@ -304,6 +332,30 @@ static inline int pud_devmap(pud_t pud)
 }
 #endif
 
+#ifdef CONFIG_ARCH_SUPPORTS_PMD_PFNMAP
+static inline bool pmd_special(pmd_t pmd)
+{
+	return pmd_flags(pmd) & _PAGE_SPECIAL;
+}
+
+static inline pmd_t pmd_mkspecial(pmd_t pmd)
+{
+	return pmd_set_flags(pmd, _PAGE_SPECIAL);
+}
+#endif	/* CONFIG_ARCH_SUPPORTS_PMD_PFNMAP */
+
+#ifdef CONFIG_ARCH_SUPPORTS_PUD_PFNMAP
+static inline bool pud_special(pud_t pud)
+{
+	return pud_flags(pud) & _PAGE_SPECIAL;
+}
+
+static inline pud_t pud_mkspecial(pud_t pud)
+{
+	return pud_set_flags(pud, _PAGE_SPECIAL);
+}
+#endif	/* CONFIG_ARCH_SUPPORTS_PUD_PFNMAP */
+
 static inline int pgd_devmap(pgd_t pgd)
 {
 	return 0;
@@ -453,6 +505,7 @@ static inline pte_t pte_clrhuge(pte_t pte)
 {
 	return pte_clear_flags(pte, _PAGE_PSE);
 }
+#define pte_clrhuge pte_clrhuge
 
 static inline pte_t pte_mkglobal(pte_t pte)
 {
@@ -474,20 +527,6 @@ static inline pte_t pte_mkdevmap(pte_t pte)
 	return pte_set_flags(pte, _PAGE_SPECIAL|_PAGE_DEVMAP);
 }
 
-static inline pmd_t pmd_set_flags(pmd_t pmd, pmdval_t set)
-{
-	pmdval_t v = native_pmd_val(pmd);
-
-	return native_make_pmd(v | set);
-}
-
-static inline pmd_t pmd_clear_flags(pmd_t pmd, pmdval_t clear)
-{
-	pmdval_t v = native_pmd_val(pmd);
-
-	return native_make_pmd(v & ~clear);
-}
-
 /* See comments above mksaveddirty_shift() */
 static inline pmd_t pmd_mksaveddirty(pmd_t pmd)
 {
@@ -582,20 +621,6 @@ static inline pmd_t pmd_mkwrite_novma(pmd_t pmd)
 pmd_t pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma);
 #define pmd_mkwrite pmd_mkwrite
 
-static inline pud_t pud_set_flags(pud_t pud, pudval_t set)
-{
-	pudval_t v = native_pud_val(pud);
-
-	return native_make_pud(v | set);
-}
-
-static inline pud_t pud_clear_flags(pud_t pud, pudval_t clear)
-{
-	pudval_t v = native_pud_val(pud);
-
-	return native_make_pud(v & ~clear);
-}
-
 /* See comments above mksaveddirty_shift() */
 static inline pud_t pud_mksaveddirty(pud_t pud)
 {
diff --git a/fs/dax.c b/fs/dax.c
index 8c09578fa03573e1df14856367ba175e67739491..a65e1a088a8aea54c940c589f0c46da67ce6068e 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -1221,7 +1221,7 @@ static vm_fault_t dax_pmd_load_hole(struct xa_state *xas, struct vm_fault *vmf,
 	*entry = dax_insert_entry(xas, vmf, iter, *entry, pfn,
 				  DAX_PMD | DAX_ZERO_PAGE);
 
-	if (arch_needs_pgtable_deposit()) {
+	if (arch_needs_pgtable_deposit(vma)) {
 		pgtable = pte_alloc_one(vma->vm_mm);
 		if (!pgtable)
 			return VM_FAULT_OOM;
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index 47a6634e74c4837597343151644ac52e5f4e9c70..328b1fbb134c0b07e683e642a0e30388ab676256 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -83,9 +83,9 @@ extern struct kobj_attribute thpsize_shmem_enabled_attr;
 /*
  * Mask of all large folio orders supported for file THP. Folios in a DAX
  * file is never split and the MAX_PAGECACHE_ORDER limit does not apply to
- * it.
+ * it. Same to PFNMAPs where there's neither page* nor pagecache.
  */
-#define THP_ORDERS_ALL_FILE_DAX		\
+#define THP_ORDERS_ALL_SPECIAL		\
 	(BIT(PMD_ORDER) | BIT(PUD_ORDER))
 #define THP_ORDERS_ALL_FILE_DEFAULT	\
 	((BIT(MAX_PAGECACHE_ORDER + 1) - 1) & ~BIT(0))
@@ -94,7 +94,7 @@ extern struct kobj_attribute thpsize_shmem_enabled_attr;
  * Mask of all large folio orders supported for THP.
 */
 #define THP_ORDERS_ALL	\
-	(THP_ORDERS_ALL_ANON | THP_ORDERS_ALL_FILE_DAX | THP_ORDERS_ALL_FILE_DEFAULT)
+	(THP_ORDERS_ALL_ANON | THP_ORDERS_ALL_SPECIAL | THP_ORDERS_ALL_FILE_DEFAULT)
 
 #define TVA_SMAPS		(1 << 0)	/* Will be used for procfs */
 #define TVA_IN_PF		(1 << 1)	/* Page fault handler */
@@ -454,11 +454,6 @@ static inline bool is_huge_zero_pmd(pmd_t pmd)
 	return pmd_present(pmd) && READ_ONCE(huge_zero_pfn) == pmd_pfn(pmd);
 }
 
-static inline bool is_huge_zero_pud(pud_t pud)
-{
-	return false;
-}
-
 struct page *mm_get_huge_zero_page(struct mm_struct *mm);
 void mm_put_huge_zero_page(struct mm_struct *mm);
 
@@ -595,11 +590,6 @@ static inline bool is_huge_zero_pmd(pmd_t pmd)
 	return false;
 }
 
-static inline bool is_huge_zero_pud(pud_t pud)
-{
-	return false;
-}
-
 static inline void mm_put_huge_zero_page(struct mm_struct *mm)
 {
 	return;
diff --git a/include/linux/mm.h b/include/linux/mm.h
index e364a2846a786aa4466022fd50ba04bf1933893d..e8d8b032024af77f34a878dddcae18fba76a627f 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2779,6 +2779,30 @@ static inline pte_t pte_mkspecial(pte_t pte)
 }
 #endif
 
+#ifndef CONFIG_ARCH_SUPPORTS_PMD_PFNMAP
+static inline bool pmd_special(pmd_t pmd)
+{
+	return false;
+}
+
+static inline pmd_t pmd_mkspecial(pmd_t pmd)
+{
+	return pmd;
+}
+#endif	/* CONFIG_ARCH_SUPPORTS_PMD_PFNMAP */
+
+#ifndef CONFIG_ARCH_SUPPORTS_PUD_PFNMAP
+static inline bool pud_special(pud_t pud)
+{
+	return false;
+}
+
+static inline pud_t pud_mkspecial(pud_t pud)
+{
+	return pud;
+}
+#endif	/* CONFIG_ARCH_SUPPORTS_PUD_PFNMAP */
+
 #ifndef CONFIG_ARCH_HAS_PTE_DEVMAP
 static inline int pte_devmap(pte_t pte)
 {
@@ -3603,6 +3627,8 @@ struct vm_area_struct *find_extend_vma_locked(struct mm_struct *,
 		unsigned long addr);
 int remap_pfn_range(struct vm_area_struct *, unsigned long addr,
 			unsigned long pfn, unsigned long size, pgprot_t);
+int remap_pfn_range_try_pmd(struct vm_area_struct *vma, unsigned long addr,
+		unsigned long pfn, unsigned long size, pgprot_t prot);
 int remap_pfn_range_notrack(struct vm_area_struct *vma, unsigned long addr,
 		unsigned long pfn, unsigned long size, pgprot_t prot);
 int vm_insert_page(struct vm_area_struct *, unsigned long addr, struct page *);
diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
index e33015ae2c4d4e4aaa60ef864b8997a6d9f67322..ca2aff867ffed93500369339006e593cd9e3350e 100644
--- a/include/linux/pgtable.h
+++ b/include/linux/pgtable.h
@@ -657,6 +657,12 @@ static inline pte_t ptep_get_and_clear_full(struct mm_struct *mm,
 }
 #endif
 
+#ifndef CONFIG_ARCH_SUPPORTS_HUGE_PFNMAP
+#ifndef pte_clrhuge
+#define pte_clrhuge(pte)	(pte)
+#endif
+#endif
+
 #ifndef get_and_clear_full_ptes
 /**
  * get_and_clear_full_ptes - Clear present PTEs that map consecutive pages of
@@ -940,7 +946,7 @@ extern pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm,
 					     pmd_t *pmdp);
 #endif
 #ifndef arch_needs_pgtable_deposit
-#define arch_needs_pgtable_deposit() (false)
+#define arch_needs_pgtable_deposit(vma) (false)
 #endif
 
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
@@ -1943,6 +1949,18 @@ typedef unsigned int pgtbl_mod_mask;
 #define MAX_PTRS_PER_P4D PTRS_PER_P4D
 #endif
 
+#ifndef pte_pgprot
+#define pte_pgprot(x) ((pgprot_t) {0})
+#endif
+
+#ifndef pmd_pgprot
+#define pmd_pgprot(x) ((pgprot_t) {0})
+#endif
+
+#ifndef pud_pgprot
+#define pud_pgprot(x) ((pgprot_t) {0})
+#endif
+
 /* description of effects of mapping type and prot in current implementation.
  * this is due to the limited x86 page protection hardware.  The expected
  * behavior is in parens:
diff --git a/mm/Kconfig b/mm/Kconfig
index 5972d143fb2ba57c7aed98bd3ef6aebf89c79f78..4eb0642b71e5cb49292f0afa60bccac1842ad50b 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -903,6 +903,19 @@ endif # TRANSPARENT_HUGEPAGE
 config PGTABLE_HAS_HUGE_LEAVES
 	def_bool TRANSPARENT_HUGEPAGE || HUGETLB_PAGE
 
+# TODO: Allow to be enabled without THP
+config ARCH_SUPPORTS_HUGE_PFNMAP
+	def_bool n
+	depends on TRANSPARENT_HUGEPAGE
+
+config ARCH_SUPPORTS_PMD_PFNMAP
+	def_bool y
+	depends on ARCH_SUPPORTS_HUGE_PFNMAP && HAVE_ARCH_TRANSPARENT_HUGEPAGE
+
+config ARCH_SUPPORTS_PUD_PFNMAP
+	def_bool y
+	depends on ARCH_SUPPORTS_HUGE_PFNMAP && HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
+
 #
 # UP and nommu archs use km based percpu allocator
 #
diff --git a/mm/gup.c b/mm/gup.c
index e38d3bd4d3f94d425fbb91c39c3b236ce346f21f..20b918ed1b4a75e0bf33575a661f4cf0159b6824 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -2883,6 +2883,9 @@ static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr,
 	if (!pmd_access_permitted(orig, flags & FOLL_WRITE))
 		return 0;
 
+	if (pmd_special(orig))
+		return 0;
+
 	if (pmd_devmap(orig)) {
 		if (unlikely(flags & FOLL_LONGTERM))
 			return 0;
@@ -2927,6 +2930,9 @@ static int gup_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr,
 	if (!pud_access_permitted(orig, flags & FOLL_WRITE))
 		return 0;
 
+	if (pud_special(orig))
+		return 0;
+
 	if (pud_devmap(orig)) {
 		if (unlikely(flags & FOLL_LONGTERM))
 			return 0;
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 11162cf63f449a3e14b170b517e162dc9f7ca218..2bd84157869e00791942fa4fed7867b980b9225b 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -97,8 +97,8 @@ unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma,
 	/* Check the intersection of requested and supported orders. */
 	if (vma_is_anonymous(vma))
 		supported_orders = THP_ORDERS_ALL_ANON;
-	else if (vma_is_dax(vma))
-		supported_orders = THP_ORDERS_ALL_FILE_DAX;
+	else if (vma_is_special_huge(vma))
+		supported_orders = THP_ORDERS_ALL_SPECIAL;
 	else
 		supported_orders = THP_ORDERS_ALL_FILE_DEFAULT;
 
@@ -1593,6 +1593,8 @@ static void insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
 	entry = pmd_mkhuge(pfn_t_pmd(pfn, prot));
 	if (pfn_t_devmap(pfn))
 		entry = pmd_mkdevmap(entry);
+	else
+		entry = pmd_mkspecial(entry);
 	if (write) {
 		entry = pmd_mkyoung(pmd_mkdirty(entry));
 		entry = maybe_pmd_mkwrite(entry, vma);
@@ -1644,7 +1646,7 @@ vm_fault_t vmf_insert_pfn_pmd(struct vm_fault *vmf, pfn_t pfn, bool write)
 	if (addr < vma->vm_start || addr >= vma->vm_end)
 		return VM_FAULT_SIGBUS;
 
-	if (arch_needs_pgtable_deposit()) {
+	if (arch_needs_pgtable_deposit(vma)) {
 		pgtable = pte_alloc_one(vma->vm_mm);
 		if (!pgtable)
 			return VM_FAULT_OOM;
@@ -1676,10 +1678,8 @@ static void insert_pfn_pud(struct vm_area_struct *vma, unsigned long addr,
 	ptl = pud_lock(mm, pud);
 	if (!pud_none(*pud)) {
 		if (write) {
-			if (pud_pfn(*pud) != pfn_t_to_pfn(pfn)) {
-				WARN_ON_ONCE(!is_huge_zero_pud(*pud));
+			if (WARN_ON_ONCE(pud_pfn(*pud) != pfn_t_to_pfn(pfn)))
 				goto out_unlock;
-			}
 			entry = pud_mkyoung(*pud);
 			entry = maybe_pud_mkwrite(pud_mkdirty(entry), vma);
 			if (pudp_set_access_flags(vma, addr, pud, entry, 1))
@@ -1691,6 +1691,8 @@ static void insert_pfn_pud(struct vm_area_struct *vma, unsigned long addr,
 	entry = pud_mkhuge(pfn_t_pud(pfn, prot));
 	if (pfn_t_devmap(pfn))
 		entry = pud_mkdevmap(entry);
+	else
+		entry = pud_mkspecial(entry);
 	if (write) {
 		entry = pud_mkyoung(pud_mkdirty(entry));
 		entry = maybe_pud_mkwrite(entry, vma);
@@ -1804,6 +1806,33 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 	pgtable_t pgtable = NULL;
 	int ret = -ENOMEM;
 
+	pmd = pmdp_get_lockless(src_pmd);
+	if (unlikely(pmd_present(pmd) && pmd_special(pmd))) {
+		pgtable = pte_alloc_one(dst_mm);
+		if (unlikely(!pgtable))
+			goto out;
+		dst_ptl = pmd_lock(dst_mm, dst_pmd);
+		src_ptl = pmd_lockptr(src_mm, src_pmd);
+		spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
+		/*
+		 * No need to recheck the pmd, it can't change with write
+		 * mmap lock held here.
+		 *
+		 * Meanwhile, making sure it's not a CoW VMA with writable
+		 * mapping, otherwise it means either the anon page wrongly
+		 * applied special bit, or we made the PRIVATE mapping be
+		 * able to wrongly write to the backend MMIO.
+		 */
+		VM_WARN_ON_ONCE(is_cow_mapping(src_vma->vm_flags) && pmd_write(pmd));
+
+		/* dax won't reach here, it will be intercepted at vma_needs_copy() */
+		VM_WARN_ON_ONCE(vma_is_dax(src_vma));
+
+		mm_inc_nr_ptes(dst_mm);
+		pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable);
+		goto set_pmd;
+	}
+
 	/* Skip if can be re-fill on fault */
 	if (!vma_is_anonymous(dst_vma))
 		return 0;
@@ -1886,7 +1915,9 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 	pmdp_set_wrprotect(src_mm, addr, src_pmd);
 	if (!userfaultfd_wp(dst_vma))
 		pmd = pmd_clear_uffd_wp(pmd);
-	pmd = pmd_mkold(pmd_wrprotect(pmd));
+	pmd = pmd_wrprotect(pmd);
+set_pmd:
+	pmd = pmd_mkold(pmd);
 	set_pmd_at(dst_mm, addr, dst_pmd, pmd);
 
 	ret = 0;
@@ -1971,21 +2002,15 @@ int copy_huge_pud(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 	if (unlikely(!pud_trans_huge(pud) && !pud_devmap(pud)))
 		goto out_unlock;
 
-	/*
-	 * When page table lock is held, the huge zero pud should not be
-	 * under splitting since we don't split the page itself, only pud to
-	 * a page table.
-	 */
-	if (is_huge_zero_pud(pud)) {
-		/* No huge zero pud yet */
-	}
-
 	/*
 	 * TODO: once we support anonymous pages, use
 	 * folio_try_dup_anon_rmap_*() and split if duplicating fails.
 	 */
-	pudp_set_wrprotect(src_mm, addr, src_pud);
-	pud = pud_mkold(pud_wrprotect(pud));
+	if (is_cow_mapping(vma->vm_flags) && pud_write(pud)) {
+		pudp_set_wrprotect(src_mm, addr, src_pud);
+		pud = pud_wrprotect(pud);
+	}
+	pud = pud_mkold(pud);
 	set_pud_at(dst_mm, addr, dst_pud, pud);
 
 	ret = 0;
@@ -2453,7 +2478,7 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
 	arch_check_zapped_pmd(vma, orig_pmd);
 	tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
 	if (vma_is_special_huge(vma)) {
-		if (arch_needs_pgtable_deposit())
+		if (arch_needs_pgtable_deposit(vma))
 			zap_deposited_table(tlb->mm, pmd);
 		spin_unlock(ptl);
 	} else if (is_huge_zero_pmd(orig_pmd)) {
@@ -2485,7 +2510,7 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
 			zap_deposited_table(tlb->mm, pmd);
 			add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR);
 		} else {
-			if (arch_needs_pgtable_deposit())
+			if (arch_needs_pgtable_deposit(vma))
 				zap_deposited_table(tlb->mm, pmd);
 			add_mm_counter(tlb->mm, mm_counter_file(folio),
 				       -HPAGE_PMD_NR);
@@ -2867,14 +2892,28 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
 
 	if (!vma_is_anonymous(vma)) {
 		old_pmd = pmdp_huge_clear_flush(vma, haddr, pmd);
-		/*
-		 * We are going to unmap this huge page. So
-		 * just go ahead and zap it
-		 */
-		if (arch_needs_pgtable_deposit())
-			zap_deposited_table(mm, pmd);
-		if (vma_is_special_huge(vma))
+		if (vma_is_special_huge(vma)) {
+			pte_t entry;
+
+			if (vma_is_dax(vma))
+				return;
+			pgtable = pgtable_trans_huge_withdraw(mm, pmd);
+			if (unlikely(!pgtable))
+				return;
+			pmd_populate(mm, &_pmd, pgtable);
+			pte = pte_offset_map(&_pmd, haddr);
+			entry = pte_clrhuge(pfn_pte(pmd_pfn(old_pmd), pmd_pgprot(old_pmd)));
+			set_ptes(mm, haddr, pte, entry, HPAGE_PMD_NR);
+			pte_unmap(pte);
+
+			smp_wmb(); /* make pte visible before pmd */
+			pmd_populate(mm, pmd, pgtable);
 			return;
+		} else if (arch_needs_pgtable_deposit(vma)) {
+			/* Zap for the non-special mappings. */
+			zap_deposited_table(mm, pmd);
+		}
+
 		if (unlikely(is_pmd_migration_entry(old_pmd))) {
 			swp_entry_t entry;
 
diff --git a/mm/memory.c b/mm/memory.c
index cf573f8f776483a3025e4f6445c80154afd0c085..f0584fec93c9f4027cb57be4c693eee601c99f1b 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -674,11 +674,10 @@ struct page *vm_normal_page_pmd(struct vm_area_struct *vma, unsigned long addr,
 {
 	unsigned long pfn = pmd_pfn(pmd);
 
-	/*
-	 * There is no pmd_special() but there may be special pmds, e.g.
-	 * in a direct-access (dax) mapping, so let's just replicate the
-	 * !CONFIG_ARCH_HAS_PTE_SPECIAL case from vm_normal_page() here.
-	 */
+	/* Currently it's only used for huge pfnmaps */
+	if (unlikely(pmd_special(pmd)))
+		return NULL;
+
 	if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) {
 		if (vma->vm_flags & VM_MIXEDMAP) {
 			if (!pfn_valid(pfn))
@@ -2586,9 +2585,59 @@ static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd,
 	return err;
 }
 
+#if defined(CONFIG_ARM64) && defined(CONFIG_ARCH_SUPPORTS_PMD_PFNMAP)
+bool __ro_after_init nohugepfnmap;
+
+static int __init set_nohugepfnmap(char *str)
+{
+	nohugepfnmap = true;
+	return 0;
+}
+early_param("nohugepfnmap", set_nohugepfnmap);
+
+static int remap_try_huge_pmd(struct mm_struct *mm, pmd_t *pmd,
+			      unsigned long addr, unsigned long end,
+			      unsigned long pfn, pgprot_t prot,
+			      unsigned int page_shift)
+{
+	pgtable_t pgtable;
+	spinlock_t *ptl;
+
+	if (nohugepfnmap)
+		return 0;
+
+	if (page_shift < PMD_SHIFT)
+		return 0;
+
+	if ((end - addr) != PMD_SIZE)
+		return 0;
+
+	if (!IS_ALIGNED(addr, PMD_SIZE))
+		return 0;
+
+	if (!IS_ALIGNED(pfn, HPAGE_PMD_NR))
+		return 0;
+
+	if (pmd_present(*pmd) && !pmd_free_pte_page(pmd, addr))
+		return 0;
+
+	pgtable = pte_alloc_one(mm);
+	if (unlikely(!pgtable))
+		return 0;
+
+	mm_inc_nr_ptes(mm);
+	ptl = pmd_lock(mm, pmd);
+	set_pmd_at(mm, addr, pmd, pmd_mkspecial(pmd_mkhuge(pfn_pmd(pfn, prot))));
+	pgtable_trans_huge_deposit(mm, pmd, pgtable);
+	spin_unlock(ptl);
+
+	return 1;
+}
+#endif
+
 static inline int remap_pmd_range(struct mm_struct *mm, pud_t *pud,
 			unsigned long addr, unsigned long end,
-			unsigned long pfn, pgprot_t prot)
+			unsigned long pfn, pgprot_t prot, unsigned int page_shift)
 {
 	pmd_t *pmd;
 	unsigned long next;
@@ -2601,6 +2650,12 @@ static inline int remap_pmd_range(struct mm_struct *mm, pud_t *pud,
 	VM_BUG_ON(pmd_trans_huge(*pmd));
 	do {
 		next = pmd_addr_end(addr, end);
+#if defined(CONFIG_ARM64) && defined(CONFIG_ARCH_SUPPORTS_PMD_PFNMAP)
+		if (remap_try_huge_pmd(mm, pmd, addr, next,
+				       pfn + (addr >> PAGE_SHIFT), prot, page_shift)) {
+			continue;
+		}
+#endif
 		err = remap_pte_range(mm, pmd, addr, next,
 				pfn + (addr >> PAGE_SHIFT), prot);
 		if (err)
@@ -2611,7 +2666,7 @@ static inline int remap_pmd_range(struct mm_struct *mm, pud_t *pud,
 
 static inline int remap_pud_range(struct mm_struct *mm, p4d_t *p4d,
 			unsigned long addr, unsigned long end,
-			unsigned long pfn, pgprot_t prot)
+			unsigned long pfn, pgprot_t prot, unsigned int page_shift)
 {
 	pud_t *pud;
 	unsigned long next;
@@ -2624,7 +2679,7 @@ static inline int remap_pud_range(struct mm_struct *mm, p4d_t *p4d,
 	do {
 		next = pud_addr_end(addr, end);
 		err = remap_pmd_range(mm, pud, addr, next,
-				pfn + (addr >> PAGE_SHIFT), prot);
+				pfn + (addr >> PAGE_SHIFT), prot, page_shift);
 		if (err)
 			return err;
 	} while (pud++, addr = next, addr != end);
@@ -2633,7 +2688,7 @@ static inline int remap_pud_range(struct mm_struct *mm, p4d_t *p4d,
 
 static inline int remap_p4d_range(struct mm_struct *mm, pgd_t *pgd,
 			unsigned long addr, unsigned long end,
-			unsigned long pfn, pgprot_t prot)
+			unsigned long pfn, pgprot_t prot, unsigned int page_shift)
 {
 	p4d_t *p4d;
 	unsigned long next;
@@ -2646,7 +2701,7 @@ static inline int remap_p4d_range(struct mm_struct *mm, pgd_t *pgd,
 	do {
 		next = p4d_addr_end(addr, end);
 		err = remap_pud_range(mm, p4d, addr, next,
-				pfn + (addr >> PAGE_SHIFT), prot);
+				pfn + (addr >> PAGE_SHIFT), prot, page_shift);
 		if (err)
 			return err;
 	} while (p4d++, addr = next, addr != end);
@@ -2654,7 +2709,7 @@ static inline int remap_p4d_range(struct mm_struct *mm, pgd_t *pgd,
 }
 
 static int remap_pfn_range_internal(struct vm_area_struct *vma, unsigned long addr,
-		unsigned long pfn, unsigned long size, pgprot_t prot)
+		unsigned long pfn, unsigned long size, pgprot_t prot, unsigned int page_shift)
 {
 	pgd_t *pgd;
 	unsigned long next;
@@ -2698,7 +2753,7 @@ static int remap_pfn_range_internal(struct vm_area_struct *vma, unsigned long ad
 	do {
 		next = pgd_addr_end(addr, end);
 		err = remap_p4d_range(mm, pgd, addr, next,
-				pfn + (addr >> PAGE_SHIFT), prot);
+				pfn + (addr >> PAGE_SHIFT), prot, page_shift);
 		if (err)
 			return err;
 	} while (pgd++, addr = next, addr != end);
@@ -2706,15 +2761,10 @@ static int remap_pfn_range_internal(struct vm_area_struct *vma, unsigned long ad
 	return 0;
 }
 
-/*
- * Variant of remap_pfn_range that does not call track_pfn_remap. The caller
- * must have pre-validated the caching bits of the pgprot_t.
- */
-int remap_pfn_range_notrack(struct vm_area_struct *vma, unsigned long addr,
-		unsigned long pfn, unsigned long size, pgprot_t prot)
+static int __remap_pfn_range_notrack(struct vm_area_struct *vma, unsigned long addr,
+		unsigned long pfn, unsigned long size, pgprot_t prot, unsigned int page_shift)
 {
-	int error = remap_pfn_range_internal(vma, addr, pfn, size, prot);
-
+	int error = remap_pfn_range_internal(vma, addr, pfn, size, prot, page_shift);
 	if (!error)
 		return 0;
 
@@ -2727,6 +2777,16 @@ int remap_pfn_range_notrack(struct vm_area_struct *vma, unsigned long addr,
 	return error;
 }
 
+/*
+ * Variant of remap_pfn_range that does not call track_pfn_remap. The caller
+ * must have pre-validated the caching bits of the pgprot_t.
+ */
+int remap_pfn_range_notrack(struct vm_area_struct *vma, unsigned long addr,
+		unsigned long pfn, unsigned long size, pgprot_t prot)
+{
+	return __remap_pfn_range_notrack(vma, addr, pfn, size, prot, PAGE_SHIFT);
+}
+
 /**
  * remap_pfn_range - remap kernel memory to userspace
  * @vma: user vma to map to
@@ -2739,8 +2799,9 @@ int remap_pfn_range_notrack(struct vm_area_struct *vma, unsigned long addr,
  *
  * Return: %0 on success, negative error code otherwise.
  */
-int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
-		unsigned long pfn, unsigned long size, pgprot_t prot)
+int __remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
+		unsigned long pfn, unsigned long size, pgprot_t prot,
+		unsigned int page_shift)
 {
 	int err;
 
@@ -2748,13 +2809,28 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
 	if (err)
 		return -EINVAL;
 
-	err = remap_pfn_range_notrack(vma, addr, pfn, size, prot);
+	err = __remap_pfn_range_notrack(vma, addr, pfn, size, prot, page_shift);
 	if (err)
 		untrack_pfn(vma, pfn, PAGE_ALIGN(size), true);
 	return err;
 }
+
+int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
+		unsigned long pfn, unsigned long size, pgprot_t prot)
+{
+	return __remap_pfn_range(vma, addr, pfn, size, prot, PAGE_SHIFT);
+}
 EXPORT_SYMBOL(remap_pfn_range);
 
+#if defined(CONFIG_ARM64) && defined(CONFIG_ARCH_SUPPORTS_PMD_PFNMAP)
+int remap_pfn_range_try_pmd(struct vm_area_struct *vma, unsigned long addr,
+		unsigned long pfn, unsigned long size, pgprot_t prot)
+{
+	return __remap_pfn_range(vma, addr, pfn, size, prot, PMD_SHIFT);
+}
+EXPORT_SYMBOL_GPL(remap_pfn_range_try_pmd);
+#endif
+
 /**
  * vm_iomap_memory - remap memory to userspace
  * @vma: user vma to map to
@@ -4965,7 +5041,7 @@ vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page)
 	 * Archs like ppc64 need additional space to store information
 	 * related to pte entry. Use the preallocated table for that.
 	 */
-	if (arch_needs_pgtable_deposit() && !vmf->prealloc_pte) {
+	if (arch_needs_pgtable_deposit(vma) && !vmf->prealloc_pte) {
 		vmf->prealloc_pte = pte_alloc_one(vma->vm_mm);
 		if (!vmf->prealloc_pte)
 			return VM_FAULT_OOM;
@@ -4988,7 +5064,7 @@ vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page)
 	/*
 	 * deposit and withdraw with pmd lock held
 	 */
-	if (arch_needs_pgtable_deposit())
+	if (arch_needs_pgtable_deposit(vma))
 		deposit_prealloc_pte(vmf);
 
 	set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry);
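Usage note (not part of the patch): the mm/memory.c hunks above export remap_pfn_range_try_pmd() so an arm64 driver can ask for PMD-level PFN mappings of a large, PMD-aligned MMIO region. A minimal, hypothetical sketch of a driver mmap handler follows; the foo_dev structure, its bar_phys/bar_len fields, and foo_mmap() are illustrative assumptions, while remap_pfn_range_try_pmd(), remap_pfn_range() and the nohugepfnmap early_param come from the patch itself.

/*
 * Hypothetical example: map a device BAR into userspace, preferring
 * PMD-sized leaves when the patched kernel supports them.
 */
#include <linux/fs.h>
#include <linux/mm.h>

struct foo_dev {
	phys_addr_t bar_phys;	/* physical base of the MMIO region (assumed) */
	size_t bar_len;		/* length of the MMIO region (assumed) */
};

static int foo_mmap(struct file *file, struct vm_area_struct *vma)
{
	struct foo_dev *fdev = file->private_data;
	unsigned long size = vma->vm_end - vma->vm_start;

	if (size > fdev->bar_len)
		return -EINVAL;

	/* MMIO: must not be cached */
	vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);

#if defined(CONFIG_ARM64) && defined(CONFIG_ARCH_SUPPORTS_PMD_PFNMAP)
	/*
	 * Request PMD leaves where possible; remap_try_huge_pmd() returns 0
	 * for any PMD that is not fully covered and aligned, in which case
	 * that stretch is mapped with ordinary PTEs, and the "nohugepfnmap"
	 * boot parameter disables the huge path entirely.
	 */
	return remap_pfn_range_try_pmd(vma, vma->vm_start,
				       fdev->bar_phys >> PAGE_SHIFT,
				       size, vma->vm_page_prot);
#else
	return remap_pfn_range(vma, vma->vm_start,
			       fdev->bar_phys >> PAGE_SHIFT,
			       size, vma->vm_page_prot);
#endif
}

Because misaligned or undersized spans silently fall back to the PTE path inside remap_pmd_range(), a caller does not need its own alignment checks; the _try_pmd variant is never worse than plain remap_pfn_range().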