diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index d1c4d7a769c0aec827075d6e9743dad717360220..12ecdfe5a15ba93997406713ff643c2166eb6de5 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -2441,6 +2441,15 @@ config UNWIND_PATCH_PAC_INTO_SCS select UNWIND_TABLES select DYNAMIC_SCS +config ARM64_CONTPTE + bool "Contiguous PTE mappings for user memory" if EXPERT + depends on TRANSPARENT_HUGEPAGE + default y + help + When enabled, user mappings are configured using the PTE contiguous + bit, for any mappings that meet the size and alignment requirements. + This reduces TLB pressure and improves performance. + config CLEAR_USER_WORKAROUND bool "Enable clear user workaround" depends on ARCH_HISI diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 52d0b0a763f1640905e5b10ab170e67e6fe6db93..401087e8a43dc08621a0a60ca6d3fe431649bb4e 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -93,7 +93,8 @@ static inline pteval_t __phys_to_pte_val(phys_addr_t phys) __pte(__phys_to_pte_val((phys_addr_t)(pfn) << PAGE_SHIFT) | pgprot_val(prot)) #define pte_none(pte) (!pte_val(pte)) -#define pte_clear(mm,addr,ptep) set_pte(ptep, __pte(0)) +#define __pte_clear(mm, addr, ptep) \ + __set_pte(ptep, __pte(0)) #define pte_page(pte) (pfn_to_page(pte_pfn(pte))) /* @@ -132,12 +133,16 @@ static inline pteval_t __phys_to_pte_val(phys_addr_t phys) */ #define pte_valid_not_user(pte) \ ((pte_val(pte) & (PTE_VALID | PTE_USER | PTE_UXN)) == (PTE_VALID | PTE_UXN)) +/* + * Returns true if the pte is valid and has the contiguous bit set. + */ +#define pte_valid_cont(pte) (pte_valid(pte) && pte_cont(pte)) /* * Could the pte be present in the TLB? We must check mm_tlb_flush_pending * so that we don't erroneously return false for pages that have been * remapped as PROT_NONE but are yet to be flushed from the TLB. * Note that we can't make any assumptions based on the state of the access - * flag, since ptep_clear_flush_young() elides a DSB when invalidating the + * flag, since __ptep_clear_flush_young() elides a DSB when invalidating the * TLB. */ #define pte_accessible(mm, pte) \ @@ -261,7 +266,7 @@ static inline pte_t pte_mkdevmap(pte_t pte) return set_pte_bit(pte, __pgprot(PTE_DEVMAP | PTE_SPECIAL)); } -static inline void set_pte(pte_t *ptep, pte_t pte) +static inline void __set_pte(pte_t *ptep, pte_t pte) { WRITE_ONCE(*ptep, pte); @@ -275,6 +280,11 @@ static inline void set_pte(pte_t *ptep, pte_t pte) } } +static inline pte_t __ptep_get(pte_t *ptep) +{ + return READ_ONCE(*ptep); +} + extern void __sync_icache_dcache(pte_t pteval); bool pgattr_change_is_safe(u64 old, u64 new); @@ -302,7 +312,7 @@ static inline void __check_safe_pte_update(struct mm_struct *mm, pte_t *ptep, if (!IS_ENABLED(CONFIG_DEBUG_VM)) return; - old_pte = READ_ONCE(*ptep); + old_pte = __ptep_get(ptep); if (!pte_valid(old_pte) || !pte_valid(pte)) return; @@ -311,7 +321,7 @@ static inline void __check_safe_pte_update(struct mm_struct *mm, pte_t *ptep, /* * Check for potential race with hardware updates of the pte - * (ptep_set_access_flags safely changes valid ptes without going + * (__ptep_set_access_flags safely changes valid ptes without going * through an invalid entry). 
*/ VM_WARN_ONCE(!pte_young(pte), @@ -351,29 +361,28 @@ static inline pgprot_t pte_pgprot(pte_t pte) return __pgprot(pte_val(pfn_pte(pfn, __pgprot(0))) ^ pte_val(pte)); } -#define pte_next_pfn pte_next_pfn -static inline pte_t pte_next_pfn(pte_t pte) +#define pte_advance_pfn pte_advance_pfn +static inline pte_t pte_advance_pfn(pte_t pte, unsigned long nr) { - return pfn_pte(pte_pfn(pte) + 1, pte_pgprot(pte)); + return pfn_pte(pte_pfn(pte) + nr, pte_pgprot(pte)); } -static inline void set_ptes(struct mm_struct *mm, - unsigned long __always_unused addr, - pte_t *ptep, pte_t pte, unsigned int nr) +static inline void __set_ptes(struct mm_struct *mm, + unsigned long __always_unused addr, + pte_t *ptep, pte_t pte, unsigned int nr) { page_table_check_ptes_set(mm, ptep, pte, nr); __sync_cache_and_tags(pte, nr); for (;;) { __check_safe_pte_update(mm, ptep, pte); - set_pte(ptep, pte); + __set_pte(ptep, pte); if (--nr == 0) break; ptep++; - pte = pte_next_pfn(pte); + pte = pte_advance_pfn(pte, 1); } } -#define set_ptes set_ptes /* * Huge pte definitions. @@ -540,7 +549,7 @@ static inline void __set_pte_at(struct mm_struct *mm, { __sync_cache_and_tags(pte, nr); __check_safe_pte_update(mm, ptep, pte); - set_pte(ptep, pte); + __set_pte(ptep, pte); } static inline void set_pmd_at(struct mm_struct *mm, unsigned long addr, @@ -854,8 +863,7 @@ static inline pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot) return pte_pmd(pte_modify(pmd_pte(pmd), newprot)); } -#define __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS -extern int ptep_set_access_flags(struct vm_area_struct *vma, +extern int __ptep_set_access_flags(struct vm_area_struct *vma, unsigned long address, pte_t *ptep, pte_t entry, int dirty); @@ -865,7 +873,8 @@ static inline int pmdp_set_access_flags(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp, pmd_t entry, int dirty) { - return ptep_set_access_flags(vma, address, (pte_t *)pmdp, pmd_pte(entry), dirty); + return __ptep_set_access_flags(vma, address, (pte_t *)pmdp, + pmd_pte(entry), dirty); } static inline int pud_devmap(pud_t pud) @@ -899,12 +908,13 @@ static inline bool pud_user_accessible_page(pud_t pud) /* * Atomic pte/pmd modifications. 
*/ -#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG -static inline int __ptep_test_and_clear_young(pte_t *ptep) +static inline int __ptep_test_and_clear_young(struct vm_area_struct *vma, + unsigned long address, + pte_t *ptep) { pte_t old_pte, pte; - pte = READ_ONCE(*ptep); + pte = __ptep_get(ptep); do { old_pte = pte; pte = pte_mkold(pte); @@ -915,18 +925,10 @@ static inline int __ptep_test_and_clear_young(pte_t *ptep) return pte_young(pte); } -static inline int ptep_test_and_clear_young(struct vm_area_struct *vma, - unsigned long address, - pte_t *ptep) -{ - return __ptep_test_and_clear_young(ptep); -} - -#define __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH -static inline int ptep_clear_flush_young(struct vm_area_struct *vma, +static inline int __ptep_clear_flush_young(struct vm_area_struct *vma, unsigned long address, pte_t *ptep) { - int young = ptep_test_and_clear_young(vma, address, ptep); + int young = __ptep_test_and_clear_young(vma, address, ptep); if (young) { /* @@ -949,12 +951,11 @@ static inline int pmdp_test_and_clear_young(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp) { - return ptep_test_and_clear_young(vma, address, (pte_t *)pmdp); + return __ptep_test_and_clear_young(vma, address, (pte_t *)pmdp); } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ -#define __HAVE_ARCH_PTEP_GET_AND_CLEAR -static inline pte_t ptep_get_and_clear(struct mm_struct *mm, +static inline pte_t __ptep_get_and_clear(struct mm_struct *mm, unsigned long address, pte_t *ptep) { pte_t pte = __pte(xchg_relaxed(&pte_val(*ptep), 0)); @@ -964,6 +965,37 @@ static inline pte_t ptep_get_and_clear(struct mm_struct *mm, return pte; } +static inline void __clear_full_ptes(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, unsigned int nr, int full) +{ + for (;;) { + __ptep_get_and_clear(mm, addr, ptep); + if (--nr == 0) + break; + ptep++; + addr += PAGE_SIZE; + } +} + +static inline pte_t __get_and_clear_full_ptes(struct mm_struct *mm, + unsigned long addr, pte_t *ptep, + unsigned int nr, int full) +{ + pte_t pte, tmp_pte; + + pte = __ptep_get_and_clear(mm, addr, ptep); + while (--nr) { + ptep++; + addr += PAGE_SIZE; + tmp_pte = __ptep_get_and_clear(mm, addr, ptep); + if (pte_dirty(tmp_pte)) + pte = pte_mkdirty(pte); + if (pte_young(tmp_pte)) + pte = pte_mkyoung(pte); + } + return pte; +} + #ifdef CONFIG_TRANSPARENT_HUGEPAGE #define __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm, @@ -977,16 +1009,12 @@ static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm, } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ -/* - * ptep_set_wrprotect - mark read-only while trasferring potential hardware - * dirty status (PTE_DBM && !PTE_RDONLY) to the software PTE_DIRTY bit. - */ -#define __HAVE_ARCH_PTEP_SET_WRPROTECT -static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long address, pte_t *ptep) +static inline void ___ptep_set_wrprotect(struct mm_struct *mm, + unsigned long address, pte_t *ptep, + pte_t pte) { - pte_t old_pte, pte; + pte_t old_pte; - pte = READ_ONCE(*ptep); do { old_pte = pte; pte = pte_wrprotect(pte); @@ -995,12 +1023,31 @@ static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addres } while (pte_val(pte) != pte_val(old_pte)); } +/* + * __ptep_set_wrprotect - mark read-only while transferring potential hardware + * dirty status (PTE_DBM && !PTE_RDONLY) to the software PTE_DIRTY bit.
+ */ +static inline void __ptep_set_wrprotect(struct mm_struct *mm, + unsigned long address, pte_t *ptep) +{ + ___ptep_set_wrprotect(mm, address, ptep, __ptep_get(ptep)); +} + +static inline void __wrprotect_ptes(struct mm_struct *mm, unsigned long address, + pte_t *ptep, unsigned int nr) +{ + unsigned int i; + + for (i = 0; i < nr; i++, address += PAGE_SIZE, ptep++) + __ptep_set_wrprotect(mm, address, ptep); +} + #ifdef CONFIG_TRANSPARENT_HUGEPAGE #define __HAVE_ARCH_PMDP_SET_WRPROTECT static inline void pmdp_set_wrprotect(struct mm_struct *mm, unsigned long address, pmd_t *pmdp) { - ptep_set_wrprotect(mm, address, (pte_t *)pmdp); + __ptep_set_wrprotect(mm, address, (pte_t *)pmdp); } #define pmdp_establish pmdp_establish @@ -1078,7 +1125,7 @@ static inline void arch_swap_restore(swp_entry_t entry, struct folio *folio) #endif /* CONFIG_ARM64_MTE */ /* - * On AArch64, the cache coherency is handled via the set_pte_at() function. + * On AArch64, the cache coherency is handled via the __set_ptes() function. */ static inline void update_mmu_cache_range(struct vm_fault *vmf, struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, @@ -1130,6 +1177,282 @@ extern pte_t ptep_modify_prot_start(struct vm_area_struct *vma, extern void ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, pte_t old_pte, pte_t new_pte); + +#ifdef CONFIG_ARM64_CONTPTE + +/* + * The contpte APIs are used to transparently manage the contiguous bit in ptes + * where it is possible and makes sense to do so. The PTE_CONT bit is considered + * a private implementation detail of the public ptep API (see below). + */ +extern void __contpte_try_fold(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t pte); +extern void __contpte_try_unfold(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t pte); +extern pte_t contpte_ptep_get(pte_t *ptep, pte_t orig_pte); +extern pte_t contpte_ptep_get_lockless(pte_t *orig_ptep); +extern void contpte_set_ptes(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t pte, unsigned int nr); +extern void contpte_clear_full_ptes(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, unsigned int nr, int full); +extern pte_t contpte_get_and_clear_full_ptes(struct mm_struct *mm, + unsigned long addr, pte_t *ptep, + unsigned int nr, int full); +extern int contpte_ptep_test_and_clear_young(struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep); +extern int contpte_ptep_clear_flush_young(struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep); +extern void contpte_wrprotect_ptes(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, unsigned int nr); +extern int contpte_ptep_set_access_flags(struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep, + pte_t entry, int dirty); + +static __always_inline void contpte_try_fold(struct mm_struct *mm, + unsigned long addr, pte_t *ptep, pte_t pte) +{ + /* + * Only bother trying if both the virtual and physical addresses are + * aligned and correspond to the last entry in a contig range. The core + * code mostly modifies ranges from low to high, so this is likely + * the last modification in the contig range, so a good time to fold. + * We can't fold special mappings, because there is no associated folio.
+ */ + + const unsigned long contmask = CONT_PTES - 1; + bool valign = ((addr >> PAGE_SHIFT) & contmask) == contmask; + + if (unlikely(valign)) { + bool palign = (pte_pfn(pte) & contmask) == contmask; + + if (unlikely(palign && + pte_valid(pte) && !pte_cont(pte) && !pte_special(pte))) + __contpte_try_fold(mm, addr, ptep, pte); + } +} + +static __always_inline void contpte_try_unfold(struct mm_struct *mm, + unsigned long addr, pte_t *ptep, pte_t pte) +{ + if (unlikely(pte_valid_cont(pte))) + __contpte_try_unfold(mm, addr, ptep, pte); +} + +#define pte_batch_hint pte_batch_hint +static inline unsigned int pte_batch_hint(pte_t *ptep, pte_t pte) +{ + if (!pte_valid_cont(pte)) + return 1; + + return CONT_PTES - (((unsigned long)ptep >> 3) & (CONT_PTES - 1)); } + +/* + * The below functions constitute the public API that arm64 presents to the + * core-mm to manipulate PTE entries within their page tables (or at least this + * is the subset of the API that arm64 needs to implement). These public + * versions will automatically and transparently apply the contiguous bit where + * it makes sense to do so. Therefore any users that are contig-aware (e.g. + * hugetlb, kernel mapper) should NOT use these APIs, but instead use the + * private versions, which are prefixed with double underscore. All of these + * APIs except for ptep_get_lockless() are expected to be called with the PTL + * held. Although the contiguous bit is considered private to the + * implementation, it is deliberately allowed to leak through the getters (e.g. + * ptep_get()), back to core code. This is required so that pte_leaf_size() can + * provide an accurate size for perf_get_pgtable_size(). But this leakage means + * it's possible a pte will be passed to a setter with the contiguous bit set, so + * we explicitly clear the contiguous bit in those cases to prevent accidentally + * setting it in the pgtable. + */ + +#define ptep_get ptep_get +static inline pte_t ptep_get(pte_t *ptep) +{ + pte_t pte = __ptep_get(ptep); + + if (likely(!pte_valid_cont(pte))) + return pte; + + return contpte_ptep_get(ptep, pte); +} + +#define ptep_get_lockless ptep_get_lockless +static inline pte_t ptep_get_lockless(pte_t *ptep) +{ + pte_t pte = __ptep_get(ptep); + + if (likely(!pte_valid_cont(pte))) + return pte; + + return contpte_ptep_get_lockless(ptep); +} + +static inline void set_pte(pte_t *ptep, pte_t pte) +{ + /* + * We don't have the mm or vaddr so cannot unfold contig entries (since + * it requires tlb maintenance). set_pte() is not used in core code, so + * this should never even be called. Regardless, do our best to service + * any call and emit a warning if there is any attempt to set a pte on + * top of an existing contig range.
+ */ + pte_t orig_pte = __ptep_get(ptep); + + WARN_ON_ONCE(pte_valid_cont(orig_pte)); + __set_pte(ptep, pte_mknoncont(pte)); +} + +#define set_ptes set_ptes +static __always_inline void set_ptes(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t pte, unsigned int nr) +{ + pte = pte_mknoncont(pte); + + if (likely(nr == 1)) { + contpte_try_unfold(mm, addr, ptep, __ptep_get(ptep)); + __set_ptes(mm, addr, ptep, pte, 1); + contpte_try_fold(mm, addr, ptep, pte); + } else { + contpte_set_ptes(mm, addr, ptep, pte, nr); + } +} + +static inline void pte_clear(struct mm_struct *mm, + unsigned long addr, pte_t *ptep) +{ + contpte_try_unfold(mm, addr, ptep, __ptep_get(ptep)); + __pte_clear(mm, addr, ptep); +} + +#define clear_full_ptes clear_full_ptes +static inline void clear_full_ptes(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, unsigned int nr, int full) +{ + if (likely(nr == 1)) { + contpte_try_unfold(mm, addr, ptep, __ptep_get(ptep)); + __clear_full_ptes(mm, addr, ptep, nr, full); + } else { + contpte_clear_full_ptes(mm, addr, ptep, nr, full); + } +} + +#define get_and_clear_full_ptes get_and_clear_full_ptes +static inline pte_t get_and_clear_full_ptes(struct mm_struct *mm, + unsigned long addr, pte_t *ptep, + unsigned int nr, int full) +{ + pte_t pte; + + if (likely(nr == 1)) { + contpte_try_unfold(mm, addr, ptep, __ptep_get(ptep)); + pte = __get_and_clear_full_ptes(mm, addr, ptep, nr, full); + } else { + pte = contpte_get_and_clear_full_ptes(mm, addr, ptep, nr, full); + } + + return pte; +} + +#define __HAVE_ARCH_PTEP_GET_AND_CLEAR +static inline pte_t ptep_get_and_clear(struct mm_struct *mm, + unsigned long addr, pte_t *ptep) +{ + contpte_try_unfold(mm, addr, ptep, __ptep_get(ptep)); + return __ptep_get_and_clear(mm, addr, ptep); +} + +#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG +static inline int ptep_test_and_clear_young(struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep) +{ + pte_t orig_pte = __ptep_get(ptep); + + if (likely(!pte_valid_cont(orig_pte))) + return __ptep_test_and_clear_young(vma, addr, ptep); + + return contpte_ptep_test_and_clear_young(vma, addr, ptep); +} + +#define __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH +static inline int ptep_clear_flush_young(struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep) +{ + pte_t orig_pte = __ptep_get(ptep); + + if (likely(!pte_valid_cont(orig_pte))) + return __ptep_clear_flush_young(vma, addr, ptep); + + return contpte_ptep_clear_flush_young(vma, addr, ptep); +} + +#define wrprotect_ptes wrprotect_ptes +static __always_inline void wrprotect_ptes(struct mm_struct *mm, + unsigned long addr, pte_t *ptep, unsigned int nr) +{ + if (likely(nr == 1)) { + /* + * Optimization: wrprotect_ptes() can only be called for present + * ptes so we only need to check contig bit as condition for + * unfold, and we can remove the contig bit from the pte we read + * to avoid re-reading. This speeds up fork() which is sensitive + * for order-0 folios. Equivalent to contpte_try_unfold(). 
+ */ + pte_t orig_pte = __ptep_get(ptep); + + if (unlikely(pte_cont(orig_pte))) { + __contpte_try_unfold(mm, addr, ptep, orig_pte); + orig_pte = pte_mknoncont(orig_pte); + } + ___ptep_set_wrprotect(mm, addr, ptep, orig_pte); + } else { + contpte_wrprotect_ptes(mm, addr, ptep, nr); + } +} + +#define __HAVE_ARCH_PTEP_SET_WRPROTECT +static inline void ptep_set_wrprotect(struct mm_struct *mm, + unsigned long addr, pte_t *ptep) +{ + wrprotect_ptes(mm, addr, ptep, 1); +} + +#define __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS +static inline int ptep_set_access_flags(struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep, + pte_t entry, int dirty) +{ + pte_t orig_pte = __ptep_get(ptep); + + entry = pte_mknoncont(entry); + + if (likely(!pte_valid_cont(orig_pte))) + return __ptep_set_access_flags(vma, addr, ptep, entry, dirty); + + return contpte_ptep_set_access_flags(vma, addr, ptep, entry, dirty); +} + +#else /* CONFIG_ARM64_CONTPTE */ + +#define ptep_get __ptep_get +#define set_pte __set_pte +#define set_ptes __set_ptes +#define pte_clear __pte_clear +#define clear_full_ptes __clear_full_ptes +#define get_and_clear_full_ptes __get_and_clear_full_ptes +#define __HAVE_ARCH_PTEP_GET_AND_CLEAR +#define ptep_get_and_clear __ptep_get_and_clear +#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG +#define ptep_test_and_clear_young __ptep_test_and_clear_young +#define __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH +#define ptep_clear_flush_young __ptep_clear_flush_young +#define __HAVE_ARCH_PTEP_SET_WRPROTECT +#define ptep_set_wrprotect __ptep_set_wrprotect +#define wrprotect_ptes __wrprotect_ptes +#define __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS +#define ptep_set_access_flags __ptep_set_access_flags + +#endif /* CONFIG_ARM64_CONTPTE */ + #endif /* !__ASSEMBLY__ */ #endif /* __ASM_PGTABLE_H */ diff --git a/arch/arm64/include/asm/tlbflush.h b/arch/arm64/include/asm/tlbflush.h index b149cf9f91bc965d53f97689a83ec653e8278a8d..2e7a9b70ea6e1d2f495bc75e078c13a94572413a 100644 --- a/arch/arm64/include/asm/tlbflush.h +++ b/arch/arm64/include/asm/tlbflush.h @@ -400,7 +400,7 @@ do { \ #define __flush_s2_tlb_range_op(op, start, pages, stride, tlb_level) \ __flush_tlb_range_op(op, start, pages, stride, 0, tlb_level, false) -static inline void __flush_tlb_range(struct vm_area_struct *vma, +static inline void __flush_tlb_range_nosync(struct vm_area_struct *vma, unsigned long start, unsigned long end, unsigned long stride, bool last_level, int tlb_level) @@ -432,10 +432,19 @@ static inline void __flush_tlb_range(struct vm_area_struct *vma, else __flush_tlb_range_op(vae1is, start, pages, stride, asid, tlb_level, true); - dsb(ish); mmu_notifier_arch_invalidate_secondary_tlbs(vma->vm_mm, start, end); } +static inline void __flush_tlb_range(struct vm_area_struct *vma, + unsigned long start, unsigned long end, + unsigned long stride, bool last_level, + int tlb_level) +{ + __flush_tlb_range_nosync(vma, start, end, stride, + last_level, tlb_level); + dsb(ish); +} + static inline void flush_tlb_range(struct vm_area_struct *vma, unsigned long start, unsigned long end) { diff --git a/arch/arm64/kernel/efi.c b/arch/arm64/kernel/efi.c index 2b478ca356b00f1f71c11d40e6bb6eb66cf9b173..89d104c0bce656baa0a3140f538bb7d5f456fe3e 100644 --- a/arch/arm64/kernel/efi.c +++ b/arch/arm64/kernel/efi.c @@ -107,7 +107,7 @@ static int __init set_permissions(pte_t *ptep, unsigned long addr, void *data) { struct set_perm_data *spd = data; const efi_memory_desc_t *md = spd->md; - pte_t pte = READ_ONCE(*ptep); + pte_t pte = __ptep_get(ptep); if (md->attribute & EFI_MEMORY_RO) pte 
= set_pte_bit(pte, __pgprot(PTE_RDONLY)); @@ -116,7 +116,7 @@ static int __init set_permissions(pte_t *ptep, unsigned long addr, void *data) else if (IS_ENABLED(CONFIG_ARM64_BTI_KERNEL) && system_supports_bti() && spd->has_bti) pte = set_pte_bit(pte, __pgprot(PTE_GP)); - set_pte(ptep, pte); + __set_pte(ptep, pte); return 0; } diff --git a/arch/arm64/kernel/mte.c b/arch/arm64/kernel/mte.c index 2fb5e7a7a4d5e27e64a969911b080972f11ea0df..cea96ee75d22d1cb1f3e63e3beaedca710322e38 100644 --- a/arch/arm64/kernel/mte.c +++ b/arch/arm64/kernel/mte.c @@ -67,7 +67,7 @@ int memcmp_pages(struct page *page1, struct page *page2) /* * If the page content is identical but at least one of the pages is * tagged, return non-zero to avoid KSM merging. If only one of the - * pages is tagged, set_pte_at() may zero or change the tags of the + * pages is tagged, __set_ptes() may zero or change the tags of the * other page via mte_sync_tags(). */ if (page_mte_tagged(page1) || page_mte_tagged(page2)) diff --git a/arch/arm64/kvm/guest.c b/arch/arm64/kvm/guest.c index 89e33337ade91be2385f366272af6dd54c83b7fc..3ebbb81b15683368cc6362e7a99cd6c1c2a7ed35 100644 --- a/arch/arm64/kvm/guest.c +++ b/arch/arm64/kvm/guest.c @@ -1110,7 +1110,7 @@ int kvm_vm_ioctl_mte_copy_tags(struct kvm *kvm, } else { /* * Only locking to serialise with a concurrent - * set_pte_at() in the VMM but still overriding the + * __set_ptes() in the VMM but still overriding the * tags, hence ignoring the return value. */ try_page_mte_tagging(page); diff --git a/arch/arm64/mm/Makefile b/arch/arm64/mm/Makefile index dbd1bc95967d00d364e27e27c1f5fff2bc48e81f..60454256945b8b3ff47bdb609b51b3d5ad9b0572 100644 --- a/arch/arm64/mm/Makefile +++ b/arch/arm64/mm/Makefile @@ -3,6 +3,7 @@ obj-y := dma-mapping.o extable.o fault.o init.o \ cache.o copypage.o flush.o \ ioremap.o mmap.o pgd.o mmu.o \ context.o proc.o pageattr.o fixmap.o +obj-$(CONFIG_ARM64_CONTPTE) += contpte.o obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o obj-$(CONFIG_PTDUMP_CORE) += ptdump.o obj-$(CONFIG_PTDUMP_DEBUGFS) += ptdump_debugfs.o diff --git a/arch/arm64/mm/contpte.c b/arch/arm64/mm/contpte.c new file mode 100644 index 0000000000000000000000000000000000000000..16788f07716d58a1b92221c1ce781c77d025bc33 --- /dev/null +++ b/arch/arm64/mm/contpte.c @@ -0,0 +1,404 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2023 ARM Ltd. + */ + +#include +#include +#include +#include + +static inline bool mm_is_user(struct mm_struct *mm) +{ + /* + * Don't attempt to apply the contig bit to kernel mappings, because + * dynamically adding/removing the contig bit can cause page faults. + * These racing faults are ok for user space, since they get serialized + * on the PTL. But kernel mappings can't tolerate faults. + */ + if (unlikely(mm_is_efi(mm))) + return false; + return mm != &init_mm; +} + +static inline pte_t *contpte_align_down(pte_t *ptep) +{ + return PTR_ALIGN_DOWN(ptep, sizeof(*ptep) * CONT_PTES); +} + +static void contpte_try_unfold_partial(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, unsigned int nr) +{ + /* + * Unfold any partially covered contpte block at the beginning and end + * of the range. 
+ */ + + if (ptep != contpte_align_down(ptep) || nr < CONT_PTES) + contpte_try_unfold(mm, addr, ptep, __ptep_get(ptep)); + + if (ptep + nr != contpte_align_down(ptep + nr)) { + unsigned long last_addr = addr + PAGE_SIZE * (nr - 1); + pte_t *last_ptep = ptep + nr - 1; + + contpte_try_unfold(mm, last_addr, last_ptep, + __ptep_get(last_ptep)); + } +} + +static void contpte_convert(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t pte) +{ + struct vm_area_struct vma = TLB_FLUSH_VMA(mm, 0); + unsigned long start_addr; + pte_t *start_ptep; + int i; + + start_ptep = ptep = contpte_align_down(ptep); + start_addr = addr = ALIGN_DOWN(addr, CONT_PTE_SIZE); + pte = pfn_pte(ALIGN_DOWN(pte_pfn(pte), CONT_PTES), pte_pgprot(pte)); + + for (i = 0; i < CONT_PTES; i++, ptep++, addr += PAGE_SIZE) { + pte_t ptent = __ptep_get_and_clear(mm, addr, ptep); + + if (pte_dirty(ptent)) + pte = pte_mkdirty(pte); + + if (pte_young(ptent)) + pte = pte_mkyoung(pte); + } + + __flush_tlb_range(&vma, start_addr, addr, PAGE_SIZE, true, 3); + + __set_ptes(mm, start_addr, start_ptep, pte, CONT_PTES); +} + +void __contpte_try_fold(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t pte) +{ + /* + * We have already checked that the virtual and physical addresses are + * correctly aligned for a contpte mapping in contpte_try_fold() so the + * remaining checks are to ensure that the contpte range is fully + * covered by a single folio, and ensure that all the ptes are valid + * with contiguous PFNs and matching prots. We ignore the state of the + * access and dirty bits for the purpose of deciding if it's a contiguous + * range; the folding process will generate a single contpte entry which + * has a single access and dirty bit. Those 2 bits are the logical OR of + * their respective bits in the constituent pte entries. In order to + * ensure the contpte range is covered by a single folio, we must + * recover the folio from the pfn, but special mappings don't have a + * folio backing them. Fortunately contpte_try_fold() already checked + * that the pte is not special - we never try to fold special mappings. + * Note we can't use vm_normal_page() for this since we don't have the + * vma. + */ + + unsigned long folio_start, folio_end; + unsigned long cont_start, cont_end; + pte_t expected_pte, subpte; + struct folio *folio; + struct page *page; + unsigned long pfn; + pte_t *orig_ptep; + pgprot_t prot; + + int i; + + if (!mm_is_user(mm)) + return; + + page = pte_page(pte); + folio = page_folio(page); + folio_start = addr - (page - &folio->page) * PAGE_SIZE; + folio_end = folio_start + folio_nr_pages(folio) * PAGE_SIZE; + cont_start = ALIGN_DOWN(addr, CONT_PTE_SIZE); + cont_end = cont_start + CONT_PTE_SIZE; + + if (folio_start > cont_start || folio_end < cont_end) + return; + + pfn = ALIGN_DOWN(pte_pfn(pte), CONT_PTES); + prot = pte_pgprot(pte_mkold(pte_mkclean(pte))); + expected_pte = pfn_pte(pfn, prot); + orig_ptep = ptep; + ptep = contpte_align_down(ptep); + + for (i = 0; i < CONT_PTES; i++) { + subpte = pte_mkold(pte_mkclean(__ptep_get(ptep))); + if (!pte_same(subpte, expected_pte)) + return; + expected_pte = pte_advance_pfn(expected_pte, 1); + ptep++; + } + + pte = pte_mkcont(pte); + contpte_convert(mm, addr, orig_ptep, pte); +} +EXPORT_SYMBOL(__contpte_try_fold); + +void __contpte_try_unfold(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t pte) +{ + /* + * We have already checked that the ptes are contiguous in + * contpte_try_unfold(), so just check that the mm is user space.
+ */ + if (!mm_is_user(mm)) + return; + + pte = pte_mknoncont(pte); + contpte_convert(mm, addr, ptep, pte); +} +EXPORT_SYMBOL(__contpte_try_unfold); + +pte_t contpte_ptep_get(pte_t *ptep, pte_t orig_pte) +{ + /* + * Gather access/dirty bits, which may be populated in any of the ptes + * of the contig range. We are guaranteed to be holding the PTL, so any + * contiguous range cannot be unfolded or otherwise modified under our + * feet. + */ + + pte_t pte; + int i; + + ptep = contpte_align_down(ptep); + + for (i = 0; i < CONT_PTES; i++, ptep++) { + pte = __ptep_get(ptep); + + if (pte_dirty(pte)) + orig_pte = pte_mkdirty(orig_pte); + + if (pte_young(pte)) + orig_pte = pte_mkyoung(orig_pte); + } + + return orig_pte; +} +EXPORT_SYMBOL(contpte_ptep_get); + +pte_t contpte_ptep_get_lockless(pte_t *orig_ptep) +{ + /* + * Gather access/dirty bits, which may be populated in any of the ptes + * of the contig range. We may not be holding the PTL, so any contiguous + * range may be unfolded/modified/refolded under our feet. Therefore we + * ensure we read a _consistent_ contpte range by checking that all ptes + * in the range are valid and have CONT_PTE set, that all pfns are + * contiguous and that all pgprots are the same (ignoring access/dirty). + * If we find a pte that is not consistent, then we must be racing with + * an update so start again. If the target pte does not have CONT_PTE + * set then that is considered consistent on its own because it is not + * part of a contpte range. + */ + + pgprot_t orig_prot; + unsigned long pfn; + pte_t orig_pte; + pgprot_t prot; + pte_t *ptep; + pte_t pte; + int i; + +retry: + orig_pte = __ptep_get(orig_ptep); + + if (!pte_valid_cont(orig_pte)) + return orig_pte; + + orig_prot = pte_pgprot(pte_mkold(pte_mkclean(orig_pte))); + ptep = contpte_align_down(orig_ptep); + pfn = pte_pfn(orig_pte) - (orig_ptep - ptep); + + for (i = 0; i < CONT_PTES; i++, ptep++, pfn++) { + pte = __ptep_get(ptep); + prot = pte_pgprot(pte_mkold(pte_mkclean(pte))); + + if (!pte_valid_cont(pte) || + pte_pfn(pte) != pfn || + pgprot_val(prot) != pgprot_val(orig_prot)) + goto retry; + + if (pte_dirty(pte)) + orig_pte = pte_mkdirty(orig_pte); + + if (pte_young(pte)) + orig_pte = pte_mkyoung(orig_pte); + } + + return orig_pte; +} +EXPORT_SYMBOL(contpte_ptep_get_lockless); + +void contpte_set_ptes(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t pte, unsigned int nr) +{ + unsigned long next; + unsigned long end; + unsigned long pfn; + pgprot_t prot; + + /* + * The set_ptes() spec guarantees that when nr > 1, the initial state of + * all ptes is not-present. Therefore we never need to unfold or + * otherwise invalidate a range before we set the new ptes. + * contpte_set_ptes() should never be called for nr < 2. 
+ */ + VM_WARN_ON(nr == 1); + + if (!mm_is_user(mm)) + return __set_ptes(mm, addr, ptep, pte, nr); + + end = addr + (nr << PAGE_SHIFT); + pfn = pte_pfn(pte); + prot = pte_pgprot(pte); + + do { + next = pte_cont_addr_end(addr, end); + nr = (next - addr) >> PAGE_SHIFT; + pte = pfn_pte(pfn, prot); + + if (((addr | next | (pfn << PAGE_SHIFT)) & ~CONT_PTE_MASK) == 0) + pte = pte_mkcont(pte); + else + pte = pte_mknoncont(pte); + + __set_ptes(mm, addr, ptep, pte, nr); + + addr = next; + ptep += nr; + pfn += nr; + + } while (addr != end); +} +EXPORT_SYMBOL(contpte_set_ptes); + +void contpte_clear_full_ptes(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, unsigned int nr, int full) +{ + contpte_try_unfold_partial(mm, addr, ptep, nr); + __clear_full_ptes(mm, addr, ptep, nr, full); +} +EXPORT_SYMBOL(contpte_clear_full_ptes); + +pte_t contpte_get_and_clear_full_ptes(struct mm_struct *mm, + unsigned long addr, pte_t *ptep, + unsigned int nr, int full) +{ + contpte_try_unfold_partial(mm, addr, ptep, nr); + return __get_and_clear_full_ptes(mm, addr, ptep, nr, full); +} +EXPORT_SYMBOL(contpte_get_and_clear_full_ptes); + +int contpte_ptep_test_and_clear_young(struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep) +{ + /* + * ptep_clear_flush_young() technically requires us to clear the access + * flag for a _single_ pte. However, the core-mm code actually tracks + * access/dirty per folio, not per page. And since we only create a + * contig range when the range is covered by a single folio, we can get + * away with clearing young for the whole contig range here, so we avoid + * having to unfold. + */ + + int young = 0; + int i; + + ptep = contpte_align_down(ptep); + addr = ALIGN_DOWN(addr, CONT_PTE_SIZE); + + for (i = 0; i < CONT_PTES; i++, ptep++, addr += PAGE_SIZE) + young |= __ptep_test_and_clear_young(vma, addr, ptep); + + return young; +} +EXPORT_SYMBOL(contpte_ptep_test_and_clear_young); + +int contpte_ptep_clear_flush_young(struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep) +{ + int young; + + young = contpte_ptep_test_and_clear_young(vma, addr, ptep); + + if (young) { + /* + * See comment in __ptep_clear_flush_young(); same rationale for + * eliding the trailing DSB applies here. + */ + addr = ALIGN_DOWN(addr, CONT_PTE_SIZE); + __flush_tlb_range_nosync(vma, addr, addr + CONT_PTE_SIZE, + PAGE_SIZE, true, 3); + } + + return young; +} +EXPORT_SYMBOL(contpte_ptep_clear_flush_young); + +void contpte_wrprotect_ptes(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, unsigned int nr) +{ + /* + * If wrprotecting an entire contig range, we can avoid unfolding. Just + * set wrprotect and wait for the later mmu_gather flush to invalidate + * the tlb. Until the flush, the page may or may not be wrprotected. + * After the flush, it is guaranteed wrprotected. If it's a partial + * range though, we must unfold, because we can't have a case where + * CONT_PTE is set but wrprotect applies to a subset of the PTEs; this + * would cause it to continue to be unpredictable after the flush. + */ + + contpte_try_unfold_partial(mm, addr, ptep, nr); + __wrprotect_ptes(mm, addr, ptep, nr); +} +EXPORT_SYMBOL(contpte_wrprotect_ptes); + +int contpte_ptep_set_access_flags(struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep, + pte_t entry, int dirty) +{ + unsigned long start_addr; + pte_t orig_pte; + int i; + + /* + * Gather the access/dirty bits for the contiguous range. If nothing has + * changed, it's a noop.
+ */ + orig_pte = pte_mknoncont(ptep_get(ptep)); + if (pte_val(orig_pte) == pte_val(entry)) + return 0; + + /* + * We can fix up access/dirty bits without having to unfold the contig + * range. But if the write bit is changing, we must unfold. + */ + if (pte_write(orig_pte) == pte_write(entry)) { + /* + * For HW access management, we technically only need to update + * the flag on a single pte in the range. But for SW access + * management, we need to update all the ptes to prevent extra + * faults. Avoid per-page tlb flush in __ptep_set_access_flags() + * and instead flush the whole range at the end. + */ + ptep = contpte_align_down(ptep); + start_addr = addr = ALIGN_DOWN(addr, CONT_PTE_SIZE); + + for (i = 0; i < CONT_PTES; i++, ptep++, addr += PAGE_SIZE) + __ptep_set_access_flags(vma, addr, ptep, entry, 0); + + if (dirty) + __flush_tlb_range(vma, start_addr, addr, + PAGE_SIZE, true, 3); + } else { + __contpte_try_unfold(vma->vm_mm, addr, ptep, orig_pte); + __ptep_set_access_flags(vma, addr, ptep, entry, dirty); + } + + return 1; +} +EXPORT_SYMBOL(contpte_ptep_set_access_flags); diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c index c376e58e8cf039f6321259956314b04199cb2f3e..7d304e0132b7d5e097c924fde8b9e7ea9e4035fc 100644 --- a/arch/arm64/mm/fault.c +++ b/arch/arm64/mm/fault.c @@ -217,7 +217,7 @@ static void show_pte(unsigned long addr) if (!ptep) break; - pte = READ_ONCE(*ptep); + pte = __ptep_get(ptep); pr_cont(", pte=%016llx", pte_val(pte)); pte_unmap(ptep); } while(0); @@ -231,16 +231,16 @@ static void show_pte(unsigned long addr) * * It needs to cope with hardware update of the accessed/dirty state by other * agents in the system and can safely skip the __sync_icache_dcache() call as, - * like set_pte_at(), the PTE is never changed from no-exec to exec here. + * like __set_ptes(), the PTE is never changed from no-exec to exec here. * * Returns whether or not the PTE actually changed. 
*/ -int ptep_set_access_flags(struct vm_area_struct *vma, - unsigned long address, pte_t *ptep, - pte_t entry, int dirty) +int __ptep_set_access_flags(struct vm_area_struct *vma, + unsigned long address, pte_t *ptep, + pte_t entry, int dirty) { pteval_t old_pteval, pteval; - pte_t pte = READ_ONCE(*ptep); + pte_t pte = __ptep_get(ptep); if (pte_same(pte, entry)) return 0; diff --git a/arch/arm64/mm/fixmap.c b/arch/arm64/mm/fixmap.c index c0a3301203bdf7070c33a60d126a40b556686c1e..bfc02568805aea999138fef0ef63c983c2fffaa9 100644 --- a/arch/arm64/mm/fixmap.c +++ b/arch/arm64/mm/fixmap.c @@ -121,9 +121,9 @@ void __set_fixmap(enum fixed_addresses idx, ptep = fixmap_pte(addr); if (pgprot_val(flags)) { - set_pte(ptep, pfn_pte(phys >> PAGE_SHIFT, flags)); + __set_pte(ptep, pfn_pte(phys >> PAGE_SHIFT, flags)); } else { - pte_clear(&init_mm, addr, ptep); + __pte_clear(&init_mm, addr, ptep); flush_tlb_kernel_range(addr, addr+PAGE_SIZE); } } diff --git a/arch/arm64/mm/hugetlbpage.c b/arch/arm64/mm/hugetlbpage.c index 13fd592228b188658bd25011b66fd0420ca44d66..b1e8d8f8dae4e169fe531e8c68969a41d0aeaa05 100644 --- a/arch/arm64/mm/hugetlbpage.c +++ b/arch/arm64/mm/hugetlbpage.c @@ -152,14 +152,14 @@ pte_t huge_ptep_get(pte_t *ptep) { int ncontig, i; size_t pgsize; - pte_t orig_pte = ptep_get(ptep); + pte_t orig_pte = __ptep_get(ptep); if (!pte_present(orig_pte) || !pte_cont(orig_pte)) return orig_pte; ncontig = num_contig_ptes(page_size(pte_page(orig_pte)), &pgsize); for (i = 0; i < ncontig; i++, ptep++) { - pte_t pte = ptep_get(ptep); + pte_t pte = __ptep_get(ptep); if (pte_dirty(pte)) orig_pte = pte_mkdirty(orig_pte); @@ -184,11 +184,11 @@ static pte_t get_clear_contig(struct mm_struct *mm, unsigned long pgsize, unsigned long ncontig) { - pte_t orig_pte = ptep_get(ptep); + pte_t orig_pte = __ptep_get(ptep); unsigned long i; for (i = 0; i < ncontig; i++, addr += pgsize, ptep++) { - pte_t pte = ptep_get_and_clear(mm, addr, ptep); + pte_t pte = __ptep_get_and_clear(mm, addr, ptep); /* * If HW_AFDBM is enabled, then the HW could turn on @@ -236,7 +236,7 @@ static void clear_flush(struct mm_struct *mm, unsigned long i, saddr = addr; for (i = 0; i < ncontig; i++, addr += pgsize, ptep++) - ptep_clear(mm, addr, ptep); + __ptep_get_and_clear(mm, addr, ptep); flush_tlb_range(&vma, saddr, addr); } @@ -254,12 +254,12 @@ void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, if (!pte_present(pte)) { for (i = 0; i < ncontig; i++, ptep++, addr += pgsize) - set_pte_at(mm, addr, ptep, pte); + __set_ptes(mm, addr, ptep, pte, 1); return; } if (!pte_cont(pte)) { - set_pte_at(mm, addr, ptep, pte); + __set_ptes(mm, addr, ptep, pte, 1); return; } @@ -270,7 +270,7 @@ void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, clear_flush(mm, addr, ptep, pgsize, ncontig); for (i = 0; i < ncontig; i++, ptep++, addr += pgsize, pfn += dpfn) - set_pte_at(mm, addr, ptep, pfn_pte(pfn, hugeprot)); + __set_ptes(mm, addr, ptep, pfn_pte(pfn, hugeprot), 1); } pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma, @@ -400,7 +400,7 @@ void huge_pte_clear(struct mm_struct *mm, unsigned long addr, ncontig = num_contig_ptes(sz, &pgsize); for (i = 0; i < ncontig; i++, addr += pgsize, ptep++) - pte_clear(mm, addr, ptep); + __pte_clear(mm, addr, ptep); } pte_t huge_ptep_get_and_clear(struct mm_struct *mm, @@ -408,10 +408,10 @@ pte_t huge_ptep_get_and_clear(struct mm_struct *mm, { int ncontig; size_t pgsize; - pte_t orig_pte = ptep_get(ptep); + pte_t orig_pte = __ptep_get(ptep); if (!pte_cont(orig_pte)) - return 
ptep_get_and_clear(mm, addr, ptep); + return __ptep_get_and_clear(mm, addr, ptep); ncontig = find_num_contig(mm, addr, ptep, &pgsize); @@ -431,11 +431,11 @@ static int __cont_access_flags_changed(pte_t *ptep, pte_t pte, int ncontig) { int i; - if (pte_write(pte) != pte_write(ptep_get(ptep))) + if (pte_write(pte) != pte_write(__ptep_get(ptep))) return 1; for (i = 0; i < ncontig; i++) { - pte_t orig_pte = ptep_get(ptep + i); + pte_t orig_pte = __ptep_get(ptep + i); if (pte_dirty(pte) != pte_dirty(orig_pte)) return 1; @@ -459,7 +459,7 @@ int huge_ptep_set_access_flags(struct vm_area_struct *vma, pte_t orig_pte; if (!pte_cont(pte)) - return ptep_set_access_flags(vma, addr, ptep, pte, dirty); + return __ptep_set_access_flags(vma, addr, ptep, pte, dirty); ncontig = find_num_contig(mm, addr, ptep, &pgsize); dpfn = pgsize >> PAGE_SHIFT; @@ -478,7 +478,7 @@ int huge_ptep_set_access_flags(struct vm_area_struct *vma, hugeprot = pte_pgprot(pte); for (i = 0; i < ncontig; i++, ptep++, addr += pgsize, pfn += dpfn) - set_pte_at(mm, addr, ptep, pfn_pte(pfn, hugeprot)); + __set_ptes(mm, addr, ptep, pfn_pte(pfn, hugeprot), 1); return 1; } @@ -492,8 +492,8 @@ void huge_ptep_set_wrprotect(struct mm_struct *mm, size_t pgsize; pte_t pte; - if (!pte_cont(READ_ONCE(*ptep))) { - ptep_set_wrprotect(mm, addr, ptep); + if (!pte_cont(__ptep_get(ptep))) { + __ptep_set_wrprotect(mm, addr, ptep); return; } @@ -507,7 +507,7 @@ void huge_ptep_set_wrprotect(struct mm_struct *mm, pfn = pte_pfn(pte); for (i = 0; i < ncontig; i++, ptep++, addr += pgsize, pfn += dpfn) - set_pte_at(mm, addr, ptep, pfn_pte(pfn, hugeprot)); + __set_ptes(mm, addr, ptep, pfn_pte(pfn, hugeprot), 1); } pte_t huge_ptep_clear_flush(struct vm_area_struct *vma, @@ -517,7 +517,7 @@ pte_t huge_ptep_clear_flush(struct vm_area_struct *vma, size_t pgsize; int ncontig; - if (!pte_cont(READ_ONCE(*ptep))) + if (!pte_cont(__ptep_get(ptep))) return ptep_clear_flush(vma, addr, ptep); ncontig = find_num_contig(mm, addr, ptep, &pgsize); @@ -551,7 +551,7 @@ pte_t huge_ptep_modify_prot_start(struct vm_area_struct *vma, unsigned long addr * when the permission changes from executable to non-executable * in cases where cpu is affected with errata #2645198. */ - if (pte_user_exec(READ_ONCE(*ptep))) + if (pte_user_exec(__ptep_get(ptep))) return huge_ptep_clear_flush(vma, addr, ptep); } return huge_ptep_get_and_clear(vma->vm_mm, addr, ptep); diff --git a/arch/arm64/mm/kasan_init.c b/arch/arm64/mm/kasan_init.c index f17d066e85eb854a06a8949da954ff704137746f..28856f511fb638189e8392fff89c2f8e29426ab2 100644 --- a/arch/arm64/mm/kasan_init.c +++ b/arch/arm64/mm/kasan_init.c @@ -112,8 +112,8 @@ static void __init kasan_pte_populate(pmd_t *pmdp, unsigned long addr, if (!early) memset(__va(page_phys), KASAN_SHADOW_INIT, PAGE_SIZE); next = addr + PAGE_SIZE; - set_pte(ptep, pfn_pte(__phys_to_pfn(page_phys), PAGE_KERNEL)); - } while (ptep++, addr = next, addr != end && pte_none(READ_ONCE(*ptep))); + __set_pte(ptep, pfn_pte(__phys_to_pfn(page_phys), PAGE_KERNEL)); + } while (ptep++, addr = next, addr != end && pte_none(__ptep_get(ptep))); } static void __init kasan_pmd_populate(pud_t *pudp, unsigned long addr, @@ -266,7 +266,7 @@ static void __init kasan_init_shadow(void) * so we should make sure that it maps the zero page read-only. 
*/ for (i = 0; i < PTRS_PER_PTE; i++) - set_pte(&kasan_early_shadow_pte[i], + __set_pte(&kasan_early_shadow_pte[i], pfn_pte(sym_to_pfn(kasan_early_shadow_page), PAGE_KERNEL_RO)); diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index 58d228de4808144f1903978aa8dbffd4dfc12c73..84fb680f69cea9cc015d1965f6292d859d497f01 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -176,16 +176,16 @@ static void init_pte(pmd_t *pmdp, unsigned long addr, unsigned long end, ptep = pte_set_fixmap_offset(pmdp, addr); do { - pte_t old_pte = READ_ONCE(*ptep); + pte_t old_pte = __ptep_get(ptep); - set_pte(ptep, pfn_pte(__phys_to_pfn(phys), prot)); + __set_pte(ptep, pfn_pte(__phys_to_pfn(phys), prot)); /* * After the PTE entry has been populated once, we * only allow updates to the permission attributes. */ BUG_ON(!pgattr_change_is_safe(pte_val(old_pte), - READ_ONCE(pte_val(*ptep)))); + pte_val(__ptep_get(ptep)))); phys += PAGE_SIZE; } while (ptep++, addr += PAGE_SIZE, addr != end); @@ -859,12 +859,12 @@ static void unmap_hotplug_pte_range(pmd_t *pmdp, unsigned long addr, do { ptep = pte_offset_kernel(pmdp, addr); - pte = READ_ONCE(*ptep); + pte = __ptep_get(ptep); if (pte_none(pte)) continue; WARN_ON(!pte_present(pte)); - pte_clear(&init_mm, addr, ptep); + __pte_clear(&init_mm, addr, ptep); flush_tlb_kernel_range(addr, addr + PAGE_SIZE); if (free_mapped) free_hotplug_page_range(pte_page(pte), @@ -992,7 +992,7 @@ static void free_empty_pte_table(pmd_t *pmdp, unsigned long addr, do { ptep = pte_offset_kernel(pmdp, addr); - pte = READ_ONCE(*ptep); + pte = __ptep_get(ptep); /* * This is just a sanity check here which verifies that @@ -1011,7 +1011,7 @@ static void free_empty_pte_table(pmd_t *pmdp, unsigned long addr, */ ptep = pte_offset_kernel(pmdp, 0UL); for (i = 0; i < PTRS_PER_PTE; i++) { - if (!pte_none(READ_ONCE(ptep[i]))) + if (!pte_none(__ptep_get(&ptep[i]))) return; } @@ -1481,7 +1481,7 @@ pte_t ptep_modify_prot_start(struct vm_area_struct *vma, unsigned long addr, pte * when the permission changes from executable to non-executable * in cases where cpu is affected with errata #2645198. 
*/ - if (pte_user_exec(READ_ONCE(*ptep))) + if (pte_user_exec(ptep_get(ptep))) return ptep_clear_flush(vma, addr, ptep); } return ptep_get_and_clear(vma->vm_mm, addr, ptep); diff --git a/arch/arm64/mm/pageattr.c b/arch/arm64/mm/pageattr.c index 924843f1f661bfe1ff5c6b8f9eff753872416040..0c4e3ecf989d434ae96b6620e72448c57e9eeb67 100644 --- a/arch/arm64/mm/pageattr.c +++ b/arch/arm64/mm/pageattr.c @@ -36,12 +36,12 @@ bool can_set_direct_map(void) static int change_page_range(pte_t *ptep, unsigned long addr, void *data) { struct page_change_data *cdata = data; - pte_t pte = READ_ONCE(*ptep); + pte_t pte = __ptep_get(ptep); pte = clear_pte_bit(pte, cdata->clear_mask); pte = set_pte_bit(pte, cdata->set_mask); - set_pte(ptep, pte); + __set_pte(ptep, pte); return 0; } @@ -245,5 +245,5 @@ bool kernel_page_present(struct page *page) return true; ptep = pte_offset_kernel(pmdp, addr); - return pte_valid(READ_ONCE(*ptep)); + return pte_valid(__ptep_get(ptep)); } diff --git a/arch/arm64/mm/trans_pgd.c b/arch/arm64/mm/trans_pgd.c index 7b14df3c64776f4c61a81aa85af317344e741df2..5139a28130c0888555395d574c2b1373e840f014 100644 --- a/arch/arm64/mm/trans_pgd.c +++ b/arch/arm64/mm/trans_pgd.c @@ -33,7 +33,7 @@ static void *trans_alloc(struct trans_pgd_info *info) static void _copy_pte(pte_t *dst_ptep, pte_t *src_ptep, unsigned long addr) { - pte_t pte = READ_ONCE(*src_ptep); + pte_t pte = __ptep_get(src_ptep); if (pte_valid(pte)) { /* @@ -41,7 +41,7 @@ static void _copy_pte(pte_t *dst_ptep, pte_t *src_ptep, unsigned long addr) * read only (code, rodata). Clear the RDONLY bit from * the temporary mappings we use during restore. */ - set_pte(dst_ptep, pte_mkwrite_novma(pte)); + __set_pte(dst_ptep, pte_mkwrite_novma(pte)); } else if ((debug_pagealloc_enabled() || is_kfence_address((void *)addr)) && !pte_none(pte)) { /* @@ -55,7 +55,7 @@ static void _copy_pte(pte_t *dst_ptep, pte_t *src_ptep, unsigned long addr) */ BUG_ON(!pfn_valid(pte_pfn(pte))); - set_pte(dst_ptep, pte_mkpresent(pte_mkwrite_novma(pte))); + __set_pte(dst_ptep, pte_mkpresent(pte_mkwrite_novma(pte))); } } diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index e02b179ec6598900d56dec3196287f46b7818a5a..4a2e6e8c6fe592fccbc2d8a8cf111054f13e96a6 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -955,13 +955,13 @@ static inline int pte_same(pte_t a, pte_t b) return a.pte == b.pte; } -static inline pte_t pte_next_pfn(pte_t pte) +static inline pte_t pte_advance_pfn(pte_t pte, unsigned long nr) { if (__pte_needs_invert(pte_val(pte))) - return __pte(pte_val(pte) - (1UL << PFN_PTE_SHIFT)); - return __pte(pte_val(pte) + (1UL << PFN_PTE_SHIFT)); + return __pte(pte_val(pte) - (nr << PFN_PTE_SHIFT)); + return __pte(pte_val(pte) + (nr << PFN_PTE_SHIFT)); } -#define pte_next_pfn pte_next_pfn +#define pte_advance_pfn pte_advance_pfn static inline int pte_present(pte_t a) { diff --git a/include/linux/efi.h b/include/linux/efi.h index 4e89a4aac75aef90f52f84c5fc1160fe051d836b..9ed79128458c928eb30ea04567847cd329a61021 100644 --- a/include/linux/efi.h +++ b/include/linux/efi.h @@ -697,6 +697,11 @@ extern struct efi { extern struct mm_struct efi_mm; +static inline bool mm_is_efi(struct mm_struct *mm) +{ + return IS_ENABLED(CONFIG_EFI) && mm == &efi_mm; +} + static inline int efi_guidcmp (efi_guid_t left, efi_guid_t right) { diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index 8b9e4afe2e3553a09a25f064b5f1a3a67c58f795..87cd26a480383619545c579f09b4c410d490bfac 100644 --- a/include/linux/pgtable.h 
+++ b/include/linux/pgtable.h @@ -205,14 +205,36 @@ static inline int pmd_young(pmd_t pmd) #define arch_flush_lazy_mmu_mode() do {} while (0) #endif +#ifndef pte_batch_hint +/** + * pte_batch_hint - Number of pages that can be added to batch without scanning. + * @ptep: Page table pointer for the entry. + * @pte: Page table entry. + * + * Some architectures know that a set of contiguous ptes all map the same + * contiguous memory with the same permissions. In this case, it can provide a + * hint to aid pte batching without the core code needing to scan every pte. + * + * An architecture implementation may ignore the PTE accessed state. Further, + * the dirty state must apply atomically to all the PTEs described by the hint. + * + * May be overridden by the architecture, else pte_batch_hint is always 1. + */ +static inline unsigned int pte_batch_hint(pte_t *ptep, pte_t pte) +{ + return 1; +} +#endif -#ifndef pte_next_pfn -static inline pte_t pte_next_pfn(pte_t pte) +#ifndef pte_advance_pfn +static inline pte_t pte_advance_pfn(pte_t pte, unsigned long nr) { - return __pte(pte_val(pte) + (1UL << PFN_PTE_SHIFT)); + return __pte(pte_val(pte) + (nr << PFN_PTE_SHIFT)); } #endif +#define pte_next_pfn(pte) pte_advance_pfn(pte, 1) + #ifndef set_ptes /** * set_ptes - Map consecutive pages to a contiguous range of addresses. @@ -222,6 +244,10 @@ static inline pte_t pte_next_pfn(pte_t pte) * @pte: Page table entry for the first page. * @nr: Number of pages to map. * + * When nr==1, initial state of pte may be present or not present, and new state + * may be present or not present. When nr>1, initial state of all ptes must be + * not present, and new state must be present. + * * May be overridden by the architecture, or the architecture can define * set_pte() and PFN_PTE_SHIFT. * diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 2073693b3aa7428db77ec20d3328e454fe022068..cd1eb575f15d01cdb5b044a007fed6cd0bc54978 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2408,15 +2408,16 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, pte = pte_offset_map(&_pmd, haddr); VM_BUG_ON(!pte); - for (i = 0, addr = haddr; i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE) { - pte_t entry; - /* - * Note that NUMA hinting access restrictions are not - * transferred to avoid any possibility of altering - * permissions across VMAs. - */ - if (freeze || pmd_migration) { + + /* + * Note that NUMA hinting access restrictions are not transferred to + * avoid any possibility of altering permissions across VMAs. 
+ */ + if (freeze || pmd_migration) { + for (i = 0, addr = haddr; i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE) { + pte_t entry; swp_entry_t swp_entry; + if (write) swp_entry = make_writable_migration_entry( page_to_pfn(page + i)); @@ -2435,25 +2436,32 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, entry = pte_swp_mksoft_dirty(entry); if (uffd_wp) entry = pte_swp_mkuffd_wp(entry); - } else { - entry = mk_pte(page + i, READ_ONCE(vma->vm_page_prot)); - if (write) - entry = pte_mkwrite(entry, vma); - if (!young) - entry = pte_mkold(entry); - /* NOTE: this may set soft-dirty too on some archs */ - if (dirty) - entry = pte_mkdirty(entry); - if (soft_dirty) - entry = pte_mksoft_dirty(entry); - if (uffd_wp) - entry = pte_mkuffd_wp(entry); + + VM_WARN_ON(!pte_none(ptep_get(pte + i))); + set_pte_at(mm, addr, pte + i, entry); } - VM_BUG_ON(!pte_none(ptep_get(pte))); - set_pte_at(mm, addr, pte, entry); - pte++; + } else { + pte_t entry; + + entry = mk_pte(page, READ_ONCE(vma->vm_page_prot)); + if (write) + entry = pte_mkwrite(entry, vma); + if (!young) + entry = pte_mkold(entry); + /* NOTE: this may set soft-dirty too on some archs */ + if (dirty) + entry = pte_mkdirty(entry); + if (soft_dirty) + entry = pte_mksoft_dirty(entry); + if (uffd_wp) + entry = pte_mkuffd_wp(entry); + + for (i = 0; i < HPAGE_PMD_NR; i++) + VM_WARN_ON(!pte_none(ptep_get(pte + i))); + + set_ptes(mm, haddr, pte, entry, HPAGE_PMD_NR); } - pte_unmap(pte - 1); + pte_unmap(pte); if (!pmd_migration) folio_remove_rmap_pmd(folio, page, vma); diff --git a/mm/internal.h b/mm/internal.h index ef68eddaa5d36e63f644580843f4cba97908f42d..68c7d528b9dbafe37279ed0da60808bdd7327db1 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -83,6 +83,99 @@ static inline void *folio_raw_mapping(struct folio *folio) return (void *)(mapping & ~PAGE_MAPPING_FLAGS); } +#ifdef CONFIG_MMU + +/* Flags for folio_pte_batch(). */ +typedef int __bitwise fpb_t; + +/* Compare PTEs after pte_mkclean(), ignoring the dirty bit. */ +#define FPB_IGNORE_DIRTY ((__force fpb_t)BIT(0)) + +/* Compare PTEs after pte_clear_soft_dirty(), ignoring the soft-dirty bit. */ +#define FPB_IGNORE_SOFT_DIRTY ((__force fpb_t)BIT(1)) + +static inline pte_t __pte_batch_clear_ignored(pte_t pte, fpb_t flags) +{ + if (flags & FPB_IGNORE_DIRTY) + pte = pte_mkclean(pte); + if (likely(flags & FPB_IGNORE_SOFT_DIRTY)) + pte = pte_clear_soft_dirty(pte); + return pte_wrprotect(pte_mkold(pte)); +} + +/** + * folio_pte_batch - detect a PTE batch for a large folio + * @folio: The large folio to detect a PTE batch for. + * @addr: The user virtual address the first page is mapped at. + * @start_ptep: Page table pointer for the first entry. + * @pte: Page table entry for the first page. + * @max_nr: The maximum number of table entries to consider. + * @flags: Flags to modify the PTE batch semantics. + * @any_writable: Optional pointer to indicate whether any entry except the + * first one is writable. + * + * Detect a PTE batch: consecutive (present) PTEs that map consecutive + * pages of the same large folio. + * + * All PTEs inside a PTE batch have the same PTE bits set, excluding the PFN, + * the accessed bit, writable bit, dirty bit (with FPB_IGNORE_DIRTY) and + * soft-dirty bit (with FPB_IGNORE_SOFT_DIRTY). + * + * start_ptep must map any page of the folio. max_nr must be at least one and + * must be limited by the caller so scanning cannot exceed a single page table. + * + * Return: the number of table entries in the batch. 
+ */ +static inline int folio_pte_batch(struct folio *folio, unsigned long addr, + pte_t *start_ptep, pte_t pte, int max_nr, fpb_t flags, + bool *any_writable) +{ + unsigned long folio_end_pfn = folio_pfn(folio) + folio_nr_pages(folio); + const pte_t *end_ptep = start_ptep + max_nr; + pte_t expected_pte, *ptep; + bool writable; + int nr; + + if (any_writable) + *any_writable = false; + + VM_WARN_ON_FOLIO(!pte_present(pte), folio); + VM_WARN_ON_FOLIO(!folio_test_large(folio) || max_nr < 1, folio); + VM_WARN_ON_FOLIO(page_folio(pfn_to_page(pte_pfn(pte))) != folio, folio); + + nr = pte_batch_hint(start_ptep, pte); + expected_pte = __pte_batch_clear_ignored(pte_advance_pfn(pte, nr), flags); + ptep = start_ptep + nr; + + while (ptep < end_ptep) { + pte = ptep_get(ptep); + if (any_writable) + writable = !!pte_write(pte); + pte = __pte_batch_clear_ignored(pte, flags); + + if (!pte_same(pte, expected_pte)) + break; + + /* + * Stop immediately once we reached the end of the folio. In + * corner cases the next PFN might fall into a different + * folio. + */ + if (pte_pfn(pte) >= folio_end_pfn) + break; + + if (any_writable) + *any_writable |= writable; + + nr = pte_batch_hint(ptep, pte); + expected_pte = pte_advance_pfn(expected_pte, nr); + ptep += nr; + } + + return min(ptep - start_ptep, max_nr); +} +#endif /* CONFIG_MMU */ + void __acct_reclaim_writeback(pg_data_t *pgdat, struct folio *folio, int nr_throttled); static inline void acct_reclaim_writeback(struct folio *folio) diff --git a/mm/memory.c b/mm/memory.c index 6a81a75f3884db63cbcb651df2e0f4aef67004e3..83f030d6818d6375bfac1941a6f107c0b0905c85 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -958,77 +958,6 @@ static __always_inline void __copy_present_ptes(struct vm_area_struct *dst_vma, set_ptes(dst_vma->vm_mm, addr, dst_pte, pte, nr); } -/* Flags for folio_pte_batch(). */ -typedef int __bitwise fpb_t; - -/* Compare PTEs after pte_mkclean(), ignoring the dirty bit. */ -#define FPB_IGNORE_DIRTY ((__force fpb_t)BIT(0)) - -/* Compare PTEs after pte_clear_soft_dirty(), ignoring the soft-dirty bit. */ -#define FPB_IGNORE_SOFT_DIRTY ((__force fpb_t)BIT(1)) - -static inline pte_t __pte_batch_clear_ignored(pte_t pte, fpb_t flags) -{ - if (flags & FPB_IGNORE_DIRTY) - pte = pte_mkclean(pte); - if (likely(flags & FPB_IGNORE_SOFT_DIRTY)) - pte = pte_clear_soft_dirty(pte); - return pte_wrprotect(pte_mkold(pte)); -} - -/* - * Detect a PTE batch: consecutive (present) PTEs that map consecutive - * pages of the same folio. - * - * All PTEs inside a PTE batch have the same PTE bits set, excluding the PFN, - * the accessed bit, writable bit, dirty bit (with FPB_IGNORE_DIRTY) and - * soft-dirty bit (with FPB_IGNORE_SOFT_DIRTY). - * - * If "any_writable" is set, it will indicate if any other PTE besides the - * first (given) PTE is writable. 
- */ -static inline int folio_pte_batch(struct folio *folio, unsigned long addr, - pte_t *start_ptep, pte_t pte, int max_nr, fpb_t flags, - bool *any_writable) -{ - unsigned long folio_end_pfn = folio_pfn(folio) + folio_nr_pages(folio); - const pte_t *end_ptep = start_ptep + max_nr; - pte_t expected_pte = __pte_batch_clear_ignored(pte_next_pfn(pte), flags); - pte_t *ptep = start_ptep + 1; - bool writable; - - if (any_writable) - *any_writable = false; - - VM_WARN_ON_FOLIO(!pte_present(pte), folio); - - while (ptep != end_ptep) { - pte = ptep_get(ptep); - if (any_writable) - writable = !!pte_write(pte); - pte = __pte_batch_clear_ignored(pte, flags); - - if (!pte_same(pte, expected_pte)) - break; - - /* - * Stop immediately once we reached the end of the folio. In - * corner cases the next PFN might fall into a different - * folio. - */ - if (pte_pfn(pte) == folio_end_pfn) - break; - - if (any_writable) - *any_writable |= writable; - - expected_pte = pte_next_pfn(expected_pte); - ptep++; - } - - return ptep - start_ptep; -} - /* * Copy one present PTE, trying to batch-process subsequent PTEs that map * consecutive pages of the same folio by copying them as well. @@ -1114,12 +1043,17 @@ copy_present_ptes(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma return 1; } -static inline struct folio *page_copy_prealloc(struct mm_struct *src_mm, - struct vm_area_struct *vma, unsigned long addr) +static inline struct folio *folio_prealloc(struct mm_struct *src_mm, + struct vm_area_struct *vma, unsigned long addr, bool need_zero) { struct folio *new_folio; - new_folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, vma, addr, false); + if (need_zero) + new_folio = vma_alloc_zeroed_movable_folio(vma, addr); + else + new_folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, vma, + addr, false); + if (!new_folio) return NULL; @@ -1259,7 +1193,7 @@ copy_pte_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, } else if (ret == -EBUSY) { goto out; } else if (ret == -EAGAIN) { - prealloc = page_copy_prealloc(src_mm, src_vma, addr); + prealloc = folio_prealloc(src_mm, src_vma, addr, false); if (!prealloc) return -ENOMEM; } else if (ret < 0) { @@ -3321,6 +3255,7 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf) int page_copied = 0; struct mmu_notifier_range range; vm_fault_t ret; + bool pfn_is_zero; delayacct_wpcopy_start(); @@ -3330,16 +3265,13 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf) if (unlikely(ret)) goto out; - if (is_zero_pfn(pte_pfn(vmf->orig_pte))) { - new_folio = vma_alloc_zeroed_movable_folio(vma, vmf->address); - if (!new_folio) - goto oom; - } else { + pfn_is_zero = is_zero_pfn(pte_pfn(vmf->orig_pte)); + new_folio = folio_prealloc(mm, vma, vmf->address, pfn_is_zero); + if (!new_folio) + goto oom; + + if (!pfn_is_zero) { int err; - new_folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, vma, - vmf->address, false); - if (!new_folio) - goto oom; err = __wp_page_copy_user(&new_folio->page, vmf->page, vmf); if (err) { @@ -3360,10 +3292,6 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf) kmsan_copy_page_meta(&new_folio->page, vmf->page); } - if (mem_cgroup_charge(new_folio, mm, GFP_KERNEL)) - goto oom_free_new; - folio_throttle_swaprate(new_folio, GFP_KERNEL); - __folio_mark_uptodate(new_folio); mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, @@ -3465,8 +3393,6 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf) delayacct_wpcopy_end(); return 0; -oom_free_new: - folio_put(new_folio); oom: ret = VM_FAULT_OOM; out: @@ -4363,8 +4289,8 @@ static bool 
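To make the batching rule implemented by folio_pte_batch() concrete, here is a small user-space sketch of the same idea. It is illustrative only: the Pte class and function names below are invented for the example, and the pte_batch_hint() fast-forwarding and any_writable reporting of the real helper are omitted.

from dataclasses import dataclass


@dataclass(frozen=True)
class Pte:
    pfn: int                 # physical frame number mapped by this entry
    present: bool = True
    writable: bool = False
    young: bool = False
    dirty: bool = False
    soft_dirty: bool = False


def comparable_state(pte, ignore_dirty, ignore_soft_dirty):
    # Mirror __pte_batch_clear_ignored(): the accessed and writable bits never
    # end a batch; dirty/soft-dirty only end it when the caller compares them.
    return (pte.dirty and not ignore_dirty,
            pte.soft_dirty and not ignore_soft_dirty)


def pte_batch(ptes, folio_first_pfn, folio_nr_pages, max_nr,
              ignore_dirty=True, ignore_soft_dirty=True):
    # Count how many leading entries of 'ptes' are present, map consecutive
    # pages of the same folio and have equivalent state.
    folio_end_pfn = folio_first_pfn + folio_nr_pages
    first = ptes[0]
    want = comparable_state(first, ignore_dirty, ignore_soft_dirty)
    nr = 1
    while nr < max_nr:
        pte = ptes[nr]
        if (not pte.present
                or pte.pfn != first.pfn + nr      # not the next page
                or pte.pfn >= folio_end_pfn       # would cross into next folio
                or comparable_state(pte, ignore_dirty,
                                    ignore_soft_dirty) != want):
            break
        nr += 1
    return nr


if __name__ == '__main__':
    # A 16-page folio at PFN 0x1000, fully and contiguously mapped; one dirty
    # entry does not end the batch because the dirty bit is ignored here.
    ptes = [Pte(pfn=0x1000 + i, dirty=(i == 3)) for i in range(16)]
    print(pte_batch(ptes, 0x1000, 16, max_nr=16))   # -> 16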
pte_range_none(pte_t *pte, int nr_pages) static struct folio *alloc_anon_folio(struct vm_fault *vmf) { -#ifdef CONFIG_TRANSPARENT_HUGEPAGE struct vm_area_struct *vma = vmf->vma; +#ifdef CONFIG_TRANSPARENT_HUGEPAGE unsigned long orders; struct folio *folio; unsigned long addr; @@ -4416,15 +4342,21 @@ static struct folio *alloc_anon_folio(struct vm_fault *vmf) addr = ALIGN_DOWN(vmf->address, PAGE_SIZE << order); folio = vma_alloc_folio(gfp, order, vma, addr, true); if (folio) { + if (mem_cgroup_charge(folio, vma->vm_mm, gfp)) { + folio_put(folio); + goto next; + } + folio_throttle_swaprate(folio, gfp); clear_huge_page(&folio->page, vmf->address, 1 << order); return folio; } +next: order = next_order(&orders, order); } fallback: #endif - return vma_alloc_zeroed_movable_folio(vmf->vma, vmf->address); + return folio_prealloc(vma->vm_mm, vma, vmf->address, true); } /* @@ -4491,10 +4423,6 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf) nr_pages = folio_nr_pages(folio); addr = ALIGN_DOWN(vmf->address, nr_pages * PAGE_SIZE); - if (mem_cgroup_charge(folio, vma->vm_mm, GFP_KERNEL)) - goto oom_free_page; - folio_throttle_swaprate(folio, GFP_KERNEL); - /* * The memory barrier inside __folio_mark_uptodate makes sure that * preceding stores to the page contents become visible before @@ -4549,8 +4477,6 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf) release: folio_put(folio); goto unlock; -oom_free_page: - folio_put(folio); oom: return VM_FAULT_OOM; } @@ -4966,6 +4892,7 @@ static vm_fault_t do_read_fault(struct vm_fault *vmf) static vm_fault_t do_cow_fault(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; + struct folio *folio; vm_fault_t ret; ret = vmf_can_call_fault(vmf); @@ -4974,16 +4901,11 @@ static vm_fault_t do_cow_fault(struct vm_fault *vmf) if (ret) return ret; - vmf->cow_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, vmf->address); - if (!vmf->cow_page) + folio = folio_prealloc(vma->vm_mm, vma, vmf->address, false); + if (!folio) return VM_FAULT_OOM; - if (mem_cgroup_charge(page_folio(vmf->cow_page), vma->vm_mm, - GFP_KERNEL)) { - put_page(vmf->cow_page); - return VM_FAULT_OOM; - } - folio_throttle_swaprate(page_folio(vmf->cow_page), GFP_KERNEL); + vmf->cow_page = &folio->page; ret = __do_fault(vmf); if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) @@ -4992,7 +4914,7 @@ static vm_fault_t do_cow_fault(struct vm_fault *vmf) return ret; copy_user_highpage(vmf->cow_page, vmf->page, vmf->address, vma); - __SetPageUptodate(vmf->cow_page); + __folio_mark_uptodate(folio); ret |= finish_fault(vmf); unlock_page(vmf->page); @@ -5001,7 +4923,7 @@ static vm_fault_t do_cow_fault(struct vm_fault *vmf) goto uncharge_out; return ret; uncharge_out: - put_page(vmf->cow_page); + folio_put(folio); return ret; } diff --git a/tools/mm/Makefile b/tools/mm/Makefile index 1c5606cc33346bd29b523deb466cc739a0c6bb33..7bb03606b9eaa2e3763962f240dfb653d2b6b8f7 100644 --- a/tools/mm/Makefile +++ b/tools/mm/Makefile @@ -3,7 +3,8 @@ # include ../scripts/Makefile.include -TARGETS=page-types slabinfo page_owner_sort +BUILD_TARGETS=page-types slabinfo page_owner_sort +INSTALL_TARGETS = $(BUILD_TARGETS) thpmaps LIB_DIR = ../lib/api LIBS = $(LIB_DIR)/libapi.a @@ -11,9 +12,9 @@ LIBS = $(LIB_DIR)/libapi.a CFLAGS += -Wall -Wextra -I../lib/ -pthread LDFLAGS += $(LIBS) -pthread -all: $(TARGETS) +all: $(BUILD_TARGETS) -$(TARGETS): $(LIBS) +$(BUILD_TARGETS): $(LIBS) $(LIBS): make -C $(LIB_DIR) @@ -29,4 +30,4 @@ sbindir ?= /usr/sbin install: all install -d $(DESTDIR)$(sbindir) - 
install -m 755 -p $(TARGETS) $(DESTDIR)$(sbindir) + install -m 755 -p $(INSTALL_TARGETS) $(DESTDIR)$(sbindir) diff --git a/tools/mm/thpmaps b/tools/mm/thpmaps new file mode 100644 index 0000000000000000000000000000000000000000..803e0318f2fea1c5cc19e036316c6d00fb407783 --- /dev/null +++ b/tools/mm/thpmaps @@ -0,0 +1,675 @@ +#!/usr/bin/env python3 +# SPDX-License-Identifier: GPL-2.0-only +# Copyright (C) 2024 ARM Ltd. +# +# Utility providing smaps-like output detailing transparent hugepage usage. +# For more info, run: +# ./thpmaps --help +# +# Requires numpy: +# pip3 install numpy + + +import argparse +import collections +import math +import os +import re +import resource +import shutil +import sys +import textwrap +import time +import numpy as np + + +with open('/sys/kernel/mm/transparent_hugepage/hpage_pmd_size') as f: + PAGE_SIZE = resource.getpagesize() + PAGE_SHIFT = int(math.log2(PAGE_SIZE)) + PMD_SIZE = int(f.read()) + PMD_ORDER = int(math.log2(PMD_SIZE / PAGE_SIZE)) + + +def align_forward(v, a): + return (v + (a - 1)) & ~(a - 1) + + +def align_offset(v, a): + return v & (a - 1) + + +def kbnr(kb): + # Convert KB to number of pages. + return (kb << 10) >> PAGE_SHIFT + + +def nrkb(nr): + # Convert number of pages to KB. + return (nr << PAGE_SHIFT) >> 10 + + +def odkb(order): + # Convert page order to KB. + return (PAGE_SIZE << order) >> 10 + + +def cont_ranges_all(search, index): + # Given a list of arrays, find the ranges for which values are monotonically + # incrementing in all arrays. all arrays in search and index must be the + # same size. + sz = len(search[0]) + r = np.full(sz, 2) + d = np.diff(search[0]) == 1 + for dd in [np.diff(arr) == 1 for arr in search[1:]]: + d &= dd + r[1:] -= d + r[:-1] -= d + return [np.repeat(arr, r).reshape(-1, 2) for arr in index] + + +class ArgException(Exception): + pass + + +class FileIOException(Exception): + pass + + +class BinArrayFile: + # Base class used to read /proc//pagemap and /proc/kpageflags into a + # numpy array. Use inherrited class in a with clause to ensure file is + # closed when it goes out of scope. + def __init__(self, filename, element_size): + self.element_size = element_size + self.filename = filename + self.fd = os.open(self.filename, os.O_RDONLY) + + def cleanup(self): + os.close(self.fd) + + def __enter__(self): + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + self.cleanup() + + def _readin(self, offset, buffer): + length = os.preadv(self.fd, (buffer,), offset) + if len(buffer) != length: + raise FileIOException('error: {} failed to read {} bytes at {:x}' + .format(self.filename, len(buffer), offset)) + + def _toarray(self, buf): + assert(self.element_size == 8) + return np.frombuffer(buf, dtype=np.uint64) + + def getv(self, vec): + vec *= self.element_size + offsets = vec[:, 0] + lengths = (np.diff(vec) + self.element_size).reshape(len(vec)) + buf = bytearray(int(np.sum(lengths))) + view = memoryview(buf) + pos = 0 + for offset, length in zip(offsets, lengths): + offset = int(offset) + length = int(length) + self._readin(offset, view[pos:pos+length]) + pos += length + return self._toarray(buf) + + def get(self, index, nr=1): + offset = index * self.element_size + length = nr * self.element_size + buf = bytearray(length) + self._readin(offset, buf) + return self._toarray(buf) + + +PM_PAGE_PRESENT = 1 << 63 +PM_PFN_MASK = (1 << 55) - 1 + +class PageMap(BinArrayFile): + # Read ranges of a given pid's pagemap into a numpy array. 
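+    # Each 64-bit pagemap entry follows Documentation/admin-guide/mm/pagemap.rst:
+    # bit 63 is set when the page is present and, for present pages, bits 0-54
+    # hold the PFN. For example, the entry (1 << 63) | 0x1a4 decodes to a
+    # present page at PFN 0x1a4 (i.e. entry & PM_PFN_MASK).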
+ def __init__(self, pid='self'): + super().__init__(f'/proc/{pid}/pagemap', 8) + + +KPF_ANON = 1 << 12 +KPF_COMPOUND_HEAD = 1 << 15 +KPF_COMPOUND_TAIL = 1 << 16 +KPF_THP = 1 << 22 + +class KPageFlags(BinArrayFile): + # Read ranges of /proc/kpageflags into a numpy array. + def __init__(self): + super().__init__(f'/proc/kpageflags', 8) + + +vma_all_stats = set([ + "Size", + "Rss", + "Pss", + "Pss_Dirty", + "Shared_Clean", + "Shared_Dirty", + "Private_Clean", + "Private_Dirty", + "Referenced", + "Anonymous", + "KSM", + "LazyFree", + "AnonHugePages", + "ShmemPmdMapped", + "FilePmdMapped", + "Shared_Hugetlb", + "Private_Hugetlb", + "Swap", + "SwapPss", + "Locked", +]) + +vma_min_stats = set([ + "Rss", + "Anonymous", + "AnonHugePages", + "ShmemPmdMapped", + "FilePmdMapped", +]) + +VMA = collections.namedtuple('VMA', [ + 'name', + 'start', + 'end', + 'read', + 'write', + 'execute', + 'private', + 'pgoff', + 'major', + 'minor', + 'inode', + 'stats', +]) + +class VMAList: + # A container for VMAs, parsed from /proc//smaps. Iterate over the + # instance to receive VMAs. + def __init__(self, pid='self', stats=[]): + self.vmas = [] + with open(f'/proc/{pid}/smaps', 'r') as file: + for line in file: + elements = line.split() + if '-' in elements[0]: + start, end = map(lambda x: int(x, 16), elements[0].split('-')) + major, minor = map(lambda x: int(x, 16), elements[3].split(':')) + self.vmas.append(VMA( + name=elements[5] if len(elements) == 6 else '', + start=start, + end=end, + read=elements[1][0] == 'r', + write=elements[1][1] == 'w', + execute=elements[1][2] == 'x', + private=elements[1][3] == 'p', + pgoff=int(elements[2], 16), + major=major, + minor=minor, + inode=int(elements[4], 16), + stats={}, + )) + else: + param = elements[0][:-1] + if param in stats: + value = int(elements[1]) + self.vmas[-1].stats[param] = {'type': None, 'value': value} + + def __iter__(self): + yield from self.vmas + + +def thp_parse(vma, kpageflags, ranges, indexes, vfns, pfns, anons, heads): + # Given 4 same-sized arrays representing a range within a page table backed + # by THPs (vfns: virtual frame numbers, pfns: physical frame numbers, anons: + # True if page is anonymous, heads: True if page is head of a THP), return a + # dictionary of statistics describing the mapped THPs. + stats = { + 'file': { + 'partial': 0, + 'aligned': [0] * (PMD_ORDER + 1), + 'unaligned': [0] * (PMD_ORDER + 1), + }, + 'anon': { + 'partial': 0, + 'aligned': [0] * (PMD_ORDER + 1), + 'unaligned': [0] * (PMD_ORDER + 1), + }, + } + + for rindex, rpfn in zip(ranges[0], ranges[2]): + index_next = int(rindex[0]) + index_end = int(rindex[1]) + 1 + pfn_end = int(rpfn[1]) + 1 + + folios = indexes[index_next:index_end][heads[index_next:index_end]] + + # Account pages for any partially mapped THP at the front. In that case, + # the first page of the range is a tail. + nr = (int(folios[0]) if len(folios) else index_end) - index_next + stats['anon' if anons[index_next] else 'file']['partial'] += nr + + # Account pages for any partially mapped THP at the back. In that case, + # the next page after the range is a tail. + if len(folios): + flags = int(kpageflags.get(pfn_end)[0]) + if flags & KPF_COMPOUND_TAIL: + nr = index_end - int(folios[-1]) + folios = folios[:-1] + index_end -= nr + stats['anon' if anons[index_end - 1] else 'file']['partial'] += nr + + # Account fully mapped THPs in the middle of the range. 
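+        # Consecutive head indexes here are one fully mapped folio apart, so
+        # np.diff(folios) gives the number of pages in each folio and log2 of
+        # that count is the folio's order.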
+        if len(folios):
+            folio_nrs = np.append(np.diff(folios), np.uint64(index_end - folios[-1]))
+            folio_orders = np.log2(folio_nrs).astype(np.uint64)
+            for index, order in zip(folios, folio_orders):
+                index = int(index)
+                order = int(order)
+                nr = 1 << order
+                vfn = int(vfns[index])
+                align = 'aligned' if align_forward(vfn, nr) == vfn else 'unaligned'
+                anon = 'anon' if anons[index] else 'file'
+                stats[anon][align][order] += nr
+
+    # Account PMD-mapped THPs separately, so filter them out of the stats.
+    # There is a race between acquiring the smaps stats and reading pagemap,
+    # where memory could be deallocated. So clamp to zero in case it would
+    # have gone negative.
+    anon_pmd_mapped = vma.stats['AnonHugePages']['value']
+    file_pmd_mapped = vma.stats['ShmemPmdMapped']['value'] + \
+        vma.stats['FilePmdMapped']['value']
+    stats['anon']['aligned'][PMD_ORDER] = max(0, stats['anon']['aligned'][PMD_ORDER] - kbnr(anon_pmd_mapped))
+    stats['file']['aligned'][PMD_ORDER] = max(0, stats['file']['aligned'][PMD_ORDER] - kbnr(file_pmd_mapped))
+
+    rstats = {
+        f"anon-thp-pmd-aligned-{odkb(PMD_ORDER)}kB": {'type': 'anon', 'value': anon_pmd_mapped},
+        f"file-thp-pmd-aligned-{odkb(PMD_ORDER)}kB": {'type': 'file', 'value': file_pmd_mapped},
+    }
+
+    def flatten_sub(type, subtype, stats):
+        param = f"{type}-thp-pte-{subtype}-{{}}kB"
+        for od, nr in enumerate(stats[2:], 2):
+            rstats[param.format(odkb(od))] = {'type': type, 'value': nrkb(nr)}
+
+    def flatten_type(type, stats):
+        flatten_sub(type, 'aligned', stats['aligned'])
+        flatten_sub(type, 'unaligned', stats['unaligned'])
+        rstats[f"{type}-thp-pte-partial"] = {'type': type, 'value': nrkb(stats['partial'])}
+
+    flatten_type('anon', stats['anon'])
+    flatten_type('file', stats['file'])
+
+    return rstats
+
+
+def cont_parse(vma, order, ranges, anons, heads):
+    # Given the (index, vfn, pfn) range pairs computed for the VMA, plus two
+    # same-sized arrays (anons: True if page is anonymous, heads: True if page
+    # is head of a THP), return a dictionary of statistics describing the
+    # contiguous blocks of 2^order pages.
+    nr_cont = 1 << order
+    nr_anon = 0
+    nr_file = 0
+
+    for rindex, rvfn, rpfn in zip(*ranges):
+        index_next = int(rindex[0])
+        index_end = int(rindex[1]) + 1
+        vfn_start = int(rvfn[0])
+        pfn_start = int(rpfn[0])
+
+        if align_offset(pfn_start, nr_cont) != align_offset(vfn_start, nr_cont):
+            continue
+
+        off = align_forward(vfn_start, nr_cont) - vfn_start
+        index_next += off
+
+        while index_next + nr_cont <= index_end:
+            folio_boundary = heads[index_next+1:index_next+nr_cont].any()
+            if not folio_boundary:
+                if anons[index_next]:
+                    nr_anon += nr_cont
+                else:
+                    nr_file += nr_cont
+            index_next += nr_cont
+
+    # Account blocks that are PMD-mapped separately, so filter them out of the
+    # stats. There is a race between acquiring the smaps stats and reading
+    # pagemap, where memory could be deallocated. So clamp to zero in case it
+    # would have gone negative.
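+    # The smaps-derived PMD counters are in kB while nr_anon/nr_file count
+    # pages, so convert with kbnr() before subtracting.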
+ anon_pmd_mapped = vma.stats['AnonHugePages']['value'] + file_pmd_mapped = vma.stats['ShmemPmdMapped']['value'] + \ + vma.stats['FilePmdMapped']['value'] + nr_anon = max(0, nr_anon - kbnr(anon_pmd_mapped)) + nr_file = max(0, nr_file - kbnr(file_pmd_mapped)) + + rstats = { + f"anon-cont-pmd-aligned-{nrkb(nr_cont)}kB": {'type': 'anon', 'value': anon_pmd_mapped}, + f"file-cont-pmd-aligned-{nrkb(nr_cont)}kB": {'type': 'file', 'value': file_pmd_mapped}, + } + + rstats[f"anon-cont-pte-aligned-{nrkb(nr_cont)}kB"] = {'type': 'anon', 'value': nrkb(nr_anon)} + rstats[f"file-cont-pte-aligned-{nrkb(nr_cont)}kB"] = {'type': 'file', 'value': nrkb(nr_file)} + + return rstats + + +def vma_print(vma, pid): + # Prints a VMA instance in a format similar to smaps. The main difference is + # that the pid is included as the first value. + print("{:010d}: {:016x}-{:016x} {}{}{}{} {:08x} {:02x}:{:02x} {:08x} {}" + .format( + pid, vma.start, vma.end, + 'r' if vma.read else '-', 'w' if vma.write else '-', + 'x' if vma.execute else '-', 'p' if vma.private else 's', + vma.pgoff, vma.major, vma.minor, vma.inode, vma.name + )) + + +def stats_print(stats, tot_anon, tot_file, inc_empty): + # Print a statistics dictionary. + label_field = 32 + for label, stat in stats.items(): + type = stat['type'] + value = stat['value'] + if value or inc_empty: + pad = max(0, label_field - len(label) - 1) + if type == 'anon' and tot_anon > 0: + percent = f' ({value / tot_anon:3.0%})' + elif type == 'file' and tot_file > 0: + percent = f' ({value / tot_file:3.0%})' + else: + percent = '' + print(f"{label}:{' ' * pad}{value:8} kB{percent}") + + +def vma_parse(vma, pagemap, kpageflags, contorders): + # Generate thp and cont statistics for a single VMA. + start = vma.start >> PAGE_SHIFT + end = vma.end >> PAGE_SHIFT + + pmes = pagemap.get(start, end - start) + present = pmes & PM_PAGE_PRESENT != 0 + pfns = pmes & PM_PFN_MASK + pfns = pfns[present] + vfns = np.arange(start, end, dtype=np.uint64) + vfns = vfns[present] + + pfn_vec = cont_ranges_all([pfns], [pfns])[0] + flags = kpageflags.getv(pfn_vec) + anons = flags & KPF_ANON != 0 + heads = flags & KPF_COMPOUND_HEAD != 0 + thps = flags & KPF_THP != 0 + + vfns = vfns[thps] + pfns = pfns[thps] + anons = anons[thps] + heads = heads[thps] + + indexes = np.arange(len(vfns), dtype=np.uint64) + ranges = cont_ranges_all([vfns, pfns], [indexes, vfns, pfns]) + + thpstats = thp_parse(vma, kpageflags, ranges, indexes, vfns, pfns, anons, heads) + contstats = [cont_parse(vma, order, ranges, anons, heads) for order in contorders] + + tot_anon = vma.stats['Anonymous']['value'] + tot_file = vma.stats['Rss']['value'] - tot_anon + + return { + **thpstats, + **{k: v for s in contstats for k, v in s.items()} + }, tot_anon, tot_file + + +def do_main(args): + pids = set() + rollup = {} + rollup_anon = 0 + rollup_file = 0 + + if args.cgroup: + strict = False + for walk_info in os.walk(args.cgroup): + cgroup = walk_info[0] + with open(f'{cgroup}/cgroup.procs') as pidfile: + for line in pidfile.readlines(): + pids.add(int(line.strip())) + elif args.pid: + strict = True + pids = pids.union(args.pid) + else: + strict = False + for pid in os.listdir('/proc'): + if pid.isdigit(): + pids.add(int(pid)) + + if not args.rollup: + print(" PID START END PROT OFFSET DEV INODE OBJECT") + + for pid in pids: + try: + with PageMap(pid) as pagemap: + with KPageFlags() as kpageflags: + for vma in VMAList(pid, vma_all_stats if args.inc_smaps else vma_min_stats): + if (vma.read or vma.write or vma.execute) and 
vma.stats['Rss']['value'] > 0: + stats, vma_anon, vma_file = vma_parse(vma, pagemap, kpageflags, args.cont) + else: + stats = {} + vma_anon = 0 + vma_file = 0 + if args.inc_smaps: + stats = {**vma.stats, **stats} + if args.rollup: + for k, v in stats.items(): + if k in rollup: + assert(rollup[k]['type'] == v['type']) + rollup[k]['value'] += v['value'] + else: + rollup[k] = v + rollup_anon += vma_anon + rollup_file += vma_file + else: + vma_print(vma, pid) + stats_print(stats, vma_anon, vma_file, args.inc_empty) + except (FileNotFoundError, ProcessLookupError, FileIOException): + if strict: + raise + + if args.rollup: + stats_print(rollup, rollup_anon, rollup_file, args.inc_empty) + + +def main(): + docs_width = shutil.get_terminal_size().columns + docs_width -= 2 + docs_width = min(80, docs_width) + + def format(string): + text = re.sub(r'\s+', ' ', string) + text = re.sub(r'\s*\\n\s*', '\n', text) + paras = text.split('\n') + paras = [textwrap.fill(p, width=docs_width) for p in paras] + return '\n'.join(paras) + + def formatter(prog): + return argparse.RawDescriptionHelpFormatter(prog, width=docs_width) + + def size2order(human): + units = { + "K": 2**10, "M": 2**20, "G": 2**30, + "k": 2**10, "m": 2**20, "g": 2**30, + } + unit = 1 + if human[-1] in units: + unit = units[human[-1]] + human = human[:-1] + try: + size = int(human) + except ValueError: + raise ArgException('error: --cont value must be integer size with optional KMG unit') + size *= unit + order = int(math.log2(size / PAGE_SIZE)) + if order < 1: + raise ArgException('error: --cont value must be size of at least 2 pages') + if (1 << order) * PAGE_SIZE != size: + raise ArgException('error: --cont value must be size of power-of-2 pages') + if order > PMD_ORDER: + raise ArgException('error: --cont value must be less than or equal to PMD order') + return order + + parser = argparse.ArgumentParser(formatter_class=formatter, + description=format("""Prints information about how transparent huge + pages are mapped, either system-wide, or for a specified + process or cgroup.\\n + \\n + When run with --pid, the user explicitly specifies the set + of pids to scan. e.g. "--pid 10 [--pid 134 ...]". When run + with --cgroup, the user passes either a v1 or v2 cgroup and + all pids that belong to the cgroup subtree are scanned. When + run with neither --pid nor --cgroup, the full set of pids on + the system is gathered from /proc and scanned as if the user + had provided "--pid 1 --pid 2 ...".\\n + \\n + A default set of statistics is always generated for THP + mappings. However, it is also possible to generate + additional statistics for "contiguous block mappings" where + the block size is user-defined.\\n + \\n + Statistics are maintained independently for anonymous and + file-backed (pagecache) memory and are shown both in kB and + as a percentage of either total anonymous or total + file-backed memory as appropriate.\\n + \\n + THP Statistics\\n + --------------\\n + \\n + Statistics are always generated for fully- and + contiguously-mapped THPs whose mapping address is aligned to + their size, for each supported by the system. + Separate counters describe THPs mapped by PTE vs those + mapped by PMD. 
(Although note a THP can only be mapped by
+        PMD if it is PMD-sized):\\n
+        \\n
+        - anon-thp-pte-aligned-<size>kB\\n
+        - file-thp-pte-aligned-<size>kB\\n
+        - anon-thp-pmd-aligned-<size>kB\\n
+        - file-thp-pmd-aligned-<size>kB\\n
+        \\n
+        Similarly, statistics are always generated for fully- and
+        contiguously-mapped THPs whose mapping address is *not*
+        aligned to their size, for each <size> supported by the
+        system. Due to the unaligned mapping, it is impossible to
+        map by PMD, so there are only PTE counters for this case:\\n
+        \\n
+        - anon-thp-pte-unaligned-<size>kB\\n
+        - file-thp-pte-unaligned-<size>kB\\n
+        \\n
+        Statistics are also always generated for mapped pages that
+        belong to a THP but where the THP is *not* fully- and
+        contiguously-mapped. These "partial" mappings are all
+        counted in the same counter regardless of the size of the
+        THP that is partially mapped:\\n
+        \\n
+        - anon-thp-pte-partial\\n
+        - file-thp-pte-partial\\n
+        \\n
+        Contiguous Block Statistics\\n
+        ---------------------------\\n
+        \\n
+        An optional, additional set of statistics is generated for
+        every contiguous block size specified with `--cont <size>`.
+        These statistics show how much memory is mapped in
+        contiguous blocks of <size> and also aligned to <size>. A
+        given contiguous block must all belong to the same THP, but
+        there is no requirement for it to be the *whole* THP.
+        Separate counters describe contiguous blocks mapped by PTE
+        vs those mapped by PMD:\\n
+        \\n
+        - anon-cont-pte-aligned-<size>kB\\n
+        - file-cont-pte-aligned-<size>kB\\n
+        - anon-cont-pmd-aligned-<size>kB\\n
+        - file-cont-pmd-aligned-<size>kB\\n
+        \\n
+        As an example, if monitoring 64K contiguous blocks (--cont
+        64K), there are a number of sources that could provide such
+        blocks: a fully- and contiguously-mapped 64K THP that is
+        aligned to a 64K boundary would provide 1 block. A fully-
+        and contiguously-mapped 128K THP that is aligned to at least
+        a 64K boundary would provide 2 blocks. Or a 128K THP that
+        maps its first 100K, but contiguously and starting at a 64K
+        boundary would provide 1 block. A fully- and
+        contiguously-mapped 2M THP would provide 32 blocks. There
+        are many other possible permutations.\\n"""),
+        epilog=format("""Requires root privilege to access pagemap and
+            kpageflags."""))
+
+    group = parser.add_mutually_exclusive_group(required=False)
+    group.add_argument('--pid',
+        metavar='pid', required=False, type=int, default=[], action='append',
+        help="""Process id of the target process. May be issued multiple times to
+            scan multiple processes. --pid and --cgroup are mutually exclusive.
+            If neither are provided, all processes are scanned to provide
+            system-wide information.""")
+
+    group.add_argument('--cgroup',
+        metavar='path', required=False,
+        help="""Path to the target cgroup in sysfs. Iterates over every pid in
+            the cgroup and its children. --pid and --cgroup are mutually
+            exclusive. If neither are provided, all processes are scanned to
+            provide system-wide information.""")
+
+    parser.add_argument('--rollup',
+        required=False, default=False, action='store_true',
+        help="""Sum the per-vma statistics to provide a summary over the whole
+            system, process or cgroup.""")
+
+    parser.add_argument('--cont',
+        metavar='size[KMG]', required=False, default=[], action='append',
+        help="""Adds stats for memory that is mapped in contiguous blocks of
+            <size> and also aligned to <size>. May be issued multiple times to
+            track multiple sized blocks. Useful to infer e.g. arm64 contpte and
+            hpa mappings.
+            Size must be a power-of-2 number of pages.""")
+
+    parser.add_argument('--inc-smaps',
+        required=False, default=False, action='store_true',
+        help="""Include all numerical, additive /proc/<pid>/smaps stats in the
+            output.""")
+
+    parser.add_argument('--inc-empty',
+        required=False, default=False, action='store_true',
+        help="""Show all statistics including those whose value is 0.""")
+
+    parser.add_argument('--periodic',
+        metavar='sleep_ms', required=False, type=int,
+        help="""Run in a loop, polling every sleep_ms milliseconds.""")
+
+    args = parser.parse_args()
+
+    try:
+        args.cont = [size2order(cont) for cont in args.cont]
+    except ArgException as e:
+        parser.print_usage()
+        raise
+
+    if args.periodic:
+        while True:
+            do_main(args)
+            print()
+            time.sleep(args.periodic / 1000)
+    else:
+        do_main(args)
+
+
+if __name__ == "__main__":
+    try:
+        main()
+    except Exception as e:
+        prog = os.path.basename(sys.argv[0])
+        print(f'{prog}: {e}')
+        exit(1)
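As a worked example of the size-to-order conversion performed for --cont, the short sketch below reproduces the arithmetic in user space. It assumes the common 4K base page size and uses an invented helper name; the tool itself derives the real page size from resource.getpagesize().

import math

PAGE_SIZE = 4096                       # assumed 4K base pages for this sketch

def size_to_order(size_bytes):
    # A block of size_bytes covers size_bytes / PAGE_SIZE pages; its order is
    # log2 of that page count, which must be a power of two.
    pages = size_bytes // PAGE_SIZE
    order = int(math.log2(pages))
    assert (1 << order) * PAGE_SIZE == size_bytes
    return order

print(size_to_order(64 * 1024))        # 16 pages  -> order 4
print(size_to_order(2 * 1024 * 1024))  # 512 pages -> order 9

With 4K pages, an invocation such as thpmaps --cont 64K --rollup (run as root, as noted in the epilog) therefore tracks order-4 blocks and reports them in the anon-cont-pte-aligned-64kB and file-cont-pte-aligned-64kB counters, alongside their pmd-aligned counterparts.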