diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 890461d3f5e5dddc4cd936fc3afc588563d0d751..2712d3c7371950de0a363a0aea5883d6654d7f65 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -104,6 +104,7 @@ config ARM64
 	select ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT
 	select ARCH_WANT_FRAME_POINTERS
 	select ARCH_WANT_HUGE_PMD_SHARE if ARM64_4K_PAGES || (ARM64_16K_PAGES && !ARM64_VA_BITS_36)
+	select ARCH_WANT_OPTIMIZE_HUGETLB_VMEMMAP if !PREEMPT_RT
 	select ARCH_WANT_LD_ORPHAN_WARN
 	select ARCH_WANTS_NO_INSTR
 	select ARCH_WANTS_THP_SWAP if ARM64_4K_PAGES
diff --git a/arch/arm64/include/asm/esr.h b/arch/arm64/include/asm/esr.h
index ae35939f395bb18e2e1c8862046edd8379113985..1c63256efd2503a1146a213e2e394ed4ca6e36b3 100644
--- a/arch/arm64/include/asm/esr.h
+++ b/arch/arm64/include/asm/esr.h
@@ -116,6 +116,10 @@
 #define ESR_ELx_FSC_SERROR	(0x11)
 #define ESR_ELx_FSC_ACCESS	(0x08)
 #define ESR_ELx_FSC_FAULT	(0x04)
+#define ESR_ELx_FSC_FAULT_L0	(0x04)
+#define ESR_ELx_FSC_FAULT_L1	(0x05)
+#define ESR_ELx_FSC_FAULT_L2	(0x06)
+#define ESR_ELx_FSC_FAULT_L3	(0x07)
 #define ESR_ELx_FSC_PERM	(0x0C)
 #define ESR_ELx_FSC_SEA_TTW0	(0x14)
 #define ESR_ELx_FSC_SEA_TTW1	(0x15)
diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
index 401087e8a43dc08621a0a60ca6d3fe431649bb4e..b72196ed5a76ffb7c3cb65251043495c3f4ac845 100644
--- a/arch/arm64/include/asm/pgtable.h
+++ b/arch/arm64/include/asm/pgtable.h
@@ -1453,6 +1453,13 @@ static inline int ptep_set_access_flags(struct vm_area_struct *vma,
 
 #endif /* CONFIG_ARM64_CONTPTE */
 
+#ifdef CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP
+void vmemmap_update_pmd(unsigned long addr, pmd_t *pmdp, pte_t *ptep);
+#define vmemmap_update_pmd vmemmap_update_pmd
+void vmemmap_update_pte(unsigned long addr, pte_t *ptep, pte_t pte);
+#define vmemmap_update_pte vmemmap_update_pte
+#endif
+
 #endif /* !__ASSEMBLY__ */
 
 #endif /* __ASM_PGTABLE_H */
diff --git a/arch/arm64/include/asm/tlbflush.h b/arch/arm64/include/asm/tlbflush.h
index 2e7a9b70ea6e1d2f495bc75e078c13a94572413a..046bc64be6d073f1bc80f9635263490d5a332c36 100644
--- a/arch/arm64/include/asm/tlbflush.h
+++ b/arch/arm64/include/asm/tlbflush.h
@@ -488,6 +488,22 @@ static inline void __flush_tlb_kernel_pgtable(unsigned long kaddr)
 	dsb(ish);
 	isb();
 }
+
+#ifdef CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP
+static inline void vmemmap_flush_tlb_all(void)
+{
+	/* nothing to do: the TLB is already flushed in every single BBM */
+}
+#define vmemmap_flush_tlb_all vmemmap_flush_tlb_all
+
+static inline void vmemmap_flush_tlb_range(unsigned long start,
+					   unsigned long end)
+{
+	/* nothing to do: the TLB is already flushed in every single BBM */
+}
+#define vmemmap_flush_tlb_range vmemmap_flush_tlb_range
+#endif
+
 #endif
 
 #endif
diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c
index 5d040bbfd247981a0d4c88f4076bb1381d4534d7..4ea07caba71cf26db93de00e22baf387186733b9 100644
--- a/arch/arm64/mm/fault.c
+++ b/arch/arm64/mm/fault.c
@@ -394,6 +394,85 @@ static bool is_el1_mte_sync_tag_check_fault(unsigned long esr)
 	return false;
 }
 
+#ifdef CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP
+static inline bool vmemmap_fault_may_fixup(unsigned long addr,
+					   unsigned long esr)
+{
+	if (!static_branch_likely(&hugetlb_optimize_vmemmap_key))
+		return false;
+
+	if (addr < VMEMMAP_START || addr >= VMEMMAP_END)
+		return false;
+
+	/*
+	 * Only try to handle translation faults at level 2 or level 3, since
+	 * the hugetlb vmemmap optimization only clears PMD or PTE entries.
+	 */
+	switch (esr & ESR_ELx_FSC) {
+	case ESR_ELx_FSC_FAULT_L2:
+	case ESR_ELx_FSC_FAULT_L3:
+		return true;
+	default:
+		return false;
+	}
+}
+
+/*
+ * A PMD-mapped vmemmap should already have been split into a PTE
+ * mapping by HVO at this point, so only that case is accepted here;
+ * all other cases must fail.
+ * Also check that the address is healthy enough that it will not cause
+ * another level 2 or level 3 translation fault after the page fault has
+ * been handled successfully, which means checking bits[1:0] of both the
+ * PMD and the PTE, as the Arm architecture spec states:
+ * A Translation fault is generated if bits[1:0] of a translation
+ * table descriptor identify the descriptor as either a Fault
+ * encoding or a reserved encoding.
+ */
+static inline bool vmemmap_addr_healthy(unsigned long addr)
+{
+	pmd_t *pmdp, pmd;
+	pte_t *ptep, pte;
+
+	pmdp = pmd_off_k(addr);
+	pmd = pmdp_get(pmdp);
+	if (!pmd_table(pmd))
+		return false;
+
+	ptep = pte_offset_kernel(pmdp, addr);
+	pte = ptep_get(ptep);
+	return (pte_val(pte) & PTE_TYPE_MASK) == PTE_TYPE_PAGE;
+}
+
+static bool vmemmap_handle_page_fault(unsigned long addr,
+				      unsigned long esr)
+{
+	bool ret;
+	unsigned long flags;
+
+	if (likely(!vmemmap_fault_may_fixup(addr, esr)))
+		return false;
+
+	spin_lock_irqsave(&init_mm.page_table_lock, flags);
+	ret = vmemmap_addr_healthy(addr);
+	spin_unlock_irqrestore(&init_mm.page_table_lock, flags);
+
+	return ret;
+}
+#else
+static inline bool vmemmap_fault_may_fixup(unsigned long addr,
+					   unsigned long esr)
+{
+	return false;
+}
+
+static inline bool vmemmap_handle_page_fault(unsigned long addr,
+					      unsigned long esr)
+{
+	return false;
+}
+#endif /* CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP */
+
 static bool is_translation_fault(unsigned long esr)
 {
 	return (esr & ESR_ELx_FSC_TYPE) == ESR_ELx_FSC_FAULT;
@@ -411,9 +490,11 @@ static void __do_kernel_fault(unsigned long addr, unsigned long esr,
 	if (!is_el1_instruction_abort(esr) && fixup_exception(regs))
 		return;
 
-	if (WARN_RATELIMIT(is_spurious_el1_translation_fault(addr, esr, regs),
-	    "Ignoring spurious kernel translation fault at virtual address %016lx\n", addr))
+	if (is_spurious_el1_translation_fault(addr, esr, regs)) {
+		WARN_RATELIMIT(!vmemmap_fault_may_fixup(addr, esr),
+		    "Ignoring spurious kernel translation fault at virtual address %016lx\n", addr);
 		return;
+	}
 
 	if (is_el1_mte_sync_tag_check_fault(esr)) {
 		do_tag_recovery(addr, esr, regs);
@@ -431,9 +512,12 @@ static void __do_kernel_fault(unsigned long addr, unsigned long esr,
 	} else if (addr < PAGE_SIZE) {
 		msg = "NULL pointer dereference";
 	} else {
-		if (is_translation_fault(esr) &&
-		    kfence_handle_page_fault(addr, esr & ESR_ELx_WNR, regs))
-			return;
+		if (is_translation_fault(esr)) {
+			if (kfence_handle_page_fault(addr, esr & ESR_ELx_WNR, regs))
+				return;
+			if (vmemmap_handle_page_fault(addr, esr))
+				return;
+		}
 
 		msg = "paging request";
 	}
diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
index 84fb680f69cea9cc015d1965f6292d859d497f01..4142a75a414ec510f9bcb4304c95daed7bafaaca 100644
--- a/arch/arm64/mm/mmu.c
+++ b/arch/arm64/mm/mmu.c
@@ -1151,6 +1151,34 @@ int __meminit vmemmap_check_pmd(pmd_t *pmdp, int node,
 	return 1;
 }
 
+#ifdef CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP
+/*
+ * In the window between when a page table entry is cleared and when it
+ * is refilled with a new value, other threads may concurrently access
+ * the vmemmap area and take a page translation fault.
+ * Therefore, init_mm.page_table_lock must be held, so that the vmemmap
+ * page fault handler, which waits for this lock to be released, only
+ * observes a page table entry that has already been refreshed with a
+ * new valid value.
+ */
+void vmemmap_update_pmd(unsigned long addr, pmd_t *pmdp, pte_t *ptep)
+{
+	lockdep_assert_held(&init_mm.page_table_lock);
+	pmd_clear(pmdp);
+	flush_tlb_kernel_range(addr, addr + PMD_SIZE);
+	pmd_populate_kernel(&init_mm, pmdp, ptep);
+}
+
+void vmemmap_update_pte(unsigned long addr, pte_t *ptep, pte_t pte)
+{
+	spin_lock_irq(&init_mm.page_table_lock);
+	pte_clear(&init_mm, addr, ptep);
+	flush_tlb_kernel_range(addr, addr + PAGE_SIZE);
+	set_pte_at(&init_mm, addr, ptep, pte);
+	spin_unlock_irq(&init_mm.page_table_lock);
+}
+#endif
+
 int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node,
 		struct vmem_altmap *altmap)
 {
diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c
index beaae51033db3b08246a25f7f855b09a14504a0b..fb0b05d4659ab68b4d5d0f2692302c7a0e7eb0f1 100644
--- a/mm/hugetlb_vmemmap.c
+++ b/mm/hugetlb_vmemmap.c
@@ -37,6 +37,37 @@ struct vmemmap_remap_walk {
 	struct list_head	*vmemmap_pages;
 };
 
+#ifndef vmemmap_update_pmd
+static inline void vmemmap_update_pmd(unsigned long addr,
+				      pmd_t *pmdp, pte_t *ptep)
+{
+	pmd_populate_kernel(&init_mm, pmdp, ptep);
+}
+#endif
+
+#ifndef vmemmap_update_pte
+static inline void vmemmap_update_pte(unsigned long addr,
+				      pte_t *ptep, pte_t pte)
+{
+	set_pte_at(&init_mm, addr, ptep, pte);
+}
+#endif
+
+#ifndef vmemmap_flush_tlb_all
+static inline void vmemmap_flush_tlb_all(void)
+{
+	flush_tlb_all();
+}
+#endif
+
+#ifndef vmemmap_flush_tlb_range
+static inline void vmemmap_flush_tlb_range(unsigned long start,
+					   unsigned long end)
+{
+	flush_tlb_kernel_range(start, end);
+}
+#endif
+
 static int split_vmemmap_huge_pmd(pmd_t *pmd, unsigned long start)
 {
 	pmd_t __pmd;
@@ -67,7 +98,7 @@ static int split_vmemmap_huge_pmd(pmd_t *pmd, unsigned long start)
 		set_pte_at(&init_mm, addr, pte, entry);
 	}
 
-	spin_lock(&init_mm.page_table_lock);
+	spin_lock_irq(&init_mm.page_table_lock);
 	if (likely(pmd_leaf(*pmd))) {
 		/*
 		 * Higher order allocations from buddy allocator must be able to
@@ -79,12 +110,12 @@ static int split_vmemmap_huge_pmd(pmd_t *pmd, unsigned long start)
 
 		/* Make pte visible before pmd. See comment in pmd_install(). */
 		smp_wmb();
-		pmd_populate_kernel(&init_mm, pmd, pgtable);
-		flush_tlb_kernel_range(start, start + PMD_SIZE);
+		vmemmap_update_pmd(start, pmd, pgtable);
+		vmemmap_flush_tlb_range(start, start + PMD_SIZE);
 	} else {
 		pte_free_kernel(&init_mm, pgtable);
 	}
-	spin_unlock(&init_mm.page_table_lock);
+	spin_unlock_irq(&init_mm.page_table_lock);
 
 	return 0;
 }
@@ -198,7 +229,7 @@ static int vmemmap_remap_range(unsigned long start, unsigned long end,
 			return ret;
 	} while (pgd++, addr = next, addr != end);
 
-	flush_tlb_kernel_range(start, end);
+	vmemmap_flush_tlb_range(start, end);
 
 	return 0;
 }
@@ -244,15 +275,15 @@ static void vmemmap_remap_pte(pte_t *pte, unsigned long addr,
 
 		/*
 		 * Makes sure that preceding stores to the page contents from
-		 * vmemmap_remap_free() become visible before the set_pte_at()
-		 * write.
+		 * vmemmap_remap_free() become visible before the
+		 * vmemmap_update_pte() write.
*/ smp_wmb(); } entry = mk_pte(walk->reuse_page, pgprot); list_add_tail(&page->lru, walk->vmemmap_pages); - set_pte_at(&init_mm, addr, pte, entry); + vmemmap_update_pte(addr, pte, entry); } /* @@ -291,10 +322,10 @@ static void vmemmap_restore_pte(pte_t *pte, unsigned long addr, /* * Makes sure that preceding stores to the page contents become visible - * before the set_pte_at() write. + * before the vmemmap_update_pte() write. */ smp_wmb(); - set_pte_at(&init_mm, addr, pte, mk_pte(page, pgprot)); + vmemmap_update_pte(addr, pte, mk_pte(page, pgprot)); } /**