diff --git a/arch/arm/mm/fault.c b/arch/arm/mm/fault.c
index 91965fb043de0b3b08fb927b853010b58cfa28d8..4c64d90155c10aea6ed757aaa85c553a6048b37f 100644
--- a/arch/arm/mm/fault.c
+++ b/arch/arm/mm/fault.c
@@ -118,6 +118,19 @@ static inline bool is_write_fault(unsigned int fsr)
 	return (fsr & FSR_WRITE) && !(fsr & FSR_CM);
 }
 
+static inline bool is_translation_fault(unsigned int fsr)
+{
+	int fs = fsr_fs(fsr);
+#ifdef CONFIG_ARM_LPAE
+	if ((fs & FS_MMU_NOLL_MASK) == FS_TRANS_NOLL)
+		return true;
+#else
+	if (fs == FS_L1_TRANS || fs == FS_L2_TRANS)
+		return true;
+#endif
+	return false;
+}
+
 static void die_kernel_fault(const char *msg, struct mm_struct *mm,
 			     unsigned long addr, unsigned int fsr,
 			     struct pt_regs *regs)
@@ -153,7 +166,8 @@ __do_kernel_fault(struct mm_struct *mm, unsigned long addr, unsigned int fsr,
 	if (addr < PAGE_SIZE) {
 		msg = "NULL pointer dereference";
 	} else {
-		if (kfence_handle_page_fault(addr, is_write_fault(fsr), regs))
+		if (is_translation_fault(fsr) &&
+		    kfence_handle_page_fault(addr, is_write_fault(fsr), regs))
 			return;
 
 		msg = "paging request";
@@ -221,7 +235,7 @@ static inline bool is_permission_fault(unsigned int fsr)
 {
 	int fs = fsr_fs(fsr);
 #ifdef CONFIG_ARM_LPAE
-	if ((fs & FS_PERM_NOLL_MASK) == FS_PERM_NOLL)
+	if ((fs & FS_MMU_NOLL_MASK) == FS_PERM_NOLL)
 		return true;
 #else
 	if (fs == FS_L1_PERM || fs == FS_L2_PERM)
diff --git a/arch/arm/mm/fault.h b/arch/arm/mm/fault.h
index 83b5ab32d7a488e9a16552f49bacc86c2f2b56c2..54927ba1fa6ede55ce78ee2da53ab1ef8149eed6 100644
--- a/arch/arm/mm/fault.h
+++ b/arch/arm/mm/fault.h
@@ -14,8 +14,9 @@
 
 #ifdef CONFIG_ARM_LPAE
 #define FSR_FS_AEA		17
+#define FS_TRANS_NOLL		0x4
 #define FS_PERM_NOLL		0xC
-#define FS_PERM_NOLL_MASK	0x3C
+#define FS_MMU_NOLL_MASK	0x3C
 
 static inline int fsr_fs(unsigned int fsr)
 {
@@ -23,8 +24,10 @@ static inline int fsr_fs(unsigned int fsr)
 }
 #else
 #define FSR_FS_AEA		22
-#define FS_L1_PERM		0xD
-#define FS_L2_PERM		0xF
+#define FS_L1_TRANS		0x5
+#define FS_L2_TRANS		0x7
+#define FS_L1_PERM		0xD
+#define FS_L2_PERM		0xF
 
 static inline int fsr_fs(unsigned int fsr)
 {
diff --git a/arch/arm64/include/asm/esr.h b/arch/arm64/include/asm/esr.h
index 34b009cd600454a9d77c5c36a29037e62c94cf51..20c6f7f5176179f717b10da6a7a9b302eefa0351 100644
--- a/arch/arm64/include/asm/esr.h
+++ b/arch/arm64/include/asm/esr.h
@@ -112,6 +112,10 @@
 #define ESR_ELx_FSC_SERROR	(0x11)
 #define ESR_ELx_FSC_ACCESS	(0x08)
 #define ESR_ELx_FSC_FAULT	(0x04)
+#define ESR_ELx_FSC_FAULT_L0	(0x04)
+#define ESR_ELx_FSC_FAULT_L1	(0x05)
+#define ESR_ELx_FSC_FAULT_L2	(0x06)
+#define ESR_ELx_FSC_FAULT_L3	(0x07)
 #define ESR_ELx_FSC_PERM	(0x0C)
 
 /* ISS field definitions for Data Aborts */
diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
index 4e7e047d6fb92bf72e52bd42225bbe79edad6d08..f914c30b74871974176a9239ab4b4c23b09f1527 100644
--- a/arch/arm64/include/asm/pgtable.h
+++ b/arch/arm64/include/asm/pgtable.h
@@ -1038,6 +1038,12 @@ static inline pgprot_t arch_filter_pgprot(pgprot_t prot)
 	return PAGE_READONLY_EXEC;
 }
 
+#ifdef CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP
+void vmemmap_update_pmd(unsigned long addr, pmd_t *pmdp, pte_t *ptep);
+#define vmemmap_update_pmd vmemmap_update_pmd
+void vmemmap_update_pte(unsigned long addr, pte_t *ptep, pte_t pte);
+#define vmemmap_update_pte vmemmap_update_pte
+#endif
 
 #endif /* !__ASSEMBLY__ */
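Note on the LPAE hunks above: the long-descriptor fault status encodes the lookup level in bits[1:0], so masking those two bits collapses each fault class onto a single value. That is what lets one FS_MMU_NOLL_MASK compare serve both is_translation_fault() and is_permission_fault(). A minimal standalone sketch of this property (constants copied from arch/arm/mm/fault.h above; the helpers and the loop are illustrative only, not part of the patch):

#include <stdbool.h>
#include <stdio.h>

#define FS_TRANS_NOLL		0x4
#define FS_PERM_NOLL		0xC
#define FS_MMU_NOLL_MASK	0x3C

static bool is_translation_fs(int fs)
{
	/* 0x4..0x7 (level in bits[1:0]) all mask down to 0x4. */
	return (fs & FS_MMU_NOLL_MASK) == FS_TRANS_NOLL;
}

static bool is_permission_fs(int fs)
{
	/* 0xC..0xF all mask down to 0xC. */
	return (fs & FS_MMU_NOLL_MASK) == FS_PERM_NOLL;
}

int main(void)
{
	for (int fs = 0; fs < 0x40; fs++) {
		if (is_translation_fs(fs))
			printf("fs=0x%02x: translation fault\n", fs);
		else if (is_permission_fs(fs))
			printf("fs=0x%02x: permission fault\n", fs);
	}
	return 0;
}

Compiled with any C compiler, this prints exactly 0x04..0x07 as translation faults and 0x0c..0x0f as permission faults, matching the single-compare checks in the hunks above.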
diff --git a/arch/arm64/include/asm/tlbflush.h b/arch/arm64/include/asm/tlbflush.h
index 4c28c6c4acba9b3a53cb8ddb2faf8cb2be529bd3..353ec955915ea19211c1e44ce9720f942ea0c35d 100644
--- a/arch/arm64/include/asm/tlbflush.h
+++ b/arch/arm64/include/asm/tlbflush.h
@@ -441,6 +441,22 @@ static inline void __flush_tlb_kernel_pgtable(unsigned long kaddr)
 	dsb(ish);
 	isb();
 }
+
+#ifdef CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP
+static inline void vmemmap_flush_tlb_all(void)
+{
+	/* Do nothing: the TLB has already been flushed in each BBM. */
+}
+#define vmemmap_flush_tlb_all vmemmap_flush_tlb_all
+
+static inline void vmemmap_flush_tlb_range(unsigned long start,
+					   unsigned long end)
+{
+	/* Do nothing: the TLB has already been flushed in each BBM. */
+}
+#define vmemmap_flush_tlb_range vmemmap_flush_tlb_range
+#endif
+
 #endif
 
 #endif
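Note on the arm64 fault handling below: before attempting any vmemmap fixup, the handler filters on the FSC field of ESR_EL1 and accepts only level-2 and level-3 translation faults. A standalone sketch of that filter (ESR_ELx_FSC is 0x3f, as in asm/esr.h; fsc_may_fixup() is an illustrative stand-in for the switch inside vmemmap_fault_may_fixup() below, not the patch itself):

#include <stdbool.h>
#include <stdio.h>

#define ESR_ELx_FSC		0x3f
#define ESR_ELx_FSC_FAULT_L2	0x06
#define ESR_ELx_FSC_FAULT_L3	0x07

static bool fsc_may_fixup(unsigned long esr)
{
	/* HVO only clears PMDs (level 2) and PTEs (level 3). */
	switch (esr & ESR_ELx_FSC) {
	case ESR_ELx_FSC_FAULT_L2:
	case ESR_ELx_FSC_FAULT_L3:
		return true;
	default:
		return false;
	}
}

int main(void)
{
	/* Walk the four translation-fault encodings, level 0 to level 3. */
	for (unsigned long fsc = 0x04; fsc <= 0x07; fsc++)
		printf("FSC 0x%02lx: %s\n", fsc,
		       fsc_may_fixup(fsc) ? "try fixup" : "no fixup");
	return 0;
}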
diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c
index 9bd10909861add20fe8598caf84c4fe56c4ba2cc..a2c61725c176cd898fac65097536337a68af7e1a 100644
--- a/arch/arm64/mm/fault.c
+++ b/arch/arm64/mm/fault.c
@@ -299,6 +299,90 @@ static void die_kernel_fault(const char *msg, unsigned long addr,
 	do_exit(SIGKILL);
 }
 
+#ifdef CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP
+static inline bool vmemmap_fault_may_fixup(unsigned long addr,
+					   unsigned long esr)
+{
+	if (!hugetlb_optimize_vmemmap_enabled())
+		return false;
+
+	if (addr < VMEMMAP_START || addr >= VMEMMAP_END)
+		return false;
+
+	/*
+	 * Only try to handle translation faults at level 2 or level 3,
+	 * since the hugetlb vmemmap optimization only clears PMDs or PTEs.
+	 */
+	switch (esr & ESR_ELx_FSC) {
+	case ESR_ELx_FSC_FAULT_L2:
+	case ESR_ELx_FSC_FAULT_L3:
+		return true;
+	default:
+		return false;
+	}
+}
+
+/*
+ * The PMD-mapped vmemmap should already have been split into a PTE
+ * mapping by HVO at this point, so only that case is checked here;
+ * every other case must fail.
+ * Also check that the address is healthy enough not to raise another
+ * level-2 or level-3 translation fault once the page fault has been
+ * handled successfully. That means checking bits[1:0] of both the
+ * PMD and the PTE, as the Arm ARM states:
+ *   A Translation fault is generated if bits[1:0] of a translation
+ *   table descriptor identify the descriptor as either a Fault
+ *   encoding or a reserved encoding.
+ */
+static inline bool vmemmap_addr_healthy(unsigned long addr)
+{
+	pmd_t *pmdp, pmd;
+	pte_t *ptep, pte;
+
+	pmdp = pmd_off_k(addr);
+	pmd = READ_ONCE(*pmdp);
+	if (!pmd_table(pmd))
+		return false;
+
+	ptep = pte_offset_kernel(pmdp, addr);
+	pte = ptep_get(ptep);
+	return (pte_val(pte) & PTE_TYPE_MASK) == PTE_TYPE_PAGE;
+}
+
+static bool vmemmap_handle_page_fault(unsigned long addr,
+				      unsigned long esr)
+{
+	bool ret;
+	unsigned long flags;
+
+	if (likely(!vmemmap_fault_may_fixup(addr, esr)))
+		return false;
+
+	spin_lock_irqsave(&init_mm.page_table_lock, flags);
+	ret = vmemmap_addr_healthy(addr);
+	spin_unlock_irqrestore(&init_mm.page_table_lock, flags);
+
+	return ret;
+}
+#else
+static inline bool vmemmap_fault_may_fixup(unsigned long addr,
+					   unsigned long esr)
+{
+	return false;
+}
+
+static inline bool vmemmap_handle_page_fault(unsigned long addr,
+					     unsigned long esr)
+{
+	return false;
+}
+#endif /* CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP */
+
+static bool is_translation_fault(unsigned long esr)
+{
+	return (esr & ESR_ELx_FSC_TYPE) == ESR_ELx_FSC_FAULT;
+}
+
 static void __do_kernel_fault(unsigned long addr, unsigned int esr,
 			      struct pt_regs *regs)
 {
@@ -311,9 +395,11 @@ static void __do_kernel_fault(unsigned long addr, unsigned int esr,
 	if (!is_el1_instruction_abort(esr) && fixup_exception(regs))
 		return;
 
-	if (WARN_RATELIMIT(is_spurious_el1_translation_fault(addr, esr, regs),
-	    "Ignoring spurious kernel translation fault at virtual address %016lx\n", addr))
+	if (is_spurious_el1_translation_fault(addr, esr, regs)) {
+		WARN_RATELIMIT(!vmemmap_fault_may_fixup(addr, esr),
+		    "Ignoring spurious kernel translation fault at virtual address %016lx\n", addr);
 		return;
+	}
 
 	if (is_el1_permission_fault(addr, esr, regs)) {
 		if (esr & ESR_ELx_WNR)
@@ -325,8 +411,12 @@ static void __do_kernel_fault(unsigned long addr, unsigned int esr,
 	} else if (addr < PAGE_SIZE) {
 		msg = "NULL pointer dereference";
 	} else {
-		if (kfence_handle_page_fault(addr, esr & ESR_ELx_WNR, regs))
-			return;
+		if (is_translation_fault(esr)) {
+			if (kfence_handle_page_fault(addr, esr & ESR_ELx_WNR, regs))
+				return;
+			if (vmemmap_handle_page_fault(addr, esr))
+				return;
+		}
 
 		msg = "paging request";
 	}
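Note on the mmu.c changes below: the break-before-make (BBM) sequence and the vmemmap fault fixup above both take init_mm.page_table_lock, so a faulting thread can only inspect the entry before the break or after the make, never in the cleared window. A userspace model of that ordering using pthreads (purely illustrative: fake_pte, updater() and fault_handler() are invented names, a mutex stands in for the spinlock, and no real TLB is involved):

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t page_table_lock = PTHREAD_MUTEX_INITIALIZER;
static unsigned long fake_pte = 0xdead0003;	/* "valid": bits[1:0] != 0 */

static void *updater(void *arg)
{
	(void)arg;
	pthread_mutex_lock(&page_table_lock);
	fake_pte = 0;			/* break: clear the entry */
	/* a real kernel flushes the TLB here */
	fake_pte = 0xbeef0003;		/* make: install the new entry */
	pthread_mutex_unlock(&page_table_lock);
	return NULL;
}

static void *fault_handler(void *arg)
{
	int healthy;

	(void)arg;
	/* Serialize against the updater before judging the entry. */
	pthread_mutex_lock(&page_table_lock);
	healthy = (fake_pte & 0x3) != 0;
	pthread_mutex_unlock(&page_table_lock);
	printf("fault handler saw a %s entry\n",
	       healthy ? "valid" : "cleared");
	return NULL;
}

int main(void)
{
	pthread_t t1, t2;

	pthread_create(&t1, NULL, updater, NULL);
	pthread_create(&t2, NULL, fault_handler, NULL);
	pthread_join(t1, NULL);
	pthread_join(t2, NULL);
	return 0;
}

However the threads interleave (build with -pthread), the handler always prints "valid", which is the property vmemmap_addr_healthy() relies on.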
diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
index 095c192c729b390e2e0d4ca1de877d79d7aa8aa6..adaca1fd5a277b52c7043e60f6d2e7dc6262c302 100644
--- a/arch/arm64/mm/mmu.c
+++ b/arch/arm64/mm/mmu.c
@@ -1148,6 +1148,35 @@ static void free_empty_tables(unsigned long addr, unsigned long end,
 #endif
 
 #ifdef CONFIG_SPARSEMEM_VMEMMAP
+
+#ifdef CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP
+/*
+ * In the window between when a page table entry is cleared and when it
+ * is filled with a new value, other threads can concurrently access
+ * the vmemmap area and take a translation fault.
+ * Therefore, init_mm.page_table_lock must be held here so that the
+ * vmemmap page fault handler, which waits for this lock to be
+ * released, is guaranteed to see a page table entry that has been
+ * refreshed with a new, valid value.
+ */
+void vmemmap_update_pmd(unsigned long addr, pmd_t *pmdp, pte_t *ptep)
+{
+	lockdep_assert_held(&init_mm.page_table_lock);
+	pmd_clear(pmdp);
+	flush_tlb_kernel_range(addr, addr + PMD_SIZE);
+	pmd_populate_kernel(&init_mm, pmdp, ptep);
+}
+
+void vmemmap_update_pte(unsigned long addr, pte_t *ptep, pte_t pte)
+{
+	spin_lock_irq(&init_mm.page_table_lock);
+	pte_clear(&init_mm, addr, ptep);
+	flush_tlb_kernel_range(addr, addr + PAGE_SIZE);
+	set_pte_at(&init_mm, addr, ptep, pte);
+	spin_unlock_irq(&init_mm.page_table_lock);
+}
+#endif
+
 #if !ARM64_SWAPPER_USES_SECTION_MAPS
 int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node,
 		struct vmem_altmap *altmap)
diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c
index 6803c89c5d21669003d86b41d1ec96c4806d320e..a47b027af1f73c8d29912d702ff98d4d0099279a 100644
--- a/mm/sparse-vmemmap.c
+++ b/mm/sparse-vmemmap.c
@@ -54,6 +54,37 @@ struct vmemmap_remap_walk {
 	struct list_head *vmemmap_pages;
 };
 
+#ifndef vmemmap_update_pmd
+static inline void vmemmap_update_pmd(unsigned long addr,
+				      pmd_t *pmdp, pte_t *ptep)
+{
+	pmd_populate_kernel(&init_mm, pmdp, ptep);
+}
+#endif
+
+#ifndef vmemmap_update_pte
+static inline void vmemmap_update_pte(unsigned long addr,
+				      pte_t *ptep, pte_t pte)
+{
+	set_pte_at(&init_mm, addr, ptep, pte);
+}
+#endif
+
+#ifndef vmemmap_flush_tlb_all
+static inline void vmemmap_flush_tlb_all(void)
+{
+	flush_tlb_all();
+}
+#endif
+
+#ifndef vmemmap_flush_tlb_range
+static inline void vmemmap_flush_tlb_range(unsigned long start,
+					   unsigned long end)
+{
+	flush_tlb_kernel_range(start, end);
+}
+#endif
+
 static int __split_vmemmap_huge_pmd(pmd_t *pmd, unsigned long start)
 {
 	pmd_t __pmd;
@@ -76,7 +107,7 @@ static int __split_vmemmap_huge_pmd(pmd_t *pmd, unsigned long start)
 		set_pte_at(&init_mm, addr, pte, entry);
 	}
 
-	spin_lock(&init_mm.page_table_lock);
+	spin_lock_irq(&init_mm.page_table_lock);
 	if (likely(pmd_leaf(*pmd))) {
 		/*
 		 * Higher order allocations from buddy allocator must be able to
@@ -88,12 +119,12 @@ static int __split_vmemmap_huge_pmd(pmd_t *pmd, unsigned long start)
 
 		/* Make pte visible before pmd. See comment in __pte_alloc(). */
 		smp_wmb();
-		pmd_populate_kernel(&init_mm, pmd, pgtable);
-		flush_tlb_kernel_range(start, start + PMD_SIZE);
+		vmemmap_update_pmd(start, pmd, pgtable);
+		vmemmap_flush_tlb_range(start, start + PMD_SIZE);
 	} else {
 		pte_free_kernel(&init_mm, pgtable);
 	}
-	spin_unlock(&init_mm.page_table_lock);
+	spin_unlock_irq(&init_mm.page_table_lock);
 
 	return 0;
 }
@@ -221,7 +252,7 @@ static int vmemmap_remap_range(unsigned long start, unsigned long end,
 			return ret;
 	} while (pgd++, addr = next, addr != end);
 
-	flush_tlb_kernel_range(start, end);
+	vmemmap_flush_tlb_range(start, end);
 
 	return 0;
 }
@@ -269,15 +300,15 @@ static void vmemmap_remap_pte(pte_t *pte, unsigned long addr,
 
 		/*
 		 * Makes sure that preceding stores to the page contents from
-		 * vmemmap_remap_free() become visible before the set_pte_at()
-		 * write.
+		 * vmemmap_remap_free() become visible before the
+		 * vmemmap_update_pte() write.
 		 */
 		smp_wmb();
 	}
 
 	entry = mk_pte(walk->reuse_page, pgprot);
 	list_add_tail(&page->lru, walk->vmemmap_pages);
-	set_pte_at(&init_mm, addr, pte, entry);
+	vmemmap_update_pte(addr, pte, entry);
 }
 
 /*
@@ -315,7 +346,7 @@ static void vmemmap_restore_pte(pte_t *pte, unsigned long addr,
 	copy_page(to, (void *)walk->reuse_addr);
 	reset_struct_pages(to);
 
-	set_pte_at(&init_mm, addr, pte, mk_pte(page, pgprot));
+	vmemmap_update_pte(addr, pte, mk_pte(page, pgprot));
 }
 
 /**
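Note on the #ifndef fallbacks above: they use the usual arch-override idiom, in which an architecture header both declares its replacement and #defines the name to itself so the generic default compiles out (see the pgtable.h and tlbflush.h hunks earlier in this patch). A single-file sketch of the mechanism, with hypothetical empty signatures rather than the real ones:

#include <stdio.h>

/* Playing the role of the arch header (asm/pgtable.h): */
void vmemmap_update_pte(void);
#define vmemmap_update_pte vmemmap_update_pte

/* Playing the role of the generic code (mm/sparse-vmemmap.c): */
#ifndef vmemmap_update_pte
static inline void vmemmap_update_pte(void)
{
	puts("generic: plain set_pte_at()");	/* compiled out here */
}
#endif

/* The arch implementation that actually gets linked: */
void vmemmap_update_pte(void)
{
	puts("arm64: BBM under init_mm.page_table_lock");
}

int main(void)
{
	vmemmap_update_pte();	/* resolves to the arch override */
	return 0;
}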