diff --git a/arch/arm64/configs/openeuler_defconfig b/arch/arm64/configs/openeuler_defconfig index fc6053a500dbaaf0c8a470274ea6df352e7c8437..28de20ca2d7172d390e63e6f51cc07f64f3ee3f7 100644 --- a/arch/arm64/configs/openeuler_defconfig +++ b/arch/arm64/configs/openeuler_defconfig @@ -1212,6 +1212,7 @@ CONFIG_ARM64_HAFT=y CONFIG_ARCH_SUPPORTS_PER_VMA_LOCK=y CONFIG_PER_VMA_LOCK=y CONFIG_LOCK_MM_AND_FIND_VMA=y +# CONFIG_KERNEL_REPLICATION is not set CONFIG_IOMMU_MM_DATA=y # CONFIG_ASCEND_FEATURES is not set CONFIG_PAGE_CACHE_LIMIT=y diff --git a/arch/arm64/include/asm/efi.h b/arch/arm64/include/asm/efi.h index bcd5622aa09686a9d256af624b2a92abc3e6fee0..7bb99c95845573ae250b3f15e06f04040116d67d 100644 --- a/arch/arm64/include/asm/efi.h +++ b/arch/arm64/include/asm/efi.h @@ -87,8 +87,6 @@ static inline unsigned long efi_get_max_initrd_addr(unsigned long image_addr) static inline unsigned long efi_get_kimg_min_align(void) { - extern bool efi_nokaslr; - /* * Although relocatable kernels can fix up the misalignment with * respect to MIN_KIMG_ALIGN, the resulting virtual text addresses are @@ -97,7 +95,23 @@ static inline unsigned long efi_get_kimg_min_align(void) * 2M alignment if KASLR was explicitly disabled, even if it was not * going to be activated to begin with. */ + +#ifdef CONFIG_KERNEL_REPLICATION + /* If kernel replication is enabled, the special alignment is necessary. + * Due to this fact for now we map kernel by huge pages even + * in case of KASLR enabled. Ugly but works. + */ +#ifdef CONFIG_ARM64_4K_PAGES + return HPAGE_SIZE; +#else + return CONT_PTE_SIZE; +#endif + +#else + extern bool efi_nokaslr; + return efi_nokaslr ? MIN_KIMG_ALIGN : EFI_KIMG_ALIGN; +#endif } #define EFI_ALLOC_ALIGN SZ_64K diff --git a/arch/arm64/include/asm/mmu_context.h b/arch/arm64/include/asm/mmu_context.h index a6fb325424e7a990313dc6419a198155e5205b5c..0ddefc286d2644a851dd3d48ad32dd27e37399cb 100644 --- a/arch/arm64/include/asm/mmu_context.h +++ b/arch/arm64/include/asm/mmu_context.h @@ -162,7 +162,11 @@ static inline void cpu_replace_ttbr1(pgd_t *pgdp, pgd_t *idmap) /* phys_to_ttbr() zeros lower 2 bits of ttbr with 52-bit PA */ phys_addr_t ttbr1 = phys_to_ttbr(virt_to_phys(pgdp)); +#ifdef CONFIG_KERNEL_REPLICATION + if (system_supports_cnp() && !WARN_ON(pgdp != init_mm.pgd_numa[numa_node_id()])) { +#else if (system_supports_cnp() && !WARN_ON(pgdp != lm_alias(swapper_pg_dir))) { +#endif /* CONFIG_KERNEL_REPLICATION */ /* * cpu_replace_ttbr1() is used when there's a boot CPU * up (i.e. 
cpufeature framework is not up yet) and diff --git a/arch/arm64/include/asm/numa_replication.h b/arch/arm64/include/asm/numa_replication.h new file mode 100644 index 0000000000000000000000000000000000000000..7b515c7d41981d42af327eb93f52e903234ba49a --- /dev/null +++ b/arch/arm64/include/asm/numa_replication.h @@ -0,0 +1,54 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +#ifndef __ASM_NUMA_REPLICATION_H +#define __ASM_NUMA_REPLICATION_H + +#ifdef CONFIG_KERNEL_REPLICATION +#include +#include +#include +#include +#include +#include +#include + +#define PAGE_TABLE_REPLICATION_LEFT ((max((u64)_end - SZ_2G, (u64)MODULES_VADDR)) & PGDIR_MASK) +#define PAGE_TABLE_REPLICATION_RIGHT ((((u64)_end + SZ_2G) & PGDIR_MASK) + PGDIR_SIZE - 1) + +static inline pgd_t *numa_replicate_pgt_pgd(int nid) +{ + pgd_t *new_pgd; + struct page *pgd_page; + + pgd_page = alloc_pages_node(nid, GFP_PGTABLE_KERNEL, 2); + BUG_ON(pgd_page == NULL); + + new_pgd = (pgd_t *)page_address(pgd_page); + new_pgd += (PAGE_SIZE * 2 / sizeof(pgd_t)); //Extra pages for KPTI + copy_page(new_pgd, swapper_pg_dir); + + return new_pgd; +} + +static inline void numa_load_replicated_pgd(pgd_t *pgd) +{ + cpu_replace_ttbr1(pgd, idmap_pg_dir); + local_flush_tlb_all(); +} + +static inline ssize_t numa_cpu_dump(struct seq_file *m) +{ + seq_printf(m, "NODE: #%02d, CPU: #%04d, ttbr1_el1: 0x%p, COMM: %s\n", + numa_node_id(), + smp_processor_id(), + (void *)read_sysreg(ttbr1_el1), + current->group_leader->comm); + return 0; +} + +static inline void numa_sync_text_replicas(unsigned long start, unsigned long end) +{ + caches_clean_inval_pou(start, end); + icache_inval_all_pou(); +} +#endif /* CONFIG_KERNEL_REPLICATION */ +#endif /* __ASM_NUMA_REPLICATION_H */ diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index d457dd74f534cf814536bc8d9c2c0855bbd41caf..b30e43b84a64e5135ff5d29d6ee22fc9edf86220 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -21,7 +21,11 @@ * VMALLOC_END: extends to the available space below vmemmap, PCI I/O space * and fixed mappings */ +#ifdef CONFIG_KERNEL_REPLICATION +#define VMALLOC_START ((MODULES_END & PGDIR_MASK) + PGDIR_SIZE) +#else /* !CONFIG_KERNEL_REPLICATION */ #define VMALLOC_START (MODULES_END) +#endif /* CONFIG_KERNEL_REPLICATION */ #define VMALLOC_END (VMEMMAP_START - SZ_256M) #define vmemmap ((struct page *)VMEMMAP_START - (memstart_addr >> PAGE_SHIFT)) @@ -537,6 +541,15 @@ static inline pmd_t pmd_mkdevmap(pmd_t pmd) #define pud_pfn(pud) ((__pud_to_phys(pud) & PUD_MASK) >> PAGE_SHIFT) #define pfn_pud(pfn,prot) __pud(__phys_to_pud_val((phys_addr_t)(pfn) << PAGE_SHIFT) | pgprot_val(prot)) +#ifdef CONFIG_KERNEL_REPLICATION +static inline pgprot_t pmd_pgprot(pmd_t pmd) +{ + unsigned long pfn = pmd_pfn(pmd); + + return __pgprot(pmd_val(pfn_pmd(pfn, __pgprot(0))) ^ pmd_val(pmd)); +} +#endif /* CONFIG_KERNEL_REPLICATION */ + static inline void __set_pte_at(struct mm_struct *mm, unsigned long __always_unused addr, pte_t *ptep, pte_t pte, unsigned int nr) diff --git a/arch/arm64/kernel/alternative.c b/arch/arm64/kernel/alternative.c index 8ff6610af49664f01ceef2a5e8f6fae52e861a43..680f5efa285842258eaa727c13ff1aeb0f50a497 100644 --- a/arch/arm64/kernel/alternative.c +++ b/arch/arm64/kernel/alternative.c @@ -10,6 +10,7 @@ #include #include +#include #include #include #include @@ -139,6 +140,30 @@ static noinstr void clean_dcache_range_nopatch(u64 start, u64 end) } while (cur += d_size, cur < end); } +static void __nocfi __write_alternatives(struct alt_instr 
*alt, + alternative_cb_t alt_cb, + __le32 *origptr, __le32 *updptr, + int nr_inst) +{ +#ifdef CONFIG_KERNEL_REPLICATION + if (is_text_replicated() && is_kernel_text((unsigned long)origptr)) { + int nid; + + for_each_memory_node(nid) { + __le32 *ptr = numa_get_replica(origptr, nid); + + alt_cb(alt, origptr, ptr, nr_inst); + clean_dcache_range_nopatch((u64)ptr, + (u64)(ptr + nr_inst)); + } + + return; + } +#endif /* CONFIG_KERNEL_REPLICATION */ + alt_cb(alt, origptr, updptr, nr_inst); +} + + static void __apply_alternatives(const struct alt_region *region, bool is_module, unsigned long *cpucap_mask) @@ -171,7 +196,7 @@ static void __apply_alternatives(const struct alt_region *region, else alt_cb = patch_alternative; - alt_cb(alt, origptr, updptr, nr_inst); + __write_alternatives(alt, alt_cb, origptr, updptr, nr_inst); if (!is_module) { clean_dcache_range_nopatch((u64)origptr, diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index a1736e9044dad67b67158a0c2e4c8340a8ab9974..82477a3603c9bf5072d98ef386ded1a9cdb7723e 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -75,6 +75,7 @@ #include #include #include +#include #include #include @@ -3677,7 +3678,11 @@ subsys_initcall_sync(init_32bit_el0_mask); static void __maybe_unused cpu_enable_cnp(struct arm64_cpu_capabilities const *cap) { +#ifdef CONFIG_KERNEL_REPLICATION + cpu_replace_ttbr1(this_node_pgd(&init_mm), idmap_pg_dir); +#else cpu_replace_ttbr1(lm_alias(swapper_pg_dir), idmap_pg_dir); +#endif /* CONFIG_KERNEL_REPLICATION */ } /* diff --git a/arch/arm64/kernel/hibernate.c b/arch/arm64/kernel/hibernate.c index 02870beb271ed35ccbbb7a3dad71399502974946..bd943537a5796df9dd6d95ca8907db79e675f53a 100644 --- a/arch/arm64/kernel/hibernate.c +++ b/arch/arm64/kernel/hibernate.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include @@ -113,7 +114,11 @@ int arch_hibernation_header_save(void *addr, unsigned int max_size) return -EOVERFLOW; arch_hdr_invariants(&hdr->invariants); +#ifdef CONFIG_KERNEL_REPLICATION + hdr->ttbr1_el1 = virt_to_phys(this_node_pgd(&init_mm)); +#else hdr->ttbr1_el1 = __pa_symbol(swapper_pg_dir); +#endif /* CONFIG_KERNEL_REPLICATION */ hdr->reenter_kernel = _cpu_resume; /* We can't use __hyp_get_vectors() because kvm may still be loaded */ diff --git a/arch/arm64/kernel/module.c b/arch/arm64/kernel/module.c index dd851297596e5e9372bf4f64f640283181e86395..ab17508b0372f2f90e3ce8c383d703b2a3e2cf20 100644 --- a/arch/arm64/kernel/module.c +++ b/arch/arm64/kernel/module.c @@ -110,7 +110,7 @@ static int __init module_init_limits(void) } subsys_initcall(module_init_limits); -void *module_alloc(unsigned long size) +static void *__module_alloc(unsigned long size, unsigned long vm_flags, int nid) { void *p = NULL; @@ -123,7 +123,7 @@ void *module_alloc(unsigned long size) module_direct_base, module_direct_base + SZ_128M, GFP_KERNEL | __GFP_NOWARN, - PAGE_KERNEL, 0, NUMA_NO_NODE, + PAGE_KERNEL, vm_flags, nid, __builtin_return_address(0)); } @@ -132,7 +132,7 @@ void *module_alloc(unsigned long size) module_plt_base, module_plt_base + SZ_2G, GFP_KERNEL | __GFP_NOWARN, - PAGE_KERNEL, 0, NUMA_NO_NODE, + PAGE_KERNEL, vm_flags, nid, __builtin_return_address(0)); } @@ -150,6 +150,36 @@ void *module_alloc(unsigned long size) return kasan_reset_tag(p); } +#ifdef CONFIG_KERNEL_REPLICATION +void *module_alloc(unsigned long size) +{ + return __module_alloc(size, VM_NUMA_SHARED, NUMA_NO_NODE); +} + +void *module_alloc_replica(unsigned long size) +{ + return 
__module_alloc(size, VM_NUMA_SHARED, first_memory_node); +} + +void module_replicate(void *ptr) +{ + gfp_t gfp_mask = GFP_KERNEL; + + __vmalloc_node_replicate_range(ptr, gfp_mask, + PAGE_KERNEL, 0); +} +#else +void *module_alloc(unsigned long size) +{ + return __module_alloc(size, 0, NUMA_NO_NODE); +} + +void *module_alloc_replica(unsigned long size) +{ + return module_alloc(size); +} +#endif /*CONFIG_KERNEL_REPLICATION*/ + enum aarch64_reloc_op { RELOC_OP_NONE, RELOC_OP_ABS, diff --git a/arch/arm64/kernel/patching.c b/arch/arm64/kernel/patching.c index b4835f6d594bc5554f3b99228748bff48b7771b1..b690a93ecaf491b32f8074a860551f4ac633705e 100644 --- a/arch/arm64/kernel/patching.c +++ b/arch/arm64/kernel/patching.c @@ -5,6 +5,7 @@ #include #include #include +#include #include #include @@ -15,6 +16,7 @@ static DEFINE_RAW_SPINLOCK(patch_lock); +#ifndef CONFIG_KERNEL_REPLICATION static bool is_exit_text(unsigned long addr) { /* discarded with init text/data */ @@ -41,10 +43,22 @@ static void __kprobes *patch_map(void *addr, int fixmap) else return addr; + return (void *)set_fixmap_offset(fixmap, page_to_phys(page) + + (uintaddr & ~PAGE_MASK)); +} +#else +static void __kprobes *patch_map(void *addr, int fixmap, int nid) +{ + unsigned long uintaddr = (uintptr_t) addr; + struct page *page; + + page = walk_to_page_node(nid, addr); BUG_ON(!page); + return (void *)set_fixmap_offset(fixmap, page_to_phys(page) + (uintaddr & ~PAGE_MASK)); } +#endif /* CONFIG_KERNEL_REPLICATION */ static void __kprobes patch_unmap(int fixmap) { @@ -66,6 +80,28 @@ int __kprobes aarch64_insn_read(void *addr, u32 *insnp) return ret; } +#ifdef CONFIG_KERNEL_REPLICATION +static int __kprobes __aarch64_insn_write(void *addr, __le32 insn) +{ + int nid; + void *waddr = addr; + unsigned long flags = 0; + int ret; + + raw_spin_lock_irqsave(&patch_lock, flags); + for_each_memory_node(nid) { + waddr = patch_map(addr, FIX_TEXT_POKE0, nid); + ret = copy_to_kernel_nofault(waddr, &insn, AARCH64_INSN_SIZE); + patch_unmap(FIX_TEXT_POKE0); + + if (ret || !is_text_replicated()) + break; + } + raw_spin_unlock_irqrestore(&patch_lock, flags); + + return ret; +} +#else static int __kprobes __aarch64_insn_write(void *addr, __le32 insn) { void *waddr = addr; @@ -82,12 +118,34 @@ static int __kprobes __aarch64_insn_write(void *addr, __le32 insn) return ret; } +#endif /* CONFIG_KERNEL_REPLICATION */ int __kprobes aarch64_insn_write(void *addr, u32 insn) { return __aarch64_insn_write(addr, cpu_to_le32(insn)); } +#ifdef CONFIG_KERNEL_REPLICATION +noinstr int aarch64_insn_write_literal_u64(void *addr, u64 val) +{ + int nid; + u64 *waddr; + unsigned long flags; + int ret; + + raw_spin_lock_irqsave(&patch_lock, flags); + for_each_memory_node(nid) { + waddr = patch_map(addr, FIX_TEXT_POKE0, nid); + + ret = copy_to_kernel_nofault(waddr, &val, sizeof(val)); + + patch_unmap(FIX_TEXT_POKE0); + } + raw_spin_unlock_irqrestore(&patch_lock, flags); + + return ret; +} +#else noinstr int aarch64_insn_write_literal_u64(void *addr, u64 val) { u64 *waddr; @@ -104,6 +162,7 @@ noinstr int aarch64_insn_write_literal_u64(void *addr, u64 val) return ret; } +#endif /* CONFIG_KERNEL_REPLICATION */ int __kprobes aarch64_insn_patch_text_nosync(void *addr, u32 insn) { diff --git a/arch/arm64/kernel/sleep.S b/arch/arm64/kernel/sleep.S index 2aa5129d825377c980517a9b94db63f2020d4d58..b4bcc43393abb5d73f86a4257828e8cc47960838 100644 --- a/arch/arm64/kernel/sleep.S +++ b/arch/arm64/kernel/sleep.S @@ -5,6 +5,10 @@ #include #include +#ifdef CONFIG_KERNEL_REPLICATION +.extern 
numa_setup_pgd +#endif + .text /* * Implementation of MPIDR_EL1 hash algorithm through shifting @@ -147,6 +151,10 @@ SYM_FUNC_START(_cpu_resume) bl kasan_unpoison_task_stack_below #endif +#ifdef CONFIG_KERNEL_REPLICATION + bl numa_setup_pgd +#endif + ldp x19, x20, [x29, #16] ldp x21, x22, [x29, #32] ldp x23, x24, [x29, #48] diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c index 50f6576f1b310fc6556f179337070065d7637eba..2cadcfd15814d35cdac0de90ce4fe9e54d4417a1 100644 --- a/arch/arm64/kernel/smp.c +++ b/arch/arm64/kernel/smp.c @@ -34,6 +34,7 @@ #include #include #include +#include #include #include @@ -209,6 +210,13 @@ asmlinkage notrace void secondary_start_kernel(void) mmgrab(mm); current->active_mm = mm; + /* + * Setup per-NUMA node page table if kernel + * replication is enabled. Option supported + * only for 64-bit mode. + */ + numa_setup_pgd(); + /* * TTBR0 is only used for the identity mapping at this stage. Make it * point to zero page to avoid speculatively fetching new entries. diff --git a/arch/arm64/kernel/suspend.c b/arch/arm64/kernel/suspend.c index 045af2bfd656a067a946288e51431339505df2c4..67969ca05f4a1e3a1810ac32b6b8e3a47f9559e6 100644 --- a/arch/arm64/kernel/suspend.c +++ b/arch/arm64/kernel/suspend.c @@ -5,6 +5,8 @@ #include #include #include +#include + #include #include #include @@ -56,7 +58,11 @@ void notrace __cpu_suspend_exit(void) /* Restore CnP bit in TTBR1_EL1 */ if (system_supports_cnp()) +#ifdef CONFIG_KERNEL_REPLICATION + cpu_replace_ttbr1(this_node_pgd(&init_mm), idmap_pg_dir); +#else cpu_replace_ttbr1(lm_alias(swapper_pg_dir), idmap_pg_dir); +#endif /* CONFIG_KERNEL_REPLICATION */ /* * PSTATE was not saved over suspend/resume, re-enable any detected diff --git a/arch/arm64/kernel/vmlinux.lds.S b/arch/arm64/kernel/vmlinux.lds.S index 3cd7e76cc56266c4ce928b8bb968a3d347c5e511..0c58e121af9c4f45a2d3daba731c0477c243b93e 100644 --- a/arch/arm64/kernel/vmlinux.lds.S +++ b/arch/arm64/kernel/vmlinux.lds.S @@ -170,6 +170,13 @@ SECTIONS _text = .; HEAD_TEXT } +#ifdef CONFIG_KERNEL_REPLICATION +#ifdef CONFIG_ARM64_4K_PAGES + . = ALIGN(PMD_SIZE); +#else + . = ALIGN(CONT_PTE_SIZE); +#endif +#endif .text : ALIGN(SEGMENT_ALIGN) { /* Real text segment */ _stext = .; /* Text and read-only data */ IRQENTRY_TEXT @@ -184,10 +191,25 @@ SECTIONS } . = ALIGN(SEGMENT_ALIGN); +#ifdef CONFIG_KERNEL_REPLICATION +#ifdef CONFIG_ARM64_4K_PAGES + . = ALIGN(PMD_SIZE); +#else + . = ALIGN(CONT_PTE_SIZE); +#endif +#endif _etext = .; /* End of text section */ /* everything from this point to __init_begin will be marked RO NX */ +#ifdef CONFIG_KERNEL_REPLICATION +#ifdef CONFIG_ARM64_4K_PAGES + RO_DATA(PMD_SIZE) +#else + RO_DATA(CONT_PTE_SIZE) +#endif +#else RO_DATA(PAGE_SIZE) +#endif HYPERVISOR_DATA_SECTIONS diff --git a/arch/arm64/mm/context.c b/arch/arm64/mm/context.c index 188197590fc9ce44da04246455571fc9ee47c57c..ae3e3dc0d2fc239779753b9ddf6b36a0f95536cc 100644 --- a/arch/arm64/mm/context.c +++ b/arch/arm64/mm/context.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include @@ -267,7 +268,7 @@ void check_and_switch_context(struct mm_struct *mm) * emulating PAN. 
*/ if (!system_uses_ttbr0_pan()) - cpu_switch_mm(mm->pgd, mm); + cpu_switch_mm(this_node_pgd(mm), mm); } unsigned long arm64_mm_context_get(struct mm_struct *mm) diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c index 66a7fff9f3736f7c3e0fa7cccd1369fcf60e6257..8d955787e030f8860c71cae8852d77b963b6c237 100644 --- a/arch/arm64/mm/init.c +++ b/arch/arm64/mm/init.c @@ -597,6 +597,47 @@ void __init bootmem_init(void) memblock_dump_all(); } +#ifdef CONFIG_KERNEL_REPLICATION +/* + * It is necessary to preallocate vmalloc pages in advance, + * otherwise the replicated page-tables can be incomplete. + */ +void __init preallocate_vmalloc_pages(void) +{ + unsigned long addr; + + for (addr = MODULES_VADDR; addr <= VMALLOC_END; + addr = ALIGN(addr + 1, PGDIR_SIZE)) { + pgd_t *pgd = pgd_offset_k(addr); + p4d_t *p4d; + pud_t *pud; + pmd_t *pmd; + int pte; + + p4d = p4d_alloc(&init_mm, pgd, addr); + /* + * No need to check p4d here due to + * only 4-stage page table is possible + */ + pud = pud_alloc(&init_mm, p4d, addr); + if (!pud) + panic("Failed to pre-allocate pud pages for vmalloc area\n"); + if (!mm_pud_folded(&init_mm)) + continue; + + pmd = pmd_alloc(&init_mm, pud, addr); + if (!pmd) + panic("Failed to pre-allocate pmd pages for vmalloc area\n"); + if (!mm_pmd_folded(&init_mm)) + continue; + + pte = pte_alloc(&init_mm, pmd); + if (pte) + panic("Failed to pre-allocate pte pages for vmalloc area\n"); + } +} +#endif /* CONFIG_KERNEL_REPLICATION */ + /* * mem_init() marks the free areas in the mem_map and tells us how much memory * is free. This is done after various parts of the system have claimed their @@ -651,7 +692,15 @@ void free_initmem(void) * prevents the region from being reused for kernel modules, which * is not supported by kallsyms. */ +#ifdef CONFIG_KERNEL_REPLICATION + /* + * In case of replicated kernel the per-NUMA node vmalloc + * memory should be released. + */ + vunmap_range_replicas((u64)__init_begin, (u64)__init_end); +#else vunmap_range((u64)__init_begin, (u64)__init_end); +#endif /* CONFIG_KERNEL_REPLICATION */ } void dump_mem_limit(void) diff --git a/arch/arm64/mm/kasan_init.c b/arch/arm64/mm/kasan_init.c index 28856f511fb638189e8392fff89c2f8e29426ab2..dc783796c60a4bc95cc63d86d5d62e8ce581f6e1 100644 --- a/arch/arm64/mm/kasan_init.c +++ b/arch/arm64/mm/kasan_init.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include @@ -59,6 +60,24 @@ static phys_addr_t __init kasan_alloc_raw_page(int node) return __pa(p); } +static void __init __kasan_pmd_populate(pmd_t *pmdp, phys_addr_t pte_phys, unsigned long addr) +{ +#ifdef CONFIG_KERNEL_REPLICATION + if (get_propagation_level() == PMD_PROPAGATION) { + int nid; + pmd_t *target; + + for_each_memory_node(nid) { + target = (pmd_t *)pgd_offset_pgd(per_node_pgd(&init_mm, nid), addr); + __pmd_populate(target, pte_phys, PMD_TYPE_TABLE); + } + } else + __pmd_populate(pmdp, pte_phys, PMD_TYPE_TABLE); +#else + __pmd_populate(pmdp, pte_phys, PMD_TYPE_TABLE); +#endif /* CONFIG_KERNEL_REPLICATION */ +} + static pte_t *__init kasan_pte_offset(pmd_t *pmdp, unsigned long addr, int node, bool early) { @@ -66,13 +85,31 @@ static pte_t *__init kasan_pte_offset(pmd_t *pmdp, unsigned long addr, int node, phys_addr_t pte_phys = early ? __pa_symbol(kasan_early_shadow_pte) : kasan_alloc_zeroed_page(node); - __pmd_populate(pmdp, pte_phys, PMD_TYPE_TABLE); + __kasan_pmd_populate(pmdp, pte_phys, PMD_TYPE_TABLE); } return early ? 
pte_offset_kimg(pmdp, addr) : pte_offset_kernel(pmdp, addr); } +static void __init __kasan_pud_populate(pud_t *pudp, phys_addr_t pmd_phys, unsigned long addr) +{ +#ifdef CONFIG_KERNEL_REPLICATION + if (get_propagation_level() == PUD_PROPAGATION) { + int nid; + pud_t *target; + + for_each_memory_node(nid) { + target = (pud_t *)pgd_offset_pgd(per_node_pgd(&init_mm, nid), addr); + __pud_populate(target, pmd_phys, PMD_TYPE_TABLE); + } + } else + __pud_populate(pudp, pmd_phys, PMD_TYPE_TABLE); +#else + __pud_populate(pudp, pmd_phys, PMD_TYPE_TABLE); +#endif /* CONFIG_KERNEL_REPLICATION */ +} + static pmd_t *__init kasan_pmd_offset(pud_t *pudp, unsigned long addr, int node, bool early) { @@ -80,12 +117,30 @@ static pmd_t *__init kasan_pmd_offset(pud_t *pudp, unsigned long addr, int node, phys_addr_t pmd_phys = early ? __pa_symbol(kasan_early_shadow_pmd) : kasan_alloc_zeroed_page(node); - __pud_populate(pudp, pmd_phys, PUD_TYPE_TABLE); + __kasan_pud_populate(pudp, pmd_phys, PUD_TYPE_TABLE); } return early ? pmd_offset_kimg(pudp, addr) : pmd_offset(pudp, addr); } +static void __init __kasan_p4d_populate(p4d_t *p4dp, phys_addr_t pud_phys, unsigned long addr) +{ +#ifdef CONFIG_KERNEL_REPLICATION + if (get_propagation_level() == P4D_PROPAGATION) { + int nid; + p4d_t *target; + + for_each_memory_node(nid) { + target = (p4d_t *)pgd_offset_pgd(per_node_pgd(&init_mm, nid), addr); + __p4d_populate(target, pud_phys, PMD_TYPE_TABLE); + } + } else + __p4d_populate(p4dp, pud_phys, PMD_TYPE_TABLE); +#else + __p4d_populate(p4dp, pud_phys, PMD_TYPE_TABLE); +#endif /* CONFIG_KERNEL_REPLICATION */ +} + static pud_t *__init kasan_pud_offset(p4d_t *p4dp, unsigned long addr, int node, bool early) { @@ -93,7 +148,7 @@ static pud_t *__init kasan_pud_offset(p4d_t *p4dp, unsigned long addr, int node, phys_addr_t pud_phys = early ? __pa_symbol(kasan_early_shadow_pud) : kasan_alloc_zeroed_page(node); - __p4d_populate(p4dp, pud_phys, P4D_TYPE_TABLE); + __kasan_p4d_populate(p4dp, pud_phys, addr); } return early ? 
pud_offset_kimg(p4dp, addr) : pud_offset(p4dp, addr); @@ -245,7 +300,14 @@ static void __init kasan_init_shadow(void) kasan_populate_early_shadow(kasan_mem_to_shadow((void *)PAGE_END), (void *)mod_shadow_start); +#ifdef CONFIG_KERNEL_REPLICATION + /* + * If Kernel replication is enabled, + * VMALLOC_START != MODULES_END + */ +#else BUILD_BUG_ON(VMALLOC_START != MODULES_END); +#endif /* CONFIG_KERNEL_REPLICATION */ kasan_populate_early_shadow((void *)vmalloc_shadow_end, (void *)KASAN_SHADOW_END); diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index c846cc54e9cea6f786044ef12af97daf80067d78..f19d8b8ab382918b90bf10af55d64168e30d7e9b 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -25,6 +25,7 @@ #include #include #include +#include #include #include @@ -477,6 +478,23 @@ void __init create_pgd_mapping(struct mm_struct *mm, phys_addr_t phys, pgd_pgtable_alloc, flags); } +static void populate_mappings_prot(phys_addr_t phys, unsigned long virt, + phys_addr_t size, pgprot_t prot) +{ +#ifdef CONFIG_KERNEL_REPLICATION + int nid; + + for_each_memory_node(nid) { + __create_pgd_mapping(per_node_pgd(&init_mm, nid), + page_to_phys(walk_to_page_node(nid, (void *)virt)), + virt, size, prot, NULL, NO_CONT_MAPPINGS); + } +#else + __create_pgd_mapping(init_mm.pgd, phys, virt, size, prot, NULL, + NO_CONT_MAPPINGS); +#endif /* CONFIG_KERNEL_REPLICATION */ +} + static void update_mapping_prot(phys_addr_t phys, unsigned long virt, phys_addr_t size, pgprot_t prot) { @@ -486,8 +504,7 @@ static void update_mapping_prot(phys_addr_t phys, unsigned long virt, return; } - __create_pgd_mapping(init_mm.pgd, phys, virt, size, prot, NULL, - NO_CONT_MAPPINGS); + populate_mappings_prot(phys, virt, size, prot); /* flush the TLBs after updating live kernel mappings */ flush_tlb_kernel_range(virt, virt + size); @@ -676,6 +693,22 @@ static pgprot_t kernel_exec_prot(void) } #ifdef CONFIG_UNMAP_KERNEL_AT_EL0 + +#ifdef CONFIG_KERNEL_REPLICATION +static void __init populate_trampoline_mappings(void) +{ + int nid; + + /* Copy trampoline mappings in replicated tables */ + for_each_memory_node(nid) { + memcpy(per_node_pgd(&init_mm, nid) - (PAGE_SIZE * 2 / sizeof(pgd_t)), + tramp_pg_dir, PGD_SIZE); + } + /* Be sure that replicated page table can be observed properly */ + dsb(ishst); +} +#endif /* CONFIG_KERNEL_REPLICATION */ + static int __init map_entry_trampoline(void) { int i; @@ -701,6 +734,10 @@ static int __init map_entry_trampoline(void) __set_fixmap(FIX_ENTRY_TRAMP_TEXT1 - i, pa_start + i * PAGE_SIZE, PAGE_KERNEL_RO); +#ifdef CONFIG_KERNEL_REPLICATION + populate_trampoline_mappings(); +#endif /* CONFIG_KERNEL_REPLICATION */ + return 0; } core_initcall(map_entry_trampoline); diff --git a/arch/arm64/mm/pageattr.c b/arch/arm64/mm/pageattr.c index 06e81d1dbc1e968d89e46e55728f744441fc053b..d2e6cc2dc301a4b71b9a489c1e73cc1fdcffdab6 100644 --- a/arch/arm64/mm/pageattr.c +++ b/arch/arm64/mm/pageattr.c @@ -64,6 +64,24 @@ static int __change_memory_common(unsigned long start, unsigned long size, return ret; } +#ifdef CONFIG_KERNEL_REPLICATION +static int __change_memory_common_replicas(unsigned long start, unsigned long size, + pgprot_t set_mask, pgprot_t clear_mask) +{ + struct page_change_data data; + int ret; + + data.set_mask = set_mask; + data.clear_mask = clear_mask; + + ret = apply_to_page_range_replicas(&init_mm, start, size, + change_page_range, &data); + + flush_tlb_kernel_range(start, start + size); + return ret; +} +#endif /* CONFIG_KERNEL_REPLICATION */ + static int change_memory_common(unsigned long addr, 
int numpages, pgprot_t set_mask, pgprot_t clear_mask) { @@ -122,6 +140,20 @@ static int change_memory_common(unsigned long addr, int numpages, return __change_memory_common(start, size, set_mask, clear_mask); } +#ifdef CONFIG_KERNEL_REPLICATION +static int numa_change_memory_common(unsigned long addr, int numpages, + pgprot_t set_mask, pgprot_t clear_mask) +{ + int ret; + + ret = change_memory_common(addr, numpages, set_mask, clear_mask); + if (ret) + return ret; + + return __change_memory_common_replicas(addr, numpages * PAGE_SIZE, set_mask, clear_mask); +} +#endif /* CONFIG_KERNEL_REPLICATION */ + int set_memory_ro(unsigned long addr, int numpages) { return change_memory_common(addr, numpages, @@ -150,6 +182,36 @@ int set_memory_x(unsigned long addr, int numpages) __pgprot(PTE_PXN)); } +#ifdef CONFIG_KERNEL_REPLICATION +int numa_set_memory_x(unsigned long addr, int numpages) +{ + return numa_change_memory_common(addr, numpages, + __pgprot(PTE_MAYBE_GP), + __pgprot(PTE_PXN)); +} + +int numa_set_memory_nx(unsigned long addr, int numpages) +{ + return numa_change_memory_common(addr, numpages, + __pgprot(PTE_PXN), + __pgprot(PTE_MAYBE_GP)); +} + +int numa_set_memory_ro(unsigned long addr, int numpages) +{ + return numa_change_memory_common(addr, numpages, + __pgprot(PTE_RDONLY), + __pgprot(PTE_WRITE)); +} + +int numa_set_memory_rw(unsigned long addr, int numpages) +{ + return numa_change_memory_common(addr, numpages, + __pgprot(PTE_WRITE), + __pgprot(PTE_RDONLY)); +} +#endif /*CONFIG_KERNEL_REPLICATION*/ + int set_memory_valid(unsigned long addr, int numpages, int enable) { if (enable) diff --git a/arch/arm64/mm/pgd.c b/arch/arm64/mm/pgd.c index 4a64089e5771c1e2fd06448fe6c0edfb7f5ab635..adf642eba4e7f1f9ef2d1c32b8539b7928dc037f 100644 --- a/arch/arm64/mm/pgd.c +++ b/arch/arm64/mm/pgd.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #include @@ -17,6 +18,81 @@ static struct kmem_cache *pgd_cache __ro_after_init; +#ifdef CONFIG_KERNEL_REPLICATION +pgd_t *page_pgd_alloc(struct mm_struct *mm) +{ + int nid; + gfp_t gfp = GFP_PGTABLE_USER | __GFP_THISNODE; + /* + * Kernel replication is not supproted in case of non-page size pgd, + * in general we can support it, but maybe later, due to we need to + * update page tables allocation significantly, so, let's panic here. + */ + for_each_memory_node(nid) { + struct page *page; + + page = alloc_pages_node(nid, gfp, 0); + if (!page) + goto fail; + + WARN_ON_ONCE(page_to_nid(page) != nid); + + per_node_pgd(mm, nid) = (pgd_t *)page_address(page); + } + + for_each_online_node(nid) + per_node_pgd(mm, nid) = per_node_pgd(mm, numa_get_memory_node(nid)); + + mm->pgd = per_node_pgd(mm, numa_get_memory_node(0));/*!!!*/ + + return mm->pgd; + +fail: + pgd_free(mm, mm->pgd); + + return NULL; +} + +pgd_t *pgd_alloc(struct mm_struct *mm) +{ + pgd_t **pgd_numa = (pgd_t **)kmalloc(sizeof(pgd_t *) * MAX_NUMNODES, GFP_PGTABLE_KERNEL); + + if (!pgd_numa) + return NULL; + + mm->pgd_numa = pgd_numa; + + return page_pgd_alloc(mm); +} + +static void page_pgd_free(struct mm_struct *mm, pgd_t *pgd) +{ + int nid; + /* + * Kernel replication is not supproted in case of non-page size pgd, + * in general we can support it, but maybe later, due to we need to + * update page tables allocation significantly, so, let's panic here. 
+ */ + for_each_memory_node(nid) { + if (per_node_pgd(mm, nid) == NULL) + break; + WARN_ON_ONCE(page_to_nid(virt_to_page(per_node_pgd(mm, nid))) != nid); + free_page((unsigned long)per_node_pgd(mm, nid)); + } + + for_each_online_node(nid) + per_node_pgd(mm, nid) = NULL; + +} + +void pgd_free(struct mm_struct *mm, pgd_t *pgd) +{ + page_pgd_free(mm, pgd); + + kfree(mm->pgd_numa); +} + +#else /* !CONFIG_KERNEL_REPLICATION */ pgd_t *pgd_alloc(struct mm_struct *mm) { gfp_t gfp = GFP_PGTABLE_USER; @@ -34,6 +110,7 @@ void pgd_free(struct mm_struct *mm, pgd_t *pgd) else kmem_cache_free(pgd_cache, pgd); } +#endif /* CONFIG_KERNEL_REPLICATION */ void __init pgtable_cache_init(void) { diff --git a/arch/arm64/mm/ptdump.c b/arch/arm64/mm/ptdump.c index e305b6593c4e234c7e1e24d3fcf9872e54089967..833e753671c26345829b1e79f9d581dae1c5f663 100644 --- a/arch/arm64/mm/ptdump.c +++ b/arch/arm64/mm/ptdump.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include @@ -345,7 +346,7 @@ static struct ptdump_info kernel_ptdump_info = { .base_addr = PAGE_OFFSET, }; -void ptdump_check_wx(void) +static void ptdump_check_wx_pgd(struct mm_struct *mm, pgd_t *pgd) { struct pg_state st = { .seq = NULL, @@ -364,7 +365,7 @@ void ptdump_check_wx(void) } }; - ptdump_walk_pgd(&st.ptdump, &init_mm, NULL); + ptdump_walk_pgd(&st.ptdump, mm, pgd); if (st.wx_pages || st.uxn_pages) pr_warn("Checked W+X mappings: FAILED, %lu W+X pages found, %lu non-UXN pages found\n", @@ -373,6 +374,18 @@ void ptdump_check_wx(void) pr_info("Checked W+X mappings: passed, no W+X pages found\n"); } +void ptdump_check_wx(void) +{ +#ifdef CONFIG_KERNEL_REPLICATION + int nid; + + for_each_memory_node(nid) + ptdump_check_wx_pgd(&init_mm, per_node_pgd(&init_mm, nid)); +#else + ptdump_check_wx_pgd(&init_mm, init_mm->pgd); +#endif +} + static int __init ptdump_init(void) { address_markers[PAGE_END_NR].start_address = PAGE_END; diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c index 76ae4a3131babc7f2a1cde6b41964c137389d0e7..b5b1ffb4a919c3697ea3323a033c98aa180314db 100644 --- a/arch/arm64/net/bpf_jit_comp.c +++ b/arch/arm64/net/bpf_jit_comp.c @@ -2251,10 +2251,10 @@ int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type poke_type, /* non-zero plt_target indicates we're patching a bpf prog, * which is read only. 
*/ - if (set_memory_rw(PAGE_MASK & ((uintptr_t)&plt->target), 1)) + if (numa_set_memory_rw(PAGE_MASK & ((uintptr_t)&plt->target), 1)) return -EFAULT; WRITE_ONCE(plt->target, plt_target); - set_memory_ro(PAGE_MASK & ((uintptr_t)&plt->target), 1); + numa_set_memory_ro(PAGE_MASK & ((uintptr_t)&plt->target), 1); /* since plt target points to either the new trampoline * or dummy_tramp, even if another CPU reads the old plt * target value before fetching the bl instruction to plt, diff --git a/drivers/firmware/efi/arm-runtime.c b/drivers/firmware/efi/arm-runtime.c index 83092d93f36a63087ffbd8b6460d38a824e9cbb1..728fdb6bb52d54ede782c2f96508fb008edf0cbd 100644 --- a/drivers/firmware/efi/arm-runtime.c +++ b/drivers/firmware/efi/arm-runtime.c @@ -19,6 +19,7 @@ #include #include #include +#include #include #include @@ -49,6 +50,18 @@ device_initcall(ptdump_init); #endif +#ifdef CONFIG_KERNEL_REPLICATION +static void populate_efi_pgd(struct mm_struct *efi_mm) +{ + int nid; + + for_each_memory_node(nid) + memcpy(per_node_pgd(efi_mm, nid), efi_mm->pgd, PGD_SIZE); + dsb(ishst); + isb(); +} +#endif /* CONFIG_KERNEL_REPLICATION */ + static bool __init efi_virtmap_init(void) { efi_memory_desc_t *md; @@ -73,7 +86,9 @@ static bool __init efi_virtmap_init(void) return false; } } - +#ifdef CONFIG_KERNEL_REPLICATION + populate_efi_pgd(&efi_mm); +#endif if (efi_memattr_apply_permissions(&efi_mm, efi_set_mapping_permissions)) return false; diff --git a/include/asm-generic/pgalloc.h b/include/asm-generic/pgalloc.h index c75d4a753849398f3c0bae8d60104c7ba3849bd0..a8b7b343a4ed35ea407d33237f9c778dc7f05412 100644 --- a/include/asm-generic/pgalloc.h +++ b/include/asm-generic/pgalloc.h @@ -76,6 +76,24 @@ static inline pgtable_t __pte_alloc_one(struct mm_struct *mm, gfp_t gfp) return ptdesc_page(ptdesc); } +#ifdef CONFIG_KERNEL_REPLICATION +static inline pgtable_t __pte_alloc_one_node(unsigned int nid, + struct mm_struct *mm, gfp_t gfp) +{ + struct page *pte; + + pte = alloc_pages_node(nid, gfp, 0); + if (!pte) + return NULL; + if (!pagetable_pte_ctor(page_ptdesc(pte))) { + __free_page(pte); + return NULL; + } + + return pte; +} +#endif + #ifndef __HAVE_ARCH_PTE_ALLOC_ONE /** * pte_alloc_one - allocate a page for PTE-level user page table @@ -89,6 +107,15 @@ static inline pgtable_t pte_alloc_one(struct mm_struct *mm) { return __pte_alloc_one(mm, GFP_PGTABLE_USER); } + +#ifdef CONFIG_KERNEL_REPLICATION +static inline pgtable_t pte_alloc_one_node(unsigned int nid, + struct mm_struct *mm) +{ + return __pte_alloc_one_node(nid, mm, GFP_PGTABLE_USER | __GFP_THISNODE); +} +#endif + #endif /* @@ -140,6 +167,30 @@ static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr) } return ptdesc_address(ptdesc); } + +#ifdef CONFIG_KERNEL_REPLICATION +static inline pmd_t *pmd_alloc_one_node(unsigned int nid, + struct mm_struct *mm, + unsigned long addr) +{ + struct ptdesc *ptdesc; + gfp_t gfp = GFP_PGTABLE_USER; + + if (mm == &init_mm) + gfp = GFP_PGTABLE_KERNEL; + + gfp |= __GFP_THISNODE; + + ptdesc = pagetable_alloc_node(nid, gfp, 0); + if (!ptdesc) + return NULL; + if (!pagetable_pmd_ctor(ptdesc)) { + pagetable_free(ptdesc); + return NULL; + } + return ptdesc_address(ptdesc); +} +#endif /* CONFIG_KERNEL_REPLICATION */ #endif #ifndef __HAVE_ARCH_PMD_FREE @@ -172,6 +223,25 @@ static inline pud_t *__pud_alloc_one(struct mm_struct *mm, unsigned long addr) return ptdesc_address(ptdesc); } +#ifdef CONFIG_KERNEL_REPLICATION +static inline pud_t *__pud_alloc_one_node(unsigned int nid, + struct mm_struct *mm, + unsigned long 
addr) +{ + gfp_t gfp = GFP_PGTABLE_USER; + struct ptdesc *ptdesc; + + if (mm == &init_mm) + gfp = GFP_PGTABLE_KERNEL; + + gfp |= __GFP_THISNODE; + ptdesc = pagetable_alloc_node(nid, gfp, 0); + if (!ptdesc) + return NULL; + return ptdesc_address(ptdesc); +} +#endif /* CONFIG_KERNEL_REPLICATION */ + #ifndef __HAVE_ARCH_PUD_ALLOC_ONE /** * pud_alloc_one - allocate memory for a PUD-level page table @@ -186,6 +256,14 @@ static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr) { return __pud_alloc_one(mm, addr); } + +#ifdef CONFIG_KERNEL_REPLICATION +static inline pud_t *pud_alloc_one_node(unsigned int nid, + struct mm_struct *mm, unsigned long addr) +{ + return __pud_alloc_one_node(nid, mm, addr); +} +#endif /* CONFIG_KERNEL_REPLICATION */ #endif static inline void __pud_free(struct mm_struct *mm, pud_t *pud) diff --git a/include/asm-generic/pgtable-nop4d.h b/include/asm-generic/pgtable-nop4d.h index 03b7dae47dd43013ca78c0fd080d5fad50640bda..183fb002eae26efd7da582f77647ca00807a5292 100644 --- a/include/asm-generic/pgtable-nop4d.h +++ b/include/asm-generic/pgtable-nop4d.h @@ -48,6 +48,11 @@ static inline p4d_t *p4d_offset(pgd_t *pgd, unsigned long address) * inside the pgd, so has no extra memory associated with it. */ #define p4d_alloc_one(mm, address) NULL + +#ifdef CONFIG_KERNEL_REPLICATION +#define p4d_alloc_one_node(nid, mm, address) NULL +#endif + #define p4d_free(mm, x) do { } while (0) #define p4d_free_tlb(tlb, x, a) do { } while (0) diff --git a/include/asm-generic/pgtable-nopmd.h b/include/asm-generic/pgtable-nopmd.h index 8ffd64e7a24cbb39a671b535122231db6c12abe0..2cc2d949196ef9d9ee0b9e92a02eee79cfd2f42f 100644 --- a/include/asm-generic/pgtable-nopmd.h +++ b/include/asm-generic/pgtable-nopmd.h @@ -60,6 +60,11 @@ static inline pmd_t * pmd_offset(pud_t * pud, unsigned long address) * inside the pud, so has no extra memory associated with it. */ #define pmd_alloc_one(mm, address) NULL + +#ifdef CONFIG_KERNEL_REPLICATION +#define pmd_alloc_one_node(nid, mm, address) NULL +#endif + static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd) { } diff --git a/include/asm-generic/pgtable-nopud.h b/include/asm-generic/pgtable-nopud.h index eb70c6d7ceff2eb9eb935d49dcf626d4d6110ec9..d401a4ffc784494162725c2343d431b314e2963c 100644 --- a/include/asm-generic/pgtable-nopud.h +++ b/include/asm-generic/pgtable-nopud.h @@ -56,6 +56,11 @@ static inline pud_t *pud_offset(p4d_t *p4d, unsigned long address) * inside the p4d, so has no extra memory associated with it. 
*/ #define pud_alloc_one(mm, address) NULL + +#ifdef CONFIG_KERNEL_REPLICATION +#define pud_alloc_one_node(nid, mm, address) NULL +#endif + #define pud_free(mm, x) do { } while (0) #define pud_free_tlb(tlb, x, a) do { } while (0) diff --git a/include/linux/gfp.h b/include/linux/gfp.h index b2d4f45a866b96b19aff3ae5800f0a482b2adc50..068d4194140180d6d67de276e26d15a79b7c7b68 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -303,6 +303,11 @@ static inline struct page *alloc_page_vma(gfp_t gfp, extern unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order); extern unsigned long get_zeroed_page(gfp_t gfp_mask); +#ifdef CONFIG_KERNEL_REPLICATION +extern unsigned long __get_free_pages_node(unsigned int nid, gfp_t gfp_mask, unsigned int order); +extern unsigned long get_zeroed_page_node(unsigned int nid, gfp_t gfp_mask); +#endif /* CONFIG_KERNEL_REPLICATION */ + void *alloc_pages_exact(size_t size, gfp_t gfp_mask) __alloc_size(1); void free_pages_exact(void *virt, size_t size); __meminit void *alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask) __alloc_size(2); diff --git a/include/linux/mm.h b/include/linux/mm.h index 2e6ef9532fc3f78438c60dce7ee322311c0565fa..f706eed1a8b53dcfc7ad4cb8d461ba2abdfd75a3 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1174,6 +1174,8 @@ int region_intersects(resource_size_t offset, size_t size, unsigned long flags, struct page *vmalloc_to_page(const void *addr); unsigned long vmalloc_to_pfn(const void *addr); +struct page *walk_to_page_node(int nid, const void *addr); + /* * Determine if an address is within the vmalloc range * @@ -2795,8 +2797,24 @@ static inline int __p4d_alloc(struct mm_struct *mm, pgd_t *pgd, { return 0; } -#else + +#ifdef CONFIG_KERNEL_REPLICATION +static inline int __p4d_alloc_node(unsigned int nid, + struct mm_struct *mm, + pgd_t *pgd, unsigned long address) +{ + return 0; +} +#endif + +#else /* !__PAGETABLE_P4D_FOLDED */ int __p4d_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address); + +#ifdef CONFIG_KERNEL_REPLICATION +int __p4d_alloc_node(unsigned int nid, struct mm_struct *mm, + pgd_t *pgd, unsigned long address); +#endif + #endif #if defined(__PAGETABLE_PUD_FOLDED) || !defined(CONFIG_MMU) @@ -2805,12 +2823,27 @@ static inline int __pud_alloc(struct mm_struct *mm, p4d_t *p4d, { return 0; } + +#ifdef CONFIG_KERNEL_REPLICATION +static inline int __pud_alloc_node(unsigned int nid, + struct mm_struct *mm, + p4d_t *p4d, unsigned long address) +{ + return 0; +} +#endif /* CONFIG_KERNEL_REPLICATION */ + static inline void mm_inc_nr_puds(struct mm_struct *mm) {} static inline void mm_dec_nr_puds(struct mm_struct *mm) {} #else int __pud_alloc(struct mm_struct *mm, p4d_t *p4d, unsigned long address); +#ifdef CONFIG_KERNEL_REPLICATION +int __pud_alloc_node(unsigned int nid, + struct mm_struct *mm, + p4d_t *p4d, unsigned long address); +#endif /* CONFIG_KERNEL_REPLICATION */ static inline void mm_inc_nr_puds(struct mm_struct *mm) { if (mm_pud_folded(mm)) @@ -2833,12 +2866,27 @@ static inline int __pmd_alloc(struct mm_struct *mm, pud_t *pud, return 0; } +#ifdef CONFIG_KERNEL_REPLICATION +static inline int __pmd_alloc_node(unsigned int nid, + struct mm_struct *mm, + pud_t *pud, unsigned long address) +{ + return 0; +} +#endif /* CONFIG_KERNEL_REPLICATION */ + static inline void mm_inc_nr_pmds(struct mm_struct *mm) {} static inline void mm_dec_nr_pmds(struct mm_struct *mm) {} #else int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address); +#ifdef CONFIG_KERNEL_REPLICATION +int 
__pmd_alloc_node(unsigned int nid, + struct mm_struct *mm, + pud_t *pud, unsigned long address); +#endif /* CONFIG_KERNEL_REPLICATION */ + static inline void mm_inc_nr_pmds(struct mm_struct *mm) { if (mm_pmd_folded(mm)) @@ -2910,6 +2958,32 @@ static inline pmd_t *pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long a return (unlikely(pud_none(*pud)) && __pmd_alloc(mm, pud, address))? NULL: pmd_offset(pud, address); } + +#ifdef CONFIG_KERNEL_REPLICATION +static inline p4d_t *p4d_alloc_node(unsigned int nid, + struct mm_struct *mm, + pgd_t *pgd, unsigned long address) +{ + return (unlikely(pgd_none(*pgd)) && __p4d_alloc_node(nid, mm, pgd, address)) ? + NULL : p4d_offset(pgd, address); +} + +static inline pud_t *pud_alloc_node(unsigned int nid, + struct mm_struct *mm, + p4d_t *p4d, unsigned long address) +{ + return (unlikely(p4d_none(*p4d)) && __pud_alloc_node(nid, mm, p4d, address)) ? + NULL : pud_offset(p4d, address); +} + +static inline pmd_t *pmd_alloc_node(unsigned int nid, + struct mm_struct *mm, + pud_t *pud, unsigned long address) +{ + return (unlikely(pud_none(*pud)) && __pmd_alloc_node(nid, mm, pud, address)) ? + NULL : pmd_offset(pud, address); +} +#endif /* CONFIG_KERNEL_REPLICATION */ #endif /* CONFIG_MMU */ static inline struct ptdesc *virt_to_ptdesc(const void *x) @@ -2949,6 +3023,14 @@ static inline struct ptdesc *pagetable_alloc(gfp_t gfp, unsigned int order) return page_ptdesc(page); } +static inline struct ptdesc *pagetable_alloc_node(int nid, gfp_t gfp, + unsigned int order) +{ + struct page *page = alloc_pages_node(nid, gfp | __GFP_COMP, order); + + return page_ptdesc(page); +} + /** * pagetable_free - Free pagetables * @pt: The page table descriptor @@ -3281,6 +3363,9 @@ extern int __meminit early_pfn_to_nid(unsigned long pfn); extern void set_dma_reserve(unsigned long new_dma_reserve); extern void mem_init(void); +#ifdef CONFIG_KERNEL_REPLICATION +extern void preallocate_vmalloc_pages(void); +#endif extern void __init mmap_init(void); extern void __show_mem(unsigned int flags, nodemask_t *nodemask, int max_zone_idx); @@ -3707,6 +3792,10 @@ extern int apply_to_page_range(struct mm_struct *mm, unsigned long address, extern int apply_to_existing_page_range(struct mm_struct *mm, unsigned long address, unsigned long size, pte_fn_t fn, void *data); +#ifdef CONFIG_KERNEL_REPLICATION +int apply_to_page_range_replicas(struct mm_struct *mm, unsigned long addr, + unsigned long size, pte_fn_t fn, void *data); +#endif /* CONFIG_KERNEL_REPLICATION && CONFIG_ARM64 */ #ifdef CONFIG_PAGE_POISONING extern void __kernel_poison_pages(struct page *page, int numpages); diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 4b9a8723d3eb388f05eb125ac79cbc59896bbe3d..6a0015a55211c80e5bb57911567474820f9c4b40 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -981,7 +981,11 @@ struct mm_struct { #endif } __randomize_layout; +#ifdef CONFIG_KERNEL_REPLICATION + KABI_USE(1, pgd_t **pgd_numa) +#else KABI_RESERVE(1) +#endif KABI_RESERVE(2) KABI_RESERVE(3) KABI_RESERVE(4) diff --git a/include/linux/moduleloader.h b/include/linux/moduleloader.h index 6683ad87a0ff40191bec80e426058f9ba70f98f7..c2f5d2e33f2c73c2091557fb3065a9ad53d6cc1c 100644 --- a/include/linux/moduleloader.h +++ b/include/linux/moduleloader.h @@ -29,6 +29,12 @@ unsigned int arch_mod_section_prepend(struct module *mod, unsigned int section); sections. Returns NULL on failure. 
*/ void *module_alloc(unsigned long size); +#ifdef CONFIG_KERNEL_REPLICATION +void *module_alloc_replica(unsigned long size); +/* Replicate memory allocated in previous function*/ +void module_replicate(void *ptr); +#endif /* CONFIG_KERNEL_REPLICATION */ + /* Free memory returned from module_alloc. */ void module_memfree(void *module_region); diff --git a/include/linux/numa_kernel_replication.h b/include/linux/numa_kernel_replication.h new file mode 100644 index 0000000000000000000000000000000000000000..ee1ab0f111c7ad419cc2d1abcd1204f46f001800 --- /dev/null +++ b/include/linux/numa_kernel_replication.h @@ -0,0 +1,198 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +#ifndef _LINUX_NUMA_REPLICATION_H +#define _LINUX_NUMA_REPLICATION_H + +#ifdef CONFIG_KERNEL_REPLICATION + +#include + +/* + * Why? Because linux is defined to 1 for some reason, + * and linux/mm.h converted to 1/mm.h. Perhaps compiler? + * Do not ask me, I have no idea. + */ +#if defined(linux) +#define tmp_linux_value linux +#undef linux +#endif + +#include KABI_HIDE_INCLUDE() +#include KABI_HIDE_INCLUDE() +#include KABI_HIDE_INCLUDE() +#include KABI_HIDE_INCLUDE() +#include KABI_HIDE_INCLUDE() + +#if defined(tmp_linux_value) +#define linux tmp_linux_value +#undef tmp_linux_value +#endif + +typedef enum { + NONE = 0, + PMD_PROPAGATION = 1, + PUD_PROPAGATION = 2, + P4D_PROPAGATION = 3, + PGD_PROPAGATION = 4 +} propagation_level_t; + +extern nodemask_t replica_nodes; + +#define for_each_memory_node(nid) \ + for (nid = first_node(replica_nodes); \ + nid != MAX_NUMNODES; \ + nid = next_node(nid, replica_nodes)) + +#define this_node_pgd(mm) ((mm)->pgd_numa[numa_node_id()]) +#define per_node_pgd(mm, nid) ((mm)->pgd_numa[nid]) + +static inline bool numa_addr_has_replica(const void *addr) +{ + return ((unsigned long)addr >= PAGE_TABLE_REPLICATION_LEFT) && + ((unsigned long)addr <= PAGE_TABLE_REPLICATION_RIGHT); +} + +void __init numa_replication_init(void); +void __init numa_replicate_kernel_text(void); +void numa_replicate_kernel_rodata(void); +void numa_replication_fini(void); + +bool is_text_replicated(void); +propagation_level_t get_propagation_level(void); +void numa_setup_pgd(void); +void __init_or_module *numa_get_replica(void *vaddr, int nid); +int numa_get_memory_node(int nid); +void dump_mm_pgtables(struct mm_struct *mm, + unsigned long start, unsigned long end); + +/* Macro to walk over mm->pgd_numa and cast it to appropriate level type */ +#define for_each_pgtable_replica(table, mm, replica, nid, offset) \ + for (nid = first_node(replica_nodes), offset = ((unsigned long)table) & (~PAGE_MASK), \ + replica = (typeof(table))(((unsigned long)mm->pgd_numa[nid]) + offset); \ + nid != MAX_NUMNODES; \ + nid = next_node(nid, replica_nodes), \ + replica = (typeof(table))(((unsigned long)mm->pgd_numa[nid]) + offset)) + +static inline void pgd_populate_replicated(struct mm_struct *mm, pgd_t *pgdp, p4d_t *p4dp) +{ + int nid; + pgd_t *curr_pgd; + unsigned long offset; + + if (get_propagation_level() == PGD_PROPAGATION) { + for_each_pgtable_replica(pgdp, mm, curr_pgd, nid, offset) { + pgd_populate(mm, curr_pgd, p4dp); + } + } else { + pgd_populate(mm, pgdp, p4dp); + } +} + +static inline void p4d_populate_replicated(struct mm_struct *mm, p4d_t *p4dp, pud_t *pudp) +{ + int nid; + p4d_t *curr_p4d; + unsigned long offset; + + if (get_propagation_level() == P4D_PROPAGATION) { + for_each_pgtable_replica(p4dp, mm, curr_p4d, nid, offset) { + p4d_populate(mm, curr_p4d, pudp); + } + } else { + p4d_populate(mm, p4dp, pudp); + } +} + +static inline 
void pud_populate_replicated(struct mm_struct *mm, pud_t *pudp, pmd_t *pmdp) +{ + int nid; + pud_t *curr_pud; + unsigned long offset; + + if (get_propagation_level() == PUD_PROPAGATION) { + for_each_pgtable_replica(pudp, mm, curr_pud, nid, offset) { + pud_populate(mm, curr_pud, pmdp); + } + } else { + pud_populate(mm, pudp, pmdp); + } +} + +static inline void pmd_populate_replicated(struct mm_struct *mm, pmd_t *pmdp, pgtable_t ptep) +{ + int nid; + pmd_t *curr_pmd; + unsigned long offset; + + if (get_propagation_level() == PMD_PROPAGATION) { + for_each_pgtable_replica(pmdp, mm, curr_pmd, nid, offset) { + pmd_populate(mm, curr_pmd, ptep); + } + } else { + pmd_populate(mm, pmdp, ptep); + } +} + +#else + +#if defined(linux) +#define tmp_linux_value linux +#undef linux +#endif + +#include KABI_HIDE_INCLUDE() + +#if defined(tmp_linux_value) +#define linux tmp_linux_value +#undef tmp_linux_value +#endif + +#define this_node_pgd(mm) ((mm)->pgd) +#define per_node_pgd(mm, nid) ((mm)->pgd) + +static inline void numa_setup_pgd(void) +{ +} + +static inline void __init numa_replication_init(void) +{ +} + +static inline void __init numa_replicate_kernel_text(void) +{ +} + +static inline void numa_replicate_kernel_rodata(void) +{ +} + +static inline void numa_replication_fini(void) +{ +} + +static inline bool numa_addr_has_replica(const void *addr) +{ + return false; +} + +static inline bool is_text_replicated(void) +{ + return false; +} + +static inline void *numa_get_replica(void *vaddr, int nid) +{ + return lm_alias(vaddr); +} + +static inline void dump_mm_pgtables(struct mm_struct *mm, + unsigned long start, unsigned long end) +{ +} + +#define pgd_populate_replicated pgd_populate +#define p4d_populate_replicated p4d_populate +#define pud_populate_replicated pud_populate +#define pmd_populate_replicated pmd_populate + +#endif /*CONFIG_KERNEL_REPLICATION*/ +#endif /*_LINUX_NUMA_REPLICATION_H*/ diff --git a/include/linux/set_memory.h b/include/linux/set_memory.h index 95ac8398ee72d97015a61c4b1bdf5063d20a9e15..f70f70b152f089d9b13d5d0517c0ab1cdfdbf698 100644 --- a/include/linux/set_memory.h +++ b/include/linux/set_memory.h @@ -7,11 +7,32 @@ #ifdef CONFIG_ARCH_HAS_SET_MEMORY #include + +#ifdef CONFIG_KERNEL_REPLICATION +int numa_set_memory_ro(unsigned long addr, int numpages); +int numa_set_memory_rw(unsigned long addr, int numpages); +int numa_set_memory_x(unsigned long addr, int numpages); +int numa_set_memory_nx(unsigned long addr, int numpages); +#else + +#define numa_set_memory_ro set_memory_ro +#define numa_set_memory_rw set_memory_rw +#define numa_set_memory_x set_memory_x +#define numa_set_memory_nx set_memory_nx + +#endif /* CONFIG_KERNEL_REPLICATION */ + #else static inline int set_memory_ro(unsigned long addr, int numpages) { return 0; } static inline int set_memory_rw(unsigned long addr, int numpages) { return 0; } static inline int set_memory_x(unsigned long addr, int numpages) { return 0; } static inline int set_memory_nx(unsigned long addr, int numpages) { return 0; } + +#define numa_set_memory_ro set_memory_ro +#define numa_set_memory_rw set_memory_rw +#define numa_set_memory_x set_memory_x +#define numa_set_memory_nx set_memory_nx + #endif #ifndef set_memory_rox @@ -24,6 +45,20 @@ static inline int set_memory_rox(unsigned long addr, int numpages) } #endif +#ifndef numa_set_memory_rox +#ifdef CONFIG_KERNEL_REPLICATION +static inline int numa_set_memory_rox(unsigned long addr, int numpages) +{ + int ret = numa_set_memory_ro(addr, numpages); + if (ret) + return ret; + return 
numa_set_memory_x(addr, numpages); +} +#else +#define numa_set_memory_rox set_memory_rox +#endif +#endif + #ifndef CONFIG_ARCH_HAS_SET_DIRECT_MAP static inline int set_direct_map_invalid_noflush(struct page *page) { diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index 333f5a67d17165d671ea8ca967dadcc35dcec8a4..c0a29ec901f7ba617411106ddc6660fde7e95547 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -29,6 +29,10 @@ struct iov_iter; /* in uio.h */ #define VM_MAP_PUT_PAGES 0x00000200 /* put pages and free array in vfree */ #define VM_ALLOW_HUGE_VMAP 0x00000400 /* Allow for huge pages on archs with HAVE_ARCH_HUGE_VMALLOC */ +#ifdef CONFIG_KERNEL_REPLICATION +#define VM_NUMA_SHARED 0x00002000 /* Pages shared between per-NUMA node TT*/ +#endif + #if (defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS)) && \ !defined(CONFIG_KASAN_VMALLOC) #define VM_DEFER_KMEMLEAK 0x00000800 /* defer kmemleak object creation */ @@ -65,6 +69,10 @@ struct vm_struct { unsigned int nr_pages; phys_addr_t phys_addr; const void *caller; +#ifdef CONFIG_KERNEL_REPLICATION + KABI_EXTEND(int node) + KABI_EXTEND(bool replicated) +#endif }; struct vmap_area { @@ -156,6 +164,17 @@ extern void *__vmalloc_node_range(unsigned long size, unsigned long align, unsigned long start, unsigned long end, gfp_t gfp_mask, pgprot_t prot, unsigned long vm_flags, int node, const void *caller) __alloc_size(1); + +#ifdef CONFIG_KERNEL_REPLICATION + /* + * DO NOT USE this function if you don't understand what it is doing + * Use only in pair with vmalloc(vm_flags|=VM_NUMA_SHARED) + */ +int __vmalloc_node_replicate_range(const void *addr, gfp_t gfp_mask, + pgprot_t prot, unsigned long vm_flags); +void vunmap_range_replicas(unsigned long addr, unsigned long end); +#endif + void *__vmalloc_node(unsigned long size, unsigned long align, gfp_t gfp_mask, int node, const void *caller) __alloc_size(1); void *vmalloc_huge(unsigned long size, gfp_t gfp_mask) __alloc_size(1); diff --git a/init/main.c b/init/main.c index 8fdfa69dba0fa9b357db0d4af16f2313d5658ecd..c74606c30a82e1fff2b989c7f6674c68373a6e3f 100644 --- a/init/main.c +++ b/init/main.c @@ -100,6 +100,7 @@ #include #include #include +#include #include #include @@ -928,11 +929,19 @@ void start_kernel(void) * These use large bootmem allocations and must precede * initalization of page allocator */ + numa_replication_init(); setup_log_buf(0); vfs_caches_init_early(); sort_main_extable(); trap_init(); mm_core_init(); + /* + * Kernel text replication should be done before + * alloc/init first mm struct, due to it is necessary + * to setup per-NUMA node translation tables and kernel + * instances properly. + */ + numa_replicate_kernel_text(); poking_init(); ftrace_init(); @@ -1455,6 +1464,14 @@ static int __ref kernel_init(void *unused) free_initmem(); mark_readonly(); + /* + * RODATA replication is done here due to + * it is necessary to finalize the kernel + * and modules initialization before + */ + numa_replicate_kernel_rodata(); + numa_replication_fini(); + /* * Kernel mappings are now finalized - update the userspace page-table * to finalize PTI. 
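The numa_set_memory_*() helpers added in set_memory.h above are meant as drop-in replacements for set_memory_*() wherever permissions of mappings inside the replicated page-table range are changed: with CONFIG_KERNEL_REPLICATION=y they update the primary init_mm tables and then every per-node replica via apply_to_page_range_replicas(), and without it they simply alias the plain helpers. A minimal sketch of the intended calling pattern follows (the function and its parameters are illustrative, not taken from the patch); the underlying pages stay shared between nodes, so the data write itself needs no fan-out, only the permission flips do.

#include <linux/set_memory.h>
#include <linux/string.h>
#include <linux/types.h>

/* Illustrative only: make a JIT/trampoline page writable for patching and
 * restore RO+X afterwards.  The numa_* variants walk every per-NUMA-node
 * page-table replica; on !CONFIG_KERNEL_REPLICATION kernels they compile
 * down to the ordinary set_memory_*() calls.
 */
static int example_patch_image(void *image, const u32 *insns, int ninsns)
{
	unsigned long addr = (unsigned long)image;
	int ret;

	ret = numa_set_memory_rw(addr, 1);		/* one page */
	if (ret)
		return ret;

	memcpy(image, insns, ninsns * sizeof(*insns));	/* pages are shared: a plain write is enough */

	return numa_set_memory_rox(addr, 1);		/* RO + X in every replica */
}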
diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c index fdc3e8705a3cb838e82409f0c025b1e39ea4dfec..62559463549f0c03d9fe2835fc1262d0da897108 100644 --- a/kernel/bpf/bpf_struct_ops.c +++ b/kernel/bpf/bpf_struct_ops.c @@ -515,7 +515,7 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key, if (err) goto reset_unlock; } - set_memory_rox((long)st_map->image, 1); + numa_set_memory_rox((long)st_map->image, 1); /* Let bpf_link handle registration & unregistration. * * Pair with smp_load_acquire() during lookup_elem(). @@ -524,7 +524,7 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key, goto unlock; } - set_memory_rox((long)st_map->image, 1); + numa_set_memory_rox((long)st_map->image, 1); err = st_ops->reg(kdata); if (likely(!err)) { /* This refcnt increment on the map here after @@ -547,8 +547,8 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key, * there was a race in registering the struct_ops (under the same name) to * a sub-system through different struct_ops's maps. */ - set_memory_nx((long)st_map->image, 1); - set_memory_rw((long)st_map->image, 1); + numa_set_memory_nx((long)st_map->image, 1); + numa_set_memory_rw((long)st_map->image, 1); reset_unlock: bpf_struct_ops_map_put_progs(st_map); diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 7c00bb2ad00443f96f9b6991e59d22c46973de8e..edcedc3f104ad8e0f61f9662ad0b44b8367d6ca6 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -893,7 +893,7 @@ static struct bpf_prog_pack *alloc_new_pack(bpf_jit_fill_hole_t bpf_fill_ill_ins list_add_tail(&pack->list, &pack_list); set_vm_flush_reset_perms(pack->ptr); - set_memory_rox((unsigned long)pack->ptr, BPF_PROG_PACK_SIZE / PAGE_SIZE); + numa_set_memory_rox((unsigned long)pack->ptr, BPF_PROG_PACK_SIZE / PAGE_SIZE); return pack; } @@ -911,7 +911,7 @@ void *bpf_prog_pack_alloc(u32 size, bpf_jit_fill_hole_t bpf_fill_ill_insns) if (ptr) { bpf_fill_ill_insns(ptr, size); set_vm_flush_reset_perms(ptr); - set_memory_rox((unsigned long)ptr, size / PAGE_SIZE); + numa_set_memory_rox((unsigned long)ptr, size / PAGE_SIZE); } goto out; } diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c index e5b97eb226e847af1faa2618f163cc8ec47ab4f0..8914f274c85aa5ff955607a5edecf22c15f9fd54 100644 --- a/kernel/bpf/trampoline.c +++ b/kernel/bpf/trampoline.c @@ -444,7 +444,7 @@ static int bpf_trampoline_update(struct bpf_trampoline *tr, bool lock_direct_mut if (err < 0) goto out_free; - set_memory_rox((long)im->image, 1); + numa_set_memory_rox((long)im->image, 1); WARN_ON(tr->cur_image && total == 0); if (tr->cur_image) @@ -465,8 +465,8 @@ static int bpf_trampoline_update(struct bpf_trampoline *tr, bool lock_direct_mut tr->fops->trampoline = 0; /* reset im->image memory attr for arch_prepare_bpf_trampoline */ - set_memory_nx((long)im->image, 1); - set_memory_rw((long)im->image, 1); + numa_set_memory_nx((long)im->image, 1); + numa_set_memory_rw((long)im->image, 1); goto again; } #endif diff --git a/kernel/module/main.c b/kernel/module/main.c index 14a51af2fbeab213858112813894a667e0ddee5a..3aa696b127caf6bf8703c462e99d47e1ef5cec1f 100644 --- a/kernel/module/main.c +++ b/kernel/module/main.c @@ -57,6 +57,7 @@ #include #include #include +#include #include #include "internal.h" @@ -1209,13 +1210,40 @@ static bool mod_mem_use_vmalloc(enum mod_mem_type type) mod_mem_type_is_core_data(type); } +#ifdef CONFIG_KERNEL_REPLICATION +static int sections_to_replicate[] = {MOD_TEXT, MOD_RODATA}; + +static void module_replicate_sections(struct 
module *mod) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(sections_to_replicate); i++) + module_replicate(mod->mem[sections_to_replicate[i]].base); +} + static void *module_memory_alloc(unsigned int size, enum mod_mem_type type) { + int i; + if (mod_mem_use_vmalloc(type)) return vzalloc(size); + + for (i = 0; i < ARRAY_SIZE(sections_to_replicate); i++) { + if (type == sections_to_replicate[i]) + return module_alloc_replica(size); + } return module_alloc(size); } +#else /* !CONFIG_KERNEL_REPLICATION */ +static void *module_memory_alloc(unsigned int size, enum mod_mem_type type) +{ + if (mod_mem_use_vmalloc(type)) + return vzalloc(size); + return module_alloc(size); +} +#endif /* CONFIG_KERNEL_REPLICATION */ + static void module_memory_free(void *ptr, enum mod_mem_type type) { if (mod_mem_use_vmalloc(type)) @@ -2752,6 +2780,10 @@ static int complete_formation(struct module *mod, struct load_info *info) module_bug_finalize(info->hdr, info->sechdrs, mod); module_cfi_finalize(info->hdr, info->sechdrs, mod); +#ifdef CONFIG_KERNEL_REPLICATION + module_replicate_sections(mod); +#endif + module_enable_ro(mod, false); module_enable_nx(mod); module_enable_x(mod); diff --git a/kernel/module/strict_rwx.c b/kernel/module/strict_rwx.c index 575bf99c723af14fd5c28d80f5c4bf5e500fd458..9ecde639cb4ffc03fe8920a5b29fbaa8c28e6b5e 100644 --- a/kernel/module/strict_rwx.c +++ b/kernel/module/strict_rwx.c @@ -29,7 +29,7 @@ static void module_set_memory(const struct module *mod, enum mod_mem_type type, void module_enable_x(const struct module *mod) { for_class_mod_mem_type(type, text) - module_set_memory(mod, type, set_memory_x); + module_set_memory(mod, type, numa_set_memory_x); } #ifdef CONFIG_LIVEPATCH_WO_FTRACE @@ -59,13 +59,13 @@ void module_enable_ro(const struct module *mod, bool after_init) return; #endif - module_set_memory(mod, MOD_TEXT, set_memory_ro); - module_set_memory(mod, MOD_INIT_TEXT, set_memory_ro); - module_set_memory(mod, MOD_RODATA, set_memory_ro); - module_set_memory(mod, MOD_INIT_RODATA, set_memory_ro); + module_set_memory(mod, MOD_TEXT, numa_set_memory_ro); + module_set_memory(mod, MOD_INIT_TEXT, numa_set_memory_ro); + module_set_memory(mod, MOD_RODATA, numa_set_memory_ro); + module_set_memory(mod, MOD_INIT_RODATA, numa_set_memory_ro); if (after_init) - module_set_memory(mod, MOD_RO_AFTER_INIT, set_memory_ro); + module_set_memory(mod, MOD_RO_AFTER_INIT, numa_set_memory_ro); } void module_enable_nx(const struct module *mod) @@ -74,7 +74,7 @@ void module_enable_nx(const struct module *mod) return; for_class_mod_mem_type(type, data) - module_set_memory(mod, type, set_memory_nx); + module_set_memory(mod, type, numa_set_memory_nx); } int module_enforce_rwx_sections(Elf_Ehdr *hdr, Elf_Shdr *sechdrs, diff --git a/mm/Kconfig b/mm/Kconfig index 782c43f08e8fead36161cdcb73920a55addcaab9..845ff9619d3efbc5f552a0e9c96e7256b58ba4bd 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -1298,6 +1298,16 @@ config LOCK_MM_AND_FIND_VMA bool depends on !STACK_GROWSUP +config KERNEL_REPLICATION + bool "Enable kernel text and ro-data replication across NUMA nodes" + default n + depends on ARM64 && MMU && NUMA && !MAXSMP + + help + Creates per-NUMA node replicas of kernel text and ro-data sections. + Page tables are replicated partially, according to replicated kernel memory range. + If unsure, say "n". 
+ config IOMMU_MM_DATA bool diff --git a/mm/Makefile b/mm/Makefile index 11df2de8fdbe9d5a70e4ca3a73db68ebd9c9d331..45058cdf65d895ef3e77e9ba47f6acc833ddca90 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -140,6 +140,7 @@ obj-$(CONFIG_IO_MAPPING) += io-mapping.o obj-$(CONFIG_HAVE_BOOTMEM_INFO_NODE) += bootmem_info.o obj-$(CONFIG_GENERIC_IOREMAP) += ioremap.o obj-$(CONFIG_SHRINKER_DEBUG) += shrinker_debug.o +obj-$(CONFIG_KERNEL_REPLICATION) += numa_kernel_replication.o obj-$(CONFIG_SHARE_POOL) += share_pool.o obj-$(CONFIG_MEMCG_MEMFS_INFO) += memcg_memfs_info.o obj-$(CONFIG_ETMEM) += etmem.o diff --git a/mm/memory.c b/mm/memory.c index a6d146d684e8ec18740b669f540abdaa32b5bcc4..f05772babfe08285cf27bd3a29de0aa1c16e7d2f 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -79,6 +79,7 @@ #include #include #include +#include #include @@ -185,6 +186,96 @@ void mm_trace_rss_stat(struct mm_struct *mm, int member) trace_rss_stat(mm, member); } +#ifdef CONFIG_KERNEL_REPLICATION + +static void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd, + unsigned long addr) +{ + unsigned long offset; + int nid; + pmd_t *curr_pmd; + pgtable_t token = pmd_pgtable(*pmd); + + if (get_propagation_level() == PMD_PROPAGATION) { + for_each_pgtable_replica(pmd, tlb->mm, curr_pmd, nid, offset) { + pmd_clear(curr_pmd); + } + } else { + pmd_clear(pmd); + } + + pte_free_tlb(tlb, token, addr); + mm_dec_nr_ptes(tlb->mm); + (void)token; +} + +static inline void __free_pmd_range(struct mmu_gather *tlb, pud_t *pud, + unsigned long addr) +{ + unsigned long offset; + int nid; + pud_t *curr_pud; + pmd_t *pmd = pmd_offset(pud, addr); + + if (get_propagation_level() == PUD_PROPAGATION) { + for_each_pgtable_replica(pud, tlb->mm, curr_pud, nid, offset) { + pud_clear(curr_pud); + } + } else { + pud_clear(pud); + } + + pmd_free_tlb(tlb, pmd, addr); + mm_dec_nr_pmds(tlb->mm); + (void)pmd; +} + +static inline void __free_pud_range(struct mmu_gather *tlb, p4d_t *p4d, + unsigned long addr) +{ + unsigned long offset; + int nid; + p4d_t *curr_p4d; + pud_t *pud = pud_offset(p4d, addr); + + if (get_propagation_level() == P4D_PROPAGATION) { + for_each_pgtable_replica(p4d, tlb->mm, curr_p4d, nid, offset) { + p4d_clear(curr_p4d); + } + } else { + p4d_clear(p4d); + } + + pud_free_tlb(tlb, pud, addr); + mm_dec_nr_puds(tlb->mm); + (void)pud; +} + +static inline void __free_p4d_range(struct mmu_gather *tlb, pgd_t *pgd, + unsigned long addr) +{ + unsigned long offset; + int nid; + pgd_t *curr_pgd; + p4d_t *p4d = p4d_offset(pgd, addr); + + if (get_propagation_level() == PGD_PROPAGATION) { + for_each_pgtable_replica(pgd, tlb->mm, curr_pgd, nid, offset) { + pgd_clear(curr_pgd); + } + } else { + pgd_clear(pgd); + } + p4d_free_tlb(tlb, p4d, addr); + /* + * Why? If 4-level paging is enabled via kconfig, + * all functions execept p4d_offset are empty, + * and we get unused variable error + */ + (void)p4d; +} +#else + /* * Note: this doesn't free the actual pages themselves. That * has been handled earlier when unmapping all the memory regions. 
@@ -196,8 +287,43 @@ static void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd, pmd_clear(pmd); pte_free_tlb(tlb, token, addr); mm_dec_nr_ptes(tlb->mm); + (void)token; } +static void __free_pmd_range(struct mmu_gather *tlb, pud_t *pud, + unsigned long addr) +{ + pmd_t *pmd = pmd_offset(pud, addr); + + pud_clear(pud); + pmd_free_tlb(tlb, pmd, addr); + mm_dec_nr_pmds(tlb->mm); + (void)pmd; +} + +static inline void __free_pud_range(struct mmu_gather *tlb, p4d_t *p4d, + unsigned long addr) +{ + pud_t *pud = pud_offset(p4d, addr); + + p4d_clear(p4d); + pud_free_tlb(tlb, pud, addr); + mm_dec_nr_puds(tlb->mm); + (void)pud; +} + +static inline void __free_p4d_range(struct mmu_gather *tlb, pgd_t *pgd, + unsigned long addr) +{ + p4d_t *p4d = p4d_offset(pgd, addr); + + pgd_clear(pgd); + p4d_free_tlb(tlb, p4d, addr); + (void)p4d; +} + +#endif + static inline void free_pmd_range(struct mmu_gather *tlb, pud_t *pud, unsigned long addr, unsigned long end, unsigned long floor, unsigned long ceiling) @@ -226,10 +352,7 @@ static inline void free_pmd_range(struct mmu_gather *tlb, pud_t *pud, if (end - 1 > ceiling - 1) return; - pmd = pmd_offset(pud, start); - pud_clear(pud); - pmd_free_tlb(tlb, pmd, start); - mm_dec_nr_pmds(tlb->mm); + __free_pmd_range(tlb, pud, start); } static inline void free_pud_range(struct mmu_gather *tlb, p4d_t *p4d, @@ -260,10 +383,7 @@ static inline void free_pud_range(struct mmu_gather *tlb, p4d_t *p4d, if (end - 1 > ceiling - 1) return; - pud = pud_offset(p4d, start); - p4d_clear(p4d); - pud_free_tlb(tlb, pud, start); - mm_dec_nr_puds(tlb->mm); + __free_pud_range(tlb, p4d, start); } static inline void free_p4d_range(struct mmu_gather *tlb, pgd_t *pgd, @@ -294,9 +414,7 @@ static inline void free_p4d_range(struct mmu_gather *tlb, pgd_t *pgd, if (end - 1 > ceiling - 1) return; - p4d = p4d_offset(pgd, start); - pgd_clear(pgd); - p4d_free_tlb(tlb, p4d, start); + __free_p4d_range(tlb, pgd, start); } /* @@ -440,7 +558,7 @@ void pmd_install(struct mm_struct *mm, pmd_t *pmd, pgtable_t *pte) * smp_rmb() barriers in page table walking code. */ smp_wmb(); /* Could be smp_wmb__xxx(before|after)_spin_lock */ - pmd_populate(mm, pmd, *pte); + pmd_populate_replicated(mm, pmd, *pte); *pte = NULL; } spin_unlock(ptl); @@ -2908,7 +3026,7 @@ static int apply_to_p4d_range(struct mm_struct *mm, pgd_t *pgd, return err; } -static int __apply_to_page_range(struct mm_struct *mm, unsigned long addr, +static int __apply_to_page_range(struct mm_struct *mm, pgd_t *pgtable, unsigned long addr, unsigned long size, pte_fn_t fn, void *data, bool create) { @@ -2921,7 +3039,7 @@ static int __apply_to_page_range(struct mm_struct *mm, unsigned long addr, if (WARN_ON(addr >= end)) return -EINVAL; - pgd = pgd_offset(mm, addr); + pgd = pgd_offset_pgd(pgtable, addr); do { next = pgd_addr_end(addr, end); if (pgd_none(*pgd) && !create) @@ -2952,10 +3070,32 @@ static int __apply_to_page_range(struct mm_struct *mm, unsigned long addr, int apply_to_page_range(struct mm_struct *mm, unsigned long addr, unsigned long size, pte_fn_t fn, void *data) { - return __apply_to_page_range(mm, addr, size, fn, data, true); + return __apply_to_page_range(mm, mm->pgd, addr, size, fn, data, true); } EXPORT_SYMBOL_GPL(apply_to_page_range); +#ifdef CONFIG_KERNEL_REPLICATION +/* + * Same as apply_to_page_range(), but taking into account per-NUMA node + * replicas. 
+ */ +int apply_to_page_range_replicas(struct mm_struct *mm, unsigned long addr, + unsigned long size, pte_fn_t fn, void *data) +{ + int nid; + int ret = 0; + + for_each_memory_node(nid) { + ret = __apply_to_page_range(mm, per_node_pgd(mm, nid), + addr, size, fn, data, true); + if (ret) + break; + } + + return ret; +} +#endif /* CONFIG_KERNEL_REPLICATION && CONFIG_ARM64 */ + /* * Scan a region of virtual memory, calling a provided function on * each leaf page table where it exists. @@ -2966,7 +3106,7 @@ EXPORT_SYMBOL_GPL(apply_to_page_range); int apply_to_existing_page_range(struct mm_struct *mm, unsigned long addr, unsigned long size, pte_fn_t fn, void *data) { - return __apply_to_page_range(mm, addr, size, fn, data, false); + return __apply_to_page_range(mm, mm->pgd, addr, size, fn, data, false); } EXPORT_SYMBOL_GPL(apply_to_existing_page_range); @@ -6013,6 +6153,28 @@ struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm, #endif /* CONFIG_PER_VMA_LOCK */ #ifndef __PAGETABLE_P4D_FOLDED + +#ifdef CONFIG_KERNEL_REPLICATION +int __p4d_alloc_node(unsigned int nid, + struct mm_struct *mm, + pgd_t *pgd, unsigned long address) +{ + p4d_t *new = p4d_alloc_one_node(nid, mm, address); + if (!new) + return -ENOMEM; + + spin_lock(&mm->page_table_lock); + if (pgd_present(*pgd)) { /* Another has populated it */ + p4d_free(mm, new); + } else { + smp_wmb(); /* See comment in pmd_install() */ + pgd_populate(mm, pgd, new); + } + spin_unlock(&mm->page_table_lock); + return 0; +} +#endif /* CONFIG_KERNEL_REPLICATION */ + /* * Allocate p4d page table. * We've already handled the fast-path in-line. @@ -6028,7 +6190,7 @@ int __p4d_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address) p4d_free(mm, new); } else { smp_wmb(); /* See comment in pmd_install() */ - pgd_populate(mm, pgd, new); + pgd_populate_replicated(mm, pgd, new); } spin_unlock(&mm->page_table_lock); return 0; @@ -6036,6 +6198,28 @@ int __p4d_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address) #endif /* __PAGETABLE_P4D_FOLDED */ #ifndef __PAGETABLE_PUD_FOLDED + +#ifdef CONFIG_KERNEL_REPLICATION +int __pud_alloc_node(unsigned int nid, + struct mm_struct *mm, + p4d_t *p4d, unsigned long address) +{ + pud_t *new = pud_alloc_one_node(nid, mm, address); + if (!new) + return -ENOMEM; + + spin_lock(&mm->page_table_lock); + if (!p4d_present(*p4d)) { + mm_inc_nr_puds(mm); + smp_wmb(); /* See comment in pmd_install() */ + p4d_populate(mm, p4d, new); + } else /* Another has populated it */ + pud_free(mm, new); + spin_unlock(&mm->page_table_lock); + return 0; +} +#endif /* CONFIG_KERNEL_REPLICATION */ + /* * Allocate page upper directory. * We've already handled the fast-path in-line. 
@@ -6050,7 +6234,7 @@ int __pud_alloc(struct mm_struct *mm, p4d_t *p4d, unsigned long address) if (!p4d_present(*p4d)) { mm_inc_nr_puds(mm); smp_wmb(); /* See comment in pmd_install() */ - p4d_populate(mm, p4d, new); + p4d_populate_replicated(mm, p4d, new); } else /* Another has populated it */ pud_free(mm, new); spin_unlock(&mm->page_table_lock); @@ -6074,13 +6258,37 @@ int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address) if (!pud_present(*pud)) { mm_inc_nr_pmds(mm); smp_wmb(); /* See comment in pmd_install() */ - pud_populate(mm, pud, new); + pud_populate_replicated(mm, pud, new); } else { /* Another has populated it */ pmd_free(mm, new); } spin_unlock(ptl); return 0; } + +#ifdef CONFIG_KERNEL_REPLICATION +int __pmd_alloc_node(unsigned int nid, + struct mm_struct *mm, + pud_t *pud, unsigned long address) +{ + spinlock_t *ptl; + pmd_t *new = pmd_alloc_one_node(nid, mm, address); + if (!new) + return -ENOMEM; + + ptl = pud_lock(mm, pud); + if (!pud_present(*pud)) { + mm_inc_nr_pmds(mm); + smp_wmb(); /* See comment in pmd_install() */ + pud_populate(mm, pud, new); + } else { /* Another has populated it */ + pmd_free(mm, new); + } + spin_unlock(ptl); + return 0; +} +#endif /* CONFIG_KERNEL_REPLICATION */ + #endif /* __PAGETABLE_PMD_FOLDED */ /** @@ -6655,3 +6863,63 @@ void ptlock_free(struct ptdesc *ptdesc) kmem_cache_free(page_ptl_cachep, ptdesc->ptl); } #endif + +/** + * Walk in replicated tranlation table specified by nid. + * If kernel replication is disabled or text is not replicated yet, + * value of nid is not used + */ +struct page *walk_to_page_node(int nid, const void *vmalloc_addr) +{ + unsigned long addr = (unsigned long)vmalloc_addr; + struct page *page = NULL; + pgd_t *pgd; + p4d_t *p4d; + pud_t *pud; + pmd_t *pmd; + pte_t *ptep, pte; + + if (is_text_replicated()) + pgd = pgd_offset_pgd(per_node_pgd(&init_mm, nid), addr); + else + pgd = pgd_offset_pgd(init_mm.pgd, addr); + + if (pgd_none(*pgd)) + return NULL; + if (WARN_ON_ONCE(pgd_leaf(*pgd))) + return NULL; /* XXX: no allowance for huge pgd */ + if (WARN_ON_ONCE(pgd_bad(*pgd))) + return NULL; + + p4d = p4d_offset(pgd, addr); + if (p4d_none(*p4d)) + return NULL; + if (p4d_leaf(*p4d)) + return p4d_page(*p4d) + ((addr & ~P4D_MASK) >> PAGE_SHIFT); + if (WARN_ON_ONCE(p4d_bad(*p4d))) + return NULL; + + pud = pud_offset(p4d, addr); + if (pud_none(*pud)) + return NULL; + if (pud_leaf(*pud)) + return pud_page(*pud) + ((addr & ~PUD_MASK) >> PAGE_SHIFT); + if (WARN_ON_ONCE(pud_bad(*pud))) + return NULL; + + pmd = pmd_offset(pud, addr); + if (pmd_none(*pmd)) + return NULL; + if (pmd_leaf(*pmd)) + return pmd_page(*pmd) + ((addr & ~PMD_MASK) >> PAGE_SHIFT); + if (WARN_ON_ONCE(pmd_bad(*pmd))) + return NULL; + + ptep = pte_offset_map(pmd, addr); + pte = *ptep; + if (pte_present(pte)) + page = pte_page(pte); + pte_unmap(ptep); + + return page; +} diff --git a/mm/mm_init.c b/mm/mm_init.c index 0a3c20a003187665758beece2b57fcf5d0ee779a..72fc5f972d3619674e2b1eaaebaf342fe2a5e4dd 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c @@ -2819,6 +2819,9 @@ void __init mm_core_init(void) ptlock_cache_init(); pgtable_cache_init(); debug_objects_mem_init(); +#ifdef CONFIG_KERNEL_REPLICATION + preallocate_vmalloc_pages(); +#endif vmalloc_init(); /* If no deferred init page_ext now, as vmap is fully initialized */ if (!deferred_struct_pages) diff --git a/mm/numa_kernel_replication.c b/mm/numa_kernel_replication.c new file mode 100644 index 0000000000000000000000000000000000000000..c2d289b7b9dfc3b1b79600cf086f1e3f77d81508 --- /dev/null +++ 
b/mm/numa_kernel_replication.c @@ -0,0 +1,759 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#define KERNEL_TEXT_START ((unsigned long)&_stext) +#define KERNEL_TEXT_END ((unsigned long)&_etext) + +#define KERNEL_RODATA_START ((unsigned long)&__start_rodata) +#define KERNEL_RODATA_END ((unsigned long)&__end_rodata) + +#define PMD_ALLOC_ORDER (PMD_SHIFT-PAGE_SHIFT) +#define PAGES_PER_PMD (1 << PMD_ALLOC_ORDER) + +#define replication_log(data, fmt, args...) \ +({ \ + if (data && data->m) \ + seq_printf(data->m, fmt, ##args); \ + else \ + pr_info(KERN_CONT fmt, ##args); \ +}) + +struct numa_node_desc { + pgd_t *pgd; + void *text_vaddr; + void *rodata_vaddr; +}; + +static struct numa_node_desc __initdata_or_module node_desc[MAX_NUMNODES]; + +struct dump_data { + struct seq_file *m; +}; + +struct dump_config { + int pgd_extra_info:1; + int p4d_extra_info:1; + int pud_extra_info:1; + int pmd_extra_info:1; + int pte_extra_info:1; + struct dump_data *data; +}; + +static bool text_replicated; +static propagation_level_t prop_level = NONE; +/* + * The first ready NUMA node, used as a source node + * for kernel text and rodata replication + */ +static unsigned int master_node = INT_MAX; +/* + * The case when machine has memoryless nodes is rare + * but possible. To handle memoryless nodes properly + * kernel replication maintains mapping node -> node with memory + * for all NUMA nodes. + */ +static int node_to_memory_node[MAX_NUMNODES]; + +static bool pgtables_extra; +static DEFINE_SPINLOCK(debugfs_lock); + +propagation_level_t get_propagation_level(void) +{ + return prop_level; +} + +bool is_text_replicated(void) +{ + return text_replicated; +} + +static void binary_dump(struct dump_data *data, unsigned long value) +{ + int i; + + for (i = BITS_PER_LONG - 1; i >= 0; i--) { + if ((BITS_PER_LONG - 1 - i) % BITS_PER_BYTE == 0) + replication_log(data, "%-9d", i); + } + replication_log(data, "%d\n", 0); + + for (i = BITS_PER_LONG - 1; i >= 0; i--) { + if ((BITS_PER_LONG - 1 - i) % BITS_PER_BYTE == 0) + replication_log(data, "|"); + + replication_log(data, "%d", (1UL << i) & value ? 
1 : 0); + } + replication_log(data, "|\n"); +} + +static int pgd_callback(pgd_t *pgd, + unsigned long addr, unsigned long next, + struct mm_walk *walk) +{ + unsigned long val = pgd_val(*pgd); + struct dump_config *c = (struct dump_config *)walk->private; + + if (!val) + return 0; + + addr = addr & PGDIR_MASK; + next = (addr & PGDIR_MASK) - 1 + PGDIR_SIZE; + + replication_log(c->data, + "PGD ADDR: 0x%p PGD VAL: 0x%016lx [%p --- %p]\n", + pgd, val, (void *)addr, (void *)next); + + if (c->pgd_extra_info) + binary_dump(c->data, val); + + return 0; +} + +static int p4d_callback(p4d_t *p4d, + unsigned long addr, unsigned long next, + struct mm_walk *walk) +{ + unsigned long val = p4d_val(*p4d); + struct dump_config *c = (struct dump_config *)walk->private; + + if (!val) + return 0; + + addr = addr & P4D_MASK; + next = (addr & P4D_MASK) - 1 + P4D_SIZE; + + replication_log(c->data, + "P4D ADDR: 0x%p P4D VAL: 0x%016lx [%p --- %p]\n", + p4d, val, (void *)addr, (void *)next); + + if (c->p4d_extra_info) + binary_dump(c->data, val); + + return 0; +} + +static int pud_callback(pud_t *pud, + unsigned long addr, unsigned long next, + struct mm_walk *walk) +{ + unsigned long val = pud_val(*pud); + struct dump_config *c = (struct dump_config *)walk->private; + + if (!val) + return 0; + + addr = addr & PUD_MASK; + next = (addr & PUD_MASK) - 1 + PUD_SIZE; + + replication_log(c->data, + "PUD ADDR: 0x%p PUD VAL: 0x%016lx huge(%d) [%p --- %p]\n", + pud, val, pud_huge(*pud), (void *)addr, (void *)next); + + if (c->pud_extra_info) + binary_dump(c->data, val); + + return 0; +} + +static int pmd_callback(pmd_t *pmd, + unsigned long addr, unsigned long next, + struct mm_walk *walk) +{ + unsigned long val = pmd_val(*pmd); + unsigned long paddr = pmd_pfn(*pmd) << PAGE_SHIFT; + struct dump_config *c = (struct dump_config *)walk->private; + + if (!val) + return 0; + + addr = addr & PMD_MASK; + next = (addr & PMD_MASK) - 1 + PMD_SIZE; + + replication_log(c->data, + "PMD ADDR: 0x%p PMD VAL: 0x%016lx huge(%d) [%p --- %p] to %p\n", + pmd, val, pmd_huge(*pmd), (void *)addr, (void *)next, (void *)paddr); + + if (c->pmd_extra_info) + binary_dump(c->data, val); + + return 0; +} + +static int pte_callback(pte_t *pte, + unsigned long addr, unsigned long next, + struct mm_walk *walk) +{ + unsigned long val = pte_val(*pte); + unsigned long paddr = pte_pfn(*pte) << PAGE_SHIFT; + struct dump_config *c = (struct dump_config *)walk->private; + + if (!val) + return 0; + + addr = addr & PAGE_MASK; + next = (addr & PAGE_MASK) - 1 + PAGE_SIZE; + + replication_log(c->data, + "PTE ADDR: 0x%p PTE VAL: 0x%016lx [%p --- %p] to %p\n", + pte, val, (void *)addr, (void *)next, (void *)paddr); + + if (c->pte_extra_info) + binary_dump(c->data, val); + + return 0; +} + +static int pte_hole_callback(unsigned long addr, unsigned long next, + int depth, struct mm_walk *walk) +{ + struct dump_config *c = (struct dump_config *)walk->private; + + replication_log(c->data, "%*chole\n", depth * 2, ' '); + + return 0; +} + +static void dump_pgtables(struct mm_struct *mm, + struct dump_data *data, + unsigned long start, unsigned long end) +{ + int nid = 0; + int extra = pgtables_extra ? 
1 : 0; + bool locked = false; + struct dump_config conf = { + .pgd_extra_info = extra, + .p4d_extra_info = extra, + .pud_extra_info = extra, + .pmd_extra_info = extra, + .pte_extra_info = extra, + .data = data, + }; + + const struct mm_walk_ops ops = { + .pgd_entry = pgd_callback, + .p4d_entry = p4d_callback, + .pud_entry = pud_callback, + .pmd_entry = pmd_callback, + .pte_entry = pte_callback, + .pte_hole = pte_hole_callback + }; + + BUG_ON(data && data->m == NULL); + + start = start & PAGE_MASK; + end = (end & PAGE_MASK) - 1 + PAGE_SIZE; + + replication_log(data, + "----PER-NUMA NODE KERNEL REPLICATION ENABLED----\n"); + + if (rwsem_is_locked(&mm->mmap_lock)) + locked = true; + else + mmap_read_lock(mm); + + for_each_memory_node(nid) { + replication_log(data, "NUMA node id #%d\n", nid); + replication_log(data, "PGD: %p PGD phys: %p\n", + mm->pgd_numa[nid], (void *)virt_to_phys(mm->pgd_numa[nid])); + walk_page_range_novma(mm, start, end, &ops, mm->pgd_numa[nid], &conf); + } + + if (!locked) + mmap_read_unlock(mm); + + replication_log(data, + "----PER-NUMA NODE KERNEL REPLICATION ENABLED----\n"); +} + +static void dump_kernel_pgtables(struct dump_data *data, + unsigned long start, unsigned long end) +{ + dump_pgtables(&init_mm, data, start, end); +} + +void dump_mm_pgtables(struct mm_struct *mm, + unsigned long start, unsigned long end) +{ + dump_pgtables(mm, NULL, start, end); +} + +static void cpu_dump(void *info) +{ + struct dump_data *data = (struct dump_data *)info; + + spin_lock(&debugfs_lock); + numa_cpu_dump(data->m); + spin_unlock(&debugfs_lock); +} + +static int stats_show(struct seq_file *m, void *v) +{ + int cpu; + struct dump_data data = { + .m = m, + }; + + for_each_online_cpu(cpu) + smp_call_function_single(cpu, cpu_dump, &data, 1); + + return 0; +} + +DEFINE_SHOW_ATTRIBUTE(stats); + +static int pgtables_show(struct seq_file *m, void *v) +{ + struct dump_data data = { + .m = m, + }; + + dump_kernel_pgtables(&data, + KERNEL_TEXT_START, KERNEL_RODATA_END - 1); + + return 0; +} + +DEFINE_SHOW_ATTRIBUTE(pgtables); + +void debugfs_init(void) +{ + struct dentry *dir; + static struct dentry *debugfs_dir; + + debugfs_dir = debugfs_create_dir("numa_replication", NULL); + if (IS_ERR(debugfs_dir)) { + pr_err("Failed to create debugfs entry for NUMA" + " replication: %ld\n", + PTR_ERR(debugfs_dir)); + return; + } + dir = debugfs_create_file("stats", 0400, debugfs_dir, + NULL, &stats_fops); + if (IS_ERR(dir)) { + pr_err("Failed to create debugfs entry for NUMA" + " replication stats: %ld\n", + PTR_ERR(dir)); + return; + } + + dir = debugfs_create_file("pgtables_kernel", 0400, debugfs_dir, + NULL, &pgtables_fops); + if (IS_ERR(dir)) { + pr_err("Failed to create debugfs entry for NUMA" + " replication pgtables: %ld\n", + PTR_ERR(dir)); + return; + } + + debugfs_create_bool("pgtables_kernel_extra", 0600, debugfs_dir, + &pgtables_extra); +} + +/* + * The case, when machine has memoryless NUMA nodes + * should be handled in a special way. To do this we + * create node<->memory mapping to have an information + * about the node with memory that memoryless node can use. 
+ */ +static void init_node_to_memory_mapping(void) +{ + int nid; + + for_each_online_node(nid) { + int memory_nid; + int min_dist = INT_MAX; + + node_to_memory_node[nid] = nid; + for_each_memory_node(memory_nid) { + int dist = node_distance(nid, memory_nid); + + if (dist < min_dist) { + min_dist = dist; + node_to_memory_node[nid] = memory_nid; + } + } + pr_info("For node %d memory is on the node - %d\n", + nid, node_to_memory_node[nid]); + } +} + +int numa_get_memory_node(int nid) +{ + return node_to_memory_node[nid]; +} + +/* + * The function creates replica of particular memory area + * and install replicated memory in translation table of + * required NUMA node. + */ +static void replicate_memory(void *dst, unsigned long start, unsigned long end, int nid) +{ + pgd_t *pgd; + p4d_t *p4d; + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + pgprot_t prot; + unsigned int offset_in_pages = 0; + unsigned long vaddr = start; + struct page *pages = virt_to_page(dst); + + memcpy(dst, lm_alias(start), end - start); + while (vaddr < end) { + pgd = pgd_offset_pgd(node_desc[nid].pgd, vaddr); + p4d = p4d_offset(pgd, vaddr); + pud = pud_offset(p4d, vaddr); + pmd = pmd_offset(pud, vaddr); + + if (pmd_leaf(*pmd)) { + prot = pmd_pgprot(*pmd); + + set_pmd(pmd, pfn_pmd(page_to_pfn(pages) + offset_in_pages, prot)); + offset_in_pages += PAGES_PER_PMD; + vaddr += PMD_SIZE; + continue; + } + pte = pte_offset_kernel(pmd, vaddr); + prot = pte_pgprot(*pte); + __set_pte(pte, pfn_pte(page_to_pfn(pages) + offset_in_pages, prot)); + offset_in_pages++; + vaddr += PAGE_SIZE; + } +} + +static void __init replicate_kernel_text(int nid) +{ + replicate_memory(node_desc[nid].text_vaddr, + KERNEL_TEXT_START, KERNEL_TEXT_END, nid); + numa_sync_text_replicas((unsigned long)node_desc[nid].text_vaddr, + (unsigned long)node_desc[nid].text_vaddr + (KERNEL_TEXT_END - KERNEL_TEXT_START)); +} + +static void replicate_kernel_rodata(int nid) +{ + replicate_memory(node_desc[nid].rodata_vaddr, + KERNEL_RODATA_START, KERNEL_RODATA_END, nid); +} + +//'-1' in next functions have only one purpose - prevent unsgined long overflow +static void replicate_pgt_pte(pud_t *dst, pud_t *src, + unsigned long start, unsigned long end, + unsigned int nid) +{ + unsigned long left = start & PMD_MASK; + unsigned long right = (end & PMD_MASK) - 1 + PMD_SIZE; + unsigned long addr; + + pmd_t *clone_pmd = pmd_offset(dst, left); + pmd_t *orig_pmd = pmd_offset(src, left); + + for (addr = left; + (addr >= left && addr < right); addr += PMD_SIZE) { + pgtable_t new_pte; + + if (pmd_none(*orig_pmd) || pmd_huge(*orig_pmd) || + pmd_val(*orig_pmd) == 0) + goto skip; + + pmd_clear(clone_pmd); + new_pte = pte_alloc_one_node(nid, &init_mm); + pmd_populate_kernel(&init_mm, clone_pmd, page_to_virt(new_pte)); + BUG_ON(new_pte == NULL); + + copy_page(page_to_virt(pmd_pgtable(*clone_pmd)), + page_to_virt(pmd_pgtable(*orig_pmd))); +skip: + clone_pmd++; + orig_pmd++; + } +} + +//'-1' in next functions have only one purpose - prevent unsgined long overflow +static void replicate_pgt_pmd(p4d_t *dst, p4d_t *src, + unsigned long start, unsigned long end, + unsigned int nid) +{ + unsigned long left = start & PUD_MASK; + unsigned long right = (end & PUD_MASK) - 1 + PUD_SIZE; + + pud_t *clone_pud = pud_offset(dst, left); + pud_t *orig_pud = pud_offset(src, left); + + for (unsigned long addr = left; + (addr >= left && addr < right); addr += PUD_SIZE) { + pmd_t *new_pmd; + + if (pud_none(*orig_pud) || pud_huge(*orig_pud) || + pud_val(*orig_pud) == 0) + goto skip; + + pud_clear(clone_pud); + 
new_pmd = pmd_alloc_node(nid, &init_mm, clone_pud, addr); + BUG_ON(new_pmd == NULL); + + copy_page(pud_pgtable(*clone_pud), pud_pgtable(*orig_pud)); + + replicate_pgt_pte(clone_pud, orig_pud, max(addr, start), + min(addr - 1 + PUD_SIZE, end), nid); +skip: + clone_pud++; + orig_pud++; + } +} + +static void replicate_pgt_pud(pgd_t *dst, pgd_t *src, + unsigned long start, unsigned long end, + unsigned int nid) +{ + unsigned long left = start & P4D_MASK; + unsigned long right = (end & P4D_MASK) - 1 + P4D_SIZE; + + p4d_t *clone_p4d = p4d_offset(dst, left); + p4d_t *orig_p4d = p4d_offset(src, left); + + for (unsigned long addr = left; + (addr >= left && addr < right); addr += P4D_SIZE) { + pud_t *new_pud; + + if (p4d_none(*orig_p4d) || p4d_huge(*orig_p4d) || + p4d_val(*orig_p4d) == 0) + goto skip; + + p4d_clear(clone_p4d); + new_pud = pud_alloc_node(nid, &init_mm, clone_p4d, addr); + BUG_ON(new_pud == NULL); + + copy_page(p4d_pgtable(*clone_p4d), p4d_pgtable(*orig_p4d)); + /* + * start and end passed to the next function must be in + * range of p4ds, so min and max are used here + */ + replicate_pgt_pmd(clone_p4d, orig_p4d, max(addr, start), + min(addr - 1 + P4D_SIZE, end), nid); +skip: + clone_p4d++; + orig_p4d++; + } +} + +static void replicate_pgt_p4d(pgd_t *dst, pgd_t *src, + unsigned long start, unsigned long end, + unsigned int nid) +{ + unsigned long left = start & PGDIR_MASK; + unsigned long right = (end & PGDIR_MASK) - 1 + PGDIR_SIZE; + + pgd_t *clone_pgd = pgd_offset_pgd(dst, left); + pgd_t *orig_pgd = pgd_offset_pgd(src, left); + + for (unsigned long addr = left; + (addr >= left && addr < right); addr += PGDIR_SIZE) { + p4d_t *new_p4d; + + /* TODO: remove last condition and do something better + * In the case of a folded P4D level, pgd_none and pgd_huge + * always return 0, so we might start to replicate empty entries. + * We obviously want to avoid this, so the last check is performed here. + */ + if (pgd_none(*orig_pgd) || pgd_huge(*orig_pgd) || + pgd_val(*orig_pgd) == 0) + goto skip; + + pgd_clear(clone_pgd); + new_p4d = p4d_alloc_node(nid, &init_mm, clone_pgd, addr); + BUG_ON(new_p4d == NULL); + + copy_page((void *)pgd_page_vaddr(*clone_pgd), + (void *)pgd_page_vaddr(*orig_pgd)); + replicate_pgt_pud(clone_pgd, orig_pgd, max(addr, start), + min(addr - 1 + PGDIR_SIZE, end), nid); +skip: + clone_pgd++; + orig_pgd++; + } +} + +static void replicate_pgt(int nid, unsigned long start, unsigned long end) +{ + replicate_pgt_p4d(node_desc[nid].pgd, init_mm.pgd, start, end, nid); +} + +/* + * Page tables replication works in a way when first + * pgd level replicated and then the replication of the + * left part if done. The only part of pagetable that + * contains text and rodata is replicated. Obviously a + * part of upper layer entries of page table should be + * replicated too. As result, the pgd, p4d, pud and pmd + * layers are touched by replication. In particular, the + * page table sub-tree that cover kernel text and rodata. 
+ */ +static void replicate_pgtables(void) +{ + int nid; + + init_mm.pgd_numa = (pgd_t **)kmalloc(sizeof(pgd_t *) * MAX_NUMNODES, GFP_PGTABLE_KERNEL); + BUG_ON(!init_mm.pgd_numa); + + for_each_memory_node(nid) { + node_desc[nid].pgd = numa_replicate_pgt_pgd(nid); + replicate_pgt(nid, PAGE_TABLE_REPLICATION_LEFT, + PAGE_TABLE_REPLICATION_RIGHT); + } + + init_mm.pgd = node_desc[numa_get_memory_node(0)].pgd; + + for_each_online_node(nid) { + int memory_nid = numa_get_memory_node(nid); + + init_mm.pgd_numa[nid] = node_desc[memory_nid].pgd; + } +} + +/* + * Kernel text replication includes two steps: + * 1. page tables replication for init_mm + * 2. kernel text pages replication and + * corresponding page table update. + * 3. setup page table, related to + * current NUMA node on current cpu, + * for other NUMA cpus page tables will + * be updated later, during cpu initialization. + * Master node - the first NUMA node, used as + * a source for replicas. Memory for master node + * is expected to be already local. + */ +void __init numa_replicate_kernel_text(void) +{ + int nid; + + replicate_pgtables(); + + for_each_memory_node(nid) { + if (nid == master_node) + continue; + replicate_kernel_text(nid); + } + + text_replicated = true; + + if (!mm_p4d_folded(&init_mm)) + prop_level = PGD_PROPAGATION; + if (mm_p4d_folded(&init_mm) && !mm_pud_folded(&init_mm)) + prop_level = P4D_PROPAGATION; + if (mm_p4d_folded(&init_mm) && mm_pud_folded(&init_mm) && !mm_pmd_folded(&init_mm)) + prop_level = PUD_PROPAGATION; + if (mm_p4d_folded(&init_mm) && mm_pud_folded(&init_mm) && mm_pmd_folded(&init_mm)) + prop_level = PMD_PROPAGATION; + + BUG_ON(prop_level == NONE); + + numa_setup_pgd(); +} + +void numa_replicate_kernel_rodata(void) +{ + int nid; + + for_each_memory_node(nid) { + if (nid == master_node) + continue; + replicate_kernel_rodata(nid); + } + + flush_tlb_all(); +} + +void numa_setup_pgd(void) +{ + numa_load_replicated_pgd(init_mm.pgd_numa[numa_node_id()]); +} + +void __init_or_module *numa_get_replica(void *vaddr, int nid) +{ + unsigned long addr = (unsigned long)vaddr; + unsigned long offset = addr - KERNEL_TEXT_START; + + BUG_ON(addr < KERNEL_TEXT_START || addr >= KERNEL_TEXT_END); + BUG_ON(node_desc[nid].text_vaddr == NULL); + BUG_ON(numa_get_memory_node(nid) != nid); + + return node_desc[nid].text_vaddr + offset; +} + +nodemask_t __ro_after_init replica_nodes = { { [0] = 1UL } }; + +void __init numa_replication_init(void) +{ + int nid; + + unsigned long align = PAGE_SIZE; +#ifdef CONFIG_ARM64_4K_PAGES + align = HPAGE_SIZE; +#else + align = CONT_PTE_SIZE; +#endif + nodes_clear(replica_nodes); + + for_each_node_state(nid, N_MEMORY) { + __node_set(nid, &replica_nodes); + } + + for_each_memory_node(nid) + pr_info("Memory node: %d\n", nid); + + init_node_to_memory_mapping(); + master_node = page_to_nid(virt_to_page(lm_alias((void *)KERNEL_TEXT_START))); + + pr_info("Master Node: #%d\n", master_node); + for_each_memory_node(nid) { + if (nid == master_node) { + node_desc[nid].text_vaddr = lm_alias((void *)KERNEL_TEXT_START); + node_desc[nid].rodata_vaddr = lm_alias((void *)KERNEL_RODATA_START); + } else { + node_desc[nid].text_vaddr = memblock_alloc_try_nid( + (KERNEL_TEXT_END - KERNEL_TEXT_START), + align, 0, MEMBLOCK_ALLOC_ANYWHERE, nid); + + node_desc[nid].rodata_vaddr = memblock_alloc_try_nid( + (KERNEL_RODATA_END - KERNEL_RODATA_START), + align, 0, MEMBLOCK_ALLOC_ANYWHERE, nid); + } + + BUG_ON(node_desc[nid].text_vaddr == NULL); + BUG_ON(node_desc[nid].rodata_vaddr == NULL); + } +} + +void 
numa_replication_fini(void) +{ + int nid; + + /* + * Clear addresses form linear space + */ + for_each_memory_node(nid) { + node_desc[nid].text_vaddr = NULL; + node_desc[nid].rodata_vaddr = NULL; + } + + debugfs_init(); + + pr_info("Replicated page table : [%p --- %p]\n", + (void *)PAGE_TABLE_REPLICATION_LEFT, + (void *)PAGE_TABLE_REPLICATION_RIGHT); + + dump_kernel_pgtables(NULL, KERNEL_TEXT_START, KERNEL_RODATA_END - 1); +} diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 36cd38df06140f01c03413e74e8dc90bc1c4df81..d2b1191efa284e99df96882a594bd591dad7234e 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4872,6 +4872,26 @@ unsigned long get_zeroed_page(gfp_t gfp_mask) } EXPORT_SYMBOL(get_zeroed_page); +#ifdef CONFIG_KERNEL_REPLICATION +unsigned long __get_free_pages_node(unsigned int nid, gfp_t gfp_mask, + unsigned int order) +{ + struct page *page; + + page = alloc_pages_node(nid, gfp_mask & ~__GFP_HIGHMEM, order); + if (!page) + return 0; + return (unsigned long) page_address(page); +} +EXPORT_SYMBOL(__get_free_pages_node); + +unsigned long get_zeroed_page_node(unsigned int nid, gfp_t gfp_mask) +{ + return __get_free_pages_node(nid, gfp_mask | __GFP_ZERO, 0); +} +EXPORT_SYMBOL(get_zeroed_page_node); +#endif /* CONFIG_KERNEL_REPLICATION */ + /** * __free_pages - Free pages allocated with alloc_pages(). * @page: The page pointer returned from alloc_pages(). diff --git a/mm/vmalloc.c b/mm/vmalloc.c index cb0951fea2385f9725b995056e4fe8c104871ef7..a136e86e6480242553b3546edb43d2ef5b0941b8 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -10,6 +10,7 @@ #include #include +#include #include #include #include @@ -420,18 +421,17 @@ static void vunmap_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end, } /* - * vunmap_range_noflush is similar to vunmap_range, but does not - * flush caches or TLBs. + * vunmap_range_noflush_pgd is similar to vunmap_range, but does not + * flush caches or TLBs, and able to work with pgd granularity. * * The caller is responsible for calling flush_cache_vmap() before calling * this function, and flush_tlb_kernel_range after it has returned * successfully (and before the addresses are expected to cause a page fault * or be re-mapped for something else, if TLB flushes are being delayed or * coalesced). - * - * This is an internal function only. Do not use outside mm/. */ -void __vunmap_range_noflush(unsigned long start, unsigned long end) +static void vunmap_range_noflush_pgd(pgd_t *pgtable, + unsigned long start, unsigned long end) { unsigned long next; pgd_t *pgd; @@ -439,7 +439,7 @@ void __vunmap_range_noflush(unsigned long start, unsigned long end) pgtbl_mod_mask mask = 0; BUG_ON(addr >= end); - pgd = pgd_offset_k(addr); + pgd = pgd_offset_pgd(pgtable, addr); do { next = pgd_addr_end(addr, end); if (pgd_bad(*pgd)) @@ -453,6 +453,17 @@ void __vunmap_range_noflush(unsigned long start, unsigned long end) arch_sync_kernel_mappings(start, end); } +/* + * vunmap_range_noflush is similar to vunmap_range_noflush_pgd, but works + * only with init_mm->pgd. + * + * This is an internal function only. Do not use outside mm/. 
+ */ +void __vunmap_range_noflush(unsigned long start, unsigned long end) +{ + vunmap_range_noflush_pgd(init_mm.pgd, start, end); +} + void vunmap_range_noflush(unsigned long start, unsigned long end) { kmsan_vunmap_range_noflush(start, end); @@ -475,6 +486,18 @@ void vunmap_range(unsigned long addr, unsigned long end) flush_tlb_kernel_range(addr, end); } +#ifdef CONFIG_KERNEL_REPLICATION +void vunmap_range_replicas(unsigned long addr, unsigned long end) +{ + int nid; + + flush_cache_vunmap(addr, end); + for_each_memory_node(nid) + vunmap_range_noflush_pgd(init_mm.pgd_numa[nid], addr, end); + flush_tlb_kernel_range(addr, end); +} +#endif /* CONFIG_KERNEL_REPLICATION && CONFIG_ARM64 */ + static int vmap_pages_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, pgprot_t prot, struct page **pages, int *nr, pgtbl_mod_mask *mask) @@ -560,7 +583,8 @@ static int vmap_pages_p4d_range(pgd_t *pgd, unsigned long addr, return 0; } -static int vmap_small_pages_range_noflush(unsigned long addr, unsigned long end, +static int vmap_small_pages_range_noflush_pgd(pgd_t *pgtable, + unsigned long addr, unsigned long end, pgprot_t prot, struct page **pages) { unsigned long start = addr; @@ -571,7 +595,7 @@ static int vmap_small_pages_range_noflush(unsigned long addr, unsigned long end, pgtbl_mod_mask mask = 0; BUG_ON(addr >= end); - pgd = pgd_offset_k(addr); + pgd = pgd_offset_pgd(pgtable, addr); do { next = pgd_addr_end(addr, end); if (pgd_bad(*pgd)) @@ -587,8 +611,38 @@ static int vmap_small_pages_range_noflush(unsigned long addr, unsigned long end, return 0; } +static int vmap_range_noflush_pgd(pgd_t *pgtable, + unsigned long addr, unsigned long end, + phys_addr_t phys_addr, pgprot_t prot, + unsigned int max_page_shift) +{ + pgd_t *pgd; + unsigned long start; + unsigned long next; + int err; + pgtbl_mod_mask mask = 0; + + might_sleep(); + BUG_ON(addr >= end); + + start = addr; + pgd = pgd_offset_pgd(pgtable, addr); + do { + next = pgd_addr_end(addr, end); + err = vmap_p4d_range(pgd, addr, next, phys_addr, prot, + max_page_shift, &mask); + if (err) + break; + } while (pgd++, phys_addr += (next - addr), addr = next, addr != end); + + if (mask & ARCH_PAGE_TABLE_SYNC_MASK) + arch_sync_kernel_mappings(start, end); + + return err; +} + /* - * vmap_pages_range_noflush is similar to vmap_pages_range, but does not + * vmap_pages_range_noflush_pgd is similar to vmap_pages_range, but does not * flush caches. * * The caller is responsible for calling flush_cache_vmap() after this @@ -596,8 +650,10 @@ static int vmap_small_pages_range_noflush(unsigned long addr, unsigned long end, * * This is an internal function only. Do not use outside mm/. 
*/ -int __vmap_pages_range_noflush(unsigned long addr, unsigned long end, - pgprot_t prot, struct page **pages, unsigned int page_shift) +static int vmap_pages_range_noflush_pgd(pgd_t *pgtable, + unsigned long addr, unsigned long end, + pgprot_t prot, struct page **pages, + unsigned int page_shift) { unsigned int i, nr = (end - addr) >> PAGE_SHIFT; @@ -605,12 +661,13 @@ int __vmap_pages_range_noflush(unsigned long addr, unsigned long end, if (!IS_ENABLED(CONFIG_HAVE_ARCH_HUGE_VMALLOC) || page_shift == PAGE_SHIFT) - return vmap_small_pages_range_noflush(addr, end, prot, pages); + return vmap_small_pages_range_noflush_pgd(pgtable, addr, end, + prot, pages); for (i = 0; i < nr; i += 1U << (page_shift - PAGE_SHIFT)) { int err; - err = vmap_range_noflush(addr, addr + (1UL << page_shift), + err = vmap_range_noflush_pgd(pgtable, addr, addr + (1UL << page_shift), page_to_phys(pages[i]), prot, page_shift); if (err) @@ -630,7 +687,8 @@ int vmap_pages_range_noflush(unsigned long addr, unsigned long end, if (ret) return ret; - return __vmap_pages_range_noflush(addr, end, prot, pages, page_shift); + + return vmap_pages_range_noflush_pgd(init_mm.pgd, addr, end, prot, pages, page_shift); } /** @@ -730,57 +788,12 @@ EXPORT_SYMBOL_GPL(is_vmalloc_or_module_addr); */ struct page *vmalloc_to_page(const void *vmalloc_addr) { - unsigned long addr = (unsigned long) vmalloc_addr; - struct page *page = NULL; - pgd_t *pgd = pgd_offset_k(addr); - p4d_t *p4d; - pud_t *pud; - pmd_t *pmd; - pte_t *ptep, pte; - /* * XXX we might need to change this if we add VIRTUAL_BUG_ON for * architectures that do not vmalloc module space */ VIRTUAL_BUG_ON(!is_vmalloc_or_module_addr(vmalloc_addr)); - - if (pgd_none(*pgd)) - return NULL; - if (WARN_ON_ONCE(pgd_leaf(*pgd))) - return NULL; /* XXX: no allowance for huge pgd */ - if (WARN_ON_ONCE(pgd_bad(*pgd))) - return NULL; - - p4d = p4d_offset(pgd, addr); - if (p4d_none(*p4d)) - return NULL; - if (p4d_leaf(*p4d)) - return p4d_page(*p4d) + ((addr & ~P4D_MASK) >> PAGE_SHIFT); - if (WARN_ON_ONCE(p4d_bad(*p4d))) - return NULL; - - pud = pud_offset(p4d, addr); - if (pud_none(*pud)) - return NULL; - if (pud_leaf(*pud)) - return pud_page(*pud) + ((addr & ~PUD_MASK) >> PAGE_SHIFT); - if (WARN_ON_ONCE(pud_bad(*pud))) - return NULL; - - pmd = pmd_offset(pud, addr); - if (pmd_none(*pmd)) - return NULL; - if (pmd_leaf(*pmd)) - return pmd_page(*pmd) + ((addr & ~PMD_MASK) >> PAGE_SHIFT); - if (WARN_ON_ONCE(pmd_bad(*pmd))) - return NULL; - - ptep = pte_offset_kernel(pmd, addr); - pte = ptep_get(ptep); - if (pte_present(pte)) - page = pte_page(pte); - - return page; + return walk_to_page_node(first_memory_node, vmalloc_addr); } EXPORT_SYMBOL(vmalloc_to_page); @@ -2357,7 +2370,22 @@ static void free_vmap_area_noflush(struct vmap_area *va) static void free_unmap_vmap_area(struct vmap_area *va) { flush_cache_vunmap(va->va_start, va->va_end); +#ifdef CONFIG_KERNEL_REPLICATION + if (numa_addr_has_replica((void *)va->va_start)) { + int node; + /** + * In some scenarios we might clear + * empty entries here, which is totally fine + */ + for_each_memory_node(node) + vunmap_range_noflush_pgd(init_mm.pgd_numa[node], + va->va_start, va->va_end); + } else { + vunmap_range_noflush(va->va_start, va->va_end); + } +#else vunmap_range_noflush(va->va_start, va->va_end); +#endif /* CONFIG_KERNEL_REPLICATION */ if (debug_pagealloc_enabled_static()) flush_tlb_kernel_range(va->va_start, va->va_end); @@ -3216,16 +3244,73 @@ struct vm_struct *remove_vm_area(const void *addr) return vm; } +#ifdef CONFIG_KERNEL_REPLICATION 
+static inline void set_direct_map_page_replicas(const struct vm_struct *area, + struct page *page, + int (*set_direct_map)(struct page *page)) +{ + if (area->replicated) { + struct page *cursor; + + list_for_each_entry(cursor, &page->lru, lru) { + if (page_address(cursor)) + set_direct_map(cursor); + } + } +} +#endif /* CONFIG_KERNEL_REPLICATION */ + static inline void set_area_direct_map(const struct vm_struct *area, int (*set_direct_map)(struct page *page)) { int i; /* HUGE_VMALLOC passes small pages to set_direct_map */ - for (i = 0; i < area->nr_pages; i++) + for (i = 0; i < area->nr_pages; i++) { if (page_address(area->pages[i])) set_direct_map(area->pages[i]); +#ifdef CONFIG_KERNEL_REPLICATION + set_direct_map_page_replicas(area, + area->pages[i], set_direct_map); +#endif /* CONFIG_KERNEL_REPLICATION */ + } +} + +#ifdef CONFIG_KERNEL_REPLICATION +static void vm_account_replicated_range(struct vm_struct *area, + struct page *page, + unsigned long *s, + unsigned long *e, + int *flush) +{ + int flush_dmap = 0; + unsigned long start = ULONG_MAX, end = 0; + unsigned int page_order = vm_area_page_order(area); + + if (area->replicated) { + struct page *cursor; + + list_for_each_entry(cursor, &page->lru, lru) { + unsigned long addr = (unsigned long)page_address(cursor); + + if (addr) { + unsigned long page_size; + + page_size = PAGE_SIZE << page_order; + start = min(addr, start); + end = max(addr + page_size, end); + flush_dmap = 1; + } + } + } + + if (flush_dmap) + *flush = flush_dmap; + + *s = start; + *e = end; } +#endif /* CONFIG_KERNEL_REPLICATION */ /* * Flush the vm mapping and reset the direct map. @@ -3252,6 +3337,10 @@ static void vm_reset_perms(struct vm_struct *area) end = max(addr + page_size, end); flush_dmap = 1; } +#ifdef CONFIG_KERNEL_REPLICATION + vm_account_replicated_range(area, area->pages[i], + &start, &end, &flush_dmap); +#endif /* CONFIG_KERNEL_REPLICATION */ } /* @@ -3297,6 +3386,28 @@ void vfree_atomic(const void *addr) schedule_work(&p->wq); } +#ifdef CONFIG_KERNEL_REPLICATION +static void vfree_page_replicas(struct vm_struct *area, struct page *page) +{ + if (area->replicated) { + struct page *cursor, *tmp; + + list_for_each_entry_safe(cursor, tmp, &page->lru, lru) { + BUG_ON(!cursor); + + list_del(&cursor->lru); + mod_memcg_page_state(cursor, MEMCG_VMALLOC, -1); + /* + * High-order allocs for huge vmallocs are split, so + * can be freed as an array of order-0 allocations + */ + __free_pages(cursor, 0); + cond_resched(); + } + } +} +#endif /* CONFIG_KERNEL_REPLICATION */ + /** * vfree - Release memory allocated by vmalloc() * @addr: Memory base address @@ -3343,6 +3454,9 @@ void vfree(const void *addr) for (i = 0; i < vm->nr_pages; i++) { struct page *page = vm->pages[i]; +#ifdef CONFIG_KERNEL_REPLICATION + vfree_page_replicas(vm, page); +#endif /* CONFIG_KERNEL_REPLICATION */ BUG_ON(!page); mod_memcg_page_state(page, MEMCG_VMALLOC, -1); /* @@ -3600,26 +3714,91 @@ vm_area_alloc_pages(gfp_t gfp, int nid, return nr_allocated; } +static int vmalloc_map_area_pages_pgd(unsigned long addr, + struct page **pages, unsigned long size, + gfp_t gfp_mask, pgprot_t prot, + unsigned int page_shift, pgd_t *pgd) +{ + int ret = 0; + unsigned int flags; + bool nofail = gfp_mask & __GFP_NOFAIL; + + /* + * page tables allocations ignore external gfp mask, enforce it + * by the scope API + */ + if ((gfp_mask & (__GFP_FS | __GFP_IO)) == __GFP_IO) + flags = memalloc_nofs_save(); + else if ((gfp_mask & (__GFP_FS | __GFP_IO)) == 0) + flags = memalloc_noio_save(); + + do { + ret = 
vmap_pages_range_noflush_pgd(pgd, addr, addr + size, + prot, pages, page_shift); + if (nofail && (ret < 0)) + schedule_timeout_uninterruptible(1); + } while (nofail && (ret < 0)); + + if ((gfp_mask & (__GFP_FS | __GFP_IO)) == __GFP_IO) + memalloc_nofs_restore(flags); + else if ((gfp_mask & (__GFP_FS | __GFP_IO)) == 0) + memalloc_noio_restore(flags); + + if (ret < 0) { + warn_alloc(gfp_mask, NULL, + "vmalloc error: size %lu, failed to map pages", + size); + } + + return ret; +} + +static int vmalloc_map_area_pages(unsigned long addr, unsigned long size, + struct vm_struct *area, + gfp_t gfp_mask, pgprot_t prot, + unsigned int page_shift) +{ + int ret; +#ifdef CONFIG_KERNEL_REPLICATION + int nid; + + if (area->flags & VM_NUMA_SHARED) { + for_each_memory_node(nid) { + pgd_t *pgd = per_node_pgd(&init_mm, nid); + + ret = vmalloc_map_area_pages_pgd(addr, area->pages, size, + gfp_mask, prot, page_shift, pgd); + if (ret) + return ret; + } + } else { + ret = vmalloc_map_area_pages_pgd(addr, area->pages, size, + gfp_mask, prot, page_shift, init_mm.pgd); + } +#else + ret = vmalloc_map_area_pages_pgd(addr, area->pages, size, + gfp_mask, prot, page_shift, init_mm.pgd); +#endif /* CONFIG_KERNEL_REPLICATION */ + return ret; +} + static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, pgprot_t prot, unsigned int page_shift, int node) { const gfp_t nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO; - bool nofail = gfp_mask & __GFP_NOFAIL; unsigned long addr = (unsigned long)area->addr; unsigned long size = get_vm_area_size(area); unsigned long array_size; unsigned int nr_small_pages = size >> PAGE_SHIFT; unsigned int page_order; - unsigned int flags; - int ret; + int ret = 0; array_size = (unsigned long)nr_small_pages * sizeof(struct page *); if (!(gfp_mask & (GFP_DMA | GFP_DMA32))) gfp_mask |= __GFP_HIGHMEM; - /* Please note that the recursion is strictly bounded. 
*/ if (array_size > PAGE_SIZE) { area->pages = __vmalloc_node(array_size, 1, nested_gfp, node, area->caller); @@ -3631,8 +3810,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, warn_alloc(gfp_mask, NULL, "vmalloc error: size %lu, failed to allocated page array size %lu", nr_small_pages * PAGE_SIZE, array_size); - free_vm_area(area); - return NULL; + goto fail; } set_vm_area_page_order(area, page_shift - PAGE_SHIFT); @@ -3671,33 +3849,10 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, goto fail; } - /* - * page tables allocations ignore external gfp mask, enforce it - * by the scope API - */ - if ((gfp_mask & (__GFP_FS | __GFP_IO)) == __GFP_IO) - flags = memalloc_nofs_save(); - else if ((gfp_mask & (__GFP_FS | __GFP_IO)) == 0) - flags = memalloc_noio_save(); - - do { - ret = vmap_pages_range(addr, addr + size, prot, area->pages, - page_shift); - if (nofail && (ret < 0)) - schedule_timeout_uninterruptible(1); - } while (nofail && (ret < 0)); - - if ((gfp_mask & (__GFP_FS | __GFP_IO)) == __GFP_IO) - memalloc_nofs_restore(flags); - else if ((gfp_mask & (__GFP_FS | __GFP_IO)) == 0) - memalloc_noio_restore(flags); - - if (ret < 0) { - warn_alloc(gfp_mask, NULL, - "vmalloc error: size %lu, failed to map pages", - area->nr_pages * PAGE_SIZE); + ret = vmalloc_map_area_pages(addr, size, area, gfp_mask, prot, page_shift); + if (ret) goto fail; - } + flush_cache_vmap(addr, addr + size); return area->addr; @@ -3797,6 +3952,11 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align, goto fail; } +#ifdef CONFIG_KERNEL_REPLICATION + if (numa_addr_has_replica(area->addr)) + vm_flags |= VM_NUMA_SHARED; + area->node = node; +#endif /* * Prepare arguments for __vmalloc_area_node() and * kasan_unpoison_vmalloc(). 
@@ -3891,6 +4051,129 @@ void *__vmalloc_node(unsigned long size, unsigned long align, return __vmalloc_node_range(size, align, VMALLOC_START, VMALLOC_END, gfp_mask, PAGE_KERNEL, 0, node, caller); } + +#ifdef CONFIG_KERNEL_REPLICATION +static void numa_replicate_page_range(struct page **src, struct page **dst, int nr_pages) +{ + int i; + void *from, *to; + + for (i = 0; i < nr_pages; i++) { + from = kmap(src[i]); + to = kmap(dst[i]); + + copy_page(to, from); + + kunmap(src[i]); + kunmap(dst[i]); + } +} + +int __vmalloc_node_replicate_range(const void *addr, gfp_t gfp_mask, + pgprot_t prot, unsigned long vm_flags) +{ + int i, ret, node = 0; + struct vm_struct *area; + unsigned int page_order; + unsigned int nr_allocated; + struct page **pages; + unsigned long area_start, area_end; + const gfp_t nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO; + unsigned long array_size; + + gfp_mask |= __GFP_NOWARN; + if (!(gfp_mask & (GFP_DMA | GFP_DMA32))) + gfp_mask |= __GFP_HIGHMEM; + + if (unlikely(!numa_addr_has_replica(addr))) + return -EINVAL; + + area = find_vm_area(addr); + if (unlikely(!area)) + return -ENOENT; + + if (area->node == NUMA_NO_NODE) + return -EINVAL; + + array_size = sizeof(struct page *) * area->nr_pages; + if (array_size > PAGE_SIZE) + pages = __vmalloc(array_size, nested_gfp); + else + pages = kmalloc(array_size, nested_gfp); + + if (!pages) + return -ENOMEM; + + page_order = vm_area_page_order(area); + for (i = 0; i < area->nr_pages; i++) + INIT_LIST_HEAD(&area->pages[i]->lru); + + area_start = (unsigned long)area->addr; + area_end = (unsigned long)(area->addr + area->nr_pages * PAGE_SIZE); + + for_each_memory_node(node) { + if (area->node == node) + continue; + + nr_allocated = vm_area_alloc_pages(gfp_mask | __GFP_NOWARN, + node, page_order, area->nr_pages, pages); + if (nr_allocated != area->nr_pages) + goto fail_alloc_pages; + + for (i = 0; i < area->nr_pages; i++) + list_add(&pages[i]->lru, &area->pages[i]->lru); + + vunmap_range_noflush_pgd(init_mm.pgd_numa[node], + area_start, area_end); + + /* + * We can't fail here (hopefully) + * Possible errors: not enough memory for tables and not empty entries. + * Both unrealistic because we just cleared entries in existed tables. + */ + ret = vmalloc_map_area_pages_pgd(area_start, pages, + nr_allocated * PAGE_SIZE, + gfp_mask, prot, PAGE_SHIFT, + per_node_pgd(&init_mm, node)); + if (ret != 0) + goto fail_map_pages; + + atomic_long_add(nr_allocated, &nr_vmalloc_pages); + if (gfp_mask & __GFP_ACCOUNT) { + for (i = 0; i < nr_allocated; i++) + mod_memcg_page_state(pages[i], MEMCG_VMALLOC, 1); + } + numa_replicate_page_range(area->pages, pages, area->nr_pages); + + for (i = 0; i < area->nr_pages; i++) + pages[i] = NULL; + } + kvfree(pages); + + flush_tlb_kernel_range(area_start, area_end); + area->replicated = true; + + return 0; +fail_alloc_pages: + for (i = 0; i < nr_allocated; i++) + __free_pages(pages[i], 0); + +fail_map_pages: + kfree(pages); + for (i = 0; i < area->nr_pages; i++) { + struct page *page, *tmp; + + list_for_each_entry_safe(page, tmp, &area->pages[i]->lru, lru) { + list_del(&page->lru); + mod_memcg_page_state(page, MEMCG_VMALLOC, -1); + __free_pages(page, 0); + } + } + + return ret; +} +#endif /* CONFIG_KERNEL_REPLICATION */ + /* * This is only for performance analysis of vmalloc and stress purpose. 
* It is required by vmalloc test module, therefore do not use it other diff --git a/net/bpf/bpf_dummy_struct_ops.c b/net/bpf/bpf_dummy_struct_ops.c index 5918d1b32e196005330fe51b0dabcb5b7709b910..45a5dbd379ac1bbde88a8d1ffd36bda264154614 100644 --- a/net/bpf/bpf_dummy_struct_ops.c +++ b/net/bpf/bpf_dummy_struct_ops.c @@ -124,7 +124,7 @@ int bpf_struct_ops_test_run(struct bpf_prog *prog, const union bpf_attr *kattr, if (err < 0) goto out; - set_memory_rox((long)image, 1); + numa_set_memory_rox((long)image, 1); prog_ret = dummy_ops_call_op(image, args); err = dummy_ops_copy_args(args);