diff --git a/arch/arm64/configs/openeuler_defconfig b/arch/arm64/configs/openeuler_defconfig index 7ea03154ad05cb7de5177855091035771e1d5c99..b8d554996c579557443993989b77ac2ac43025bb 100644 --- a/arch/arm64/configs/openeuler_defconfig +++ b/arch/arm64/configs/openeuler_defconfig @@ -6285,6 +6285,8 @@ CONFIG_HUGETLBFS=y CONFIG_HUGETLB_PAGE=y CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP=y # CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON is not set +CONFIG_ENHANCED_HUGETLB_MMAP=y +CONFIG_EXEC_HUGETLB=y CONFIG_MEMFD_CREATE=y CONFIG_ARCH_HAS_GIGANTIC_PAGE=y CONFIG_CONFIGFS_FS=y diff --git a/arch/x86/configs/openeuler_defconfig b/arch/x86/configs/openeuler_defconfig index 2efddfcafb1c79c9447743aca21d49aaace71464..6abf47dabee013ba3d871eaf06df80adb6de4f19 100644 --- a/arch/x86/configs/openeuler_defconfig +++ b/arch/x86/configs/openeuler_defconfig @@ -7373,6 +7373,8 @@ CONFIG_HUGETLB_PAGE=y CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP=y # CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON is not set CONFIG_DYNAMIC_HUGETLB=y +CONFIG_ENHANCED_HUGETLB_MMAP=y +CONFIG_EXEC_HUGETLB=y CONFIG_MEMFD_CREATE=y CONFIG_ARCH_HAS_GIGANTIC_PAGE=y CONFIG_CONFIGFS_FS=y diff --git a/fs/Kconfig b/fs/Kconfig index aa097ca64ef6ab9d61413dea9faea5cbc3d9d89a..9bc9808694eabb420df102bb861a1e1b33b59e8e 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -270,6 +270,30 @@ config DYNAMIC_HUGETLB pages automatically. The tasks in the memcg prefer to alloc dynamic hugepage. +config ENHANCED_HUGETLB_MMAP + bool "enhanced hugetlb mmap" + default n + depends on HUGETLBFS + help + Add private file mmap for hugetlb. + This feature adds vm_actual_file in vma to record the original file and + copies file contents to hugetlb pages during page fault. + Procfs and perf record will show file name of vm_actual_file. + Hugetlb is useful for optimizing TLB miss rate, and this feature is + aimed to extend its usage. 
+
+config EXEC_HUGETLB
+	bool "use hugetlb in exec"
+	default n
+	depends on ENHANCED_HUGETLB_MMAP
+	help
+	  Some applications suffer from high TLB miss, and users don't like
+	  transparent hugepaged. (A background thread will affect overall
+	  performance and madvise after exec is too late)
+	  This feature provides another way to use huge page for apps by
+	  using hugetlb map in exec.
+	  Only support ELF format now.
+
 config MEMFD_CREATE
 	def_bool TMPFS || HUGETLBFS
 
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index ed507d27034b1dc804e6482d7403bd6bcc0025cd..16abbd7f6fc2cb775578e0260572447d542faf85 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -357,6 +357,153 @@ create_elf_tables(struct linux_binprm *bprm, const struct elfhdr *exec,
 	return 0;
 }
 
+#ifdef CONFIG_EXEC_HUGETLB
+
+#define ELF_HPAGESIZE 0x200000
+#define ELF_HPAGESTART(_v) ((_v) & ~(unsigned long)(ELF_HPAGESIZE - 1))
+#define ELF_HPAGEOFFSET(_v) ((_v) & (ELF_HPAGESIZE - 1))
+#define ELF_HPAGEALIGN(_v) (((_v) + ELF_HPAGESIZE - 1) & ~(ELF_HPAGESIZE - 1))
+
+/*
+ * Set up the bss of a PF_HUGETLB segment.
+ *
+ * @bss:  first address after the file-backed part of the segment
+ * @brk:  first address after the whole segment
+ * @prot: protection passed to vm_mmap() for the anonymous part
+ * @type: mmap flags passed to vm_mmap() for the anonymous part
+ *
+ * The bytes from @bss up to the next 2MB boundary are zeroed in place with
+ * clear_user(). If the 2MB-aligned @brk lies beyond that boundary, the rest
+ * is mapped from a freshly created anonymous hugetlb file.
+ *
+ * Returns 0 on success or a negative errno.
+ */
+static int elf_hugetlb_bss(unsigned long bss, unsigned long brk, int prot,
+		int type)
+{
+	unsigned long zero_byte = ELF_HPAGEOFFSET(bss);
+	struct user_struct *user = NULL;
+	struct file *huge_file;
+	int page_size_log = (MAP_HUGE_2MB >> MAP_HUGE_SHIFT)
+				& MAP_HUGE_MASK;
+
+	if (zero_byte) {
+		zero_byte = ELF_HPAGESIZE - zero_byte;
+		if (clear_user((void __user *) bss, zero_byte))
+			return -EFAULT;
+	}
+	bss = ELF_HPAGEALIGN(bss);
+	brk = ELF_HPAGEALIGN(brk);
+	if (brk > bss) {
+		unsigned long size = brk - bss;
+
+		huge_file = hugetlb_file_setup(HUGETLB_ANON_FILE, size,
+				VM_NORESERVE, &user, HUGETLB_ANONHUGE_INODE,
+				page_size_log);
+		if (IS_ERR(huge_file))
+			return PTR_ERR(huge_file);
+		bss = vm_mmap(huge_file, bss, size, prot, type, 0);
+		/*
+		 * Drop the reference returned by hugetlb_file_setup() on
+		 * success and on failure alike: an established mapping keeps
+		 * its own reference to the file, so holding on to this one
+		 * would leak the file.
+		 */
+		fput(huge_file);
+		if (BAD_ADDR(bss))
+			return -ENOMEM;
+	}
+
+	return 0;
+}
+
+/*
+ * Map a PT_LOAD segment that carries PF_HUGETLB with 2MB hugetlb pages.
+ *
+ * Takes the same arguments as elf_map(). The segment must have p_align
+ * equal to ELF_HPAGESIZE, otherwise -EINVAL is returned. When @total_size
+ * is non-zero, a mapping of that size is created and removed again first
+ * and the address it was placed at is used for the segment. A start address
+ * that is not 2MB aligned gets a normal mapping up to the next 2MB
+ * boundary; the remainder of the file data is mapped with
+ * MAP_FILE_HUGETLB, and any bss is set up by elf_hugetlb_bss().
+ *
+ * Returns the page-aligned start address of the segment, or a negative
+ * errno converted to unsigned long.
+ */
+static unsigned long elf_hugetlb_map(struct file *filep, unsigned long addr,
+		const struct elf_phdr *eppnt, int prot, int type,
+		unsigned long total_size)
+{
+	unsigned long map_addr;
+	unsigned long elf_offset = ELF_PAGEOFFSET(eppnt->p_vaddr);
+	unsigned long size = eppnt->p_filesz + elf_offset;
+	unsigned long off = eppnt->p_offset - elf_offset;
+	int huge_flag = MAP_FILE_HUGETLB | MAP_HUGE_2MB;
+
+	if (eppnt->p_align != ELF_HPAGESIZE)
+		return -EINVAL;
+
+	if (total_size) {
+		total_size = ELF_HPAGEALIGN(total_size);
+		addr = vm_mmap(filep, addr, total_size,
+				PROT_NONE, type | huge_flag, 0);
+		if (BAD_ADDR(addr))
+			return -ENOMEM;
+		vm_munmap(addr, total_size);
+	}
+
+	addr = ELF_PAGESTART(addr);
+	map_addr = addr;
+	type |= MAP_FIXED_NOREPLACE;
+
+	/*
+	 * Addr of relro segment is not aligned.
+	 * Glibc will change the protection of this segment,
+	 * so we use normal mmap to avoid mprotect alignment error.
+	 */
+	if (addr != ELF_HPAGESTART(addr)) {
+		unsigned long size_4k = ELF_HPAGEALIGN(addr) - addr;
+
+		addr = vm_mmap(filep, addr, size_4k, prot, type, off);
+		if (BAD_ADDR(addr))
+			return -ENOMEM;
+		/*
+		 * If the file data ends inside the normal-page head there is
+		 * nothing left for the huge mapping; subtracting size_4k
+		 * unconditionally would underflow.
+		 */
+		if (ELF_PAGEALIGN(size) > size_4k)
+			size = ELF_HPAGEALIGN(ELF_PAGEALIGN(size) - size_4k);
+		else
+			size = 0;
+		addr += size_4k;
+		off += size_4k;
+	} else {
+		size = ELF_HPAGEALIGN(size);
+	}
+
+	if (size) {
+		addr = vm_mmap(filep, addr, size, prot, type | huge_flag, off);
+		if (BAD_ADDR(addr))
+			return -ENOMEM;
+	}
+
+	if (eppnt->p_memsz > eppnt->p_filesz) {
+		int err;
+
+		addr = map_addr + elf_offset;
+		err = elf_hugetlb_bss(addr + eppnt->p_filesz,
+				addr + eppnt->p_memsz, prot, type);
+		if (err)
+			return err;
+	}
+
+	return map_addr;
+}
+
+#endif
+
 static unsigned long elf_map(struct file *filep, unsigned long addr,
 		const struct elf_phdr *eppnt, int prot, int type,
 		unsigned long total_size)
@@ -372,6 +519,12 @@ static unsigned long elf_map(struct file *filep, unsigned long addr,
 	if (!size)
 		return addr;
 
+#ifdef CONFIG_EXEC_HUGETLB
+	if (exec_hugetlb && (eppnt->p_flags & PF_HUGETLB))
+		return elf_hugetlb_map(filep, addr, eppnt, prot, type,
+				total_size);
+#endif
+
 	/*
 	* total_size is the size of the ELF (interpreter) image.
 	* The _first_ mmap needs to know the full size, otherwise
@@ -1196,6 +1349,14 @@ static int load_elf_binary(struct linux_binprm *bprm)
 			bss_prot = elf_prot;
 			elf_brk = k;
 		}
+#ifdef CONFIG_EXEC_HUGETLB
+		/*
+		 * bss is allocated in elf_hugetlb_bss,
+		 * so skip vm_brk_flags in set_brk
+		 */
+		if (exec_hugetlb && (elf_ppnt->p_flags & PF_HUGETLB))
+			elf_bss = elf_brk = ELF_HPAGEALIGN(elf_brk);
+#endif
 	}
 
 	e_entry = elf_ex->e_entry + load_bias;
diff --git a/fs/exec.c b/fs/exec.c
index 4c2d18061633247d926e80b4176135c77145b23f..cf2077bffc0a019085a875297fb7cd55c8d40864 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -74,6 +74,10 @@
 
 #include
 
+#ifdef CONFIG_EXEC_HUGETLB
+int exec_hugetlb;
+#endif
+
 static int bprm_creds_from_file(struct linux_binprm *bprm);
 
 int suid_dumpable = 0;
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 7b8a513d9f69e6b66f8f4203fb64cdf15c84034d..391b967fcfbfdb69d2f4d8c3a48d6dff1742c353 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -280,6 +280,11 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma)
 	dev_t dev = 0;
 	const char *name = NULL;
 
+#ifdef CONFIG_ENHANCED_HUGETLB_MMAP
+	if (vma->vm_actual_file)
+		file = vma->vm_actual_file;
+#endif
+
 	if (file) {
 		struct inode *inode = file_inode(vma->vm_file);
 		dev = inode->i_sb->s_dev;
diff --git a/include/linux/elf.h b/include/linux/elf.h
index 5d5b0321da0bd81db3afa2bbe802252ca7e58a94..3e64eab033843932c613ed8e287f4c04301871e8 100644
--- a/include/linux/elf.h
+++ b/include/linux/elf.h
@@ -99,4 +99,8 @@ static inline int arch_elf_adjust_prot(int prot,
 }
 #endif
 
+#ifdef CONFIG_EXEC_HUGETLB
+extern int exec_hugetlb;
+#endif
+
 #endif /* _LINUX_ELF_H */
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 1c22e294f083be2503b76eedf4dd77f13278daa4..9de02b116185054b47a7709afeb669b8259a3334 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -376,7 +376,11 @@ struct vm_area_struct {
 #endif
 	struct vm_userfaultfd_ctx vm_userfaultfd_ctx;
 
+#if
defined(CONFIG_ENHANCED_HUGETLB_MMAP) && !defined(__GENKSYMS__) + KABI_USE(1, struct file *vm_actual_file); +#else KABI_RESERVE(1) +#endif KABI_RESERVE(2) KABI_RESERVE(3) KABI_RESERVE(4) diff --git a/include/uapi/asm-generic/mman-common.h b/include/uapi/asm-generic/mman-common.h index e75b65364dcefb3648e6099f39a135f065977d0e..2a396d81aca696429f041514ab1859eadb05ff99 100644 --- a/include/uapi/asm-generic/mman-common.h +++ b/include/uapi/asm-generic/mman-common.h @@ -31,6 +31,7 @@ #define MAP_FIXED_NOREPLACE 0x100000 /* MAP_FIXED which doesn't unmap underlying mapping */ #define MAP_REPLACE 0x1000000 +#define MAP_FILE_HUGETLB 0x2000000 /* hugetlb private file map support */ #define MAP_UNINITIALIZED 0x4000000 /* For anonymous mmap, memory could be * uninitialized */ diff --git a/include/uapi/linux/elf.h b/include/uapi/linux/elf.h index 30f68b42eeb53f58b7140b519f07bc623cf43ba4..c48753e51cb45a89f393d3aa0b99540e9205daba 100644 --- a/include/uapi/linux/elf.h +++ b/include/uapi/linux/elf.h @@ -241,6 +241,7 @@ typedef struct elf64_hdr { #define PF_R 0x4 #define PF_W 0x2 #define PF_X 0x1 +#define PF_HUGETLB 0x1000000 typedef struct elf32_phdr{ Elf32_Word p_type; diff --git a/kernel/events/core.c b/kernel/events/core.c index 68dc8a8e7990a97c675b547f593056630e46b59d..bbc770d8cbdb595c774854fe22d1158fad9ac6dc 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -8119,6 +8119,13 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event) flags |= MAP_LOCKED; if (is_vm_hugetlb_page(vma)) flags |= MAP_HUGETLB; +#ifdef CONFIG_ENHANCED_HUGETLB_MMAP + if (vma->vm_actual_file) { + /* perf will ignore hugetlb vma, so remove this flag */ + flags &= ~MAP_HUGETLB; + file = vma->vm_actual_file; + } +#endif if (file) { struct inode *inode; diff --git a/kernel/fork.c b/kernel/fork.c index 0fb86b65ae60ca5e1fc9fdc757fa4151ec98d7e3..c8ec029e158a645a5c8b4ca4a9f0cde184a294dd 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -571,6 +571,11 @@ static __latent_entropy int 
dup_mmap(struct mm_struct *mm,
 			i_mmap_unlock_write(mapping);
 		}
 
+#ifdef CONFIG_ENHANCED_HUGETLB_MMAP
+		if (tmp->vm_actual_file)
+			get_file(tmp->vm_actual_file);
+#endif
+
 		/*
 		 * Clear hugetlb-related page reserves for children. This only
 		 * affects MAP_PRIVATE mappings. Faults generated by the child
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index d1243d1150b25790feb3ac718d62b7a00e77c2c9..a0df602c93729c40ce7b4160b3796ba1bb8d86a5 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -3543,6 +3543,17 @@ static struct ctl_table fs_table[] = {
 		.proc_handler	= proc_dointvec_minmax,
 		.extra1		= SYSCTL_ONE,
 	},
+#ifdef CONFIG_EXEC_HUGETLB
+	{
+		.procname	= "exec-use-hugetlb",
+		.data		= &exec_hugetlb,
+		.maxlen		= sizeof(exec_hugetlb),
+		.mode		= 0600,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= SYSCTL_ONE,
+	},
+#endif
 	{ }
 };
 
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index c5168c7f282af0e6890cde514a4aa9e6a0c46d17..817ae73d40bd669c4d891536c5619191c5e6b61e 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -4536,6 +4536,32 @@ static void unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
 	i_mmap_unlock_write(mapping);
 }
 
+#ifdef CONFIG_ENHANCED_HUGETLB_MMAP
+/*
+ * Copy file contents into a huge page of a MAP_FILE_HUGETLB mapping.
+ *
+ * @page: huge page to fill; it is mapped with kmap() for the copy
+ * @vma:  mapping whose vm_actual_file is read
+ * @off:  file position handed to kernel_read()
+ * @size: number of bytes requested
+ *
+ * A short read is not treated as an error. Returns 0 on success or the
+ * negative errno reported by kernel_read().
+ */
+static int read_actual_file(struct page *page, struct vm_area_struct *vma,
+				loff_t *off, size_t size)
+{
+	void *kaddr;
+	ssize_t nread;
+
+	kaddr = kmap(page);
+	nread = kernel_read(vma->vm_actual_file, kaddr, size, off);
+	kunmap(page);
+
+	return nread < 0 ? nread : 0;
+}
+#endif
+
 /*
  * Hugetlb_cow() should be called with page lock of the original hugepage held.
  * Called with hugetlb_instantiation_mutex held and pte_page locked so we
@@ -4837,6 +4863,30 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
 			goto out;
 		}
 		clear_huge_page(page, address, pages_per_huge_page(h));
+#ifdef CONFIG_ENHANCED_HUGETLB_MMAP
+		if (vma->vm_actual_file) {
+			loff_t off = haddr - vma->vm_start
+				+ (vma->vm_pgoff << PAGE_SHIFT);
+			size_t page_size = huge_page_size(h);
+			int err;
+
+			err = read_actual_file(page, vma, &off, page_size);
+			if (err) {
+				/*
+				 * Report a fault code rather than a raw errno
+				 * and release the page so that it is not
+				 * leaked.
+				 * NOTE(review): assumes this page is still
+				 * private to the fault (not yet locked or in
+				 * the page cache) and that "out" does not put
+				 * it - confirm against the full function.
+				 */
+				put_page(page);
+				ret = vmf_error(err);
+				goto out;
+			}
+		}
+#endif
 		__SetPageUptodate(page);
 		new_page = true;
 
diff --git a/mm/mmap.c b/mm/mmap.c
index 5489d70db84e35018de8b7c0f8cf22d1c3bea459..515d668e130170ff5656c76ac7d162dc34a091e7 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -188,6 +188,10 @@ static struct vm_area_struct *remove_vma(struct vm_area_struct *vma)
 		vma->vm_ops->close(vma);
 	if (vma->vm_file)
 		fput(vma->vm_file);
+#ifdef CONFIG_ENHANCED_HUGETLB_MMAP
+	if (vma->vm_actual_file)
+		fput(vma->vm_actual_file);
+#endif
 	mpol_put(vma_policy(vma));
 	sp_area_drop(vma);
 	vm_area_free(vma);
@@ -1849,6 +1853,17 @@ unsigned long ksys_mmap_pgoff(unsigned long addr, unsigned long len,
 			return -EBADF;
 		if (is_file_hugepages(file)) {
 			len = ALIGN(len, huge_page_size(hstate_file(file)));
+#ifdef CONFIG_ENHANCED_HUGETLB_MMAP
+		/*
+		 * glibc can use this flag to load libraries,
+		 * a similar feature of exec_hugetlb.
+		 */
+		} else if (unlikely(flags & MAP_FILE_HUGETLB)) {
+			if (!(flags & MAP_PRIVATE)) {
+				retval = -EINVAL;
+				goto out_fput;
+			}
+#endif
 		} else if (unlikely(flags & MAP_HUGETLB)) {
 			retval = -EINVAL;
 			goto out_fput;
@@ -3047,6 +3062,11 @@ int __split_vma(struct mm_struct *mm, struct vm_area_struct *vma,
 	if (new->vm_file)
 		get_file(new->vm_file);
 
+#ifdef CONFIG_ENHANCED_HUGETLB_MMAP
+	if (new->vm_actual_file)
+		get_file(new->vm_actual_file);
+#endif
+
 	if (new->vm_ops && new->vm_ops->open)
 		new->vm_ops->open(new);
 
diff --git a/mm/util.c b/mm/util.c
index 67b350f4ffdc5f9e145ed27babea34c4b8b77821..05efa0b50be765957998cda99dc732cbba3362f7 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -496,6 +496,48 @@ int account_locked_vm(struct mm_struct *mm, unsigned long pages, bool inc)
 }
 EXPORT_SYMBOL_GPL(account_locked_vm);
 
+#ifdef CONFIG_ENHANCED_HUGETLB_MMAP
+/*
+ * Create the anonymous hugetlb file that backs a MAP_FILE_HUGETLB mapping.
+ * The huge page size is taken from the MAP_HUGE_* bits of @flags and @size
+ * is handed to hugetlb_file_setup() as its size argument. Returns the new
+ * file or an ERR_PTR() value.
+ */
+static struct file *prepare_hugetlb_mmap(unsigned long flags, unsigned long size)
+{
+	int page_size_log = (flags >> MAP_HUGE_SHIFT) & MAP_HUGE_MASK;
+	struct user_struct *user = NULL;
+
+	return hugetlb_file_setup(HUGETLB_ANON_FILE, size, VM_NORESERVE, &user,
+			HUGETLB_ANONHUGE_INODE, page_size_log);
+}
+
+/*
+ * Complete a MAP_FILE_HUGETLB mmap once do_mmap() has returned @addr.
+ *
+ * The reference on @huge_file obtained from prepare_hugetlb_mmap() is
+ * always dropped here. If do_mmap() failed, its error is returned
+ * unchanged. Otherwise the VMA found at @addr records @actual_file (with a
+ * new reference) in vm_actual_file and @addr is returned.
+ */
+static unsigned long finish_hugetlb_mmap(unsigned long addr, struct file *actual_file,
+		struct file *huge_file)
+{
+	struct vm_area_struct *vma;
+
+	fput(huge_file);
+	if (IS_ERR_VALUE(addr))
+		return addr;
+
+	vma = find_vma(current->mm, addr);
+	if (!vma)
+		return -EINVAL;
+	vma->vm_actual_file = get_file(actual_file);
+
+	return addr;
+}
+#endif
+
 unsigned long vm_mmap_pgoff(struct file *file, unsigned long addr,
 	unsigned long len, unsigned long prot,
 	unsigned long flag, unsigned long pgoff)
@@ -504,13 +546,43 @@ unsigned long vm_mmap_pgoff(struct file *file, unsigned long addr,
 	struct mm_struct *mm = current->mm;
 	unsigned long populate;
 	LIST_HEAD(uf);
+#ifdef CONFIG_ENHANCED_HUGETLB_MMAP
+	struct file *actual_file = NULL;
+#endif
 
 	ret = security_mmap_file(file, prot, flag);
+#ifdef CONFIG_ENHANCED_HUGETLB_MMAP
+	/*
+	 * Swap in the hugetlb file only when there is a real file to shadow
+	 * and the security hook allowed the mapping, so that no temporary
+	 * file is created on a path that never maps it.
+	 */
+	if (!ret && file && (flag & MAP_FILE_HUGETLB)) {
+		actual_file = file;
+		file = prepare_hugetlb_mmap(flag, len + (pgoff << PAGE_SHIFT));
+		if (IS_ERR(file))
+			return PTR_ERR(file);
+	}
+#endif
 	if (!ret) {
-		if (mmap_write_lock_killable(mm))
-			return -EINTR;
+		if (mmap_write_lock_killable(mm)) {
+#ifdef CONFIG_ENHANCED_HUGETLB_MMAP
+			/* Nothing was mapped; drop the temporary file. */
+			if (actual_file)
+				fput(file);
+#endif
+			return -EINTR;
+		}
 		ret = do_mmap(file, addr, len, prot, flag, pgoff, &populate,
 			      &uf);
+#ifdef CONFIG_ENHANCED_HUGETLB_MMAP
+		/*
+		 * Drops the temporary file and, if do_mmap() succeeded,
+		 * records the original file in the new VMA.
+		 */
+		if (actual_file)
+			ret = finish_hugetlb_mmap(ret, actual_file, file);
+#endif
 		mmap_write_unlock(mm);
 		userfaultfd_unmap_complete(mm, &uf);
 		if (populate)