diff --git a/arch/arm64/configs/openeuler_defconfig b/arch/arm64/configs/openeuler_defconfig
index 081a223bc65b1b1157e03c5c07bf1418d1a0b05a..9f79befaace58701c1739f2134b4116df3020dc2 100644
--- a/arch/arm64/configs/openeuler_defconfig
+++ b/arch/arm64/configs/openeuler_defconfig
@@ -7810,6 +7810,9 @@ CONFIG_FUNCTION_ERROR_INJECTION=y
 CONFIG_ARCH_HAS_KCOV=y
 # CONFIG_RUNTIME_TESTING_MENU is not set
 CONFIG_ARCH_USE_MEMTEST=y
+CONFIG_ETMEM_SCAN=m
+CONFIG_ETMEM_SWAP=m
+CONFIG_ETMEM=y
 # CONFIG_MEMTEST is not set
 # end of Kernel Testing and Coverage
 
diff --git a/arch/x86/configs/openeuler_defconfig b/arch/x86/configs/openeuler_defconfig
index f6140635690e2876c2aa499da11d6ca2164a492d..c69c0db09089b13e813f8da2eb1c25cd14a22cd1 100644
--- a/arch/x86/configs/openeuler_defconfig
+++ b/arch/x86/configs/openeuler_defconfig
@@ -9046,6 +9046,9 @@ CONFIG_ARCH_HAS_KCOV=y
 # CONFIG_KCOV is not set
 # CONFIG_RUNTIME_TESTING_MENU is not set
 CONFIG_ARCH_USE_MEMTEST=y
+CONFIG_ETMEM_SCAN=m
+CONFIG_ETMEM_SWAP=m
+CONFIG_ETMEM=y
 # CONFIG_MEMTEST is not set
 # CONFIG_HYPERV_TESTING is not set
 # end of Kernel Testing and Coverage
diff --git a/arch/x86/include/asm/page_64.h b/arch/x86/include/asm/page_64.h
index cc6b8e087192e4c3b6901e5aab6471b1629d9849..f869dec42f343d440575d15c3343032743e26b79 100644
--- a/arch/x86/include/asm/page_64.h
+++ b/arch/x86/include/asm/page_64.h
@@ -60,6 +60,13 @@ static inline void clear_page(void *page)
 
 void copy_page(void *to, void *from);
 
+void copy_page_nocache(void *to, void *from);
+void copy_page_nocache_barrir(void);
+
+struct folio;
+#define __HAVE_ARCH_COPY_HUGEPAGES 1
+void copy_highpages(struct folio *dst, struct folio *src);
+
 #ifdef CONFIG_X86_5LEVEL
 /*
  * User space process size. This is the first address outside the user range.
diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile
index 01932af64193c14a3da15d86b92ca2ee332929ad..f3a8fa45c0101fa770c325db8a501518c0ca5160 100644
--- a/arch/x86/lib/Makefile
+++ b/arch/x86/lib/Makefile
@@ -73,4 +73,5 @@ endif
 lib-y += memmove_64.o memset_64.o
 lib-y += copy_user_64.o copy_user_uncached_64.o
 lib-y += cmpxchg16b_emu.o
+ lib-y += copy_highpages.o
 endif
diff --git a/arch/x86/lib/copy_highpages.c b/arch/x86/lib/copy_highpages.c
new file mode 100644
index 0000000000000000000000000000000000000000..d8357a938007583f4ffb30f0835b540d7d6b8f09
--- /dev/null
+++ b/arch/x86/lib/copy_highpages.c
@@ -0,0 +1,107 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * accelerate copying pages to pmem with non-temporal stores
+ */
+#include
+#include
+#include
+#include
+
+DEFINE_STATIC_KEY_FALSE(hugepage_nocache_copy);
+#ifdef CONFIG_SYSCTL
+static void set_hugepage_nocache_copy(bool enabled)
+{
+	if (enabled)
+		static_branch_enable(&hugepage_nocache_copy);
+	else
+		static_branch_disable(&hugepage_nocache_copy);
+}
+
+int sysctl_hugepage_nocache_copy(struct ctl_table *table, int write,
+		void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+	struct ctl_table t;
+	int err;
+	int state;
+
+	if (write && !capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	state = static_branch_unlikely(&hugepage_nocache_copy);
+	t = *table;
+	t.data = &state;
+	err = proc_dointvec_minmax(&t, write, buffer, lenp, ppos);
+	if (err < 0)
+		return err;
+	if (write)
+		set_hugepage_nocache_copy(state);
+	return err;
+}
+
+static struct ctl_table copy_highpages_table[] = {
+	{
+		.procname = "hugepage_nocache_copy",
+		.data = NULL,
+		.maxlen = sizeof(unsigned int),
+		.mode = 0600,
+		.proc_handler = sysctl_hugepage_nocache_copy,
+		.extra1 = SYSCTL_ZERO,
+		.extra2 = SYSCTL_ONE,
+	},
+	{}
+};
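
For reference on the knob registered above: the table is rooted under "vm" by copy_highpages_root_table below, so the switch surfaces as /proc/sys/vm/hugepage_nocache_copy and flips the hugepage_nocache_copy static key at runtime. The sketch that follows is an illustrative user-space toggle, not part of the patch; the helper name is invented here, and writes require CAP_SYS_ADMIN as enforced by the handler above.

```c
/* Illustrative only: toggle the non-temporal hugepage copy path through the
 * sysctl registered above (/proc/sys/vm/hugepage_nocache_copy).
 * The helper name is made up for this sketch; needs CAP_SYS_ADMIN.
 */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

static int set_hugepage_nocache_copy_sysctl(int enable)
{
	const char *path = "/proc/sys/vm/hugepage_nocache_copy";
	char val = enable ? '1' : '0';
	int fd = open(path, O_WRONLY);

	if (fd < 0) {
		perror("open");
		return -1;
	}
	if (write(fd, &val, 1) != 1) {
		perror("write");
		close(fd);
		return -1;
	}
	return close(fd);
}

int main(void)
{
	return set_hugepage_nocache_copy_sysctl(1) ? 1 : 0;
}
```
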
+static struct ctl_table copy_highpages_root_table[] = { + { + .procname = "vm", + .mode = 0555, + .child = copy_highpages_table, + }, + {} +}; + +static __init int copy_highpages_init(void) +{ + return register_sysctl_table(copy_highpages_root_table) ? 0 : -ENOMEM; +} +__initcall(copy_highpages_init); +#endif + +static void copy_highpages_nocache(struct folio *dst, struct folio *src) +{ + char *vfrom, *vto; + int i; + int nr = folio_nr_pages(src); + + for (i = 0; i < nr; i++) { + cond_resched(); + vfrom = kmap_atomic(folio_page(src, i)); + vto = kmap_atomic(folio_page(dst, i)); + copy_page_nocache(vto, vfrom); + kunmap_atomic(vto); + kunmap_atomic(vfrom); + } + copy_page_nocache_barrir(); +} + +static void copy_highpages_cache(struct folio *dst, struct folio *src) +{ + long i = 0; + long nr = folio_nr_pages(src); + + for (;;) { + copy_highpage(folio_page(dst, i), folio_page(src, i)); + if (++i == nr) + break; + cond_resched(); + } +} + +void copy_highpages(struct folio *dst, struct folio *src) +{ + if (static_branch_unlikely(&hugepage_nocache_copy) && + get_node_type(page_to_nid(folio_page(dst, 0))) == NODE_TYPE_PMEM) + return copy_highpages_nocache(dst, src); + + return copy_highpages_cache(dst, src); +} diff --git a/arch/x86/lib/copy_page_64.S b/arch/x86/lib/copy_page_64.S index 30ea644bf446d3ae3556fdd8082d76264f7f7775..c31a9a79b18e8cd6d618075ef00a9c0a05402db8 100644 --- a/arch/x86/lib/copy_page_64.S +++ b/arch/x86/lib/copy_page_64.S @@ -87,3 +87,76 @@ SYM_FUNC_START_LOCAL(copy_page_regs) addq $2*8, %rsp RET SYM_FUNC_END(copy_page_regs) + +SYM_FUNC_START(copy_page_nocache) + ALTERNATIVE "jmp copy_page", "", X86_FEATURE_XMM2 + subq $2*8, %rsp + movq %rbx, (%rsp) + movq %r12, 1*8(%rsp) + + movl $(4096/64)-5, %ecx + .p2align 4 +.LoopNT64: + dec %rcx + movq 0x8*0(%rsi), %rax + movq 0x8*1(%rsi), %rbx + movq 0x8*2(%rsi), %rdx + movq 0x8*3(%rsi), %r8 + movq 0x8*4(%rsi), %r9 + movq 0x8*5(%rsi), %r10 + movq 0x8*6(%rsi), %r11 + movq 0x8*7(%rsi), %r12 + + prefetcht0 5*64(%rsi) + + movnti %rax, 0x8*0(%rdi) + movnti %rbx, 0x8*1(%rdi) + movnti %rdx, 0x8*2(%rdi) + movnti %r8, 0x8*3(%rdi) + movnti %r9, 0x8*4(%rdi) + movnti %r10, 0x8*5(%rdi) + movnti %r11, 0x8*6(%rdi) + movnti %r12, 0x8*7(%rdi) + + leaq 64 (%rsi), %rsi + leaq 64 (%rdi), %rdi + + jnz .LoopNT64 + + movl $5, %ecx + .p2align 4 +.LoopNT2: + decl %ecx + + movq 0x8*0(%rsi), %rax + movq 0x8*1(%rsi), %rbx + movq 0x8*2(%rsi), %rdx + movq 0x8*3(%rsi), %r8 + movq 0x8*4(%rsi), %r9 + movq 0x8*5(%rsi), %r10 + movq 0x8*6(%rsi), %r11 + movq 0x8*7(%rsi), %r12 + + movnti %rax, 0x8*0(%rdi) + movnti %rbx, 0x8*1(%rdi) + movnti %rdx, 0x8*2(%rdi) + movnti %r8, 0x8*3(%rdi) + movnti %r9, 0x8*4(%rdi) + movnti %r10, 0x8*5(%rdi) + movnti %r11, 0x8*6(%rdi) + movnti %r12, 0x8*7(%rdi) + + leaq 64(%rdi), %rdi + leaq 64(%rsi), %rsi + jnz .LoopNT2 + + movq (%rsp), %rbx + movq 1*8(%rsp), %r12 + addq $2*8, %rsp + ret +SYM_FUNC_END(copy_page_nocache) + +SYM_FUNC_START(copy_page_nocache_barrir) + ALTERNATIVE "", "sfence", X86_FEATURE_XMM2 + ret +SYM_FUNC_END(copy_page_nocache_barrir) diff --git a/drivers/acpi/numa/srat.c b/drivers/acpi/numa/srat.c index 1f4fc5f8a819d38364149a88664bde818ca0a4dc..3c5eae855e6a4808e08b5e2150c8e34c762bd923 100644 --- a/drivers/acpi/numa/srat.c +++ b/drivers/acpi/numa/srat.c @@ -278,6 +278,11 @@ acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma) node_set(node, numa_nodes_parsed); + if (ma->flags & ACPI_SRAT_MEM_NON_VOLATILE) + set_node_type(node, NODE_TYPE_PMEM); + else + set_node_type(node, NODE_TYPE_DRAM); + pr_info("SRAT: Node %u 
PXM %u [mem %#010Lx-%#010Lx]%s%s\n",
 		node, pxm,
 		(unsigned long long) start, (unsigned long long) end - 1,
diff --git a/fs/proc/Makefile b/fs/proc/Makefile
index bd08616ed8bad7937173183eb08634c9526a4e90..8704d41dd67cc06c35e8fd52b76a1cd5151c09b5 100644
--- a/fs/proc/Makefile
+++ b/fs/proc/Makefile
@@ -34,3 +34,5 @@ proc-$(CONFIG_PROC_VMCORE) += vmcore.o
 proc-$(CONFIG_PRINTK) += kmsg.o
 proc-$(CONFIG_PROC_PAGE_MONITOR) += page.o
 proc-$(CONFIG_BOOT_CONFIG) += bootconfig.o
+obj-$(CONFIG_ETMEM_SCAN) += etmem_scan.o
+obj-$(CONFIG_ETMEM_SWAP) += etmem_swap.o
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 7183f338404d53932f439927551cf282286a0050..420e1d572856e13222eb1dba2af912a28708d141 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -3354,6 +3354,10 @@ static const struct pid_entry tgid_base_stuff[] = {
 	REG("smaps_rollup", S_IRUGO, proc_pid_smaps_rollup_operations),
 	REG("pagemap", S_IRUSR, proc_pagemap_operations),
 #endif
+#ifdef CONFIG_ETMEM
+	REG("idle_pages", S_IRUSR|S_IWUSR, proc_mm_idle_operations),
+	REG("swap_pages", S_IWUSR, proc_mm_swap_operations),
+#endif
 #ifdef CONFIG_SECURITY
 	DIR("attr", S_IRUGO|S_IXUGO, proc_attr_dir_inode_operations, proc_attr_dir_operations),
 #endif
@@ -3702,6 +3706,10 @@ static const struct pid_entry tid_base_stuff[] = {
 	REG("smaps_rollup", S_IRUGO, proc_pid_smaps_rollup_operations),
 	REG("pagemap", S_IRUSR, proc_pagemap_operations),
 #endif
+#ifdef CONFIG_ETMEM
+	REG("idle_pages", S_IRUSR|S_IWUSR, proc_mm_idle_operations),
+	REG("swap_pages", S_IWUSR, proc_mm_swap_operations),
+#endif
 #ifdef CONFIG_SECURITY
 	DIR("attr", S_IRUGO|S_IXUGO, proc_attr_dir_inode_operations, proc_attr_dir_operations),
 #endif
diff --git a/fs/proc/etmem_scan.c b/fs/proc/etmem_scan.c
new file mode 100644
index 0000000000000000000000000000000000000000..06c202dcf1feb0b24d7208833c50284fd2d19136
--- /dev/null
+++ b/fs/proc/etmem_scan.c
@@ -0,0 +1,1382 @@
+// SPDX-License-Identifier: GPL-2.0
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#ifdef CONFIG_ARM64
+#include
+#include
+#include
+#include
+#include
+#endif
+#include "etmem_scan.h"
+#include
+
+#ifdef CONFIG_X86_64
+/*
+ * Fall back to false for kernels that don't support KVM_INVALID_SPTE.
+ * ept_idle can still work in this situation, but the scan accuracy may drop,
+ * depending on the access frequencies of the workload.
+ */
+#ifdef KVM_INVALID_SPTE
+#define KVM_CHECK_INVALID_SPTE(val) ((val) == KVM_INVALID_SPTE)
+#else
+#define KVM_CHECK_INVALID_SPTE(val) (0)
+#endif
+
+# define kvm_arch_mmu_pointer(vcpu) (vcpu->arch.mmu)
+# define kvm_mmu_ad_disabled(mmu) (mmu->cpu_role.base.ad_disabled)
+#endif /*CONFIG_X86_64*/
+
+#ifdef CONFIG_ARM64
+#define if_pmd_thp_or_huge(pmd) (if_pmd_huge(pmd) || pmd_trans_huge(pmd))
+#endif /* CONFIG_ARM64 */
+
+#ifdef DEBUG
+
+#define debug_printk trace_printk
+
+#define set_restart_gpa(val, note) ({ \
+	unsigned long old_val = pic->restart_gpa; \
+	pic->restart_gpa = (val); \
+	trace_printk("restart_gpa=%lx %luK %s %s %d\n", \
+		(val), (pic->restart_gpa - old_val) >> 10, \
+		note, __func__, __LINE__); \
+})
+
+#define set_next_hva(val, note) ({ \
+	unsigned long old_val = pic->next_hva; \
+	pic->next_hva = (val); \
+	trace_printk(" next_hva=%lx %luK %s %s %d\n", \
+		(val), (pic->next_hva - old_val) >> 10, \
+		note, __func__, __LINE__); \
+})
+
+#else
+
+#define debug_printk(...)
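
The idle_pages and swap_pages entries wired into /proc/<pid>/ in base.c above are the user-space interface to these two modules. As a rough illustration of how a scanner might consume /proc/<pid>/idle_pages, the sketch below seeks to a start virtual address and decodes the returned buffer. It assumes the PIP_* encoding defined in etmem_scan.h later in this patch (high nibble = page type, low nibble = contiguous-run count, with PIP_CMD_SET_HVA followed by a big-endian 64-bit address), that the etmem_scan module is loaded, and that the caller has CAP_SYS_ADMIN; error handling is minimal and not representative of the real etmem tooling.

```c
/* Sketch of an idle_pages consumer; assumes the PIP_* layout from
 * etmem_scan.h in this patch, not the actual etmem user-space tool.
 */
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

#define PIP_CMD_SET_HVA 0xa0	/* PIP_COMPOSE(PIP_CMD, 0) */

int main(int argc, char **argv)
{
	uint8_t buf[4096];
	char path[64];
	ssize_t n;
	int fd;

	if (argc < 2)
		return 1;
	snprintf(path, sizeof(path), "/proc/%s/idle_pages", argv[1]);
	fd = open(path, O_RDONLY);
	if (fd < 0)
		return 1;

	lseek(fd, 0, SEEK_SET);		/* start scanning at virtual address 0 */
	n = read(fd, buf, sizeof(buf));
	for (ssize_t i = 0; i < n; i++) {
		if (buf[i] == PIP_CMD_SET_HVA && i + 8 < n) {
			uint64_t va = 0;

			for (int j = 0; j < 8; j++)
				va = (va << 8) | buf[i + 1 + j];
			printf("address marker 0x%llx\n", (unsigned long long)va);
			i += 8;
			continue;
		}
		printf("type %d x %d\n", buf[i] >> 4, buf[i] & 0xf);
	}
	close(fd);
	return 0;
}
```
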
+ +#define set_restart_gpa(val, note) ({ \ + pic->restart_gpa = (val); \ +}) + +#define set_next_hva(val, note) ({ \ + pic->next_hva = (val); \ +}) + +#endif + +#define RET_RESCAN_FLAG 0x10000 + +/* error return IDLE_PAGE_TYPE_MAX or return valid page type */ +enum ProcIdlePageType (*vm_handle_pte_hole)(unsigned long addr, + unsigned long next, int depth, struct mm_walk *walk) = NULL; +EXPORT_SYMBOL_GPL(vm_handle_pte_hole); + +static int set_walk_step(const char *val, const struct kernel_param *kp) +{ + int ret; + unsigned int n; + + ret = kstrtouint(val, 0, &n); + if (ret != 0 || n == 0) + return -EINVAL; + + return param_set_uint(val, kp); +} + +static struct kernel_param_ops walk_step_ops = { + .set = set_walk_step, + .get = param_get_uint, +}; + +static unsigned int __read_mostly walk_step = 512; // in PAGE_SIZE +module_param_cb(walk_step, &walk_step_ops, &walk_step, 0644); + +static unsigned int resched_step = 10; +module_param(resched_step, uint, 0644); + +static unsigned long pagetype_size[16] = { + [PTE_ACCESSED] = PAGE_SIZE, /* 4k page */ + [PMD_ACCESSED] = PMD_SIZE, /* 2M page */ + [PUD_PRESENT] = PUD_SIZE, /* 1G page */ + + [PTE_DIRTY_M] = PAGE_SIZE, + [PMD_DIRTY_M] = PMD_SIZE, + + [PTE_IDLE] = PAGE_SIZE, + [PMD_IDLE] = PMD_SIZE, + [PMD_IDLE_PTES] = PMD_SIZE, + + [PTE_HOLE] = PAGE_SIZE, + [PMD_HOLE] = PMD_SIZE, +}; + +static void u64_to_u8(uint64_t n, uint8_t *p) +{ + p += sizeof(uint64_t) - 1; + + *p-- = n; n >>= 8; + *p-- = n; n >>= 8; + *p-- = n; n >>= 8; + *p-- = n; n >>= 8; + + *p-- = n; n >>= 8; + *p-- = n; n >>= 8; + *p-- = n; n >>= 8; + *p = n; +} + +static void dump_pic(struct page_idle_ctrl *pic) +{ + debug_printk("page_idle_ctrl: pie_read=%d pie_read_max=%d", + pic->pie_read, + pic->pie_read_max); + debug_printk(" buf_size=%d bytes_copied=%d next_hva=%pK", + pic->buf_size, + pic->bytes_copied, + pic->next_hva); + debug_printk(" restart_gpa=%pK pa_to_hva=%pK\n", + pic->restart_gpa, + pic->gpa_to_hva); +} + +#ifdef CONFIG_ARM64 +static int if_pmd_huge(pmd_t pmd) +{ + return pmd_val(pmd) && !(pmd_val(pmd) & PMD_TABLE_BIT); +} + +static int if_pud_huge(pud_t pud) +{ +#ifndef __PAGETABLE_PMD_FOLDED + return pud_val(pud) && !(pud_val(pud) & PUD_TABLE_BIT); +#else + return 0; +#endif +} +#endif + +static void pic_report_addr(struct page_idle_ctrl *pic, unsigned long addr) +{ + unsigned long hva; + + pic->kpie[pic->pie_read++] = PIP_CMD_SET_HVA; + hva = addr; + u64_to_u8(hva, &pic->kpie[pic->pie_read]); + pic->pie_read += sizeof(uint64_t); + dump_pic(pic); +} + +static int pic_add_page(struct page_idle_ctrl *pic, + unsigned long addr, + unsigned long next, + enum ProcIdlePageType page_type) +{ + unsigned long page_size = pagetype_size[page_type]; + + dump_pic(pic); + + /* align kernel/user vision of cursor position */ + next = round_up(next, page_size); + + if (!pic->pie_read || + addr + pic->gpa_to_hva != pic->next_hva) { + /* merge hole */ + if (page_type == PTE_HOLE || + page_type == PMD_HOLE) { + set_restart_gpa(next, "PTE_HOLE|PMD_HOLE"); + return 0; + } + + if (addr + pic->gpa_to_hva < pic->next_hva) { + debug_printk("page_idle: addr moves backwards\n"); + WARN_ONCE(1, "page_idle: addr moves backwards"); + } + + if (pic->pie_read + sizeof(uint64_t) + 2 >= pic->pie_read_max) { + set_restart_gpa(addr, "PAGE_IDLE_KBUF_FULL"); + return PAGE_IDLE_KBUF_FULL; + } + + pic_report_addr(pic, round_down(addr, page_size) + + pic->gpa_to_hva); + } else { + if (PIP_TYPE(pic->kpie[pic->pie_read - 1]) == page_type && + PIP_SIZE(pic->kpie[pic->pie_read - 1]) < 0xF) { + set_next_hva(next 
+ pic->gpa_to_hva, "IN-PLACE INC"); + set_restart_gpa(next, "IN-PLACE INC"); + pic->kpie[pic->pie_read - 1]++; + WARN_ONCE(page_size < next-addr, "next-addr too large"); + return 0; + } + if (pic->pie_read >= pic->pie_read_max) { + set_restart_gpa(addr, "PAGE_IDLE_KBUF_FULL"); + return PAGE_IDLE_KBUF_FULL; + } + } + + set_next_hva(next + pic->gpa_to_hva, "NEW-ITEM"); + set_restart_gpa(next, "NEW-ITEM"); + pic->kpie[pic->pie_read] = PIP_COMPOSE(page_type, 1); + pic->pie_read++; + + return 0; +} + +static int init_page_idle_ctrl_buffer(struct page_idle_ctrl *pic) +{ + pic->pie_read = 0; + pic->pie_read_max = min(PAGE_IDLE_KBUF_SIZE, + pic->buf_size - pic->bytes_copied); + /* reserve space for PIP_CMD_SET_HVA in the end */ + pic->pie_read_max -= sizeof(uint64_t) + 1; + + /* + * Align with PAGE_IDLE_KBUF_FULL + * logic in pic_add_page(), to avoid pic->pie_read = 0 when + * PAGE_IDLE_KBUF_FULL happened. + */ + if (pic->pie_read_max <= sizeof(uint64_t) + 2) + return PAGE_IDLE_KBUF_FULL; + + memset(pic->kpie, 0, sizeof(pic->kpie)); + return 0; +} + +static void setup_page_idle_ctrl(struct page_idle_ctrl *pic, void *buf, + int buf_size, unsigned int flags) +{ + pic->buf = buf; + pic->buf_size = buf_size; + pic->bytes_copied = 0; + pic->next_hva = 0; + pic->gpa_to_hva = 0; + pic->restart_gpa = 0; + pic->last_va = 0; + pic->flags = flags; +} + +static int page_idle_copy_user(struct page_idle_ctrl *pic, + unsigned long start, unsigned long end) +{ + int bytes_read; + int ret; + + dump_pic(pic); + + bytes_read = pic->pie_read; + if (!bytes_read) + return 0; + + ret = copy_to_user(pic->buf, pic->kpie, bytes_read); + if (ret) + return -EFAULT; + + pic->buf += bytes_read; + pic->bytes_copied += bytes_read; + if (pic->bytes_copied >= pic->buf_size) + return PAGE_IDLE_BUF_FULL; + + ret = init_page_idle_ctrl_buffer(pic); + if (ret) + return ret; + + cond_resched(); + return 0; +} + +#ifdef CONFIG_X86_64 +static int vm_walk_host_range(unsigned long long start, + unsigned long end, + struct mm_walk *walk) +{ + int ret; + struct page_idle_ctrl *pic = walk->private; + unsigned long tmp_gpa_to_hva = pic->gpa_to_hva; + + pic->gpa_to_hva = 0; + read_unlock(&pic->kvm->mmu_lock); + mmap_read_lock(walk->mm); + local_irq_disable(); + ret = walk_page_range(walk->mm, start + tmp_gpa_to_hva, end + tmp_gpa_to_hva, + walk->ops, walk->private); + local_irq_enable(); + mmap_read_unlock(walk->mm); + pic->gpa_to_hva = tmp_gpa_to_hva; + if (pic->flags & VM_SCAN_HOST) { + pic->restart_gpa -= tmp_gpa_to_hva; + pic->flags &= ~VM_SCAN_HOST; + } + if (ret != PAGE_IDLE_KBUF_FULL && end > pic->restart_gpa) + pic->restart_gpa = end; + + /* ept page table may change after spin_unlock, rescan vm from root ept */ + ret |= RET_RESCAN_FLAG; + + return ret; +} + +static int ept_pte_range(struct page_idle_ctrl *pic, + pmd_t *pmd, unsigned long addr, unsigned long end, + struct mm_walk *walk) +{ + pte_t *pte; + enum ProcIdlePageType page_type; + int err = 0; + + pte = pte_offset_kernel(pmd, addr); + do { + if (KVM_CHECK_INVALID_SPTE(pte->pte)) { + page_type = PTE_IDLE; + } else if (!ept_pte_present(*pte)) { + err = vm_walk_host_range(addr, end, walk); + goto next; + } else if (!test_and_clear_bit(_PAGE_BIT_EPT_ACCESSED, + (unsigned long *) &pte->pte)) + page_type = PTE_IDLE; + else { + page_type = PTE_ACCESSED; + if (pic->flags & SCAN_DIRTY_PAGE) { + if (test_and_clear_bit(_PAGE_BIT_EPT_DIRTY, + (unsigned long *) &pte->pte)) + page_type = PTE_DIRTY_M; + } + } + + err = pic_add_page(pic, addr, addr + PAGE_SIZE, page_type); +next: + if (err) + 
break; + } while (pte++, addr += PAGE_SIZE, addr != end); + + return err; +} + +static enum ProcIdlePageType ept_huge_accessed(pmd_t *pmd, unsigned long addr, + unsigned long end) +{ + int accessed = PMD_IDLE; + pte_t *pte; + + pte = pte_offset_kernel(pmd, addr); + do { + if (!KVM_CHECK_INVALID_SPTE(pte->pte)) + continue; + if (!ept_pte_present(*pte)) + continue; + if (!test_and_clear_bit(_PAGE_BIT_EPT_ACCESSED, + (unsigned long *)&pte->pte)) + continue; + accessed = PMD_ACCESSED; + } while (pte++, addr += PAGE_SIZE, addr != end); + + return accessed; +} + +static int ept_pmd_range(struct page_idle_ctrl *pic, + pud_t *pud, unsigned long addr, unsigned long end, + struct mm_walk *walk) +{ + pmd_t *pmd; + unsigned long next; + enum ProcIdlePageType page_type; + enum ProcIdlePageType pte_page_type; + int err = 0; + + if (pic->flags & SCAN_HUGE_PAGE) + pte_page_type = PMD_IDLE_PTES; + else + pte_page_type = IDLE_PAGE_TYPE_MAX; + + pmd = pmd_offset(pud, addr); + do { + next = pmd_addr_end(addr, end); + if (KVM_CHECK_INVALID_SPTE(pmd->pmd)) + page_type = PMD_IDLE; + else if (!ept_pmd_present(*pmd)) { + err = vm_walk_host_range(addr, next, walk); + goto next; + } else if (!pmd_large(*pmd)) { + if (pic->flags & SCAN_AS_HUGE) + page_type = ept_huge_accessed(pmd, addr, next); + else + page_type = pte_page_type; + } else if (!test_and_clear_bit(_PAGE_BIT_EPT_ACCESSED, + (unsigned long *)pmd)) + page_type = PMD_IDLE; + else { + page_type = PMD_ACCESSED; + if ((pic->flags & SCAN_DIRTY_PAGE) && + test_and_clear_bit(_PAGE_BIT_EPT_DIRTY, + (unsigned long *) pmd)) + page_type = PMD_DIRTY_M; + } + + if (page_type != IDLE_PAGE_TYPE_MAX) + err = pic_add_page(pic, addr, next, page_type); + else + err = ept_pte_range(pic, pmd, addr, next, walk); + +next: + if (err) + break; + } while (pmd++, addr = next, addr != end); + + return err; +} + + +static int ept_pud_range(struct page_idle_ctrl *pic, + p4d_t *p4d, unsigned long addr, unsigned long end, + struct mm_walk *walk) +{ + pud_t *pud; + unsigned long next; + int err = 0; + + pud = pud_offset(p4d, addr); + do { + next = pud_addr_end(addr, end); + + if (!ept_pud_present(*pud)) { + err = vm_walk_host_range(addr, next, walk); + goto next; + } + + if (pud_large(*pud)) + err = pic_add_page(pic, addr, next, PUD_PRESENT); + else + err = ept_pmd_range(pic, pud, addr, next, walk); + +next: + if (err) + break; + } while (pud++, addr = next, addr != end); + + return err; +} + +static int ept_p4d_range(struct page_idle_ctrl *pic, + p4d_t *p4d, unsigned long addr, unsigned long end, + struct mm_walk *walk) +{ + unsigned long next; + int err = 0; + + p4d += p4d_index(addr); + do { + next = p4d_addr_end(addr, end); + if (!ept_p4d_present(*p4d)) { + set_restart_gpa(next, "P4D_HOLE"); + continue; + } + + err = ept_pud_range(pic, p4d, addr, next, walk); + if (err) + break; + } while (p4d++, addr = next, addr != end); + + return err; +} + +static int ept_pgd_range(struct page_idle_ctrl *pic, + pgd_t *pgd, + unsigned long addr, + unsigned long end, + struct mm_walk *walk) +{ + p4d_t *p4d; + unsigned long next; + int err = 0; + + pgd = pgd_offset_pgd(pgd, addr); + do { + next = pgd_addr_end(addr, end); + if (!ept_pgd_present(*pgd)) { + set_restart_gpa(next, "PGD_HOLE"); + continue; + } + + p4d = (p4d_t *)pgd_page_vaddr(*pgd); + err = ept_p4d_range(pic, p4d, addr, next, walk); + if (err) + break; + } while (pgd++, addr = next, addr != end); + + return err; +} + +static int ept_page_range(struct page_idle_ctrl *pic, + unsigned long addr, + unsigned long end, + struct mm_walk *walk) 
+{ + struct kvm_vcpu *vcpu; + struct kvm_mmu *mmu; + uint64_t *ept_root; + int err = 0; + + WARN_ON(addr >= end); + + read_lock(&pic->kvm->mmu_lock); + + vcpu = kvm_get_vcpu(pic->kvm, 0); + if (!vcpu) { + pic->gpa_to_hva = 0; + set_restart_gpa(TASK_SIZE, "NO-VCPU"); + read_unlock(&pic->kvm->mmu_lock); + return -EINVAL; + } + + mmu = kvm_arch_mmu_pointer(vcpu); + if (!VALID_PAGE(mmu->root.hpa)) { + pic->gpa_to_hva = 0; + set_restart_gpa(TASK_SIZE, "NO-HPA"); + read_unlock(&pic->kvm->mmu_lock); + return -EINVAL; + } + + ept_root = __va(mmu->root.hpa); + + /* Walk start at p4d when vm has 4 level table pages */ + if (mmu->root_role.level != 4) + err = ept_pgd_range(pic, (pgd_t *)ept_root, addr, end, walk); + else + err = ept_p4d_range(pic, (p4d_t *)ept_root, addr, end, walk); + + /* mmu_lock is unlock in vm_walk_host_range which will unlock mmu_lock + * and RET_RESCAN_FLAG will be set in ret value + */ + if (!(err & RET_RESCAN_FLAG)) + read_unlock(&pic->kvm->mmu_lock); + else + err &= ~RET_RESCAN_FLAG; + + return err; +} + +static int ept_idle_supports_cpu(struct kvm *kvm) +{ + struct kvm_vcpu *vcpu; + struct kvm_mmu *mmu; + int ret; + + vcpu = kvm_get_vcpu(kvm, 0); + if (!vcpu) + return -EINVAL; + + read_lock(&kvm->mmu_lock); + mmu = kvm_arch_mmu_pointer(vcpu); + if (kvm_mmu_ad_disabled(mmu)) { + pr_notice("CPU does not support EPT A/D bits tracking\n"); + ret = -EINVAL; + } else if (mmu->root_role.level < 4 || + (mmu->root_role.level == 5 && !pgtable_l5_enabled())) { + pr_notice("Unsupported EPT level %d\n", mmu->root_role.level); + ret = -EINVAL; + } else + ret = 0; + read_unlock(&kvm->mmu_lock); + + return ret; +} + +#else +static inline phys_addr_t stage2_range_addr_end(phys_addr_t addr, phys_addr_t end) +{ + phys_addr_t size = kvm_granule_size(KVM_PGTABLE_MIN_BLOCK_LEVEL); + phys_addr_t boundary = ALIGN_DOWN(addr + size, size); + + return (boundary - 1 < end - 1) ? 
boundary : end; +} + +static int arm_pte_range(struct page_idle_ctrl *pic, + pmd_t *pmd, unsigned long addr, unsigned long end) +{ + pte_t *pte; + enum ProcIdlePageType page_type; + int err = 0; + + pte = pte_offset_kernel(pmd, addr); + do { + if (!pte_present(*pte)) + page_type = PTE_HOLE; + else if (!test_and_clear_bit(_PAGE_MM_BIT_ACCESSED, + (unsigned long *) &pte->pte)) + page_type = PTE_IDLE; + else + page_type = PTE_ACCESSED; + + err = pic_add_page(pic, addr, addr + PAGE_SIZE, page_type); + if (err) + break; + } while (pte++, addr += PAGE_SIZE, addr != end); + + return err; +} + +static int arm_pmd_range(struct page_idle_ctrl *pic, + pud_t *pud, unsigned long addr, unsigned long end) +{ + pmd_t *pmd; + unsigned long next; + enum ProcIdlePageType page_type; + enum ProcIdlePageType pte_page_type; + int err = 0; + + if (pic->flags & SCAN_HUGE_PAGE) + pte_page_type = PMD_IDLE_PTES; + else + pte_page_type = IDLE_PAGE_TYPE_MAX; + + pmd = pmd_offset(pud, addr); + do { + next = pmd_addr_end(addr, end); + if (!pmd_present(*pmd)) + page_type = PMD_HOLE; + else if (!if_pmd_thp_or_huge(*pmd)) + page_type = pte_page_type; + else if (!test_and_clear_bit(_PAGE_MM_BIT_ACCESSED, + (unsigned long *)pmd)) + page_type = PMD_IDLE; + else + page_type = PMD_ACCESSED; + + if (page_type != IDLE_PAGE_TYPE_MAX) + err = pic_add_page(pic, addr, next, page_type); + else + err = arm_pte_range(pic, pmd, addr, next); + if (err) + break; + } while (pmd++, addr = next, addr != end); + + return err; +} + +static int arm_pud_range(struct page_idle_ctrl *pic, + p4d_t *p4d, unsigned long addr, unsigned long end) +{ + pud_t *pud; + unsigned long next; + int err = 0; + + pud = pud_offset(p4d, addr); + do { + next = pud_addr_end(addr, end); + if (!pud_present(*pud)) { + set_restart_gpa(next, "PUD_HOLE"); + continue; + } + + if (if_pud_huge(*pud)) + err = pic_add_page(pic, addr, next, PUD_PRESENT); + else + err = arm_pmd_range(pic, pud, addr, next); + if (err) + break; + } while (pud++, addr = next, addr != end); + + return err; +} + +static int arm_p4d_range(struct page_idle_ctrl *pic, + pgd_t *pgd, unsigned long addr, unsigned long end) +{ + p4d_t *p4d; + unsigned long next; + int err = 0; + + p4d = p4d_offset(pgd, addr); + do { + next = p4d_addr_end(addr, end); + if (!p4d_present(*p4d)) { + set_restart_gpa(next, "P4D_HOLE"); + continue; + } + + err = arm_pud_range(pic, p4d, addr, next); + if (err) + break; + } while (p4d++, addr = next, addr != end); + + return err; +} + +static int arm_page_range(struct page_idle_ctrl *pic, + unsigned long addr, + unsigned long end) +{ + pgd_t *pgd; + unsigned long next; + struct kvm *kvm = pic->kvm; + int err = 0; + + WARN_ON(addr >= end); + + read_lock(&pic->kvm->mmu_lock); + pgd = (pgd_t *)kvm->arch.mmu.pgt->pgd + pgd_index(addr); + read_unlock(&pic->kvm->mmu_lock); + + local_irq_disable(); + do { + next = stage2_range_addr_end(addr, end); + if (!pgd_present(*pgd)) { + set_restart_gpa(next, "PGD_HOLE"); + continue; + } + + err = arm_p4d_range(pic, pgd, addr, next); + if (err) + break; + } while (pgd++, addr = next, addr != end); + + local_irq_enable(); + return err; +} +#endif + +/* + * Depending on whether hva falls in a memslot: + * + * 1) found => return gpa and remaining memslot size in *addr_range + * + * |<----- addr_range --------->| + * [ mem slot ] + * ^hva + * + * 2) not found => return hole size in *addr_range + * + * |<----- addr_range --------->| + * [first mem slot above hva ] + * ^hva + * + * If hva is above all mem slots, *addr_range will be ~0UL. 
+ * We can finish read(2). + */ +static unsigned long vm_idle_find_gpa(struct page_idle_ctrl *pic, + unsigned long hva, + unsigned long *addr_range) +{ + struct kvm *kvm = pic->kvm; + struct kvm_memslots *slots; + struct kvm_memory_slot *memslot; + unsigned long hva_end; + gfn_t gfn; + int bkt; + + *addr_range = ~0UL; + mutex_lock(&kvm->slots_lock); + slots = kvm_memslots(pic->kvm); + kvm_for_each_memslot(memslot, bkt, slots) { + hva_end = memslot->userspace_addr + + (memslot->npages << PAGE_SHIFT); + + if (hva >= memslot->userspace_addr && hva < hva_end) { + gpa_t gpa; + + gfn = hva_to_gfn_memslot(hva, memslot); + *addr_range = hva_end - hva; + gpa = gfn_to_gpa(gfn); + mutex_unlock(&kvm->slots_lock); + return gpa; + } + + if (memslot->userspace_addr > hva) + *addr_range = min(*addr_range, + memslot->userspace_addr - hva); + } + mutex_unlock(&kvm->slots_lock); + return INVALID_PAGE; +} + +static inline unsigned long mask_to_size(unsigned long mask) +{ + return ~mask + 1; +} + +static int mm_idle_hugetlb_entry(pte_t *pte, unsigned long hmask, + unsigned long addr, unsigned long next, + struct mm_walk *walk); +static int vm_idle_hugetlb_entry(pte_t *pte, unsigned long hmask, + unsigned long addr, unsigned long next, + struct mm_walk *walk) +{ + struct page_idle_ctrl *pic = walk->private; + enum ProcIdlePageType page_type; + + pic->flags |= VM_SCAN_HOST; + + /* hugetlb page table entry of vm maybe not present while page is resident + * in address_space + */ + if (mask_to_size(hmask) != PUD_SIZE && !pte_present(*pte) && + vm_handle_pte_hole != NULL) { + page_type = vm_handle_pte_hole(addr, next, -1, walk); + if (page_type < IDLE_PAGE_TYPE_MAX) + return pic_add_page(pic, addr, next, page_type); + } + + return mm_idle_hugetlb_entry(pte, hmask, addr, next, walk); +} + +static int vm_idle_pte_hole(unsigned long addr, unsigned long next, int depth, struct mm_walk *walk) +{ + struct page_idle_ctrl *pic = walk->private; + enum ProcIdlePageType pagetype; + + if (vm_handle_pte_hole == NULL) + return 0; + + pagetype = vm_handle_pte_hole(addr, next, depth, walk); + if (pagetype >= IDLE_PAGE_TYPE_MAX) + return 0; + + debug_printk("scan pte hole addr %pK type %d\n", addr, pagetype); + pic->flags |= VM_SCAN_HOST; + return pic_add_page(pic, addr, next, pagetype); +} + +static int mm_idle_pmd_entry(pmd_t *pmd, unsigned long addr, + unsigned long next, struct mm_walk *walk); +static int vm_idle_pmd_entry(pmd_t *pmd, unsigned long addr, + unsigned long next, struct mm_walk *walk) +{ + struct page_idle_ctrl *pic = walk->private; + + pic->flags |= VM_SCAN_HOST; + return mm_idle_pmd_entry(pmd, addr, next, walk); +} + +static int mm_idle_pud_entry(pud_t *pud, unsigned long addr, + unsigned long next, struct mm_walk *walk); +static int vm_idle_pud_entry(pud_t *pud, unsigned long addr, + unsigned long next, struct mm_walk *walk) +{ + struct page_idle_ctrl *pic = walk->private; + + pic->flags |= VM_SCAN_HOST; + return mm_idle_pud_entry(pud, addr, next, walk); +} + +static int vm_idle_walk_hva_range(struct page_idle_ctrl *pic, + unsigned long start, unsigned long end, + struct mm_walk *walk) +{ + unsigned long gpa_addr; + unsigned long gpa_next; + unsigned long gpa_end; + unsigned long addr_range; + unsigned long va_end; + int ret; + int steps; + +#ifdef CONFIG_X86_64 + ret = ept_idle_supports_cpu(pic->kvm); + if (ret) + return ret; +#endif + + ret = init_page_idle_ctrl_buffer(pic); + if (ret) + return ret; + + for (; start < end;) { + gpa_addr = vm_idle_find_gpa(pic, start, &addr_range); + + if (gpa_addr == 
INVALID_PAGE) { + pic->gpa_to_hva = 0; + if (addr_range == ~0UL) { + set_restart_gpa(TASK_SIZE, "EOF"); + va_end = end; + } else { + start += addr_range; + set_restart_gpa(start, "OUT-OF-SLOT"); + va_end = start; + } + } else { + pic->gpa_to_hva = start - gpa_addr; + gpa_end = gpa_addr + addr_range; + steps = 0; + for (; gpa_addr < gpa_end;) { + gpa_next = min(gpa_end, gpa_addr + walk_step * PAGE_SIZE); +#ifdef CONFIG_ARM64 + ret = arm_page_range(pic, gpa_addr, gpa_next); +#else + ret = ept_page_range(pic, gpa_addr, gpa_next, walk); +#endif + gpa_addr = pic->restart_gpa; + + if (ret) + break; + + if (++steps >= resched_step) { + cond_resched(); + steps = 0; + } + } + va_end = pic->gpa_to_hva + gpa_end; + } + + start = pic->restart_gpa + pic->gpa_to_hva; + ret = page_idle_copy_user(pic, start, va_end); + if (ret) + break; + } + + if (start > pic->next_hva) + set_next_hva(start, "NEXT-START"); + + if (pic->bytes_copied) + ret = 0; + return ret; +} + +static int mm_idle_test_walk(unsigned long start, unsigned long end, + struct mm_walk *walk); +static ssize_t vm_idle_read(struct file *file, char *buf, + size_t count, loff_t *ppos) +{ + struct mm_struct *mm = file->private_data; + struct mm_walk mm_walk = {}; + struct mm_walk_ops mm_walk_ops = {}; + struct page_idle_ctrl *pic; + unsigned long hva_start = *ppos; + unsigned long hva_end = hva_start + (count << (3 + PAGE_SHIFT)); + int ret; + + pic = kzalloc(sizeof(*pic), GFP_KERNEL); + if (!pic) + return -ENOMEM; + + setup_page_idle_ctrl(pic, buf, count, file->f_flags); + pic->kvm = mm_kvm(mm); + + mm_walk_ops.pmd_entry = vm_idle_pmd_entry; + mm_walk_ops.pud_entry = vm_idle_pud_entry; + mm_walk_ops.hugetlb_entry = vm_idle_hugetlb_entry; + mm_walk_ops.pte_hole = vm_idle_pte_hole; + mm_walk_ops.test_walk = mm_idle_test_walk; + + mm_walk.mm = mm; + mm_walk.ops = &mm_walk_ops; + mm_walk.private = pic; + + ret = vm_idle_walk_hva_range(pic, hva_start, hva_end, &mm_walk); + if (ret) + goto out_kvm; + + ret = pic->bytes_copied; + *ppos = pic->next_hva; +out_kvm: + kfree(pic); + return ret; + +} + +static ssize_t mm_idle_read(struct file *file, char *buf, + size_t count, loff_t *ppos); + +static ssize_t page_scan_read(struct file *file, char *buf, + size_t count, loff_t *ppos) +{ + struct mm_struct *mm = file->private_data; + unsigned long hva_start = *ppos; + unsigned long hva_end = hva_start + (count << (3 + PAGE_SHIFT)); + + if ((hva_start >= TASK_SIZE) || (hva_end >= TASK_SIZE)) { + debug_printk("page_idle_read past TASK_SIZE: %pK %pK %lx\n", + hva_start, hva_end, TASK_SIZE); + return 0; + } + if (hva_end <= hva_start) { + debug_printk("page_idle_read past EOF: %pK %pK\n", + hva_start, hva_end); + return 0; + } + if (*ppos & (PAGE_SIZE - 1)) { + debug_printk("page_idle_read unaligned ppos: %pK\n", + hva_start); + return -EINVAL; + } + if (count < PAGE_IDLE_BUF_MIN) { + debug_printk("page_idle_read small count: %lx\n", + (unsigned long)count); + return -EINVAL; + } + + if (!mm_kvm(mm)) + return mm_idle_read(file, buf, count, ppos); + + return vm_idle_read(file, buf, count, ppos); +} + +static int page_scan_open(struct inode *inode, struct file *file) +{ + if (!try_module_get(THIS_MODULE)) + return -EBUSY; + + return 0; +} + +static int page_scan_release(struct inode *inode, struct file *file) +{ + struct mm_struct *mm = file->private_data; + struct kvm *kvm; + int ret = 0; + + if (!mm) { + ret = -EBADF; + goto out; + } + + kvm = mm_kvm(mm); + if (!kvm) { + ret = -EINVAL; + goto out; + } +#ifdef CONFIG_X86_64 + write_lock(&kvm->mmu_lock); + 
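/* The scan path clears accessed/dirty bits directly in the EPT entries, so flush remote TLBs on release to drop cached translations that still carry the old A/D state. */ +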
kvm_flush_remote_tlbs(kvm); + write_unlock(&kvm->mmu_lock); +#endif + +out: + module_put(THIS_MODULE); + return ret; +} + +static int mm_idle_pmd_large(pmd_t pmd) +{ +#ifdef CONFIG_ARM64 + return if_pmd_thp_or_huge(pmd); +#else + return pmd_large(pmd); +#endif +} + +static int mm_idle_pte_range(struct page_idle_ctrl *pic, pmd_t *pmd, + unsigned long addr, unsigned long next) +{ + enum ProcIdlePageType page_type; + pte_t *pte; + int err = 0; + + pte = pte_offset_kernel(pmd, addr); + do { + if (!pte_present(*pte)) + page_type = PTE_HOLE; + else if (pic->flags & SCAN_IGN_HOST) + page_type = PTE_IDLE; + else if (!test_and_clear_bit(_PAGE_MM_BIT_ACCESSED, + (unsigned long *) &pte->pte)) + page_type = PTE_IDLE; + else { + page_type = PTE_ACCESSED; + } + + err = pic_add_page(pic, addr, addr + PAGE_SIZE, page_type); + if (err) + break; + } while (pte++, addr += PAGE_SIZE, addr != next); + + return err; +} + +static int mm_idle_hugetlb_entry(pte_t *pte, unsigned long hmask, + unsigned long addr, unsigned long next, + struct mm_walk *walk) +{ + struct page_idle_ctrl *pic = walk->private; + enum ProcIdlePageType page_type; + unsigned long start = addr & hmask; /* hugepage may be splited in vm */ + int ret; + + if (mask_to_size(hmask) == PUD_SIZE) { + page_type = PUD_PRESENT; + goto add_page; + } + + if (!pte_present(*pte)) + page_type = PMD_HOLE; + else if (pic->flags & SCAN_IGN_HOST) + page_type = PMD_IDLE; + else if (!test_and_clear_bit(_PAGE_MM_BIT_ACCESSED, (unsigned long *)pte)) + page_type = PMD_IDLE; + else + page_type = PMD_ACCESSED; + +add_page: + ret = pic_add_page(pic, start, start + pagetype_size[page_type], page_type); + return ret; +} + +static int mm_idle_pmd_entry(pmd_t *pmd, unsigned long addr, + unsigned long next, struct mm_walk *walk) +{ + struct page_idle_ctrl *pic = walk->private; + enum ProcIdlePageType page_type; + enum ProcIdlePageType pte_page_type; + int err; + + /* + * Skip duplicate PMD_IDLE_PTES: when the PMD crosses VMA boundary, + * walk_page_range() can call on the same PMD twice. + */ + if ((addr & PMD_MASK) == (pic->last_va & PMD_MASK) && (pic->flags & SCAN_HUGE_PAGE)) { + debug_printk("ignore duplicate addr %pK %pK\n", + addr, pic->last_va); + set_restart_gpa(round_up(next, PMD_SIZE), "DUP_ADDR"); + return 0; + } + pic->last_va = addr; + + if (pic->flags & SCAN_HUGE_PAGE) + pte_page_type = PMD_IDLE_PTES; + else + pte_page_type = IDLE_PAGE_TYPE_MAX; + + if (!pmd_present(*pmd)) + page_type = PMD_HOLE; + else if (!mm_idle_pmd_large(*pmd)) + page_type = pte_page_type; + else if (!test_and_clear_bit(_PAGE_MM_BIT_ACCESSED, + (unsigned long *)pmd) || + pic->flags & SCAN_IGN_HOST) + page_type = PMD_IDLE; + else + page_type = PMD_ACCESSED; + + if (page_type != IDLE_PAGE_TYPE_MAX) + err = pic_add_page(pic, addr, next, page_type); + else + err = mm_idle_pte_range(pic, pmd, addr, next); + + return err; +} + +static int mm_idle_pud_entry(pud_t *pud, unsigned long addr, + unsigned long next, struct mm_walk *walk) +{ + struct page_idle_ctrl *pic = walk->private; + + spinlock_t *ptl = pud_trans_huge_lock(pud, walk->vma); + + if (ptl) { + if ((addr & PUD_MASK) != (pic->last_va & PUD_MASK)) { + pic_add_page(pic, addr, next, PUD_PRESENT); + pic->last_va = addr; + } + spin_unlock(ptl); + return 1; + } + + return 0; +} + +static int mm_idle_test_walk(unsigned long start, unsigned long end, + struct mm_walk *walk) +{ + struct vm_area_struct *vma = walk->vma; + struct page_idle_ctrl *pic = walk->private; + + /* If the specified page swapout is set, the untagged vma is skipped. 
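(VM_SWAPFLAG is expected to be set per vma from user space, e.g. via the MADV_SWAPFLAG hint added to mman-common.h later in this patch.)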
*/ + if ((pic->flags & VMA_SCAN_FLAG) && !(vma->vm_flags & VM_SWAPFLAG)) + return 1; + + if (vma->vm_file) { + if (is_vm_hugetlb_page(vma)) + return 0; + if ((vma->vm_flags & (VM_WRITE|VM_MAYSHARE)) == VM_WRITE) + return 0; + return 1; + } + + return 0; +} + +static int mm_idle_walk_range(struct page_idle_ctrl *pic, + unsigned long start, + unsigned long end, + struct mm_walk *walk) +{ + struct vm_area_struct *vma; + int ret = 0; + + ret = init_page_idle_ctrl_buffer(pic); + if (ret) + return ret; + + for (; start < end;) { + mmap_read_lock(walk->mm); + vma = find_vma(walk->mm, start); + if (vma) { + if (end > vma->vm_start) { + local_irq_disable(); + ret = walk_page_range(walk->mm, start, end, + walk->ops, walk->private); + local_irq_enable(); + } else + set_restart_gpa(vma->vm_start, "VMA-HOLE"); + } else + set_restart_gpa(TASK_SIZE, "EOF"); + mmap_read_unlock(walk->mm); + WARN_ONCE(pic->gpa_to_hva, "non-zero gpa_to_hva"); + if (ret != PAGE_IDLE_KBUF_FULL && end > pic->restart_gpa) + pic->restart_gpa = end; + start = pic->restart_gpa; + ret = page_idle_copy_user(pic, start, end); + if (ret) + break; + } + + if (start > pic->next_hva) + set_next_hva(start, "NEXT-START"); + + if (pic->bytes_copied) { + if (ret != PAGE_IDLE_BUF_FULL && pic->next_hva < end) + debug_printk("partial scan: next_hva=%pK end=%pK\n", + pic->next_hva, end); + ret = 0; + } else + debug_printk("nothing read"); + return ret; +} + +static ssize_t mm_idle_read(struct file *file, char *buf, + size_t count, loff_t *ppos) +{ + struct mm_struct *mm = file->private_data; + struct mm_walk_ops *mm_walk_ops = NULL; + struct mm_walk mm_walk = {}; + struct page_idle_ctrl *pic; + unsigned long va_start = *ppos; + unsigned long va_end = va_start + (count << (3 + PAGE_SHIFT)); + int ret; + + if (va_end <= va_start) { + debug_printk("%s past EOF: %pK %pK\n", + __func__, va_start, va_end); + return 0; + } + if (*ppos & (PAGE_SIZE - 1)) { + debug_printk("%s unaligned ppos: %pK\n", + __func__, va_start); + return -EINVAL; + } + if (count < PAGE_IDLE_BUF_MIN) { + debug_printk("%s small count: %lx\n", + __func__, (unsigned long)count); + return -EINVAL; + } + + pic = kzalloc(sizeof(*pic), GFP_KERNEL); + if (!pic) + return -ENOMEM; + + mm_walk_ops = kzalloc(sizeof(struct mm_walk_ops), GFP_KERNEL); + if (!mm_walk_ops) { + kfree(pic); + return -ENOMEM; + } + + setup_page_idle_ctrl(pic, buf, count, file->f_flags); + + mm_walk_ops->pmd_entry = mm_idle_pmd_entry; + mm_walk_ops->pud_entry = mm_idle_pud_entry; + mm_walk_ops->hugetlb_entry = mm_idle_hugetlb_entry; + mm_walk_ops->test_walk = mm_idle_test_walk; + + mm_walk.mm = mm; + mm_walk.ops = mm_walk_ops; + mm_walk.private = pic; + mm_walk.pgd = NULL; + mm_walk.no_vma = false; + ret = mm_idle_walk_range(pic, va_start, va_end, &mm_walk); + if (ret) + goto out_free; + + ret = pic->bytes_copied; + *ppos = pic->next_hva; +out_free: + kfree(pic); + kfree(mm_walk_ops); + return ret; +} + +static long page_scan_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) +{ + void __user *argp = (void __user *)arg; + unsigned int flags; + + if (get_user(flags, (unsigned int __user *)argp)) + return -EFAULT; + flags &= ALL_SCAN_FLAGS; + + switch (cmd) { + case IDLE_SCAN_ADD_FLAGS: + filp->f_flags |= flags; + break; + case IDLE_SCAN_REMOVE_FLAGS: + filp->f_flags &= ~flags; + break; + case VMA_SCAN_ADD_FLAGS: + filp->f_flags |= flags; + break; + case VMA_SCAN_REMOVE_FLAGS: + filp->f_flags &= ~flags; + break; + default: + return -EOPNOTSUPP; + } + + return 0; +} + +extern struct file_operations 
proc_page_scan_operations; + +static int page_scan_entry(void) +{ + proc_page_scan_operations.flock(NULL, 1, NULL); + proc_page_scan_operations.owner = THIS_MODULE; + proc_page_scan_operations.read = page_scan_read; + proc_page_scan_operations.open = page_scan_open; + proc_page_scan_operations.release = page_scan_release; + proc_page_scan_operations.unlocked_ioctl = page_scan_ioctl; + proc_page_scan_operations.flock(NULL, 0, NULL); + + return 0; +} + +static void page_scan_exit(void) +{ + proc_page_scan_operations.flock(NULL, 1, NULL); + proc_page_scan_operations.owner = NULL; + proc_page_scan_operations.read = NULL; + proc_page_scan_operations.open = NULL; + proc_page_scan_operations.release = NULL; + proc_page_scan_operations.unlocked_ioctl = NULL; + proc_page_scan_operations.flock(NULL, 0, NULL); +} + +MODULE_LICENSE("GPL"); +module_init(page_scan_entry); +module_exit(page_scan_exit); diff --git a/fs/proc/etmem_scan.h b/fs/proc/etmem_scan.h new file mode 100644 index 0000000000000000000000000000000000000000..e109f7f350e1ae3d3d3cb06070e9ae9686ec3c1e --- /dev/null +++ b/fs/proc/etmem_scan.h @@ -0,0 +1,149 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _PAGE_IDLE_H +#define _PAGE_IDLE_H + +#include + +#define SCAN_HUGE_PAGE O_NONBLOCK /* only huge page */ +#define SCAN_SKIM_IDLE O_NOFOLLOW /* stop on PMD_IDLE_PTES */ +#define SCAN_DIRTY_PAGE O_NOATIME /* report pte/pmd dirty bit */ + +/* define to not used file flags */ +#define SCAN_AS_HUGE 0100000000 /* treat normal page as hugepage in vm */ +#define SCAN_IGN_HOST 0200000000 /* ignore host access when scan vm */ +#define VM_SCAN_HOST 0400000000 /* scan and add host page for vm hole(internal) */ +#define VMA_SCAN_FLAG 0x1000 /* scan the specifics vma with flag */ + +#define ALL_SCAN_FLAGS (SCAN_HUGE_PAGE | SCAN_SKIM_IDLE | SCAN_DIRTY_PAGE | \ + SCAN_AS_HUGE | SCAN_IGN_HOST | VM_SCAN_HOST | VMA_SCAN_FLAG) + +#define IDLE_SCAN_MAGIC 0x66 +#define IDLE_SCAN_ADD_FLAGS _IOW(IDLE_SCAN_MAGIC, 0x0, unsigned int) +#define IDLE_SCAN_REMOVE_FLAGS _IOW(IDLE_SCAN_MAGIC, 0x1, unsigned int) +#define VMA_SCAN_ADD_FLAGS _IOW(IDLE_SCAN_MAGIC, 0x2, unsigned int) +#define VMA_SCAN_REMOVE_FLAGS _IOW(IDLE_SCAN_MAGIC, 0x3, unsigned int) + +enum ProcIdlePageType { + PTE_ACCESSED, /* 4k page */ + PMD_ACCESSED, /* 2M page */ + PUD_PRESENT, /* 1G page */ + + PTE_DIRTY_M, + PMD_DIRTY_M, + + PTE_IDLE, + PMD_IDLE, + PMD_IDLE_PTES, /* all PTE idle */ + + PTE_HOLE, + PMD_HOLE, + + PIP_CMD, + + IDLE_PAGE_TYPE_MAX +}; + +#define PIP_TYPE(a) (0xf & (a >> 4)) +#define PIP_SIZE(a) (0xf & a) +#define PIP_COMPOSE(type, nr) ((type << 4) | nr) + +#define PIP_CMD_SET_HVA PIP_COMPOSE(PIP_CMD, 0) + +#ifndef INVALID_PAGE +#define INVALID_PAGE ~0UL +#endif + +#ifdef CONFIG_ARM64 +#define _PAGE_MM_BIT_ACCESSED 10 +#else +#define _PAGE_MM_BIT_ACCESSED _PAGE_BIT_ACCESSED +#endif + +#ifdef CONFIG_X86_64 +#define _PAGE_BIT_EPT_ACCESSED 8 +#define _PAGE_BIT_EPT_DIRTY 9 +#define _PAGE_EPT_ACCESSED (_AT(pteval_t, 1) << _PAGE_BIT_EPT_ACCESSED) +#define _PAGE_EPT_DIRTY (_AT(pteval_t, 1) << _PAGE_BIT_EPT_DIRTY) + +#define _PAGE_EPT_PRESENT (_AT(pteval_t, 7)) + +static inline int ept_pte_present(pte_t a) +{ + return pte_flags(a) & _PAGE_EPT_PRESENT; +} + +static inline int ept_pmd_present(pmd_t a) +{ + return pmd_flags(a) & _PAGE_EPT_PRESENT; +} + +static inline int ept_pud_present(pud_t a) +{ + return pud_flags(a) & _PAGE_EPT_PRESENT; +} + +static inline int ept_p4d_present(p4d_t a) +{ + return p4d_flags(a) & _PAGE_EPT_PRESENT; +} + +static inline int ept_pgd_present(pgd_t a) +{ + return 
pgd_flags(a) & _PAGE_EPT_PRESENT; +} + +static inline int ept_pte_accessed(pte_t a) +{ + return pte_flags(a) & _PAGE_EPT_ACCESSED; +} + +static inline int ept_pmd_accessed(pmd_t a) +{ + return pmd_flags(a) & _PAGE_EPT_ACCESSED; +} + +static inline int ept_pud_accessed(pud_t a) +{ + return pud_flags(a) & _PAGE_EPT_ACCESSED; +} + +static inline int ept_p4d_accessed(p4d_t a) +{ + return p4d_flags(a) & _PAGE_EPT_ACCESSED; +} + +static inline int ept_pgd_accessed(pgd_t a) +{ + return pgd_flags(a) & _PAGE_EPT_ACCESSED; +} +#endif + +extern struct file_operations proc_page_scan_operations; + +#define PAGE_IDLE_KBUF_FULL 1 +#define PAGE_IDLE_BUF_FULL 2 +#define PAGE_IDLE_BUF_MIN (sizeof(uint64_t) * 2 + 3) + +#define PAGE_IDLE_KBUF_SIZE 8000 + +struct page_idle_ctrl { + struct mm_struct *mm; + struct kvm *kvm; + + uint8_t kpie[PAGE_IDLE_KBUF_SIZE]; + int pie_read; + int pie_read_max; + + void __user *buf; + int buf_size; + int bytes_copied; + + unsigned long next_hva; /* GPA for EPT; VA for PT */ + unsigned long gpa_to_hva; + unsigned long restart_gpa; + unsigned long last_va; + + unsigned int flags; +}; + +#endif diff --git a/fs/proc/etmem_swap.c b/fs/proc/etmem_swap.c new file mode 100644 index 0000000000000000000000000000000000000000..86f5cf8c90a1d3811904b73c7a07106272997b18 --- /dev/null +++ b/fs/proc/etmem_swap.c @@ -0,0 +1,282 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define RECLAIM_SWAPCACHE_MAGIC 0X77 +#define SET_SWAPCACHE_WMARK _IOW(RECLAIM_SWAPCACHE_MAGIC, 0x02, unsigned int) +#define RECLAIM_SWAPCACHE_ON _IOW(RECLAIM_SWAPCACHE_MAGIC, 0x01, unsigned int) +#define RECLAIM_SWAPCACHE_OFF _IOW(RECLAIM_SWAPCACHE_MAGIC, 0x00, unsigned int) + +#define WATERMARK_MAX 100 +#define SWAP_SCAN_NUM_MAX 32 + +static struct task_struct *reclaim_swapcache_tk; +static bool enable_swapcache_reclaim; +static unsigned long swapcache_watermark[ETMEM_SWAPCACHE_NR_WMARK]; + +static DECLARE_WAIT_QUEUE_HEAD(reclaim_queue); + +static ssize_t swap_pages_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + char *p, *data, *data_ptr_res; + unsigned long vaddr; + struct mm_struct *mm = file->private_data; + struct page *page; + LIST_HEAD(pagelist); + int ret = 0; + + if (!mm || !mmget_not_zero(mm)) { + ret = -ESRCH; + goto out; + } + + if (count < 0) { + ret = -EOPNOTSUPP; + goto out_mm; + } + + data = memdup_user_nul(buf, count); + if (IS_ERR(data)) { + ret = PTR_ERR(data); + goto out_mm; + } + + data_ptr_res = data; + while ((p = strsep(&data, "\n")) != NULL) { + if (!*p) + continue; + + ret = kstrtoul(p, 16, &vaddr); + if (ret != 0) + continue; + + /* If get page struct failed, ignore it, get next page */ + page = get_page_from_vaddr(mm, vaddr); + if (!page) + continue; + + add_page_for_swap(page, &pagelist); + } + + if (!list_empty(&pagelist)) + reclaim_pages(&pagelist); + + ret = count; + kfree(data_ptr_res); +out_mm: + mmput(mm); +out: + return ret; +} + +static int swap_pages_open(struct inode *inode, struct file *file) +{ + if (!try_module_get(THIS_MODULE)) + return -EBUSY; + + return 0; +} + +static int swap_pages_release(struct inode *inode, struct file *file) +{ + module_put(THIS_MODULE); + return 0; +} + +/* check if swapcache meet requirements */ +static bool swapcache_balanced(void) +{ + return total_swapcache_pages() < swapcache_watermark[ETMEM_SWAPCACHE_WMARK_HIGH]; +} + +/* the flag present if swapcache reclaim is started 
*/ +static bool swapcache_reclaim_enabled(void) +{ + return READ_ONCE(enable_swapcache_reclaim); +} + +static void start_swapcache_reclaim(void) +{ + if (swapcache_balanced()) + return; + /* RECLAIM_SWAPCACHE_ON trigger the thread to start running. */ + if (!waitqueue_active(&reclaim_queue)) + return; + + WRITE_ONCE(enable_swapcache_reclaim, true); + wake_up_interruptible(&reclaim_queue); +} + +static void stop_swapcache_reclaim(void) +{ + WRITE_ONCE(enable_swapcache_reclaim, false); +} + +static bool should_goto_sleep(void) +{ + if (swapcache_balanced()) + stop_swapcache_reclaim(); + + if (swapcache_reclaim_enabled()) + return false; + + return true; +} + +static int get_swapcache_watermark(unsigned int ratio) +{ + unsigned int low_watermark; + unsigned int high_watermark; + + low_watermark = ratio & 0xFF; + high_watermark = (ratio >> 8) & 0xFF; + if (low_watermark > WATERMARK_MAX || + high_watermark > WATERMARK_MAX || + low_watermark > high_watermark) + return -EPERM; + + swapcache_watermark[ETMEM_SWAPCACHE_WMARK_LOW] = totalram_pages() * + low_watermark / WATERMARK_MAX; + swapcache_watermark[ETMEM_SWAPCACHE_WMARK_HIGH] = totalram_pages() * + high_watermark / WATERMARK_MAX; + + return 0; +} + +extern struct file_operations proc_swap_pages_operations; + +static void reclaim_swapcache_try_to_sleep(void) +{ + DEFINE_WAIT(wait); + + if (freezing(current) || kthread_should_stop()) + return; + + prepare_to_wait(&reclaim_queue, &wait, TASK_INTERRUPTIBLE); + if (should_goto_sleep()) { + if (!kthread_should_stop()) + schedule(); + } + finish_wait(&reclaim_queue, &wait); +} + +static void etmem_reclaim_swapcache(void) +{ + do_swapcache_reclaim(swapcache_watermark, + ARRAY_SIZE(swapcache_watermark)); + stop_swapcache_reclaim(); +} + +static int reclaim_swapcache_proactive(void *para) +{ + set_freezable(); + + while (1) { + bool ret; + + reclaim_swapcache_try_to_sleep(); + ret = try_to_freeze(); + if (kthread_should_stop()) + break; + + if (ret) + continue; + + etmem_reclaim_swapcache(); + } + + return 0; +} + +static int reclaim_swapcache_run(void) +{ + int ret = 0; + + reclaim_swapcache_tk = kthread_run(reclaim_swapcache_proactive, NULL, + "etmem_recalim_swapcache"); + if (IS_ERR(reclaim_swapcache_tk)) { + ret = PTR_ERR(reclaim_swapcache_tk); + reclaim_swapcache_tk = NULL; + } + return ret; +} + +static long swap_page_ioctl(struct file *filp, unsigned int cmd, + unsigned long arg) +{ + void __user *argp = (void __user *)arg; + unsigned int ratio; + + switch (cmd) { + case RECLAIM_SWAPCACHE_ON: + if (swapcache_reclaim_enabled()) + return 0; + start_swapcache_reclaim(); + break; + case RECLAIM_SWAPCACHE_OFF: + stop_swapcache_reclaim(); + break; + case SET_SWAPCACHE_WMARK: + if (get_user(ratio, (unsigned int __user *)argp)) + return -EFAULT; + + if (get_swapcache_watermark(ratio) != 0) + return -EFAULT; + break; + default: + return -EPERM; + } + + return 0; +} + +static int swap_pages_entry(void) +{ + proc_swap_pages_operations.flock(NULL, 1, NULL); + proc_swap_pages_operations.owner = THIS_MODULE; + proc_swap_pages_operations.write = swap_pages_write; + proc_swap_pages_operations.open = swap_pages_open; + proc_swap_pages_operations.release = swap_pages_release; + proc_swap_pages_operations.unlocked_ioctl = swap_page_ioctl; + proc_swap_pages_operations.flock(NULL, 0, NULL); + + enable_swapcache_reclaim = false; + reclaim_swapcache_run(); + + return 0; +} + +static void swap_pages_exit(void) +{ + proc_swap_pages_operations.flock(NULL, 1, NULL); + proc_swap_pages_operations.owner = NULL; + 
proc_swap_pages_operations.write = NULL; + proc_swap_pages_operations.open = NULL; + proc_swap_pages_operations.release = NULL; + proc_swap_pages_operations.unlocked_ioctl = NULL; + proc_swap_pages_operations.flock(NULL, 0, NULL); + + if (!IS_ERR(reclaim_swapcache_tk)) { + kthread_stop(reclaim_swapcache_tk); + reclaim_swapcache_tk = NULL; + } + return; +} + +MODULE_LICENSE("GPL"); +module_init(swap_pages_entry); +module_exit(swap_pages_exit); diff --git a/fs/proc/internal.h b/fs/proc/internal.h index 9dda7e54b2d0d98766b940701dd47a5034f2f463..24f74abfcbc40fa47c2e91e3487cba5784b52170 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h @@ -305,6 +305,10 @@ extern const struct file_operations proc_pid_smaps_operations; extern const struct file_operations proc_pid_smaps_rollup_operations; extern const struct file_operations proc_clear_refs_operations; extern const struct file_operations proc_pagemap_operations; +#ifdef CONFIG_ETMEM +extern const struct file_operations proc_mm_idle_operations; +extern const struct file_operations proc_mm_swap_operations; +#endif extern unsigned long task_vsize(struct mm_struct *); extern unsigned long task_statm(struct mm_struct *, diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 420510f6a545ecc1891082866c58ac8c5ffcf088..f1f3b03e1867811d819d3d76e426177615d4a621 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -19,6 +19,7 @@ #include #include #include +#include #include #include @@ -1770,6 +1771,199 @@ const struct file_operations proc_pagemap_operations = { .open = pagemap_open, .release = pagemap_release, }; + +#ifdef CONFIG_ETMEM +static DEFINE_SPINLOCK(scan_lock); + +static int page_scan_lock(struct file *file, int is_lock, struct file_lock *flock) +{ + if (is_lock) + spin_lock(&scan_lock); + else + spin_unlock(&scan_lock); + + return 0; +} + +/* will be filled when kvm_ept_idle module loads */ +struct file_operations proc_page_scan_operations = { + .flock = page_scan_lock, +}; +EXPORT_SYMBOL_GPL(proc_page_scan_operations); + +static ssize_t mm_idle_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + struct mm_struct *mm = file->private_data; + int ret = 0; + + if (!mm || !mmget_not_zero(mm)) { + ret = -ESRCH; + return ret; + } + if (proc_page_scan_operations.read) + ret = proc_page_scan_operations.read(file, buf, count, ppos); + + mmput(mm); + return ret; +} + +static int mm_idle_open(struct inode *inode, struct file *file) +{ + struct mm_struct *mm = NULL; + struct module *module = NULL; + int ret = -1; + + if (!file_ns_capable(file, &init_user_ns, CAP_SYS_ADMIN)) + return -EPERM; + + page_scan_lock(NULL, 1, NULL); + module = proc_page_scan_operations.owner; + if (module != NULL && try_module_get(module)) + ret = 0; + page_scan_lock(NULL, 0, NULL); + if (ret != 0) { + /* no scan ko installed, avoid to return valid file */ + return -ENODEV; + } + + mm = proc_mem_open(inode, PTRACE_MODE_READ); + if (IS_ERR(mm)) + return PTR_ERR(mm); + + file->private_data = mm; + + if (proc_page_scan_operations.open) + return proc_page_scan_operations.open(inode, file); + + return 0; +} + +static int mm_idle_release(struct inode *inode, struct file *file) +{ + struct mm_struct *mm = file->private_data; + int ret = 0; + + if (mm) { + if (!mm_kvm(mm)) + flush_tlb_mm(mm); + mmdrop(mm); + } + + if (proc_page_scan_operations.release) + ret = proc_page_scan_operations.release(inode, file); + + if (proc_page_scan_operations.owner) + module_put(proc_page_scan_operations.owner); + + return ret; +} + +static long mm_idle_ioctl(struct 
file *filp, unsigned int cmd, unsigned long arg) +{ + if (proc_page_scan_operations.unlocked_ioctl) + return proc_page_scan_operations.unlocked_ioctl(filp, cmd, arg); + + return 0; +} + +const struct file_operations proc_mm_idle_operations = { + .llseek = mem_lseek, /* borrow this */ + .read = mm_idle_read, + .open = mm_idle_open, + .release = mm_idle_release, + .unlocked_ioctl = mm_idle_ioctl, +}; + +static DEFINE_SPINLOCK(swap_lock); + +static int page_swap_lock(struct file *file, int is_lock, struct file_lock *flock) +{ + if (is_lock) + spin_lock(&swap_lock); + else + spin_unlock(&swap_lock); + + return 0; +} +/*swap pages*/ +struct file_operations proc_swap_pages_operations = { + .flock = page_swap_lock, +}; +EXPORT_SYMBOL_GPL(proc_swap_pages_operations); + +static ssize_t mm_swap_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + if (proc_swap_pages_operations.write) + return proc_swap_pages_operations.write(file, buf, count, ppos); + + return -1; +} + +static int mm_swap_open(struct inode *inode, struct file *file) +{ + struct mm_struct *mm = NULL; + struct module *module = NULL; + int ret = -1; + + if (!file_ns_capable(file, &init_user_ns, CAP_SYS_ADMIN)) + return -EPERM; + + page_swap_lock(NULL, 1, NULL); + module = proc_swap_pages_operations.owner; + if (module != NULL && try_module_get(module)) + ret = 0; + page_swap_lock(NULL, 0, NULL); + if (ret != 0) { + /* no swap ko installed, avoid to return valid file */ + return -ENODEV; + } + + mm = proc_mem_open(inode, PTRACE_MODE_READ); + if (IS_ERR(mm)) + return PTR_ERR(mm); + + file->private_data = mm; + + if (proc_swap_pages_operations.open) + return proc_swap_pages_operations.open(inode, file); + + return 0; +} + +static int mm_swap_release(struct inode *inode, struct file *file) +{ + struct mm_struct *mm = file->private_data; + int ret = 0; + + if (mm) + mmdrop(mm); + + if (proc_swap_pages_operations.release) + ret = proc_swap_pages_operations.release(inode, file); + + if (proc_swap_pages_operations.owner) + module_put(proc_swap_pages_operations.owner); + + return ret; +} + +static long mm_swap_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) +{ + if (proc_swap_pages_operations.unlocked_ioctl) + return proc_swap_pages_operations.unlocked_ioctl(filp, cmd, arg); + return 0; +} + +const struct file_operations proc_mm_swap_operations = { + .llseek = mem_lseek, + .write = mm_swap_write, + .open = mm_swap_open, + .release = mm_swap_release, + .unlocked_ioctl = mm_swap_ioctl, +}; +#endif #endif /* CONFIG_PROC_PAGE_MONITOR */ #ifdef CONFIG_NUMA diff --git a/include/linux/highmem.h b/include/linux/highmem.h index 4de1dbcd3ef642f5a39b4e02fd54073933db237d..4cf084e6371b3d4ab16690298f711ecc58473948 100644 --- a/include/linux/highmem.h +++ b/include/linux/highmem.h @@ -513,4 +513,21 @@ static inline void put_and_unmap_page(struct page *page, void *addr) put_page(page); } +#ifndef __HAVE_ARCH_COPY_HUGEPAGES + +static inline void copy_highpages(struct folio *dst, struct folio *src) +{ + long i = 0; + long nr = folio_nr_pages(src); + + for (;;) { + copy_highpage(folio_page(dst, i), folio_page(src, i)); + if (++i == nr) + break; + cond_resched(); + } +} + +#endif /* __HAVE_ARCH_COPY_HUGEPAGES */ + #endif /* _LINUX_HIGHMEM_H */ diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 6d041aa9f0feb32c61f507097708b13dce9db6ec..1c5b9123378f78cbb36162c9a6b630af566871df 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -181,6 +181,12 @@ struct address_space 
*hugetlb_page_mapping_lock_write(struct page *hpage); extern int sysctl_hugetlb_shm_group; extern struct list_head huge_boot_pages; +#ifdef CONFIG_HUGETLBFS +extern int sysctl_hugetlb_mig_noalloc; +extern int sysctl_hugetlb_pmem_allocall; +#endif + + /* arch callbacks */ #ifndef CONFIG_HIGHPTE diff --git a/include/linux/list.h b/include/linux/list.h index f10344dbad4dfe5f51f7cce825884d4923c66c08..f946aae6e8ab05420e6e4bf5e2191fa33d4aeeac 100644 --- a/include/linux/list.h +++ b/include/linux/list.h @@ -825,6 +825,23 @@ static inline size_t list_count_nodes(struct list_head *head) !list_entry_is_head(pos, head, member); \ pos = n, n = list_prev_entry(n, member)) +/** + * list_for_each_entry_safe_reverse_from - iterate backwards over list from + * current point safe against removal + * @pos: the type * to use as a loop cursor. + * @n: another type * to use as temporary storage + * @head: the head for your list. + * @member: the name of the list_head within the struct. + * + * Iterate backwards over list of given type from current point, safe against + * removal of list entry. + */ +#define list_for_each_entry_safe_reverse_from(pos, n, head, member) \ + for (n = list_prev_entry(pos, member); \ + !list_entry_is_head(pos, head, member); \ + pos = n, n = list_prev_entry(n, member)) + + /** * list_safe_reset_next - reset a stale list_for_each_entry_safe loop * @pos: the loop cursor used in the list_for_each_entry_safe loop diff --git a/include/linux/mm.h b/include/linux/mm.h index 50f04282efcb1ecb76cf76e43031ee33310dff67..15eb40f2cd97b27c28512aa9b0f82c34336dfbb8 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -313,6 +313,10 @@ extern unsigned int kobjsize(const void *objp); #define VM_NOHUGEPAGE 0x40000000 /* MADV_NOHUGEPAGE marked this vma */ #define VM_MERGEABLE 0x80000000 /* KSM may merge identical pages */ +#ifdef CONFIG_ETMEM +#define VM_SWAPFLAG 0x400000000000000 /* memory swap out flag in vma */ +#endif + #ifdef CONFIG_ARCH_USES_HIGH_VMA_FLAGS #define VM_HIGH_ARCH_BIT_0 32 /* bit only usable on 64-bit architectures */ #define VM_HIGH_ARCH_BIT_1 33 /* bit only usable on 64-bit architectures */ diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index cac73ccf7367a3e7fd876b3fc0d640ec981a1ad7..98a13fb411bfd8b366f2a3fab4d10f9986237ac8 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -35,7 +35,7 @@ struct address_space; struct mem_cgroup; - +struct kvm; /* * Each physical page in the system has a struct page associated with * it to keep track of whatever it is we are using the page for at the @@ -849,6 +849,9 @@ struct mm_struct { #endif /* CONFIG_LRU_GEN */ #ifdef CONFIG_GMEM gm_as_t *gm_as; +#endif +#if IS_ENABLED(CONFIG_KVM) + struct kvm *kvm; #endif } __randomize_layout; @@ -863,6 +866,18 @@ struct mm_struct { MT_FLAGS_USE_RCU) extern struct mm_struct init_mm; +#if IS_ENABLED(CONFIG_KVM) +static inline struct kvm *mm_kvm(struct mm_struct *mm) +{ + return mm->kvm; +} +#else +static inline struct kvm *mm_kvm(struct mm_struct *mm) +{ + return NULL; +} +#endif + /* Pointer magic because the dynamic array size confuses some compilers. 
*/ static inline void mm_init_cpumask(struct mm_struct *mm) { diff --git a/include/linux/numa.h b/include/linux/numa.h index 59df211d051fa8373faccf0fca20c293ea90124c..fdcd888f70cd3d81a0ced36dcdc83419c4f1311d 100644 --- a/include/linux/numa.h +++ b/include/linux/numa.h @@ -20,6 +20,11 @@ #define __initdata_or_meminfo __initdata #endif +enum node_type { + NODE_TYPE_DRAM, + NODE_TYPE_PMEM, +}; + #ifdef CONFIG_NUMA #include #include @@ -43,6 +48,8 @@ static inline int phys_to_target_node(u64 start) return 0; } #endif +void set_node_type(int nid, enum node_type type); +enum node_type get_node_type(int nid); #else /* !CONFIG_NUMA */ static inline int numa_map_to_online_node(int node) { @@ -56,6 +63,11 @@ static inline int phys_to_target_node(u64 start) { return 0; } +static inline enum node_type get_node_type(int nid) +{ + return NODE_TYPE_DRAM; +} +static inline void set_node_type(int nid, enum node_type type) {} #endif #ifdef CONFIG_HAVE_ARCH_NODE_DEV_GROUP diff --git a/include/linux/swap.h b/include/linux/swap.h index 3c69cb653cb90f6007815b1e2e14fcd3a55d3f81..f620decea34e1c48b085ede751579f7f6647281c 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -442,6 +442,44 @@ extern unsigned long shrink_all_memory(unsigned long nr_pages); extern int vm_swappiness; long remove_mapping(struct address_space *mapping, struct folio *folio); +#ifdef CONFIG_ETMEM +enum etmem_swapcache_watermark_en { + ETMEM_SWAPCACHE_WMARK_LOW, + ETMEM_SWAPCACHE_WMARK_HIGH, + ETMEM_SWAPCACHE_NR_WMARK +}; + +extern struct kobj_attribute kernel_swap_enable_attr; +extern int add_page_for_swap(struct page *page, struct list_head *pagelist); +extern struct page *get_page_from_vaddr(struct mm_struct *mm, + unsigned long vaddr); +extern int do_swapcache_reclaim(unsigned long *swapcache_watermark, + unsigned int watermark_nr); +extern bool kernel_swap_enabled(void); +#else +static inline int add_page_for_swap(struct page *page, struct list_head *pagelist) +{ + return 0; +} + +static inline struct page *get_page_from_vaddr(struct mm_struct *mm, + unsigned long vaddr) +{ + return NULL; +} + +static inline int do_swapcache_reclaim(unsigned long *swapcache_watermark, + unsigned int watermark_nr) +{ + return 0; +} + +static inline bool kernel_swap_enabled(void) +{ + return true; +} +#endif + #ifdef CONFIG_NUMA extern int node_reclaim_mode; extern int sysctl_min_unmapped_ratio; @@ -715,6 +753,7 @@ static inline bool mem_cgroup_swap_full(struct folio *folio) return vm_swap_full(); } #endif +extern unsigned long reclaim_pages(struct list_head *folio_list); #endif /* __KERNEL__*/ #endif /* _LINUX_SWAP_H */ diff --git a/include/uapi/asm-generic/mman-common.h b/include/uapi/asm-generic/mman-common.h index 9f6ee16d18847cb34173cc3561e0be0d67ffc7ce..24c4be60f7133000d2fb5088ad91f376d049669b 100644 --- a/include/uapi/asm-generic/mman-common.h +++ b/include/uapi/asm-generic/mman-common.h @@ -86,6 +86,10 @@ #define MADV_PREFETCH MADV_GMEM_BASE /* prefetch pages for hNUMA node */ #define MADV_PINNED (MADV_GMEM_BASE+1) /* pin these pages */ +#define MADV_ETMEM_BASE 0x1100 +#define MADV_SWAPFLAG MADV_ETMEM_BASE /* for memory to be swap out */ +#define MADV_SWAPFLAG_REMOVE (MADV_SWAPFLAG + 1) + /* compatibility flags */ #define MAP_FILE 0 diff --git a/mm/Kconfig b/mm/Kconfig index b950407dd87fd9d814b891bad3138967c4c33cb7..2b69c1b3d8a73b51c7a658bbb55dbba23db0bc82 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -1215,6 +1215,31 @@ config GMEM help say Y here to enable gmem subsystem +config ETMEM_SCAN + tristate "module: etmem page scan for etmem 
support" + depends on ETMEM + help + etmem page scan feature + used to scan the virtual address of the target process + +config ETMEM_SWAP + tristate "module: etmem page swap for etmem support" + depends on ETMEM + help + etmem page swap feature + +config ETMEM + bool "Enable etmem feature" + depends on MMU + depends on X86 || ARM64 + default n + help + etmem is a tiered memory extension technology that uses DRAM and memory + compression/high-performance storage media to form tiered memory storage. + Memory data is tiered, and cold data is migrated from memory media to + high-performance storage media to release memory space and reduce + memory costs. + source "mm/damon/Kconfig" endmenu diff --git a/mm/Makefile b/mm/Makefile index 0824907eab98e9bc9e8f856c8e071bdd8fcba3cb..cc147c0d7ca0c0faaf7a86c174bdd9aecf7b431e 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -138,3 +138,4 @@ obj-$(CONFIG_IO_MAPPING) += io-mapping.o obj-$(CONFIG_HAVE_BOOTMEM_INFO_NODE) += bootmem_info.o obj-$(CONFIG_GENERIC_IOREMAP) += ioremap.o obj-$(CONFIG_SHRINKER_DEBUG) += shrinker_debug.o +obj-$(CONFIG_ETMEM) += etmem.o diff --git a/mm/etmem.c b/mm/etmem.c new file mode 100644 index 0000000000000000000000000000000000000000..4187fe7eef0c660638141cb7f053f07f87a44780 --- /dev/null +++ b/mm/etmem.c @@ -0,0 +1,383 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include +#include +#include +#include +#include + +#include "internal.h" + +static bool enable_kernel_swap __read_mostly = true; + +bool kernel_swap_enabled(void) +{ + return READ_ONCE(enable_kernel_swap); +} + +static ssize_t kernel_swap_enable_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sprintf(buf, "%s\n", enable_kernel_swap ? "true" : "false"); +} +static ssize_t kernel_swap_enable_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + if (!strncmp(buf, "true", 4) || !strncmp(buf, "1", 1)) + WRITE_ONCE(enable_kernel_swap, true); + else if (!strncmp(buf, "false", 5) || !strncmp(buf, "0", 1)) + WRITE_ONCE(enable_kernel_swap, false); + else + return -EINVAL; + + return count; +} + +struct kobj_attribute kernel_swap_enable_attr = + __ATTR(kernel_swap_enable, 0644, kernel_swap_enable_show, + kernel_swap_enable_store); + +int add_page_for_swap(struct page *page, struct list_head *pagelist) +{ + int err = -EBUSY; + struct page *head; + + /* If the page is mapped by more than one process, do not swap it */ + if (page_mapcount(page) > 1) + return -EACCES; + + if (PageHuge(page)) + return -EACCES; + + head = compound_head(page); + if (!folio_isolate_lru(page_folio(head))) { + put_page(page); + return err; + } + put_page(page); + if (PageUnevictable(page)) + putback_lru_page(page); + else + list_add_tail(&head->lru, pagelist); + + err = 0; + return err; +} +EXPORT_SYMBOL_GPL(add_page_for_swap); + +struct page *get_page_from_vaddr(struct mm_struct *mm, unsigned long vaddr) +{ + struct page *page; + struct vm_area_struct *vma; + unsigned int follflags; + + mmap_read_lock(mm); + + vma = find_vma(mm, vaddr); + if (!vma || vaddr < vma->vm_start || vma->vm_flags & VM_LOCKED) { + mmap_read_unlock(mm); + return NULL; + } + + follflags = FOLL_GET | FOLL_DUMP | FOLL_FORCE; + page = follow_page(vma, vaddr, follflags); + if (IS_ERR(page) || !page) { + mmap_read_unlock(mm); + return NULL; + } + + mmap_read_unlock(mm); + return page; +} +EXPORT_SYMBOL_GPL(get_page_from_vaddr); + +static int add_page_for_reclaim_swapcache(struct page *page, + struct list_head *pagelist, struct lruvec *lruvec, enum lru_list 
lru) +{ + struct page *head; + + /* If the page is mapped by more than one process, do not swap it */ + if (page_mapcount(page) > 1) + return -EINVAL; + + if (PageHuge(page)) + return -EINVAL; + + head = compound_head(page); + if (!PageLRU(head) || PageUnevictable(head)) + return -EBUSY; + + if (unlikely(!get_page_unless_zero(page))) + return -EBUSY; + + if (!TestClearPageLRU(page)) { + /* + * This page may be in another isolation path, + * but we still hold the lru_lock. + */ + put_page(page); + return -EBUSY; + } + + list_move(&head->lru, pagelist); + update_lru_size(lruvec, lru, page_zonenum(head), -thp_nr_pages(head)); + + return 0; +} + +static unsigned long reclaim_swapcache_pages_from_list(int nid, + struct list_head *page_list, unsigned long reclaim_num, bool putback_flag) +{ + unsigned long nr_reclaimed = 0; + unsigned long nr_moved = 0; + struct page *page, *next; + LIST_HEAD(swap_pages); + struct pglist_data *pgdat = NULL; + + pgdat = NODE_DATA(nid); + + if (putback_flag) + goto putback_list; + + if (reclaim_num == 0) + return 0; + + list_for_each_entry_safe(page, next, page_list, lru) { + if (!page_is_file_lru(page) && !__PageMovable(page) + && PageSwapCache(page)) { + ClearPageActive(page); + list_move(&page->lru, &swap_pages); + nr_moved++; + } + + if (nr_moved >= reclaim_num) + break; + } + + /* swap the pages */ + if (pgdat) + nr_reclaimed = reclaim_pages(&swap_pages); + + return nr_reclaimed; + +putback_list: + while (!list_empty(page_list)) { + page = lru_to_page(page_list); + list_del(&page->lru); + putback_lru_page(page); + } + + return nr_reclaimed; +} + +#define SWAP_SCAN_NUM_MAX 32 + +static bool swapcache_below_watermark(unsigned long *swapcache_watermark) +{ + return total_swapcache_pages() < swapcache_watermark[ETMEM_SWAPCACHE_WMARK_LOW]; +} + +static unsigned long get_swapcache_reclaim_num(unsigned long *swapcache_watermark) +{ + return total_swapcache_pages() > + swapcache_watermark[ETMEM_SWAPCACHE_WMARK_LOW] ? + (total_swapcache_pages() - swapcache_watermark[ETMEM_SWAPCACHE_WMARK_LOW]) : 0; +} + +/* + * The main function to reclaim swapcache; the whole reclaim process is + * divided into 3 steps: + * 1. get the total_swapcache_pages num to reclaim. + * 2. scan the LRU linked list of each memory node to obtain the + * swapcache pages that can be reclaimed. + * 3. reclaim the swapcache pages until the requirements are met. + */ +int do_swapcache_reclaim(unsigned long *swapcache_watermark, + unsigned int watermark_nr) +{ + int err = -EINVAL; + unsigned long swapcache_to_reclaim = 0; + unsigned long nr_reclaimed = 0; + unsigned long swapcache_total_reclaimable = 0; + unsigned long reclaim_page_count = 0; + + unsigned long *nr = NULL; + unsigned long *nr_to_reclaim = NULL; + struct list_head *swapcache_list = NULL; + + int nid = 0; + struct lruvec *lruvec = NULL; + struct list_head *src = NULL; + struct page *page = NULL; + struct page *next = NULL; + struct page *pos = NULL; + + struct mem_cgroup *memcg = NULL; + struct mem_cgroup *target_memcg = NULL; + + pg_data_t *pgdat = NULL; + unsigned int scan_count = 0; + int nid_num = 0; + + if (swapcache_watermark == NULL || + watermark_nr < ETMEM_SWAPCACHE_NR_WMARK) + return err; + + /* get the total_swapcache_pages num to reclaim.
*/ + swapcache_to_reclaim = get_swapcache_reclaim_num(swapcache_watermark); + if (swapcache_to_reclaim <= 0) + return err; + + nr = kcalloc(MAX_NUMNODES, sizeof(unsigned long), GFP_KERNEL); + if (nr == NULL) + return -ENOMEM; + + nr_to_reclaim = kcalloc(MAX_NUMNODES, sizeof(unsigned long), GFP_KERNEL); + if (nr_to_reclaim == NULL) { + kfree(nr); + return -ENOMEM; + } + + swapcache_list = kcalloc(MAX_NUMNODES, sizeof(struct list_head), GFP_KERNEL); + if (swapcache_list == NULL) { + kfree(nr); + kfree(nr_to_reclaim); + return -ENOMEM; + } + + /* + * scan the LRU linked list of each memory node to obtain the + * swapcache pages that can be reclaimed. + */ + for_each_node_state(nid, N_MEMORY) { + INIT_LIST_HEAD(&swapcache_list[nid_num]); + cond_resched(); + + pgdat = NODE_DATA(nid); + + memcg = mem_cgroup_iter(target_memcg, NULL, NULL); + do { + cond_resched(); + pos = NULL; + lruvec = mem_cgroup_lruvec(memcg, pgdat); + src = &(lruvec->lists[LRU_INACTIVE_ANON]); + spin_lock_irq(&lruvec->lru_lock); + scan_count = 0; + + /* + * Scan the swapcache pages that are not mapped from + * the end of the LRU linked list, scan SWAP_SCAN_NUM_MAX + * pages each time, and record the scan end point page. + */ + + pos = list_last_entry(src, struct page, lru); + spin_unlock_irq(&lruvec->lru_lock); +do_scan: + cond_resched(); + scan_count = 0; + spin_lock_irq(&lruvec->lru_lock); + + /* + * Check whether the pos page has been released from the LRU list; if so, + * cancel the subsequent page scanning of the current node. + */ + if (!pos || list_entry_is_head(pos, src, lru)) { + spin_unlock_irq(&lruvec->lru_lock); + continue; + } + + if (!PageLRU(pos) || folio_lru_list(page_folio(pos)) != LRU_INACTIVE_ANON) { + spin_unlock_irq(&lruvec->lru_lock); + continue; + } + + page = pos; + pos = NULL; + /* Continue to scan down from the last scan breakpoint */ + list_for_each_entry_safe_reverse_from(page, next, src, lru) { + scan_count++; + pos = next; + if (scan_count >= SWAP_SCAN_NUM_MAX) + break; + + if (!PageSwapCache(page)) + continue; + + if (page_mapped(page)) + continue; + + if (add_page_for_reclaim_swapcache(page, + &swapcache_list[nid_num], + lruvec, LRU_INACTIVE_ANON) != 0) + continue; + + nr[nid_num]++; + swapcache_total_reclaimable++; + } + spin_unlock_irq(&lruvec->lru_lock); + + /* + * Check whether the scanned pages meet + * the reclaim requirements. + */ + if (swapcache_total_reclaimable <= swapcache_to_reclaim || + scan_count >= SWAP_SCAN_NUM_MAX) + goto do_scan; + + } while ((memcg = mem_cgroup_iter(target_memcg, memcg, NULL))); + + /* Start reclaiming the next memory node. */ + nid_num++; + } + + /* reclaim the swapcache pages until the requirements are met. */ + do { + nid_num = 0; + reclaim_page_count = 0; + + /* start swapcache page reclaim for each node. */ + for_each_node_state(nid, N_MEMORY) { + cond_resched(); + + nr_to_reclaim[nid_num] = (swapcache_total_reclaimable == 0) ? 0 : + ((swapcache_to_reclaim * nr[nid_num]) / + swapcache_total_reclaimable); + + reclaim_page_count += reclaim_swapcache_pages_from_list(nid, + &swapcache_list[nid_num], + nr_to_reclaim[nid_num], false); + nid_num++; + } + + nr_reclaimed += reclaim_page_count; + + /* + * Check whether the reclaimed swapcache pages reach the reclaim target or + * the number of swapcache pages reclaimed in this round is 0. Stop reclaim.
+ */ + if (nr_reclaimed >= swapcache_to_reclaim || reclaim_page_count == 0) + goto exit; + } while (!swapcache_below_watermark(swapcache_watermark) || + nr_reclaimed < swapcache_to_reclaim); +exit: + nid_num = 0; + /* + * Put the swapcache pages that were not reclaimed back + * onto the LRU linked list. + */ + for_each_node_state(nid, N_MEMORY) { + cond_resched(); + reclaim_swapcache_pages_from_list(nid, + &swapcache_list[nid_num], 0, true); + nid_num++; + } + + kfree(nr); + kfree(nr_to_reclaim); + kfree(swapcache_list); + + return 0; +} +EXPORT_SYMBOL_GPL(do_swapcache_reclaim); diff --git a/mm/huge_memory.c b/mm/huge_memory.c index a55c88ba305df4956b4324ec2be71cc892fc7bff..7f90f8fb6b0cf4e173cbb31f55d4b973475404ef 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2051,6 +2051,7 @@ spinlock_t *__pud_trans_huge_lock(pud_t *pud, struct vm_area_struct *vma) spin_unlock(ptl); return NULL; } +EXPORT_SYMBOL_GPL(__pud_trans_huge_lock); #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD int zap_huge_pud(struct mmu_gather *tlb, struct vm_area_struct *vma, diff --git a/mm/hugetlb.c b/mm/hugetlb.c index c48a5b3a3688cb7cdc1179f58d140c978e11ba64..ccde9f048d6bb300b812192c05060579b8ca7bc8 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -90,6 +90,9 @@ DEFINE_SPINLOCK(hugetlb_lock); static int num_fault_mutexes; struct mutex *hugetlb_fault_mutex_table ____cacheline_aligned_in_smp; +int sysctl_hugetlb_mig_noalloc; +int sysctl_hugetlb_pmem_allocall; + /* Forward declaration */ static int hugetlb_acct_memory(struct hstate *h, long delta); static void hugetlb_vma_lock_free(struct vm_area_struct *vma); @@ -2219,6 +2222,8 @@ static int alloc_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed, gfp_t gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE; for_each_node_mask_to_alloc(h, nr_nodes, node, nodes_allowed) { + if (get_node_type(node) == NODE_TYPE_PMEM && sysctl_hugetlb_pmem_allocall) + gfp_mask |= __GFP_MEMALLOC; folio = alloc_fresh_hugetlb_folio(h, gfp_mask, node, nodes_allowed, node_alloc_noretry); if (folio) { @@ -2487,7 +2492,7 @@ struct folio *alloc_hugetlb_folio_nodemask(struct hstate *h, int preferred_nid, folio = dequeue_hugetlb_folio_nodemask(h, gfp_mask, preferred_nid, nmask); - if (folio) { + if (folio || sysctl_hugetlb_mig_noalloc) { spin_unlock_irq(&hugetlb_lock); return folio; } @@ -4739,6 +4744,26 @@ static struct ctl_table hugetlb_table[] = { .mode = 0644, .proc_handler = hugetlb_overcommit_handler, }, +#ifdef CONFIG_HUGETLBFS + { + .procname = "hugepage_mig_noalloc", + .data = &sysctl_hugetlb_mig_noalloc, + .maxlen = sizeof(sysctl_hugetlb_mig_noalloc), + .mode = 0600, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, + { + .procname = "hugepage_pmem_allocall", + .data = &sysctl_hugetlb_pmem_allocall, + .maxlen = sizeof(sysctl_hugetlb_pmem_allocall), + .mode = 0600, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, +#endif { } }; diff --git a/mm/internal.h b/mm/internal.h index 68410c6d97aca9d801b4fd8a8ef6b7070c2aa70d..ba568b48072c799bf04c393d7a01bb6c553100e3 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -783,7 +783,6 @@ extern unsigned long __must_check vm_mmap_pgoff(struct file *, unsigned long, unsigned long, unsigned long); extern void set_pageblock_order(void); -unsigned long reclaim_pages(struct list_head *folio_list); unsigned int reclaim_clean_pages_from_list(struct zone *zone, struct list_head *folio_list); /* The ALLOC_WMARK bits are used as an index to zone->watermark */ diff
--git a/mm/madvise.c b/mm/madvise.c index b5ffbaf616f51cfb5a529f3306204f975b91b5ed..250d0e8e9a0d8a8cd120b4a075320d78192d7a90 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -1072,6 +1072,14 @@ static int madvise_vma_behavior(struct vm_area_struct *vma, break; case MADV_COLLAPSE: return madvise_collapse(vma, prev, start, end); +#ifdef CONFIG_ETMEM + case MADV_SWAPFLAG: + new_flags |= VM_SWAPFLAG; + break; + case MADV_SWAPFLAG_REMOVE: + new_flags &= ~VM_SWAPFLAG; + break; +#endif } anon_name = anon_vma_name(vma); @@ -1174,9 +1182,12 @@ madvise_behavior_valid(int behavior) #ifdef CONFIG_MEMORY_FAILURE case MADV_SOFT_OFFLINE: case MADV_HWPOISON: +#endif +#ifdef CONFIG_ETMEM + case MADV_SWAPFLAG: + case MADV_SWAPFLAG_REMOVE: #endif return true; - default: return false; } @@ -1368,6 +1379,10 @@ int madvise_set_anon_name(struct mm_struct *mm, unsigned long start, * triggering read faults if required * MADV_POPULATE_WRITE - populate (prefault) page tables writable by * triggering write faults if required + * MADV_SWAPFLAG - Used in the etmem memory extension feature, the process + * specifies the memory swap area by adding a flag to a specific + * vma address. + * MADV_SWAPFLAG_REMOVE - remove the specific vma flag * * return values: * zero - success diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 90762bee97306f48a3d5329b9b2292fb3a3c953c..8ab70473a2d0dc4867de48959e6c26dd537d15d2 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -7218,3 +7218,16 @@ bool has_managed_dma(void) return false; } #endif /* CONFIG_ZONE_DMA */ + +#ifdef CONFIG_NUMA +enum node_type nodes_type[MAX_NUMNODES]; + +void set_node_type(int nid, enum node_type type) +{ + nodes_type[nid] = type; +} +enum node_type get_node_type(int nid) +{ + return nodes_type[nid]; +} +#endif diff --git a/mm/pagewalk.c b/mm/pagewalk.c index cb23f8a15c134af536b9464e425660aba3f5ea6d..0d39aebb432ec37893809c74a4edcc5244e5e4d2 100644 --- a/mm/pagewalk.c +++ b/mm/pagewalk.c @@ -481,6 +481,7 @@ int walk_page_range(struct mm_struct *mm, unsigned long start, } while (start = next, start < end); return err; } +EXPORT_SYMBOL_GPL(walk_page_range); /** * walk_page_range_novma - walk a range of pagetables not backed by a vma diff --git a/mm/swap_state.c b/mm/swap_state.c index b76a65ac28b319ca2a73b33a430a0195e8584b62..4ce292e2aea380d9d7b30059fd90e54e646185c5 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -871,6 +871,9 @@ static struct kobj_attribute vma_ra_enabled_attr = __ATTR_RW(vma_ra_enabled); static struct attribute *swap_attrs[] = { &vma_ra_enabled_attr.attr, +#ifdef CONFIG_ETMEM + &kernel_swap_enable_attr.attr, +#endif NULL, }; diff --git a/mm/util.c b/mm/util.c index dd12b9531ac4cad5f37879c2412719b3724ecb78..6593ad7b84eea0186d97d7985a55f6d8e825053e 100644 --- a/mm/util.c +++ b/mm/util.c @@ -23,6 +23,7 @@ #include #include #include +#include #include @@ -792,15 +793,7 @@ EXPORT_SYMBOL(folio_mapping); */ void folio_copy(struct folio *dst, struct folio *src) { - long i = 0; - long nr = folio_nr_pages(src); - - for (;;) { - copy_highpage(folio_page(dst, i), folio_page(src, i)); - if (++i == nr) - break; - cond_resched(); - } + copy_highpages(dst, src); } int sysctl_overcommit_memory __read_mostly = OVERCOMMIT_GUESS; diff --git a/mm/vmscan.c b/mm/vmscan.c index 5bf98d0a22c9a7107b76040b49fccd6aec353128..f981539e59aebe92b2b218a3c4b76401b525cdca 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -33,6 +33,7 @@ #include #include #include +#include #include #include #include @@ -2814,6 +2815,7 @@ unsigned long reclaim_pages(struct list_head *folio_list) 
return nr_reclaimed; } +EXPORT_SYMBOL_GPL(reclaim_pages); static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan, struct lruvec *lruvec, struct scan_control *sc) @@ -6981,6 +6983,18 @@ static bool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist, return false; } +/* + * Check whether the original kernel swap path is enabled; if not, + * turn off kernel swap but leave page cache reclaim on. + */ +static inline void kernel_force_no_swap(struct scan_control *sc) +{ +#ifdef CONFIG_ETMEM + if (sc != NULL && !kernel_swap_enabled()) + sc->may_swap = 0; +#endif +} + unsigned long try_to_free_pages(struct zonelist *zonelist, int order, gfp_t gfp_mask, nodemask_t *nodemask) { @@ -6997,6 +7011,7 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order, .may_swap = 1, }; + kernel_force_no_swap(&sc); /* * scan_control uses s8 fields for order, priority, and reclaim_idx. * Confirm they are large enough for max values. @@ -7434,6 +7449,8 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int highest_zoneidx) sc.may_writepage = !laptop_mode && !nr_boost_reclaim; sc.may_swap = !nr_boost_reclaim; + kernel_force_no_swap(&sc); + /* * Do some background aging, to give pages a chance to be * referenced before reclaiming. All pages are rotated @@ -7812,6 +7829,8 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim) noreclaim_flag = memalloc_noreclaim_save(); set_task_reclaim_state(current, &sc.reclaim_state); + kernel_force_no_swap(&sc); + nr_reclaimed = do_try_to_free_pages(zonelist, &sc); set_task_reclaim_state(current, NULL); diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 65f94f592ff88380d0b6af736600eab0f5a38cbb..905da44be082e4f4e9b224ec74ce0b2a2e775a81 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -1297,6 +1297,9 @@ static void kvm_destroy_vm(struct kvm *kvm) kvm_destroy_pm_notifier(kvm); kvm_uevent_notify_change(KVM_EVENT_DESTROY_VM, kvm); +#if IS_ENABLED(CONFIG_KVM) + mm->kvm = NULL; +#endif kvm_destroy_vm_debugfs(kvm); kvm_arch_sync_events(kvm); mutex_lock(&kvm_lock); @@ -5054,6 +5057,9 @@ static int kvm_dev_ioctl_create_vm(unsigned long type) * cases it will be called by the final fput(file) and will take * care of doing kvm_put_kvm(kvm). */ +#if IS_ENABLED(CONFIG_KVM) + kvm->mm->kvm = kvm; +#endif kvm_uevent_notify_change(KVM_EVENT_CREATE_VM, kvm); fd_install(fd, file);
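
Editor's note, for illustration only (not part of the patch): a minimal user-space sketch of how the user-visible pieces added by this series fit together. The MADV_SWAPFLAG value mirrors the MADV_ETMEM_BASE definition added to include/uapi/asm-generic/mman-common.h, and kernel_swap_enable is the sysfs attribute added to the mm/swap_state.c attribute group (normally exposed under /sys/kernel/mm/swap/). The /proc/<pid>/swap_pages name and the hex-address text format written to it are assumptions based on the etmem user-space tooling; the proc entry registration and the accepted input format are supplied by the out-of-tree etmem_swap module, not by the hunks shown here.

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

#ifndef MADV_SWAPFLAG
#define MADV_SWAPFLAG 0x1100	/* MADV_ETMEM_BASE in this series */
#endif

int main(void)
{
	size_t len = 2UL << 20;		/* 2 MiB of anonymous memory */
	char path[64], line[32];
	int fd;

	void *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (buf == MAP_FAILED)
		return 1;
	memset(buf, 0x5a, len);

	/* Tag the VMA as an etmem swap candidate (sets VM_SWAPFLAG). */
	if (madvise(buf, len, MADV_SWAPFLAG))
		perror("madvise(MADV_SWAPFLAG)");

	/* Keep kernel-initiated anonymous swap off while etmem drives reclaim. */
	fd = open("/sys/kernel/mm/swap/kernel_swap_enable", O_WRONLY);
	if (fd >= 0) {
		if (write(fd, "false", 5) < 0)
			perror("write kernel_swap_enable");
		close(fd);
	}

	/* Hypothetical: hand one candidate address to the swap module. */
	snprintf(path, sizeof(path), "/proc/%d/swap_pages", (int)getpid());
	fd = open(path, O_WRONLY);
	if (fd >= 0) {
		int n = snprintf(line, sizeof(line), "0x%lx\n",
				 (unsigned long)buf);
		if (write(fd, line, n) < 0)
			perror("write swap_pages");
		close(fd);
	}

	munmap(buf, len);
	return 0;
}

The indirection through proc_swap_pages_operations and proc_page_scan_operations in fs/proc/task_mmu.c is what makes this work: the core kernel only installs the flock-based lock stubs, so opening the corresponding proc files returns -ENODEV until the ETMEM_SCAN/ETMEM_SWAP modules fill in the read/write/open/release/ioctl callbacks.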