diff --git a/arch/arm64/configs/openeuler_defconfig b/arch/arm64/configs/openeuler_defconfig index 35dd6141f057ad6e162b8bdce2a533b8ba0ae2ca..1abe8fbb6057b8aa731b17da8a232023c2a6e286 100644 --- a/arch/arm64/configs/openeuler_defconfig +++ b/arch/arm64/configs/openeuler_defconfig @@ -1152,6 +1152,9 @@ CONFIG_PER_VMA_LOCK=y CONFIG_LOCK_MM_AND_FIND_VMA=y CONFIG_MEMORY_RELIABLE=y CONFIG_DYNAMIC_POOL=y +CONFIG_ETMEM_SCAN=m +CONFIG_ETMEM_SWAP=m +CONFIG_ETMEM=y # # Data Access Monitoring diff --git a/arch/x86/configs/openeuler_defconfig b/arch/x86/configs/openeuler_defconfig index dfefc129ed0eb7a77a3dabecd2112128185d9333..1bf9fbf4ea59dbeef4fd28e1851d2bf8e19c399f 100644 --- a/arch/x86/configs/openeuler_defconfig +++ b/arch/x86/configs/openeuler_defconfig @@ -1167,6 +1167,9 @@ CONFIG_ARCH_SUPPORTS_PER_VMA_LOCK=y CONFIG_PER_VMA_LOCK=y CONFIG_LOCK_MM_AND_FIND_VMA=y CONFIG_DYNAMIC_POOL=y +CONFIG_ETMEM_SCAN=m +CONFIG_ETMEM_SWAP=m +CONFIG_ETMEM=y # # Data Access Monitoring diff --git a/fs/proc/Makefile b/fs/proc/Makefile index 70dca85a5861d9d379955b199e33788924dd1751..daa43e10b40b60edd7b9751ce051b3acdeccca6c 100644 --- a/fs/proc/Makefile +++ b/fs/proc/Makefile @@ -35,3 +35,6 @@ proc-$(CONFIG_PRINTK) += kmsg.o proc-$(CONFIG_PROC_PAGE_MONITOR) += page.o proc-$(CONFIG_BOOT_CONFIG) += bootconfig.o proc-$(CONFIG_MEMORY_RELIABLE) += mem_reliable.o +obj-$(CONFIG_ETMEM_SCAN) += etmem_scan.o +obj-$(CONFIG_ETMEM_SWAP) += etmem_swap.o +proc-${CONFIG_ETMEM} += etmem_proc.o diff --git a/fs/proc/base.c b/fs/proc/base.c index e04b0126334f991775223b2b1149f9447ac712f2..c9d792103c2ff0f4ad9f03b307f27f13a4ee7694 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -3366,6 +3366,10 @@ static const struct pid_entry tgid_base_stuff[] = { REG("smaps_rollup", S_IRUGO, proc_pid_smaps_rollup_operations), REG("pagemap", S_IRUSR, proc_pagemap_operations), #endif +#ifdef CONFIG_ETMEM + REG("idle_pages", S_IRUSR|S_IWUSR, proc_mm_idle_operations), + REG("swap_pages", S_IWUSR, proc_mm_swap_operations), +#endif #ifdef CONFIG_SECURITY DIR("attr", S_IRUGO|S_IXUGO, proc_attr_dir_inode_operations, proc_attr_dir_operations), #endif @@ -3718,6 +3722,10 @@ static const struct pid_entry tid_base_stuff[] = { REG("smaps_rollup", S_IRUGO, proc_pid_smaps_rollup_operations), REG("pagemap", S_IRUSR, proc_pagemap_operations), #endif +#ifdef CONFIG_ETMEM + REG("idle_pages", S_IRUSR|S_IWUSR, proc_mm_idle_operations), + REG("swap_pages", S_IWUSR, proc_mm_swap_operations), +#endif #ifdef CONFIG_SECURITY DIR("attr", S_IRUGO|S_IXUGO, proc_attr_dir_inode_operations, proc_attr_dir_operations), #endif diff --git a/fs/proc/etmem_proc.c b/fs/proc/etmem_proc.c new file mode 100644 index 0000000000000000000000000000000000000000..2e6712cc43b26051b40c0172d9554604a5ae32e3 --- /dev/null +++ b/fs/proc/etmem_proc.c @@ -0,0 +1,216 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "internal.h" + +static DEFINE_SPINLOCK(scan_lock); + +static int page_scan_lock(struct file *file, int is_lock, struct file_lock *flock) +{ + if (is_lock) + spin_lock(&scan_lock); + else + spin_unlock(&scan_lock); + + return 0; +} + +/* will be filled when kvm_ept_idle module loads */ +struct file_operations proc_page_scan_operations = { + .flock = page_scan_lock, +}; +EXPORT_SYMBOL_GPL(proc_page_scan_operations); + +static ssize_t mm_idle_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + struct mm_struct *mm = file->private_data; + int ret = 0; + + 
if (!mm || !mmget_not_zero(mm)) { + ret = -ESRCH; + return ret; + } + if (proc_page_scan_operations.read) + ret = proc_page_scan_operations.read(file, buf, count, ppos); + + mmput(mm); + return ret; +} + +static int mm_idle_open(struct inode *inode, struct file *file) +{ + struct mm_struct *mm = NULL; + struct module *module = NULL; + int ret = -1; + + if (!file_ns_capable(file, &init_user_ns, CAP_SYS_ADMIN)) + return -EPERM; + + page_scan_lock(NULL, 1, NULL); + module = proc_page_scan_operations.owner; + if (module != NULL && try_module_get(module)) + ret = 0; + page_scan_lock(NULL, 0, NULL); + if (ret != 0) { + /* no scan ko installed, avoid to return valid file */ + return -ENODEV; + } + + mm = proc_mem_open(inode, PTRACE_MODE_READ); + if (IS_ERR(mm)) { + module_put(module); + return PTR_ERR(mm); + } + + file->private_data = mm; + + if (proc_page_scan_operations.open) + ret = proc_page_scan_operations.open(inode, file); + + if (ret != 0) + module_put(module); + + return ret; +} + +static int mm_idle_release(struct inode *inode, struct file *file) +{ + struct mm_struct *mm = file->private_data; + int ret = 0; + + if (mm) { + if (!mm_kvm(mm)) + flush_tlb_mm(mm); + mmdrop(mm); + } + + if (proc_page_scan_operations.release) + ret = proc_page_scan_operations.release(inode, file); + + if (proc_page_scan_operations.owner) + module_put(proc_page_scan_operations.owner); + + return ret; +} + +static long mm_idle_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) +{ + if (proc_page_scan_operations.unlocked_ioctl) + return proc_page_scan_operations.unlocked_ioctl(filp, cmd, arg); + + return 0; +} + +const struct file_operations proc_mm_idle_operations = { + .llseek = mem_lseek, /* borrow this */ + .read = mm_idle_read, + .open = mm_idle_open, + .release = mm_idle_release, + .unlocked_ioctl = mm_idle_ioctl, +}; + +static DEFINE_SPINLOCK(swap_lock); + +static int page_swap_lock(struct file *file, int is_lock, struct file_lock *flock) +{ + if (is_lock) + spin_lock(&swap_lock); + else + spin_unlock(&swap_lock); + + return 0; +} +/*swap pages*/ +struct file_operations proc_swap_pages_operations = { + .flock = page_swap_lock, +}; +EXPORT_SYMBOL_GPL(proc_swap_pages_operations); + +static ssize_t mm_swap_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + if (proc_swap_pages_operations.write) + return proc_swap_pages_operations.write(file, buf, count, ppos); + + return -1; +} + +static int mm_swap_open(struct inode *inode, struct file *file) +{ + struct mm_struct *mm = NULL; + struct module *module = NULL; + int ret = -1; + + if (!file_ns_capable(file, &init_user_ns, CAP_SYS_ADMIN)) + return -EPERM; + + page_swap_lock(NULL, 1, NULL); + module = proc_swap_pages_operations.owner; + if (module != NULL && try_module_get(module)) + ret = 0; + page_swap_lock(NULL, 0, NULL); + if (ret != 0) { + /* no swap ko installed, avoid to return valid file */ + return -ENODEV; + } + + mm = proc_mem_open(inode, PTRACE_MODE_READ); + if (IS_ERR(mm)) { + module_put(module); + return PTR_ERR(mm); + } + + file->private_data = mm; + + if (proc_swap_pages_operations.open) + ret = proc_swap_pages_operations.open(inode, file); + + if (ret != 0) + module_put(module); + + return ret; +} + +static int mm_swap_release(struct inode *inode, struct file *file) +{ + struct mm_struct *mm = file->private_data; + int ret = 0; + + if (mm) + mmdrop(mm); + + if (proc_swap_pages_operations.release) + ret = proc_swap_pages_operations.release(inode, file); + + if (proc_swap_pages_operations.owner) + 
module_put(proc_swap_pages_operations.owner); + + return ret; +} + +static long mm_swap_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) +{ + if (proc_swap_pages_operations.unlocked_ioctl) + return proc_swap_pages_operations.unlocked_ioctl(filp, cmd, arg); + return 0; +} + +const struct file_operations proc_mm_swap_operations = { + .llseek = mem_lseek, + .write = mm_swap_write, + .open = mm_swap_open, + .release = mm_swap_release, + .unlocked_ioctl = mm_swap_ioctl, +}; diff --git a/fs/proc/etmem_scan.c b/fs/proc/etmem_scan.c new file mode 100644 index 0000000000000000000000000000000000000000..1b5436d50e4709ad3ef6869d71abe73d04316663 --- /dev/null +++ b/fs/proc/etmem_scan.c @@ -0,0 +1,1383 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef CONFIG_ARM64 +#include +#include +#include +#include +#include +#endif +#include "etmem_scan.h" +#include + +#ifdef CONFIG_X86_64 +/* + * Fallback to false for kernel doens't support KVM_INVALID_SPTE + * ept_idle can sitll work in this situation but the scan accuracy may drop, + * depends on the access frequences of the workload. + */ +#ifdef KVM_INVALID_SPTE +#define KVM_CHECK_INVALID_SPTE(val) ((val) == KVM_INVALID_SPTE) +#else +#define KVM_CHECK_INVALID_SPTE(val) (0) +#endif + +# define kvm_arch_mmu_pointer(vcpu) (vcpu->arch.mmu) +# define kvm_mmu_ad_disabled(mmu) (mmu->cpu_role.base.ad_disabled) +#endif /*CONFIG_X86_64*/ + +#ifdef CONFIG_ARM64 +#define if_pmd_thp_or_huge(pmd) (if_pmd_huge(pmd) || pmd_trans_huge(pmd)) +#endif /* CONFIG_ARM64 */ + +#ifdef DEBUG + +#define debug_printk trace_printk + +#define set_restart_gpa(val, note) ({ \ + unsigned long old_val = pic->restart_gpa; \ + pic->restart_gpa = (val); \ + trace_printk("restart_gpa=%lx %luK %s %s %d\n", \ + (val), (pic->restart_gpa - old_val) >> 10, \ + note, __func__, __LINE__); \ +}) + +#define set_next_hva(val, note) ({ \ + unsigned long old_val = pic->next_hva; \ + pic->next_hva = (val); \ + trace_printk(" next_hva=%lx %luK %s %s %d\n", \ + (val), (pic->next_hva - old_val) >> 10, \ + note, __func__, __LINE__); \ +}) + +#else + +#define debug_printk(...) 
+ +#define set_restart_gpa(val, note) ({ \ + pic->restart_gpa = (val); \ +}) + +#define set_next_hva(val, note) ({ \ + pic->next_hva = (val); \ +}) + +#endif + +#define RET_RESCAN_FLAG 0x10000 + +/* error return IDLE_PAGE_TYPE_MAX or return valid page type */ +enum ProcIdlePageType (*vm_handle_pte_hole)(unsigned long addr, + unsigned long next, int depth, struct mm_walk *walk) = NULL; +EXPORT_SYMBOL_GPL(vm_handle_pte_hole); + +static int set_walk_step(const char *val, const struct kernel_param *kp) +{ + int ret; + unsigned int n; + + ret = kstrtouint(val, 0, &n); + if (ret != 0 || n == 0) + return -EINVAL; + + return param_set_uint(val, kp); +} + +static struct kernel_param_ops walk_step_ops = { + .set = set_walk_step, + .get = param_get_uint, +}; + +static unsigned int __read_mostly walk_step = 512; // in PAGE_SIZE +module_param_cb(walk_step, &walk_step_ops, &walk_step, 0644); + +static unsigned int resched_step = 10; +module_param(resched_step, uint, 0644); + +static unsigned long pagetype_size[16] = { + [PTE_ACCESSED] = PAGE_SIZE, /* 4k page */ + [PMD_ACCESSED] = PMD_SIZE, /* 2M page */ + [PUD_PRESENT] = PUD_SIZE, /* 1G page */ + + [PTE_DIRTY_M] = PAGE_SIZE, + [PMD_DIRTY_M] = PMD_SIZE, + + [PTE_IDLE] = PAGE_SIZE, + [PMD_IDLE] = PMD_SIZE, + [PMD_IDLE_PTES] = PMD_SIZE, + + [PTE_HOLE] = PAGE_SIZE, + [PMD_HOLE] = PMD_SIZE, +}; + +static void u64_to_u8(uint64_t n, uint8_t *p) +{ + p += sizeof(uint64_t) - 1; + + *p-- = n; n >>= 8; + *p-- = n; n >>= 8; + *p-- = n; n >>= 8; + *p-- = n; n >>= 8; + + *p-- = n; n >>= 8; + *p-- = n; n >>= 8; + *p-- = n; n >>= 8; + *p = n; +} + +static void dump_pic(struct page_idle_ctrl *pic) +{ + debug_printk("page_idle_ctrl: pie_read=%d pie_read_max=%d", + pic->pie_read, + pic->pie_read_max); + debug_printk(" buf_size=%d bytes_copied=%d next_hva=%pK", + pic->buf_size, + pic->bytes_copied, + pic->next_hva); + debug_printk(" restart_gpa=%pK pa_to_hva=%pK\n", + pic->restart_gpa, + pic->gpa_to_hva); +} + +#ifdef CONFIG_ARM64 +static int if_pmd_huge(pmd_t pmd) +{ + return pmd_val(pmd) && !(pmd_val(pmd) & PMD_TABLE_BIT); +} + +static int if_pud_huge(pud_t pud) +{ +#ifndef __PAGETABLE_PMD_FOLDED + return pud_val(pud) && !(pud_val(pud) & PUD_TABLE_BIT); +#else + return 0; +#endif +} +#endif + +static void pic_report_addr(struct page_idle_ctrl *pic, unsigned long addr) +{ + unsigned long hva; + + pic->kpie[pic->pie_read++] = PIP_CMD_SET_HVA; + hva = addr; + u64_to_u8(hva, &pic->kpie[pic->pie_read]); + pic->pie_read += sizeof(uint64_t); + dump_pic(pic); +} + +static int pic_add_page(struct page_idle_ctrl *pic, + unsigned long addr, + unsigned long next, + enum ProcIdlePageType page_type) +{ + unsigned long page_size = pagetype_size[page_type]; + + dump_pic(pic); + + /* align kernel/user vision of cursor position */ + next = round_up(next, page_size); + + if (!pic->pie_read || + addr + pic->gpa_to_hva != pic->next_hva) { + /* merge hole */ + if (page_type == PTE_HOLE || + page_type == PMD_HOLE) { + set_restart_gpa(next, "PTE_HOLE|PMD_HOLE"); + return 0; + } + + if (addr + pic->gpa_to_hva < pic->next_hva) { + debug_printk("page_idle: addr moves backwards\n"); + WARN_ONCE(1, "page_idle: addr moves backwards"); + } + + if (pic->pie_read + sizeof(uint64_t) + 2 >= pic->pie_read_max) { + set_restart_gpa(addr, "PAGE_IDLE_KBUF_FULL"); + return PAGE_IDLE_KBUF_FULL; + } + + pic_report_addr(pic, round_down(addr, page_size) + + pic->gpa_to_hva); + } else { + if (PIP_TYPE(pic->kpie[pic->pie_read - 1]) == page_type && + PIP_SIZE(pic->kpie[pic->pie_read - 1]) < 0xF) { + set_next_hva(next 
+ pic->gpa_to_hva, "IN-PLACE INC"); + set_restart_gpa(next, "IN-PLACE INC"); + pic->kpie[pic->pie_read - 1]++; + WARN_ONCE(page_size < next-addr, "next-addr too large"); + return 0; + } + if (pic->pie_read >= pic->pie_read_max) { + set_restart_gpa(addr, "PAGE_IDLE_KBUF_FULL"); + return PAGE_IDLE_KBUF_FULL; + } + } + + set_next_hva(next + pic->gpa_to_hva, "NEW-ITEM"); + set_restart_gpa(next, "NEW-ITEM"); + pic->kpie[pic->pie_read] = PIP_COMPOSE(page_type, 1); + pic->pie_read++; + + return 0; +} + +static int init_page_idle_ctrl_buffer(struct page_idle_ctrl *pic) +{ + pic->pie_read = 0; + pic->pie_read_max = min(PAGE_IDLE_KBUF_SIZE, + pic->buf_size - pic->bytes_copied); + /* reserve space for PIP_CMD_SET_HVA in the end */ + pic->pie_read_max -= sizeof(uint64_t) + 1; + + /* + * Align with PAGE_IDLE_KBUF_FULL + * logic in pic_add_page(), to avoid pic->pie_read = 0 when + * PAGE_IDLE_KBUF_FULL happened. + */ + if (pic->pie_read_max <= sizeof(uint64_t) + 2) + return PAGE_IDLE_KBUF_FULL; + + memset(pic->kpie, 0, sizeof(pic->kpie)); + return 0; +} + +static void setup_page_idle_ctrl(struct page_idle_ctrl *pic, void *buf, + int buf_size, unsigned int flags) +{ + pic->buf = buf; + pic->buf_size = buf_size; + pic->bytes_copied = 0; + pic->next_hva = 0; + pic->gpa_to_hva = 0; + pic->restart_gpa = 0; + pic->last_va = 0; + pic->flags = flags; +} + +static int page_idle_copy_user(struct page_idle_ctrl *pic, + unsigned long start, unsigned long end) +{ + int bytes_read; + int ret; + + dump_pic(pic); + + bytes_read = pic->pie_read; + if (!bytes_read) + return 0; + + ret = copy_to_user(pic->buf, pic->kpie, bytes_read); + if (ret) + return -EFAULT; + + pic->buf += bytes_read; + pic->bytes_copied += bytes_read; + if (pic->bytes_copied >= pic->buf_size) + return PAGE_IDLE_BUF_FULL; + + ret = init_page_idle_ctrl_buffer(pic); + if (ret) + return ret; + + cond_resched(); + return 0; +} + +#ifdef CONFIG_X86_64 +static int vm_walk_host_range(unsigned long long start, + unsigned long end, + struct mm_walk *walk) +{ + int ret; + struct page_idle_ctrl *pic = walk->private; + unsigned long tmp_gpa_to_hva = pic->gpa_to_hva; + + pic->gpa_to_hva = 0; + read_unlock(&pic->kvm->mmu_lock); + mmap_read_lock(walk->mm); + local_irq_disable(); + ret = walk_page_range(walk->mm, start + tmp_gpa_to_hva, end + tmp_gpa_to_hva, + walk->ops, walk->private); + local_irq_enable(); + mmap_read_unlock(walk->mm); + pic->gpa_to_hva = tmp_gpa_to_hva; + if (pic->flags & VM_SCAN_HOST) { + pic->restart_gpa -= tmp_gpa_to_hva; + pic->flags &= ~VM_SCAN_HOST; + } + if (ret != PAGE_IDLE_KBUF_FULL && end > pic->restart_gpa) + pic->restart_gpa = end; + + /* ept page table may change after spin_unlock, rescan vm from root ept */ + ret |= RET_RESCAN_FLAG; + + return ret; +} + +static int ept_pte_range(struct page_idle_ctrl *pic, + pmd_t *pmd, unsigned long addr, unsigned long end, + struct mm_walk *walk) +{ + pte_t *pte; + enum ProcIdlePageType page_type; + int err = 0; + + pte = pte_offset_kernel(pmd, addr); + do { + if (KVM_CHECK_INVALID_SPTE(pte->pte)) { + page_type = PTE_IDLE; + } else if (!ept_pte_present(*pte)) { + err = vm_walk_host_range(addr, end, walk); + goto next; + } else if (!test_and_clear_bit(_PAGE_BIT_EPT_ACCESSED, + (unsigned long *) &pte->pte)) + page_type = PTE_IDLE; + else { + page_type = PTE_ACCESSED; + if (pic->flags & SCAN_DIRTY_PAGE) { + if (test_and_clear_bit(_PAGE_BIT_EPT_DIRTY, + (unsigned long *) &pte->pte)) + page_type = PTE_DIRTY_M; + } + } + + err = pic_add_page(pic, addr, addr + PAGE_SIZE, page_type); +next: + if (err) + 
break; + } while (pte++, addr += PAGE_SIZE, addr != end); + + return err; +} + +static enum ProcIdlePageType ept_huge_accessed(pmd_t *pmd, unsigned long addr, + unsigned long end) +{ + int accessed = PMD_IDLE; + pte_t *pte; + + pte = pte_offset_kernel(pmd, addr); + do { + if (!KVM_CHECK_INVALID_SPTE(pte->pte)) + continue; + if (!ept_pte_present(*pte)) + continue; + if (!test_and_clear_bit(_PAGE_BIT_EPT_ACCESSED, + (unsigned long *)&pte->pte)) + continue; + accessed = PMD_ACCESSED; + } while (pte++, addr += PAGE_SIZE, addr != end); + + return accessed; +} + +static int ept_pmd_range(struct page_idle_ctrl *pic, + pud_t *pud, unsigned long addr, unsigned long end, + struct mm_walk *walk) +{ + pmd_t *pmd; + unsigned long next; + enum ProcIdlePageType page_type; + enum ProcIdlePageType pte_page_type; + int err = 0; + + if (pic->flags & SCAN_HUGE_PAGE) + pte_page_type = PMD_IDLE_PTES; + else + pte_page_type = IDLE_PAGE_TYPE_MAX; + + pmd = pmd_offset(pud, addr); + do { + next = pmd_addr_end(addr, end); + if (KVM_CHECK_INVALID_SPTE(pmd->pmd)) + page_type = PMD_IDLE; + else if (!ept_pmd_present(*pmd)) { + err = vm_walk_host_range(addr, next, walk); + goto next; + } else if (!pmd_large(*pmd)) { + if (pic->flags & SCAN_AS_HUGE) + page_type = ept_huge_accessed(pmd, addr, next); + else + page_type = pte_page_type; + } else if (!test_and_clear_bit(_PAGE_BIT_EPT_ACCESSED, + (unsigned long *)pmd)) + page_type = PMD_IDLE; + else { + page_type = PMD_ACCESSED; + if ((pic->flags & SCAN_DIRTY_PAGE) && + test_and_clear_bit(_PAGE_BIT_EPT_DIRTY, + (unsigned long *) pmd)) + page_type = PMD_DIRTY_M; + } + + if (page_type != IDLE_PAGE_TYPE_MAX) + err = pic_add_page(pic, addr, next, page_type); + else + err = ept_pte_range(pic, pmd, addr, next, walk); + +next: + if (err) + break; + } while (pmd++, addr = next, addr != end); + + return err; +} + + +static int ept_pud_range(struct page_idle_ctrl *pic, + p4d_t *p4d, unsigned long addr, unsigned long end, + struct mm_walk *walk) +{ + pud_t *pud; + unsigned long next; + int err = 0; + + pud = pud_offset(p4d, addr); + do { + next = pud_addr_end(addr, end); + + if (!ept_pud_present(*pud)) { + err = vm_walk_host_range(addr, next, walk); + goto next; + } + + if (pud_large(*pud)) + err = pic_add_page(pic, addr, next, PUD_PRESENT); + else + err = ept_pmd_range(pic, pud, addr, next, walk); + +next: + if (err) + break; + } while (pud++, addr = next, addr != end); + + return err; +} + +static int ept_p4d_range(struct page_idle_ctrl *pic, + p4d_t *p4d, unsigned long addr, unsigned long end, + struct mm_walk *walk) +{ + unsigned long next; + int err = 0; + + p4d += p4d_index(addr); + do { + next = p4d_addr_end(addr, end); + if (!ept_p4d_present(*p4d)) { + set_restart_gpa(next, "P4D_HOLE"); + continue; + } + + err = ept_pud_range(pic, p4d, addr, next, walk); + if (err) + break; + } while (p4d++, addr = next, addr != end); + + return err; +} + +static int ept_pgd_range(struct page_idle_ctrl *pic, + pgd_t *pgd, + unsigned long addr, + unsigned long end, + struct mm_walk *walk) +{ + p4d_t *p4d; + unsigned long next; + int err = 0; + + pgd = pgd_offset_pgd(pgd, addr); + do { + next = pgd_addr_end(addr, end); + if (!ept_pgd_present(*pgd)) { + set_restart_gpa(next, "PGD_HOLE"); + continue; + } + + p4d = (p4d_t *)pgd_page_vaddr(*pgd); + err = ept_p4d_range(pic, p4d, addr, next, walk); + if (err) + break; + } while (pgd++, addr = next, addr != end); + + return err; +} + +static int ept_page_range(struct page_idle_ctrl *pic, + unsigned long addr, + unsigned long end, + struct mm_walk *walk) 
+{ + struct kvm_vcpu *vcpu; + struct kvm_mmu *mmu; + uint64_t *ept_root; + int err = 0; + + WARN_ON(addr >= end); + + read_lock(&pic->kvm->mmu_lock); + + vcpu = kvm_get_vcpu(pic->kvm, 0); + if (!vcpu) { + pic->gpa_to_hva = 0; + set_restart_gpa(TASK_SIZE, "NO-VCPU"); + read_unlock(&pic->kvm->mmu_lock); + return -EINVAL; + } + + mmu = kvm_arch_mmu_pointer(vcpu); + if (!VALID_PAGE(mmu->root.hpa)) { + pic->gpa_to_hva = 0; + set_restart_gpa(TASK_SIZE, "NO-HPA"); + read_unlock(&pic->kvm->mmu_lock); + return -EINVAL; + } + + ept_root = __va(mmu->root.hpa); + + /* Walk start at p4d when vm has 4 level table pages */ + if (mmu->root_role.level != 4) + err = ept_pgd_range(pic, (pgd_t *)ept_root, addr, end, walk); + else + err = ept_p4d_range(pic, (p4d_t *)ept_root, addr, end, walk); + + /* mmu_lock is unlock in vm_walk_host_range which will unlock mmu_lock + * and RET_RESCAN_FLAG will be set in ret value + */ + if (!(err & RET_RESCAN_FLAG)) + read_unlock(&pic->kvm->mmu_lock); + else + err &= ~RET_RESCAN_FLAG; + + return err; +} + +static int ept_idle_supports_cpu(struct kvm *kvm) +{ + struct kvm_vcpu *vcpu; + struct kvm_mmu *mmu; + int ret; + + vcpu = kvm_get_vcpu(kvm, 0); + if (!vcpu) + return -EINVAL; + + read_lock(&kvm->mmu_lock); + mmu = kvm_arch_mmu_pointer(vcpu); + if (kvm_mmu_ad_disabled(mmu)) { + pr_notice("CPU does not support EPT A/D bits tracking\n"); + ret = -EINVAL; + } else if (mmu->root_role.level < 4 || + (mmu->root_role.level == 5 && !pgtable_l5_enabled())) { + pr_notice("Unsupported EPT level %d\n", mmu->root_role.level); + ret = -EINVAL; + } else + ret = 0; + read_unlock(&kvm->mmu_lock); + + return ret; +} + +#else +static inline phys_addr_t stage2_range_addr_end(phys_addr_t addr, phys_addr_t end) +{ + phys_addr_t size = kvm_granule_size(KVM_PGTABLE_MIN_BLOCK_LEVEL); + phys_addr_t boundary = ALIGN_DOWN(addr + size, size); + + return (boundary - 1 < end - 1) ? 
boundary : end; +} + +static int arm_pte_range(struct page_idle_ctrl *pic, + pmd_t *pmd, unsigned long addr, unsigned long end) +{ + pte_t *pte; + enum ProcIdlePageType page_type; + int err = 0; + + pte = pte_offset_kernel(pmd, addr); + do { + if (!pte_present(*pte)) + page_type = PTE_HOLE; + else if (!test_and_clear_bit(_PAGE_MM_BIT_ACCESSED, + (unsigned long *) &pte->pte)) + page_type = PTE_IDLE; + else + page_type = PTE_ACCESSED; + + err = pic_add_page(pic, addr, addr + PAGE_SIZE, page_type); + if (err) + break; + } while (pte++, addr += PAGE_SIZE, addr != end); + + return err; +} + +static int arm_pmd_range(struct page_idle_ctrl *pic, + pud_t *pud, unsigned long addr, unsigned long end) +{ + pmd_t *pmd; + unsigned long next; + enum ProcIdlePageType page_type; + enum ProcIdlePageType pte_page_type; + int err = 0; + + if (pic->flags & SCAN_HUGE_PAGE) + pte_page_type = PMD_IDLE_PTES; + else + pte_page_type = IDLE_PAGE_TYPE_MAX; + + pmd = pmd_offset(pud, addr); + do { + next = pmd_addr_end(addr, end); + if (!pmd_present(*pmd)) + page_type = PMD_HOLE; + else if (!if_pmd_thp_or_huge(*pmd)) + page_type = pte_page_type; + else if (!test_and_clear_bit(_PAGE_MM_BIT_ACCESSED, + (unsigned long *)pmd)) + page_type = PMD_IDLE; + else + page_type = PMD_ACCESSED; + + if (page_type != IDLE_PAGE_TYPE_MAX) + err = pic_add_page(pic, addr, next, page_type); + else + err = arm_pte_range(pic, pmd, addr, next); + if (err) + break; + } while (pmd++, addr = next, addr != end); + + return err; +} + +static int arm_pud_range(struct page_idle_ctrl *pic, + p4d_t *p4d, unsigned long addr, unsigned long end) +{ + pud_t *pud = (pud_t *)p4d; + unsigned long next; + int err = 0; + + pud += pud_index(addr); + do { + next = pud_addr_end(addr, end); + if (!pud_present(*pud)) { + set_restart_gpa(next, "PUD_HOLE"); + continue; + } + + if (if_pud_huge(*pud)) + err = pic_add_page(pic, addr, next, PUD_PRESENT); + else + err = arm_pmd_range(pic, pud, addr, next); + if (err) + break; + } while (pud++, addr = next, addr != end); + + return err; +} + +static int arm_p4d_range(struct page_idle_ctrl *pic, + pgd_t *pgd, unsigned long addr, unsigned long end) +{ + p4d_t *p4d; + unsigned long next; + int err = 0; + + p4d = p4d_offset(pgd, addr); + do { + next = p4d_addr_end(addr, end); + if (!p4d_present(*p4d)) { + set_restart_gpa(next, "P4D_HOLE"); + continue; + } + + err = arm_pud_range(pic, p4d, addr, next); + if (err) + break; + } while (p4d++, addr = next, addr != end); + + return err; +} + +static int arm_page_range(struct page_idle_ctrl *pic, + unsigned long addr, + unsigned long end) +{ + pgd_t *pgd; + unsigned long next; + struct kvm *kvm = pic->kvm; + int err = 0; + + WARN_ON(addr >= end); + + read_lock(&pic->kvm->mmu_lock); + pgd = (pgd_t *)kvm->arch.mmu.pgt->pgd + pgd_index(addr) * PTRS_PER_PTE; + read_unlock(&pic->kvm->mmu_lock); + + local_irq_disable(); + do { + next = pgd_addr_end(addr, end); + if (!pgd_present(*pgd)) { + set_restart_gpa(next, "PGD_HOLE"); + continue; + } + + err = arm_p4d_range(pic, pgd, addr, next); + if (err) + break; + } while (pgd++, addr = next, addr != end); + + local_irq_enable(); + return err; +} +#endif + +/* + * Depending on whether hva falls in a memslot: + * + * 1) found => return gpa and remaining memslot size in *addr_range + * + * |<----- addr_range --------->| + * [ mem slot ] + * ^hva + * + * 2) not found => return hole size in *addr_range + * + * |<----- addr_range --------->| + * [first mem slot above hva ] + * ^hva + * + * If hva is above all mem slots, *addr_range will be ~0UL. 
+ * We can finish read(2). + */ +static unsigned long vm_idle_find_gpa(struct page_idle_ctrl *pic, + unsigned long hva, + unsigned long *addr_range) +{ + struct kvm *kvm = pic->kvm; + struct kvm_memslots *slots; + struct kvm_memory_slot *memslot; + unsigned long hva_end; + gfn_t gfn; + int bkt; + + *addr_range = ~0UL; + mutex_lock(&kvm->slots_lock); + slots = kvm_memslots(pic->kvm); + kvm_for_each_memslot(memslot, bkt, slots) { + hva_end = memslot->userspace_addr + + (memslot->npages << PAGE_SHIFT); + + if (hva >= memslot->userspace_addr && hva < hva_end) { + gpa_t gpa; + + gfn = hva_to_gfn_memslot(hva, memslot); + *addr_range = hva_end - hva; + gpa = gfn_to_gpa(gfn); + mutex_unlock(&kvm->slots_lock); + return gpa; + } + + if (memslot->userspace_addr > hva) + *addr_range = min(*addr_range, + memslot->userspace_addr - hva); + } + mutex_unlock(&kvm->slots_lock); + return INVALID_PAGE; +} + +static inline unsigned long mask_to_size(unsigned long mask) +{ + return ~mask + 1; +} + +static int mm_idle_hugetlb_entry(pte_t *pte, unsigned long hmask, + unsigned long addr, unsigned long next, + struct mm_walk *walk); +static int vm_idle_hugetlb_entry(pte_t *pte, unsigned long hmask, + unsigned long addr, unsigned long next, + struct mm_walk *walk) +{ + struct page_idle_ctrl *pic = walk->private; + enum ProcIdlePageType page_type; + + pic->flags |= VM_SCAN_HOST; + + /* hugetlb page table entry of vm maybe not present while page is resident + * in address_space + */ + if (mask_to_size(hmask) != PUD_SIZE && !pte_present(*pte) && + vm_handle_pte_hole != NULL) { + page_type = vm_handle_pte_hole(addr, next, -1, walk); + if (page_type < IDLE_PAGE_TYPE_MAX) + return pic_add_page(pic, addr, next, page_type); + } + + return mm_idle_hugetlb_entry(pte, hmask, addr, next, walk); +} + +static int vm_idle_pte_hole(unsigned long addr, unsigned long next, int depth, struct mm_walk *walk) +{ + struct page_idle_ctrl *pic = walk->private; + enum ProcIdlePageType pagetype; + + if (vm_handle_pte_hole == NULL) + return 0; + + pagetype = vm_handle_pte_hole(addr, next, depth, walk); + if (pagetype >= IDLE_PAGE_TYPE_MAX) + return 0; + + debug_printk("scan pte hole addr %pK type %d\n", addr, pagetype); + pic->flags |= VM_SCAN_HOST; + return pic_add_page(pic, addr, next, pagetype); +} + +static int mm_idle_pmd_entry(pmd_t *pmd, unsigned long addr, + unsigned long next, struct mm_walk *walk); +static int vm_idle_pmd_entry(pmd_t *pmd, unsigned long addr, + unsigned long next, struct mm_walk *walk) +{ + struct page_idle_ctrl *pic = walk->private; + + pic->flags |= VM_SCAN_HOST; + return mm_idle_pmd_entry(pmd, addr, next, walk); +} + +static int mm_idle_pud_entry(pud_t *pud, unsigned long addr, + unsigned long next, struct mm_walk *walk); +static int vm_idle_pud_entry(pud_t *pud, unsigned long addr, + unsigned long next, struct mm_walk *walk) +{ + struct page_idle_ctrl *pic = walk->private; + + pic->flags |= VM_SCAN_HOST; + return mm_idle_pud_entry(pud, addr, next, walk); +} + +static int vm_idle_walk_hva_range(struct page_idle_ctrl *pic, + unsigned long start, unsigned long end, + struct mm_walk *walk) +{ + unsigned long gpa_addr; + unsigned long gpa_next; + unsigned long gpa_end; + unsigned long addr_range; + unsigned long va_end; + int ret; + int steps; + +#ifdef CONFIG_X86_64 + ret = ept_idle_supports_cpu(pic->kvm); + if (ret) + return ret; +#endif + + ret = init_page_idle_ctrl_buffer(pic); + if (ret) + return ret; + + for (; start < end;) { + gpa_addr = vm_idle_find_gpa(pic, start, &addr_range); + + if (gpa_addr == 
INVALID_PAGE) { + pic->gpa_to_hva = 0; + if (addr_range == ~0UL) { + set_restart_gpa(TASK_SIZE, "EOF"); + va_end = end; + } else { + start += addr_range; + set_restart_gpa(start, "OUT-OF-SLOT"); + va_end = start; + } + } else { + pic->gpa_to_hva = start - gpa_addr; + gpa_end = gpa_addr + addr_range; + steps = 0; + for (; gpa_addr < gpa_end;) { + gpa_next = min(gpa_end, gpa_addr + walk_step * PAGE_SIZE); +#ifdef CONFIG_ARM64 + ret = arm_page_range(pic, gpa_addr, gpa_next); +#else + ret = ept_page_range(pic, gpa_addr, gpa_next, walk); +#endif + gpa_addr = pic->restart_gpa; + + if (ret) + break; + + if (++steps >= resched_step) { + cond_resched(); + steps = 0; + } + } + va_end = pic->gpa_to_hva + gpa_end; + } + + start = pic->restart_gpa + pic->gpa_to_hva; + ret = page_idle_copy_user(pic, start, va_end); + if (ret) + break; + } + + if (start > pic->next_hva) + set_next_hva(start, "NEXT-START"); + + if (pic->bytes_copied) + ret = 0; + return ret; +} + +static int mm_idle_test_walk(unsigned long start, unsigned long end, + struct mm_walk *walk); +static ssize_t vm_idle_read(struct file *file, char *buf, + size_t count, loff_t *ppos) +{ + struct mm_struct *mm = file->private_data; + struct mm_walk mm_walk = {}; + struct mm_walk_ops mm_walk_ops = {}; + struct page_idle_ctrl *pic; + unsigned long hva_start = *ppos; + unsigned long hva_end = hva_start + (count << (3 + PAGE_SHIFT)); + int ret; + + pic = kzalloc(sizeof(*pic), GFP_KERNEL); + if (!pic) + return -ENOMEM; + + setup_page_idle_ctrl(pic, buf, count, file->f_flags); + pic->kvm = mm_kvm(mm); + + mm_walk_ops.pmd_entry = vm_idle_pmd_entry; + mm_walk_ops.pud_entry = vm_idle_pud_entry; + mm_walk_ops.hugetlb_entry = vm_idle_hugetlb_entry; + mm_walk_ops.pte_hole = vm_idle_pte_hole; + mm_walk_ops.test_walk = mm_idle_test_walk; + + mm_walk.mm = mm; + mm_walk.ops = &mm_walk_ops; + mm_walk.private = pic; + + ret = vm_idle_walk_hva_range(pic, hva_start, hva_end, &mm_walk); + if (ret) + goto out_kvm; + + ret = pic->bytes_copied; + *ppos = pic->next_hva; +out_kvm: + kfree(pic); + return ret; + +} + +static ssize_t mm_idle_read(struct file *file, char *buf, + size_t count, loff_t *ppos); + +static ssize_t page_scan_read(struct file *file, char *buf, + size_t count, loff_t *ppos) +{ + struct mm_struct *mm = file->private_data; + unsigned long hva_start = *ppos; + unsigned long hva_end = hva_start + (count << (3 + PAGE_SHIFT)); + + if ((hva_start >= TASK_SIZE) || (hva_end >= TASK_SIZE)) { + debug_printk("page_idle_read past TASK_SIZE: %pK %pK %lx\n", + hva_start, hva_end, TASK_SIZE); + return 0; + } + if (hva_end <= hva_start) { + debug_printk("page_idle_read past EOF: %pK %pK\n", + hva_start, hva_end); + return 0; + } + if (*ppos & (PAGE_SIZE - 1)) { + debug_printk("page_idle_read unaligned ppos: %pK\n", + hva_start); + return -EINVAL; + } + if (count < PAGE_IDLE_BUF_MIN) { + debug_printk("page_idle_read small count: %lx\n", + (unsigned long)count); + return -EINVAL; + } + + if (!mm_kvm(mm)) + return mm_idle_read(file, buf, count, ppos); + + return vm_idle_read(file, buf, count, ppos); +} + +static int page_scan_open(struct inode *inode, struct file *file) +{ + if (!try_module_get(THIS_MODULE)) + return -EBUSY; + + return 0; +} + +static int page_scan_release(struct inode *inode, struct file *file) +{ + struct mm_struct *mm = file->private_data; + struct kvm *kvm; + int ret = 0; + + if (!mm) { + ret = -EBADF; + goto out; + } + + kvm = mm_kvm(mm); + if (!kvm) { + ret = -EINVAL; + goto out; + } +#ifdef CONFIG_X86_64 + write_lock(&kvm->mmu_lock); + 
kvm_flush_remote_tlbs(kvm); + write_unlock(&kvm->mmu_lock); +#endif + +out: + module_put(THIS_MODULE); + return ret; +} + +static int mm_idle_pmd_large(pmd_t pmd) +{ +#ifdef CONFIG_ARM64 + return if_pmd_thp_or_huge(pmd); +#else + return pmd_large(pmd); +#endif +} + +static int mm_idle_pte_range(struct page_idle_ctrl *pic, pmd_t *pmd, + unsigned long addr, unsigned long next) +{ + enum ProcIdlePageType page_type; + pte_t *pte; + int err = 0; + + pte = pte_offset_kernel(pmd, addr); + do { + if (!pte_present(*pte)) + page_type = PTE_HOLE; + else if (pic->flags & SCAN_IGN_HOST) + page_type = PTE_IDLE; + else if (!test_and_clear_bit(_PAGE_MM_BIT_ACCESSED, + (unsigned long *) &pte->pte)) + page_type = PTE_IDLE; + else { + page_type = PTE_ACCESSED; + } + + err = pic_add_page(pic, addr, addr + PAGE_SIZE, page_type); + if (err) + break; + } while (pte++, addr += PAGE_SIZE, addr != next); + + return err; +} + +static int mm_idle_hugetlb_entry(pte_t *pte, unsigned long hmask, + unsigned long addr, unsigned long next, + struct mm_walk *walk) +{ + struct page_idle_ctrl *pic = walk->private; + enum ProcIdlePageType page_type; + unsigned long start = addr & hmask; /* hugepage may be splited in vm */ + int ret; + + if (mask_to_size(hmask) == PUD_SIZE) { + page_type = PUD_PRESENT; + goto add_page; + } + + if (!pte_present(*pte)) + page_type = PMD_HOLE; + else if (pic->flags & SCAN_IGN_HOST) + page_type = PMD_IDLE; + else if (!test_and_clear_bit(_PAGE_MM_BIT_ACCESSED, (unsigned long *)pte)) + page_type = PMD_IDLE; + else + page_type = PMD_ACCESSED; + +add_page: + ret = pic_add_page(pic, start, start + pagetype_size[page_type], page_type); + return ret; +} + +static int mm_idle_pmd_entry(pmd_t *pmd, unsigned long addr, + unsigned long next, struct mm_walk *walk) +{ + struct page_idle_ctrl *pic = walk->private; + enum ProcIdlePageType page_type; + enum ProcIdlePageType pte_page_type; + int err; + + /* + * Skip duplicate PMD_IDLE_PTES: when the PMD crosses VMA boundary, + * walk_page_range() can call on the same PMD twice. + */ + if ((addr & PMD_MASK) == (pic->last_va & PMD_MASK) && (pic->flags & SCAN_HUGE_PAGE)) { + debug_printk("ignore duplicate addr %pK %pK\n", + addr, pic->last_va); + set_restart_gpa(round_up(next, PMD_SIZE), "DUP_ADDR"); + return 0; + } + pic->last_va = addr; + + if (pic->flags & SCAN_HUGE_PAGE) + pte_page_type = PMD_IDLE_PTES; + else + pte_page_type = IDLE_PAGE_TYPE_MAX; + + if (!pmd_present(*pmd)) + page_type = PMD_HOLE; + else if (!mm_idle_pmd_large(*pmd)) + page_type = pte_page_type; + else if (!test_and_clear_bit(_PAGE_MM_BIT_ACCESSED, + (unsigned long *)pmd) || + pic->flags & SCAN_IGN_HOST) + page_type = PMD_IDLE; + else + page_type = PMD_ACCESSED; + + if (page_type != IDLE_PAGE_TYPE_MAX) + err = pic_add_page(pic, addr, next, page_type); + else + err = mm_idle_pte_range(pic, pmd, addr, next); + + return err; +} + +static int mm_idle_pud_entry(pud_t *pud, unsigned long addr, + unsigned long next, struct mm_walk *walk) +{ + struct page_idle_ctrl *pic = walk->private; + + spinlock_t *ptl = pud_trans_huge_lock(pud, walk->vma); + + if (ptl) { + if ((addr & PUD_MASK) != (pic->last_va & PUD_MASK)) { + pic_add_page(pic, addr, next, PUD_PRESENT); + pic->last_va = addr; + } + spin_unlock(ptl); + return 1; + } + + return 0; +} + +static int mm_idle_test_walk(unsigned long start, unsigned long end, + struct mm_walk *walk) +{ + struct vm_area_struct *vma = walk->vma; + struct page_idle_ctrl *pic = walk->private; + + /* If the specified page swapout is set, the untagged vma is skipped. 
*/ + if ((pic->flags & VMA_SCAN_FLAG) && !(vma->vm_flags & VM_SWAPFLAG)) + return 1; + + if (vma->vm_file) { + if (is_vm_hugetlb_page(vma)) + return 0; + if ((vma->vm_flags & (VM_WRITE|VM_MAYSHARE)) == VM_WRITE) + return 0; + return 1; + } + + return 0; +} + +static int mm_idle_walk_range(struct page_idle_ctrl *pic, + unsigned long start, + unsigned long end, + struct mm_walk *walk) +{ + struct vm_area_struct *vma; + int ret = 0; + + ret = init_page_idle_ctrl_buffer(pic); + if (ret) + return ret; + + for (; start < end;) { + mmap_read_lock(walk->mm); + vma = find_vma(walk->mm, start); + if (vma) { + if (end > vma->vm_start) { + local_irq_disable(); + ret = walk_page_range(walk->mm, start, end, + walk->ops, walk->private); + local_irq_enable(); + } else + set_restart_gpa(vma->vm_start, "VMA-HOLE"); + } else + set_restart_gpa(TASK_SIZE, "EOF"); + mmap_read_unlock(walk->mm); + WARN_ONCE(pic->gpa_to_hva, "non-zero gpa_to_hva"); + if (ret != PAGE_IDLE_KBUF_FULL && end > pic->restart_gpa) + pic->restart_gpa = end; + start = pic->restart_gpa; + ret = page_idle_copy_user(pic, start, end); + if (ret) + break; + } + + if (start > pic->next_hva) + set_next_hva(start, "NEXT-START"); + + if (pic->bytes_copied) { + if (ret != PAGE_IDLE_BUF_FULL && pic->next_hva < end) + debug_printk("partial scan: next_hva=%pK end=%pK\n", + pic->next_hva, end); + ret = 0; + } else + debug_printk("nothing read"); + return ret; +} + +static ssize_t mm_idle_read(struct file *file, char *buf, + size_t count, loff_t *ppos) +{ + struct mm_struct *mm = file->private_data; + struct mm_walk_ops *mm_walk_ops = NULL; + struct mm_walk mm_walk = {}; + struct page_idle_ctrl *pic; + unsigned long va_start = *ppos; + unsigned long va_end = va_start + (count << (3 + PAGE_SHIFT)); + int ret; + + if (va_end <= va_start) { + debug_printk("%s past EOF: %pK %pK\n", + __func__, va_start, va_end); + return 0; + } + if (*ppos & (PAGE_SIZE - 1)) { + debug_printk("%s unaligned ppos: %pK\n", + __func__, va_start); + return -EINVAL; + } + if (count < PAGE_IDLE_BUF_MIN) { + debug_printk("%s small count: %lx\n", + __func__, (unsigned long)count); + return -EINVAL; + } + + pic = kzalloc(sizeof(*pic), GFP_KERNEL); + if (!pic) + return -ENOMEM; + + mm_walk_ops = kzalloc(sizeof(struct mm_walk_ops), GFP_KERNEL); + if (!mm_walk_ops) { + kfree(pic); + return -ENOMEM; + } + + setup_page_idle_ctrl(pic, buf, count, file->f_flags); + + mm_walk_ops->pmd_entry = mm_idle_pmd_entry; + mm_walk_ops->pud_entry = mm_idle_pud_entry; + mm_walk_ops->hugetlb_entry = mm_idle_hugetlb_entry; + mm_walk_ops->test_walk = mm_idle_test_walk; + + mm_walk.mm = mm; + mm_walk.ops = mm_walk_ops; + mm_walk.private = pic; + mm_walk.pgd = NULL; + mm_walk.no_vma = false; + ret = mm_idle_walk_range(pic, va_start, va_end, &mm_walk); + if (ret) + goto out_free; + + ret = pic->bytes_copied; + *ppos = pic->next_hva; +out_free: + kfree(pic); + kfree(mm_walk_ops); + return ret; +} + +static long page_scan_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) +{ + void __user *argp = (void __user *)arg; + unsigned int flags; + + if (get_user(flags, (unsigned int __user *)argp)) + return -EFAULT; + flags &= ALL_SCAN_FLAGS; + + switch (cmd) { + case IDLE_SCAN_ADD_FLAGS: + filp->f_flags |= flags; + break; + case IDLE_SCAN_REMOVE_FLAGS: + filp->f_flags &= ~flags; + break; + case VMA_SCAN_ADD_FLAGS: + filp->f_flags |= flags; + break; + case VMA_SCAN_REMOVE_FLAGS: + filp->f_flags &= ~flags; + break; + default: + return -EOPNOTSUPP; + } + + return 0; +} + +extern struct file_operations 
proc_page_scan_operations; + +static int page_scan_entry(void) +{ + proc_page_scan_operations.flock(NULL, 1, NULL); + proc_page_scan_operations.owner = THIS_MODULE; + proc_page_scan_operations.read = page_scan_read; + proc_page_scan_operations.open = page_scan_open; + proc_page_scan_operations.release = page_scan_release; + proc_page_scan_operations.unlocked_ioctl = page_scan_ioctl; + proc_page_scan_operations.flock(NULL, 0, NULL); + + return 0; +} + +static void page_scan_exit(void) +{ + proc_page_scan_operations.flock(NULL, 1, NULL); + proc_page_scan_operations.owner = NULL; + proc_page_scan_operations.read = NULL; + proc_page_scan_operations.open = NULL; + proc_page_scan_operations.release = NULL; + proc_page_scan_operations.unlocked_ioctl = NULL; + proc_page_scan_operations.flock(NULL, 0, NULL); +} + +MODULE_LICENSE("GPL"); +module_init(page_scan_entry); +module_exit(page_scan_exit); diff --git a/fs/proc/etmem_scan.h b/fs/proc/etmem_scan.h new file mode 100644 index 0000000000000000000000000000000000000000..e109f7f350e1ae3d3d3cb06070e9ae9686ec3c1e --- /dev/null +++ b/fs/proc/etmem_scan.h @@ -0,0 +1,149 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _PAGE_IDLE_H +#define _PAGE_IDLE_H + +#include + +#define SCAN_HUGE_PAGE O_NONBLOCK /* only huge page */ +#define SCAN_SKIM_IDLE O_NOFOLLOW /* stop on PMD_IDLE_PTES */ +#define SCAN_DIRTY_PAGE O_NOATIME /* report pte/pmd dirty bit */ + +/* define to not used file flags */ +#define SCAN_AS_HUGE 0100000000 /* treat normal page as hugepage in vm */ +#define SCAN_IGN_HOST 0200000000 /* ignore host access when scan vm */ +#define VM_SCAN_HOST 0400000000 /* scan and add host page for vm hole(internal) */ +#define VMA_SCAN_FLAG 0x1000 /* scan the specifics vma with flag */ + +#define ALL_SCAN_FLAGS (SCAN_HUGE_PAGE | SCAN_SKIM_IDLE | SCAN_DIRTY_PAGE | \ + SCAN_AS_HUGE | SCAN_IGN_HOST | VM_SCAN_HOST | VMA_SCAN_FLAG) + +#define IDLE_SCAN_MAGIC 0x66 +#define IDLE_SCAN_ADD_FLAGS _IOW(IDLE_SCAN_MAGIC, 0x0, unsigned int) +#define IDLE_SCAN_REMOVE_FLAGS _IOW(IDLE_SCAN_MAGIC, 0x1, unsigned int) +#define VMA_SCAN_ADD_FLAGS _IOW(IDLE_SCAN_MAGIC, 0x2, unsigned int) +#define VMA_SCAN_REMOVE_FLAGS _IOW(IDLE_SCAN_MAGIC, 0x3, unsigned int) + +enum ProcIdlePageType { + PTE_ACCESSED, /* 4k page */ + PMD_ACCESSED, /* 2M page */ + PUD_PRESENT, /* 1G page */ + + PTE_DIRTY_M, + PMD_DIRTY_M, + + PTE_IDLE, + PMD_IDLE, + PMD_IDLE_PTES, /* all PTE idle */ + + PTE_HOLE, + PMD_HOLE, + + PIP_CMD, + + IDLE_PAGE_TYPE_MAX +}; + +#define PIP_TYPE(a) (0xf & (a >> 4)) +#define PIP_SIZE(a) (0xf & a) +#define PIP_COMPOSE(type, nr) ((type << 4) | nr) + +#define PIP_CMD_SET_HVA PIP_COMPOSE(PIP_CMD, 0) + +#ifndef INVALID_PAGE +#define INVALID_PAGE ~0UL +#endif + +#ifdef CONFIG_ARM64 +#define _PAGE_MM_BIT_ACCESSED 10 +#else +#define _PAGE_MM_BIT_ACCESSED _PAGE_BIT_ACCESSED +#endif + +#ifdef CONFIG_X86_64 +#define _PAGE_BIT_EPT_ACCESSED 8 +#define _PAGE_BIT_EPT_DIRTY 9 +#define _PAGE_EPT_ACCESSED (_AT(pteval_t, 1) << _PAGE_BIT_EPT_ACCESSED) +#define _PAGE_EPT_DIRTY (_AT(pteval_t, 1) << _PAGE_BIT_EPT_DIRTY) + +#define _PAGE_EPT_PRESENT (_AT(pteval_t, 7)) + +static inline int ept_pte_present(pte_t a) +{ + return pte_flags(a) & _PAGE_EPT_PRESENT; +} + +static inline int ept_pmd_present(pmd_t a) +{ + return pmd_flags(a) & _PAGE_EPT_PRESENT; +} + +static inline int ept_pud_present(pud_t a) +{ + return pud_flags(a) & _PAGE_EPT_PRESENT; +} + +static inline int ept_p4d_present(p4d_t a) +{ + return p4d_flags(a) & _PAGE_EPT_PRESENT; +} + +static inline int ept_pgd_present(pgd_t a) +{ + return 
pgd_flags(a) & _PAGE_EPT_PRESENT; +} + +static inline int ept_pte_accessed(pte_t a) +{ + return pte_flags(a) & _PAGE_EPT_ACCESSED; +} + +static inline int ept_pmd_accessed(pmd_t a) +{ + return pmd_flags(a) & _PAGE_EPT_ACCESSED; +} + +static inline int ept_pud_accessed(pud_t a) +{ + return pud_flags(a) & _PAGE_EPT_ACCESSED; +} + +static inline int ept_p4d_accessed(p4d_t a) +{ + return p4d_flags(a) & _PAGE_EPT_ACCESSED; +} + +static inline int ept_pgd_accessed(pgd_t a) +{ + return pgd_flags(a) & _PAGE_EPT_ACCESSED; +} +#endif + +extern struct file_operations proc_page_scan_operations; + +#define PAGE_IDLE_KBUF_FULL 1 +#define PAGE_IDLE_BUF_FULL 2 +#define PAGE_IDLE_BUF_MIN (sizeof(uint64_t) * 2 + 3) + +#define PAGE_IDLE_KBUF_SIZE 8000 + +struct page_idle_ctrl { + struct mm_struct *mm; + struct kvm *kvm; + + uint8_t kpie[PAGE_IDLE_KBUF_SIZE]; + int pie_read; + int pie_read_max; + + void __user *buf; + int buf_size; + int bytes_copied; + + unsigned long next_hva; /* GPA for EPT; VA for PT */ + unsigned long gpa_to_hva; + unsigned long restart_gpa; + unsigned long last_va; + + unsigned int flags; +}; + +#endif diff --git a/fs/proc/etmem_swap.c b/fs/proc/etmem_swap.c new file mode 100644 index 0000000000000000000000000000000000000000..4aad6b9db9a6bf5b8c9720fd63c7e0138b670ee1 --- /dev/null +++ b/fs/proc/etmem_swap.c @@ -0,0 +1,109 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static ssize_t swap_pages_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + char *p, *data, *data_ptr_res; + unsigned long vaddr; + struct mm_struct *mm = file->private_data; + struct page *page; + LIST_HEAD(pagelist); + int ret = 0; + + if (!mm || !mmget_not_zero(mm)) { + ret = -ESRCH; + goto out; + } + + if (count < 0) { + ret = -EOPNOTSUPP; + goto out_mm; + } + + data = memdup_user_nul(buf, count); + if (IS_ERR(data)) { + ret = PTR_ERR(data); + goto out_mm; + } + + data_ptr_res = data; + while ((p = strsep(&data, "\n")) != NULL) { + if (!*p) + continue; + + ret = kstrtoul(p, 16, &vaddr); + if (ret != 0) + continue; + + /* If get page struct failed, ignore it, get next page */ + page = get_page_from_vaddr(mm, vaddr); + if (!page) + continue; + + add_page_for_swap(page, &pagelist); + } + + if (!list_empty(&pagelist)) + reclaim_pages(&pagelist); + + ret = count; + kfree(data_ptr_res); +out_mm: + mmput(mm); +out: + return ret; +} + +static int swap_pages_open(struct inode *inode, struct file *file) +{ + if (!try_module_get(THIS_MODULE)) + return -EBUSY; + + return 0; +} + +static int swap_pages_release(struct inode *inode, struct file *file) +{ + module_put(THIS_MODULE); + return 0; +} + +extern struct file_operations proc_swap_pages_operations; + +static int swap_pages_entry(void) +{ + proc_swap_pages_operations.flock(NULL, 1, NULL); + proc_swap_pages_operations.owner = THIS_MODULE; + proc_swap_pages_operations.write = swap_pages_write; + proc_swap_pages_operations.open = swap_pages_open; + proc_swap_pages_operations.release = swap_pages_release; + proc_swap_pages_operations.flock(NULL, 0, NULL); + + return 0; +} + +static void swap_pages_exit(void) +{ + proc_swap_pages_operations.flock(NULL, 1, NULL); + proc_swap_pages_operations.owner = NULL; + proc_swap_pages_operations.write = NULL; + proc_swap_pages_operations.open = NULL; + proc_swap_pages_operations.release = NULL; + proc_swap_pages_operations.flock(NULL, 0, NULL); +} + +MODULE_LICENSE("GPL"); 
+module_init(swap_pages_entry); +module_exit(swap_pages_exit); diff --git a/fs/proc/internal.h b/fs/proc/internal.h index 9a8f32f21ff569d0dc40e1d7c31b63b9aea293bc..be6d5dfc330cc161ff40bc7ea779315be7614e64 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h @@ -303,6 +303,10 @@ extern const struct file_operations proc_pid_smaps_operations; extern const struct file_operations proc_pid_smaps_rollup_operations; extern const struct file_operations proc_clear_refs_operations; extern const struct file_operations proc_pagemap_operations; +#ifdef CONFIG_ETMEM +extern const struct file_operations proc_mm_idle_operations; +extern const struct file_operations proc_mm_swap_operations; +#endif extern unsigned long task_vsize(struct mm_struct *); extern unsigned long task_statm(struct mm_struct *, diff --git a/include/linux/etmem.h b/include/linux/etmem.h new file mode 100644 index 0000000000000000000000000000000000000000..9ec9657e56ed0531fe33bdf97a7cb48aa1e52f5b --- /dev/null +++ b/include/linux/etmem.h @@ -0,0 +1,47 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __MM_ETMEM_H_ +#define __MM_ETMEM_H_ + +#include +#include +#include +#include +#include + +#ifdef CONFIG_ETMEM + +#if IS_ENABLED(CONFIG_KVM) +static inline struct kvm *mm_kvm(struct mm_struct *mm) +{ + return mm->kvm; +} +#else +static inline struct kvm *mm_kvm(struct mm_struct *mm) +{ + return NULL; +} +#endif + +extern int add_page_for_swap(struct page *page, struct list_head *pagelist); +extern struct page *get_page_from_vaddr(struct mm_struct *mm, + unsigned long vaddr); +extern struct kobj_attribute kernel_swap_enable_attr; +extern bool kernel_swap_enabled(void); +#else /* !CONFIG_ETMEM */ +static inline int add_page_for_swap(struct page *page, struct list_head *pagelist) +{ + return 0; +} + +static inline struct page *get_page_from_vaddr(struct mm_struct *mm, + unsigned long vaddr) +{ + return NULL; +} + +static inline bool kernel_swap_enabled(void) +{ + return true; +} +#endif /* #ifdef CONFIG_ETMEM */ +#endif /* define __MM_ETMEM_H_ */ diff --git a/include/linux/mm.h b/include/linux/mm.h index 48a6b0865175b0ef398eebdd7448092a4265fe95..d4bd079ca9491ba4994444803f83fe2f4e10051a 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -308,6 +308,10 @@ extern unsigned int kobjsize(const void *objp); # define VM_SOFTDIRTY 0 #endif +#ifdef CONFIG_ETMEM +#define VM_SWAPFLAG 0x400000000000000 /* memory swap out flag in vma */ +#endif + #define VM_MIXEDMAP 0x10000000 /* Can contain "struct page" and pure PFN pages */ #define VM_HUGEPAGE 0x20000000 /* MADV_HUGEPAGE marked this vma */ #define VM_NOHUGEPAGE 0x40000000 /* MADV_NOHUGEPAGE marked this vma */ diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index a077f60819d9e00cc715e406c933725b8b4f87d3..80858ee05ee457d716ee159426bad7cca7710187 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -31,6 +31,7 @@ struct address_space; struct mem_cgroup; +struct kvm; /* * Each physical page in the system has a struct page associated with @@ -940,6 +941,9 @@ struct mm_struct { #ifdef CONFIG_MEMORY_RELIABLE /* total used reliable pages */ atomic_long_t reliable_nr_page; +#endif +#if IS_ENABLED(CONFIG_ETMEM) && IS_ENABLED(CONFIG_KVM) + struct kvm *kvm; #endif } __randomize_layout; diff --git a/include/linux/swap.h b/include/linux/swap.h index 9dc160d6fd43b892a86516d008c1f72474108422..5ac12a9634694e08ce92f09d305d39ab6363f578 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -416,6 +416,7 @@ extern void lru_cache_add_inactive_or_unevictable(struct 
page *page,
 extern unsigned long zone_reclaimable_pages(struct zone *zone);
 extern unsigned long try_to_free_pages(struct zonelist *zonelist, int order, gfp_t gfp_mask, nodemask_t *mask);
+extern unsigned long reclaim_pages(struct list_head *folio_list);
 #define MEMCG_RECLAIM_MAY_SWAP (1 << 1)
 #define MEMCG_RECLAIM_PROACTIVE (1 << 2)
diff --git a/include/uapi/asm-generic/mman-common.h b/include/uapi/asm-generic/mman-common.h
index 6ce1f1ceb432c64599f706b86e74a12581c2a54e..14e5498efd7acab203c0d43e48e0536ed52ffead 100644
--- a/include/uapi/asm-generic/mman-common.h
+++ b/include/uapi/asm-generic/mman-common.h
@@ -79,6 +79,10 @@
 #define MADV_COLLAPSE 25 /* Synchronous hugepage collapse */
+#define MADV_ETMEM_BASE 0x1100
+#define MADV_SWAPFLAG MADV_ETMEM_BASE /* for memory to be swap out */
+#define MADV_SWAPFLAG_REMOVE (MADV_SWAPFLAG + 1)
+
 /* compatibility flags */
 #define MAP_FILE 0
diff --git a/mm/Kconfig b/mm/Kconfig
index 82dbe6c28fcb3501dd678f47ac72e21d8af96d5c..f45bc11c1cb2987449f3311a3e5c2fee27c9f910 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -1374,6 +1374,39 @@ config DYNAMIC_POOL
   A per-memcg pagepool. The task in the memcg will prefer to alloc pages from corresponding pool.
+config ETMEM_SCAN
+	tristate "module: etmem page scan for etmem support"
+	depends on ETMEM
+	help
+	  etmem scan is a critical part of the etmem feature.
+	  A kernel module, etmem_scan.ko, periodically scans the appointed vma
+	  segments of the target process, performs a page table walk accordingly,
+	  and checks and clears the access bit of each page before finally
+	  reporting the scan results back to user space.
+	  etmem scan also supports virtual machines.
+
+config ETMEM_SWAP
+	tristate "module: etmem page swap for etmem support"
+	depends on ETMEM
+	help
+	  etmem swap is a critical component of the etmem feature.
+	  When using the etmem slide engine, etmem_swap.ko proactively adds the
+	  appointed pages (ideally rarely used, "cold" pages) to the swap cache;
+	  they are later reclaimed and written to swap space, making room for
+	  more frequently used, "hot" pages.
+
+config ETMEM
+	bool "Enable etmem feature"
+	depends on MMU
+	depends on X86 || ARM64
+	default n
+	help
+	  etmem is a tiered memory extension technology that uses DRAM and memory
+	  compression/high-performance storage media to form tiered memory storage.
+	  Memory data is tiered, and cold data is migrated from memory media to
+	  high-performance storage media to release memory space and reduce
+	  memory costs.
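For reference, the user-space flow these Kconfig entries describe (not part of the patch itself) is: tag a region with madvise(MADV_SWAPFLAG), read access/idle records from /proc/<pid>/idle_pages with the (page-aligned) file offset selecting the starting virtual address, and feed cold pages to /proc/<pid>/swap_pages as newline-separated hexadecimal addresses. Below is a minimal, illustrative C sketch of that flow; it assumes etmem_scan.ko and etmem_swap.ko are loaded (open() otherwise fails with -ENODEV) and that the caller has CAP_SYS_ADMIN; the target pid, mapping size and buffer size are placeholders, and error handling is mostly omitted.

/* etmem_user_sketch.c - illustrative only, not part of this patch */
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

#define MADV_SWAPFLAG 0x1100	/* MADV_ETMEM_BASE from the uapi hunk above */

int main(void)
{
	uint8_t buf[4096];		/* raw scan records; decoding omitted */
	char path[64], line[32];
	pid_t pid = getpid();		/* placeholder target: scan ourselves */
	ssize_t n;
	int fd, len;

	/* Tag a 2 MiB anonymous region so a VMA_SCAN_FLAG scan will consider it. */
	void *area = mmap(NULL, 2 << 20, PROT_READ | PROT_WRITE,
			  MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	((volatile char *)area)[0] = 1;	/* fault in at least one page */
	madvise(area, 2 << 20, MADV_SWAPFLAG);

	/* Read idle/accessed records; the file offset is the start virtual address. */
	snprintf(path, sizeof(path), "/proc/%d/idle_pages", (int)pid);
	fd = open(path, O_RDONLY);
	if (fd >= 0) {
		n = pread(fd, buf, sizeof(buf), (off_t)(uintptr_t)area);
		printf("read %zd bytes of scan records\n", n);
		close(fd);
	}

	/* Hand cold pages to etmem_swap: hexadecimal virtual addresses, one per line. */
	snprintf(path, sizeof(path), "/proc/%d/swap_pages", (int)pid);
	fd = open(path, O_WRONLY);
	if (fd >= 0) {
		len = snprintf(line, sizeof(line), "%lx\n", (unsigned long)area);
		write(fd, line, len);
		close(fd);
	}
	return 0;
}

In practice a user-space agent (for example the openEuler etmem daemon) drives this interface; the byte stream returned by idle_pages is the PIP_COMPOSE()-encoded record format defined in fs/proc/etmem_scan.h of this patch.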
+ source "mm/damon/Kconfig" endmenu diff --git a/mm/Makefile b/mm/Makefile index 8d7d2aeda6eab22e6fc43a147466444a299fb158..11df2de8fdbe9d5a70e4ca3a73db68ebd9c9d331 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -142,6 +142,7 @@ obj-$(CONFIG_GENERIC_IOREMAP) += ioremap.o obj-$(CONFIG_SHRINKER_DEBUG) += shrinker_debug.o obj-$(CONFIG_SHARE_POOL) += share_pool.o obj-$(CONFIG_MEMCG_MEMFS_INFO) += memcg_memfs_info.o +obj-$(CONFIG_ETMEM) += etmem.o obj-$(CONFIG_PAGE_CACHE_LIMIT) += page_cache_limit.o obj-$(CONFIG_CLEAR_FREELIST_PAGE) += clear_freelist_page.o obj-$(CONFIG_MEMORY_RELIABLE) += mem_reliable.o diff --git a/mm/etmem.c b/mm/etmem.c new file mode 100644 index 0000000000000000000000000000000000000000..acd32e71a64311a70c09ce10a9641476612585ef --- /dev/null +++ b/mm/etmem.c @@ -0,0 +1,95 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include +#include +#include +#include +#include +#include "internal.h" + +static bool enable_kernel_swap __read_mostly = true; + +bool kernel_swap_enabled(void) +{ + return READ_ONCE(enable_kernel_swap); +} + +static ssize_t kernel_swap_enable_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sprintf(buf, "%s\n", enable_kernel_swap ? "true" : "false"); +} + +static ssize_t kernel_swap_enable_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + if (!strncmp(buf, "true", 4) || !strncmp(buf, "1", 1)) + WRITE_ONCE(enable_kernel_swap, true); + else if (!strncmp(buf, "false", 5) || !strncmp(buf, "0", 1)) + WRITE_ONCE(enable_kernel_swap, false); + else + return -EINVAL; + + return count; +} + +struct kobj_attribute kernel_swap_enable_attr = + __ATTR(kernel_swap_enable, 0644, kernel_swap_enable_show, + kernel_swap_enable_store); + +int add_page_for_swap(struct page *page, struct list_head *pagelist) +{ + int err = -EBUSY; + struct page *head; + + /* If the page is mapped by more than one process, do not swap it */ + if (page_mapcount(page) > 1) + return -EACCES; + + if (PageHuge(page)) + return -EACCES; + + head = compound_head(page); + if (!folio_isolate_lru(page_folio(head))) { + put_page(page); + return err; + } + put_page(page); + if (PageUnevictable(page)) + putback_lru_page(page); + else + list_add_tail(&head->lru, pagelist); + + err = 0; + return err; +} +EXPORT_SYMBOL_GPL(add_page_for_swap); + +struct page *get_page_from_vaddr(struct mm_struct *mm, unsigned long vaddr) +{ + struct page *page; + struct vm_area_struct *vma; + unsigned int follflags; + + mmap_read_lock(mm); + + vma = find_vma(mm, vaddr); + if (!vma || vaddr < vma->vm_start || vma->vm_flags & VM_LOCKED) { + mmap_read_unlock(mm); + return NULL; + } + + follflags = FOLL_GET | FOLL_DUMP; + page = follow_page(vma, vaddr, follflags); + if (IS_ERR(page) || !page) { + mmap_read_unlock(mm); + return NULL; + } + + mmap_read_unlock(mm); + return page; +} +EXPORT_SYMBOL_GPL(get_page_from_vaddr); diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 65421d751a9d9456fafc52ec347ac2d130616833..27fa3d3a08af1aa3cf51723a392af0a20cd52ed6 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1966,6 +1966,7 @@ spinlock_t *__pud_trans_huge_lock(pud_t *pud, struct vm_area_struct *vma) spin_unlock(ptl); return NULL; } +EXPORT_SYMBOL_GPL(__pud_trans_huge_lock); #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD int zap_huge_pud(struct mmu_gather *tlb, struct vm_area_struct *vma, diff --git a/mm/internal.h b/mm/internal.h index f4416fcbae782bfe1b256c07b523326e87ff3275..a266a08e0831e1df82aa4ec539ae1c84b2305181 100644 --- 
a/mm/internal.h +++ b/mm/internal.h @@ -810,7 +810,6 @@ extern unsigned long __must_check vm_mmap_pgoff(struct file *, unsigned long, unsigned long, unsigned long); extern void set_pageblock_order(void); -unsigned long reclaim_pages(struct list_head *folio_list); unsigned int reclaim_clean_pages_from_list(struct zone *zone, struct list_head *folio_list); /* The ALLOC_WMARK bits are used as an index to zone->watermark */ diff --git a/mm/madvise.c b/mm/madvise.c index 2d56815daff246c2c43494fb940a0249f21a7a46..2a3035df3a098f2c01d6366280c030cac5c3aa98 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -1113,6 +1113,14 @@ static int madvise_vma_behavior(struct vm_area_struct *vma, if (error) goto out; break; +#ifdef CONFIG_ETMEM + case MADV_SWAPFLAG: + new_flags |= VM_SWAPFLAG; + break; + case MADV_SWAPFLAG_REMOVE: + new_flags &= ~VM_SWAPFLAG; + break; +#endif case MADV_COLLAPSE: return madvise_collapse(vma, prev, start, end); } @@ -1217,6 +1225,10 @@ madvise_behavior_valid(int behavior) #ifdef CONFIG_MEMORY_FAILURE case MADV_SOFT_OFFLINE: case MADV_HWPOISON: +#endif +#ifdef CONFIG_ETMEM + case MADV_SWAPFLAG: + case MADV_SWAPFLAG_REMOVE: #endif return true; @@ -1407,6 +1419,10 @@ int madvise_set_anon_name(struct mm_struct *mm, unsigned long start, * easily if memory pressure happens. * MADV_PAGEOUT - the application is not expected to use this memory soon, * page out the pages in this range immediately. + * MADV_SWAPFLAG - Used in the etmem memory extension feature, the process + * specifies the memory swap area by adding a flag to a specific + * vma address. + * MADV_SWAPFLAG_REMOVE - remove the specific vma flag * MADV_POPULATE_READ - populate (prefault) page tables readable by * triggering read faults if required * MADV_POPULATE_WRITE - populate (prefault) page tables writable by diff --git a/mm/pagewalk.c b/mm/pagewalk.c index b7d7e4fcfad7aa5c4ce2af9dc036d0169769c72f..144c9fed0fc9461ca157faab22640c5457356fcb 100644 --- a/mm/pagewalk.c +++ b/mm/pagewalk.c @@ -525,6 +525,7 @@ int walk_page_range(struct mm_struct *mm, unsigned long start, } while (start = next, start < end); return err; } +EXPORT_SYMBOL_GPL(walk_page_range); /** * walk_page_range_novma - walk a range of pagetables not backed by a vma diff --git a/mm/swap_state.c b/mm/swap_state.c index b3b14bd0dd6447f47aea2df42e8348f15660c73a..ddb3a65e5c6e87ae032f86d71c77f16b9a746570 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -21,6 +21,7 @@ #include #include #include +#include #include "internal.h" #include "swap.h" @@ -881,6 +882,9 @@ static struct kobj_attribute vma_ra_enabled_attr = __ATTR_RW(vma_ra_enabled); static struct attribute *swap_attrs[] = { &vma_ra_enabled_attr.attr, +#ifdef CONFIG_ETMEM + &kernel_swap_enable_attr.attr, +#endif NULL, }; diff --git a/mm/vmscan.c b/mm/vmscan.c index d8cd8ffa0119774b1b13aeea966445554246d685..b36d839e972acd388d14994d59f0849b3dad1032 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -57,6 +57,7 @@ #include #include #include +#include #include #include @@ -2844,6 +2845,7 @@ unsigned long reclaim_pages(struct list_head *folio_list) return nr_reclaimed; } +EXPORT_SYMBOL_GPL(reclaim_pages); static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan, struct lruvec *lruvec, struct scan_control *sc) @@ -3044,6 +3046,9 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc, goto out; } + if (sc->may_swap && !kernel_swap_enabled()) + sc->may_swap = 0; + /* If we have no swap space, do not bother scanning anon folios. 
*/ if (!sc->may_swap || !can_reclaim_anon_pages(memcg, pgdat->node_id, sc)) { scan_balance = SCAN_FILE; @@ -3311,6 +3316,9 @@ static int get_swappiness(struct lruvec *lruvec, struct scan_control *sc) struct mem_cgroup *memcg = lruvec_memcg(lruvec); struct pglist_data *pgdat = lruvec_pgdat(lruvec); + if (sc->may_swap && !kernel_swap_enabled()) + return 0; + if (!sc->may_swap) return 0; diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 486800a7024b373f167f12b48ce1f13b095e6098..bb5f6cf4da3394f3c980dc946d0715649674f18e 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -1305,6 +1305,10 @@ static void kvm_destroy_vm(struct kvm *kvm) kvm_destroy_pm_notifier(kvm); kvm_uevent_notify_change(KVM_EVENT_DESTROY_VM, kvm); +#if IS_ENABLED(CONFIG_ETMEM) && IS_ENABLED(CONFIG_KVM) + if (mm->kvm == kvm) + mm->kvm = NULL; +#endif kvm_destroy_vm_debugfs(kvm); kvm_arch_sync_events(kvm); mutex_lock(&kvm_lock); @@ -5098,6 +5102,10 @@ static int kvm_dev_ioctl_create_vm(unsigned long type) goto put_kvm; } +#if IS_ENABLED(CONFIG_ETMEM) && IS_ENABLED(CONFIG_KVM) + if (kvm->mm->kvm == NULL) + kvm->mm->kvm = kvm; +#endif /* * Don't call kvm_put_kvm anymore at this point; file->f_op is * already set, with ->release() being kvm_vm_release(). In error