From 444ec524e2ce098211d612a2808768bafdea7175 Mon Sep 17 00:00:00 2001 From: ZhangPeng Date: Thu, 18 May 2023 19:16:13 +0800 Subject: [PATCH 01/14] userswap: introduce UFFDIO_COPY_MODE_DIRECT_MAP to map without copying hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I6CAIM -------------------------------- Add a new UFFDIO_COPY mode UFFDIO_COPY_MODE_DIRECT_MAP to map physical pages without copy_from_user(). We introduce uswap_unmap_anon_page() to unmap an anonymous page and uswap_map_anon_page() to map page to src addr. We also introduce mfill_atomic_pte_nocopy() to achieve zero copy by unmapping src_addr to the physical page and establishing the mapping from dst_addr to the physical page. Signed-off-by: ZhangPeng --- fs/userfaultfd.c | 9 +- include/linux/userfaultfd_k.h | 3 - include/linux/userswap.h | 28 +++++++ include/uapi/linux/userfaultfd.h | 1 + mm/Makefile | 1 + mm/mmap.c | 1 + mm/userfaultfd.c | 42 ++++------ mm/userswap.c | 140 +++++++++++++++++++++++++++++++ 8 files changed, 191 insertions(+), 34 deletions(-) create mode 100644 include/linux/userswap.h create mode 100644 mm/userswap.c diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 7d0022f82844..ef51ed87ef38 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -27,13 +27,11 @@ #include #include #include +#include int sysctl_unprivileged_userfaultfd __read_mostly = 1; static struct kmem_cache *userfaultfd_ctx_cachep __read_mostly; -#ifdef CONFIG_USERSWAP -int enable_userswap; -#endif /* * Start with fault_pending_wqh and fault_wqh so they're more likely @@ -1717,7 +1715,10 @@ static int userfaultfd_copy(struct userfaultfd_ctx *ctx, ret = -EINVAL; if (uffdio_copy.src + uffdio_copy.len <= uffdio_copy.src) goto out; - if (uffdio_copy.mode & ~(UFFDIO_COPY_MODE_DONTWAKE|UFFDIO_COPY_MODE_WP)) + if (uffdio_copy.mode & ~(UFFDIO_COPY_MODE_DONTWAKE | + UFFDIO_COPY_MODE_WP | + (IS_ENABLED(CONFIG_USERSWAP) ? 
+ UFFDIO_COPY_MODE_DIRECT_MAP : 0))) goto out; if (mmget_not_zero(ctx->mm)) { ret = mcopy_atomic(ctx->mm, uffdio_copy.dst, uffdio_copy.src, diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h index e1cacab86bde..e91f31a4c830 100644 --- a/include/linux/userfaultfd_k.h +++ b/include/linux/userfaultfd_k.h @@ -31,9 +31,6 @@ #define UFFD_FLAGS_SET (EFD_SHARED_FCNTL_FLAGS) extern int sysctl_unprivileged_userfaultfd; -#ifdef CONFIG_USERSWAP -extern int enable_userswap; -#endif extern vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason); diff --git a/include/linux/userswap.h b/include/linux/userswap.h new file mode 100644 index 000000000000..fe2c868851fb --- /dev/null +++ b/include/linux/userswap.h @@ -0,0 +1,28 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Copyright (C) Huawei Technologies Co., Ltd. 2023. All rights reserved. + */ + +#ifndef _LINUX_USERSWAP_H +#define _LINUX_USERSWAP_H + +#ifdef CONFIG_USERSWAP + +extern int enable_userswap; + +int mfill_atomic_pte_nocopy(struct mm_struct *dst_mm, + pmd_t *dst_pmd, + struct vm_area_struct *dst_vma, + unsigned long dst_addr, + unsigned long src_addr); + +static inline bool uswap_check_copy_mode(struct vm_area_struct *vma, __u64 mode) +{ + if (!(vma->vm_flags & VM_USWAP) && (mode & UFFDIO_COPY_MODE_DIRECT_MAP)) + return false; + return true; +} + +#endif /* CONFIG_USERSWAP */ + +#endif /* _LINUX_USERSWAP_H */ diff --git a/include/uapi/linux/userfaultfd.h b/include/uapi/linux/userfaultfd.h index ada058f8b94b..4de57e12cdff 100644 --- a/include/uapi/linux/userfaultfd.h +++ b/include/uapi/linux/userfaultfd.h @@ -217,6 +217,7 @@ struct uffdio_copy { * according to the uffdio_register.ioctls. 
*/ #define UFFDIO_COPY_MODE_WP ((__u64)1<<1) +#define UFFDIO_COPY_MODE_DIRECT_MAP ((__u64)1<<10) __u64 mode; /* diff --git a/mm/Makefile b/mm/Makefile index 696ee59c2ac7..a014a5e08f7b 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -113,6 +113,7 @@ obj-$(CONFIG_MEMORY_BALLOON) += balloon_compaction.o obj-$(CONFIG_PAGE_EXTENSION) += page_ext.o obj-$(CONFIG_CMA_DEBUGFS) += cma_debug.o obj-$(CONFIG_USERFAULTFD) += userfaultfd.o +obj-$(CONFIG_USERSWAP) += userswap.o obj-$(CONFIG_IDLE_PAGE_TRACKING) += page_idle.o obj-$(CONFIG_FRAME_VECTOR) += frame_vector.o obj-$(CONFIG_DEBUG_PAGE_REF) += debug_page_ref.o diff --git a/mm/mmap.c b/mm/mmap.c index 9b6fcf8c2f1d..a5867d039153 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -49,6 +49,7 @@ #include #include #include +#include #include #include diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index 15c46208a2ac..b66abbba13ef 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include "internal.h" @@ -90,10 +91,6 @@ static int mcopy_atomic_pte(struct mm_struct *dst_mm, *pagep = NULL; } -#ifdef CONFIG_USERSWAP - if (dst_vma->vm_flags & VM_USWAP) - ClearPageDirty(page); -#endif /* * The memory barrier inside __SetPageUptodate makes sure that * preceding stores to the page contents become visible before @@ -112,10 +109,6 @@ static int mcopy_atomic_pte(struct mm_struct *dst_mm, else _dst_pte = pte_mkwrite(_dst_pte); } -#ifdef CONFIG_USERSWAP - if (dst_vma->vm_flags & VM_USWAP) - _dst_pte = pte_mkclean(_dst_pte); -#endif dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl); if (dst_vma->vm_file) { @@ -128,26 +121,9 @@ static int mcopy_atomic_pte(struct mm_struct *dst_mm, goto out_release_uncharge_unlock; } -#ifdef CONFIG_USERSWAP - if (!(dst_vma->vm_flags & VM_USWAP)) { - ret = -EEXIST; - if (!pte_none(*dst_pte)) - goto out_release_uncharge_unlock; - } else { - /* - * The userspace may swap in a large area. Part of the area is - * not swapped out. Skip those pages. 
- */ - ret = 0; - if (swp_type(pte_to_swp_entry(*dst_pte)) != SWP_USERSWAP_ENTRY || - pte_present(*dst_pte)) - goto out_release_uncharge_unlock; - } -#else ret = -EEXIST; if (!pte_none(*dst_pte)) goto out_release_uncharge_unlock; -#endif inc_mm_counter(dst_mm, MM_ANONPAGES); reliable_page_counter(page, dst_mm, 1); @@ -535,6 +511,10 @@ static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm, goto out_unlock; err = -EINVAL; +#ifdef CONFIG_USERSWAP + if (!uswap_check_copy_mode(dst_vma, mode)) + goto out_unlock; +#endif /* * shmem_zero_setup is invoked in mmap for MAP_ANONYMOUS|MAP_SHARED but * it will overwrite vm_ops, so vma_is_anonymous must return false. @@ -605,8 +585,16 @@ static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm, BUG_ON(pmd_none(*dst_pmd)); BUG_ON(pmd_trans_huge(*dst_pmd)); - err = mfill_atomic_pte(dst_mm, dst_pmd, dst_vma, dst_addr, - src_addr, &page, zeropage, wp_copy); +#ifdef CONFIG_USERSWAP + if (dst_vma->vm_flags & VM_USWAP && + mode & UFFDIO_COPY_MODE_DIRECT_MAP) + err = mfill_atomic_pte_nocopy(dst_mm, dst_pmd, dst_vma, + dst_addr, src_addr); + else +#endif + err = mfill_atomic_pte(dst_mm, dst_pmd, dst_vma, + dst_addr, src_addr, &page, + zeropage, wp_copy); cond_resched(); if (unlikely(err == -ENOENT)) { diff --git a/mm/userswap.c b/mm/userswap.c new file mode 100644 index 000000000000..fe33fda975d1 --- /dev/null +++ b/mm/userswap.c @@ -0,0 +1,140 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) Huawei Technologies Co., Ltd. 2023. All rights reserved. 
+ * + * userswap core file include swap-in and swap-out core function + */ + +#include +#include +#include +#include + +#include "internal.h" + +int enable_userswap; + +static void uswap_unmap_anon_page(struct mm_struct *mm, + struct vm_area_struct *vma, + unsigned long addr, struct page *page, + pmd_t *pmd, pte_t *old_pte) +{ + struct mmu_notifier_range range; + spinlock_t *ptl; + pte_t *pte; + + mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, + vma->vm_mm, addr, addr + PAGE_SIZE); + mmu_notifier_invalidate_range_start(&range); + pte = pte_offset_map_lock(mm, pmd, addr, &ptl); + if (pte_none(*pte)) + goto out_release_unlock; + flush_cache_page(vma, addr, pte_pfn(*pte)); + *old_pte = ptep_clear_flush(vma, addr, pte); + + dec_mm_counter(mm, MM_ANONPAGES); + reliable_page_counter(page, mm, -1); + page_remove_rmap(page, false); + +out_release_unlock: + pte_unmap_unlock(pte, ptl); + mmu_notifier_invalidate_range_end(&range); + page->mapping = NULL; +} + +static void uswap_map_anon_page(struct mm_struct *mm, + struct vm_area_struct *vma, + unsigned long addr, + struct page *page, + pmd_t *pmd, + pte_t old_pte) +{ + spinlock_t *ptl; + pte_t *pte; + + pte = pte_offset_map_lock(mm, pmd, addr, &ptl); + flush_cache_page(vma, addr, pte_pfn(*pte)); + set_pte_at(mm, addr, pte, old_pte); + inc_mm_counter(mm, MM_ANONPAGES); + reliable_page_counter(page, mm, 1); + page_add_new_anon_rmap(page, vma, addr, false); + pte_unmap_unlock(pte, ptl); +} + +int mfill_atomic_pte_nocopy(struct mm_struct *mm, + pmd_t *dst_pmd, + struct vm_area_struct *dst_vma, + unsigned long dst_addr, + unsigned long src_addr) +{ + struct vm_area_struct *src_vma; + pte_t dst_pte, *pte, src_pte; + pmd_t *src_pmd; + spinlock_t *ptl; + int ret = 0; + struct page *page; + + src_vma = find_vma(mm, src_addr); + if (!src_vma || src_addr < src_vma->vm_start) + return -ENOENT; + + if (src_vma->vm_flags & VM_LOCKED) + return -EINVAL; + + page = follow_page(src_vma, src_addr, FOLL_GET | FOLL_MIGRATION); + if 
(IS_ERR_OR_NULL(page)) + return -ENODEV; + + src_pmd = mm_find_pmd(mm, src_addr); + if (!src_pmd) { + ret = -ENXIO; + goto out_put_page; + } + uswap_unmap_anon_page(mm, src_vma, src_addr, page, src_pmd, &src_pte); + + if (dst_vma->vm_flags & VM_USWAP) + ClearPageDirty(page); + /* + * The memory barrier inside __SetPageUptodate makes sure that + * preceding stores to the page contents become visible before + * the set_pte_at() write. + */ + __SetPageUptodate(page); + + dst_pte = mk_pte(page, dst_vma->vm_page_prot); + if (dst_vma->vm_flags & VM_WRITE) + dst_pte = pte_mkwrite(pte_mkdirty(dst_pte)); + if (dst_vma->vm_flags & VM_USWAP) + dst_pte = pte_mkclean(dst_pte); + + pte = pte_offset_map_lock(mm, dst_pmd, dst_addr, &ptl); + + /* + * The userspace may swap in a large area. Part of the area is not + * swapped out. If concurrent execution, PTE may be present. Skip those + * pages (pte_present). + * No other scenes should be handled except first pagefault (pte_none) + * and after userswap out (SWP_USERSWAP_ENTRY). 
+ */ + if (pte_present(*pte) || (!pte_none(*pte) && + swp_type(pte_to_swp_entry(*pte)) != SWP_USERSWAP_ENTRY)) { + pte_unmap_unlock(pte, ptl); + uswap_map_anon_page(mm, src_vma, src_addr, page, src_pmd, + src_pte); + ret = -EEXIST; + goto out_put_page; + } + + inc_mm_counter(mm, MM_ANONPAGES); + reliable_page_counter(page, mm, 1); + page_add_new_anon_rmap(page, dst_vma, dst_addr, false); + set_pte_at(mm, dst_addr, pte, dst_pte); + + /* No need to invalidate - it was non-present before */ + update_mmu_cache(dst_vma, dst_addr, pte); + pte_unmap_unlock(pte, ptl); + +out_put_page: + put_page(page); + return ret; +} -- Gitee From c97cdd7e9d59278928e3312d5c43a7ffca2e2570 Mon Sep 17 00:00:00 2001 From: ZhangPeng Date: Thu, 18 May 2023 19:16:14 +0800 Subject: [PATCH 02/14] userswap: introduce MREMAP_USWAP_SET_PTE to remap for swapping out hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I6CAIM -------------------------------- We introduce MREMAP_USWAP_SET_PTE to implement remapping in the swap-out phase. Unmap the pages between 'addr ~ addr+old_len' and remap them to 'new_addr ~ new_addr+new_len'. During unmapping, the PTE of old_addr is set to SWP_USERSWAP_ENTRY. Signed-off-by: ZhangPeng --- include/linux/userswap.h | 24 ++ include/uapi/asm-generic/mman-common.h | 2 - include/uapi/linux/mman.h | 1 + mm/mmap.c | 205 ---------------- mm/mremap.c | 11 + mm/userswap.c | 324 ++++++++++++++++++++++++- 6 files changed, 358 insertions(+), 209 deletions(-) diff --git a/include/linux/userswap.h b/include/linux/userswap.h index fe2c868851fb..82cc79584e43 100644 --- a/include/linux/userswap.h +++ b/include/linux/userswap.h @@ -6,16 +6,28 @@ #ifndef _LINUX_USERSWAP_H #define _LINUX_USERSWAP_H +#include +#include + #ifdef CONFIG_USERSWAP extern int enable_userswap; +/* + * In uswap situation, we use the bit 0 of the returned address to indicate + * whether the pages are dirty. 
+ */ +#define USWAP_PAGES_DIRTY 1 + int mfill_atomic_pte_nocopy(struct mm_struct *dst_mm, pmd_t *dst_pmd, struct vm_area_struct *dst_vma, unsigned long dst_addr, unsigned long src_addr); +unsigned long uswap_mremap(unsigned long old_addr, unsigned long old_len, + unsigned long new_addr, unsigned long new_len); + static inline bool uswap_check_copy_mode(struct vm_area_struct *vma, __u64 mode) { if (!(vma->vm_flags & VM_USWAP) && (mode & UFFDIO_COPY_MODE_DIRECT_MAP)) @@ -23,6 +35,18 @@ static inline bool uswap_check_copy_mode(struct vm_area_struct *vma, __u64 mode) return true; } +static inline bool uswap_validate_mremap_flags(unsigned long flags) +{ + if (!enable_userswap && flags & MREMAP_USWAP_SET_PTE) + return false; + if (flags & MREMAP_USWAP_SET_PTE && flags & ~MREMAP_USWAP_SET_PTE) + return false; + if (flags & ~(MREMAP_FIXED | MREMAP_MAYMOVE | MREMAP_DONTUNMAP | + MREMAP_USWAP_SET_PTE)) + return false; + return true; +} + #endif /* CONFIG_USERSWAP */ #endif /* _LINUX_USERSWAP_H */ diff --git a/include/uapi/asm-generic/mman-common.h b/include/uapi/asm-generic/mman-common.h index 898ea134b2f3..66c408ccc6c6 100644 --- a/include/uapi/asm-generic/mman-common.h +++ b/include/uapi/asm-generic/mman-common.h @@ -30,8 +30,6 @@ #define MAP_SYNC 0x080000 /* perform synchronous page faults for the mapping */ #define MAP_FIXED_NOREPLACE 0x100000 /* MAP_FIXED which doesn't unmap underlying mapping */ -#define MAP_REPLACE 0x1000000 - #define MAP_UNINITIALIZED 0x4000000 /* For anonymous mmap, memory could be * uninitialized */ diff --git a/include/uapi/linux/mman.h b/include/uapi/linux/mman.h index f55bc680b5b0..174a1a2eb041 100644 --- a/include/uapi/linux/mman.h +++ b/include/uapi/linux/mman.h @@ -8,6 +8,7 @@ #define MREMAP_MAYMOVE 1 #define MREMAP_FIXED 2 #define MREMAP_DONTUNMAP 4 +#define MREMAP_USWAP_SET_PTE 64 #define OVERCOMMIT_GUESS 0 #define OVERCOMMIT_ALWAYS 1 diff --git a/mm/mmap.c b/mm/mmap.c index a5867d039153..d6f51da7aad8 100644 --- a/mm/mmap.c +++ b/mm/mmap.c 
@@ -49,7 +49,6 @@ #include #include #include -#include #include #include @@ -1623,205 +1622,6 @@ __do_mmap(struct file *file, unsigned long addr, unsigned long len, { return __do_mmap_mm(current->mm, file, addr, len, prot, flags, vm_flags, pgoff, populate, uf); } -#ifdef CONFIG_USERSWAP -/* - * Check if pages between 'addr ~ addr+len' can be user swapped. If so, get - * the reference of the pages and return the pages through input parameters - * 'ppages'. - */ -static int pages_can_be_swapped(struct mm_struct *mm, unsigned long addr, - unsigned long len, struct page ***ppages) -{ - struct vm_area_struct *vma; - struct page *page = NULL; - struct page **pages = NULL; - unsigned long addr_end = addr + len; - unsigned long ret; - int i, page_num = 0; - - pages = kmalloc(sizeof(struct page *) * (len / PAGE_SIZE), GFP_KERNEL); - if (!pages) - return -ENOMEM; - - while (addr < addr_end) { - vma = find_vma(mm, addr); - if (!vma || !vma_is_anonymous(vma) || vma->vm_file || - (vma->vm_flags & VM_LOCKED) || (vma->vm_flags & VM_STACK) || - (vma->vm_flags & (VM_IO | VM_PFNMAP))) { - ret = -EINVAL; - goto out; - } - if (!(vma->vm_flags & VM_UFFD_MISSING)) { - ret = -EAGAIN; - goto out; - } -get_again: - /* follow_page will inc page ref, dec the ref after we remap the page */ - page = follow_page(vma, addr, FOLL_GET); - if (IS_ERR_OR_NULL(page)) { - ret = -ENODEV; - goto out; - } - pages[page_num++] = page; - if (!PageAnon(page) || !PageSwapBacked(page) || - PageHuge(page) || PageSwapCache(page)) { - ret = -EINVAL; - goto out; - } else if (PageTransCompound(page)) { - if (trylock_page(page)) { - if (!split_huge_page(page)) { - put_page(page); - page_num--; - unlock_page(page); - goto get_again; - } else { - unlock_page(page); - ret = -EINVAL; - goto out; - } - } else { - ret = -EINVAL; - goto out; - } - } - if (page_mapcount(page) > 1 || - page_mapcount(page) + 1 != page_count(page)) { - ret = -EBUSY; - goto out; - } - addr += PAGE_SIZE; - } - - *ppages = pages; - return 0; - 
-out: - for (i = 0; i < page_num; i++) - put_page(pages[i]); - if (pages) - kfree(pages); - *ppages = NULL; - return ret; -} - -/* - * In uswap situation, we use the bit 0 of the returned address to indicate - * whether the pages are dirty. - */ -#define USWAP_PAGES_DIRTY 1 - -/* unmap the pages between 'addr ~ addr+len' and remap them to a new address */ -static unsigned long -do_user_swap(struct mm_struct *mm, unsigned long addr_start, unsigned long len, - struct page **pages, unsigned long new_addr) -{ - struct vm_area_struct *vma; - struct page *page; - struct mmu_notifier_range range; - pmd_t *pmd; - pte_t *pte, old_pte; - spinlock_t *ptl; - unsigned long addr; - bool pages_dirty = false; - int i = 0; - - addr = addr_start; - lru_add_drain(); - i = 0; - while (addr < addr_start + len) { - page = pages[i]; - vma = find_vma(mm, addr); - if (!vma) - return -EINVAL; - - mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, - vma->vm_mm, addr, addr + PAGE_SIZE); - mmu_notifier_invalidate_range_start(&range); - pmd = mm_find_pmd(mm, addr); - if (!pmd) { - mmu_notifier_invalidate_range_end(&range); - return -ENXIO; - } - pte = pte_offset_map_lock(mm, pmd, addr, &ptl); - flush_cache_page(vma, addr, pte_pfn(*pte)); - old_pte = ptep_clear_flush(vma, addr, pte); - if (pte_dirty(old_pte) || PageDirty(page)) - pages_dirty = true; - set_pte(pte, swp_entry_to_pte(swp_entry(SWP_USERSWAP_ENTRY, - page_to_pfn(page)))); - dec_mm_counter(mm, MM_ANONPAGES); - reliable_page_counter(page, mm, -1); - page_remove_rmap(page, false); - put_page(page); - - pte_unmap_unlock(pte, ptl); - mmu_notifier_invalidate_range_end(&range); - vma->vm_flags |= VM_USWAP; - page->mapping = NULL; - addr += PAGE_SIZE; - i++; - } - - addr = new_addr; - vma = find_vma(mm, addr); - i = 0; - while (addr < new_addr + len) { - if (addr > vma->vm_end - 1) - vma = find_vma(mm, addr); - if (!vma) - return -ENODEV; - - page = pages[i++]; - if (vm_insert_page(vma, addr, page)) - return -EFAULT; - - addr += 
PAGE_SIZE; - } - vma->vm_flags |= VM_USWAP; - - if (pages_dirty) - new_addr = new_addr | USWAP_PAGES_DIRTY; - - return new_addr; -} - -static inline unsigned long -do_uswap_mmap(struct file *file, unsigned long addr, unsigned long len, - unsigned long prot, unsigned long flags, unsigned long pgoff, - unsigned long *populate, struct list_head *uf) -{ - struct mm_struct *mm = current->mm; - unsigned long old_addr = addr; - struct page **pages = NULL; - unsigned long ret; - int i; - - if (!len || offset_in_page(addr) || (len % PAGE_SIZE)) - return -EINVAL; - - ret = pages_can_be_swapped(mm, addr, len, &pages); - if (ret) - return ret; - - /* mark the vma as special to avoid merging with other vmas */ - addr = __do_mmap(file, addr, len, prot, flags, VM_SPECIAL, pgoff, - populate, uf); - if (IS_ERR_VALUE(addr)) { - ret = addr; - goto out; - } - - ret = do_user_swap(mm, old_addr, len, pages, addr); -out: - /* follow_page() above increased the reference*/ - for (i = 0; i < len / PAGE_SIZE; i++) - put_page(pages[i]); - if (pages) - kfree(pages); - - return ret; -} -#endif /* * The caller must write-lock current->mm->mmap_lock. 
@@ -1831,11 +1631,6 @@ unsigned long do_mmap(struct file *file, unsigned long addr, unsigned long flags, unsigned long pgoff, unsigned long *populate, struct list_head *uf) { -#ifdef CONFIG_USERSWAP - if (enable_userswap && (flags & MAP_REPLACE)) - return do_uswap_mmap(file, addr, len, prot, flags, pgoff, - populate, uf); -#endif return __do_mmap(file, addr, len, prot, flags, 0, pgoff, populate, uf); } diff --git a/mm/mremap.c b/mm/mremap.c index 2f7f3494a990..b8b694be40bd 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -25,6 +25,7 @@ #include #include #include +#include #include #include @@ -915,8 +916,13 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, */ addr = untagged_addr(addr); +#ifdef CONFIG_USERSWAP + if (!uswap_validate_mremap_flags(flags)) + return ret; +#else if (flags & ~(MREMAP_FIXED | MREMAP_MAYMOVE | MREMAP_DONTUNMAP)) return ret; +#endif if (flags & MREMAP_FIXED && !(flags & MREMAP_MAYMOVE)) return ret; @@ -947,6 +953,11 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, if (!new_len) return ret; +#ifdef CONFIG_USERSWAP + if (flags & MREMAP_USWAP_SET_PTE) + return uswap_mremap(addr, old_len, new_addr, new_len); +#endif + if (mmap_write_lock_killable(current->mm)) return -EINTR; diff --git a/mm/userswap.c b/mm/userswap.c index fe33fda975d1..dd212b1a02e6 100644 --- a/mm/userswap.c +++ b/mm/userswap.c @@ -9,15 +9,155 @@ #include #include #include +#include #include "internal.h" int enable_userswap; +static bool vma_uswap_compatible(struct vm_area_struct *vma) +{ + if (!vma || !(vma->vm_flags & VM_USWAP) || !vma_is_anonymous(vma) || + vma->vm_file || vma->vm_flags & (VM_SHARED | VM_LOCKED | VM_STACK | + VM_IO | VM_PFNMAP)) + return false; + return true; +} + +static pud_t *get_old_pud(struct mm_struct *mm, unsigned long addr) +{ + pgd_t *pgd; + p4d_t *p4d; + pud_t *pud; + + pgd = pgd_offset(mm, addr); + if (pgd_none_or_clear_bad(pgd)) + return NULL; + + p4d = p4d_offset(pgd, addr); + if 
(p4d_none_or_clear_bad(p4d)) + return NULL; + + pud = pud_offset(p4d, addr); + if (pud_none_or_clear_bad(pud)) + return NULL; + + return pud; +} + +static bool is_thp_or_huge(struct mm_struct *mm, unsigned long addr) +{ + pud_t *pud; + pmd_t *pmd; + + pud = get_old_pud(mm, addr); + if (!pud) + return false; + else if (pud_huge(*pud)) + return true; + + pmd = pmd_offset(pud, addr); + if (!pmd) + return false; + else if (pmd_huge(*pmd) || pmd_trans_huge(*pmd)) + return true; + + return false; +} + +/* + * Check if pages between 'addr ~ addr+len' can be user swapped. If so, get + * the reference of the pages and return the pages through input parameters + * 'ppages'. + */ +static unsigned long pages_can_be_swapped(struct mm_struct *mm, + unsigned long addr, + unsigned long len, + struct page ***ppages) +{ + struct vm_area_struct *vma; + struct page *page = NULL; + struct page **pages = NULL; + unsigned long addr_end = addr + len; + unsigned long ret; + int i, page_num = 0; + *ppages = NULL; + + + pages = kmalloc(sizeof(struct page *) * (len / PAGE_SIZE), GFP_KERNEL); + if (!pages) + return -ENOMEM; + + while (addr < addr_end) { + vma = find_vma(mm, addr); + if (!vma || addr < vma->vm_start || + !vma_uswap_compatible(vma)) { + ret = -EINVAL; + goto out_err; + } + + if (!(vma->vm_flags & VM_UFFD_MISSING)) { + ret = -EAGAIN; + goto out_err; + } +get_again: + /* + * follow_page will inc page ref, dec the ref after we remap + * the page. 
+ */ + page = follow_page(vma, addr, FOLL_GET); + if (IS_ERR_OR_NULL(page)) { + ret = -ENODEV; + goto out_err; + } + + pages[page_num++] = page; + if (!PageAnon(page) || !PageSwapBacked(page) || + PageHuge(page) || PageSwapCache(page)) { + ret = -EINVAL; + goto out_err; + } + + if (PageTransCompound(page)) { + if (trylock_page(page)) { + if (!split_huge_page(page)) { + unlock_page(page); + put_page(page); + page_num--; + goto get_again; + } else + unlock_page(page); + } + ret = -EINVAL; + goto out_err; + } + + /* + * Check that no O_DIRECT or similar I/O is in progress on the + * page + */ + if (page_mapcount(page) > 1) { + ret = -EBUSY; + goto out_err; + } + addr += PAGE_SIZE; + } + + *ppages = pages; + return 0; + +out_err: + for (i = 0; i < page_num; i++) + put_page(pages[i]); + kfree(pages); + return ret; +} + static void uswap_unmap_anon_page(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, struct page *page, - pmd_t *pmd, pte_t *old_pte) + pmd_t *pmd, pte_t *old_pte, + bool set_to_swp) { struct mmu_notifier_range range; spinlock_t *ptl; @@ -31,6 +171,9 @@ static void uswap_unmap_anon_page(struct mm_struct *mm, goto out_release_unlock; flush_cache_page(vma, addr, pte_pfn(*pte)); *old_pte = ptep_clear_flush(vma, addr, pte); + if (set_to_swp) + set_pte_at(mm, addr, pte, swp_entry_to_pte(swp_entry( + SWP_USERSWAP_ENTRY, page_to_pfn(page)))); dec_mm_counter(mm, MM_ANONPAGES); reliable_page_counter(page, mm, -1); @@ -61,6 +204,182 @@ static void uswap_map_anon_page(struct mm_struct *mm, pte_unmap_unlock(pte, ptl); } +static unsigned long vm_insert_anon_page(struct vm_area_struct *vma, + unsigned long addr, struct page *page) +{ + struct mm_struct *mm = vma->vm_mm; + int ret = 0; + pte_t *pte; + spinlock_t *ptl; + + if (unlikely(anon_vma_prepare(vma))) + return -ENOMEM; + + flush_dcache_page(page); + pte = get_locked_pte(mm, addr, &ptl); + if (!pte) + return -ENOMEM; + if (!pte_none(*pte)) { + ret = -EBUSY; + goto out_unlock; + } + + 
inc_mm_counter(mm, MM_ANONPAGES); + reliable_page_counter(page, mm, 1); + page_add_new_anon_rmap(page, vma, addr, false); + set_pte_at(mm, addr, pte, mk_pte(page, vma->vm_page_prot)); + +out_unlock: + pte_unmap_unlock(pte, ptl); + return ret; +} + +static void uswapout_recover(struct mm_struct *mm, + unsigned long old_addr_start, unsigned long len, + struct page **pages, unsigned long new_addr_start, + pte_t *ptes) +{ + unsigned long unmap_old_addr = old_addr_start; + unsigned long unmap_new_addr = new_addr_start; + struct page *page; + pmd_t *old_pmd, *new_pmd; + pte_t pte, new_pte; + int i; + + for (i = 0; i < len; i++) { + page = pages[i]; + pte = ptes[i]; + new_pmd = mm_find_pmd(mm, unmap_new_addr); + old_pmd = mm_find_pmd(mm, unmap_old_addr); + + uswap_unmap_anon_page(mm, find_vma(mm, unmap_new_addr), + unmap_new_addr, page, new_pmd, &new_pte, + false); + uswap_map_anon_page(mm, find_vma(mm, unmap_old_addr), + unmap_old_addr, page, old_pmd, pte); + unmap_old_addr += PAGE_SIZE; + unmap_new_addr += PAGE_SIZE; + } + if (pte_val(ptes[len]) != 0) { + page = pages[len]; + pte = ptes[len]; + old_pmd = mm_find_pmd(mm, unmap_old_addr); + + uswap_map_anon_page(mm, find_vma(mm, unmap_old_addr), + unmap_old_addr, page, old_pmd, pte); + get_page(page); + } +} + +/* unmap the pages between 'addr ~ addr+len' and remap them to a new address */ +static unsigned long do_user_swap(struct mm_struct *mm, + unsigned long old_addr_start, + unsigned long len, struct page **pages, + unsigned long new_addr_start) +{ + struct vm_area_struct *old_vma, *new_vma; + unsigned long old_addr = old_addr_start; + unsigned long new_addr = new_addr_start; + struct page *page; + pmd_t *pmd; + pte_t old_pte, *ptes; + bool pages_dirty = false; + int i = 0, j; + int ret; + + ptes = kmalloc(sizeof(pte_t) * (len / PAGE_SIZE), GFP_KERNEL); + if (!ptes) + return -ENOMEM; + memset(ptes, 0, sizeof(pte_t) * (len / PAGE_SIZE)); + lru_add_drain(); + for (j = 0; j < len; j += PAGE_SIZE) { + page = pages[i]; + ret = -EINVAL; 
+ if (!page) + goto out_recover; + if (is_thp_or_huge(mm, new_addr)) + goto out_recover; + old_vma = find_vma(mm, old_addr); + if (!old_vma || old_addr < old_vma->vm_start) + goto out_recover; + new_vma = find_vma(mm, new_addr); + if (!new_vma || new_addr < new_vma->vm_start) + goto out_recover; + + ret = -EACCES; + if (pgprot_val(old_vma->vm_page_prot) != + pgprot_val(new_vma->vm_page_prot)) + goto out_recover; + + ret = -ENXIO; + pmd = mm_find_pmd(mm, old_addr); + if (!pmd) + goto out_recover; + uswap_unmap_anon_page(mm, old_vma, old_addr, page, pmd, + &old_pte, true); + ptes[i] = old_pte; + if (pte_dirty(old_pte) || PageDirty(page)) + pages_dirty = true; + put_page(page); + + ret = vm_insert_anon_page(new_vma, new_addr, page); + if (ret) + goto out_recover; + get_page(page); + + old_addr += PAGE_SIZE; + new_addr += PAGE_SIZE; + i++; + } + + if (pages_dirty) + new_addr_start = new_addr_start | USWAP_PAGES_DIRTY; + kfree(ptes); + return new_addr_start; + +out_recover: + uswapout_recover(mm, old_addr_start, i, pages, new_addr_start, ptes); + kfree(ptes); + return ret; +} + + +/* + * When flags is MREMAP_USWAP_SET_PTE, uswap_mremap() is called in syscall + * mremap. + * Unmap the pages between 'addr ~addr+old_len' and remap them to 'new_addr + * ~ new_addr+new_len'. Set the pte of old_addr to SWP_USERSWAP_ENTRY. 
+ */ +unsigned long uswap_mremap(unsigned long old_addr, unsigned long old_len, + unsigned long new_addr, unsigned long new_len) +{ + struct page **pages = NULL; + struct mm_struct *mm = current->mm; + unsigned long len = old_len; + unsigned long ret = -EINVAL; + int i; + + if (!len || old_len != new_len || offset_in_page(old_addr) || + offset_in_page(new_addr) || (len % PAGE_SIZE)) + return ret; + + down_read(&mm->mmap_lock); + ret = pages_can_be_swapped(mm, old_addr, len, &pages); + if (ret) { + up_read(&mm->mmap_lock); + return ret; + } + + ret = do_user_swap(mm, old_addr, len, pages, new_addr); + up_read(&mm->mmap_lock); + /* follow_page() above increased the reference */ + for (i = 0; i < len / PAGE_SIZE; i++) + if (pages[i]) + put_page(pages[i]); + kfree(pages); + return ret; +} + int mfill_atomic_pte_nocopy(struct mm_struct *mm, pmd_t *dst_pmd, struct vm_area_struct *dst_vma, @@ -90,7 +409,8 @@ int mfill_atomic_pte_nocopy(struct mm_struct *mm, ret = -ENXIO; goto out_put_page; } - uswap_unmap_anon_page(mm, src_vma, src_addr, page, src_pmd, &src_pte); + uswap_unmap_anon_page(mm, src_vma, src_addr, page, src_pmd, &src_pte, + false); if (dst_vma->vm_flags & VM_USWAP) ClearPageDirty(page); -- Gitee From 010932f53f5ccd3084552e34e3f909a231af5947 Mon Sep 17 00:00:00 2001 From: ZhangPeng Date: Thu, 18 May 2023 19:16:15 +0800 Subject: [PATCH 03/14] userswap: fix VM_BUG_ON() in handle_userfault() hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I6CAIM -------------------------------- When CONFIG_VM_BUG_ON=y and userswap feature is used, there is a kernel BUG in handle_userfault(). VM_BUG_ON() didn't allow more than one reason flag. Fix this by skipping VM_BUG_ON() if reason is VM_UFFD_MISSING|VM_USWAP. 
Signed-off-by: ZhangPeng --- fs/userfaultfd.c | 4 ++++ include/linux/userswap.h | 12 ++++++++++++ 2 files changed, 16 insertions(+) diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index ef51ed87ef38..4b20f0fd949e 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -406,8 +406,12 @@ vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason) BUG_ON(ctx->mm != mm); +#ifdef CONFIG_USERSWAP + VM_BUG_ON(uswap_vm_flag_bug_on(reason)); +#else VM_BUG_ON(reason & ~(VM_UFFD_MISSING|VM_UFFD_WP)); VM_BUG_ON(!(reason & VM_UFFD_MISSING) ^ !!(reason & VM_UFFD_WP)); +#endif if (ctx->features & UFFD_FEATURE_SIGBUS) goto out; diff --git a/include/linux/userswap.h b/include/linux/userswap.h index 82cc79584e43..6c96cef2ec9b 100644 --- a/include/linux/userswap.h +++ b/include/linux/userswap.h @@ -47,6 +47,18 @@ static inline bool uswap_validate_mremap_flags(unsigned long flags) return true; } +/* When CONFIG_USERSWAP=y, VM_UFFD_MISSING|VM_USWAP is right; + * 0 or > 1 flags set is a bug; we expect exactly 1. + */ +static inline bool uswap_vm_flag_bug_on(unsigned long reason) +{ + if (reason & ~(VM_UFFD_MISSING | VM_UFFD_WP | VM_USWAP)) + return true; + if (reason & VM_USWAP) + return !(reason & VM_UFFD_MISSING) || reason & ~(VM_USWAP|VM_UFFD_MISSING); + return !(reason & VM_UFFD_MISSING) ^ !!(reason & VM_UFFD_WP); +} + #endif /* CONFIG_USERSWAP */ #endif /* _LINUX_USERSWAP_H */ -- Gitee From ef3a1632f95a077928a51b2f9aae7119d1b9e1a0 Mon Sep 17 00:00:00 2001 From: ZhangPeng Date: Thu, 18 May 2023 19:16:16 +0800 Subject: [PATCH 04/14] userswap: introduce new flag to determine the first page fault hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I6CAIM -------------------------------- Introduce new flag to determine the first page fault. 
Signed-off-by: ZhangPeng --- fs/userfaultfd.c | 4 ++++ include/linux/userswap.h | 7 +++++++ include/uapi/linux/userfaultfd.h | 1 + mm/memory.c | 14 ++++++++++++++ 4 files changed, 26 insertions(+) diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 4b20f0fd949e..8a916c96d967 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -485,6 +485,10 @@ vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason) uwq.wq.private = current; uwq.msg = userfault_msg(vmf->address, vmf->flags, reason, ctx->features); +#ifdef CONFIG_USERSWAP + if (reason & VM_USWAP && pte_none(vmf->orig_pte)) + uwq.msg.arg.pagefault.flags |= UFFD_PAGEFAULT_FLAG_FPF; +#endif uwq.ctx = ctx; uwq.waken = false; diff --git a/include/linux/userswap.h b/include/linux/userswap.h index 6c96cef2ec9b..29a3bead95d0 100644 --- a/include/linux/userswap.h +++ b/include/linux/userswap.h @@ -59,6 +59,13 @@ static inline bool uswap_vm_flag_bug_on(unsigned long reason) return !(reason & VM_UFFD_MISSING) ^ !!(reason & VM_UFFD_WP); } +static inline bool uswap_missing(struct vm_area_struct *vma) +{ + if (vma->vm_flags & VM_USWAP && vma->vm_flags & VM_UFFD_MISSING) + return true; + return false; +} + #endif /* CONFIG_USERSWAP */ #endif /* _LINUX_USERSWAP_H */ diff --git a/include/uapi/linux/userfaultfd.h b/include/uapi/linux/userfaultfd.h index 4de57e12cdff..b8689050455d 100644 --- a/include/uapi/linux/userfaultfd.h +++ b/include/uapi/linux/userfaultfd.h @@ -127,6 +127,7 @@ struct uffd_msg { /* flags for UFFD_EVENT_PAGEFAULT */ #define UFFD_PAGEFAULT_FLAG_WRITE (1<<0) /* If this was a write fault */ #define UFFD_PAGEFAULT_FLAG_WP (1<<1) /* If reason is VM_UFFD_WP */ +#define UFFD_PAGEFAULT_FLAG_FPF (1<<10) /* If this was the first page fault */ struct uffdio_api { /* userland asks for an API number and the features to enable */ diff --git a/mm/memory.c b/mm/memory.c index 8f7d4531c763..ed017586db1f 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -73,6 +73,7 @@ #include #include #include +#include #include 
@@ -3689,6 +3690,12 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf) if (ret) goto unlock; /* Deliver the page fault to userland, check inside PT lock */ +#ifdef CONFIG_USERSWAP + if (uswap_missing(vma)) { + pte_unmap_unlock(vmf->pte, vmf->ptl); + return handle_userfault(vmf, VM_UFFD_MISSING|VM_USWAP); + } +#endif if (userfaultfd_missing(vma)) { pte_unmap_unlock(vmf->pte, vmf->ptl); return handle_userfault(vmf, VM_UFFD_MISSING); @@ -3731,6 +3738,13 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf) goto release; /* Deliver the page fault to userland, check inside PT lock */ +#ifdef CONFIG_USERSWAP + if (uswap_missing(vma)) { + pte_unmap_unlock(vmf->pte, vmf->ptl); + put_page(page); + return handle_userfault(vmf, VM_UFFD_MISSING | VM_USWAP); + } +#endif if (userfaultfd_missing(vma)) { pte_unmap_unlock(vmf->pte, vmf->ptl); put_page(page); -- Gitee From 2ca987f14a442c90d814ce26a779eeb4e36b3fa2 Mon Sep 17 00:00:00 2001 From: ZhangPeng Date: Thu, 18 May 2023 19:16:17 +0800 Subject: [PATCH 05/14] userswap: provide cpu info in userfault msg hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I6CAIM -------------------------------- The uffd_msg.reserved3 field is used to transfer the CPU information of the PF. 
Signed-off-by: ZhangPeng --- fs/userfaultfd.c | 3 +++ include/linux/userswap.h | 6 ++++++ 2 files changed, 9 insertions(+) diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 8a916c96d967..d27b66eb5aa0 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -218,6 +218,9 @@ static inline struct uffd_msg userfault_msg(unsigned long address, msg.arg.pagefault.flags |= UFFD_PAGEFAULT_FLAG_WP; if (features & UFFD_FEATURE_THREAD_ID) msg.arg.pagefault.feat.ptid = task_pid_vnr(current); +#ifdef CONFIG_USERSWAP + uswap_get_cpu_id(reason, &msg); +#endif return msg; } diff --git a/include/linux/userswap.h b/include/linux/userswap.h index 29a3bead95d0..9d1d0ee2e9d7 100644 --- a/include/linux/userswap.h +++ b/include/linux/userswap.h @@ -66,6 +66,12 @@ static inline bool uswap_missing(struct vm_area_struct *vma) return false; } +static inline void uswap_get_cpu_id(unsigned long reason, struct uffd_msg *msg) +{ + if (reason & VM_USWAP) + msg->reserved3 = smp_processor_id(); +} + #endif /* CONFIG_USERSWAP */ #endif /* _LINUX_USERSWAP_H */ -- Gitee From 4a55c5b44b20c0504c443c69f49f2e173c1ed031 Mon Sep 17 00:00:00 2001 From: ZhangPeng Date: Thu, 18 May 2023 19:16:18 +0800 Subject: [PATCH 06/14] userswap: move userswap feature code into mm/userswap.c hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I6CAIM -------------------------------- This patch moves the code related to enable_userswap and CONFIG_USERSWAP to mm/userswap.c. This allows for better encapsulation and easier maintenance. 
Signed-off-by: ZhangPeng --- fs/userfaultfd.c | 37 +++-------------------- include/linux/userswap.h | 18 +++++++++++ mm/memory.c | 16 +--------- mm/userswap.c | 65 ++++++++++++++++++++++++++++++++++++++++ 4 files changed, 88 insertions(+), 48 deletions(-) diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index d27b66eb5aa0..1222e507fade 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -335,8 +335,7 @@ static inline bool userfaultfd_must_wait(struct userfaultfd_ctx *ctx, * changes under us. */ #ifdef CONFIG_USERSWAP - if ((reason & VM_USWAP) && (!pte_present(*pte))) - ret = true; + uswap_must_wait(reason, *pte, &ret); #endif if (pte_none(*pte)) ret = true; @@ -875,8 +874,7 @@ static int userfaultfd_release(struct inode *inode, struct file *file) for (vma = mm->mmap; vma; vma = vma->vm_next) { userfault_flags = VM_UFFD_MISSING | VM_UFFD_WP; #ifdef CONFIG_USERSWAP - if (enable_userswap) - userfault_flags |= VM_USWAP; + uswap_release(&userfault_flags); #endif cond_resched(); BUG_ON(!!vma->vm_userfaultfd_ctx.ctx ^ @@ -1297,26 +1295,8 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, goto out; vm_flags = 0; #ifdef CONFIG_USERSWAP - /* - * register the whole vma overlapping with the address range to avoid - * splitting the vma. 
- */ - if (enable_userswap && (uffdio_register.mode & UFFDIO_REGISTER_MODE_USWAP)) { - uffdio_register.mode &= ~UFFDIO_REGISTER_MODE_USWAP; - if (!uffdio_register.mode) - goto out; - vm_flags |= VM_USWAP; - end = uffdio_register.range.start + uffdio_register.range.len - 1; - vma = find_vma(mm, uffdio_register.range.start); - if (!vma) - goto out; - uffdio_register.range.start = vma->vm_start; - - vma = find_vma(mm, end); - if (!vma) - goto out; - uffdio_register.range.len = vma->vm_end - uffdio_register.range.start; - } + if (!uswap_register(&uffdio_register, &vm_flags, mm)) + goto out; #endif if (uffdio_register.mode & ~(UFFDIO_REGISTER_MODE_MISSING| UFFDIO_REGISTER_MODE_WP)) @@ -2041,15 +2021,6 @@ SYSCALL_DEFINE1(userfaultfd, int, flags) return fd; } -#ifdef CONFIG_USERSWAP -static int __init enable_userswap_setup(char *str) -{ - enable_userswap = true; - return 1; -} -__setup("enable_userswap", enable_userswap_setup); -#endif - static int __init userfaultfd_init(void) { userfaultfd_ctx_cachep = kmem_cache_create("userfaultfd_ctx_cache", diff --git a/include/linux/userswap.h b/include/linux/userswap.h index 9d1d0ee2e9d7..14826e3d9c2c 100644 --- a/include/linux/userswap.h +++ b/include/linux/userswap.h @@ -28,6 +28,12 @@ int mfill_atomic_pte_nocopy(struct mm_struct *dst_mm, unsigned long uswap_mremap(unsigned long old_addr, unsigned long old_len, unsigned long new_addr, unsigned long new_len); +bool uswap_register(struct uffdio_register *uffdio_register, + unsigned long *vm_flags, struct mm_struct *mm); + +bool do_uswap_page(swp_entry_t entry, struct vm_fault *vmf, + struct vm_area_struct *vma, vm_fault_t *ret); + static inline bool uswap_check_copy_mode(struct vm_area_struct *vma, __u64 mode) { if (!(vma->vm_flags & VM_USWAP) && (mode & UFFDIO_COPY_MODE_DIRECT_MAP)) @@ -72,6 +78,18 @@ static inline void uswap_get_cpu_id(unsigned long reason, struct uffd_msg *msg) msg->reserved3 = smp_processor_id(); } +static inline void uswap_release(unsigned long 
*userfault_flags) +{ + if (enable_userswap) + *userfault_flags |= VM_USWAP; +} + +static inline void uswap_must_wait(unsigned long reason, pte_t pte, bool *ret) +{ + if ((reason & VM_USWAP) && (!pte_present(pte))) + *ret = true; +} + #endif /* CONFIG_USERSWAP */ #endif /* _LINUX_USERSWAP_H */ diff --git a/mm/memory.c b/mm/memory.c index ed017586db1f..5941a4f4ea4b 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3396,22 +3396,8 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) entry = pte_to_swp_entry(vmf->orig_pte); #ifdef CONFIG_USERSWAP - if (swp_type(entry) == SWP_USERSWAP_ENTRY) { - /* print error if we come across a nested fault */ - if (!strncmp(current->comm, "uswap", 5)) { - pr_err("USWAP: fault %lx is triggered by %s\n", - vmf->address, current->comm); - return VM_FAULT_SIGBUS; - } - if (!(vma->vm_flags & VM_UFFD_MISSING)) { - pr_err("USWAP: addr %lx flags %lx is not a user swap page", - vmf->address, vma->vm_flags); - goto skip_uswap; - } - ret = handle_userfault(vmf, VM_UFFD_MISSING | VM_USWAP); + if (!do_uswap_page(entry, vmf, vma, &ret)) return ret; - } -skip_uswap: #endif if (unlikely(non_swap_entry(entry))) { if (is_migration_entry(entry)) { diff --git a/mm/userswap.c b/mm/userswap.c index dd212b1a02e6..384548db9ba2 100644 --- a/mm/userswap.c +++ b/mm/userswap.c @@ -10,6 +10,7 @@ #include #include #include +#include #include "internal.h" @@ -458,3 +459,67 @@ int mfill_atomic_pte_nocopy(struct mm_struct *mm, put_page(page); return ret; } + +/* + * register the whole vma overlapping with the address range to avoid splitting + * the vma. 
+ */ +bool uswap_register(struct uffdio_register *uffdio_register, + unsigned long *vm_flags, struct mm_struct *mm) +{ + struct vm_area_struct *vma; + unsigned long end; + + if (!enable_userswap) + return true; + if (!(uffdio_register->mode & UFFDIO_REGISTER_MODE_USWAP)) + return true; + uffdio_register->mode &= ~UFFDIO_REGISTER_MODE_USWAP; + if (!uffdio_register->mode) + return false; + + end = uffdio_register->range.start + uffdio_register->range.len - 1; + vma = find_vma(mm, uffdio_register->range.start); + if (!vma) + return false; + uffdio_register->range.start = vma->vm_start; + vma = find_vma(mm, end); + if (!vma) + return false; + uffdio_register->range.len = vma->vm_end - uffdio_register->range.start; + + *vm_flags |= VM_USWAP; + + return true; +} + +bool do_uswap_page(swp_entry_t entry, struct vm_fault *vmf, + struct vm_area_struct *vma, vm_fault_t *ret) +{ + if (swp_type(entry) != SWP_USERSWAP_ENTRY) + return true; + + /* print error if we come across a nested fault */ + if (!strncmp(current->comm, "uswap", 5)) { + pr_err("USWAP: fault %lx is triggered by %s\n", vmf->address, + current->comm); + *ret = VM_FAULT_SIGBUS; + return false; + } + + if (!(vma->vm_flags & VM_UFFD_MISSING)) { + pr_err("USWAP: addr %lx flags %lx is not a user swap page", + vmf->address, vma->vm_flags); + return true; + } + + *ret = handle_userfault(vmf, VM_UFFD_MISSING | VM_USWAP); + return false; +} + +static int __init enable_userswap_setup(char *str) +{ + enable_userswap = true; + return 1; +} +__setup("enable_userswap", enable_userswap_setup); -- Gitee From cbf06b7dfc1d1dd9c4f4fad906dbacaf0cdbb1e4 Mon Sep 17 00:00:00 2001 From: ZhangPeng Date: Thu, 18 May 2023 19:16:19 +0800 Subject: [PATCH 07/14] userswap: convert enable_userswap to static key hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I6CAIM -------------------------------- Replace enable_userswap with struct static_key_false userswap_enabled. 
Signed-off-by: ZhangPeng --- include/linux/userswap.h | 6 +++--- mm/userfaultfd.c | 3 ++- mm/userswap.c | 9 ++++++--- 3 files changed, 11 insertions(+), 7 deletions(-) diff --git a/include/linux/userswap.h b/include/linux/userswap.h index 14826e3d9c2c..94d773364618 100644 --- a/include/linux/userswap.h +++ b/include/linux/userswap.h @@ -11,7 +11,7 @@ #ifdef CONFIG_USERSWAP -extern int enable_userswap; +extern struct static_key_false userswap_enabled; /* * In uswap situation, we use the bit 0 of the returned address to indicate @@ -43,7 +43,7 @@ static inline bool uswap_check_copy_mode(struct vm_area_struct *vma, __u64 mode) static inline bool uswap_validate_mremap_flags(unsigned long flags) { - if (!enable_userswap && flags & MREMAP_USWAP_SET_PTE) + if (!static_branch_unlikely(&userswap_enabled) && flags & MREMAP_USWAP_SET_PTE) return false; if (flags & MREMAP_USWAP_SET_PTE && flags & ~MREMAP_USWAP_SET_PTE) return false; @@ -80,7 +80,7 @@ static inline void uswap_get_cpu_id(unsigned long reason, struct uffd_msg *msg) static inline void uswap_release(unsigned long *userfault_flags) { - if (enable_userswap) + if (static_branch_unlikely(&userswap_enabled)) *userfault_flags |= VM_USWAP; } diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index b66abbba13ef..a5de7990f7ff 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -586,7 +586,8 @@ static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm, BUG_ON(pmd_trans_huge(*dst_pmd)); #ifdef CONFIG_USERSWAP - if (dst_vma->vm_flags & VM_USWAP && + if (static_branch_unlikely(&userswap_enabled) && + dst_vma->vm_flags & VM_USWAP && mode & UFFDIO_COPY_MODE_DIRECT_MAP) err = mfill_atomic_pte_nocopy(dst_mm, dst_pmd, dst_vma, dst_addr, src_addr); diff --git a/mm/userswap.c b/mm/userswap.c index 384548db9ba2..85137803dfb1 100644 --- a/mm/userswap.c +++ b/mm/userswap.c @@ -14,7 +14,7 @@ #include "internal.h" -int enable_userswap; +DEFINE_STATIC_KEY_FALSE(userswap_enabled); static bool vma_uswap_compatible(struct 
vm_area_struct *vma) { @@ -470,7 +470,7 @@ bool uswap_register(struct uffdio_register *uffdio_register, struct vm_area_struct *vma; unsigned long end; - if (!enable_userswap) + if (!static_branch_unlikely(&userswap_enabled)) return true; if (!(uffdio_register->mode & UFFDIO_REGISTER_MODE_USWAP)) return true; @@ -496,6 +496,9 @@ bool uswap_register(struct uffdio_register *uffdio_register, bool do_uswap_page(swp_entry_t entry, struct vm_fault *vmf, struct vm_area_struct *vma, vm_fault_t *ret) { + if (!static_branch_unlikely(&userswap_enabled)) + return true; + if (swp_type(entry) != SWP_USERSWAP_ENTRY) return true; @@ -519,7 +522,7 @@ bool do_uswap_page(swp_entry_t entry, struct vm_fault *vmf, static int __init enable_userswap_setup(char *str) { - enable_userswap = true; + static_branch_enable(&userswap_enabled); return 1; } __setup("enable_userswap", enable_userswap_setup); -- Gitee From 2e04865a3e30cdfd1ce439ced5a292eccb138c74 Mon Sep 17 00:00:00 2001 From: ZhangPeng Date: Thu, 18 May 2023 19:16:20 +0800 Subject: [PATCH 08/14] userswap: fix NULL pointer dereference in uswap_unmap_anon_page() hulk inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I6CAIM -------------------------------- If old_pte is NULL, *old_pte will result in a null pointer dereference. Fix this by adding a NULL check for old_pte. 
Signed-off-by: ZhangPeng --- mm/userswap.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/mm/userswap.c b/mm/userswap.c index 85137803dfb1..9e2d063952ff 100644 --- a/mm/userswap.c +++ b/mm/userswap.c @@ -162,7 +162,7 @@ static void uswap_unmap_anon_page(struct mm_struct *mm, { struct mmu_notifier_range range; spinlock_t *ptl; - pte_t *pte; + pte_t *pte, _old_pte; mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, vma->vm_mm, addr, addr + PAGE_SIZE); @@ -171,7 +171,7 @@ static void uswap_unmap_anon_page(struct mm_struct *mm, if (pte_none(*pte)) goto out_release_unlock; flush_cache_page(vma, addr, pte_pfn(*pte)); - *old_pte = ptep_clear_flush(vma, addr, pte); + _old_pte = ptep_clear_flush(vma, addr, pte); if (set_to_swp) set_pte_at(mm, addr, pte, swp_entry_to_pte(swp_entry( SWP_USERSWAP_ENTRY, page_to_pfn(page)))); @@ -184,6 +184,8 @@ static void uswap_unmap_anon_page(struct mm_struct *mm, pte_unmap_unlock(pte, ptl); mmu_notifier_invalidate_range_end(&range); page->mapping = NULL; + if (old_pte) + *old_pte = _old_pte; } static void uswap_map_anon_page(struct mm_struct *mm, -- Gitee From 8c50966532f3ffbcba44c8003cf40a2cc86be1e9 Mon Sep 17 00:00:00 2001 From: ZhangPeng Date: Thu, 18 May 2023 19:16:21 +0800 Subject: [PATCH 09/14] userswap: split uswap_register() to validate address ranges hulk inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I6CAIM -------------------------------- Split uswap_register() into uswap_register() and uswap_adjust_uffd_range(). Before validate_range(), use uswap_register() to handle uswap mode. After validate_range(), use uswap_adjust_uffd_range() to change address range to VMA range, which could reduce fragmentation caused by VMA splitting. By splitting uswap_register(), we could prevent the userswap registration of invalid input address ranges. 
Signed-off-by: ZhangPeng --- fs/userfaultfd.c | 14 ++++++++++++-- include/linux/userswap.h | 6 ++++-- mm/userswap.c | 26 +++++++++++++++++--------- 3 files changed, 33 insertions(+), 13 deletions(-) diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 1222e507fade..ee20b9620ac2 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -1282,6 +1282,9 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, bool found; bool basic_ioctls; unsigned long start, end, vma_end; +#ifdef CONFIG_USERSWAP + bool uswap_mode = false; +#endif user_uffdio_register = (struct uffdio_register __user *) arg; @@ -1295,7 +1298,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, goto out; vm_flags = 0; #ifdef CONFIG_USERSWAP - if (!uswap_register(&uffdio_register, &vm_flags, mm)) + if (!uswap_register(&uffdio_register, &uswap_mode)) goto out; #endif if (uffdio_register.mode & ~(UFFDIO_REGISTER_MODE_MISSING| @@ -1310,7 +1313,14 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, uffdio_register.range.len); if (ret) goto out; - +#ifdef CONFIG_USERSWAP + if (uswap_mode) + if (!uswap_adjust_uffd_range(&uffdio_register, &vm_flags, + mm)) { + ret = -EINVAL; + goto out; + } +#endif start = uffdio_register.range.start; end = start + uffdio_register.range.len; diff --git a/include/linux/userswap.h b/include/linux/userswap.h index 94d773364618..611f84bf3569 100644 --- a/include/linux/userswap.h +++ b/include/linux/userswap.h @@ -28,8 +28,10 @@ int mfill_atomic_pte_nocopy(struct mm_struct *dst_mm, unsigned long uswap_mremap(unsigned long old_addr, unsigned long old_len, unsigned long new_addr, unsigned long new_len); -bool uswap_register(struct uffdio_register *uffdio_register, - unsigned long *vm_flags, struct mm_struct *mm); +bool uswap_register(struct uffdio_register *uffdio_register, bool *uswap_mode); + +bool uswap_adjust_uffd_range(struct uffdio_register *uffdio_register, + unsigned long *vm_flags, struct mm_struct *mm); bool do_uswap_page(swp_entry_t entry, 
struct vm_fault *vmf, struct vm_area_struct *vma, vm_fault_t *ret); diff --git a/mm/userswap.c b/mm/userswap.c index 9e2d063952ff..ff98190eb258 100644 --- a/mm/userswap.c +++ b/mm/userswap.c @@ -11,6 +11,7 @@ #include #include #include +#include #include "internal.h" @@ -462,16 +463,8 @@ int mfill_atomic_pte_nocopy(struct mm_struct *mm, return ret; } -/* - * register the whole vma overlapping with the address range to avoid splitting - * the vma. - */ -bool uswap_register(struct uffdio_register *uffdio_register, - unsigned long *vm_flags, struct mm_struct *mm) +bool uswap_register(struct uffdio_register *uffdio_register, bool *uswap_mode) { - struct vm_area_struct *vma; - unsigned long end; - if (!static_branch_unlikely(&userswap_enabled)) return true; if (!(uffdio_register->mode & UFFDIO_REGISTER_MODE_USWAP)) @@ -479,7 +472,22 @@ bool uswap_register(struct uffdio_register *uffdio_register, uffdio_register->mode &= ~UFFDIO_REGISTER_MODE_USWAP; if (!uffdio_register->mode) return false; + *uswap_mode = true; + return true; +} +/* + * register the whole vma overlapping with the address range to avoid splitting + * the vma which could reduce fragmentation. + */ +bool uswap_adjust_uffd_range(struct uffdio_register *uffdio_register, + unsigned long *vm_flags, struct mm_struct *mm) +{ + struct vm_area_struct *vma; + unsigned long end; + + if (!static_branch_unlikely(&userswap_enabled)) + return true; end = uffdio_register->range.start + uffdio_register->range.len - 1; vma = find_vma(mm, uffdio_register->range.start); if (!vma) -- Gitee From 74c0e7cdb8eea31453f14cb4a9b14f2aeaa3dcdc Mon Sep 17 00:00:00 2001 From: ZhangPeng Date: Thu, 18 May 2023 19:16:22 +0800 Subject: [PATCH 10/14] userswap: fix some type and logical bugs hulk inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I6CAIM -------------------------------- As follows, fix some type and logical bugs. 
1) The type of index variable is changed from int to unsigned long to support large memory registration. 2) Fix the bug that USWAP_PAGES_DIRTY does not take effect. 3) Take the mmap_read_lock() when using the VMA in uswap_adjust_uffd_range(). 4) Do some code refactoring and cleanup. Signed-off-by: ZhangPeng --- fs/userfaultfd.c | 11 +++++------ include/linux/userswap.h | 22 ++++++++++++++-------- mm/userswap.c | 22 ++++++++++++++-------- 3 files changed, 33 insertions(+), 22 deletions(-) diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index ee20b9620ac2..c8ec0227f340 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -1314,12 +1314,11 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, if (ret) goto out; #ifdef CONFIG_USERSWAP - if (uswap_mode) - if (!uswap_adjust_uffd_range(&uffdio_register, &vm_flags, - mm)) { - ret = -EINVAL; - goto out; - } + if (uswap_mode && !uswap_adjust_uffd_range(&uffdio_register, + &vm_flags, mm)) { + ret = -EINVAL; + goto out; + } #endif start = uffdio_register.range.start; end = start + uffdio_register.range.len; diff --git a/include/linux/userswap.h b/include/linux/userswap.h index 611f84bf3569..a3b43e0b2176 100644 --- a/include/linux/userswap.h +++ b/include/linux/userswap.h @@ -45,13 +45,18 @@ static inline bool uswap_check_copy_mode(struct vm_area_struct *vma, __u64 mode) static inline bool uswap_validate_mremap_flags(unsigned long flags) { - if (!static_branch_unlikely(&userswap_enabled) && flags & MREMAP_USWAP_SET_PTE) - return false; - if (flags & MREMAP_USWAP_SET_PTE && flags & ~MREMAP_USWAP_SET_PTE) - return false; - if (flags & ~(MREMAP_FIXED | MREMAP_MAYMOVE | MREMAP_DONTUNMAP | - MREMAP_USWAP_SET_PTE)) - return false; + if (static_branch_unlikely(&userswap_enabled)) { + if (flags & MREMAP_USWAP_SET_PTE && + flags & ~MREMAP_USWAP_SET_PTE) + return false; + if (flags & ~(MREMAP_FIXED | MREMAP_MAYMOVE | + MREMAP_DONTUNMAP | MREMAP_USWAP_SET_PTE)) + return false; + } else { + if (flags & ~(MREMAP_FIXED | 
MREMAP_MAYMOVE | + MREMAP_DONTUNMAP)) + return false; + } return true; } @@ -63,7 +68,8 @@ static inline bool uswap_vm_flag_bug_on(unsigned long reason) if (reason & ~(VM_UFFD_MISSING | VM_UFFD_WP | VM_USWAP)) return true; if (reason & VM_USWAP) - return !(reason & VM_UFFD_MISSING) || reason & ~(VM_USWAP|VM_UFFD_MISSING); + return !(reason & VM_UFFD_MISSING) || + reason & ~(VM_USWAP|VM_UFFD_MISSING); return !(reason & VM_UFFD_MISSING) ^ !!(reason & VM_UFFD_WP); } diff --git a/mm/userswap.c b/mm/userswap.c index ff98190eb258..55fb88044298 100644 --- a/mm/userswap.c +++ b/mm/userswap.c @@ -82,7 +82,7 @@ static unsigned long pages_can_be_swapped(struct mm_struct *mm, struct page **pages = NULL; unsigned long addr_end = addr + len; unsigned long ret; - int i, page_num = 0; + unsigned long i, page_num = 0; *ppages = NULL; @@ -248,7 +248,7 @@ static void uswapout_recover(struct mm_struct *mm, struct page *page; pmd_t *old_pmd, *new_pmd; pte_t pte; - int i; + unsigned long i; for (i = 0; i < len; i++) { page = pages[i]; @@ -288,7 +288,7 @@ static unsigned long do_user_swap(struct mm_struct *mm, pmd_t *pmd; pte_t old_pte, *ptes; bool pages_dirty = false; - int i = 0, j; + unsigned long i = 0, j; int ret; ptes = kmalloc(sizeof(pte_t) * (len / PAGE_SIZE), GFP_KERNEL); @@ -337,7 +337,7 @@ static unsigned long do_user_swap(struct mm_struct *mm, } if (pages_dirty) - new_addr = new_addr | USWAP_PAGES_DIRTY; + new_addr_start = new_addr_start | USWAP_PAGES_DIRTY; kfree(ptes); return new_addr_start; @@ -361,7 +361,7 @@ unsigned long uswap_mremap(unsigned long old_addr, unsigned long old_len, struct mm_struct *mm = current->mm; unsigned long len = old_len; unsigned long ret = -EINVAL; - int i; + unsigned long i; if (!len || old_len != new_len || offset_in_page(old_addr) || (len % PAGE_SIZE)) @@ -485,22 +485,28 @@ bool uswap_adjust_uffd_range(struct uffdio_register *uffdio_register, { struct vm_area_struct *vma; unsigned long end; + bool ret = false; if 
(!static_branch_unlikely(&userswap_enabled)) return true; end = uffdio_register->range.start + uffdio_register->range.len - 1; + + mmap_read_lock(mm); vma = find_vma(mm, uffdio_register->range.start); if (!vma) - return false; + goto out_unlock; uffdio_register->range.start = vma->vm_start; vma = find_vma(mm, end); if (!vma) - return false; + goto out_unlock; uffdio_register->range.len = vma->vm_end - uffdio_register->range.start; *vm_flags |= VM_USWAP; - return true; + ret = true; +out_unlock: + mmap_read_unlock(mm); + return ret; } bool do_uswap_page(swp_entry_t entry, struct vm_fault *vmf, -- Gitee From 037142189d0fe2d51afe31acc7af54e8592161ae Mon Sep 17 00:00:00 2001 From: ZhangPeng Date: Thu, 18 May 2023 19:16:23 +0800 Subject: [PATCH 11/14] userswap: add checks for input addresses hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I6CAIM -------------------------------- Add checks for new_addr in uswap_mremap() and src_addr in uswap_check_copy_mode(), including user mode checks, overlapping checks, etc. 
Signed-off-by: ZhangPeng --- include/linux/userswap.h | 18 +++++++++++++++--- mm/userfaultfd.c | 2 +- mm/userswap.c | 9 ++++++++- 3 files changed, 24 insertions(+), 5 deletions(-) diff --git a/include/linux/userswap.h b/include/linux/userswap.h index a3b43e0b2176..43b419f9813f 100644 --- a/include/linux/userswap.h +++ b/include/linux/userswap.h @@ -36,10 +36,22 @@ bool uswap_adjust_uffd_range(struct uffdio_register *uffdio_register, bool do_uswap_page(swp_entry_t entry, struct vm_fault *vmf, struct vm_area_struct *vma, vm_fault_t *ret); -static inline bool uswap_check_copy_mode(struct vm_area_struct *vma, __u64 mode) +static inline bool uswap_check_copy(struct vm_area_struct *vma, + unsigned long src_addr, + unsigned long len, __u64 mode) { - if (!(vma->vm_flags & VM_USWAP) && (mode & UFFDIO_COPY_MODE_DIRECT_MAP)) - return false; + if (vma->vm_flags & VM_USWAP) { + if (!(mode & UFFDIO_COPY_MODE_DIRECT_MAP)) + return false; + if (offset_in_page(src_addr)) + return false; + if (src_addr > TASK_SIZE || src_addr > TASK_SIZE - len) + return false; + } else { + if (mode & UFFDIO_COPY_MODE_DIRECT_MAP) + return false; + } + return true; } diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index a5de7990f7ff..070359ee383a 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -512,7 +512,7 @@ static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm, err = -EINVAL; #ifdef CONFIG_USERSWAP - if (!uswap_check_copy_mode(dst_vma, mode)) + if (!uswap_check_copy(dst_vma, src_addr, len, mode)) goto out_unlock; #endif /* diff --git a/mm/userswap.c b/mm/userswap.c index 55fb88044298..3fcec18a354b 100644 --- a/mm/userswap.c +++ b/mm/userswap.c @@ -364,7 +364,14 @@ unsigned long uswap_mremap(unsigned long old_addr, unsigned long old_len, unsigned long i; if (!len || old_len != new_len || offset_in_page(old_addr) || - (len % PAGE_SIZE)) + offset_in_page(new_addr) || (len % PAGE_SIZE)) + return ret; + + if (new_len > TASK_SIZE || new_addr > TASK_SIZE - new_len) + return ret; 
+ /* Ensure the old/new locations do not overlap */ + if (old_addr + old_len > new_addr && new_addr + new_len > old_addr) return ret; down_read(&mm->mmap_lock); -- Gitee From 62d6e76aedc3e9bfcabd083801c8768d978634e0 Mon Sep 17 00:00:00 2001 From: ZhangPeng Date: Thu, 18 May 2023 19:16:24 +0800 Subject: [PATCH 12/14] userswap: add VMA checks for register address hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I6CAIM -------------------------------- Add VMA checks for the registered address range to make sure that it has a corresponding VMA. Signed-off-by: ZhangPeng --- mm/userswap.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/mm/userswap.c b/mm/userswap.c index 3fcec18a354b..c9ac7e6389a7 100644 --- a/mm/userswap.c +++ b/mm/userswap.c @@ -500,13 +500,12 @@ bool uswap_adjust_uffd_range(struct uffdio_register *uffdio_register, mmap_read_lock(mm); vma = find_vma(mm, uffdio_register->range.start); - if (!vma) + if (!vma || vma->vm_start >= end) goto out_unlock; uffdio_register->range.start = vma->vm_start; vma = find_vma(mm, end); - if (!vma) - goto out_unlock; - uffdio_register->range.len = vma->vm_end - uffdio_register->range.start; + if (vma && end >= vma->vm_start) + uffdio_register->range.len = vma->vm_end - uffdio_register->range.start; *vm_flags |= VM_USWAP; -- Gitee From 790b46fad269981b19467eca7ad1979ff45aa5af Mon Sep 17 00:00:00 2001 From: ZhangPeng Date: Thu, 18 May 2023 19:16:25 +0800 Subject: [PATCH 13/14] userswap: check read and write permissions for swap-out pages hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I6CAIM -------------------------------- Check the VM_READ and VM_WRITE flags of vma->vm_flags to determine whether the read and write permissions of the swap-out page VA are consistent with those of the swap-out buffer VA. If they are inconsistent, the swap operation will fail. 
Signed-off-by: ZhangPeng --- mm/userswap.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/userswap.c b/mm/userswap.c index c9ac7e6389a7..dcc608e0d8fe 100644 --- a/mm/userswap.c +++ b/mm/userswap.c @@ -311,8 +311,8 @@ static unsigned long do_user_swap(struct mm_struct *mm, goto out_recover; ret = -EACCES; - if (pgprot_val(old_vma->vm_page_prot) != - pgprot_val(new_vma->vm_page_prot)) + if (!(old_vma->vm_flags & VM_WRITE) && + (new_vma->vm_flags & VM_WRITE)) goto out_recover; ret = -ENXIO; -- Gitee From d042e6037ee42a6743f77ef51ba7a804722f4df7 Mon Sep 17 00:00:00 2001 From: ZhangPeng Date: Thu, 18 May 2023 19:16:26 +0800 Subject: [PATCH 14/14] userswap: add user mode check for swap-out VA hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I6CAIM -------------------------------- Add a user-mode check for the swap-out VA to make sure that it is a user-mode address. Signed-off-by: ZhangPeng --- mm/userswap.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mm/userswap.c b/mm/userswap.c index dcc608e0d8fe..2d47f6ed9f91 100644 --- a/mm/userswap.c +++ b/mm/userswap.c @@ -367,7 +367,8 @@ unsigned long uswap_mremap(unsigned long old_addr, unsigned long old_len, offset_in_page(new_addr) || (len % PAGE_SIZE)) return ret; - if (new_len > TASK_SIZE || new_addr > TASK_SIZE - new_len) + if (new_len > TASK_SIZE || new_addr > TASK_SIZE - new_len || + old_addr > TASK_SIZE - old_len) return ret; /* Ensure the old/new locations do not overlap */ -- Gitee