/*
 * Patch summary (reviewer note; this text sits ahead of the first
 * "diff --git" header and is not part of any hunk):
 *
 *  - struct userfaultfd_ctx and its locking-order comment are removed from
 *    fs/userfaultfd.c and added to include/linux/userfaultfd_k.h, where the
 *    new inline helper dereferences ctx->features.
 *  - vma_has_uffd_without_event_remap(vma) is true when the VMA carries a
 *    userfaultfd ctx whose features do not include UFFD_FEATURE_EVENT_REMAP.
 *    The variant placed just before the CONFIG_USERFAULTFD #endif always
 *    returns false (presumably the config-disabled stub).
 *  - move_huge_pmd(): when the helper is true, clear_uffd_wp_pmd() clears the
 *    uffd-wp bit from a present or swap PMD before set_pmd_at().
 *  - move_huge_pte() and move_ptes(): when the helper is true, a uffd-wp
 *    marker entry is not carried over (the destination entry is cleared
 *    instead); present and swap entries have their uffd-wp bit cleared
 *    before being installed at the new address.
 *  - move_normal_pmd() and move_normal_pud(): return false for such VMAs;
 *    per the in-hunk comment this makes move_page_tables() recurse into the
 *    lower page tables, where the clearing above happens.
 */
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index d616b7777eef8ccaef7c8761c3473b1077488262..c371f4aa42b063ac1c01ba7ba4c4051a606f43a8 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -52,45 +52,6 @@ static struct ctl_table vm_userfaultfd_table[] = { static struct kmem_cache *userfaultfd_ctx_cachep __read_mostly; -/* - * Start with fault_pending_wqh and fault_wqh so they're more likely - * to be in the same cacheline. - * - * Locking order: - * fd_wqh.lock - * fault_pending_wqh.lock - * fault_wqh.lock - * event_wqh.lock - * - * To avoid deadlocks, IRQs must be disabled when taking any of the above locks, - * since fd_wqh.lock is taken by aio_poll() while it's holding a lock that's - * also taken in IRQ context. - */ -struct userfaultfd_ctx { - /* waitqueue head for the pending (i.e. not read) userfaults */ - wait_queue_head_t fault_pending_wqh; - /* waitqueue head for the userfaults */ - wait_queue_head_t fault_wqh; - /* waitqueue head for the pseudo fd to wakeup poll/read */ - wait_queue_head_t fd_wqh; - /* waitqueue head for events */ - wait_queue_head_t event_wqh; - /* a refile sequence protected by fault_pending_wqh lock */ - seqcount_spinlock_t refile_seq; - /* pseudo fd refcounting */ - refcount_t refcount; - /* userfaultfd syscall flags */ - unsigned int flags; - /* features requested from the userspace */ - unsigned int features; - /* released */ - bool released; - /* memory mappings are changing because of non-cooperative event */ - atomic_t mmap_changing; - /* mm with one ore more vmas attached to this userfaultfd_ctx */ - struct mm_struct *mm; -}; - struct userfaultfd_fork_ctx { struct userfaultfd_ctx *orig; struct userfaultfd_ctx *new; diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h index aa8e3725f10347bf08304853e710e2f910d68607..791af86a89ccd9ace59b951c64464764ffd5a576 100644 --- a/include/linux/userfaultfd_k.h +++ b/include/linux/userfaultfd_k.h @@ -36,6 +36,45 @@ #define UFFD_SHARED_FCNTL_FLAGS (O_CLOEXEC | 
O_NONBLOCK) #define UFFD_FLAGS_SET (EFD_SHARED_FCNTL_FLAGS) +/* + * Start with fault_pending_wqh and fault_wqh so they're more likely + * to be in the same cacheline. + * + * Locking order: + * fd_wqh.lock + * fault_pending_wqh.lock + * fault_wqh.lock + * event_wqh.lock + * + * To avoid deadlocks, IRQs must be disabled when taking any of the above locks, + * since fd_wqh.lock is taken by aio_poll() while it's holding a lock that's + * also taken in IRQ context. + */ +struct userfaultfd_ctx { + /* waitqueue head for the pending (i.e. not read) userfaults */ + wait_queue_head_t fault_pending_wqh; + /* waitqueue head for the userfaults */ + wait_queue_head_t fault_wqh; + /* waitqueue head for the pseudo fd to wakeup poll/read */ + wait_queue_head_t fd_wqh; + /* waitqueue head for events */ + wait_queue_head_t event_wqh; + /* a refile sequence protected by fault_pending_wqh lock */ + seqcount_spinlock_t refile_seq; + /* pseudo fd refcounting */ + refcount_t refcount; + /* userfaultfd syscall flags */ + unsigned int flags; + /* features requested from the userspace */ + unsigned int features; + /* released */ + bool released; + /* memory mappings are changing because of non-cooperative event */ + atomic_t mmap_changing; + /* mm with one or more vmas attached to this userfaultfd_ctx */ + struct mm_struct *mm; +}; + extern vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason); /* A combined operation mode + behavior flags. 
*/ @@ -184,6 +223,13 @@ static inline bool vma_can_userfault(struct vm_area_struct *vma, vma_is_shmem(vma); } +static inline bool vma_has_uffd_without_event_remap(struct vm_area_struct *vma) +{ + struct userfaultfd_ctx *uffd_ctx = vma->vm_userfaultfd_ctx.ctx; + + return uffd_ctx && (uffd_ctx->features & UFFD_FEATURE_EVENT_REMAP) == 0; +} + extern int dup_userfaultfd(struct vm_area_struct *, struct list_head *); extern void dup_userfaultfd_complete(struct list_head *); void dup_userfaultfd_fail(struct list_head *); @@ -307,6 +353,11 @@ static inline bool userfaultfd_wp_unpopulated(struct vm_area_struct *vma) return false; } +static inline bool vma_has_uffd_without_event_remap(struct vm_area_struct *vma) +{ + return false; +} + #endif /* CONFIG_USERFAULTFD */ static inline bool userfaultfd_wp_use_markers(struct vm_area_struct *vma) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index a28dda79997820f95a241fed211ce489cd80dc19..ccb517f35442757085d7d1886d140bdc0150952e 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2503,6 +2503,16 @@ static pmd_t move_soft_dirty_pmd(pmd_t pmd) return pmd; } +static pmd_t clear_uffd_wp_pmd(pmd_t pmd) +{ + if (pmd_present(pmd)) + pmd = pmd_clear_uffd_wp(pmd); + else if (is_swap_pmd(pmd)) + pmd = pmd_swp_clear_uffd_wp(pmd); + + return pmd; +} + bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr, unsigned long new_addr, pmd_t *old_pmd, pmd_t *new_pmd) { @@ -2541,6 +2551,8 @@ bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr, pgtable_trans_huge_deposit(mm, new_pmd, pgtable); } pmd = move_soft_dirty_pmd(pmd); + if (vma_has_uffd_without_event_remap(vma)) + pmd = clear_uffd_wp_pmd(pmd); set_pmd_at(mm, new_addr, new_pmd, pmd); if (force_flush) flush_pmd_tlb_range(vma, old_addr, old_addr + PMD_SIZE); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 1cd1196e0d66fba77811cf211c9e4f22c9d04b73..2d21e78d243f727b9b0e7a51853e1b0a2646ac15 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -5542,6 +5542,7 @@ 
static void move_huge_pte(struct vm_area_struct *vma, unsigned long old_addr, unsigned long new_addr, pte_t *src_pte, pte_t *dst_pte, unsigned long sz) { + bool need_clear_uffd_wp = vma_has_uffd_without_event_remap(vma); struct hstate *h = hstate_vma(vma); struct mm_struct *mm = vma->vm_mm; spinlock_t *src_ptl, *dst_ptl; @@ -5558,7 +5559,18 @@ static void move_huge_pte(struct vm_area_struct *vma, unsigned long old_addr, spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING); pte = huge_ptep_get_and_clear(mm, old_addr, src_pte, sz); - set_huge_pte_at(mm, new_addr, dst_pte, pte, sz); + + if (need_clear_uffd_wp && pte_marker_uffd_wp(pte)) + huge_pte_clear(mm, new_addr, dst_pte, sz); + else { + if (need_clear_uffd_wp) { + if (pte_present(pte)) + pte = huge_pte_clear_uffd_wp(pte); + else if (is_swap_pte(pte)) + pte = pte_swp_clear_uffd_wp(pte); + } + set_huge_pte_at(mm, new_addr, dst_pte, pte, sz); + } if (src_ptl != dst_ptl) spin_unlock(src_ptl); diff --git a/mm/mremap.c b/mm/mremap.c index 7d12ba7b3e8b65a4f6fc1b0a7b8c3176125e26ed..f1280c89cb34d5e0d985e78512990b5b864bffa1 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -161,6 +161,7 @@ static int move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, struct vm_area_struct *new_vma, pmd_t *new_pmd, unsigned long new_addr, bool need_rmap_locks) { + bool need_clear_uffd_wp = vma_has_uffd_without_event_remap(vma); struct mm_struct *mm = vma->vm_mm; pte_t *old_ptep, *new_ptep; pte_t old_pte, pte; @@ -239,7 +240,18 @@ static int move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, pte = get_and_clear_full_ptes(mm, old_addr, old_ptep, nr_ptes, 0); pte = move_pte(pte, new_vma->vm_page_prot, old_addr, new_addr); pte = move_soft_dirty_pte(pte); - set_ptes(mm, new_addr, new_ptep, pte, nr_ptes); + + if (need_clear_uffd_wp && pte_marker_uffd_wp(pte)) + pte_clear(mm, new_addr, new_ptep); + else { + if (need_clear_uffd_wp) { + if (pte_present(pte)) + pte = pte_clear_uffd_wp(pte); + else if (is_swap_pte(pte)) + pte = 
pte_swp_clear_uffd_wp(pte); + } + set_ptes(mm, new_addr, new_ptep, pte, nr_ptes); + } } arch_leave_lazy_mmu_mode(); @@ -301,6 +313,15 @@ static bool move_normal_pmd(struct vm_area_struct *vma, unsigned long old_addr, if (WARN_ON_ONCE(!pmd_none(*new_pmd))) return false; + /* If this pmd belongs to a uffd vma with remap events disabled, we need + * to ensure that the uffd-wp state is cleared from all pgtables. This + * means recursing into lower page tables in move_page_tables(), and we + * can reuse the existing code if we simply treat the entry as "not + * moved". + */ + if (vma_has_uffd_without_event_remap(vma)) + return false; + /* * We don't have to worry about the ordering of src and dst * ptlocks because exclusive mmap_lock prevents deadlock. @@ -356,6 +377,15 @@ static bool move_normal_pud(struct vm_area_struct *vma, unsigned long old_addr, if (WARN_ON_ONCE(!pud_none(*new_pud))) return false; + /* If this pud belongs to a uffd vma with remap events disabled, we need + * to ensure that the uffd-wp state is cleared from all pgtables. This + * means recursing into lower page tables in move_page_tables(), and we + * can reuse the existing code if we simply treat the entry as "not + * moved". + */ + if (vma_has_uffd_without_event_remap(vma)) + return false; + /* * We don't have to worry about the ordering of src and dst * ptlocks because exclusive mmap_lock prevents deadlock.