diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h
index 8f3b474f3a70be44ad320d4d61fc6a49a5adb554..89fd528e8ef43775758174cb58326344d8b37344 100644
--- a/include/linux/cpuhotplug.h
+++ b/include/linux/cpuhotplug.h
@@ -124,7 +124,7 @@ enum cpuhp_state {
 	CPUHP_ARM_BL_PREPARE,
 	CPUHP_TRACE_RB_PREPARE,
 	CPUHP_MM_ZS_PREPARE,
-	CPUHP_MM_ZSWP_MEM_PREPARE,
+	CPUHP_MM_ZSWP_MEM_PREPARE,	/* keep this to make kabi stable */
 	CPUHP_MM_ZSWP_POOL_PREPARE,
 	CPUHP_KVM_PPC_BOOK3S_PREPARE,
 	CPUHP_ZCOMP_PREPARE,
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index baa8d4aa009ad294d13a325874abd2aef2444369..230fd4b2fedf0d7f1bbde965a1b9e880340b2672 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -287,6 +287,7 @@ long hugetlb_change_protection(struct vm_area_struct *vma,
 
 bool is_hugetlb_entry_migration(pte_t pte);
 void hugetlb_unshare_all_pmds(struct vm_area_struct *vma);
+void hugetlb_split(struct vm_area_struct *vma, unsigned long addr);
 
 void enqueue_hugetlb_folio(struct hstate *h, struct folio *folio);
 struct folio *dequeue_hugetlb_folio_node_exact(struct hstate *h, int nid);
@@ -543,6 +544,8 @@ static inline struct folio *alloc_hugetlb_folio_size(int nid, unsigned long size
 	return NULL;
 }
 
+static inline void hugetlb_split(struct vm_area_struct *vma, unsigned long addr) {}
+
 #endif /* !CONFIG_HUGETLB_PAGE */
 /*
  * hugepages at page global directory. If arch support
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 6120457bd7e3bf9f67947c60ef0f788bb53d3945..b00dda9db0c9fe8aa747ab30e7e0aa2a54f6383b 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -104,7 +104,7 @@ static void hugetlb_vma_lock_free(struct vm_area_struct *vma);
 static void hugetlb_vma_lock_alloc(struct vm_area_struct *vma);
 static void __hugetlb_vma_unlock_write_free(struct vm_area_struct *vma);
 static void hugetlb_unshare_pmds(struct vm_area_struct *vma,
-		unsigned long start, unsigned long end);
+		unsigned long start, unsigned long end, bool take_locks);
 static struct resv_map *vma_resv_map(struct vm_area_struct *vma);
 
 static inline bool subpool_is_free(struct hugepage_subpool *spool)
@@ -5326,26 +5326,40 @@ static int hugetlb_vm_op_split(struct vm_area_struct *vma, unsigned long addr)
 {
 	if (addr & ~(huge_page_mask(hstate_vma(vma))))
 		return -EINVAL;
+	return 0;
+}
 
+void hugetlb_split(struct vm_area_struct *vma, unsigned long addr)
+{
 	/*
 	 * PMD sharing is only possible for PUD_SIZE-aligned address ranges
 	 * in HugeTLB VMAs. If we will lose PUD_SIZE alignment due to this
 	 * split, unshare PMDs in the PUD_SIZE interval surrounding addr now.
+	 * This function is called in the middle of a VMA split operation, with
+	 * MM, VMA and rmap all write-locked to prevent concurrent page table
+	 * walks (except hardware and gup_fast()).
 	 */
+	vma_assert_write_locked(vma);
+	i_mmap_assert_write_locked(vma->vm_file->f_mapping);
+
 	if (addr & ~PUD_MASK) {
-		/*
-		 * hugetlb_vm_op_split is called right before we attempt to
-		 * split the VMA. We will need to unshare PMDs in the old and
-		 * new VMAs, so let's unshare before we split.
-		 */
 		unsigned long floor = addr & PUD_MASK;
 		unsigned long ceil = floor + PUD_SIZE;
 
-		if (floor >= vma->vm_start && ceil <= vma->vm_end)
-			hugetlb_unshare_pmds(vma, floor, ceil);
+		if (floor >= vma->vm_start && ceil <= vma->vm_end) {
+			/*
+			 * Locking:
+			 * Use take_locks=false here.
+			 * The file rmap lock is already held.
+			 * The hugetlb VMA lock can't be taken when we already
+			 * hold the file rmap lock, and we don't need it because
+			 * its purpose is to synchronize against concurrent page
+			 * table walks, which are not possible thanks to the
+			 * locks held by our caller.
+			 */
+			hugetlb_unshare_pmds(vma, floor, ceil, /* take_locks = */ false);
+		}
 	}
-
-	return 0;
 }
 
 static unsigned long hugetlb_vm_op_pagesize(struct vm_area_struct *vma)
@@ -7780,9 +7794,16 @@ void move_hugetlb_state(struct folio *old_folio, struct folio *new_folio, int re
 	}
 }
 
+/*
+ * If @take_locks is false, the caller must ensure that no concurrent page table
+ * access can happen (except for gup_fast() and hardware page walks).
+ * If @take_locks is true, we take the hugetlb VMA lock (to lock out things like
+ * concurrent page fault handling) and the file rmap lock.
+ */
 static void hugetlb_unshare_pmds(struct vm_area_struct *vma,
 				 unsigned long start,
-				 unsigned long end)
+				 unsigned long end,
+				 bool take_locks)
 {
 	struct hstate *h = hstate_vma(vma);
 	unsigned long sz = huge_page_size(h);
@@ -7806,8 +7827,12 @@ static void hugetlb_unshare_pmds(struct vm_area_struct *vma,
 	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm,
				start, end);
 	mmu_notifier_invalidate_range_start(&range);
-	hugetlb_vma_lock_write(vma);
-	i_mmap_lock_write(vma->vm_file->f_mapping);
+	if (take_locks) {
+		hugetlb_vma_lock_write(vma);
+		i_mmap_lock_write(vma->vm_file->f_mapping);
+	} else {
+		i_mmap_assert_write_locked(vma->vm_file->f_mapping);
+	}
 	for (address = start; address < end; address += PUD_SIZE) {
 		ptep = hugetlb_walk(vma, address, sz);
 		if (!ptep)
@@ -7817,8 +7842,10 @@ static void hugetlb_unshare_pmds(struct vm_area_struct *vma,
 		spin_unlock(ptl);
 	}
 	flush_hugetlb_tlb_range(vma, start, end);
-	i_mmap_unlock_write(vma->vm_file->f_mapping);
-	hugetlb_vma_unlock_write(vma);
+	if (take_locks) {
+		i_mmap_unlock_write(vma->vm_file->f_mapping);
+		hugetlb_vma_unlock_write(vma);
+	}
 	/*
 	 * No need to call mmu_notifier_arch_invalidate_secondary_tlbs(), see
 	 * Documentation/mm/mmu_notifier.rst.
@@ -7833,7 +7860,8 @@ static void hugetlb_unshare_pmds(struct vm_area_struct *vma,
 void hugetlb_unshare_all_pmds(struct vm_area_struct *vma)
 {
 	hugetlb_unshare_pmds(vma, ALIGN(vma->vm_start, PUD_SIZE),
-			ALIGN_DOWN(vma->vm_end, PUD_SIZE));
+			ALIGN_DOWN(vma->vm_end, PUD_SIZE),
+			/* take_locks = */ true);
 }
 
 #ifdef CONFIG_CMA
diff --git a/mm/mmap.c b/mm/mmap.c
index bd99a80d4a9326f0c5e9c7452427b05720919fe7..6d9e6ab2bb2771ef3941345a9e37ab77bc28ddf4 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -2531,7 +2531,13 @@ int __split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma,
 	init_vma_prep(&vp, vma);
 	vp.insert = new;
 	vma_prepare(&vp);
+	/*
+	 * Get rid of huge pages and shared page tables straddling the split
+	 * boundary.
+	 */
 	vma_adjust_trans_huge(vma, vma->vm_start, addr, 0);
+	if (is_vm_hugetlb_page(vma))
+		hugetlb_split(vma, addr);
 
 	if (vma_is_peer_shared(vma))
 		vm_object_split(vma, new);
diff --git a/mm/zswap.c b/mm/zswap.c
index 8e28011bfbd80548cb41599ffc84b73525e9b245..8f7a8aa6d439e792ecb799dfc8b6f3f8b781b98d 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -159,8 +159,8 @@ struct crypto_acomp_ctx {
 	struct crypto_acomp *acomp;
 	struct acomp_req *req;
 	struct crypto_wait wait;
-	u8 *dstmem;
-	struct mutex *mutex;
+	u8 *buffer;
+	struct mutex mutex;
 };
 
 /*
@@ -448,76 +448,45 @@ static struct zswap_entry *zswap_entry_find_get(struct rb_root *root,
 /*********************************
 * per-cpu code
 **********************************/
-static DEFINE_PER_CPU(u8 *, zswap_dstmem);
-/*
- * If users dynamically change the zpool type and compressor at runtime, i.e.
- * zswap is running, zswap can have more than one zpool on one cpu, but they
- * are sharing dtsmem. So we need this mutex to be per-cpu.
- */
-static DEFINE_PER_CPU(struct mutex *, zswap_mutex);
-
-static int zswap_dstmem_prepare(unsigned int cpu)
-{
-	struct mutex *mutex;
-	u8 *dst;
-
-	dst = kmalloc_node(PAGE_SIZE * 2, GFP_KERNEL, cpu_to_node(cpu));
-	if (!dst)
-		return -ENOMEM;
-
-	mutex = kmalloc_node(sizeof(*mutex), GFP_KERNEL, cpu_to_node(cpu));
-	if (!mutex) {
-		kfree(dst);
-		return -ENOMEM;
-	}
-
-	mutex_init(mutex);
-	per_cpu(zswap_dstmem, cpu) = dst;
-	per_cpu(zswap_mutex, cpu) = mutex;
-	return 0;
-}
-
-static int zswap_dstmem_dead(unsigned int cpu)
-{
-	struct mutex *mutex;
-	u8 *dst;
-
-	mutex = per_cpu(zswap_mutex, cpu);
-	kfree(mutex);
-	per_cpu(zswap_mutex, cpu) = NULL;
-
-	dst = per_cpu(zswap_dstmem, cpu);
-	kfree(dst);
-	per_cpu(zswap_dstmem, cpu) = NULL;
-
-	return 0;
-}
-
 static int zswap_cpu_comp_prepare(unsigned int cpu, struct hlist_node *node)
 {
 	struct zswap_pool *pool = hlist_entry(node, struct zswap_pool, node);
 	struct crypto_acomp_ctx *acomp_ctx = per_cpu_ptr(pool->acomp_ctx, cpu);
-	struct crypto_acomp *acomp;
-	struct acomp_req *req;
+	struct crypto_acomp *acomp = NULL;
+	struct acomp_req *req = NULL;
+	u8 *buffer = NULL;
+	int ret;
+
+	buffer = kmalloc_node(PAGE_SIZE * 2, GFP_KERNEL, cpu_to_node(cpu));
+	if (!buffer) {
+		ret = -ENOMEM;
+		goto fail;
+	}
 
 	acomp = crypto_alloc_acomp_node(pool->tfm_name, 0, 0, cpu_to_node(cpu));
 	if (IS_ERR(acomp)) {
 		pr_err("could not alloc crypto acomp %s : %ld\n",
 				pool->tfm_name, PTR_ERR(acomp));
-		return PTR_ERR(acomp);
+		ret = PTR_ERR(acomp);
+		goto fail;
 	}
-	acomp_ctx->acomp = acomp;
 
-	req = acomp_request_alloc(acomp_ctx->acomp);
+	req = acomp_request_alloc(acomp);
 	if (!req) {
 		pr_err("could not alloc crypto acomp_request %s\n",
 		       pool->tfm_name);
-		crypto_free_acomp(acomp_ctx->acomp);
-		return -ENOMEM;
+		ret = -ENOMEM;
+		goto fail;
 	}
-	acomp_ctx->req = req;
 
+	/*
+	 * Only hold the mutex after completing allocations, otherwise we may
+	 * recurse into zswap through reclaim and attempt to hold the mutex
+	 * again resulting in a deadlock.
+	 */
+	mutex_lock(&acomp_ctx->mutex);
 	crypto_init_wait(&acomp_ctx->wait);
+
 	/*
 	 * if the backend of acomp is async zip, crypto_req_done() will wakeup
 	 * crypto_wait_req(); if the backend of acomp is scomp, the callback
@@ -526,10 +495,17 @@ static int zswap_cpu_comp_prepare(unsigned int cpu, struct hlist_node *node)
 	acomp_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG,
 				   crypto_req_done, &acomp_ctx->wait);
 
-	acomp_ctx->mutex = per_cpu(zswap_mutex, cpu);
-	acomp_ctx->dstmem = per_cpu(zswap_dstmem, cpu);
-
+	acomp_ctx->buffer = buffer;
+	acomp_ctx->acomp = acomp;
+	acomp_ctx->req = req;
+	mutex_unlock(&acomp_ctx->mutex);
 	return 0;
+
+fail:
+	if (acomp)
+		crypto_free_acomp(acomp);
+	kfree(buffer);
+	return ret;
 }
 
 static int zswap_cpu_comp_dead(unsigned int cpu, struct hlist_node *node)
@@ -537,16 +513,45 @@ static int zswap_cpu_comp_dead(unsigned int cpu, struct hlist_node *node)
 	struct zswap_pool *pool = hlist_entry(node, struct zswap_pool, node);
 	struct crypto_acomp_ctx *acomp_ctx = per_cpu_ptr(pool->acomp_ctx, cpu);
 
+	mutex_lock(&acomp_ctx->mutex);
 	if (!IS_ERR_OR_NULL(acomp_ctx)) {
 		if (!IS_ERR_OR_NULL(acomp_ctx->req))
 			acomp_request_free(acomp_ctx->req);
+		acomp_ctx->req = NULL;
 		if (!IS_ERR_OR_NULL(acomp_ctx->acomp))
 			crypto_free_acomp(acomp_ctx->acomp);
+		kfree(acomp_ctx->buffer);
 	}
+	mutex_unlock(&acomp_ctx->mutex);
 
 	return 0;
 }
 
+static struct crypto_acomp_ctx *acomp_ctx_get_cpu_lock(struct zswap_pool *pool)
+{
+	struct crypto_acomp_ctx *acomp_ctx;
+
+	for (;;) {
+		acomp_ctx = raw_cpu_ptr(pool->acomp_ctx);
+		mutex_lock(&acomp_ctx->mutex);
+		if (likely(acomp_ctx->req))
+			return acomp_ctx;
+		/*
+		 * It is possible that we were migrated to a different CPU after
+		 * getting the per-CPU ctx but before the mutex was acquired. If
+		 * the old CPU got offlined, zswap_cpu_comp_dead() could have
+		 * already freed ctx->req (among other things) and set it to
+		 * NULL. Just try again on the new CPU that we ended up on.
+		 */
+		mutex_unlock(&acomp_ctx->mutex);
+	}
+}
+
+static void acomp_ctx_put_unlock(struct crypto_acomp_ctx *acomp_ctx)
+{
+	mutex_unlock(&acomp_ctx->mutex);
+}
+
 /*********************************
 * pool functions
 **********************************/
@@ -722,7 +727,7 @@ static struct zswap_pool *zswap_pool_create(char *type, char *compressor)
 	struct zswap_pool *pool;
 	char name[38]; /* 'zswap' + 32 char (max) num + \0 */
 	gfp_t gfp = __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM;
-	int ret;
+	int ret, cpu;
 
 	if (!zswap_has_pool) {
 		/* if either are unset, pool initialization failed, and we
@@ -760,6 +765,9 @@ static struct zswap_pool *zswap_pool_create(char *type, char *compressor)
 		goto error;
 	}
 
+	for_each_possible_cpu(cpu)
+		mutex_init(&per_cpu_ptr(pool->acomp_ctx, cpu)->mutex);
+
 	ret = cpuhp_state_add_instance(CPUHP_MM_ZSWP_POOL_PREPARE,
 				       &pool->node);
 	if (ret)
@@ -1127,14 +1135,14 @@ static int zswap_writeback_entry(struct zswap_entry *entry,
 		zpool_unmap_handle(pool, entry->handle);
 	}
 
-	mutex_lock(acomp_ctx->mutex);
+	mutex_lock(&acomp_ctx->mutex);
 	sg_init_one(&input, src, entry->length);
 	sg_init_table(&output, 1);
 	sg_set_page(&output, &folio->page, PAGE_SIZE, 0);
 	acomp_request_set_params(acomp_ctx->req, &input, &output, entry->length, dlen);
 	ret = crypto_wait_req(crypto_acomp_decompress(acomp_ctx->req), &acomp_ctx->wait);
 	dlen = acomp_ctx->req->dlen;
-	mutex_unlock(acomp_ctx->mutex);
+	mutex_unlock(&acomp_ctx->mutex);
 
 	if (!zpool_can_sleep_mapped(pool))
 		kfree(tmp);
@@ -1297,15 +1305,16 @@ bool zswap_store(struct folio *folio)
 		goto freepage;
 
 	/* compress */
-	acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx);
-
-	mutex_lock(acomp_ctx->mutex);
-
-	dst = acomp_ctx->dstmem;
+	acomp_ctx = acomp_ctx_get_cpu_lock(entry->pool);
+	dst = acomp_ctx->buffer;
 	sg_init_table(&input, 1);
 	sg_set_page(&input, &folio->page, PAGE_SIZE, 0);
 
-	/* zswap_dstmem is of size (PAGE_SIZE * 2). Reflect same in sg_list */
+	/*
+	 * We need PAGE_SIZE * 2 here since the data may expand on compression,
+	 * and hardware accelerators may not check the dst buffer size, so
+	 * give the dst buffer enough length to avoid buffer overflow.
+	 */
 	sg_init_one(&output, dst, PAGE_SIZE * 2);
 	acomp_request_set_params(acomp_ctx->req, &input, &output, PAGE_SIZE, dlen);
 	/*
@@ -1343,7 +1352,7 @@ bool zswap_store(struct folio *folio)
 	buf = zpool_map_handle(zpool, handle, ZPOOL_MM_WO);
 	memcpy(buf, dst, dlen);
 	zpool_unmap_handle(zpool, handle);
-	mutex_unlock(acomp_ctx->mutex);
+	acomp_ctx_put_unlock(acomp_ctx);
 
 	/* populate entry */
 	entry->swpentry = swp_entry(type, offset);
@@ -1386,7 +1395,7 @@ bool zswap_store(struct folio *folio)
 	return true;
 
 put_dstmem:
-	mutex_unlock(acomp_ctx->mutex);
+	acomp_ctx_put_unlock(acomp_ctx);
 	zswap_pool_put(entry->pool);
 freepage:
 	zswap_entry_cache_free(entry);
@@ -1469,20 +1478,19 @@ bool zswap_load(struct folio *folio)
 		zpool_unmap_handle(zpool, entry->handle);
 	}
 
-	acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx);
-	mutex_lock(acomp_ctx->mutex);
+	acomp_ctx = acomp_ctx_get_cpu_lock(entry->pool);
 	sg_init_one(&input, src, entry->length);
 	sg_init_table(&output, 1);
 	sg_set_page(&output, page, PAGE_SIZE, 0);
 	acomp_request_set_params(acomp_ctx->req, &input, &output, entry->length, dlen);
 	if (crypto_wait_req(crypto_acomp_decompress(acomp_ctx->req), &acomp_ctx->wait))
 		WARN_ON(1);
-	mutex_unlock(acomp_ctx->mutex);
 
 	if (zpool_can_sleep_mapped(zpool))
 		zpool_unmap_handle(zpool, entry->handle);
 	else
 		kfree(tmp);
+	acomp_ctx_put_unlock(acomp_ctx);
 
 	ret = true;
 stats:
@@ -1617,13 +1625,6 @@ static int zswap_setup(void)
 		goto cache_fail;
 	}
 
-	ret = cpuhp_setup_state(CPUHP_MM_ZSWP_MEM_PREPARE, "mm/zswap:prepare",
-				zswap_dstmem_prepare, zswap_dstmem_dead);
-	if (ret) {
-		pr_err("dstmem alloc failed\n");
-		goto dstmem_fail;
-	}
-
 	ret = cpuhp_setup_state_multi(CPUHP_MM_ZSWP_POOL_PREPARE,
 				      "mm/zswap_pool:prepare",
 				      zswap_cpu_comp_prepare, zswap_cpu_comp_dead);
@@ -1656,8 +1657,6 @@ static int zswap_setup(void)
 	if (pool)
 		zswap_pool_destroy(pool);
 hp_fail:
-	cpuhp_remove_state(CPUHP_MM_ZSWP_MEM_PREPARE);
-dstmem_fail:
 	kmem_cache_destroy(zswap_entry_cache);
 cache_fail:
 	/* if built-in, we aren't unloaded on failure; don't allow use */
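
Illustrative note (not part of the patch): the new hugetlb_split() hook only runs when a hugetlb VMA is actually split at a PUD-unaligned address. The following userspace sketch shows one way such a split can be triggered, assuming a 2 MiB hugepage size, enough reserved hugepages, and arbitrary illustration values for the mapping length and split offset; actual PMD sharing additionally requires a second process mapping the same PUD-aligned range of a shared hugetlb file.

/* Hypothetical reproducer sketch, not part of the patch. */
#define _GNU_SOURCE
#include <stdio.h>
#include <sys/mman.h>

#define HPAGE_SZ	(2UL << 20)		/* assumed 2 MiB hugepage size */
#define MAP_LEN		(512 * HPAGE_SZ)	/* 1 GiB: one PUD worth of PMD entries */
#define SPLIT_AT	(4 * HPAGE_SZ)		/* hugepage-aligned but PUD-unaligned offset */

int main(void)
{
	/* Shared hugetlb mapping: the only kind eligible for hugetlb PMD sharing. */
	char *p = mmap(NULL, MAP_LEN, PROT_READ | PROT_WRITE,
		       MAP_SHARED | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0);
	if (p == MAP_FAILED) {
		perror("mmap");
		return 1;
	}

	/* Populate the range so page tables exist under the VMA. */
	for (unsigned long off = 0; off < MAP_LEN; off += HPAGE_SZ)
		p[off] = 1;

	/*
	 * mprotect() on a sub-range splits the VMA at a PUD-unaligned address;
	 * with this patch, __split_vma() calls hugetlb_split() to unshare PMDs
	 * around the split boundary while MM, VMA and rmap are write-locked.
	 */
	if (mprotect(p, SPLIT_AT, PROT_READ))
		perror("mprotect");

	munmap(p, MAP_LEN);
	return 0;
}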