diff --git a/include/linux/swap.h b/include/linux/swap.h index 9ac07b26b4724d4930724f7da8c375f74fa710f0..bea0c0f1f640552e399ae08e48da79046e413176 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -507,9 +507,9 @@ extern swp_entry_t get_swap_page_of_type(int); extern int get_swap_pages(int n, swp_entry_t swp_entries[], int order, int type); extern int add_swap_count_continuation(swp_entry_t, gfp_t); -extern void swap_shmem_alloc(swp_entry_t); +extern void swap_shmem_alloc(swp_entry_t, int); extern int swap_duplicate(swp_entry_t); -extern int swapcache_prepare(swp_entry_t); +extern int swapcache_prepare(swp_entry_t entry, int nr); extern void swap_free_nr(swp_entry_t entry, int nr_pages); extern void swapcache_free_entries(swp_entry_t *entries, int n); extern void free_swap_and_cache_nr(swp_entry_t entry, int nr); @@ -583,7 +583,7 @@ static inline int add_swap_count_continuation(swp_entry_t swp, gfp_t gfp_mask) return 0; } -static inline void swap_shmem_alloc(swp_entry_t swp) +static inline void swap_shmem_alloc(swp_entry_t swp, int nr) { } @@ -592,7 +592,7 @@ static inline int swap_duplicate(swp_entry_t swp) return 0; } -static inline int swapcache_prepare(swp_entry_t swp) +static inline int swapcache_prepare(swp_entry_t swp, int nr) { return 0; } diff --git a/include/linux/writeback.h b/include/linux/writeback.h index 447662e1d76f62a084a1bb009e65d9be5a3782dc..437b4ee42db8057355af564e149f810e51dbe26d 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -90,9 +90,11 @@ struct writeback_control { size_t wb_tcand_bytes; /* bytes written by this candidate */ #endif - KABI_RESERVE(1) + /* Target list for splitting a large folio */ + KABI_USE(1, struct list_head *list) KABI_RESERVE(2) KABI_RESERVE(3) + }; static inline blk_opf_t wbc_to_write_flags(struct writeback_control *wbc) diff --git a/mm/filemap.c b/mm/filemap.c index 1d5d5f1c2b5412cc763b86506a6c30e2c8f80298..c3cb53a9c867fadae748aee7d3e1e681a0200834 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2048,17 +2048,20 @@ unsigned find_get_entries(struct address_space *mapping, pgoff_t *start, if (!folio_batch_add(fbatch, folio)) break; } - rcu_read_unlock(); if (folio_batch_count(fbatch)) { - unsigned long nr = 1; + unsigned long nr; int idx = folio_batch_count(fbatch) - 1; folio = fbatch->folios[idx]; if (!xa_is_value(folio)) nr = folio_nr_pages(folio); - *start = indices[idx] + nr; + else + nr = 1 << xa_get_order(&mapping->i_pages, indices[idx]); + *start = round_down(indices[idx] + nr, nr); } + rcu_read_unlock(); + return folio_batch_count(fbatch); } @@ -2090,10 +2093,17 @@ unsigned find_lock_entries(struct address_space *mapping, pgoff_t *start, rcu_read_lock(); while ((folio = find_get_entry(&xas, end, XA_PRESENT))) { + unsigned long base; + unsigned long nr; + if (!xa_is_value(folio)) { - if (folio->index < *start) + nr = folio_nr_pages(folio); + base = folio->index; + /* Omit large folio which begins before the start */ + if (base < *start) goto put; - if (folio_next_index(folio) - 1 > end) + /* Omit large folio which extends beyond the end */ + if (base + nr - 1 > end) goto put; if (!folio_trylock(folio)) goto put; @@ -2102,7 +2112,19 @@ unsigned find_lock_entries(struct address_space *mapping, pgoff_t *start, goto unlock; VM_BUG_ON_FOLIO(!folio_contains(folio, xas.xa_index), folio); + } else { + nr = 1 << xa_get_order(&mapping->i_pages, xas.xa_index); + base = xas.xa_index & ~(nr - 1); + /* Omit order>0 value which begins before the start */ + if (base < *start) + continue; + /* Omit order>0 value which extends beyond the end */ + if (base + nr - 1 > end) + break; } + + /* Update start now so that last update is correct on return */ + *start = base + nr; indices[fbatch->nr] = xas.xa_index; if (!folio_batch_add(fbatch, folio)) break; @@ -2114,15 +2136,6 @@ unsigned find_lock_entries(struct address_space *mapping, pgoff_t *start, } rcu_read_unlock(); - if (folio_batch_count(fbatch)) { - unsigned long nr = 1; - int idx = folio_batch_count(fbatch) - 1; - - folio = fbatch->folios[idx]; - if (!xa_is_value(folio)) - nr = folio_nr_pages(folio); - *start = indices[idx] + nr; - } return folio_batch_count(fbatch); } diff --git a/mm/memory.c b/mm/memory.c index 84ff5ac8fb968970d9bc8c8f26d9af74a0dd7294..d52c7d48f9d824c612ccad9aa14a9dd0386321ea 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -4037,7 +4037,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) * reusing the same entry. It's undetectable as * pte_same() returns true due to entry reuse. */ - if (swapcache_prepare(entry)) { + if (swapcache_prepare(entry, 1)) { /* Relax a bit to prevent rapid repeated page faults */ schedule_timeout_uninterruptible(1); goto out; @@ -4344,7 +4344,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) out: /* Clear the swap cache pin for direct swapin after PTL unlock */ if (need_clear_cache) - swapcache_clear(si, entry); + swapcache_clear(si, entry, 1); if (si) put_swap_device(si); return ret; @@ -4360,7 +4360,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) folio_put(swapcache); } if (need_clear_cache) - swapcache_clear(si, entry); + swapcache_clear(si, entry, 1); if (si) put_swap_device(si); return ret; diff --git a/mm/shmem.c b/mm/shmem.c index 7b3fa6b9aa289b5b7b123223c6e00c3e9f626a33..b7a804a4c907f2583c14177dd9e4c31150b3786b 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -155,7 +155,7 @@ static unsigned long shmem_default_max_inodes(void) static int shmem_swapin_folio(struct inode *inode, pgoff_t index, struct folio **foliop, enum sgp_type sgp, gfp_t gfp, - struct mm_struct *fault_mm, vm_fault_t *fault_type); + struct vm_area_struct *vma, vm_fault_t *fault_type); static inline struct shmem_sb_info *SHMEM_SB(struct super_block *sb) { @@ -780,7 +780,6 @@ static int shmem_add_to_page_cache(struct folio *folio, VM_BUG_ON_FOLIO(index != round_down(index, nr), folio); VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); VM_BUG_ON_FOLIO(!folio_test_swapbacked(folio), folio); - VM_BUG_ON(expected && folio_test_large(folio)); folio_ref_add(folio, nr); folio->mapping = mapping; @@ -838,23 +837,27 @@ static void shmem_delete_from_page_cache(struct folio *folio, void *radswap) __lruvec_stat_mod_folio(folio, NR_SHMEM, -nr); shmem_reliable_folio_add(folio, -nr); xa_unlock_irq(&mapping->i_pages); - folio_put(folio); + folio_put_refs(folio, nr); BUG_ON(error); } /* - * Remove swap entry from page cache, free the swap and its page cache. + * Remove swap entry from page cache, free the swap and its page cache. Returns + * the number of pages being freed. 0 means entry not found in XArray (0 pages + * being freed). */ -static int shmem_free_swap(struct address_space *mapping, - pgoff_t index, void *radswap) +static long shmem_free_swap(struct address_space *mapping, + pgoff_t index, void *radswap) { + int order = xa_get_order(&mapping->i_pages, index); void *old; old = xa_cmpxchg_irq(&mapping->i_pages, index, radswap, NULL, 0); if (old != radswap) - return -ENOENT; - free_swap_and_cache(radix_to_swp_entry(radswap)); - return 0; + return 0; + free_swap_and_cache_nr(radix_to_swp_entry(radswap), 1 << order); + + return 1 << order; } /* @@ -877,7 +880,7 @@ unsigned long shmem_partial_swap_usage(struct address_space *mapping, if (xas_retry(&xas, page)) continue; if (xa_is_value(page)) - swapped++; + swapped += 1 << xa_get_order(xas.xa, xas.xa_index); if (xas.xa_index == max) break; if (need_resched()) { @@ -1006,7 +1009,7 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend, if (xa_is_value(folio)) { if (unfalloc) continue; - nr_swaps_freed += !shmem_free_swap(mapping, + nr_swaps_freed += shmem_free_swap(mapping, indices[i], folio); continue; } @@ -1073,14 +1076,17 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend, folio = fbatch.folios[i]; if (xa_is_value(folio)) { + long swaps_freed; + if (unfalloc) continue; - if (shmem_free_swap(mapping, indices[i], folio)) { + swaps_freed = shmem_free_swap(mapping, indices[i], folio); + if (!swaps_freed) { /* Swap was replaced by page: retry */ index = indices[i]; break; } - nr_swaps_freed++; + nr_swaps_freed += swaps_freed; continue; } @@ -1439,6 +1445,8 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); swp_entry_t swap; pgoff_t index; + int nr_pages; + bool split = false; /* * Our capabilities prevent regular writeback or sync from ever calling @@ -1457,20 +1465,33 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) goto redirty; /* - * If /sys/kernel/mm/transparent_hugepage/shmem_enabled is "always" or - * "force", drivers/gpu/drm/i915/gem/i915_gem_shmem.c gets huge pages, - * and its shmem_writeback() needs them to be split when swapping. + * If CONFIG_THP_SWAP is not enabled, the large folio should be + * split when swapping. + * + * And shrinkage of pages beyond i_size does not split swap, so + * swapout of a large folio crossing i_size needs to split too + * (unless fallocate has been used to preallocate beyond EOF). */ if (folio_test_large(folio)) { + index = shmem_fallocend(inode, + DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE)); + if ((index > folio->index && index < folio_next_index(folio)) || + !IS_ENABLED(CONFIG_THP_SWAP)) + split = true; + } + + if (split) { +try_split: /* Ensure the subpages are still dirty */ folio_test_set_dirty(folio); - if (split_huge_page(page) < 0) + if (split_huge_page_to_list_to_order(page, wbc->list, 0)) goto redirty; folio = page_folio(page); folio_clear_dirty(folio); } index = folio->index; + nr_pages = folio_nr_pages(folio); /* * This is somewhat ridiculous, but without plumbing a SWAP_MAP_FALLOC @@ -1505,8 +1526,12 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) } swap = folio_alloc_swap(folio); - if (!swap.val) + if (!swap.val) { + if (nr_pages > 1) + goto try_split; + goto redirty; + } /* * Add inode to shmem_unuse()'s list of swapped-out inodes, @@ -1523,8 +1548,8 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) if (add_to_swap_cache(folio, swap, __GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN, NULL) == 0) { - shmem_recalc_inode(inode, 0, 1); - swap_shmem_alloc(swap); + shmem_recalc_inode(inode, 0, nr_pages); + swap_shmem_alloc(swap, nr_pages); shmem_delete_from_page_cache(folio, swp_to_radix_entry(swap)); mutex_unlock(&shmem_swaplist_mutex); @@ -1888,30 +1913,35 @@ static bool shmem_should_replace_folio(struct folio *folio, gfp_t gfp) } static int shmem_replace_folio(struct folio **foliop, gfp_t gfp, - struct shmem_inode_info *info, pgoff_t index) + struct shmem_inode_info *info, pgoff_t index, + struct vm_area_struct *vma) { - struct folio *old, *new; - struct address_space *swap_mapping; - swp_entry_t entry; - pgoff_t swap_index; - int error; - - old = *foliop; - entry = old->swap; - swap_index = swp_offset(entry); - swap_mapping = swap_address_space(entry); + struct folio *new, *old = *foliop; + swp_entry_t entry = old->swap; + struct address_space *swap_mapping = swap_address_space(entry); + pgoff_t swap_index = swp_offset(entry); + XA_STATE(xas, &swap_mapping->i_pages, swap_index); + int nr_pages = folio_nr_pages(old); + int error = 0, i; /* * We have arrived here because our zones are constrained, so don't * limit chance of success by further cpuset and node constraints. */ gfp &= ~GFP_CONSTRAINT_MASK; - VM_BUG_ON_FOLIO(folio_test_large(old), old); - new = shmem_alloc_folio(gfp, 0, info, index); +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + if (nr_pages > 1) { + gfp_t huge_gfp = vma_thp_gfp_mask(vma); + + gfp = limit_gfp_mask(huge_gfp, gfp); + } +#endif + + new = shmem_alloc_folio(gfp, folio_order(old), info, index); if (!new) return -ENOMEM; - folio_get(new); + folio_ref_add(new, nr_pages); folio_copy(new, old); flush_dcache_folio(new); @@ -1921,20 +1951,27 @@ static int shmem_replace_folio(struct folio **foliop, gfp_t gfp, new->swap = entry; folio_set_swapcache(new); - /* - * Our caller will very soon move newpage out of swapcache, but it's - * a nice clean interface for us to replace oldpage by newpage there. - */ + /* Swap cache still stores N entries instead of a high-order entry */ xa_lock_irq(&swap_mapping->i_pages); - error = shmem_replace_entry(swap_mapping, swap_index, old, new); + for (i = 0; i < nr_pages; i++) { + void *item = xas_load(&xas); + + if (item != old) { + error = -ENOENT; + break; + } + + xas_store(&xas, new); + xas_next(&xas); + } if (!error) { mem_cgroup_replace_folio(old, new); - __lruvec_stat_mod_folio(new, NR_FILE_PAGES, 1); - __lruvec_stat_mod_folio(new, NR_SHMEM, 1); - shmem_reliable_folio_add(new, 1); - __lruvec_stat_mod_folio(old, NR_FILE_PAGES, -1); - __lruvec_stat_mod_folio(old, NR_SHMEM, -1); - shmem_reliable_folio_add(old, -1); + __lruvec_stat_mod_folio(new, NR_FILE_PAGES, nr_pages); + __lruvec_stat_mod_folio(new, NR_SHMEM, nr_pages); + shmem_reliable_folio_add(new, nr_pages); + __lruvec_stat_mod_folio(old, NR_FILE_PAGES, -nr_pages); + __lruvec_stat_mod_folio(old, NR_SHMEM, -nr_pages); + shmem_reliable_folio_add(old, -nr_pages); } xa_unlock_irq(&swap_mapping->i_pages); @@ -1954,7 +1991,12 @@ static int shmem_replace_folio(struct folio **foliop, gfp_t gfp, old->private = NULL; folio_unlock(old); - folio_put_refs(old, 2); + /* + * The old folio are removed from swap cache, drop the 'nr_pages' + * reference, as well as one temporary reference getting from swap + * cache. + */ + folio_put_refs(old, nr_pages + 1); return error; } @@ -1964,6 +2006,7 @@ static void shmem_set_folio_swapin_error(struct inode *inode, pgoff_t index, struct address_space *mapping = inode->i_mapping; swp_entry_t swapin_error; void *old; + int nr_pages; swapin_error = make_poisoned_swp_entry(); old = xa_cmpxchg_irq(&mapping->i_pages, index, @@ -1972,6 +2015,7 @@ static void shmem_set_folio_swapin_error(struct inode *inode, pgoff_t index, if (old != swp_to_radix_entry(swap)) return; + nr_pages = folio_nr_pages(folio); folio_wait_writeback(folio); delete_from_swap_cache(folio); /* @@ -1979,8 +2023,86 @@ static void shmem_set_folio_swapin_error(struct inode *inode, pgoff_t index, * won't be 0 when inode is released and thus trigger WARN_ON(i_blocks) * in shmem_evict_inode(). */ - shmem_recalc_inode(inode, -1, -1); - swap_free(swap); + shmem_recalc_inode(inode, -nr_pages, -nr_pages); + swap_free_nr(swap, nr_pages); +} + +static int shmem_split_large_entry(struct inode *inode, pgoff_t index, + swp_entry_t swap, gfp_t gfp) +{ + struct address_space *mapping = inode->i_mapping; + XA_STATE_ORDER(xas, &mapping->i_pages, index, 0); + void *alloced_shadow = NULL; + int alloced_order = 0, i; + + /* Convert user data gfp flags to xarray node gfp flags */ + gfp &= GFP_RECLAIM_MASK; + + for (;;) { + int order = -1, split_order = 0; + void *old = NULL; + + xas_lock_irq(&xas); + old = xas_load(&xas); + if (!xa_is_value(old) || swp_to_radix_entry(swap) != old) { + xas_set_err(&xas, -EEXIST); + goto unlock; + } + + order = xas_get_order(&xas); + + /* Swap entry may have changed before we re-acquire the lock */ + if (alloced_order && + (old != alloced_shadow || order != alloced_order)) { + xas_destroy(&xas); + alloced_order = 0; + } + + /* Try to split large swap entry in pagecache */ + if (order > 0) { + if (!alloced_order) { + split_order = order; + goto unlock; + } + xas_split(&xas, old, order); + + /* + * Re-set the swap entry after splitting, and the swap + * offset of the original large entry must be continuous. + */ + for (i = 0; i < 1 << order; i++) { + pgoff_t aligned_index = round_down(index, 1 << order); + swp_entry_t tmp; + + tmp = swp_entry(swp_type(swap), swp_offset(swap) + i); + __xa_store(&mapping->i_pages, aligned_index + i, + swp_to_radix_entry(tmp), 0); + } + } + +unlock: + xas_unlock_irq(&xas); + + /* split needed, alloc here and retry. */ + if (split_order) { + xas_split_alloc(&xas, old, split_order, gfp); + if (xas_error(&xas)) + goto error; + alloced_shadow = old; + alloced_order = split_order; + xas_reset(&xas); + continue; + } + + if (!xas_nomem(&xas, gfp)) + break; + } + +error: + if (xas_error(&xas)) + return xas_error(&xas); + + return alloced_order; } /* @@ -1991,15 +2113,16 @@ static void shmem_set_folio_swapin_error(struct inode *inode, pgoff_t index, */ static int shmem_swapin_folio(struct inode *inode, pgoff_t index, struct folio **foliop, enum sgp_type sgp, - gfp_t gfp, struct mm_struct *fault_mm, + gfp_t gfp, struct vm_area_struct *vma, vm_fault_t *fault_type) { struct address_space *mapping = inode->i_mapping; + struct mm_struct *fault_mm = vma ? vma->vm_mm : NULL; struct shmem_inode_info *info = SHMEM_I(inode); struct swap_info_struct *si; struct folio *folio = NULL; swp_entry_t swap; - int error; + int error, nr_pages; VM_BUG_ON(!*foliop || !xa_is_value(*foliop)); swap = radix_to_swp_entry(*foliop); @@ -2019,12 +2142,37 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index, /* Look it up and read it in.. */ folio = swap_cache_get_folio(swap, NULL, 0); if (!folio) { + int split_order; + /* Or update major stats only when swapin succeeds?? */ if (fault_type) { *fault_type |= VM_FAULT_MAJOR; count_vm_event(PGMAJFAULT); count_memcg_event_mm(fault_mm, PGMAJFAULT); } + + /* + * Now swap device can only swap in order 0 folio, then we + * should split the large swap entry stored in the pagecache + * if necessary. + */ + split_order = shmem_split_large_entry(inode, index, swap, gfp); + if (split_order < 0) { + error = split_order; + goto failed; + } + + /* + * If the large swap entry has already been split, it is + * necessary to recalculate the new swap entry based on + * the old order alignment. + */ + if (split_order > 0) { + pgoff_t offset = index - round_down(index, 1 << split_order); + + swap = swp_entry(swp_type(swap), swp_offset(swap) + offset); + } + /* Here we actually start the io */ folio = shmem_swapin(swap, gfp, info, index); if (!folio) { @@ -2046,6 +2194,7 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index, goto failed; } folio_wait_writeback(folio); + nr_pages = folio_nr_pages(folio); /* * Some architectures may have to restore extra metadata to the @@ -2054,24 +2203,25 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index, arch_swap_restore(folio_swap(swap, folio), folio); if (shmem_should_replace_folio(folio, gfp)) { - error = shmem_replace_folio(&folio, gfp, info, index); + error = shmem_replace_folio(&folio, gfp, info, index, vma); if (error) goto failed; } - error = shmem_add_to_page_cache(folio, mapping, index, + error = shmem_add_to_page_cache(folio, mapping, + round_down(index, nr_pages), swp_to_radix_entry(swap), gfp); if (error) goto failed; - shmem_recalc_inode(inode, 0, -1); + shmem_recalc_inode(inode, 0, -nr_pages); if (sgp == SGP_WRITE) folio_mark_accessed(folio); delete_from_swap_cache(folio); folio_mark_dirty(folio); - swap_free(swap); + swap_free_nr(swap, nr_pages); put_swap_device(si); *foliop = folio; @@ -2134,7 +2284,7 @@ static int shmem_get_folio_gfp(struct inode *inode, pgoff_t index, if (xa_is_value(folio)) { error = shmem_swapin_folio(inode, index, &folio, - sgp, gfp, fault_mm, fault_type); + sgp, gfp, vma, fault_type); if (error == -EEXIST) goto repeat; diff --git a/mm/swap.h b/mm/swap.h index 693d1b2815598f67ac28bdd6c16e14b02612c703..500f99202776627be33e61be98d30931872e7c2b 100644 --- a/mm/swap.h +++ b/mm/swap.h @@ -38,7 +38,7 @@ void __delete_from_swap_cache(struct folio *folio, void delete_from_swap_cache(struct folio *folio); void clear_shadow_from_swap_cache(int type, unsigned long begin, unsigned long end); -void swapcache_clear(struct swap_info_struct *si, swp_entry_t entry); +void swapcache_clear(struct swap_info_struct *si, swp_entry_t entry, int nr); struct folio *swap_cache_get_folio(swp_entry_t entry, struct vm_area_struct *vma, unsigned long addr); struct folio *filemap_get_incore_folio(struct address_space *mapping, @@ -97,7 +97,7 @@ static inline int swap_writepage(struct page *p, struct writeback_control *wbc) return 0; } -static inline void swapcache_clear(struct swap_info_struct *si, swp_entry_t entry) +static inline void swapcache_clear(struct swap_info_struct *si, swp_entry_t entry, int nr) { } @@ -149,4 +149,5 @@ static inline unsigned int folio_swap_flags(struct folio *folio) return 0; } #endif /* CONFIG_SWAP */ + #endif /* _MM_SWAP_H */ diff --git a/mm/swap_state.c b/mm/swap_state.c index 94c9f171e94da3ba52e00c8e66906d1320f7a1ab..bd2bc1d4cd868c0896f214f0160a43d8620c01e1 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -477,7 +477,7 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, /* * Swap entry may have been freed since our caller observed it. */ - err = swapcache_prepare(entry); + err = swapcache_prepare(entry, 1); if (!err) break; diff --git a/mm/swapfile.c b/mm/swapfile.c index cfb768d3ed730b20cab6ac223921d95b98fdeaeb..14d4dfad015eb6c822f9cc62b27bb8f70164e520 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1427,7 +1427,8 @@ static void swap_entry_free(struct swap_info_struct *p, swp_entry_t entry) } static void cluster_swap_free_nr(struct swap_info_struct *sis, - unsigned long offset, int nr_pages) + unsigned long offset, int nr_pages, + unsigned char usage) { struct swap_cluster_info *ci; DECLARE_BITMAP(to_free, BITS_PER_LONG) = { 0 }; @@ -1437,7 +1438,7 @@ static void cluster_swap_free_nr(struct swap_info_struct *sis, while (nr_pages) { nr = min(BITS_PER_LONG, nr_pages); for (i = 0; i < nr; i++) { - if (!__swap_entry_free_locked(sis, offset + i, 1)) + if (!__swap_entry_free_locked(sis, offset + i, usage)) bitmap_set(to_free, i, 1); } if (!bitmap_empty(to_free, BITS_PER_LONG)) { @@ -1471,7 +1472,7 @@ void swap_free_nr(swp_entry_t entry, int nr_pages) while (nr_pages) { nr = min_t(int, nr_pages, SWAPFILE_CLUSTER - offset % SWAPFILE_CLUSTER); - cluster_swap_free_nr(sis, offset, nr); + cluster_swap_free_nr(sis, offset, nr, 1); offset += nr; nr_pages -= nr; } @@ -3461,7 +3462,7 @@ void si_swapinfo(struct sysinfo *val) } /* - * Verify that a swap entry is valid and increment its swap map count. + * Verify that nr swap entries are valid and increment their swap map counts. * * Returns error code in following case. * - success -> 0 @@ -3471,59 +3472,73 @@ void si_swapinfo(struct sysinfo *val) * - swap-cache reference is requested but the entry is not used. -> ENOENT * - swap-mapped reference requested but needs continued swap count. -> ENOMEM */ -static int __swap_duplicate(swp_entry_t entry, unsigned char usage) +static int __swap_duplicate(swp_entry_t entry, unsigned char usage, int nr) { struct swap_info_struct *p; struct swap_cluster_info *ci; unsigned long offset; unsigned char count; unsigned char has_cache; - int err; + int err, i; p = swp_swap_info(entry); offset = swp_offset(entry); + VM_WARN_ON(nr > SWAPFILE_CLUSTER - offset % SWAPFILE_CLUSTER); + VM_WARN_ON(usage == 1 && nr > 1); ci = lock_cluster_or_swap_info(p, offset); - count = p->swap_map[offset]; - - /* - * swapin_readahead() doesn't check if a swap entry is valid, so the - * swap entry could be SWAP_MAP_BAD. Check here with lock held. - */ - if (unlikely(swap_count(count) == SWAP_MAP_BAD)) { - err = -ENOENT; - goto unlock_out; - } - - has_cache = count & SWAP_HAS_CACHE; - count &= ~SWAP_HAS_CACHE; err = 0; + for (i = 0; i < nr; i++) { + count = p->swap_map[offset + i]; - if (usage == SWAP_HAS_CACHE) { + /* + * swapin_readahead() doesn't check if a swap entry is valid, so the + * swap entry could be SWAP_MAP_BAD. Check here with lock held. + */ + if (unlikely(swap_count(count) == SWAP_MAP_BAD)) { + err = -ENOENT; + goto unlock_out; + } - /* set SWAP_HAS_CACHE if there is no cache and entry is used */ - if (!has_cache && count) - has_cache = SWAP_HAS_CACHE; - else if (has_cache) /* someone else added cache */ - err = -EEXIST; - else /* no users remaining */ + has_cache = count & SWAP_HAS_CACHE; + count &= ~SWAP_HAS_CACHE; + + if (!count && !has_cache) { err = -ENOENT; + } else if (usage == SWAP_HAS_CACHE) { + if (has_cache) + err = -EEXIST; + } else if ((count & ~COUNT_CONTINUED) > SWAP_MAP_MAX) { + err = -EINVAL; + } - } else if (count || has_cache) { + if (err) + goto unlock_out; + } + + for (i = 0; i < nr; i++) { + count = p->swap_map[offset + i]; + has_cache = count & SWAP_HAS_CACHE; + count &= ~SWAP_HAS_CACHE; - if ((count & ~COUNT_CONTINUED) < SWAP_MAP_MAX) + if (usage == SWAP_HAS_CACHE) + has_cache = SWAP_HAS_CACHE; + else if ((count & ~COUNT_CONTINUED) < SWAP_MAP_MAX) count += usage; - else if ((count & ~COUNT_CONTINUED) > SWAP_MAP_MAX) - err = -EINVAL; - else if (swap_count_continued(p, offset, count)) + else if (swap_count_continued(p, offset + i, count)) count = COUNT_CONTINUED; - else + else { + /* + * Don't need to rollback changes, because if + * usage == 1, there must be nr == 1. + */ err = -ENOMEM; - } else - err = -ENOENT; /* unused swap entry */ + goto unlock_out; + } - WRITE_ONCE(p->swap_map[offset], count | has_cache); + WRITE_ONCE(p->swap_map[offset + i], count | has_cache); + } unlock_out: unlock_cluster_or_swap_info(p, ci); @@ -3534,9 +3549,9 @@ static int __swap_duplicate(swp_entry_t entry, unsigned char usage) * Help swapoff by noting that swap entry belongs to shmem/tmpfs * (in which case its reference count is never incremented). */ -void swap_shmem_alloc(swp_entry_t entry) +void swap_shmem_alloc(swp_entry_t entry, int nr) { - __swap_duplicate(entry, SWAP_MAP_SHMEM); + __swap_duplicate(entry, SWAP_MAP_SHMEM, nr); } /* @@ -3550,35 +3565,29 @@ int swap_duplicate(swp_entry_t entry) { int err = 0; - while (!err && __swap_duplicate(entry, 1) == -ENOMEM) + while (!err && __swap_duplicate(entry, 1, 1) == -ENOMEM) err = add_swap_count_continuation(entry, GFP_ATOMIC); return err; } /* - * @entry: swap entry for which we allocate swap cache. + * @entry: first swap entry from which we allocate nr swap cache. * - * Called when allocating swap cache for existing swap entry, + * Called when allocating swap cache for existing swap entries, * This can return error codes. Returns 0 at success. * -EEXIST means there is a swap cache. * Note: return code is different from swap_duplicate(). */ -int swapcache_prepare(swp_entry_t entry) +int swapcache_prepare(swp_entry_t entry, int nr) { - return __swap_duplicate(entry, SWAP_HAS_CACHE); + return __swap_duplicate(entry, SWAP_HAS_CACHE, nr); } -void swapcache_clear(struct swap_info_struct *si, swp_entry_t entry) +void swapcache_clear(struct swap_info_struct *si, swp_entry_t entry, int nr) { - struct swap_cluster_info *ci; unsigned long offset = swp_offset(entry); - unsigned char usage; - ci = lock_cluster_or_swap_info(si, offset); - usage = __swap_entry_free_locked(si, offset, SWAP_HAS_CACHE); - unlock_cluster_or_swap_info(si, ci); - if (!usage) - free_swap_slot(entry); + cluster_swap_free_nr(si, offset, nr, SWAP_HAS_CACHE); } struct swap_info_struct *swp_swap_info(swp_entry_t entry) diff --git a/mm/vmscan.c b/mm/vmscan.c index 3337907ae5e9dc6f993975744e4db157ad631086..99cfd0135aad13d471b0467cb9dfd12ebcb17ea5 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1307,7 +1307,7 @@ typedef enum { * Calls ->writepage(). */ static pageout_t pageout(struct folio *folio, struct address_space *mapping, - struct swap_iocb **plug) + struct swap_iocb **plug, struct list_head *folio_list) { /* * If the folio is dirty, only perform writeback if that write @@ -1355,6 +1355,14 @@ static pageout_t pageout(struct folio *folio, struct address_space *mapping, .swap_plug = plug, }; + /* + * The large shmem folio can be split if CONFIG_THP_SWAP is + * not enabled or contiguous swap entries are failed to + * allocate. + */ + if (shmem_mapping(mapping) && folio_test_large(folio)) + wbc.list = folio_list; + folio_set_reclaim(folio); res = mapping->a_ops->writepage(&folio->page, &wbc); if (res < 0) @@ -1931,11 +1939,6 @@ static unsigned int shrink_folio_list(struct list_head *folio_list, goto activate_locked_split; } } - } else if (folio_test_swapbacked(folio) && - folio_test_large(folio)) { - /* Split shmem folio */ - if (split_folio_to_list(folio, folio_list)) - goto keep_locked; } /* @@ -2036,12 +2039,25 @@ static unsigned int shrink_folio_list(struct list_head *folio_list, * starts and then write it out here. */ try_to_unmap_flush_dirty(); - switch (pageout(folio, mapping, &plug)) { + switch (pageout(folio, mapping, &plug, folio_list)) { case PAGE_KEEP: goto keep_locked; case PAGE_ACTIVATE: + /* + * If shmem folio is split when writeback to swap, + * the tail pages will make their own pass through + * this function and be accounted then. + */ + if (nr_pages > 1 && !folio_test_large(folio)) { + sc->nr_scanned -= (nr_pages - 1); + nr_pages = 1; + } goto activate_locked; case PAGE_SUCCESS: + if (nr_pages > 1 && !folio_test_large(folio)) { + sc->nr_scanned -= (nr_pages - 1); + nr_pages = 1; + } stat->nr_pageout += nr_pages; if (folio_test_writeback(folio))