diff --git a/include/linux/swap.h b/include/linux/swap.h index bea0c0f1f640552e399ae08e48da79046e413176..33396153afc0a38413105e8b571a3ca806d74503 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -255,22 +255,24 @@ enum { * free clusters are organized into a list. We fetch an entry from the list to * get a free cluster. * - * The data field stores next cluster if the cluster is free or cluster usage - * counter otherwise. The flags field determines if a cluster is free. This is - * protected by swap_info_struct.lock. + * The flags field determines if a cluster is free. This is + * protected by the cluster lock. */ struct swap_cluster_info { spinlock_t lock; /* * Protect swap_cluster_info fields - * and swap_info_struct->swap_map - * elements correspond to the swap - * cluster + * other than list, and swap_info_struct->swap_map + * elements corresponding to the swap cluster. */ - unsigned int data:24; - unsigned int flags:8; + u16 count; + u8 flags; + u8 order; + struct list_head list; }; #define CLUSTER_FLAG_FREE 1 /* This cluster is free */ -#define CLUSTER_FLAG_NEXT_NULL 2 /* This cluster has no next cluster */ +#define CLUSTER_FLAG_NONFULL 2 /* This cluster is on nonfull list */ +#define CLUSTER_FLAG_FRAG 4 /* This cluster is on frag list */ +#define CLUSTER_FLAG_FULL 8 /* This cluster is on full list */ /* * The first page in the swap file is the swap header, which is always marked @@ -295,11 +297,6 @@ struct percpu_cluster { unsigned int next[SWAP_NR_ORDERS]; /* Likely next allocation offset */ }; -struct swap_cluster_list { - struct swap_cluster_info head; - struct swap_cluster_info tail; -}; - /* * The in-memory structure used to track swap areas. */ @@ -312,7 +309,13 @@ struct swap_info_struct { unsigned int max; /* extent of the swap_map */ unsigned char *swap_map; /* vmalloc'ed array of usage counts */ struct swap_cluster_info *cluster_info; /* cluster info. Only for SSD */ - struct swap_cluster_list free_clusters; /* free clusters list */ + struct list_head free_clusters; /* free clusters list */ + struct list_head full_clusters; /* full clusters list */ + struct list_head nonfull_clusters[SWAP_NR_ORDERS]; + /* list of clusters that contain at least one free slot */ + struct list_head frag_clusters[SWAP_NR_ORDERS]; + /* list of clusters that are fragmented or contended */ + unsigned int frag_cluster_nr[SWAP_NR_ORDERS]; unsigned int lowest_bit; /* index of first free in swap_map */ unsigned int highest_bit; /* index of last free in swap_map */ unsigned int pages; /* total of usable pages of swap */ @@ -345,7 +348,8 @@ struct swap_info_struct { * list. 
*/ struct work_struct discard_work; /* discard worker */ - struct swap_cluster_list discard_clusters; /* discard clusters list */ + struct work_struct reclaim_work; /* reclaim worker */ + struct list_head discard_clusters; /* discard clusters list */ KABI_RESERVE(1) KABI_RESERVE(2) KABI_RESERVE(3) diff --git a/include/linux/zswap.h b/include/linux/zswap.h index 2a60ce39cfde19ac90cebb9b16aaba8b6a99c4ab..a13d2d2d91311659d18a4fbbeaaf7ca45bdf32b9 100644 --- a/include/linux/zswap.h +++ b/include/linux/zswap.h @@ -12,7 +12,7 @@ extern atomic_t zswap_stored_pages; bool zswap_store(struct folio *folio); bool zswap_load(struct folio *folio); -void zswap_invalidate(int type, pgoff_t offset); +void zswap_invalidate(swp_entry_t swp); void zswap_swapon(int type); void zswap_swapoff(int type); @@ -28,7 +28,7 @@ static inline bool zswap_load(struct folio *folio) return false; } -static inline void zswap_invalidate(int type, pgoff_t offset) {} +static inline void zswap_invalidate(swp_entry_t swp) {} static inline void zswap_swapon(int type) {} static inline void zswap_swapoff(int type) {} diff --git a/mm/swap_slots.c b/mm/swap_slots.c index 7af3b93d4c8c8a15bcfbf3adce5270830c2ab9d9..5579eed7065f8353df89b6c6b1f48b563b998700 100644 --- a/mm/swap_slots.c +++ b/mm/swap_slots.c @@ -34,6 +34,7 @@ #include #include #include +#include <linux/zswap.h> static DEFINE_PER_CPU(struct swap_slots_cache, swp_slots); #ifdef CONFIG_MEMCG_SWAP_QOS @@ -394,6 +395,9 @@ void free_swap_slot(swp_entry_t entry) { struct swap_slots_cache *cache; + /* Large folio swap slots are not covered. */ + zswap_invalidate(entry); + cache = raw_cpu_ptr(&swp_slots); if (likely(use_swap_slot_cache && cache->slots_ret)) { spin_lock_irq(&cache->free_lock); diff --git a/mm/swapfile.c b/mm/swapfile.c index 3af5b6ebb2412735e386f293eae789188a0350e6..3b48159820f2a1f7237ba5e6f4a56c8ea1266df8 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -52,6 +52,15 @@ static bool swap_count_continued(struct swap_info_struct *, pgoff_t, unsigned char); static void free_swap_count_continuations(struct swap_info_struct *); +static void swap_entry_range_free(struct swap_info_struct *si, swp_entry_t entry, + unsigned int nr_pages); +static void swap_range_alloc(struct swap_info_struct *si, unsigned long offset, + unsigned int nr_entries); +static bool folio_swapcache_freeable(struct folio *folio); +static struct swap_cluster_info *lock_cluster_or_swap_info( + struct swap_info_struct *si, unsigned long offset); +static void unlock_cluster_or_swap_info(struct swap_info_struct *si, + struct swap_cluster_info *ci); static DEFINE_SPINLOCK(swap_lock); static unsigned int nr_swapfiles; @@ -126,8 +135,25 @@ static inline unsigned char swap_count(unsigned char ent) * corresponding page */ #define TTRS_UNMAPPED 0x2 -/* Reclaim the swap entry if swap is getting full*/ +/* Reclaim the swap entry if swap is getting full */ #define TTRS_FULL 0x4 +/* Reclaim directly, bypass the slot cache and don't touch device lock */ +#define TTRS_DIRECT 0x8 + +static bool swap_is_has_cache(struct swap_info_struct *si, + unsigned long offset, int nr_pages) { + unsigned char *map = si->swap_map + offset; + unsigned char *map_end = map + nr_pages; + + do { + VM_BUG_ON(!(*map & SWAP_HAS_CACHE)); + if (*map != SWAP_HAS_CACHE) + return false; + } while (++map < map_end); + + return true; +} /* * returns number of pages in the folio that backs the swap entry. 
If positive, @@ -138,12 +164,19 @@ static int __try_to_reclaim_swap(struct swap_info_struct *si, unsigned long offset, unsigned long flags) { swp_entry_t entry = swp_entry(si->type, offset); + struct address_space *address_space = swap_address_space(entry); + struct swap_cluster_info *ci; struct folio *folio; - int ret = 0; + int ret, nr_pages; + bool need_reclaim; - folio = filemap_get_folio(swap_address_space(entry), offset); + folio = filemap_get_folio(address_space, offset); if (IS_ERR(folio)) return 0; + + nr_pages = folio_nr_pages(folio); + ret = -nr_pages; + /* * When this function is called from scan_swap_map_slots() and it's * called by vmscan.c at reclaiming folios. So we hold a folio lock @@ -151,14 +184,54 @@ static int __try_to_reclaim_swap(struct swap_info_struct *si, * case and you should use folio_free_swap() with explicit folio_lock() * in usual operations. */ - if (folio_trylock(folio)) { - if ((flags & TTRS_ANYWAY) || - ((flags & TTRS_UNMAPPED) && !folio_mapped(folio)) || - ((flags & TTRS_FULL) && mem_cgroup_swap_full(folio))) - ret = folio_free_swap(folio); - folio_unlock(folio); + if (!folio_trylock(folio)) + goto out; + + /* offset could point to the middle of a large folio */ + entry = folio->swap; + offset = swp_offset(entry); + + need_reclaim = ((flags & TTRS_ANYWAY) || + ((flags & TTRS_UNMAPPED) && !folio_mapped(folio)) || + ((flags & TTRS_FULL) && mem_cgroup_swap_full(folio))); + if (!need_reclaim || !folio_swapcache_freeable(folio)) + goto out_unlock; + + /* + * It's safe to delete the folio from swap cache only if the folio's + * swap_map is HAS_CACHE only, which means the slots have no page table + * reference or pending writeback, and can't be allocated to others. + */ + ci = lock_cluster_or_swap_info(si, offset); + need_reclaim = swap_is_has_cache(si, offset, nr_pages); + unlock_cluster_or_swap_info(si, ci); + if (!need_reclaim) + goto out_unlock; + + if (!(flags & TTRS_DIRECT)) { + /* Free through slot cache */ + delete_from_swap_cache(folio); + folio_set_dirty(folio); + ret = nr_pages; + goto out_unlock; } - ret = ret ? 
folio_nr_pages(folio) : -folio_nr_pages(folio); + + xa_lock_irq(&address_space->i_pages); + __delete_from_swap_cache(folio, entry, NULL); + xa_unlock_irq(&address_space->i_pages); + folio_ref_sub(folio, nr_pages); + folio_set_dirty(folio); + + spin_lock(&si->lock); + /* Only single page folios can be backed by zswap */ + if (nr_pages == 1) + zswap_invalidate(entry); + swap_entry_range_free(si, entry, nr_pages); + spin_unlock(&si->lock); + ret = nr_pages; +out_unlock: + folio_unlock(folio); +out: folio_put(folio); return ret; } @@ -289,62 +362,21 @@ static void discard_swap_cluster(struct swap_info_struct *si, #endif #define LATENCY_LIMIT 256 -static inline void cluster_set_flag(struct swap_cluster_info *info, - unsigned int flag) -{ - info->flags = flag; -} - -static inline unsigned int cluster_count(struct swap_cluster_info *info) -{ - return info->data; -} - -static inline void cluster_set_count(struct swap_cluster_info *info, - unsigned int c) -{ - info->data = c; -} - -static inline void cluster_set_count_flag(struct swap_cluster_info *info, - unsigned int c, unsigned int f) -{ - info->flags = f; - info->data = c; -} - -static inline unsigned int cluster_next(struct swap_cluster_info *info) -{ - return info->data; -} - -static inline void cluster_set_next(struct swap_cluster_info *info, - unsigned int n) -{ - info->data = n; -} - -static inline void cluster_set_next_flag(struct swap_cluster_info *info, - unsigned int n, unsigned int f) -{ - info->flags = f; - info->data = n; -} - static inline bool cluster_is_free(struct swap_cluster_info *info) { return info->flags & CLUSTER_FLAG_FREE; } -static inline bool cluster_is_null(struct swap_cluster_info *info) +static inline unsigned int cluster_index(struct swap_info_struct *si, + struct swap_cluster_info *ci) { - return info->flags & CLUSTER_FLAG_NEXT_NULL; + return ci - si->cluster_info; } -static inline void cluster_set_null(struct swap_cluster_info *info) +static inline unsigned int cluster_offset(struct swap_info_struct *si, + struct swap_cluster_info *ci) { - info->flags = CLUSTER_FLAG_NEXT_NULL; - info->data = 0; + return cluster_index(si, ci) * SWAPFILE_CLUSTER; } static inline struct swap_cluster_info *lock_cluster(struct swap_info_struct *si, @@ -393,65 +425,11 @@ static inline void unlock_cluster_or_swap_info(struct swap_info_struct *si, spin_unlock(&si->lock); } -static inline bool cluster_list_empty(struct swap_cluster_list *list) -{ - return cluster_is_null(&list->head); } - -static inline unsigned int cluster_list_first(struct swap_cluster_list *list) -{ - return cluster_next(&list->head); -} - -static void cluster_list_init(struct swap_cluster_list *list) -{ - cluster_set_null(&list->head); - cluster_set_null(&list->tail); -} - -static void cluster_list_add_tail(struct swap_cluster_list *list, - struct swap_cluster_info *ci, - unsigned int idx) -{ - if (cluster_list_empty(list)) { - cluster_set_next_flag(&list->head, idx, 0); - cluster_set_next_flag(&list->tail, idx, 0); - } else { - struct swap_cluster_info *ci_tail; - unsigned int tail = cluster_next(&list->tail); - - /* - * Nested cluster lock, but both cluster locks are - * only acquired when we held swap_info_struct->lock - */ - ci_tail = ci + tail; - spin_lock_nested(&ci_tail->lock, SINGLE_DEPTH_NESTING); - cluster_set_next(ci_tail, idx); - spin_unlock(&ci_tail->lock); - cluster_set_next_flag(&list->tail, idx, 0); - } -} - -static unsigned int cluster_list_del_first(struct swap_cluster_list *list, - struct swap_cluster_info *ci) -{ - unsigned int idx; - - idx = 
cluster_next(&list->head); - if (cluster_next(&list->tail) == idx) { - cluster_set_null(&list->head); - cluster_set_null(&list->tail); - } else - cluster_set_next_flag(&list->head, - cluster_next(&ci[idx]), 0); - - return idx; -} - /* Add a cluster to discard list and schedule it to do discard */ static void swap_cluster_schedule_discard(struct swap_info_struct *si, - unsigned int idx) + struct swap_cluster_info *ci) { + unsigned int idx = cluster_index(si, ci); /* * If scan_swap_map_slots() can't find a free cluster, it will check * si->swap_map directly. To make sure the discarding cluster isn't @@ -461,17 +439,23 @@ static void swap_cluster_schedule_discard(struct swap_info_struct *si, memset(si->swap_map + idx * SWAPFILE_CLUSTER, SWAP_MAP_BAD, SWAPFILE_CLUSTER); - cluster_list_add_tail(&si->discard_clusters, si->cluster_info, idx); - + VM_BUG_ON(ci->flags & CLUSTER_FLAG_FREE); + list_move_tail(&ci->list, &si->discard_clusters); + ci->flags = 0; schedule_work(&si->discard_work); } -static void __free_cluster(struct swap_info_struct *si, unsigned long idx) +static void __free_cluster(struct swap_info_struct *si, struct swap_cluster_info *ci) { - struct swap_cluster_info *ci = si->cluster_info; + lockdep_assert_held(&si->lock); + lockdep_assert_held(&ci->lock); - cluster_set_flag(ci + idx, CLUSTER_FLAG_FREE); - cluster_list_add_tail(&si->free_clusters, ci, idx); + if (ci->flags) + list_move_tail(&ci->list, &si->free_clusters); + else + list_add_tail(&ci->list, &si->free_clusters); + ci->flags = CLUSTER_FLAG_FREE; + ci->order = 0; } /* @@ -480,24 +464,24 @@ static void __free_cluster(struct swap_info_struct *si, unsigned long idx) */ static void swap_do_scheduled_discard(struct swap_info_struct *si) { - struct swap_cluster_info *info, *ci; + struct swap_cluster_info *ci; unsigned int idx; - info = si->cluster_info; - - while (!cluster_list_empty(&si->discard_clusters)) { - idx = cluster_list_del_first(&si->discard_clusters, info); + while (!list_empty(&si->discard_clusters)) { + ci = list_first_entry(&si->discard_clusters, struct swap_cluster_info, list); + list_del(&ci->list); + idx = cluster_index(si, ci); spin_unlock(&si->lock); discard_swap_cluster(si, idx * SWAPFILE_CLUSTER, SWAPFILE_CLUSTER); spin_lock(&si->lock); - ci = lock_cluster(si, idx * SWAPFILE_CLUSTER); - __free_cluster(si, idx); + spin_lock(&ci->lock); + __free_cluster(si, ci); memset(si->swap_map + idx * SWAPFILE_CLUSTER, 0, SWAPFILE_CLUSTER); - unlock_cluster(ci); + spin_unlock(&ci->lock); } } @@ -520,20 +504,15 @@ static void swap_users_ref_free(struct percpu_ref *ref) complete(&si->comp); } -static void alloc_cluster(struct swap_info_struct *si, unsigned long idx) +static void free_cluster(struct swap_info_struct *si, struct swap_cluster_info *ci) { - struct swap_cluster_info *ci = si->cluster_info; + VM_BUG_ON(ci->count != 0); + lockdep_assert_held(&si->lock); + lockdep_assert_held(&ci->lock); - VM_BUG_ON(cluster_list_first(&si->free_clusters) != idx); - cluster_list_del_first(&si->free_clusters, ci); - cluster_set_count_flag(ci + idx, 0, 0); -} - -static void free_cluster(struct swap_info_struct *si, unsigned long idx) -{ - struct swap_cluster_info *ci = si->cluster_info + idx; + if (ci->flags & CLUSTER_FLAG_FRAG) + si->frag_cluster_nr[ci->order]--; - VM_BUG_ON(cluster_count(ci) != 0); /* * If the swap is discardable, prepare discard the cluster * instead of free it immediately. 
The cluster will be freed @@ -541,160 +520,374 @@ static void free_cluster(struct swap_info_struct *si, unsigned long idx) */ if ((si->flags & (SWP_WRITEOK | SWP_PAGE_DISCARD)) == (SWP_WRITEOK | SWP_PAGE_DISCARD)) { - swap_cluster_schedule_discard(si, idx); + swap_cluster_schedule_discard(si, ci); return; } - __free_cluster(si, idx); + __free_cluster(si, ci); } /* - * The cluster corresponding to page_nr will be used. The cluster will be - * removed from free cluster list and its usage counter will be increased by - * count. + * The cluster corresponding to page_nr will be used. The cluster will not be + * added to free cluster list and its usage counter will be increased by 1. + * Only used for initialization. */ -static void add_cluster_info_page(struct swap_info_struct *p, - struct swap_cluster_info *cluster_info, unsigned long page_nr, - unsigned long count) +static void inc_cluster_info_page(struct swap_info_struct *p, + struct swap_cluster_info *cluster_info, unsigned long page_nr) { unsigned long idx = page_nr / SWAPFILE_CLUSTER; + struct swap_cluster_info *ci; if (!cluster_info) return; - if (cluster_is_free(&cluster_info[idx])) - alloc_cluster(p, idx); - VM_BUG_ON(cluster_count(&cluster_info[idx]) + count > SWAPFILE_CLUSTER); - cluster_set_count(&cluster_info[idx], - cluster_count(&cluster_info[idx]) + count); -} + ci = cluster_info + idx; + ci->count++; -/* - * The cluster corresponding to page_nr will be used. The cluster will be - * removed from free cluster list and its usage counter will be increased by 1. - */ -static void inc_cluster_info_page(struct swap_info_struct *p, - struct swap_cluster_info *cluster_info, unsigned long page_nr) -{ - add_cluster_info_page(p, cluster_info, page_nr, 1); + VM_BUG_ON(ci->count > SWAPFILE_CLUSTER); + VM_BUG_ON(ci->flags); } /* - * The cluster corresponding to page_nr decreases one usage. If the usage - * counter becomes 0, which means no page in the cluster is in using, we can - * optionally discard the cluster and add it to free cluster list. + * The cluster ci decreases @nr_pages usage. If the usage counter becomes 0, + * which means no page in the cluster is in use, we can optionally discard + * the cluster and add it to free cluster list. */ static void dec_cluster_info_page(struct swap_info_struct *p, - struct swap_cluster_info *cluster_info, unsigned long page_nr) + struct swap_cluster_info *ci, int nr_pages) { - unsigned long idx = page_nr / SWAPFILE_CLUSTER; - - if (!cluster_info) + if (!p->cluster_info) return; - VM_BUG_ON(cluster_count(&cluster_info[idx]) == 0); - cluster_set_count(&cluster_info[idx], - cluster_count(&cluster_info[idx]) - 1); + VM_BUG_ON(ci->count < nr_pages); + VM_BUG_ON(cluster_is_free(ci)); + lockdep_assert_held(&p->lock); + lockdep_assert_held(&ci->lock); + ci->count -= nr_pages; + + if (!ci->count) { + free_cluster(p, ci); + return; + } - if (cluster_count(&cluster_info[idx]) == 0) - free_cluster(p, idx); + if (!(ci->flags & CLUSTER_FLAG_NONFULL)) { + VM_BUG_ON(ci->flags & CLUSTER_FLAG_FREE); + if (ci->flags & CLUSTER_FLAG_FRAG) + p->frag_cluster_nr[ci->order]--; + list_move_tail(&ci->list, &p->nonfull_clusters[ci->order]); + ci->flags = CLUSTER_FLAG_NONFULL; + } } -/* - * It's possible scan_swap_map_slots() uses a free cluster in the middle of free - * cluster list. Avoiding such abuse to avoid list corruption. 
- */ -static bool -scan_swap_map_ssd_cluster_conflict(struct swap_info_struct *si, - unsigned long offset, int order) +static bool cluster_reclaim_range(struct swap_info_struct *si, + struct swap_cluster_info *ci, + unsigned long start, unsigned long end) { - struct percpu_cluster *percpu_cluster; - bool conflict; + unsigned char *map = si->swap_map; + unsigned long offset; - offset /= SWAPFILE_CLUSTER; - conflict = !cluster_list_empty(&si->free_clusters) && - offset != cluster_list_first(&si->free_clusters) && - cluster_is_free(&si->cluster_info[offset]); + spin_unlock(&ci->lock); + spin_unlock(&si->lock); - if (!conflict) - return false; + for (offset = start; offset < end; offset++) { + switch (READ_ONCE(map[offset])) { + case 0: + continue; + case SWAP_HAS_CACHE: + if (__try_to_reclaim_swap(si, offset, TTRS_ANYWAY | TTRS_DIRECT) > 0) + continue; + goto out; + default: + goto out; + } + } +out: + spin_lock(&si->lock); + spin_lock(&ci->lock); + + /* + * Recheck the range no matter reclaim succeeded or not, the slot + * could have been freed while we are not holding the lock. + */ + for (offset = start; offset < end; offset++) + if (READ_ONCE(map[offset])) + return false; - percpu_cluster = this_cpu_ptr(si->percpu_cluster); - percpu_cluster->next[order] = SWAP_NEXT_INVALID; return true; } -static inline bool swap_range_empty(char *swap_map, unsigned int start, - unsigned int nr_pages) +static bool cluster_scan_range(struct swap_info_struct *si, + struct swap_cluster_info *ci, + unsigned long start, unsigned int nr_pages) { - unsigned int i; + unsigned long offset, end = start + nr_pages; + unsigned char *map = si->swap_map; + bool need_reclaim = false; - for (i = 0; i < nr_pages; i++) { - if (swap_map[start + i]) + for (offset = start; offset < end; offset++) { + switch (READ_ONCE(map[offset])) { + case 0: + continue; + case SWAP_HAS_CACHE: + if (!vm_swap_full()) + return false; + need_reclaim = true; + continue; + default: return false; + } } + if (need_reclaim) + return cluster_reclaim_range(si, ci, start, end); + return true; } +static bool cluster_alloc_range(struct swap_info_struct *si, struct swap_cluster_info *ci, + unsigned int start, unsigned char usage, + unsigned int order) +{ + unsigned int nr_pages = 1 << order; + + if (!(si->flags & SWP_WRITEOK)) + return false; + + if (cluster_is_free(ci)) { + if (nr_pages < SWAPFILE_CLUSTER) { + list_move_tail(&ci->list, &si->nonfull_clusters[order]); + ci->flags = CLUSTER_FLAG_NONFULL; + } + ci->order = order; + } + + memset(si->swap_map + start, usage, nr_pages); + swap_range_alloc(si, start, nr_pages); + ci->count += nr_pages; + + if (ci->count == SWAPFILE_CLUSTER) { + VM_BUG_ON(!(ci->flags & + (CLUSTER_FLAG_FREE | CLUSTER_FLAG_NONFULL | CLUSTER_FLAG_FRAG))); + if (ci->flags & CLUSTER_FLAG_FRAG) + si->frag_cluster_nr[ci->order]--; + list_move_tail(&ci->list, &si->full_clusters); + ci->flags = CLUSTER_FLAG_FULL; + } + + return true; +} + +static unsigned int alloc_swap_scan_cluster(struct swap_info_struct *si, unsigned long offset, + unsigned int *foundp, unsigned int order, + unsigned char usage) +{ + unsigned long start = offset & ~(SWAPFILE_CLUSTER - 1); + unsigned long end = min(start + SWAPFILE_CLUSTER, si->max); + unsigned int nr_pages = 1 << order; + struct swap_cluster_info *ci; + + if (end < nr_pages) + return SWAP_NEXT_INVALID; + end -= nr_pages; + + ci = lock_cluster(si, offset); + if (ci->count + nr_pages > SWAPFILE_CLUSTER) { + offset = SWAP_NEXT_INVALID; + goto done; + } + + while (offset <= end) { + if 
(cluster_scan_range(si, ci, offset, nr_pages)) { + if (!cluster_alloc_range(si, ci, offset, usage, order)) { + offset = SWAP_NEXT_INVALID; + goto done; + } + *foundp = offset; + if (ci->count == SWAPFILE_CLUSTER) { + offset = SWAP_NEXT_INVALID; + goto done; + } + offset += nr_pages; + break; + } + offset += nr_pages; + } + if (offset > end) + offset = SWAP_NEXT_INVALID; +done: + unlock_cluster(ci); + return offset; +} + +/* Reclaim cache-only slots from clusters on the full list */ +static void swap_reclaim_full_clusters(struct swap_info_struct *si, bool force) +{ + long to_scan = 1; + unsigned long offset, end; + struct swap_cluster_info *ci; + unsigned char *map = si->swap_map; + int nr_reclaim; + + if (force) + to_scan = si->inuse_pages / SWAPFILE_CLUSTER; + + while (!list_empty(&si->full_clusters)) { + ci = list_first_entry(&si->full_clusters, struct swap_cluster_info, list); + list_move_tail(&ci->list, &si->full_clusters); + offset = cluster_offset(si, ci); + end = min(si->max, offset + SWAPFILE_CLUSTER); + to_scan--; + + spin_unlock(&si->lock); + while (offset < end) { + if (READ_ONCE(map[offset]) == SWAP_HAS_CACHE) { + nr_reclaim = __try_to_reclaim_swap(si, offset, + TTRS_ANYWAY | TTRS_DIRECT); + if (nr_reclaim) { + offset += abs(nr_reclaim); + continue; + } + } + offset++; + } + spin_lock(&si->lock); + + if (to_scan <= 0) + break; + } +} + +static void swap_reclaim_work(struct work_struct *work) +{ + struct swap_info_struct *si; + + si = container_of(work, struct swap_info_struct, reclaim_work); + + spin_lock(&si->lock); + swap_reclaim_full_clusters(si, true); + spin_unlock(&si->lock); +} + /* * Try to get swap entries with specified order from current cpu's swap entry * pool (a cluster). This might involve allocating a new cluster for current CPU * too. */ -static bool scan_swap_map_try_ssd_cluster(struct swap_info_struct *si, - unsigned long *offset, unsigned long *scan_base, int order) +static unsigned long cluster_alloc_swap_entry(struct swap_info_struct *si, int order, + unsigned char usage) { - unsigned int nr_pages = 1 << order; struct percpu_cluster *cluster; struct swap_cluster_info *ci; - unsigned int tmp, max; + unsigned int offset, found = 0; new_cluster: + lockdep_assert_held(&si->lock); cluster = this_cpu_ptr(si->percpu_cluster); - tmp = cluster->next[order]; - if (tmp == SWAP_NEXT_INVALID) { - if (!cluster_list_empty(&si->free_clusters)) { - tmp = cluster_next(&si->free_clusters.head) * - SWAPFILE_CLUSTER; - } else if (!cluster_list_empty(&si->discard_clusters)) { - /* - * we don't have free cluster but have some clusters in - * discarding, do discard now and reclaim them, then - * reread cluster_next_cpu since we dropped si->lock - */ - swap_do_scheduled_discard(si); - *scan_base = this_cpu_read(*si->cluster_next_cpu); - *offset = *scan_base; - goto new_cluster; - } else - return false; + offset = cluster->next[order]; + if (offset) { + offset = alloc_swap_scan_cluster(si, offset, &found, order, usage); + if (found) + goto done; } - /* - * Other CPUs can use our cluster if they can't find a free cluster, - * check if there is still free entry in the cluster, maintaining - * natural alignment. 
- */ - max = min_t(unsigned long, si->max, ALIGN(tmp + 1, SWAPFILE_CLUSTER)); - if (tmp < max) { - ci = lock_cluster(si, tmp); - while (tmp < max) { - if (swap_range_empty(si->swap_map, tmp, nr_pages)) + if (!list_empty(&si->free_clusters)) { + ci = list_first_entry(&si->free_clusters, struct swap_cluster_info, list); + offset = alloc_swap_scan_cluster(si, cluster_offset(si, ci), &found, order, usage); + /* + * Either we didn't touch the cluster due to swapoff, + * or the allocation must succeed. + */ + VM_BUG_ON((si->flags & SWP_WRITEOK) && !found); + goto done; + } + + /* Try reclaim from full clusters if the free clusters list is drained */ + if (vm_swap_full()) + swap_reclaim_full_clusters(si, false); + + if (order < PMD_ORDER) { + unsigned int frags = 0; + + while (!list_empty(&si->nonfull_clusters[order])) { + ci = list_first_entry(&si->nonfull_clusters[order], + struct swap_cluster_info, list); + list_move_tail(&ci->list, &si->frag_clusters[order]); + ci->flags = CLUSTER_FLAG_FRAG; + si->frag_cluster_nr[order]++; + offset = alloc_swap_scan_cluster(si, cluster_offset(si, ci), + &found, order, usage); + frags++; + if (found) break; - tmp += nr_pages; } - unlock_cluster(ci); + + if (!found) { + /* + * Nonfull clusters are moved to the frag tail if we reached + * here, count them too, don't over scan the frag list. + */ + while (frags < si->frag_cluster_nr[order]) { + ci = list_first_entry(&si->frag_clusters[order], + struct swap_cluster_info, list); + /* + * Rotate the frag list to iterate, they were all failing + * high order allocation or moved here due to per-CPU usage, + * this helps keep usable clusters ahead. + */ + list_move_tail(&ci->list, &si->frag_clusters[order]); + offset = alloc_swap_scan_cluster(si, cluster_offset(si, ci), + &found, order, usage); + frags++; + if (found) + break; + } + } } - if (tmp >= max) { - cluster->next[order] = SWAP_NEXT_INVALID; + + if (found) + goto done; + + if (!list_empty(&si->discard_clusters)) { + /* + * we don't have a free cluster but have some clusters in + * discarding, do discard now and reclaim them, then + * retry since we dropped si->lock + */ + swap_do_scheduled_discard(si); goto new_cluster; } - *offset = tmp; - *scan_base = tmp; - tmp += nr_pages; - cluster->next[order] = tmp < max ? tmp : SWAP_NEXT_INVALID; - return true; + + if (order) + goto done; + + /* Order 0 stealing from higher order */ + for (int o = 1; o < SWAP_NR_ORDERS; o++) { + /* + * Clusters here have at least one usable slot and can't fail order 0 + * allocation, but reclaim may drop si->lock and race with another user. 
+ */ + while (!list_empty(&si->frag_clusters[o])) { + ci = list_first_entry(&si->frag_clusters[o], + struct swap_cluster_info, list); + offset = alloc_swap_scan_cluster(si, cluster_offset(si, ci), + &found, 0, usage); + if (found) + goto done; + } + + while (!list_empty(&si->nonfull_clusters[o])) { + ci = list_first_entry(&si->nonfull_clusters[o], + struct swap_cluster_info, list); + offset = alloc_swap_scan_cluster(si, cluster_offset(si, ci), + &found, 0, usage); + if (found) + goto done; + } + } + +done: + cluster->next[order] = offset; + return found; } static void __del_from_avail_list(struct swap_info_struct *p) @@ -727,6 +920,9 @@ static void swap_range_alloc(struct swap_info_struct *si, unsigned long offset, si->lowest_bit = si->max; si->highest_bit = 0; del_from_avail_list(si); + + if (si->cluster_info && vm_swap_full()) + schedule_work(&si->reclaim_work); } } @@ -765,7 +961,6 @@ static void swap_range_free(struct swap_info_struct *si, unsigned long offset, swap_slot_free_notify = NULL; while (offset <= end) { arch_swap_invalidate_page(si->type, offset); - zswap_invalidate(si->type, offset); if (swap_slot_free_notify) swap_slot_free_notify(si->bdev, offset); offset++; @@ -816,11 +1011,33 @@ static bool swap_offset_available_and_locked(struct swap_info_struct *si, return false; } +static int cluster_alloc_swap(struct swap_info_struct *si, + unsigned char usage, int nr, + swp_entry_t slots[], int order) +{ + int n_ret = 0; + + VM_BUG_ON(!si->cluster_info); + + si->flags += SWP_SCANNING; + + while (n_ret < nr) { + unsigned long offset = cluster_alloc_swap_entry(si, order, usage); + + if (!offset) + break; + slots[n_ret++] = swp_entry(si->type, offset); + } + + si->flags -= SWP_SCANNING; + + return n_ret; +} + static int scan_swap_map_slots(struct swap_info_struct *si, unsigned char usage, int nr, swp_entry_t slots[], int order) { - struct swap_cluster_info *ci; unsigned long offset; unsigned long scan_base; unsigned long last_in_cluster = 0; @@ -859,26 +1076,16 @@ static int scan_swap_map_slots(struct swap_info_struct *si, return 0; } + if (si->cluster_info) + return cluster_alloc_swap(si, usage, nr, slots, order); + si->flags += SWP_SCANNING; - /* - * Use percpu scan base for SSD to reduce lock contention on - * cluster and swap cache. For HDD, sequential access is more - * important. - */ - if (si->flags & SWP_SOLIDSTATE) - scan_base = this_cpu_read(*si->cluster_next_cpu); - else - scan_base = si->cluster_next; + + /* For HDD, sequential access is more important. */ + scan_base = si->cluster_next; offset = scan_base; - /* SSD algorithm */ - if (si->cluster_info) { - if (!scan_swap_map_try_ssd_cluster(si, &offset, &scan_base, order)) { - if (order > 0) - goto no_page; - goto scan; - } - } else if (unlikely(!si->cluster_nr--)) { + if (unlikely(!si->cluster_nr--)) { if (si->pages - si->inuse_pages < SWAPFILE_CLUSTER) { si->cluster_nr = SWAPFILE_CLUSTER - 1; goto checks; @@ -889,8 +1096,6 @@ static int scan_swap_map_slots(struct swap_info_struct *si, /* * If seek is expensive, start searching for new cluster from * start of partition, to minimize the span of allocated swap. - * If seek is cheap, that is the SWP_SOLIDSTATE si->cluster_info - * case, just handled by scan_swap_map_try_ssd_cluster() above. 
*/ scan_base = offset = si->lowest_bit; last_in_cluster = offset + SWAPFILE_CLUSTER - 1; @@ -918,19 +1123,6 @@ static int scan_swap_map_slots(struct swap_info_struct *si, } checks: - if (si->cluster_info) { - while (scan_swap_map_ssd_cluster_conflict(si, offset, order)) { - /* take a break if we already got some slots */ - if (n_ret) - goto done; - if (!scan_swap_map_try_ssd_cluster(si, &offset, - &scan_base, order)) { - if (order > 0) - goto no_page; - goto scan; - } - } - } if (!(si->flags & SWP_WRITEOK)) goto no_page; if (!si->highest_bit) @@ -938,13 +1130,11 @@ static int scan_swap_map_slots(struct swap_info_struct *si, if (offset > si->highest_bit) scan_base = offset = si->lowest_bit; - ci = lock_cluster(si, offset); /* reuse swap entry of cache-only swap if not busy. */ if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) { int swap_was_freed; - unlock_cluster(ci); spin_unlock(&si->lock); - swap_was_freed = __try_to_reclaim_swap(si, offset, TTRS_ANYWAY); + swap_was_freed = __try_to_reclaim_swap(si, offset, TTRS_ANYWAY | TTRS_DIRECT); spin_lock(&si->lock); /* entry was freed successfully, try to use this again */ if (swap_was_freed > 0) @@ -953,15 +1143,12 @@ static int scan_swap_map_slots(struct swap_info_struct *si, } if (si->swap_map[offset]) { - unlock_cluster(ci); if (!n_ret) goto scan; else goto done; } memset(si->swap_map + offset, usage, nr_pages); - add_cluster_info_page(si, si->cluster_info, offset, nr_pages); - unlock_cluster(ci); swap_range_alloc(si, offset, nr_pages); slots[n_ret++] = swp_entry(si->type, offset); @@ -982,13 +1169,7 @@ static int scan_swap_map_slots(struct swap_info_struct *si, latency_ration = LATENCY_LIMIT; } - /* try to get more slots in cluster */ - if (si->cluster_info) { - if (scan_swap_map_try_ssd_cluster(si, &offset, &scan_base, order)) - goto checks; - if (order > 0) - goto done; - } else if (si->cluster_nr && !si->swap_map[++offset]) { + if (si->cluster_nr && !si->swap_map[++offset]) { /* non-ssd case, still more slots in cluster? */ --si->cluster_nr; goto checks; @@ -1049,19 +1230,6 @@ static int scan_swap_map_slots(struct swap_info_struct *si, return n_ret; } -static void swap_free_cluster(struct swap_info_struct *si, unsigned long idx) -{ - unsigned long offset = idx * SWAPFILE_CLUSTER; - struct swap_cluster_info *ci; - - ci = lock_cluster(si, offset); - memset(si->swap_map + offset, 0, SWAPFILE_CLUSTER); - cluster_set_count_flag(ci, 0, 0); - free_cluster(si, idx); - unlock_cluster(ci); - swap_range_free(si, offset, SWAPFILE_CLUSTER); -} - #ifdef CONFIG_MEMCG_SWAP_QOS int write_swapfile_for_memcg(struct address_space *mapping, int *swap_type) { @@ -1409,21 +1577,28 @@ static unsigned char __swap_entry_free(struct swap_info_struct *p, return usage; } -static void swap_entry_free(struct swap_info_struct *p, swp_entry_t entry) +/* + * Drop the last HAS_CACHE flag of swap entries; the caller has to + * ensure all entries belong to the same cgroup. 
+ */ +static void swap_entry_range_free(struct swap_info_struct *p, swp_entry_t entry, + unsigned int nr_pages) { - struct swap_cluster_info *ci; unsigned long offset = swp_offset(entry); - unsigned char count; + unsigned char *map = p->swap_map + offset; + unsigned char *map_end = map + nr_pages; + struct swap_cluster_info *ci; ci = lock_cluster(p, offset); - count = p->swap_map[offset]; - VM_BUG_ON(count != SWAP_HAS_CACHE); - p->swap_map[offset] = 0; - dec_cluster_info_page(p, p->cluster_info, offset); + do { + VM_BUG_ON(*map != SWAP_HAS_CACHE); + *map = 0; + } while (++map < map_end); + dec_cluster_info_page(p, ci, nr_pages); unlock_cluster(ci); - mem_cgroup_uncharge_swap(entry, 1); - swap_range_free(p, offset, 1); + mem_cgroup_uncharge_swap(entry, nr_pages); + swap_range_free(p, offset, nr_pages); } static void cluster_swap_free_nr(struct swap_info_struct *sis, @@ -1484,12 +1659,8 @@ void swap_free_nr(swp_entry_t entry, int nr_pages) void put_swap_folio(struct folio *folio, swp_entry_t entry) { unsigned long offset = swp_offset(entry); - unsigned long idx = offset / SWAPFILE_CLUSTER; struct swap_cluster_info *ci; struct swap_info_struct *si; - unsigned char *map; - unsigned int i, free_entries = 0; - unsigned char val; int size = 1 << swap_entry_order(folio_order(folio)); si = _swap_info_get(entry); @@ -1497,24 +1668,14 @@ void put_swap_folio(struct folio *folio, swp_entry_t entry) return; ci = lock_cluster_or_swap_info(si, offset); - if (size == SWAPFILE_CLUSTER) { - map = si->swap_map + offset; - for (i = 0; i < SWAPFILE_CLUSTER; i++) { - val = map[i]; - VM_BUG_ON(!(val & SWAP_HAS_CACHE)); - if (val == SWAP_HAS_CACHE) - free_entries++; - } - if (free_entries == SWAPFILE_CLUSTER) { - unlock_cluster_or_swap_info(si, ci); - spin_lock(&si->lock); - mem_cgroup_uncharge_swap(entry, SWAPFILE_CLUSTER); - swap_free_cluster(si, idx); - spin_unlock(&si->lock); - return; - } + if (size > 1 && swap_is_has_cache(si, offset, size)) { + unlock_cluster_or_swap_info(si, ci); + spin_lock(&si->lock); + swap_entry_range_free(si, entry, size); + spin_unlock(&si->lock); + return; } - for (i = 0; i < size; i++, entry.val++) { + for (int i = 0; i < size; i++, entry.val++) { if (!__swap_entry_free_locked(si, offset + i, SWAP_HAS_CACHE)) { unlock_cluster_or_swap_info(si, ci); free_swap_slot(entry); @@ -1554,7 +1715,7 @@ void swapcache_free_entries(swp_entry_t *entries, int n) for (i = 0; i < n; ++i) { p = swap_info_get_cont(entries[i], prev); if (p) - swap_entry_free(p, entries[i]); + swap_entry_range_free(p, entries[i], 1); prev = p; } if (p) @@ -1674,16 +1835,7 @@ static bool folio_swapped(struct folio *folio) return swap_page_trans_huge_swapped(si, entry, folio_order(folio)); } -/** - * folio_free_swap() - Free the swap space used for this folio. - * @folio: The folio to remove. - * - * If swap is getting full, or if there are no more mappings of this folio, - * then call folio_free_swap to free its swap space. - * - * Return: true if we were able to release the swap space. 
- */ -bool folio_free_swap(struct folio *folio) +static bool folio_swapcache_freeable(struct folio *folio) { VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); @@ -1691,8 +1843,6 @@ bool folio_free_swap(struct folio *folio) return false; if (folio_test_writeback(folio)) return false; - if (folio_swapped(folio)) - return false; /* * Once hibernation has begun to create its image of memory, @@ -1712,6 +1862,25 @@ bool folio_free_swap(struct folio *folio) if (pm_suspended_storage()) return false; + return true; +} + +/** + * folio_free_swap() - Free the swap space used for this folio. + * @folio: The folio to remove. + * + * If swap is getting full, or if there are no more mappings of this folio, + * then call folio_free_swap to free its swap space. + * + * Return: true if we were able to release the swap space. + */ +bool folio_free_swap(struct folio *folio) +{ + if (!folio_swapcache_freeable(folio)) + return false; + if (folio_swapped(folio)) + return false; + delete_from_swap_cache(folio); folio_set_dirty(folio); return true; @@ -1788,7 +1957,7 @@ void free_swap_and_cache_nr(swp_entry_t entry, int nr) * to the next boundary. */ nr = __try_to_reclaim_swap(si, offset, - TTRS_UNMAPPED | TTRS_FULL); + TTRS_UNMAPPED | TTRS_FULL); if (nr == 0) nr = 1; else if (nr < 0) @@ -2686,6 +2855,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) wait_for_completion(&p->comp); flush_work(&p->discard_work); + flush_work(&p->reclaim_work); destroy_swap_extents(p); if (p->flags & SWP_CONTINUED) @@ -3114,8 +3284,15 @@ static int setup_swap_map_and_extents(struct swap_info_struct *p, nr_good_pages = maxpages - 1; /* omit header page */ - cluster_list_init(&p->free_clusters); - cluster_list_init(&p->discard_clusters); + INIT_LIST_HEAD(&p->free_clusters); + INIT_LIST_HEAD(&p->full_clusters); + INIT_LIST_HEAD(&p->discard_clusters); + + for (i = 0; i < SWAP_NR_ORDERS; i++) { + INIT_LIST_HEAD(&p->nonfull_clusters[i]); + INIT_LIST_HEAD(&p->frag_clusters[i]); + p->frag_cluster_nr[i] = 0; + } for (i = 0; i < swap_header->info.nr_badpages; i++) { unsigned int page_nr = swap_header->info.badpages[i]; @@ -3158,7 +3335,6 @@ static int setup_swap_map_and_extents(struct swap_info_struct *p, if (!cluster_info) return nr_extents; - /* * Reduce false cache line sharing between cluster_info and * sharing same address space. 
@@ -3166,14 +3342,18 @@ static int setup_swap_map_and_extents(struct swap_info_struct *p, for (k = 0; k < SWAP_CLUSTER_COLS; k++) { j = (k + col) % SWAP_CLUSTER_COLS; for (i = 0; i < DIV_ROUND_UP(nr_clusters, SWAP_CLUSTER_COLS); i++) { + struct swap_cluster_info *ci; idx = i * SWAP_CLUSTER_COLS + j; + ci = cluster_info + idx; if (idx >= nr_clusters) continue; - if (cluster_count(&cluster_info[idx])) + if (ci->count) { + ci->flags = CLUSTER_FLAG_NONFULL; + list_add_tail(&ci->list, &p->nonfull_clusters[0]); continue; - cluster_set_flag(&cluster_info[idx], CLUSTER_FLAG_FREE); - cluster_list_add_tail(&p->free_clusters, cluster_info, - idx); + } + ci->flags = CLUSTER_FLAG_FREE; + list_add_tail(&ci->list, &p->free_clusters); } } return nr_extents; @@ -3212,6 +3392,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) return PTR_ERR(p); INIT_WORK(&p->discard_work, swap_discard_work); + INIT_WORK(&p->reclaim_work, swap_reclaim_work); name = getname(specialfile); if (IS_ERR(name)) { diff --git a/mm/zswap.c b/mm/zswap.c index 69681b9173fdcbf9f90967e21b5159760a16c601..5acda5b906bc43bcdc993dcdb9702e48307da0fa 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -1482,9 +1482,10 @@ bool zswap_load(struct folio *folio) return ret; } -void zswap_invalidate(int type, pgoff_t offset) +void zswap_invalidate(swp_entry_t swp) { - struct zswap_tree *tree = zswap_trees[type]; + pgoff_t offset = swp_offset(swp); + struct zswap_tree *tree = zswap_trees[swp_type(swp)]; struct zswap_entry *entry; /* find */
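
Note on the new cluster bookkeeping: the patch replaces the packed data:24/flags:8 encoding of struct swap_cluster_info with explicit count, order and flags fields plus a list_head, so each cluster sits on exactly one of free_clusters, nonfull_clusters[order], frag_clusters[order] or full_clusters and is moved between them as slots are allocated and freed. The sketch below is a minimal userspace model of those transitions only; the names demo_list, demo_cluster and demo_swap_info are hypothetical stand-ins for list_head, swap_cluster_info and swap_info_struct, and the locking, reclaim, swap_map and frag-list handling of the real code are deliberately left out.

/*
 * Minimal userspace model of the cluster list transitions introduced above.
 * demo_list, demo_cluster and demo_swap_info are hypothetical stand-ins for
 * list_head, swap_cluster_info and swap_info_struct.
 */
#include <assert.h>
#include <stdio.h>

#define DEMO_CLUSTER_SLOTS 512          /* stands in for SWAPFILE_CLUSTER */
#define DEMO_NR_ORDERS     2            /* stands in for SWAP_NR_ORDERS */

enum demo_flags { DEMO_FREE = 1, DEMO_NONFULL = 2, DEMO_FRAG = 4, DEMO_FULL = 8 };

struct demo_list { struct demo_list *prev, *next; };

static void list_init(struct demo_list *h) { h->prev = h->next = h; }
static void list_del_(struct demo_list *n) { n->prev->next = n->next; n->next->prev = n->prev; }
static void list_add_tail_(struct demo_list *n, struct demo_list *h)
{ n->prev = h->prev; n->next = h; h->prev->next = n; h->prev = n; }
static void list_move_tail_(struct demo_list *n, struct demo_list *h)
{ list_del_(n); list_add_tail_(n, h); }

struct demo_cluster {
        unsigned int count;             /* used slots, like ci->count */
        unsigned int order;             /* allocation order served, like ci->order */
        unsigned int flags;             /* DEMO_* state, like ci->flags */
        struct demo_list list;          /* membership in exactly one list */
};

struct demo_swap_info {
        struct demo_list free_clusters, full_clusters;
        struct demo_list nonfull_clusters[DEMO_NR_ORDERS];
};

/* Roughly mirrors cluster_alloc_range(): take nr slots, then reclassify the cluster. */
static void demo_alloc(struct demo_swap_info *si, struct demo_cluster *ci,
                       unsigned int order, unsigned int nr)
{
        if (ci->flags == DEMO_FREE) {           /* first use: leave the free list */
                ci->order = order;
                list_move_tail_(&ci->list, &si->nonfull_clusters[order]);
                ci->flags = DEMO_NONFULL;
        }
        ci->count += nr;
        assert(ci->count <= DEMO_CLUSTER_SLOTS);
        if (ci->count == DEMO_CLUSTER_SLOTS) {  /* completely used: park on the full list */
                list_move_tail_(&ci->list, &si->full_clusters);
                ci->flags = DEMO_FULL;
        }
}

/* Roughly mirrors dec_cluster_info_page()/free_cluster(): drop nr slots. */
static void demo_free(struct demo_swap_info *si, struct demo_cluster *ci, unsigned int nr)
{
        assert(ci->count >= nr);
        ci->count -= nr;
        if (!ci->count) {                       /* empty again: back to the free list */
                list_move_tail_(&ci->list, &si->free_clusters);
                ci->flags = DEMO_FREE;
                ci->order = 0;
        } else if (!(ci->flags & DEMO_NONFULL)) {
                list_move_tail_(&ci->list, &si->nonfull_clusters[ci->order]);
                ci->flags = DEMO_NONFULL;       /* has free slots for its order again */
        }
}

int main(void)
{
        struct demo_swap_info si;
        struct demo_cluster c = { .flags = DEMO_FREE };

        list_init(&si.free_clusters);
        list_init(&si.full_clusters);
        for (int i = 0; i < DEMO_NR_ORDERS; i++)
                list_init(&si.nonfull_clusters[i]);
        list_add_tail_(&c.list, &si.free_clusters);

        demo_alloc(&si, &c, 0, DEMO_CLUSTER_SLOTS);     /* free -> nonfull -> full */
        printf("after fill: flags=%u count=%u\n", c.flags, c.count);
        demo_free(&si, &c, 1);                          /* full -> nonfull */
        printf("after one free: flags=%u count=%u\n", c.flags, c.count);
        demo_free(&si, &c, DEMO_CLUSTER_SLOTS - 1);     /* nonfull -> free */
        printf("after drain: flags=%u count=%u\n", c.flags, c.count);
        return 0;
}

This one-list-at-a-time discipline is also why __free_cluster() in the patch can use list_move_tail() whenever ci->flags is non-zero and only needs list_add_tail() for a cluster that was never on any list.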