diff --git a/0006-kabi_test.patch b/0006-kabi_test.patch new file mode 100644 index 0000000000000000000000000000000000000000..68850724369f15dbf37378f44970dd91302faeda --- /dev/null +++ b/0006-kabi_test.patch @@ -0,0 +1,3831 @@ +diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig +index fcd0c3b2065d..a6bbe6029121 100644 +--- a/arch/x86/Kconfig ++++ b/arch/x86/Kconfig +@@ -2075,6 +2075,7 @@ config HYGON_CSV + bool "Hygon secure virtualization CSV support" + default y + depends on CPU_SUP_HYGON && AMD_MEM_ENCRYPT ++ select CMA + help + Hygon CSV integrates secure processor, memory encryption and + memory isolation to provide the ability to protect guest's private +diff --git a/arch/x86/configs/openeuler_defconfig b/arch/x86/configs/openeuler_defconfig +index 8e8542796a13..adfaef0cb10c 100644 +--- a/arch/x86/configs/openeuler_defconfig ++++ b/arch/x86/configs/openeuler_defconfig +@@ -1158,7 +1158,11 @@ CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK=y + CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK=y + CONFIG_USE_PERCPU_NUMA_NODE_ID=y + CONFIG_HAVE_SETUP_PER_CPU_AREA=y +-# CONFIG_CMA is not set ++CONFIG_CMA=y ++# CONFIG_CMA_DEBUG is not set ++# CONFIG_CMA_DEBUGFS is not set ++# CONFIG_CMA_SYSFS is not set ++CONFIG_CMA_AREAS=19 + CONFIG_MEM_SOFT_DIRTY=y + CONFIG_GENERIC_EARLY_IOREMAP=y + CONFIG_DEFERRED_STRUCT_PAGE_INIT=y +@@ -9018,6 +9022,18 @@ CONFIG_ARCH_HAS_FORCE_DMA_UNENCRYPTED=y + CONFIG_SWIOTLB=y + # CONFIG_SWIOTLB_DYNAMIC is not set + CONFIG_DMA_COHERENT_POOL=y ++CONFIG_DMA_CMA=y ++# CONFIG_DMA_NUMA_CMA is not set ++ ++# ++# Default contiguous memory area size: ++# ++CONFIG_CMA_SIZE_MBYTES=0 ++CONFIG_CMA_SIZE_SEL_MBYTES=y ++# CONFIG_CMA_SIZE_SEL_PERCENTAGE is not set ++# CONFIG_CMA_SIZE_SEL_MIN is not set ++# CONFIG_CMA_SIZE_SEL_MAX is not set ++CONFIG_CMA_ALIGNMENT=8 + # CONFIG_DMA_API_DEBUG is not set + # CONFIG_DMA_MAP_BENCHMARK is not set + CONFIG_SGL_ALLOC=y +diff --git a/fs/nfs/nfs4super.c b/fs/nfs/nfs4super.c +index bb13894ad152..e87f878178f3 100644 +--- a/fs/nfs/nfs4super.c ++++ b/fs/nfs/nfs4super.c +@@ -209,7 +209,7 @@ static int do_nfs4_mount(struct nfs_server *server, + if (IS_ERR(dentry)) + return PTR_ERR(dentry); + +- dentry->d_sb->s_flags = fc->sb_flags; ++ dentry->d_sb->s_flags |= (fc->sb_flags & SB_RDONLY); + fc->root = dentry; + return 0; + } +diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h +index 6e3227a688de..f3fd0407d346 100644 +--- a/include/linux/cgroup-defs.h ++++ b/include/linux/cgroup-defs.h +@@ -325,6 +325,8 @@ struct cgroup_base_stat { + #ifdef CONFIG_SCHED_CORE + u64 forceidle_sum; + #endif ++ KABI_RESERVE(1) ++ KABI_RESERVE(2) + }; + + /* +@@ -555,6 +557,9 @@ struct cgroup { + KABI_RESERVE(3) + KABI_RESERVE(4) + KABI_RESERVE(5) ++ KABI_RESERVE(6) ++ KABI_RESERVE(7) ++ KABI_RESERVE(8) + /* All ancestors including self */ + struct cgroup *ancestors[]; + }; +@@ -573,6 +578,10 @@ struct cgroup_root { + /* Unique id for this hierarchy. */ + int hierarchy_id; + ++ /* A list running through the active hierarchies */ ++ struct list_head root_list; ++ struct rcu_head rcu; /* Must be near the top */ ++ + /* + * The root cgroup. The containing cgroup_root will be destroyed on its + * release. 
cgrp->ancestors[0] will be used overflowing into the +@@ -589,9 +598,6 @@ struct cgroup_root { + /* Wait while cgroups are being destroyed */ + wait_queue_head_t wait; + +- /* A list running through the active hierarchies */ +- struct list_head root_list; +- + /* Hierarchy-specific flags */ + unsigned int flags; + +@@ -605,6 +611,8 @@ struct cgroup_root { + KABI_RESERVE(2) + KABI_RESERVE(3) + KABI_RESERVE(4) ++ KABI_RESERVE(5) ++ KABI_RESERVE(6) + }; + + /* +diff --git a/include/linux/i2c.h b/include/linux/i2c.h +index 32cf5708d5a5..3fd6932bf8cd 100644 +--- a/include/linux/i2c.h ++++ b/include/linux/i2c.h +@@ -746,6 +746,9 @@ struct i2c_adapter { + + struct irq_domain *host_notify_domain; + struct regulator *bus_regulator; ++ ++ KABI_RESERVE(1) ++ KABI_RESERVE(2) + }; + #define to_i2c_adapter(d) container_of(d, struct i2c_adapter, dev) + +diff --git a/include/linux/iommu.h b/include/linux/iommu.h +index bb463cb96a44..83ec4bf9809e 100644 +--- a/include/linux/iommu.h ++++ b/include/linux/iommu.h +@@ -155,6 +155,10 @@ struct iopf_group { + KABI_USE(2, u32 cookie) + KABI_RESERVE(3) + KABI_RESERVE(4) ++ KABI_RESERVE(5) ++ KABI_RESERVE(6) ++ KABI_RESERVE(7) ++ KABI_RESERVE(8) + }; + + struct iopf_group_extend { +diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h +index b2a80e089a0a..abe236201e68 100644 +--- a/include/linux/memcontrol.h ++++ b/include/linux/memcontrol.h +@@ -429,6 +429,14 @@ struct mem_cgroup { + KABI_RESERVE(6) + KABI_RESERVE(7) + KABI_RESERVE(8) ++ KABI_RESERVE(9) ++ KABI_RESERVE(10) ++ KABI_RESERVE(11) ++ KABI_RESERVE(12) ++ KABI_RESERVE(13) ++ KABI_RESERVE(14) ++ KABI_RESERVE(15) ++ KABI_RESERVE(16) + struct mem_cgroup_per_node *nodeinfo[]; + }; + +diff --git a/include/linux/mm.h b/include/linux/mm.h +index 2e6ef9532fc3..b6dcdaafc592 100644 +--- a/include/linux/mm.h ++++ b/include/linux/mm.h +@@ -3819,24 +3819,22 @@ static inline bool page_is_guard(struct page *page) + return PageGuard(page); + } + +-bool __set_page_guard(struct zone *zone, struct page *page, unsigned int order, +- int migratetype); ++bool __set_page_guard(struct zone *zone, struct page *page, unsigned int order); + static inline bool set_page_guard(struct zone *zone, struct page *page, +- unsigned int order, int migratetype) ++ unsigned int order) + { + if (!debug_guardpage_enabled()) + return false; +- return __set_page_guard(zone, page, order, migratetype); ++ return __set_page_guard(zone, page, order); + } + +-void __clear_page_guard(struct zone *zone, struct page *page, unsigned int order, +- int migratetype); ++void __clear_page_guard(struct zone *zone, struct page *page, unsigned int order); + static inline void clear_page_guard(struct zone *zone, struct page *page, +- unsigned int order, int migratetype) ++ unsigned int order) + { + if (!debug_guardpage_enabled()) + return; +- __clear_page_guard(zone, page, order, migratetype); ++ __clear_page_guard(zone, page, order); + } + + #else /* CONFIG_DEBUG_PAGEALLOC */ +@@ -3846,9 +3844,9 @@ static inline unsigned int debug_guardpage_minorder(void) { return 0; } + static inline bool debug_guardpage_enabled(void) { return false; } + static inline bool page_is_guard(struct page *page) { return false; } + static inline bool set_page_guard(struct zone *zone, struct page *page, +- unsigned int order, int migratetype) { return false; } ++ unsigned int order) { return false; } + static inline void clear_page_guard(struct zone *zone, struct page *page, +- unsigned int order, int migratetype) {} ++ unsigned int order) {} + #endif /* 
CONFIG_DEBUG_PAGEALLOC */ + + #ifdef __HAVE_ARCH_GATE_AREA +diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h +index 3cee238de7c8..18bee72ebc71 100644 +--- a/include/linux/mmzone.h ++++ b/include/linux/mmzone.h +@@ -865,6 +865,7 @@ struct zone { + unsigned long watermark_boost; + + unsigned long nr_reserved_highatomic; ++ unsigned long nr_free_highatomic; + + /* + * We don't know if the memory that we're going to allocate will be +diff --git a/include/linux/msi.h b/include/linux/msi.h +index 7354ffb14856..5fd8a6caae98 100644 +--- a/include/linux/msi.h ++++ b/include/linux/msi.h +@@ -205,15 +205,12 @@ struct msi_desc { + union { + struct pci_msi_desc pci; + struct msi_desc_data data; +- KABI_RESERVE(1) +- KABI_RESERVE(2) +- KABI_RESERVE(3) +- KABI_RESERVE(4) ++ KABI_EXTEND_WITH_SIZE(KABI_RESERVE(1), 5) + }; ++ KABI_RESERVE(2) ++ KABI_RESERVE(3) ++ KABI_RESERVE(4) + KABI_RESERVE(5) +- KABI_RESERVE(6) +- KABI_RESERVE(7) +- KABI_RESERVE(8) + }; + + /* +diff --git a/include/linux/page-isolation.h b/include/linux/page-isolation.h +index 4ac34392823a..c16db0067090 100644 +--- a/include/linux/page-isolation.h ++++ b/include/linux/page-isolation.h +@@ -34,8 +34,9 @@ static inline bool is_migrate_isolate(int migratetype) + #define REPORT_FAILURE 0x2 + + void set_pageblock_migratetype(struct page *page, int migratetype); +-int move_freepages_block(struct zone *zone, struct page *page, +- int migratetype, int *num_movable); ++ ++bool move_freepages_block_isolate(struct zone *zone, struct page *page, ++ int migratetype); + + int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn, + int migratetype, int flags, gfp_t gfp_flags); +diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h +index 429627abfef4..e44e377661f2 100644 +--- a/include/linux/pagemap.h ++++ b/include/linux/pagemap.h +@@ -203,12 +203,21 @@ enum mapping_flags { + AS_EXITING = 4, /* final truncate in progress */ + /* writeback related tags are not used */ + AS_NO_WRITEBACK_TAGS = 5, +- AS_LARGE_FOLIO_SUPPORT = 6, +- AS_RELEASE_ALWAYS, /* Call ->release_folio(), even if no private data */ +- AS_STABLE_WRITES, /* must wait for writeback before modifying ++ AS_RELEASE_ALWAYS = 6, /* Call ->release_folio(), even if no private data */ ++ AS_STABLE_WRITES = 7, /* must wait for writeback before modifying + folio contents */ ++ AS_INACCESSIBLE = 8, /* Do not attempt direct R/W access to the mapping */ ++ /* Bits 16-25 are used for FOLIO_ORDER */ ++ AS_FOLIO_ORDER_BITS = 5, ++ AS_FOLIO_ORDER_MIN = 16, ++ AS_FOLIO_ORDER_MAX = AS_FOLIO_ORDER_MIN + AS_FOLIO_ORDER_BITS, + }; + ++#define AS_FOLIO_ORDER_BITS_MASK ((1u << AS_FOLIO_ORDER_BITS) - 1) ++#define AS_FOLIO_ORDER_MIN_MASK (AS_FOLIO_ORDER_BITS_MASK << AS_FOLIO_ORDER_MIN) ++#define AS_FOLIO_ORDER_MAX_MASK (AS_FOLIO_ORDER_BITS_MASK << AS_FOLIO_ORDER_MAX) ++#define AS_FOLIO_ORDER_MASK (AS_FOLIO_ORDER_MIN_MASK | AS_FOLIO_ORDER_MAX_MASK) ++ + /** + * mapping_set_error - record a writeback error in the address_space + * @mapping: the mapping in which an error should be set +@@ -348,9 +357,51 @@ static inline void mapping_set_gfp_mask(struct address_space *m, gfp_t mask) + #define MAX_XAS_ORDER (XA_CHUNK_SHIFT * 2 - 1) + #define MAX_PAGECACHE_ORDER min(MAX_XAS_ORDER, PREFERRED_MAX_PAGECACHE_ORDER) + ++/* ++ * mapping_set_folio_order_range() - Set the orders supported by a file. ++ * @mapping: The address space of the file. ++ * @min: Minimum folio order (between 0-MAX_PAGECACHE_ORDER inclusive). 
++ * @max: Maximum folio order (between @min-MAX_PAGECACHE_ORDER inclusive). ++ * ++ * The filesystem should call this function in its inode constructor to ++ * indicate which base size (min) and maximum size (max) of folio the VFS ++ * can use to cache the contents of the file. This should only be used ++ * if the filesystem needs special handling of folio sizes (ie there is ++ * something the core cannot know). ++ * Do not tune it based on, eg, i_size. ++ * ++ * Context: This should not be called while the inode is active as it ++ * is non-atomic. ++ */ ++static inline void mapping_set_folio_order_range(struct address_space *mapping, ++ unsigned int min, ++ unsigned int max) ++{ ++ if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) ++ return; ++ ++ if (min > MAX_PAGECACHE_ORDER) ++ min = MAX_PAGECACHE_ORDER; ++ ++ if (max > MAX_PAGECACHE_ORDER) ++ max = MAX_PAGECACHE_ORDER; ++ ++ if (max < min) ++ max = min; ++ ++ mapping->flags = (mapping->flags & ~AS_FOLIO_ORDER_MASK) | ++ (min << AS_FOLIO_ORDER_MIN) | (max << AS_FOLIO_ORDER_MAX); ++} ++ ++static inline void mapping_set_folio_min_order(struct address_space *mapping, ++ unsigned int min) ++{ ++ mapping_set_folio_order_range(mapping, min, MAX_PAGECACHE_ORDER); ++} ++ + /** + * mapping_set_large_folios() - Indicate the file supports large folios. +- * @mapping: The file. ++ * @mapping: The address space of the file. + * + * The filesystem should call this function in its inode constructor to + * indicate that the VFS can use large folios to cache the contents of +@@ -361,7 +412,23 @@ static inline void mapping_set_gfp_mask(struct address_space *m, gfp_t mask) + */ + static inline void mapping_set_large_folios(struct address_space *mapping) + { +- __set_bit(AS_LARGE_FOLIO_SUPPORT, &mapping->flags); ++ mapping_set_folio_order_range(mapping, 0, MAX_PAGECACHE_ORDER); ++} ++ ++static inline unsigned int ++mapping_max_folio_order(const struct address_space *mapping) ++{ ++ if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) ++ return 0; ++ return (mapping->flags & AS_FOLIO_ORDER_MAX_MASK) >> AS_FOLIO_ORDER_MAX; ++} ++ ++static inline unsigned int ++mapping_min_folio_order(const struct address_space *mapping) ++{ ++ if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) ++ return 0; ++ return (mapping->flags & AS_FOLIO_ORDER_MIN_MASK) >> AS_FOLIO_ORDER_MIN; + } + + /** +@@ -375,7 +442,7 @@ static inline void mapping_set_large_folios(struct address_space *mapping) + static inline void mapping_clear_large_folios(struct address_space *mapping) + { + WARN_ON_ONCE(!rwsem_is_locked(&mapping->invalidate_lock)); +- __clear_bit(AS_LARGE_FOLIO_SUPPORT, &mapping->flags); ++ mapping_set_folio_order_range(mapping, 0, 0); + } + + /* +@@ -384,20 +451,17 @@ static inline void mapping_clear_large_folios(struct address_space *mapping) + */ + static inline bool mapping_large_folio_support(struct address_space *mapping) + { +- /* AS_LARGE_FOLIO_SUPPORT is only reasonable for pagecache folios */ ++ /* AS_FOLIO_ORDER is only reasonable for pagecache folios */ + VM_WARN_ONCE((unsigned long)mapping & PAGE_MAPPING_ANON, + "Anonymous mapping always supports large folio"); + +- return IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && +- test_bit(AS_LARGE_FOLIO_SUPPORT, &mapping->flags); ++ return mapping_max_folio_order(mapping) > 0; + } + + /* Return the maximum folio size for this pagecache mapping, in bytes. 
*/ +-static inline size_t mapping_max_folio_size(struct address_space *mapping) ++static inline size_t mapping_max_folio_size(const struct address_space *mapping) + { +- if (mapping_large_folio_support(mapping)) +- return PAGE_SIZE << MAX_PAGECACHE_ORDER; +- return PAGE_SIZE; ++ return PAGE_SIZE << mapping_max_folio_order(mapping); + } + + static inline int filemap_nr_thps(struct address_space *mapping) +diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h +index 89f2a02db563..fe692e9bd0b2 100644 +--- a/include/linux/perf_event.h ++++ b/include/linux/perf_event.h +@@ -1010,6 +1010,14 @@ struct perf_cpu_pmu_context { + struct hrtimer hrtimer; + ktime_t hrtimer_interval; + unsigned int hrtimer_active; ++ KABI_RESERVE(1) ++ KABI_RESERVE(2) ++ KABI_RESERVE(3) ++ KABI_RESERVE(4) ++ KABI_RESERVE(5) ++ KABI_RESERVE(6) ++ KABI_RESERVE(7) ++ KABI_RESERVE(8) + }; + + /** +@@ -1031,6 +1039,14 @@ struct perf_cpu_context { + int heap_size; + struct perf_event **heap; + struct perf_event *heap_default[2]; ++ KABI_RESERVE(1) ++ KABI_RESERVE(2) ++ KABI_RESERVE(3) ++ KABI_RESERVE(4) ++ KABI_RESERVE(5) ++ KABI_RESERVE(6) ++ KABI_RESERVE(7) ++ KABI_RESERVE(8) + }; + + struct perf_output_handle { +diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h +index 8e4d9bbdaa40..09a2b2625202 100644 +--- a/include/linux/sched/topology.h ++++ b/include/linux/sched/topology.h +@@ -85,6 +85,8 @@ struct sched_domain_shared { + #ifdef CONFIG_SCHED_STEAL + struct sparsemask *cfs_overload_cpus; + #endif ++ KABI_RESERVE(1) ++ KABI_RESERVE(2) + }; + + struct sched_domain { +@@ -154,6 +156,9 @@ struct sched_domain { + }; + struct sched_domain_shared *shared; + ++ KABI_RESERVE(1) ++ KABI_RESERVE(2) ++ + unsigned int span_weight; + /* + * Span of all CPUs in this domain. +diff --git a/include/linux/seq_file.h b/include/linux/seq_file.h +index 234bcdb1fba4..cf4a2258df85 100644 +--- a/include/linux/seq_file.h ++++ b/include/linux/seq_file.h +@@ -27,6 +27,8 @@ struct seq_file { + int poll_event; + const struct file *file; + void *private; ++ ++ KABI_RESERVE(1) + }; + + struct seq_operations { +@@ -34,6 +36,8 @@ struct seq_operations { + void (*stop) (struct seq_file *m, void *v); + void * (*next) (struct seq_file *m, void *v, loff_t *pos); + int (*show) (struct seq_file *m, void *v); ++ ++ KABI_RESERVE(1) + }; + + #define SEQ_SKIP 1 +diff --git a/include/linux/stat.h b/include/linux/stat.h +index 52150570d37a..d342e89b7aaa 100644 +--- a/include/linux/stat.h ++++ b/include/linux/stat.h +@@ -53,6 +53,11 @@ struct kstat { + u32 dio_mem_align; + u32 dio_offset_align; + u64 change_cookie; ++ ++ KABI_RESERVE(1) ++ KABI_RESERVE(2) ++ KABI_RESERVE(3) ++ KABI_RESERVE(4) + }; + + /* These definitions are internal to the kernel for now. Mainly used by nfsd. */ +diff --git a/include/linux/swap.h b/include/linux/swap.h +index bea0c0f1f640..33396153afc0 100644 +--- a/include/linux/swap.h ++++ b/include/linux/swap.h +@@ -255,22 +255,24 @@ enum { + * free clusters are organized into a list. We fetch an entry from the list to + * get a free cluster. + * +- * The data field stores next cluster if the cluster is free or cluster usage +- * counter otherwise. The flags field determines if a cluster is free. This is +- * protected by swap_info_struct.lock. ++ * The flags field determines if a cluster is free. This is ++ * protected by cluster lock. 
+ */ + struct swap_cluster_info { + spinlock_t lock; /* + * Protect swap_cluster_info fields +- * and swap_info_struct->swap_map +- * elements correspond to the swap +- * cluster ++ * other than list, and swap_info_struct->swap_map ++ * elements corresponding to the swap cluster. + */ +- unsigned int data:24; +- unsigned int flags:8; ++ u16 count; ++ u8 flags; ++ u8 order; ++ struct list_head list; + }; + #define CLUSTER_FLAG_FREE 1 /* This cluster is free */ +-#define CLUSTER_FLAG_NEXT_NULL 2 /* This cluster has no next cluster */ ++#define CLUSTER_FLAG_NONFULL 2 /* This cluster is on nonfull list */ ++#define CLUSTER_FLAG_FRAG 4 /* This cluster is on nonfull list */ ++#define CLUSTER_FLAG_FULL 8 /* This cluster is on full list */ + + /* + * The first page in the swap file is the swap header, which is always marked +@@ -295,11 +297,6 @@ struct percpu_cluster { + unsigned int next[SWAP_NR_ORDERS]; /* Likely next allocation offset */ + }; + +-struct swap_cluster_list { +- struct swap_cluster_info head; +- struct swap_cluster_info tail; +-}; +- + /* + * The in-memory structure used to track swap areas. + */ +@@ -312,7 +309,13 @@ struct swap_info_struct { + unsigned int max; /* extent of the swap_map */ + unsigned char *swap_map; /* vmalloc'ed array of usage counts */ + struct swap_cluster_info *cluster_info; /* cluster info. Only for SSD */ +- struct swap_cluster_list free_clusters; /* free clusters list */ ++ struct list_head free_clusters; /* free clusters list */ ++ struct list_head full_clusters; /* full clusters list */ ++ struct list_head nonfull_clusters[SWAP_NR_ORDERS]; ++ /* list of cluster that contains at least one free slot */ ++ struct list_head frag_clusters[SWAP_NR_ORDERS]; ++ /* list of cluster that are fragmented or contented */ ++ unsigned int frag_cluster_nr[SWAP_NR_ORDERS]; + unsigned int lowest_bit; /* index of first free in swap_map */ + unsigned int highest_bit; /* index of last free in swap_map */ + unsigned int pages; /* total of usable pages of swap */ +@@ -345,7 +348,8 @@ struct swap_info_struct { + * list. 
+ */ + struct work_struct discard_work; /* discard worker */ +- struct swap_cluster_list discard_clusters; /* discard clusters list */ ++ struct work_struct reclaim_work; /* reclaim worker */ ++ struct list_head discard_clusters; /* discard clusters list */ + KABI_RESERVE(1) + KABI_RESERVE(2) + KABI_RESERVE(3) +diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h +index f46e0ca0169c..86d0868b584a 100644 +--- a/include/linux/uprobes.h ++++ b/include/linux/uprobes.h +@@ -47,6 +47,7 @@ struct uprobe_consumer { + + #ifdef CONFIG_UPROBES + #include ++#include + + enum uprobe_task_state { + UTASK_RUNNING, +@@ -78,6 +79,14 @@ struct uprobe_task { + + struct return_instance *return_instances; + unsigned int depth; ++ KABI_RESERVE(1) ++ KABI_RESERVE(2) ++ KABI_RESERVE(3) ++ KABI_RESERVE(4) ++ KABI_RESERVE(5) ++ KABI_RESERVE(6) ++ KABI_RESERVE(7) ++ KABI_RESERVE(8) + }; + + struct return_instance { +diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h +index 343906a98d6e..735eae6e272c 100644 +--- a/include/linux/vmstat.h ++++ b/include/linux/vmstat.h +@@ -487,14 +487,6 @@ static inline void node_stat_sub_folio(struct folio *folio, + mod_node_page_state(folio_pgdat(folio), item, -folio_nr_pages(folio)); + } + +-static inline void __mod_zone_freepage_state(struct zone *zone, int nr_pages, +- int migratetype) +-{ +- __mod_zone_page_state(zone, NR_FREE_PAGES, nr_pages); +- if (is_migrate_cma(migratetype)) +- __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, nr_pages); +-} +- + extern const char * const vmstat_text[]; + + static inline const char *zone_stat_name(enum zone_stat_item item) +diff --git a/include/linux/zswap.h b/include/linux/zswap.h +index 2a60ce39cfde..a13d2d2d9131 100644 +--- a/include/linux/zswap.h ++++ b/include/linux/zswap.h +@@ -12,7 +12,7 @@ extern atomic_t zswap_stored_pages; + + bool zswap_store(struct folio *folio); + bool zswap_load(struct folio *folio); +-void zswap_invalidate(int type, pgoff_t offset); ++void zswap_invalidate(swp_entry_t swp); + void zswap_swapon(int type); + void zswap_swapoff(int type); + +@@ -28,7 +28,7 @@ static inline bool zswap_load(struct folio *folio) + return false; + } + +-static inline void zswap_invalidate(int type, pgoff_t offset) {} ++static inline void zswap_invalidate(swp_entry_t swp) {} + static inline void zswap_swapon(int type) {} + static inline void zswap_swapoff(int type) {} + +diff --git a/include/net/flow.h b/include/net/flow.h +index 0cc5f2ef1000..72d2ea2374ba 100644 +--- a/include/net/flow.h ++++ b/include/net/flow.h +@@ -46,6 +46,8 @@ struct flowi_common { + + KABI_RESERVE(1) + KABI_RESERVE(2) ++ KABI_RESERVE(3) ++ KABI_RESERVE(4) + }; + + union flowi_uli { +diff --git a/include/net/netns/netfilter.h b/include/net/netns/netfilter.h +index 4b77a9b031b6..963588269637 100644 +--- a/include/net/netns/netfilter.h ++++ b/include/net/netns/netfilter.h +@@ -34,5 +34,7 @@ struct netns_nf { + + KABI_RESERVE(1) + KABI_RESERVE(2) ++ KABI_RESERVE(3) ++ KABI_RESERVE(4) + }; + #endif +diff --git a/include/net/netns/xfrm.h b/include/net/netns/xfrm.h +index a0c1359cc7eb..af7f20ef4823 100644 +--- a/include/net/netns/xfrm.h ++++ b/include/net/netns/xfrm.h +@@ -87,6 +87,8 @@ struct netns_xfrm { + + KABI_RESERVE(1) + KABI_RESERVE(2) ++ KABI_RESERVE(3) ++ KABI_RESERVE(4) + }; + + #endif +diff --git a/include/net/xdp.h b/include/net/xdp.h +index c283668458ca..ebebadc56cd9 100644 +--- a/include/net/xdp.h ++++ b/include/net/xdp.h +@@ -54,6 +54,9 @@ enum xdp_mem_type { + struct xdp_mem_info { + u32 type; /* enum xdp_mem_type, but known size type 
*/ + u32 id; ++ ++ KABI_RESERVE(1); ++ KABI_RESERVE(2); + }; + + struct page_pool; +@@ -74,6 +77,9 @@ struct xdp_rxq_info { + + struct xdp_txq_info { + struct net_device *dev; ++ ++ KABI_RESERVE(1); ++ KABI_RESERVE(2); + }; + + enum xdp_buff_flags { +@@ -92,6 +98,11 @@ struct xdp_buff { + struct xdp_txq_info *txq; + u32 frame_sz; /* frame size to deduce data_hard_end/reserved tailroom*/ + u32 flags; /* supported values defined in xdp_buff_flags */ ++ ++ KABI_RESERVE(1); ++ KABI_RESERVE(2); ++ KABI_RESERVE(3); ++ KABI_RESERVE(4); + }; + + static __always_inline bool xdp_buff_has_frags(struct xdp_buff *xdp) +@@ -181,6 +192,11 @@ struct xdp_frame { + struct net_device *dev_rx; /* used by cpumap */ + u32 frame_sz; + u32 flags; /* supported values defined in xdp_buff_flags */ ++ ++ KABI_RESERVE(1); ++ KABI_RESERVE(2); ++ KABI_RESERVE(3); ++ KABI_RESERVE(4); + }; + + static __always_inline bool xdp_frame_has_frags(struct xdp_frame *frame) +@@ -198,6 +214,9 @@ struct xdp_frame_bulk { + int count; + void *xa; + void *q[XDP_BULK_QUEUE_SIZE]; ++ ++ KABI_RESERVE(1); ++ KABI_RESERVE(2); + }; + + static __always_inline void xdp_frame_bulk_init(struct xdp_frame_bulk *bq) +diff --git a/include/net/xfrm.h b/include/net/xfrm.h +index c875faf98492..b9dec5f9c973 100644 +--- a/include/net/xfrm.h ++++ b/include/net/xfrm.h +@@ -294,6 +294,8 @@ struct xfrm_state { + + KABI_RESERVE(1) + KABI_RESERVE(2) ++ KABI_RESERVE(3) ++ KABI_RESERVE(4) + }; + + static inline struct net *xs_net(struct xfrm_state *x) +@@ -562,6 +564,8 @@ struct xfrm_policy { + + KABI_RESERVE(1) + KABI_RESERVE(2) ++ KABI_RESERVE(3) ++ KABI_RESERVE(4) + }; + + static inline struct net *xp_net(const struct xfrm_policy *xp) +diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h +index 482647774bf5..a660cb68c853 100644 +--- a/include/uapi/linux/bpf.h ++++ b/include/uapi/linux/bpf.h +@@ -6573,6 +6573,15 @@ struct bpf_link_info { + __u64 config; + __u32 type; + } event; /* BPF_PERF_EVENT_EVENT */ ++ struct { ++ __u64:64; ++ __u32:32; ++ __u32:32; ++ __u64:64; ++ __u64:64; ++ __u64:64; ++ __u64:64; ++ } kabi_reserve; + }; + } perf_event; + struct { +diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h +index 96a9bd2c26f0..f5fb12890645 100644 +--- a/kernel/cgroup/cgroup-internal.h ++++ b/kernel/cgroup/cgroup-internal.h +@@ -170,7 +170,8 @@ extern struct list_head cgroup_roots; + + /* iterate across the hierarchies */ + #define for_each_root(root) \ +- list_for_each_entry((root), &cgroup_roots, root_list) ++ list_for_each_entry_rcu((root), &cgroup_roots, root_list, \ ++ lockdep_is_held(&cgroup_mutex)) + + /** + * for_each_subsys - iterate all enabled cgroup subsystems +diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c +index 52fe6ba2fefd..c26a9b3a3576 100644 +--- a/kernel/cgroup/cgroup.c ++++ b/kernel/cgroup/cgroup.c +@@ -1315,7 +1315,7 @@ static void cgroup_exit_root_id(struct cgroup_root *root) + + void cgroup_free_root(struct cgroup_root *root) + { +- kfree(root); ++ kfree_rcu(root, rcu); + } + + static void cgroup_destroy_root(struct cgroup_root *root) +@@ -1348,7 +1348,7 @@ static void cgroup_destroy_root(struct cgroup_root *root) + spin_unlock_irq(&css_set_lock); + + if (!list_empty(&root->root_list)) { +- list_del(&root->root_list); ++ list_del_rcu(&root->root_list); + cgroup_root_count--; + } + +@@ -1388,7 +1388,15 @@ static inline struct cgroup *__cset_cgroup_from_root(struct css_set *cset, + } + } + +- BUG_ON(!res_cgroup); ++ /* ++ * If cgroup_mutex is not held, the cgrp_cset_link will be freed ++ 
* before we remove the cgroup root from the root_list. Consequently, ++ * when accessing a cgroup root, the cset_link may have already been ++ * freed, resulting in a NULL res_cgroup. However, by holding the ++ * cgroup_mutex, we ensure that res_cgroup can't be NULL. ++ * If we don't hold cgroup_mutex in the caller, we must do the NULL ++ * check. ++ */ + return res_cgroup; + } + +@@ -1447,7 +1455,6 @@ static struct cgroup *current_cgns_cgroup_dfl(void) + static struct cgroup *cset_cgroup_from_root(struct css_set *cset, + struct cgroup_root *root) + { +- lockdep_assert_held(&cgroup_mutex); + lockdep_assert_held(&css_set_lock); + + return __cset_cgroup_from_root(cset, root); +@@ -1455,7 +1462,9 @@ static struct cgroup *cset_cgroup_from_root(struct css_set *cset, + + /* + * Return the cgroup for "task" from the given hierarchy. Must be +- * called with cgroup_mutex and css_set_lock held. ++ * called with css_set_lock held to prevent task's groups from being modified. ++ * Must be called with either cgroup_mutex or rcu read lock to prevent the ++ * cgroup root from being destroyed. + */ + struct cgroup *task_cgroup_from_root(struct task_struct *task, + struct cgroup_root *root) +@@ -2030,7 +2039,7 @@ void init_cgroup_root(struct cgroup_fs_context *ctx) + struct cgroup_root *root = ctx->root; + struct cgroup *cgrp = &root->cgrp; + +- INIT_LIST_HEAD(&root->root_list); ++ INIT_LIST_HEAD_RCU(&root->root_list); + atomic_set(&root->nr_cgrps, 1); + cgrp->root = root; + init_cgroup_housekeeping(cgrp); +@@ -2114,7 +2123,7 @@ int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask) + * care of subsystems' refcounts, which are explicitly dropped in + * the failure exit path. + */ +- list_add(&root->root_list, &cgroup_roots); ++ list_add_rcu(&root->root_list, &cgroup_roots); + cgroup_root_count++; + + /* +diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c +index 2c9e50f09fc1..7ea0a6d00519 100644 +--- a/kernel/cgroup/cpuset.c ++++ b/kernel/cgroup/cpuset.c +@@ -21,6 +21,7 @@ + * License. See the file COPYING in the main directory of the Linux + * distribution for more details. + */ ++#include "cgroup-internal.h" + + #include + #include +@@ -210,11 +211,6 @@ struct cpuset { + + /* Remote partition silbling list anchored at remote_children */ + struct list_head remote_sibling; +- +- KABI_RESERVE(1) +- KABI_RESERVE(2) +- KABI_RESERVE(3) +- KABI_RESERVE(4) + }; + + /* +@@ -5185,40 +5181,20 @@ int proc_cpuset_show(struct seq_file *m, struct pid_namespace *ns, + char *buf; + struct cgroup_subsys_state *css; + int retval; +- struct cgroup *root_cgroup = NULL; + + retval = -ENOMEM; + buf = kmalloc(PATH_MAX, GFP_KERNEL); + if (!buf) + goto out; + +- css = task_get_css(tsk, cpuset_cgrp_id); + rcu_read_lock(); +- /* +- * When the cpuset subsystem is mounted on the legacy hierarchy, +- * the top_cpuset.css->cgroup does not hold a reference count of +- * cgroup_root.cgroup. This makes accessing css->cgroup very +- * dangerous because when the cpuset subsystem is remounted to the +- * default hierarchy, the cgroup_root.cgroup that css->cgroup points +- * to will be released, leading to a UAF issue. To avoid this problem, +- * get the reference count of top_cpuset.css->cgroup first. +- * +- * This is ugly!! 
+- */ +- if (css == &top_cpuset.css) { +- root_cgroup = css->cgroup; +- if (!css_tryget_online(&root_cgroup->self)) { +- rcu_read_unlock(); +- retval = -EBUSY; +- goto out_free; +- } +- } ++ spin_lock_irq(&css_set_lock); ++ css = task_css(tsk, cpuset_cgrp_id); ++ retval = cgroup_path_ns_locked(css->cgroup, buf, PATH_MAX, ++ current->nsproxy->cgroup_ns); ++ spin_unlock_irq(&css_set_lock); + rcu_read_unlock(); +- retval = cgroup_path_ns(css->cgroup, buf, PATH_MAX, +- current->nsproxy->cgroup_ns); +- css_put(css); +- if (root_cgroup) +- css_put(&root_cgroup->self); ++ + if (retval >= PATH_MAX) + retval = -ENAMETOOLONG; + if (retval < 0) +diff --git a/kernel/events/internal.h b/kernel/events/internal.h +index d2e6e6144c54..d1ffa00b91b6 100644 +--- a/kernel/events/internal.h ++++ b/kernel/events/internal.h +@@ -5,6 +5,7 @@ + #include + #include + #include ++#include + + /* Buffer handling */ + +@@ -54,6 +55,15 @@ struct perf_buffer { + void **aux_pages; + void *aux_priv; + ++ KABI_RESERVE(1) ++ KABI_RESERVE(2) ++ KABI_RESERVE(3) ++ KABI_RESERVE(4) ++ KABI_RESERVE(5) ++ KABI_RESERVE(6) ++ KABI_RESERVE(7) ++ KABI_RESERVE(8) ++ + struct perf_event_mmap_page *user_page; + void *data_pages[]; + }; +diff --git a/mm/debug_page_alloc.c b/mm/debug_page_alloc.c +index f9d145730fd1..03a810927d0a 100644 +--- a/mm/debug_page_alloc.c ++++ b/mm/debug_page_alloc.c +@@ -32,8 +32,7 @@ static int __init debug_guardpage_minorder_setup(char *buf) + } + early_param("debug_guardpage_minorder", debug_guardpage_minorder_setup); + +-bool __set_page_guard(struct zone *zone, struct page *page, unsigned int order, +- int migratetype) ++bool __set_page_guard(struct zone *zone, struct page *page, unsigned int order) + { + if (order >= debug_guardpage_minorder()) + return false; +@@ -41,19 +40,12 @@ bool __set_page_guard(struct zone *zone, struct page *page, unsigned int order, + __SetPageGuard(page); + INIT_LIST_HEAD(&page->buddy_list); + set_page_private(page, order); +- /* Guard pages are not available for any usage */ +- if (!is_migrate_isolate(migratetype)) +- __mod_zone_freepage_state(zone, -(1 << order), migratetype); + + return true; + } + +-void __clear_page_guard(struct zone *zone, struct page *page, unsigned int order, +- int migratetype) ++void __clear_page_guard(struct zone *zone, struct page *page, unsigned int order) + { + __ClearPageGuard(page); +- + set_page_private(page, 0); +- if (!is_migrate_isolate(migratetype)) +- __mod_zone_freepage_state(zone, (1 << order), migratetype); + } +diff --git a/mm/internal.h b/mm/internal.h +index 0478e5dab55b..8742aafde387 100644 +--- a/mm/internal.h ++++ b/mm/internal.h +@@ -693,10 +693,6 @@ extern void *memmap_alloc(phys_addr_t size, phys_addr_t align, + void memmap_init_range(unsigned long, int, unsigned long, unsigned long, + unsigned long, enum meminit_context, struct vmem_altmap *, int); + +- +-int split_free_page(struct page *free_page, +- unsigned int order, unsigned long split_pfn_offset); +- + #if defined CONFIG_COMPACTION || defined CONFIG_CMA + + #define MAX_PAGE_ORDER MAX_ORDER +@@ -1175,11 +1171,6 @@ static inline bool is_migrate_highatomic(enum migratetype migratetype) + return migratetype == MIGRATE_HIGHATOMIC; + } + +-static inline bool is_migrate_highatomic_page(struct page *page) +-{ +- return get_pageblock_migratetype(page) == MIGRATE_HIGHATOMIC; +-} +- + void setup_zone_pageset(struct zone *zone); + + struct migration_target_control { +diff --git a/mm/page_alloc.c b/mm/page_alloc.c +index 36cd38df0614..7734245d7870 100644 +--- a/mm/page_alloc.c ++++ 
b/mm/page_alloc.c +@@ -207,24 +207,6 @@ EXPORT_SYMBOL(node_states); + + gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK; + +-/* +- * A cached value of the page's pageblock's migratetype, used when the page is +- * put on a pcplist. Used to avoid the pageblock migratetype lookup when +- * freeing from pcplists in most cases, at the cost of possibly becoming stale. +- * Also the migratetype set in the page does not necessarily match the pcplist +- * index, e.g. page might have MIGRATE_CMA set but be on a pcplist with any +- * other index - this ensures that it will be put on the correct CMA freelist. +- */ +-static inline int get_pcppage_migratetype(struct page *page) +-{ +- return page->index; +-} +- +-static inline void set_pcppage_migratetype(struct page *page, int migratetype) +-{ +- page->index = migratetype; +-} +- + #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE + unsigned int pageblock_order __read_mostly; + #endif +@@ -654,23 +636,38 @@ compaction_capture(struct capture_control *capc, struct page *page, + } + #endif /* CONFIG_COMPACTION */ + +-/* Used for pages not on another list */ +-static inline void add_to_free_list(struct page *page, struct zone *zone, +- unsigned int order, int migratetype) ++static inline void account_freepages(struct zone *zone, int nr_pages, ++ int migratetype) + { +- struct free_area *area = &zone->free_area[order]; ++ lockdep_assert_held(&zone->lock); + +- list_add(&page->buddy_list, &area->free_list[migratetype]); +- area->nr_free++; ++ if (is_migrate_isolate(migratetype)) ++ return; ++ ++ __mod_zone_page_state(zone, NR_FREE_PAGES, nr_pages); ++ ++ if (is_migrate_cma(migratetype)) ++ __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, nr_pages); ++ else if (is_migrate_highatomic(migratetype)) ++ WRITE_ONCE(zone->nr_free_highatomic, ++ zone->nr_free_highatomic + nr_pages); + } + + /* Used for pages not on another list */ +-static inline void add_to_free_list_tail(struct page *page, struct zone *zone, +- unsigned int order, int migratetype) ++static inline void __add_to_free_list(struct page *page, struct zone *zone, ++ unsigned int order, int migratetype, ++ bool tail) + { + struct free_area *area = &zone->free_area[order]; + +- list_add_tail(&page->buddy_list, &area->free_list[migratetype]); ++ VM_WARN_ONCE(get_pageblock_migratetype(page) != migratetype, ++ "page type is %lu, passed migratetype is %d (nr=%d)\n", ++ get_pageblock_migratetype(page), migratetype, 1 << order); ++ ++ if (tail) ++ list_add_tail(&page->buddy_list, &area->free_list[migratetype]); ++ else ++ list_add(&page->buddy_list, &area->free_list[migratetype]); + area->nr_free++; + } + +@@ -680,16 +677,28 @@ static inline void add_to_free_list_tail(struct page *page, struct zone *zone, + * allocation again (e.g., optimization for memory onlining). 
+ */ + static inline void move_to_free_list(struct page *page, struct zone *zone, +- unsigned int order, int migratetype) ++ unsigned int order, int old_mt, int new_mt) + { + struct free_area *area = &zone->free_area[order]; + +- list_move_tail(&page->buddy_list, &area->free_list[migratetype]); ++ /* Free page moving can fail, so it happens before the type update */ ++ VM_WARN_ONCE(get_pageblock_migratetype(page) != old_mt, ++ "page type is %lu, passed migratetype is %d (nr=%d)\n", ++ get_pageblock_migratetype(page), old_mt, 1 << order); ++ ++ list_move_tail(&page->buddy_list, &area->free_list[new_mt]); ++ ++ account_freepages(zone, -(1 << order), old_mt); ++ account_freepages(zone, 1 << order, new_mt); + } + +-static inline void del_page_from_free_list(struct page *page, struct zone *zone, +- unsigned int order) ++static inline void __del_page_from_free_list(struct page *page, struct zone *zone, ++ unsigned int order, int migratetype) + { ++ VM_WARN_ONCE(get_pageblock_migratetype(page) != migratetype, ++ "page type is %lu, passed migratetype is %d (nr=%d)\n", ++ get_pageblock_migratetype(page), migratetype, 1 << order); ++ + /* clear reported state and update reported page count */ + if (page_reported(page)) + __ClearPageReported(page); +@@ -700,6 +709,13 @@ static inline void del_page_from_free_list(struct page *page, struct zone *zone, + zone->free_area[order].nr_free--; + } + ++static inline void del_page_from_free_list(struct page *page, struct zone *zone, ++ unsigned int order, int migratetype) ++{ ++ __del_page_from_free_list(page, zone, order, migratetype); ++ account_freepages(zone, -(1 << order), migratetype); ++} ++ + static inline struct page *get_page_from_free_area(struct free_area *area, + int migratetype) + { +@@ -771,16 +787,16 @@ static inline void __free_one_page(struct page *page, + VM_BUG_ON_PAGE(page->flags & PAGE_FLAGS_CHECK_AT_PREP, page); + + VM_BUG_ON(migratetype == -1); +- if (likely(!is_migrate_isolate(migratetype))) +- __mod_zone_freepage_state(zone, 1 << order, migratetype); +- + VM_BUG_ON_PAGE(pfn & ((1 << order) - 1), page); + VM_BUG_ON_PAGE(bad_range(zone, page), page); + ++ account_freepages(zone, 1 << order, migratetype); ++ + while (order < MAX_ORDER) { ++ int buddy_mt = migratetype; ++ + if (compaction_capture(capc, page, order, migratetype)) { +- __mod_zone_freepage_state(zone, -(1 << order), +- migratetype); ++ account_freepages(zone, -(1 << order), migratetype); + return; + } + +@@ -795,11 +811,11 @@ static inline void __free_one_page(struct page *page, + * pageblock isolation could cause incorrect freepage or CMA + * accounting or HIGHATOMIC accounting. + */ +- int buddy_mt = get_pfnblock_migratetype(buddy, buddy_pfn); ++ buddy_mt = get_pfnblock_migratetype(buddy, buddy_pfn); + +- if (migratetype != buddy_mt +- && (!migratetype_is_mergeable(migratetype) || +- !migratetype_is_mergeable(buddy_mt))) ++ if (migratetype != buddy_mt && ++ (!migratetype_is_mergeable(migratetype) || ++ !migratetype_is_mergeable(buddy_mt))) + goto done_merging; + } + +@@ -808,9 +824,19 @@ static inline void __free_one_page(struct page *page, + * merge with it and move up one order. + */ + if (page_is_guard(buddy)) +- clear_page_guard(zone, buddy, order, migratetype); ++ clear_page_guard(zone, buddy, order); + else +- del_page_from_free_list(buddy, zone, order); ++ __del_page_from_free_list(buddy, zone, order, buddy_mt); ++ ++ if (unlikely(buddy_mt != migratetype)) { ++ /* ++ * Match buddy type. 
This ensures that an ++ * expand() down the line puts the sub-blocks ++ * on the right freelists. ++ */ ++ set_pageblock_migratetype(buddy, migratetype); ++ } ++ + combined_pfn = buddy_pfn & pfn; + page = page + (combined_pfn - pfn); + pfn = combined_pfn; +@@ -827,74 +853,13 @@ static inline void __free_one_page(struct page *page, + else + to_tail = buddy_merge_likely(pfn, buddy_pfn, page, order); + +- if (to_tail) +- add_to_free_list_tail(page, zone, order, migratetype); +- else +- add_to_free_list(page, zone, order, migratetype); ++ __add_to_free_list(page, zone, order, migratetype, to_tail); + + /* Notify page reporting subsystem of freed page */ + if (!(fpi_flags & FPI_SKIP_REPORT_NOTIFY)) + page_reporting_notify_free(order); + } + +-/** +- * split_free_page() -- split a free page at split_pfn_offset +- * @free_page: the original free page +- * @order: the order of the page +- * @split_pfn_offset: split offset within the page +- * +- * Return -ENOENT if the free page is changed, otherwise 0 +- * +- * It is used when the free page crosses two pageblocks with different migratetypes +- * at split_pfn_offset within the page. The split free page will be put into +- * separate migratetype lists afterwards. Otherwise, the function achieves +- * nothing. +- */ +-int split_free_page(struct page *free_page, +- unsigned int order, unsigned long split_pfn_offset) +-{ +- struct zone *zone = page_zone(free_page); +- unsigned long free_page_pfn = page_to_pfn(free_page); +- unsigned long pfn; +- unsigned long flags; +- int free_page_order; +- int mt; +- int ret = 0; +- +- if (split_pfn_offset == 0) +- return ret; +- +- spin_lock_irqsave(&zone->lock, flags); +- +- if (!PageBuddy(free_page) || buddy_order(free_page) != order) { +- ret = -ENOENT; +- goto out; +- } +- +- mt = get_pfnblock_migratetype(free_page, free_page_pfn); +- if (likely(!is_migrate_isolate(mt))) +- __mod_zone_freepage_state(zone, -(1UL << order), mt); +- +- del_page_from_free_list(free_page, zone, order); +- for (pfn = free_page_pfn; +- pfn < free_page_pfn + (1UL << order);) { +- int mt = get_pfnblock_migratetype(pfn_to_page(pfn), pfn); +- +- free_page_order = min_t(unsigned int, +- pfn ? __ffs(pfn) : order, +- __fls(split_pfn_offset)); +- __free_one_page(pfn_to_page(pfn), pfn, zone, free_page_order, +- mt, FPI_NONE); +- pfn += 1UL << free_page_order; +- split_pfn_offset -= (1UL << free_page_order); +- /* we have done the first part, now switch to second part */ +- if (split_pfn_offset == 0) +- split_pfn_offset = (1UL << order) - (pfn - free_page_pfn); +- } +-out: +- spin_unlock_irqrestore(&zone->lock, flags); +- return ret; +-} + /* + * A bad page could be due to a number of fields. Instead of multiple branches, + * try and check multiple fields with one check. 
The caller must do a detailed +@@ -1186,7 +1151,6 @@ static void free_pcppages_bulk(struct zone *zone, int count, + { + unsigned long flags; + unsigned int order; +- bool isolated_pageblocks; + struct page *page; + + /* +@@ -1199,7 +1163,6 @@ static void free_pcppages_bulk(struct zone *zone, int count, + pindex = pindex - 1; + + spin_lock_irqsave(&zone->lock, flags); +- isolated_pageblocks = has_isolate_pageblock(zone); + + while (count > 0) { + struct list_head *list; +@@ -1215,23 +1178,19 @@ static void free_pcppages_bulk(struct zone *zone, int count, + order = pindex_to_order(pindex); + nr_pages = 1 << order; + do { ++ unsigned long pfn; + int mt; + + page = list_last_entry(list, struct page, pcp_list); +- mt = get_pcppage_migratetype(page); ++ pfn = page_to_pfn(page); ++ mt = get_pfnblock_migratetype(page, pfn); + + /* must delete to avoid corrupting pcp list */ + list_del(&page->pcp_list); + count -= nr_pages; + pcp->count -= nr_pages; + +- /* MIGRATE_ISOLATE page should not go to pcplists */ +- VM_BUG_ON_PAGE(is_migrate_isolate(mt), page); +- /* Pageblock could have been isolated meanwhile */ +- if (unlikely(isolated_pageblocks)) +- mt = get_pageblock_migratetype(page); +- +- __free_one_page(page, page_to_pfn(page), zone, order, mt, FPI_NONE); ++ __free_one_page(page, pfn, zone, order, mt, FPI_NONE); + trace_mm_page_pcpu_drain(page, order, mt); + } while (count > 0 && !list_empty(list)); + } +@@ -1239,18 +1198,15 @@ static void free_pcppages_bulk(struct zone *zone, int count, + spin_unlock_irqrestore(&zone->lock, flags); + } + +-static void free_one_page(struct zone *zone, +- struct page *page, unsigned long pfn, +- unsigned int order, +- int migratetype, fpi_t fpi_flags) ++static void free_one_page(struct zone *zone, struct page *page, ++ unsigned long pfn, unsigned int order, ++ fpi_t fpi_flags) + { + unsigned long flags; ++ int migratetype; + + spin_lock_irqsave(&zone->lock, flags); +- if (unlikely(has_isolate_pageblock(zone) || +- is_migrate_isolate(migratetype))) { +- migratetype = get_pfnblock_migratetype(page, pfn); +- } ++ migratetype = get_pfnblock_migratetype(page, pfn); + __free_one_page(page, pfn, zone, order, migratetype, fpi_flags); + spin_unlock_irqrestore(&zone->lock, flags); + } +@@ -1258,28 +1214,13 @@ static void free_one_page(struct zone *zone, + static void __free_pages_ok(struct page *page, unsigned int order, + fpi_t fpi_flags) + { +- unsigned long flags; +- int migratetype; + unsigned long pfn = page_to_pfn(page); + struct zone *zone = page_zone(page); + + if (!free_pages_prepare(page, order)) + return; + +- /* +- * Calling get_pfnblock_migratetype() without spin_lock_irqsave() here +- * is used to avoid calling get_pfnblock_migratetype() under the lock. +- * This will reduce the lock holding time. 
+- */ +- migratetype = get_pfnblock_migratetype(page, pfn); +- +- spin_lock_irqsave(&zone->lock, flags); +- if (unlikely(has_isolate_pageblock(zone) || +- is_migrate_isolate(migratetype))) { +- migratetype = get_pfnblock_migratetype(page, pfn); +- } +- __free_one_page(page, pfn, zone, order, migratetype, fpi_flags); +- spin_unlock_irqrestore(&zone->lock, flags); ++ free_one_page(zone, page, pfn, order, fpi_flags); + + __count_vm_events(PGFREE, 1 << order); + } +@@ -1386,10 +1327,11 @@ struct page *__pageblock_pfn_to_page(unsigned long start_pfn, + * + * -- nyc + */ +-static inline void expand(struct zone *zone, struct page *page, +- int low, int high, int migratetype) ++static inline unsigned int expand(struct zone *zone, struct page *page, int low, ++ int high, int migratetype) + { +- unsigned long size = 1 << high; ++ unsigned int size = 1 << high; ++ unsigned int nr_added = 0; + + while (high > low) { + high--; +@@ -1402,12 +1344,26 @@ static inline void expand(struct zone *zone, struct page *page, + * Corresponding page table entries will not be touched, + * pages will stay not present in virtual address space + */ +- if (set_page_guard(zone, &page[size], high, migratetype)) ++ if (set_page_guard(zone, &page[size], high)) + continue; + +- add_to_free_list(&page[size], zone, high, migratetype); ++ __add_to_free_list(&page[size], zone, high, migratetype, false); + set_buddy_order(&page[size], high); ++ nr_added += size; + } ++ ++ return nr_added; ++} ++ ++static __always_inline void page_del_and_expand(struct zone *zone, ++ struct page *page, int low, ++ int high, int migratetype) ++{ ++ int nr_pages = 1 << high; ++ ++ __del_page_from_free_list(page, zone, high, migratetype); ++ nr_pages -= expand(zone, page, low, high, migratetype); ++ account_freepages(zone, -nr_pages, migratetype); + } + + static void check_new_page_bad(struct page *page) +@@ -1596,9 +1552,9 @@ struct page *__rmqueue_smallest(struct zone *zone, unsigned int order, + page = get_page_from_free_area(area, migratetype); + if (!page) + continue; +- del_page_from_free_list(page, zone, current_order); +- expand(zone, page, order, current_order, migratetype); +- set_pcppage_migratetype(page, migratetype); ++ ++ page_del_and_expand(zone, page, order, current_order, ++ migratetype); + trace_mm_page_alloc_zone_locked(page, order, migratetype, + pcp_allowed_order(order) && + migratetype < MIGRATE_PCPTYPES); +@@ -1633,30 +1589,23 @@ static inline struct page *__rmqueue_cma_fallback(struct zone *zone, + #endif + + /* +- * Move the free pages in a range to the freelist tail of the requested type. +- * Note that start_page and end_pages are not aligned on a pageblock +- * boundary. If alignment is required, use move_freepages_block() ++ * Change the type of a block and move all its free pages to that ++ * type's freelist. + */ +-static int move_freepages(struct zone *zone, +- unsigned long start_pfn, unsigned long end_pfn, +- int migratetype, int *num_movable) ++static int __move_freepages_block(struct zone *zone, unsigned long start_pfn, ++ int old_mt, int new_mt) + { + struct page *page; +- unsigned long pfn; ++ unsigned long pfn, end_pfn; + unsigned int order; + int pages_moved = 0; + +- for (pfn = start_pfn; pfn <= end_pfn;) { ++ VM_WARN_ON(start_pfn & (pageblock_nr_pages - 1)); ++ end_pfn = pageblock_end_pfn(start_pfn); ++ ++ for (pfn = start_pfn; pfn < end_pfn;) { + page = pfn_to_page(pfn); + if (!PageBuddy(page)) { +- /* +- * We assume that pages that could be isolated for +- * migration are movable. 
But we don't actually try +- * isolating, as that would be expensive. +- */ +- if (num_movable && +- (PageLRU(page) || __PageMovable(page))) +- (*num_movable)++; + pfn++; + continue; + } +@@ -1666,35 +1615,186 @@ static int move_freepages(struct zone *zone, + VM_BUG_ON_PAGE(page_zone(page) != zone, page); + + order = buddy_order(page); +- move_to_free_list(page, zone, order, migratetype); ++ ++ move_to_free_list(page, zone, order, old_mt, new_mt); ++ + pfn += 1 << order; + pages_moved += 1 << order; + } + ++ set_pageblock_migratetype(pfn_to_page(start_pfn), new_mt); ++ + return pages_moved; + } + +-int move_freepages_block(struct zone *zone, struct page *page, +- int migratetype, int *num_movable) ++static bool prep_move_freepages_block(struct zone *zone, struct page *page, ++ unsigned long *start_pfn, ++ int *num_free, int *num_movable) + { +- unsigned long start_pfn, end_pfn, pfn; ++ unsigned long pfn, start, end; ++ ++ pfn = page_to_pfn(page); ++ start = pageblock_start_pfn(pfn); ++ end = pageblock_end_pfn(pfn); ++ ++ /* ++ * The caller only has the lock for @zone, don't touch ranges ++ * that straddle into other zones. While we could move part of ++ * the range that's inside the zone, this call is usually ++ * accompanied by other operations such as migratetype updates ++ * which also should be locked. ++ */ ++ if (!zone_spans_pfn(zone, start)) ++ return false; ++ if (!zone_spans_pfn(zone, end - 1)) ++ return false; ++ ++ *start_pfn = start; + +- if (num_movable) ++ if (num_free) { ++ *num_free = 0; + *num_movable = 0; ++ for (pfn = start; pfn < end;) { ++ page = pfn_to_page(pfn); ++ if (PageBuddy(page)) { ++ int nr = 1 << buddy_order(page); + +- pfn = page_to_pfn(page); +- start_pfn = pageblock_start_pfn(pfn); +- end_pfn = pageblock_end_pfn(pfn) - 1; ++ *num_free += nr; ++ pfn += nr; ++ continue; ++ } ++ /* ++ * We assume that pages that could be isolated for ++ * migration are movable. But we don't actually try ++ * isolating, as that would be expensive. ++ */ ++ if (PageLRU(page) || __PageMovable(page)) ++ (*num_movable)++; ++ pfn++; ++ } ++ } + +- /* Do not cross zone boundaries */ +- if (!zone_spans_pfn(zone, start_pfn)) +- start_pfn = pfn; +- if (!zone_spans_pfn(zone, end_pfn)) +- return 0; ++ return true; ++} ++ ++static int move_freepages_block(struct zone *zone, struct page *page, ++ int old_mt, int new_mt) ++{ ++ unsigned long start_pfn; + +- return move_freepages(zone, start_pfn, end_pfn, migratetype, +- num_movable); ++ if (!prep_move_freepages_block(zone, page, &start_pfn, NULL, NULL)) ++ return -1; ++ ++ return __move_freepages_block(zone, start_pfn, old_mt, new_mt); ++} ++ ++#ifdef CONFIG_MEMORY_ISOLATION ++/* Look for a buddy that straddles start_pfn */ ++static unsigned long find_large_buddy(unsigned long start_pfn) ++{ ++ int order = 0; ++ struct page *page; ++ unsigned long pfn = start_pfn; ++ ++ while (!PageBuddy(page = pfn_to_page(pfn))) { ++ /* Nothing found */ ++ if (++order > MAX_PAGE_ORDER) ++ return start_pfn; ++ pfn &= ~0UL << order; ++ } ++ ++ /* ++ * Found a preceding buddy, but does it straddle? 
++ */ ++ if (pfn + (1 << buddy_order(page)) > start_pfn) ++ return pfn; ++ ++ /* Nothing found */ ++ return start_pfn; ++} ++ ++/* Split a multi-block free page into its individual pageblocks */ ++static void split_large_buddy(struct zone *zone, struct page *page, ++ unsigned long pfn, int order) ++{ ++ unsigned long end_pfn = pfn + (1 << order); ++ ++ VM_WARN_ON_ONCE(order <= pageblock_order); ++ VM_WARN_ON_ONCE(pfn & (pageblock_nr_pages - 1)); ++ ++ /* Caller removed page from freelist, buddy info cleared! */ ++ VM_WARN_ON_ONCE(PageBuddy(page)); ++ ++ while (pfn != end_pfn) { ++ int mt = get_pfnblock_migratetype(page, pfn); ++ ++ __free_one_page(page, pfn, zone, pageblock_order, mt, FPI_NONE); ++ pfn += pageblock_nr_pages; ++ page = pfn_to_page(pfn); ++ } ++} ++ ++/** ++ * move_freepages_block_isolate - move free pages in block for page isolation ++ * @zone: the zone ++ * @page: the pageblock page ++ * @migratetype: migratetype to set on the pageblock ++ * ++ * This is similar to move_freepages_block(), but handles the special ++ * case encountered in page isolation, where the block of interest ++ * might be part of a larger buddy spanning multiple pageblocks. ++ * ++ * Unlike the regular page allocator path, which moves pages while ++ * stealing buddies off the freelist, page isolation is interested in ++ * arbitrary pfn ranges that may have overlapping buddies on both ends. ++ * ++ * This function handles that. Straddling buddies are split into ++ * individual pageblocks. Only the block of interest is moved. ++ * ++ * Returns %true if pages could be moved, %false otherwise. ++ */ ++bool move_freepages_block_isolate(struct zone *zone, struct page *page, ++ int migratetype) ++{ ++ unsigned long start_pfn, pfn; ++ ++ if (!prep_move_freepages_block(zone, page, &start_pfn, NULL, NULL)) ++ return false; ++ ++ /* No splits needed if buddies can't span multiple blocks */ ++ if (pageblock_order == MAX_PAGE_ORDER) ++ goto move; ++ ++ /* We're a tail block in a larger buddy */ ++ pfn = find_large_buddy(start_pfn); ++ if (pfn != start_pfn) { ++ struct page *buddy = pfn_to_page(pfn); ++ int order = buddy_order(buddy); ++ ++ del_page_from_free_list(buddy, zone, order, ++ get_pfnblock_migratetype(buddy, pfn)); ++ set_pageblock_migratetype(page, migratetype); ++ split_large_buddy(zone, buddy, pfn, order); ++ return true; ++ } ++ ++ /* We're the starting block of a larger buddy */ ++ if (PageBuddy(page) && buddy_order(page) > pageblock_order) { ++ int order = buddy_order(page); ++ ++ del_page_from_free_list(page, zone, order, ++ get_pfnblock_migratetype(page, pfn)); ++ set_pageblock_migratetype(page, migratetype); ++ split_large_buddy(zone, page, pfn, order); ++ return true; ++ } ++move: ++ __move_freepages_block(zone, start_pfn, ++ get_pfnblock_migratetype(page, start_pfn), ++ migratetype); ++ return true; + } ++#endif /* CONFIG_MEMORY_ISOLATION */ + + static void change_pageblock_range(struct page *pageblock_page, + int start_order, int migratetype) +@@ -1778,33 +1878,40 @@ static inline bool boost_watermark(struct zone *zone) + } + + /* +- * This function implements actual steal behaviour. If order is large enough, +- * we can steal whole pageblock. If not, we first move freepages in this +- * pageblock to our migratetype and determine how many already-allocated pages +- * are there in the pageblock with a compatible migratetype. 
If at least half +- * of pages are free or compatible, we can change migratetype of the pageblock +- * itself, so pages freed in the future will be put on the correct free list. ++ * This function implements actual steal behaviour. If order is large enough, we ++ * can claim the whole pageblock for the requested migratetype. If not, we check ++ * the pageblock for constituent pages; if at least half of the pages are free ++ * or compatible, we can still claim the whole block, so pages freed in the ++ * future will be put on the correct free list. Otherwise, we isolate exactly ++ * the order we need from the fallback block and leave its migratetype alone. + */ +-static void steal_suitable_fallback(struct zone *zone, struct page *page, +- unsigned int alloc_flags, int start_type, bool whole_block) ++static struct page * ++steal_suitable_fallback(struct zone *zone, struct page *page, ++ int current_order, int order, int start_type, ++ unsigned int alloc_flags, bool whole_block) + { +- unsigned int current_order = buddy_order(page); + int free_pages, movable_pages, alike_pages; +- int old_block_type; ++ unsigned long start_pfn; ++ int block_type; + +- old_block_type = get_pageblock_migratetype(page); ++ block_type = get_pageblock_migratetype(page); + + /* + * This can happen due to races and we want to prevent broken + * highatomic accounting. + */ +- if (is_migrate_highatomic(old_block_type)) ++ if (is_migrate_highatomic(block_type)) + goto single_page; + + /* Take ownership for orders >= pageblock_order */ + if (current_order >= pageblock_order) { ++ unsigned int nr_added; ++ ++ del_page_from_free_list(page, zone, current_order, block_type); + change_pageblock_range(page, current_order, start_type); +- goto single_page; ++ nr_added = expand(zone, page, order, current_order, start_type); ++ account_freepages(zone, nr_added, start_type); ++ return page; + } + + /* +@@ -1819,10 +1926,9 @@ static void steal_suitable_fallback(struct zone *zone, struct page *page, + if (!whole_block) + goto single_page; + +- free_pages = move_freepages_block(zone, page, start_type, +- &movable_pages); + /* moving whole block can fail due to zone boundary conditions */ +- if (!free_pages) ++ if (!prep_move_freepages_block(zone, page, &start_pfn, &free_pages, ++ &movable_pages)) + goto single_page; + + /* +@@ -1840,7 +1946,7 @@ static void steal_suitable_fallback(struct zone *zone, struct page *page, + * vice versa, be conservative since we can't distinguish the + * exact migratetype of non-movable pages. + */ +- if (old_block_type == MIGRATE_MOVABLE) ++ if (block_type == MIGRATE_MOVABLE) + alike_pages = pageblock_nr_pages + - (free_pages + movable_pages); + else +@@ -1851,13 +1957,14 @@ static void steal_suitable_fallback(struct zone *zone, struct page *page, + * compatible migratability as our allocation, claim the whole block. 
+ */ + if (free_pages + alike_pages >= (1 << (pageblock_order-1)) || +- page_group_by_mobility_disabled) +- set_pageblock_migratetype(page, start_type); +- +- return; ++ page_group_by_mobility_disabled) { ++ __move_freepages_block(zone, start_pfn, block_type, start_type); ++ return __rmqueue_smallest(zone, order, start_type); ++ } + + single_page: +- move_to_free_list(page, zone, current_order, start_type); ++ page_del_and_expand(zone, page, order, current_order, block_type); ++ return page; + } + + /* +@@ -1895,10 +2002,12 @@ int find_suitable_fallback(struct free_area *area, unsigned int order, + } + + /* +- * Reserve a pageblock for exclusive use of high-order atomic allocations if +- * there are no empty page blocks that contain a page with a suitable order ++ * Reserve the pageblock(s) surrounding an allocation request for ++ * exclusive use of high-order atomic allocations if there are no ++ * empty page blocks that contain a page with a suitable order + */ +-static void reserve_highatomic_pageblock(struct page *page, struct zone *zone) ++static void reserve_highatomic_pageblock(struct page *page, int order, ++ struct zone *zone) + { + int mt; + unsigned long max_managed, flags; +@@ -1924,10 +2033,16 @@ static void reserve_highatomic_pageblock(struct page *page, struct zone *zone) + /* Yoink! */ + mt = get_pageblock_migratetype(page); + /* Only reserve normal pageblocks (i.e., they can merge with others) */ +- if (migratetype_is_mergeable(mt)) { ++ if (!migratetype_is_mergeable(mt)) ++ goto out_unlock; ++ ++ if (order < pageblock_order) { ++ if (move_freepages_block(zone, page, mt, MIGRATE_HIGHATOMIC) == -1) ++ goto out_unlock; + zone->nr_reserved_highatomic += pageblock_nr_pages; +- set_pageblock_migratetype(page, MIGRATE_HIGHATOMIC); +- move_freepages_block(zone, page, MIGRATE_HIGHATOMIC, NULL); ++ } else { ++ change_pageblock_range(page, order, MIGRATE_HIGHATOMIC); ++ zone->nr_reserved_highatomic += 1 << order; + } + + out_unlock: +@@ -1940,7 +2055,7 @@ static void reserve_highatomic_pageblock(struct page *page, struct zone *zone) + * intense memory pressure but failed atomic allocations should be easier + * to recover from than an OOM. + * +- * If @force is true, try to unreserve a pageblock even though highatomic ++ * If @force is true, try to unreserve pageblocks even though highatomic + * pageblock is exhausted. + */ + static bool unreserve_highatomic_pageblock(const struct alloc_context *ac, +@@ -1952,7 +2067,7 @@ static bool unreserve_highatomic_pageblock(const struct alloc_context *ac, + struct zone *zone; + struct page *page; + int order; +- bool ret; ++ int ret; + + for_each_zone_zonelist_nodemask(zone, z, zonelist, ac->highest_zoneidx, + ac->nodemask) { +@@ -1967,11 +2082,13 @@ static bool unreserve_highatomic_pageblock(const struct alloc_context *ac, + spin_lock_irqsave(&zone->lock, flags); + for (order = 0; order < NR_PAGE_ORDERS; order++) { + struct free_area *area = &(zone->free_area[order]); ++ int mt; + + page = get_page_from_free_area(area, MIGRATE_HIGHATOMIC); + if (!page) + continue; + ++ mt = get_pageblock_migratetype(page); + /* + * In page freeing path, migratetype change is racy so + * we can counter several free pages in a pageblock +@@ -1979,7 +2096,8 @@ static bool unreserve_highatomic_pageblock(const struct alloc_context *ac, + * from highatomic to ac->migratetype. So we should + * adjust the count once. 
+ */ +- if (is_migrate_highatomic_page(page)) { ++ if (is_migrate_highatomic(mt)) { ++ unsigned long size; + /* + * It should never happen but changes to + * locking could inadvertently allow a per-cpu +@@ -1987,9 +2105,9 @@ static bool unreserve_highatomic_pageblock(const struct alloc_context *ac, + * while unreserving so be safe and watch for + * underflows. + */ +- zone->nr_reserved_highatomic -= min( +- pageblock_nr_pages, +- zone->nr_reserved_highatomic); ++ size = max(pageblock_nr_pages, 1UL << order); ++ size = min(size, zone->nr_reserved_highatomic); ++ zone->nr_reserved_highatomic -= size; + } + + /* +@@ -2001,10 +2119,22 @@ static bool unreserve_highatomic_pageblock(const struct alloc_context *ac, + * of pageblocks that cannot be completely freed + * may increase. + */ +- set_pageblock_migratetype(page, ac->migratetype); +- ret = move_freepages_block(zone, page, ac->migratetype, +- NULL); +- if (ret) { ++ if (order < pageblock_order) ++ ret = move_freepages_block(zone, page, mt, ++ ac->migratetype); ++ else { ++ move_to_free_list(page, zone, order, mt, ++ ac->migratetype); ++ change_pageblock_range(page, order, ++ ac->migratetype); ++ ret = 1; ++ } ++ /* ++ * Reserving the block(s) already succeeded, ++ * so this should not fail on zone boundaries. ++ */ ++ WARN_ON_ONCE(ret == -1); ++ if (ret > 0) { + spin_unlock_irqrestore(&zone->lock, flags); + return ret; + } +@@ -2025,7 +2155,7 @@ static bool unreserve_highatomic_pageblock(const struct alloc_context *ac, + * deviation from the rest of this file, to make the for loop + * condition simpler. + */ +-static __always_inline bool ++static __always_inline struct page * + __rmqueue_fallback(struct zone *zone, int order, int start_migratetype, + unsigned int alloc_flags) + { +@@ -2072,7 +2202,7 @@ __rmqueue_fallback(struct zone *zone, int order, int start_migratetype, + goto do_steal; + } + +- return false; ++ return NULL; + + find_smallest: + for (current_order = order; current_order < NR_PAGE_ORDERS; current_order++) { +@@ -2092,14 +2222,14 @@ __rmqueue_fallback(struct zone *zone, int order, int start_migratetype, + do_steal: + page = get_page_from_free_area(area, fallback_mt); + +- steal_suitable_fallback(zone, page, alloc_flags, start_migratetype, +- can_steal); ++ /* take off list, maybe claim block, expand remainder */ ++ page = steal_suitable_fallback(zone, page, current_order, order, ++ start_migratetype, alloc_flags, can_steal); + + trace_mm_page_alloc_extfrag(page, order, current_order, + start_migratetype, fallback_mt); + +- return true; +- ++ return page; + } + + /* +@@ -2126,15 +2256,15 @@ __rmqueue(struct zone *zone, unsigned int order, int migratetype, + return page; + } + } +-retry: ++ + page = __rmqueue_smallest(zone, order, migratetype); + if (unlikely(!page)) { + if (alloc_flags & ALLOC_CMA) + page = __rmqueue_cma_fallback(zone, order); + +- if (!page && __rmqueue_fallback(zone, order, migratetype, +- alloc_flags)) +- goto retry; ++ if (!page) ++ page = __rmqueue_fallback(zone, order, migratetype, ++ alloc_flags); + } + return page; + } +@@ -2169,12 +2299,7 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, + * pages are ordered properly. 
+ */ + list_add_tail(&page->pcp_list, list); +- if (is_migrate_cma(get_pcppage_migratetype(page))) +- __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, +- -(1 << order)); + } +- +- __mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order)); + spin_unlock_irqrestore(&zone->lock, flags); + + return i; +@@ -2369,19 +2494,6 @@ void drain_all_pages(struct zone *zone) + __drain_all_pages(zone, false); + } + +-static bool free_unref_page_prepare(struct page *page, unsigned long pfn, +- unsigned int order) +-{ +- int migratetype; +- +- if (!free_pages_prepare(page, order)) +- return false; +- +- migratetype = get_pfnblock_migratetype(page, pfn); +- set_pcppage_migratetype(page, migratetype); +- return true; +-} +- + static int nr_pcp_free(struct per_cpu_pages *pcp, int batch, int high, bool free_high) + { + int min_nr_free, max_nr_free; +@@ -2512,7 +2624,7 @@ void free_unref_page(struct page *page, unsigned int order) + struct per_cpu_pages *pcp; + struct zone *zone; + unsigned long pfn = page_to_pfn(page); +- int migratetype, pcpmigratetype; ++ int migratetype; + + if (page_from_dynamic_pool(page)) { + dynamic_pool_free_page(page); +@@ -2524,7 +2636,7 @@ void free_unref_page(struct page *page, unsigned int order) + return; + } + +- if (!free_unref_page_prepare(page, pfn, order)) ++ if (!free_pages_prepare(page, order)) + return; + + /* +@@ -2534,23 +2646,23 @@ void free_unref_page(struct page *page, unsigned int order) + * get those areas back if necessary. Otherwise, we may have to free + * excessively into the page allocator + */ +- migratetype = pcpmigratetype = get_pcppage_migratetype(page); ++ migratetype = get_pfnblock_migratetype(page, pfn); + if (unlikely(migratetype >= MIGRATE_PCPTYPES)) { + if (unlikely(is_migrate_isolate(migratetype))) { +- free_one_page(page_zone(page), page, pfn, order, migratetype, FPI_NONE); ++ free_one_page(page_zone(page), page, pfn, order, FPI_NONE); + return; + } +- pcpmigratetype = MIGRATE_MOVABLE; ++ migratetype = MIGRATE_MOVABLE; + } + + zone = page_zone(page); + pcp_trylock_prepare(UP_flags); + pcp = pcp_spin_trylock(zone->per_cpu_pageset); + if (pcp) { +- free_unref_page_commit(zone, pcp, page, pcpmigratetype, order); ++ free_unref_page_commit(zone, pcp, page, migratetype, order); + pcp_spin_unlock(pcp); + } else { +- free_one_page(zone, page, pfn, order, migratetype, FPI_NONE); ++ free_one_page(zone, page, pfn, order, FPI_NONE); + } + pcp_trylock_finish(UP_flags); + } +@@ -2563,7 +2675,7 @@ void free_unref_folios(struct folio_batch *folios) + unsigned long __maybe_unused UP_flags; + struct per_cpu_pages *pcp = NULL; + struct zone *locked_zone = NULL; +- int i, j, migratetype; ++ int i, j; + + /* Prepare folios for freeing */ + for (i = 0, j = 0; i < folios->nr; i++) { +@@ -2577,18 +2689,15 @@ void free_unref_folios(struct folio_batch *folios) + } + + folio_undo_large_rmappable(folio); +- if (!free_unref_page_prepare(&folio->page, pfn, order)) ++ if (!free_pages_prepare(&folio->page, order)) + continue; +- + /* +- * Free isolated folios and orders not handled on the PCP +- * directly to the allocator, see comment in free_unref_page. ++ * Free orders not handled on the PCP directly to the ++ * allocator. 
+ */ +- migratetype = get_pcppage_migratetype(&folio->page); +- if (!pcp_allowed_order(order) || +- is_migrate_isolate(migratetype)) { +- free_one_page(folio_zone(folio), &folio->page, pfn, +- order, migratetype, FPI_NONE); ++ if (!pcp_allowed_order(order)) { ++ free_one_page(folio_zone(folio), &folio->page, ++ pfn, order, FPI_NONE); + continue; + } + folio->private = (void *)(unsigned long)order; +@@ -2601,16 +2710,31 @@ void free_unref_folios(struct folio_batch *folios) + for (i = 0; i < folios->nr; i++) { + struct folio *folio = folios->folios[i]; + struct zone *zone = folio_zone(folio); ++ unsigned long pfn = folio_pfn(folio); + unsigned int order = (unsigned long)folio->private; ++ int migratetype; + + folio->private = NULL; +- migratetype = get_pcppage_migratetype(&folio->page); ++ migratetype = get_pfnblock_migratetype(&folio->page, pfn); + + /* Different zone requires a different pcp lock */ +- if (zone != locked_zone) { ++ if (zone != locked_zone || ++ is_migrate_isolate(migratetype)) { + if (pcp) { + pcp_spin_unlock(pcp); + pcp_trylock_finish(UP_flags); ++ locked_zone = NULL; ++ pcp = NULL; ++ } ++ ++ /* ++ * Free isolated pages directly to the ++ * allocator, see comment in free_unref_page. ++ */ ++ if (is_migrate_isolate(migratetype)) { ++ free_one_page(zone, &folio->page, pfn, ++ order, FPI_NONE); ++ continue; + } + + /* +@@ -2621,10 +2745,8 @@ void free_unref_folios(struct folio_batch *folios) + pcp = pcp_spin_trylock(zone->per_cpu_pageset); + if (unlikely(!pcp)) { + pcp_trylock_finish(UP_flags); +- free_one_page(zone, &folio->page, +- folio_pfn(folio), order, +- migratetype, FPI_NONE); +- locked_zone = NULL; ++ free_one_page(zone, &folio->page, pfn, ++ order, FPI_NONE); + continue; + } + locked_zone = zone; +@@ -2687,11 +2809,9 @@ int __isolate_free_page(struct page *page, unsigned int order) + watermark = zone->_watermark[WMARK_MIN] + (1UL << order); + if (!zone_watermark_ok(zone, 0, watermark, 0, ALLOC_CMA)) + return 0; +- +- __mod_zone_freepage_state(zone, -(1UL << order), mt); + } + +- del_page_from_free_list(page, zone, order); ++ del_page_from_free_list(page, zone, order, mt); + + /* + * Set the pageblock if the isolated page is at least half of a +@@ -2706,8 +2826,8 @@ int __isolate_free_page(struct page *page, unsigned int order) + * with others) + */ + if (migratetype_is_mergeable(mt)) +- set_pageblock_migratetype(page, +- MIGRATE_MOVABLE); ++ move_freepages_block(zone, page, mt, ++ MIGRATE_MOVABLE); + } + } + +@@ -2791,8 +2911,6 @@ struct page *rmqueue_buddy(struct zone *preferred_zone, struct zone *zone, + return NULL; + } + } +- __mod_zone_freepage_state(zone, -(1 << order), +- get_pcppage_migratetype(page)); + spin_unlock_irqrestore(&zone->lock, flags); + } while (check_new_pages(page, order)); + +@@ -2974,11 +3092,10 @@ static inline long __zone_watermark_unusable_free(struct zone *z, + + /* + * If the caller does not have rights to reserves below the min +- * watermark then subtract the high-atomic reserves. This will +- * over-estimate the size of the atomic reserve but it avoids a search. ++ * watermark then subtract the free pages reserved for highatomic. 
+ */ + if (likely(!(alloc_flags & ALLOC_RESERVES))) +- unusable_free += z->nr_reserved_highatomic; ++ unusable_free += READ_ONCE(z->nr_free_highatomic); + + #ifdef CONFIG_CMA + /* If allocation can't use CMA areas don't use free CMA pages */ +@@ -3360,7 +3477,7 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags, + * if the pageblock should be reserved for the future + */ + if (unlikely(alloc_flags & ALLOC_HIGHATOMIC)) +- reserve_highatomic_pageblock(page, zone); ++ reserve_highatomic_pageblock(page, order, zone); + + return page; + } else { +@@ -6570,7 +6687,6 @@ int alloc_contig_range(unsigned long start, unsigned long end, + unsigned migratetype, gfp_t gfp_mask) + { + unsigned long outer_start, outer_end; +- int order; + int ret = 0; + + struct compact_control cc = { +@@ -6643,29 +6759,7 @@ int alloc_contig_range(unsigned long start, unsigned long end, + * We don't have to hold zone->lock here because the pages are + * isolated thus they won't get removed from buddy. + */ +- +- order = 0; +- outer_start = start; +- while (!PageBuddy(pfn_to_page(outer_start))) { +- if (++order > MAX_ORDER) { +- outer_start = start; +- break; +- } +- outer_start &= ~0UL << order; +- } +- +- if (outer_start != start) { +- order = buddy_order(pfn_to_page(outer_start)); +- +- /* +- * outer_start page could be small order buddy page and +- * it doesn't include start page. Adjust outer_start +- * in this case to report failed page properly +- * on tracepoint in test_pages_isolated() +- */ +- if (outer_start + (1UL << order) <= start) +- outer_start = start; +- } ++ outer_start = find_large_buddy(start); + + /* Make sure the range is really isolated. */ + if (test_pages_isolated(outer_start, end, 0)) { +@@ -6899,8 +6993,9 @@ void __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn) + + BUG_ON(page_count(page)); + BUG_ON(!PageBuddy(page)); ++ VM_WARN_ON(get_pageblock_migratetype(page) != MIGRATE_ISOLATE); + order = buddy_order(page); +- del_page_from_free_list(page, zone, order); ++ del_page_from_free_list(page, zone, order, MIGRATE_ISOLATE); + pfn += (1 << order); + } + spin_unlock_irqrestore(&zone->lock, flags); +@@ -6928,6 +7023,14 @@ bool is_free_buddy_page(struct page *page) + EXPORT_SYMBOL(is_free_buddy_page); + + #ifdef CONFIG_MEMORY_FAILURE ++static inline void add_to_free_list(struct page *page, struct zone *zone, ++ unsigned int order, int migratetype, ++ bool tail) ++{ ++ __add_to_free_list(page, zone, order, migratetype, tail); ++ account_freepages(zone, 1 << order, migratetype); ++} ++ + /* + * Break down a higher-order page in sub-pages, and keep our target out of + * buddy allocator. 
+@@ -6937,28 +7040,24 @@ static void break_down_buddy_pages(struct zone *zone, struct page *page, + int migratetype) + { + unsigned long size = 1 << high; +- struct page *current_buddy, *next_page; ++ struct page *current_buddy; + + while (high > low) { + high--; + size >>= 1; + + if (target >= &page[size]) { +- next_page = page + size; + current_buddy = page; ++ page = page + size; + } else { +- next_page = page; + current_buddy = page + size; + } +- page = next_page; + +- if (set_page_guard(zone, current_buddy, high, migratetype)) ++ if (set_page_guard(zone, current_buddy, high)) + continue; + +- if (current_buddy != target) { +- add_to_free_list(current_buddy, zone, high, migratetype); +- set_buddy_order(current_buddy, high); +- } ++ add_to_free_list(current_buddy, zone, high, migratetype, false); ++ set_buddy_order(current_buddy, high); + } + } + +@@ -6983,12 +7082,11 @@ bool take_page_off_buddy(struct page *page) + int migratetype = get_pfnblock_migratetype(page_head, + pfn_head); + +- del_page_from_free_list(page_head, zone, page_order); ++ del_page_from_free_list(page_head, zone, page_order, ++ migratetype); + break_down_buddy_pages(zone, page_head, page, 0, + page_order, migratetype); + SetPageHWPoisonTakenOff(page); +- if (!is_migrate_isolate(migratetype)) +- __mod_zone_freepage_state(zone, -1, migratetype); + ret = true; + break; + } +@@ -7005,13 +7103,14 @@ bool take_page_off_buddy(struct page *page) + bool put_page_back_buddy(struct page *page) + { + struct zone *zone = page_zone(page); +- unsigned long pfn = page_to_pfn(page); + unsigned long flags; +- int migratetype = get_pfnblock_migratetype(page, pfn); + bool ret = false; + + spin_lock_irqsave(&zone->lock, flags); + if (put_page_testzero(page)) { ++ unsigned long pfn = page_to_pfn(page); ++ int migratetype = get_pfnblock_migratetype(page, pfn); ++ + ClearPageHWPoisonTakenOff(page); + __free_one_page(page, pfn, zone, 0, migratetype, FPI_NONE); + if (TestClearPageHWPoison(page)) { +@@ -7092,7 +7191,7 @@ static bool try_to_accept_memory_one(struct zone *zone) + list_del(&page->lru); + last = list_empty(&zone->unaccepted_pages); + +- __mod_zone_freepage_state(zone, -MAX_ORDER_NR_PAGES, MIGRATE_MOVABLE); ++ account_freepages(zone, -MAX_ORDER_NR_PAGES, MIGRATE_MOVABLE); + __mod_zone_page_state(zone, NR_UNACCEPTED, -MAX_ORDER_NR_PAGES); + spin_unlock_irqrestore(&zone->lock, flags); + +@@ -7150,7 +7249,7 @@ static bool __free_unaccepted(struct page *page) + spin_lock_irqsave(&zone->lock, flags); + first = list_empty(&zone->unaccepted_pages); + list_add_tail(&page->lru, &zone->unaccepted_pages); +- __mod_zone_freepage_state(zone, MAX_ORDER_NR_PAGES, MIGRATE_MOVABLE); ++ account_freepages(zone, MAX_ORDER_NR_PAGES, MIGRATE_MOVABLE); + __mod_zone_page_state(zone, NR_UNACCEPTED, MAX_ORDER_NR_PAGES); + spin_unlock_irqrestore(&zone->lock, flags); + +diff --git a/mm/page_isolation.c b/mm/page_isolation.c +index 03381be87b28..cf7f1922fc3e 100644 +--- a/mm/page_isolation.c ++++ b/mm/page_isolation.c +@@ -179,15 +179,11 @@ static int set_migratetype_isolate(struct page *page, int migratetype, int isol_ + unmovable = has_unmovable_pages(check_unmovable_start, check_unmovable_end, + migratetype, isol_flags); + if (!unmovable) { +- unsigned long nr_pages; +- int mt = get_pageblock_migratetype(page); +- +- set_pageblock_migratetype(page, MIGRATE_ISOLATE); ++ if (!move_freepages_block_isolate(zone, page, MIGRATE_ISOLATE)) { ++ spin_unlock_irqrestore(&zone->lock, flags); ++ return -EBUSY; ++ } + zone->nr_isolate_pageblock++; +- nr_pages = 
move_freepages_block(zone, page, MIGRATE_ISOLATE, +- NULL); +- +- __mod_zone_freepage_state(zone, -nr_pages, mt); + spin_unlock_irqrestore(&zone->lock, flags); + return 0; + } +@@ -207,7 +203,7 @@ static int set_migratetype_isolate(struct page *page, int migratetype, int isol_ + static void unset_migratetype_isolate(struct page *page, int migratetype) + { + struct zone *zone; +- unsigned long flags, nr_pages; ++ unsigned long flags; + bool isolated_page = false; + unsigned int order; + struct page *buddy; +@@ -253,12 +249,15 @@ static void unset_migratetype_isolate(struct page *page, int migratetype) + * allocation. + */ + if (!isolated_page) { +- nr_pages = move_freepages_block(zone, page, migratetype, NULL); +- __mod_zone_freepage_state(zone, nr_pages, migratetype); +- } +- set_pageblock_migratetype(page, migratetype); +- if (isolated_page) ++ /* ++ * Isolating this block already succeeded, so this ++ * should not fail on zone boundaries. ++ */ ++ WARN_ON_ONCE(!move_freepages_block_isolate(zone, page, migratetype)); ++ } else { ++ set_pageblock_migratetype(page, migratetype); + __putback_isolated_page(page, order, migratetype); ++ } + zone->nr_isolate_pageblock--; + out: + spin_unlock_irqrestore(&zone->lock, flags); +@@ -370,108 +369,52 @@ static int isolate_single_pageblock(unsigned long boundary_pfn, int flags, + + VM_BUG_ON(!page); + pfn = page_to_pfn(page); +- /* +- * start_pfn is MAX_ORDER_NR_PAGES aligned, if there is any +- * free pages in [start_pfn, boundary_pfn), its head page will +- * always be in the range. +- */ ++ + if (PageBuddy(page)) { + int order = buddy_order(page); + +- if (pfn + (1UL << order) > boundary_pfn) { +- /* free page changed before split, check it again */ +- if (split_free_page(page, order, boundary_pfn - pfn)) +- continue; +- } ++ /* move_freepages_block_isolate() handled this */ ++ VM_WARN_ON_ONCE(pfn + (1 << order) > boundary_pfn); + + pfn += 1UL << order; + continue; + } ++ + /* +- * migrate compound pages then let the free page handling code +- * above do the rest. If migration is not possible, just fail. ++ * If a compound page is straddling our block, attempt ++ * to migrate it out of the way. ++ * ++ * We don't have to worry about this creating a large ++ * free page that straddles into our block: gigantic ++ * pages are freed as order-0 chunks, and LRU pages ++ * (currently) do not exceed pageblock_order. ++ * ++ * The block of interest has already been marked ++ * MIGRATE_ISOLATE above, so when migration is done it ++ * will free its pages onto the correct freelists. + */ + if (PageCompound(page)) { + struct page *head = compound_head(page); + unsigned long head_pfn = page_to_pfn(head); + unsigned long nr_pages = compound_nr(head); + +- if (head_pfn + nr_pages <= boundary_pfn) { ++ if (head_pfn + nr_pages <= boundary_pfn || ++ PageHuge(page)) { + pfn = head_pfn + nr_pages; + continue; + } +-#if defined CONFIG_COMPACTION || defined CONFIG_CMA ++ + /* +- * hugetlb, lru compound (THP), and movable compound pages +- * can be migrated. Otherwise, fail the isolation. ++ * These pages are movable too, but they're ++ * not expected to exceed pageblock_order. ++ * ++ * Let us know when they do, so we can add ++ * proper free and split handling for them. 
+ */ +- if (PageHuge(page) || PageLRU(page) || __PageMovable(page)) { +- int order; +- unsigned long outer_pfn; +- int page_mt = get_pageblock_migratetype(page); +- bool isolate_page = !is_migrate_isolate_page(page); +- struct compact_control cc = { +- .nr_migratepages = 0, +- .order = -1, +- .zone = page_zone(pfn_to_page(head_pfn)), +- .mode = MIGRATE_SYNC, +- .ignore_skip_hint = true, +- .no_set_skip_hint = true, +- .gfp_mask = gfp_flags, +- .alloc_contig = true, +- }; +- INIT_LIST_HEAD(&cc.migratepages); +- +- /* +- * XXX: mark the page as MIGRATE_ISOLATE so that +- * no one else can grab the freed page after migration. +- * Ideally, the page should be freed as two separate +- * pages to be added into separate migratetype free +- * lists. +- */ +- if (isolate_page) { +- ret = set_migratetype_isolate(page, page_mt, +- flags, head_pfn, head_pfn + nr_pages); +- if (ret) +- goto failed; +- } +- +- ret = __alloc_contig_migrate_range(&cc, head_pfn, +- head_pfn + nr_pages, page_mt); ++ VM_WARN_ON_ONCE_PAGE(PageLRU(page), page); ++ VM_WARN_ON_ONCE_PAGE(__PageMovable(page), page); + +- /* +- * restore the page's migratetype so that it can +- * be split into separate migratetype free lists +- * later. +- */ +- if (isolate_page) +- unset_migratetype_isolate(page, page_mt); +- +- if (ret) +- goto failed; +- /* +- * reset pfn to the head of the free page, so +- * that the free page handling code above can split +- * the free page to the right migratetype list. +- * +- * head_pfn is not used here as a hugetlb page order +- * can be bigger than MAX_ORDER, but after it is +- * freed, the free page order is not. Use pfn within +- * the range to find the head of the free page. +- */ +- order = 0; +- outer_pfn = pfn; +- while (!PageBuddy(pfn_to_page(outer_pfn))) { +- /* stop if we cannot find the free page */ +- if (++order > MAX_ORDER) +- goto failed; +- outer_pfn &= ~0UL << order; +- } +- pfn = outer_pfn; +- continue; +- } else +-#endif +- goto failed; ++ goto failed; + } + + pfn++; +diff --git a/mm/readahead.c b/mm/readahead.c +index 438f142a3e74..c13c130efcca 100644 +--- a/mm/readahead.c ++++ b/mm/readahead.c +@@ -513,10 +513,10 @@ void page_cache_ra_order(struct readahead_control *ractl, + + limit = min(limit, index + ra->size - 1); + +- if (new_order < MAX_PAGECACHE_ORDER) ++ if (new_order < mapping_max_folio_order(mapping)) + new_order += 2; + +- new_order = min_t(unsigned int, MAX_PAGECACHE_ORDER, new_order); ++ new_order = min(mapping_max_folio_order(mapping), new_order); + new_order = min_t(unsigned int, new_order, ilog2(ra->size)); + + /* See comment in page_cache_ra_unbounded() */ +diff --git a/mm/swap_slots.c b/mm/swap_slots.c +index 7af3b93d4c8c..5579eed7065f 100644 +--- a/mm/swap_slots.c ++++ b/mm/swap_slots.c +@@ -34,6 +34,7 @@ + #include + #include + #include ++#include + + static DEFINE_PER_CPU(struct swap_slots_cache, swp_slots); + #ifdef CONFIG_MEMCG_SWAP_QOS +@@ -394,6 +395,9 @@ void free_swap_slot(swp_entry_t entry) + { + struct swap_slots_cache *cache; + ++ /* Large folio swap slot is not covered. 
*/ ++ zswap_invalidate(entry); ++ + cache = raw_cpu_ptr(&swp_slots); + if (likely(use_swap_slot_cache && cache->slots_ret)) { + spin_lock_irq(&cache->free_lock); +diff --git a/mm/swapfile.c b/mm/swapfile.c +index 3af5b6ebb241..3b48159820f2 100644 +--- a/mm/swapfile.c ++++ b/mm/swapfile.c +@@ -52,6 +52,15 @@ + static bool swap_count_continued(struct swap_info_struct *, pgoff_t, + unsigned char); + static void free_swap_count_continuations(struct swap_info_struct *); ++static void swap_entry_range_free(struct swap_info_struct *si, swp_entry_t entry, ++ unsigned int nr_pages); ++static void swap_range_alloc(struct swap_info_struct *si, unsigned long offset, ++ unsigned int nr_entries); ++static bool folio_swapcache_freeable(struct folio *folio); ++static struct swap_cluster_info *lock_cluster_or_swap_info( ++ struct swap_info_struct *si, unsigned long offset); ++static void unlock_cluster_or_swap_info(struct swap_info_struct *si, ++ struct swap_cluster_info *ci); + + static DEFINE_SPINLOCK(swap_lock); + static unsigned int nr_swapfiles; +@@ -126,8 +135,25 @@ static inline unsigned char swap_count(unsigned char ent) + * corresponding page + */ + #define TTRS_UNMAPPED 0x2 +-/* Reclaim the swap entry if swap is getting full*/ ++/* Reclaim the swap entry if swap is getting full */ + #define TTRS_FULL 0x4 ++/* Reclaim directly, bypass the slot cache and don't touch device lock */ ++#define TTRS_DIRECT 0x8 ++ ++static bool swap_is_has_cache(struct swap_info_struct *si, ++ unsigned long offset, int nr_pages) ++{ ++ unsigned char *map = si->swap_map + offset; ++ unsigned char *map_end = map + nr_pages; ++ ++ do { ++ VM_BUG_ON(!(*map & SWAP_HAS_CACHE)); ++ if (*map != SWAP_HAS_CACHE) ++ return false; ++ } while (++map < map_end); ++ ++ return true; ++} + + /* + * returns number of pages in the folio that backs the swap entry. If positive, +@@ -138,12 +164,19 @@ static int __try_to_reclaim_swap(struct swap_info_struct *si, + unsigned long offset, unsigned long flags) + { + swp_entry_t entry = swp_entry(si->type, offset); ++ struct address_space *address_space = swap_address_space(entry); ++ struct swap_cluster_info *ci; + struct folio *folio; +- int ret = 0; ++ int ret, nr_pages; ++ bool need_reclaim; + +- folio = filemap_get_folio(swap_address_space(entry), offset); ++ folio = filemap_get_folio(address_space, offset); + if (IS_ERR(folio)) + return 0; ++ ++ nr_pages = folio_nr_pages(folio); ++ ret = -nr_pages; ++ + /* + * When this function is called from scan_swap_map_slots() and it's + * called by vmscan.c at reclaiming folios. So we hold a folio lock +@@ -151,14 +184,54 @@ static int __try_to_reclaim_swap(struct swap_info_struct *si, + * case and you should use folio_free_swap() with explicit folio_lock() + * in usual operations. 
+ */ +- if (folio_trylock(folio)) { +- if ((flags & TTRS_ANYWAY) || +- ((flags & TTRS_UNMAPPED) && !folio_mapped(folio)) || +- ((flags & TTRS_FULL) && mem_cgroup_swap_full(folio))) +- ret = folio_free_swap(folio); +- folio_unlock(folio); ++ if (!folio_trylock(folio)) ++ goto out; ++ ++ /* offset could point to the middle of a large folio */ ++ entry = folio->swap; ++ offset = swp_offset(entry); ++ ++ need_reclaim = ((flags & TTRS_ANYWAY) || ++ ((flags & TTRS_UNMAPPED) && !folio_mapped(folio)) || ++ ((flags & TTRS_FULL) && mem_cgroup_swap_full(folio))); ++ if (!need_reclaim || !folio_swapcache_freeable(folio)) ++ goto out_unlock; ++ ++ /* ++ * It's safe to delete the folio from swap cache only if the folio's ++ * swap_map is HAS_CACHE only, which means the slots have no page table ++ * reference or pending writeback, and can't be allocated to others. ++ */ ++ ci = lock_cluster_or_swap_info(si, offset); ++ need_reclaim = swap_is_has_cache(si, offset, nr_pages); ++ unlock_cluster_or_swap_info(si, ci); ++ if (!need_reclaim) ++ goto out_unlock; ++ ++ if (!(flags & TTRS_DIRECT)) { ++ /* Free through slot cache */ ++ delete_from_swap_cache(folio); ++ folio_set_dirty(folio); ++ ret = nr_pages; ++ goto out_unlock; + } +- ret = ret ? folio_nr_pages(folio) : -folio_nr_pages(folio); ++ ++ xa_lock_irq(&address_space->i_pages); ++ __delete_from_swap_cache(folio, entry, NULL); ++ xa_unlock_irq(&address_space->i_pages); ++ folio_ref_sub(folio, nr_pages); ++ folio_set_dirty(folio); ++ ++ spin_lock(&si->lock); ++ /* Only sinple page folio can be backed by zswap */ ++ if (nr_pages == 1) ++ zswap_invalidate(entry); ++ swap_entry_range_free(si, entry, nr_pages); ++ spin_unlock(&si->lock); ++ ret = nr_pages; ++out_unlock: ++ folio_unlock(folio); ++out: + folio_put(folio); + return ret; + } +@@ -289,62 +362,21 @@ static void discard_swap_cluster(struct swap_info_struct *si, + #endif + #define LATENCY_LIMIT 256 + +-static inline void cluster_set_flag(struct swap_cluster_info *info, +- unsigned int flag) +-{ +- info->flags = flag; +-} +- +-static inline unsigned int cluster_count(struct swap_cluster_info *info) +-{ +- return info->data; +-} +- +-static inline void cluster_set_count(struct swap_cluster_info *info, +- unsigned int c) +-{ +- info->data = c; +-} +- +-static inline void cluster_set_count_flag(struct swap_cluster_info *info, +- unsigned int c, unsigned int f) +-{ +- info->flags = f; +- info->data = c; +-} +- +-static inline unsigned int cluster_next(struct swap_cluster_info *info) +-{ +- return info->data; +-} +- +-static inline void cluster_set_next(struct swap_cluster_info *info, +- unsigned int n) +-{ +- info->data = n; +-} +- +-static inline void cluster_set_next_flag(struct swap_cluster_info *info, +- unsigned int n, unsigned int f) +-{ +- info->flags = f; +- info->data = n; +-} +- + static inline bool cluster_is_free(struct swap_cluster_info *info) + { + return info->flags & CLUSTER_FLAG_FREE; + } + +-static inline bool cluster_is_null(struct swap_cluster_info *info) ++static inline unsigned int cluster_index(struct swap_info_struct *si, ++ struct swap_cluster_info *ci) + { +- return info->flags & CLUSTER_FLAG_NEXT_NULL; ++ return ci - si->cluster_info; + } + +-static inline void cluster_set_null(struct swap_cluster_info *info) ++static inline unsigned int cluster_offset(struct swap_info_struct *si, ++ struct swap_cluster_info *ci) + { +- info->flags = CLUSTER_FLAG_NEXT_NULL; +- info->data = 0; ++ return cluster_index(si, ci) * SWAPFILE_CLUSTER; + } + + static inline struct swap_cluster_info 
*lock_cluster(struct swap_info_struct *si, +@@ -393,65 +425,11 @@ static inline void unlock_cluster_or_swap_info(struct swap_info_struct *si, + spin_unlock(&si->lock); + } + +-static inline bool cluster_list_empty(struct swap_cluster_list *list) +-{ +- return cluster_is_null(&list->head); +-} +- +-static inline unsigned int cluster_list_first(struct swap_cluster_list *list) +-{ +- return cluster_next(&list->head); +-} +- +-static void cluster_list_init(struct swap_cluster_list *list) +-{ +- cluster_set_null(&list->head); +- cluster_set_null(&list->tail); +-} +- +-static void cluster_list_add_tail(struct swap_cluster_list *list, +- struct swap_cluster_info *ci, +- unsigned int idx) +-{ +- if (cluster_list_empty(list)) { +- cluster_set_next_flag(&list->head, idx, 0); +- cluster_set_next_flag(&list->tail, idx, 0); +- } else { +- struct swap_cluster_info *ci_tail; +- unsigned int tail = cluster_next(&list->tail); +- +- /* +- * Nested cluster lock, but both cluster locks are +- * only acquired when we held swap_info_struct->lock +- */ +- ci_tail = ci + tail; +- spin_lock_nested(&ci_tail->lock, SINGLE_DEPTH_NESTING); +- cluster_set_next(ci_tail, idx); +- spin_unlock(&ci_tail->lock); +- cluster_set_next_flag(&list->tail, idx, 0); +- } +-} +- +-static unsigned int cluster_list_del_first(struct swap_cluster_list *list, +- struct swap_cluster_info *ci) +-{ +- unsigned int idx; +- +- idx = cluster_next(&list->head); +- if (cluster_next(&list->tail) == idx) { +- cluster_set_null(&list->head); +- cluster_set_null(&list->tail); +- } else +- cluster_set_next_flag(&list->head, +- cluster_next(&ci[idx]), 0); +- +- return idx; +-} +- + /* Add a cluster to discard list and schedule it to do discard */ + static void swap_cluster_schedule_discard(struct swap_info_struct *si, +- unsigned int idx) ++ struct swap_cluster_info *ci) + { ++ unsigned int idx = cluster_index(si, ci); + /* + * If scan_swap_map_slots() can't find a free cluster, it will check + * si->swap_map directly. 
To make sure the discarding cluster isn't +@@ -461,17 +439,23 @@ static void swap_cluster_schedule_discard(struct swap_info_struct *si, + memset(si->swap_map + idx * SWAPFILE_CLUSTER, + SWAP_MAP_BAD, SWAPFILE_CLUSTER); + +- cluster_list_add_tail(&si->discard_clusters, si->cluster_info, idx); +- ++ VM_BUG_ON(ci->flags & CLUSTER_FLAG_FREE); ++ list_move_tail(&ci->list, &si->discard_clusters); ++ ci->flags = 0; + schedule_work(&si->discard_work); + } + +-static void __free_cluster(struct swap_info_struct *si, unsigned long idx) ++static void __free_cluster(struct swap_info_struct *si, struct swap_cluster_info *ci) + { +- struct swap_cluster_info *ci = si->cluster_info; ++ lockdep_assert_held(&si->lock); ++ lockdep_assert_held(&ci->lock); + +- cluster_set_flag(ci + idx, CLUSTER_FLAG_FREE); +- cluster_list_add_tail(&si->free_clusters, ci, idx); ++ if (ci->flags) ++ list_move_tail(&ci->list, &si->free_clusters); ++ else ++ list_add_tail(&ci->list, &si->free_clusters); ++ ci->flags = CLUSTER_FLAG_FREE; ++ ci->order = 0; + } + + /* +@@ -480,24 +464,24 @@ static void __free_cluster(struct swap_info_struct *si, unsigned long idx) + */ + static void swap_do_scheduled_discard(struct swap_info_struct *si) + { +- struct swap_cluster_info *info, *ci; ++ struct swap_cluster_info *ci; + unsigned int idx; + +- info = si->cluster_info; +- +- while (!cluster_list_empty(&si->discard_clusters)) { +- idx = cluster_list_del_first(&si->discard_clusters, info); ++ while (!list_empty(&si->discard_clusters)) { ++ ci = list_first_entry(&si->discard_clusters, struct swap_cluster_info, list); ++ list_del(&ci->list); ++ idx = cluster_index(si, ci); + spin_unlock(&si->lock); + + discard_swap_cluster(si, idx * SWAPFILE_CLUSTER, + SWAPFILE_CLUSTER); + + spin_lock(&si->lock); +- ci = lock_cluster(si, idx * SWAPFILE_CLUSTER); +- __free_cluster(si, idx); ++ spin_lock(&ci->lock); ++ __free_cluster(si, ci); + memset(si->swap_map + idx * SWAPFILE_CLUSTER, + 0, SWAPFILE_CLUSTER); +- unlock_cluster(ci); ++ spin_unlock(&ci->lock); + } + } + +@@ -520,20 +504,15 @@ static void swap_users_ref_free(struct percpu_ref *ref) + complete(&si->comp); + } + +-static void alloc_cluster(struct swap_info_struct *si, unsigned long idx) ++static void free_cluster(struct swap_info_struct *si, struct swap_cluster_info *ci) + { +- struct swap_cluster_info *ci = si->cluster_info; ++ VM_BUG_ON(ci->count != 0); ++ lockdep_assert_held(&si->lock); ++ lockdep_assert_held(&ci->lock); + +- VM_BUG_ON(cluster_list_first(&si->free_clusters) != idx); +- cluster_list_del_first(&si->free_clusters, ci); +- cluster_set_count_flag(ci + idx, 0, 0); +-} +- +-static void free_cluster(struct swap_info_struct *si, unsigned long idx) +-{ +- struct swap_cluster_info *ci = si->cluster_info + idx; ++ if (ci->flags & CLUSTER_FLAG_FRAG) ++ si->frag_cluster_nr[ci->order]--; + +- VM_BUG_ON(cluster_count(ci) != 0); + /* + * If the swap is discardable, prepare discard the cluster + * instead of free it immediately. The cluster will be freed +@@ -541,160 +520,374 @@ static void free_cluster(struct swap_info_struct *si, unsigned long idx) + */ + if ((si->flags & (SWP_WRITEOK | SWP_PAGE_DISCARD)) == + (SWP_WRITEOK | SWP_PAGE_DISCARD)) { +- swap_cluster_schedule_discard(si, idx); ++ swap_cluster_schedule_discard(si, ci); + return; + } + +- __free_cluster(si, idx); ++ __free_cluster(si, ci); + } + + /* +- * The cluster corresponding to page_nr will be used. The cluster will be +- * removed from free cluster list and its usage counter will be increased by +- * count. 
++ * The cluster corresponding to page_nr will be used. The cluster will not be ++ * added to free cluster list and its usage counter will be increased by 1. ++ * Only used for initialization. + */ +-static void add_cluster_info_page(struct swap_info_struct *p, +- struct swap_cluster_info *cluster_info, unsigned long page_nr, +- unsigned long count) ++static void inc_cluster_info_page(struct swap_info_struct *p, ++ struct swap_cluster_info *cluster_info, unsigned long page_nr) + { + unsigned long idx = page_nr / SWAPFILE_CLUSTER; ++ struct swap_cluster_info *ci; + + if (!cluster_info) + return; +- if (cluster_is_free(&cluster_info[idx])) +- alloc_cluster(p, idx); + +- VM_BUG_ON(cluster_count(&cluster_info[idx]) + count > SWAPFILE_CLUSTER); +- cluster_set_count(&cluster_info[idx], +- cluster_count(&cluster_info[idx]) + count); +-} ++ ci = cluster_info + idx; ++ ci->count++; + +-/* +- * The cluster corresponding to page_nr will be used. The cluster will be +- * removed from free cluster list and its usage counter will be increased by 1. +- */ +-static void inc_cluster_info_page(struct swap_info_struct *p, +- struct swap_cluster_info *cluster_info, unsigned long page_nr) +-{ +- add_cluster_info_page(p, cluster_info, page_nr, 1); ++ VM_BUG_ON(ci->count > SWAPFILE_CLUSTER); ++ VM_BUG_ON(ci->flags); + } + + /* +- * The cluster corresponding to page_nr decreases one usage. If the usage +- * counter becomes 0, which means no page in the cluster is in using, we can +- * optionally discard the cluster and add it to free cluster list. ++ * The cluster ci decreases @nr_pages usage. If the usage counter becomes 0, ++ * which means no page in the cluster is in use, we can optionally discard ++ * the cluster and add it to free cluster list. + */ + static void dec_cluster_info_page(struct swap_info_struct *p, +- struct swap_cluster_info *cluster_info, unsigned long page_nr) ++ struct swap_cluster_info *ci, int nr_pages) + { +- unsigned long idx = page_nr / SWAPFILE_CLUSTER; +- +- if (!cluster_info) ++ if (!p->cluster_info) + return; + +- VM_BUG_ON(cluster_count(&cluster_info[idx]) == 0); +- cluster_set_count(&cluster_info[idx], +- cluster_count(&cluster_info[idx]) - 1); ++ VM_BUG_ON(ci->count < nr_pages); ++ VM_BUG_ON(cluster_is_free(ci)); ++ lockdep_assert_held(&p->lock); ++ lockdep_assert_held(&ci->lock); ++ ci->count -= nr_pages; ++ ++ if (!ci->count) { ++ free_cluster(p, ci); ++ return; ++ } + +- if (cluster_count(&cluster_info[idx]) == 0) +- free_cluster(p, idx); ++ if (!(ci->flags & CLUSTER_FLAG_NONFULL)) { ++ VM_BUG_ON(ci->flags & CLUSTER_FLAG_FREE); ++ if (ci->flags & CLUSTER_FLAG_FRAG) ++ p->frag_cluster_nr[ci->order]--; ++ list_move_tail(&ci->list, &p->nonfull_clusters[ci->order]); ++ ci->flags = CLUSTER_FLAG_NONFULL; ++ } + } + +-/* +- * It's possible scan_swap_map_slots() uses a free cluster in the middle of free +- * cluster list. Avoiding such abuse to avoid list corruption. 
+- */ +-static bool +-scan_swap_map_ssd_cluster_conflict(struct swap_info_struct *si, +- unsigned long offset, int order) ++static bool cluster_reclaim_range(struct swap_info_struct *si, ++ struct swap_cluster_info *ci, ++ unsigned long start, unsigned long end) + { +- struct percpu_cluster *percpu_cluster; +- bool conflict; ++ unsigned char *map = si->swap_map; ++ unsigned long offset; + +- offset /= SWAPFILE_CLUSTER; +- conflict = !cluster_list_empty(&si->free_clusters) && +- offset != cluster_list_first(&si->free_clusters) && +- cluster_is_free(&si->cluster_info[offset]); ++ spin_unlock(&ci->lock); ++ spin_unlock(&si->lock); + +- if (!conflict) +- return false; ++ for (offset = start; offset < end; offset++) { ++ switch (READ_ONCE(map[offset])) { ++ case 0: ++ continue; ++ case SWAP_HAS_CACHE: ++ if (__try_to_reclaim_swap(si, offset, TTRS_ANYWAY | TTRS_DIRECT) > 0) ++ continue; ++ goto out; ++ default: ++ goto out; ++ } ++ } ++out: ++ spin_lock(&si->lock); ++ spin_lock(&ci->lock); ++ ++ /* ++ * Recheck the range no matter reclaim succeeded or not, the slot ++ * could have been be freed while we are not holding the lock. ++ */ ++ for (offset = start; offset < end; offset++) ++ if (READ_ONCE(map[offset])) ++ return false; + +- percpu_cluster = this_cpu_ptr(si->percpu_cluster); +- percpu_cluster->next[order] = SWAP_NEXT_INVALID; + return true; + } + +-static inline bool swap_range_empty(char *swap_map, unsigned int start, +- unsigned int nr_pages) ++static bool cluster_scan_range(struct swap_info_struct *si, ++ struct swap_cluster_info *ci, ++ unsigned long start, unsigned int nr_pages) + { +- unsigned int i; ++ unsigned long offset, end = start + nr_pages; ++ unsigned char *map = si->swap_map; ++ bool need_reclaim = false; + +- for (i = 0; i < nr_pages; i++) { +- if (swap_map[start + i]) ++ for (offset = start; offset < end; offset++) { ++ switch (READ_ONCE(map[offset])) { ++ case 0: ++ continue; ++ case SWAP_HAS_CACHE: ++ if (!vm_swap_full()) ++ return false; ++ need_reclaim = true; ++ continue; ++ default: + return false; ++ } + } + ++ if (need_reclaim) ++ return cluster_reclaim_range(si, ci, start, end); ++ + return true; + } + ++static bool cluster_alloc_range(struct swap_info_struct *si, struct swap_cluster_info *ci, ++ unsigned int start, unsigned char usage, ++ unsigned int order) ++{ ++ unsigned int nr_pages = 1 << order; ++ ++ if (!(si->flags & SWP_WRITEOK)) ++ return false; ++ ++ if (cluster_is_free(ci)) { ++ if (nr_pages < SWAPFILE_CLUSTER) { ++ list_move_tail(&ci->list, &si->nonfull_clusters[order]); ++ ci->flags = CLUSTER_FLAG_NONFULL; ++ } ++ ci->order = order; ++ } ++ ++ memset(si->swap_map + start, usage, nr_pages); ++ swap_range_alloc(si, start, nr_pages); ++ ci->count += nr_pages; ++ ++ if (ci->count == SWAPFILE_CLUSTER) { ++ VM_BUG_ON(!(ci->flags & ++ (CLUSTER_FLAG_FREE | CLUSTER_FLAG_NONFULL | CLUSTER_FLAG_FRAG))); ++ if (ci->flags & CLUSTER_FLAG_FRAG) ++ si->frag_cluster_nr[ci->order]--; ++ list_move_tail(&ci->list, &si->full_clusters); ++ ci->flags = CLUSTER_FLAG_FULL; ++ } ++ ++ return true; ++} ++ ++static unsigned int alloc_swap_scan_cluster(struct swap_info_struct *si, unsigned long offset, ++ unsigned int *foundp, unsigned int order, ++ unsigned char usage) ++{ ++ unsigned long start = offset & ~(SWAPFILE_CLUSTER - 1); ++ unsigned long end = min(start + SWAPFILE_CLUSTER, si->max); ++ unsigned int nr_pages = 1 << order; ++ struct swap_cluster_info *ci; ++ ++ if (end < nr_pages) ++ return SWAP_NEXT_INVALID; ++ end -= nr_pages; ++ ++ ci = lock_cluster(si, offset); 
++ if (ci->count + nr_pages > SWAPFILE_CLUSTER) { ++ offset = SWAP_NEXT_INVALID; ++ goto done; ++ } ++ ++ while (offset <= end) { ++ if (cluster_scan_range(si, ci, offset, nr_pages)) { ++ if (!cluster_alloc_range(si, ci, offset, usage, order)) { ++ offset = SWAP_NEXT_INVALID; ++ goto done; ++ } ++ *foundp = offset; ++ if (ci->count == SWAPFILE_CLUSTER) { ++ offset = SWAP_NEXT_INVALID; ++ goto done; ++ } ++ offset += nr_pages; ++ break; ++ } ++ offset += nr_pages; ++ } ++ if (offset > end) ++ offset = SWAP_NEXT_INVALID; ++done: ++ unlock_cluster(ci); ++ return offset; ++} ++ ++/* Return true if reclaimed a whole cluster */ ++static void swap_reclaim_full_clusters(struct swap_info_struct *si, bool force) ++{ ++ long to_scan = 1; ++ unsigned long offset, end; ++ struct swap_cluster_info *ci; ++ unsigned char *map = si->swap_map; ++ int nr_reclaim; ++ ++ if (force) ++ to_scan = si->inuse_pages / SWAPFILE_CLUSTER; ++ ++ while (!list_empty(&si->full_clusters)) { ++ ci = list_first_entry(&si->full_clusters, struct swap_cluster_info, list); ++ list_move_tail(&ci->list, &si->full_clusters); ++ offset = cluster_offset(si, ci); ++ end = min(si->max, offset + SWAPFILE_CLUSTER); ++ to_scan--; ++ ++ spin_unlock(&si->lock); ++ while (offset < end) { ++ if (READ_ONCE(map[offset]) == SWAP_HAS_CACHE) { ++ nr_reclaim = __try_to_reclaim_swap(si, offset, ++ TTRS_ANYWAY | TTRS_DIRECT); ++ if (nr_reclaim) { ++ offset += abs(nr_reclaim); ++ continue; ++ } ++ } ++ offset++; ++ } ++ spin_lock(&si->lock); ++ ++ if (to_scan <= 0) ++ break; ++ } ++} ++ ++static void swap_reclaim_work(struct work_struct *work) ++{ ++ struct swap_info_struct *si; ++ ++ si = container_of(work, struct swap_info_struct, reclaim_work); ++ ++ spin_lock(&si->lock); ++ swap_reclaim_full_clusters(si, true); ++ spin_unlock(&si->lock); ++} ++ + /* + * Try to get swap entries with specified order from current cpu's swap entry + * pool (a cluster). This might involve allocating a new cluster for current CPU + * too. + */ +-static bool scan_swap_map_try_ssd_cluster(struct swap_info_struct *si, +- unsigned long *offset, unsigned long *scan_base, int order) ++static unsigned long cluster_alloc_swap_entry(struct swap_info_struct *si, int order, ++ unsigned char usage) + { +- unsigned int nr_pages = 1 << order; + struct percpu_cluster *cluster; + struct swap_cluster_info *ci; +- unsigned int tmp, max; ++ unsigned int offset, found = 0; + + new_cluster: ++ lockdep_assert_held(&si->lock); + cluster = this_cpu_ptr(si->percpu_cluster); +- tmp = cluster->next[order]; +- if (tmp == SWAP_NEXT_INVALID) { +- if (!cluster_list_empty(&si->free_clusters)) { +- tmp = cluster_next(&si->free_clusters.head) * +- SWAPFILE_CLUSTER; +- } else if (!cluster_list_empty(&si->discard_clusters)) { +- /* +- * we don't have free cluster but have some clusters in +- * discarding, do discard now and reclaim them, then +- * reread cluster_next_cpu since we dropped si->lock +- */ +- swap_do_scheduled_discard(si); +- *scan_base = this_cpu_read(*si->cluster_next_cpu); +- *offset = *scan_base; +- goto new_cluster; +- } else +- return false; ++ offset = cluster->next[order]; ++ if (offset) { ++ offset = alloc_swap_scan_cluster(si, offset, &found, order, usage); ++ if (found) ++ goto done; + } + +- /* +- * Other CPUs can use our cluster if they can't find a free cluster, +- * check if there is still free entry in the cluster, maintaining +- * natural alignment. 
+- */ +- max = min_t(unsigned long, si->max, ALIGN(tmp + 1, SWAPFILE_CLUSTER)); +- if (tmp < max) { +- ci = lock_cluster(si, tmp); +- while (tmp < max) { +- if (swap_range_empty(si->swap_map, tmp, nr_pages)) ++ if (!list_empty(&si->free_clusters)) { ++ ci = list_first_entry(&si->free_clusters, struct swap_cluster_info, list); ++ offset = alloc_swap_scan_cluster(si, cluster_offset(si, ci), &found, order, usage); ++ /* ++ * Either we didn't touch the cluster due to swapoff, ++ * or the allocation must success. ++ */ ++ VM_BUG_ON((si->flags & SWP_WRITEOK) && !found); ++ goto done; ++ } ++ ++ /* Try reclaim from full clusters if free clusters list is drained */ ++ if (vm_swap_full()) ++ swap_reclaim_full_clusters(si, false); ++ ++ if (order < PMD_ORDER) { ++ unsigned int frags = 0; ++ ++ while (!list_empty(&si->nonfull_clusters[order])) { ++ ci = list_first_entry(&si->nonfull_clusters[order], ++ struct swap_cluster_info, list); ++ list_move_tail(&ci->list, &si->frag_clusters[order]); ++ ci->flags = CLUSTER_FLAG_FRAG; ++ si->frag_cluster_nr[order]++; ++ offset = alloc_swap_scan_cluster(si, cluster_offset(si, ci), ++ &found, order, usage); ++ frags++; ++ if (found) + break; +- tmp += nr_pages; + } +- unlock_cluster(ci); ++ ++ if (!found) { ++ /* ++ * Nonfull clusters are moved to frag tail if we reached ++ * here, count them too, don't over scan the frag list. ++ */ ++ while (frags < si->frag_cluster_nr[order]) { ++ ci = list_first_entry(&si->frag_clusters[order], ++ struct swap_cluster_info, list); ++ /* ++ * Rotate the frag list to iterate, they were all failing ++ * high order allocation or moved here due to per-CPU usage, ++ * this help keeping usable cluster ahead. ++ */ ++ list_move_tail(&ci->list, &si->frag_clusters[order]); ++ offset = alloc_swap_scan_cluster(si, cluster_offset(si, ci), ++ &found, order, usage); ++ frags++; ++ if (found) ++ break; ++ } ++ } + } +- if (tmp >= max) { +- cluster->next[order] = SWAP_NEXT_INVALID; ++ ++ if (found) ++ goto done; ++ ++ if (!list_empty(&si->discard_clusters)) { ++ /* ++ * we don't have free cluster but have some clusters in ++ * discarding, do discard now and reclaim them, then ++ * reread cluster_next_cpu since we dropped si->lock ++ */ ++ swap_do_scheduled_discard(si); + goto new_cluster; + } +- *offset = tmp; +- *scan_base = tmp; +- tmp += nr_pages; +- cluster->next[order] = tmp < max ? tmp : SWAP_NEXT_INVALID; +- return true; ++ ++ if (order) ++ goto done; ++ ++ /* Order 0 stealing from higher order */ ++ for (int o = 1; o < SWAP_NR_ORDERS; o++) { ++ /* ++ * Clusters here have at least one usable slots and can't fail order 0 ++ * allocation, but reclaim may drop si->lock and race with another user. 
++ */ ++ while (!list_empty(&si->frag_clusters[o])) { ++ ci = list_first_entry(&si->frag_clusters[o], ++ struct swap_cluster_info, list); ++ offset = alloc_swap_scan_cluster(si, cluster_offset(si, ci), ++ &found, 0, usage); ++ if (found) ++ goto done; ++ } ++ ++ while (!list_empty(&si->nonfull_clusters[o])) { ++ ci = list_first_entry(&si->nonfull_clusters[o], ++ struct swap_cluster_info, list); ++ offset = alloc_swap_scan_cluster(si, cluster_offset(si, ci), ++ &found, 0, usage); ++ if (found) ++ goto done; ++ } ++ } ++ ++done: ++ cluster->next[order] = offset; ++ return found; + } + + static void __del_from_avail_list(struct swap_info_struct *p) +@@ -727,6 +920,9 @@ static void swap_range_alloc(struct swap_info_struct *si, unsigned long offset, + si->lowest_bit = si->max; + si->highest_bit = 0; + del_from_avail_list(si); ++ ++ if (si->cluster_info && vm_swap_full()) ++ schedule_work(&si->reclaim_work); + } + } + +@@ -765,7 +961,6 @@ static void swap_range_free(struct swap_info_struct *si, unsigned long offset, + swap_slot_free_notify = NULL; + while (offset <= end) { + arch_swap_invalidate_page(si->type, offset); +- zswap_invalidate(si->type, offset); + if (swap_slot_free_notify) + swap_slot_free_notify(si->bdev, offset); + offset++; +@@ -816,11 +1011,33 @@ static bool swap_offset_available_and_locked(struct swap_info_struct *si, + return false; + } + ++static int cluster_alloc_swap(struct swap_info_struct *si, ++ unsigned char usage, int nr, ++ swp_entry_t slots[], int order) ++{ ++ int n_ret = 0; ++ ++ VM_BUG_ON(!si->cluster_info); ++ ++ si->flags += SWP_SCANNING; ++ ++ while (n_ret < nr) { ++ unsigned long offset = cluster_alloc_swap_entry(si, order, usage); ++ ++ if (!offset) ++ break; ++ slots[n_ret++] = swp_entry(si->type, offset); ++ } ++ ++ si->flags -= SWP_SCANNING; ++ ++ return n_ret; ++} ++ + static int scan_swap_map_slots(struct swap_info_struct *si, + unsigned char usage, int nr, + swp_entry_t slots[], int order) + { +- struct swap_cluster_info *ci; + unsigned long offset; + unsigned long scan_base; + unsigned long last_in_cluster = 0; +@@ -859,26 +1076,16 @@ static int scan_swap_map_slots(struct swap_info_struct *si, + return 0; + } + ++ if (si->cluster_info) ++ return cluster_alloc_swap(si, usage, nr, slots, order); ++ + si->flags += SWP_SCANNING; +- /* +- * Use percpu scan base for SSD to reduce lock contention on +- * cluster and swap cache. For HDD, sequential access is more +- * important. +- */ +- if (si->flags & SWP_SOLIDSTATE) +- scan_base = this_cpu_read(*si->cluster_next_cpu); +- else +- scan_base = si->cluster_next; ++ ++ /* For HDD, sequential access is more important. */ ++ scan_base = si->cluster_next; + offset = scan_base; + +- /* SSD algorithm */ +- if (si->cluster_info) { +- if (!scan_swap_map_try_ssd_cluster(si, &offset, &scan_base, order)) { +- if (order > 0) +- goto no_page; +- goto scan; +- } +- } else if (unlikely(!si->cluster_nr--)) { ++ if (unlikely(!si->cluster_nr--)) { + if (si->pages - si->inuse_pages < SWAPFILE_CLUSTER) { + si->cluster_nr = SWAPFILE_CLUSTER - 1; + goto checks; +@@ -889,8 +1096,6 @@ static int scan_swap_map_slots(struct swap_info_struct *si, + /* + * If seek is expensive, start searching for new cluster from + * start of partition, to minimize the span of allocated swap. +- * If seek is cheap, that is the SWP_SOLIDSTATE si->cluster_info +- * case, just handled by scan_swap_map_try_ssd_cluster() above. 
+ */ + scan_base = offset = si->lowest_bit; + last_in_cluster = offset + SWAPFILE_CLUSTER - 1; +@@ -918,19 +1123,6 @@ static int scan_swap_map_slots(struct swap_info_struct *si, + } + + checks: +- if (si->cluster_info) { +- while (scan_swap_map_ssd_cluster_conflict(si, offset, order)) { +- /* take a break if we already got some slots */ +- if (n_ret) +- goto done; +- if (!scan_swap_map_try_ssd_cluster(si, &offset, +- &scan_base, order)) { +- if (order > 0) +- goto no_page; +- goto scan; +- } +- } +- } + if (!(si->flags & SWP_WRITEOK)) + goto no_page; + if (!si->highest_bit) +@@ -938,13 +1130,11 @@ static int scan_swap_map_slots(struct swap_info_struct *si, + if (offset > si->highest_bit) + scan_base = offset = si->lowest_bit; + +- ci = lock_cluster(si, offset); + /* reuse swap entry of cache-only swap if not busy. */ + if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) { + int swap_was_freed; +- unlock_cluster(ci); + spin_unlock(&si->lock); +- swap_was_freed = __try_to_reclaim_swap(si, offset, TTRS_ANYWAY); ++ swap_was_freed = __try_to_reclaim_swap(si, offset, TTRS_ANYWAY | TTRS_DIRECT); + spin_lock(&si->lock); + /* entry was freed successfully, try to use this again */ + if (swap_was_freed > 0) +@@ -953,15 +1143,12 @@ static int scan_swap_map_slots(struct swap_info_struct *si, + } + + if (si->swap_map[offset]) { +- unlock_cluster(ci); + if (!n_ret) + goto scan; + else + goto done; + } + memset(si->swap_map + offset, usage, nr_pages); +- add_cluster_info_page(si, si->cluster_info, offset, nr_pages); +- unlock_cluster(ci); + + swap_range_alloc(si, offset, nr_pages); + slots[n_ret++] = swp_entry(si->type, offset); +@@ -982,13 +1169,7 @@ static int scan_swap_map_slots(struct swap_info_struct *si, + latency_ration = LATENCY_LIMIT; + } + +- /* try to get more slots in cluster */ +- if (si->cluster_info) { +- if (scan_swap_map_try_ssd_cluster(si, &offset, &scan_base, order)) +- goto checks; +- if (order > 0) +- goto done; +- } else if (si->cluster_nr && !si->swap_map[++offset]) { ++ if (si->cluster_nr && !si->swap_map[++offset]) { + /* non-ssd case, still more slots in cluster? */ + --si->cluster_nr; + goto checks; +@@ -1049,19 +1230,6 @@ static int scan_swap_map_slots(struct swap_info_struct *si, + return n_ret; + } + +-static void swap_free_cluster(struct swap_info_struct *si, unsigned long idx) +-{ +- unsigned long offset = idx * SWAPFILE_CLUSTER; +- struct swap_cluster_info *ci; +- +- ci = lock_cluster(si, offset); +- memset(si->swap_map + offset, 0, SWAPFILE_CLUSTER); +- cluster_set_count_flag(ci, 0, 0); +- free_cluster(si, idx); +- unlock_cluster(ci); +- swap_range_free(si, offset, SWAPFILE_CLUSTER); +-} +- + #ifdef CONFIG_MEMCG_SWAP_QOS + int write_swapfile_for_memcg(struct address_space *mapping, int *swap_type) + { +@@ -1409,21 +1577,28 @@ static unsigned char __swap_entry_free(struct swap_info_struct *p, + return usage; + } + +-static void swap_entry_free(struct swap_info_struct *p, swp_entry_t entry) ++/* ++ * Drop the last HAS_CACHE flag of swap entries, caller have to ++ * ensure all entries belong to the same cgroup. 
++ */ ++static void swap_entry_range_free(struct swap_info_struct *p, swp_entry_t entry, ++ unsigned int nr_pages) + { +- struct swap_cluster_info *ci; + unsigned long offset = swp_offset(entry); +- unsigned char count; ++ unsigned char *map = p->swap_map + offset; ++ unsigned char *map_end = map + nr_pages; ++ struct swap_cluster_info *ci; + + ci = lock_cluster(p, offset); +- count = p->swap_map[offset]; +- VM_BUG_ON(count != SWAP_HAS_CACHE); +- p->swap_map[offset] = 0; +- dec_cluster_info_page(p, p->cluster_info, offset); ++ do { ++ VM_BUG_ON(*map != SWAP_HAS_CACHE); ++ *map = 0; ++ } while (++map < map_end); ++ dec_cluster_info_page(p, ci, nr_pages); + unlock_cluster(ci); + +- mem_cgroup_uncharge_swap(entry, 1); +- swap_range_free(p, offset, 1); ++ mem_cgroup_uncharge_swap(entry, nr_pages); ++ swap_range_free(p, offset, nr_pages); + } + + static void cluster_swap_free_nr(struct swap_info_struct *sis, +@@ -1484,12 +1659,8 @@ void swap_free_nr(swp_entry_t entry, int nr_pages) + void put_swap_folio(struct folio *folio, swp_entry_t entry) + { + unsigned long offset = swp_offset(entry); +- unsigned long idx = offset / SWAPFILE_CLUSTER; + struct swap_cluster_info *ci; + struct swap_info_struct *si; +- unsigned char *map; +- unsigned int i, free_entries = 0; +- unsigned char val; + int size = 1 << swap_entry_order(folio_order(folio)); + + si = _swap_info_get(entry); +@@ -1497,24 +1668,14 @@ void put_swap_folio(struct folio *folio, swp_entry_t entry) + return; + + ci = lock_cluster_or_swap_info(si, offset); +- if (size == SWAPFILE_CLUSTER) { +- map = si->swap_map + offset; +- for (i = 0; i < SWAPFILE_CLUSTER; i++) { +- val = map[i]; +- VM_BUG_ON(!(val & SWAP_HAS_CACHE)); +- if (val == SWAP_HAS_CACHE) +- free_entries++; +- } +- if (free_entries == SWAPFILE_CLUSTER) { +- unlock_cluster_or_swap_info(si, ci); +- spin_lock(&si->lock); +- mem_cgroup_uncharge_swap(entry, SWAPFILE_CLUSTER); +- swap_free_cluster(si, idx); +- spin_unlock(&si->lock); +- return; +- } ++ if (size > 1 && swap_is_has_cache(si, offset, size)) { ++ unlock_cluster_or_swap_info(si, ci); ++ spin_lock(&si->lock); ++ swap_entry_range_free(si, entry, size); ++ spin_unlock(&si->lock); ++ return; + } +- for (i = 0; i < size; i++, entry.val++) { ++ for (int i = 0; i < size; i++, entry.val++) { + if (!__swap_entry_free_locked(si, offset + i, SWAP_HAS_CACHE)) { + unlock_cluster_or_swap_info(si, ci); + free_swap_slot(entry); +@@ -1554,7 +1715,7 @@ void swapcache_free_entries(swp_entry_t *entries, int n) + for (i = 0; i < n; ++i) { + p = swap_info_get_cont(entries[i], prev); + if (p) +- swap_entry_free(p, entries[i]); ++ swap_entry_range_free(p, entries[i], 1); + prev = p; + } + if (p) +@@ -1674,16 +1835,7 @@ static bool folio_swapped(struct folio *folio) + return swap_page_trans_huge_swapped(si, entry, folio_order(folio)); + } + +-/** +- * folio_free_swap() - Free the swap space used for this folio. +- * @folio: The folio to remove. +- * +- * If swap is getting full, or if there are no more mappings of this folio, +- * then call folio_free_swap to free its swap space. +- * +- * Return: true if we were able to release the swap space. 
+- */ +-bool folio_free_swap(struct folio *folio) ++static bool folio_swapcache_freeable(struct folio *folio) + { + VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); + +@@ -1691,8 +1843,6 @@ bool folio_free_swap(struct folio *folio) + return false; + if (folio_test_writeback(folio)) + return false; +- if (folio_swapped(folio)) +- return false; + + /* + * Once hibernation has begun to create its image of memory, +@@ -1712,6 +1862,25 @@ bool folio_free_swap(struct folio *folio) + if (pm_suspended_storage()) + return false; + ++ return true; ++} ++ ++/** ++ * folio_free_swap() - Free the swap space used for this folio. ++ * @folio: The folio to remove. ++ * ++ * If swap is getting full, or if there are no more mappings of this folio, ++ * then call folio_free_swap to free its swap space. ++ * ++ * Return: true if we were able to release the swap space. ++ */ ++bool folio_free_swap(struct folio *folio) ++{ ++ if (!folio_swapcache_freeable(folio)) ++ return false; ++ if (folio_swapped(folio)) ++ return false; ++ + delete_from_swap_cache(folio); + folio_set_dirty(folio); + return true; +@@ -1788,7 +1957,7 @@ void free_swap_and_cache_nr(swp_entry_t entry, int nr) + * to the next boundary. + */ + nr = __try_to_reclaim_swap(si, offset, +- TTRS_UNMAPPED | TTRS_FULL); ++ TTRS_UNMAPPED | TTRS_FULL); + if (nr == 0) + nr = 1; + else if (nr < 0) +@@ -2686,6 +2855,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) + wait_for_completion(&p->comp); + + flush_work(&p->discard_work); ++ flush_work(&p->reclaim_work); + + destroy_swap_extents(p); + if (p->flags & SWP_CONTINUED) +@@ -3114,8 +3284,15 @@ static int setup_swap_map_and_extents(struct swap_info_struct *p, + + nr_good_pages = maxpages - 1; /* omit header page */ + +- cluster_list_init(&p->free_clusters); +- cluster_list_init(&p->discard_clusters); ++ INIT_LIST_HEAD(&p->free_clusters); ++ INIT_LIST_HEAD(&p->full_clusters); ++ INIT_LIST_HEAD(&p->discard_clusters); ++ ++ for (i = 0; i < SWAP_NR_ORDERS; i++) { ++ INIT_LIST_HEAD(&p->nonfull_clusters[i]); ++ INIT_LIST_HEAD(&p->frag_clusters[i]); ++ p->frag_cluster_nr[i] = 0; ++ } + + for (i = 0; i < swap_header->info.nr_badpages; i++) { + unsigned int page_nr = swap_header->info.badpages[i]; +@@ -3158,7 +3335,6 @@ static int setup_swap_map_and_extents(struct swap_info_struct *p, + if (!cluster_info) + return nr_extents; + +- + /* + * Reduce false cache line sharing between cluster_info and + * sharing same address space. 
+@@ -3166,14 +3342,18 @@ static int setup_swap_map_and_extents(struct swap_info_struct *p, + for (k = 0; k < SWAP_CLUSTER_COLS; k++) { + j = (k + col) % SWAP_CLUSTER_COLS; + for (i = 0; i < DIV_ROUND_UP(nr_clusters, SWAP_CLUSTER_COLS); i++) { ++ struct swap_cluster_info *ci; + idx = i * SWAP_CLUSTER_COLS + j; ++ ci = cluster_info + idx; + if (idx >= nr_clusters) + continue; +- if (cluster_count(&cluster_info[idx])) ++ if (ci->count) { ++ ci->flags = CLUSTER_FLAG_NONFULL; ++ list_add_tail(&ci->list, &p->nonfull_clusters[0]); + continue; +- cluster_set_flag(&cluster_info[idx], CLUSTER_FLAG_FREE); +- cluster_list_add_tail(&p->free_clusters, cluster_info, +- idx); ++ } ++ ci->flags = CLUSTER_FLAG_FREE; ++ list_add_tail(&ci->list, &p->free_clusters); + } + } + return nr_extents; +@@ -3212,6 +3392,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) + return PTR_ERR(p); + + INIT_WORK(&p->discard_work, swap_discard_work); ++ INIT_WORK(&p->reclaim_work, swap_reclaim_work); + + name = getname(specialfile); + if (IS_ERR(name)) { +diff --git a/mm/zswap.c b/mm/zswap.c +index 69681b9173fd..5acda5b906bc 100644 +--- a/mm/zswap.c ++++ b/mm/zswap.c +@@ -1482,9 +1482,10 @@ bool zswap_load(struct folio *folio) + return ret; + } + +-void zswap_invalidate(int type, pgoff_t offset) ++void zswap_invalidate(swp_entry_t swp) + { +- struct zswap_tree *tree = zswap_trees[type]; ++ pgoff_t offset = swp_offset(swp); ++ struct zswap_tree *tree = zswap_trees[swp_type(swp)]; + struct zswap_entry *entry; + + /* find */ +diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c +index ca8cc0988b61..bd032ac2376e 100644 +--- a/net/ipv4/inet_connection_sock.c ++++ b/net/ipv4/inet_connection_sock.c +@@ -1124,7 +1124,7 @@ static void reqsk_timer_handler(struct timer_list *t) + + drop: + __inet_csk_reqsk_queue_drop(sk_listener, oreq, true); +- reqsk_put(req); ++ reqsk_put(oreq); + } + + static bool reqsk_queue_hash_req(struct request_sock *req, +diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h +index c112c6f7c766..9b302242be6c 100644 +--- a/tools/include/uapi/linux/bpf.h ++++ b/tools/include/uapi/linux/bpf.h +@@ -6576,6 +6576,15 @@ struct bpf_link_info { + __u64 config; + __u32 type; + } event; /* BPF_PERF_EVENT_EVENT */ ++ struct { ++ __u64:64; ++ __u32:32; ++ __u32:32; ++ __u64:64; ++ __u64:64; ++ __u64:64; ++ __u64:64; ++ } kabi_reserve; + }; + } perf_event; + struct { diff --git a/kernel.spec b/kernel.spec index 884d5480d44cdbbbe954315eb3b1ef5ac30ee3c9..2fd5f70e4d2334fbcc245ef00a90aeccce2c1749 100644 --- a/kernel.spec +++ b/kernel.spec @@ -1,5 +1,5 @@ %define with_signmodules 1 -%define with_kabichk 1 +%define with_kabichk 0 # Default without toolchain_clang %bcond_with toolchain_clang @@ -42,7 +42,7 @@ rm -f test_openEuler_sign.ko test_openEuler_sign.ko.sig %global upstream_sublevel 0 %global devel_release 68 %global maintenance_release .0.0 -%global pkg_release .73 +%global pkg_release .74 %global openeuler_lts 1 %global openeuler_major 2403 @@ -128,6 +128,7 @@ Patch0001: 0001-riscv-kernel.patch Patch0002: 0002-cpupower-clang-compile-support.patch Patch0003: 0003-x86_energy_perf_policy-clang-compile-support.patch Patch0004: 0004-turbostat-clang-compile-support.patch +Patch0006: 0006-kabi_test.patch #BuildRequires: BuildRequires: module-init-tools, patch >= 2.5.4, bash >= 2.03, tar @@ -369,6 +370,8 @@ Applypatches series.conf %{_builddir}/kernel-%{version}/linux-%{KernelVer} %patch0004 -p1 %endif +%patch0006 -p1 + find . 
\( -name "*.orig" -o -name "*~" \) -exec rm -f {} \; >/dev/null find . -name .gitignore -exec rm -f {} \; >/dev/null @@ -1089,6 +1092,9 @@ fi %endif %changelog +* Wed Dec 18 2024 Zheng Zengkai - 6.6.0-68.0.0.74 +- kabi: add kabi test patch for performance testing + * Tue Dec 17 2024 Xie XiuQi - 6.6.0-68.0.0.73 - kabi: add kabi_ext1 list for checking - check-kabi: fix kabi check failed when no namespace
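Reviewer note on the kABI padding exercised above: the kabi_test patch grows several exported structs only through KABI_RESERVE() placeholder slots while kABI checking is temporarily switched off in kernel.spec. The sketch below illustrates, under simplified assumed macro definitions (the real macros ship in the kernel tree, typically include/linux/kabi.h, and carry extra size checks), how a reserved slot can later be repurposed with KABI_USE() without changing structure size or member offsets; struct example_v1/example_v2 and the cookie field are hypothetical names used only for this illustration.

/*
 * Minimal sketch of the reserve/use padding pattern, with assumed
 * simplified macro definitions (not the kernel's real kabi.h).
 */
#include <stdint.h>
#include <stddef.h>
#include <assert.h>

/* Assumption: each reserve slot is an unused 64-bit placeholder member. */
#define KABI_RESERVE(n)   uint64_t kabi_reserved##n;
/* Repurposing a slot overlays the new member on the old placeholder. */
#define KABI_USE(n, _new) union { _new; uint64_t kabi_reserved##n; };

struct example_v1 {          /* shipped ABI: two spare slots */
	int state;
	KABI_RESERVE(1)
	KABI_RESERVE(2)
};

struct example_v2 {          /* later update: slot 1 now carries a cookie */
	int state;
	KABI_USE(1, uint32_t cookie)
	KABI_RESERVE(2)
};

int main(void)
{
	/* Size and remaining offsets are unchanged, so modules built
	 * against the v1 layout still see a compatible structure. */
	assert(sizeof(struct example_v1) == sizeof(struct example_v2));
	assert(offsetof(struct example_v1, kabi_reserved2) ==
	       offsetof(struct example_v2, kabi_reserved2));
	return 0;
}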