diff --git a/0006-kabi_test.patch b/0006-kabi_test.patch new file mode 100644 index 0000000000000000000000000000000000000000..68850724369f15dbf37378f44970dd91302faeda --- /dev/null +++ b/0006-kabi_test.patch @@ -0,0 +1,3831 @@ +diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig +index fcd0c3b2065d..a6bbe6029121 100644 +--- a/arch/x86/Kconfig ++++ b/arch/x86/Kconfig +@@ -2075,6 +2075,7 @@ config HYGON_CSV + bool "Hygon secure virtualization CSV support" + default y + depends on CPU_SUP_HYGON && AMD_MEM_ENCRYPT ++ select CMA + help + Hygon CSV integrates secure processor, memory encryption and + memory isolation to provide the ability to protect guest's private +diff --git a/arch/x86/configs/openeuler_defconfig b/arch/x86/configs/openeuler_defconfig +index 8e8542796a13..adfaef0cb10c 100644 +--- a/arch/x86/configs/openeuler_defconfig ++++ b/arch/x86/configs/openeuler_defconfig +@@ -1158,7 +1158,11 @@ CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK=y + CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK=y + CONFIG_USE_PERCPU_NUMA_NODE_ID=y + CONFIG_HAVE_SETUP_PER_CPU_AREA=y +-# CONFIG_CMA is not set ++CONFIG_CMA=y ++# CONFIG_CMA_DEBUG is not set ++# CONFIG_CMA_DEBUGFS is not set ++# CONFIG_CMA_SYSFS is not set ++CONFIG_CMA_AREAS=19 + CONFIG_MEM_SOFT_DIRTY=y + CONFIG_GENERIC_EARLY_IOREMAP=y + CONFIG_DEFERRED_STRUCT_PAGE_INIT=y +@@ -9018,6 +9022,18 @@ CONFIG_ARCH_HAS_FORCE_DMA_UNENCRYPTED=y + CONFIG_SWIOTLB=y + # CONFIG_SWIOTLB_DYNAMIC is not set + CONFIG_DMA_COHERENT_POOL=y ++CONFIG_DMA_CMA=y ++# CONFIG_DMA_NUMA_CMA is not set ++ ++# ++# Default contiguous memory area size: ++# ++CONFIG_CMA_SIZE_MBYTES=0 ++CONFIG_CMA_SIZE_SEL_MBYTES=y ++# CONFIG_CMA_SIZE_SEL_PERCENTAGE is not set ++# CONFIG_CMA_SIZE_SEL_MIN is not set ++# CONFIG_CMA_SIZE_SEL_MAX is not set ++CONFIG_CMA_ALIGNMENT=8 + # CONFIG_DMA_API_DEBUG is not set + # CONFIG_DMA_MAP_BENCHMARK is not set + CONFIG_SGL_ALLOC=y +diff --git a/fs/nfs/nfs4super.c b/fs/nfs/nfs4super.c +index bb13894ad152..e87f878178f3 100644 +--- a/fs/nfs/nfs4super.c ++++ b/fs/nfs/nfs4super.c +@@ -209,7 +209,7 @@ static int do_nfs4_mount(struct nfs_server *server, + if (IS_ERR(dentry)) + return PTR_ERR(dentry); + +- dentry->d_sb->s_flags = fc->sb_flags; ++ dentry->d_sb->s_flags |= (fc->sb_flags & SB_RDONLY); + fc->root = dentry; + return 0; + } +diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h +index 6e3227a688de..f3fd0407d346 100644 +--- a/include/linux/cgroup-defs.h ++++ b/include/linux/cgroup-defs.h +@@ -325,6 +325,8 @@ struct cgroup_base_stat { + #ifdef CONFIG_SCHED_CORE + u64 forceidle_sum; + #endif ++ KABI_RESERVE(1) ++ KABI_RESERVE(2) + }; + + /* +@@ -555,6 +557,9 @@ struct cgroup { + KABI_RESERVE(3) + KABI_RESERVE(4) + KABI_RESERVE(5) ++ KABI_RESERVE(6) ++ KABI_RESERVE(7) ++ KABI_RESERVE(8) + /* All ancestors including self */ + struct cgroup *ancestors[]; + }; +@@ -573,6 +578,10 @@ struct cgroup_root { + /* Unique id for this hierarchy. */ + int hierarchy_id; + ++ /* A list running through the active hierarchies */ ++ struct list_head root_list; ++ struct rcu_head rcu; /* Must be near the top */ ++ + /* + * The root cgroup. The containing cgroup_root will be destroyed on its + * release. 
cgrp->ancestors[0] will be used overflowing into the +@@ -589,9 +598,6 @@ struct cgroup_root { + /* Wait while cgroups are being destroyed */ + wait_queue_head_t wait; + +- /* A list running through the active hierarchies */ +- struct list_head root_list; +- + /* Hierarchy-specific flags */ + unsigned int flags; + +@@ -605,6 +611,8 @@ struct cgroup_root { + KABI_RESERVE(2) + KABI_RESERVE(3) + KABI_RESERVE(4) ++ KABI_RESERVE(5) ++ KABI_RESERVE(6) + }; + + /* +diff --git a/include/linux/i2c.h b/include/linux/i2c.h +index 32cf5708d5a5..3fd6932bf8cd 100644 +--- a/include/linux/i2c.h ++++ b/include/linux/i2c.h +@@ -746,6 +746,9 @@ struct i2c_adapter { + + struct irq_domain *host_notify_domain; + struct regulator *bus_regulator; ++ ++ KABI_RESERVE(1) ++ KABI_RESERVE(2) + }; + #define to_i2c_adapter(d) container_of(d, struct i2c_adapter, dev) + +diff --git a/include/linux/iommu.h b/include/linux/iommu.h +index bb463cb96a44..83ec4bf9809e 100644 +--- a/include/linux/iommu.h ++++ b/include/linux/iommu.h +@@ -155,6 +155,10 @@ struct iopf_group { + KABI_USE(2, u32 cookie) + KABI_RESERVE(3) + KABI_RESERVE(4) ++ KABI_RESERVE(5) ++ KABI_RESERVE(6) ++ KABI_RESERVE(7) ++ KABI_RESERVE(8) + }; + + struct iopf_group_extend { +diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h +index b2a80e089a0a..abe236201e68 100644 +--- a/include/linux/memcontrol.h ++++ b/include/linux/memcontrol.h +@@ -429,6 +429,14 @@ struct mem_cgroup { + KABI_RESERVE(6) + KABI_RESERVE(7) + KABI_RESERVE(8) ++ KABI_RESERVE(9) ++ KABI_RESERVE(10) ++ KABI_RESERVE(11) ++ KABI_RESERVE(12) ++ KABI_RESERVE(13) ++ KABI_RESERVE(14) ++ KABI_RESERVE(15) ++ KABI_RESERVE(16) + struct mem_cgroup_per_node *nodeinfo[]; + }; + +diff --git a/include/linux/mm.h b/include/linux/mm.h +index 2e6ef9532fc3..b6dcdaafc592 100644 +--- a/include/linux/mm.h ++++ b/include/linux/mm.h +@@ -3819,24 +3819,22 @@ static inline bool page_is_guard(struct page *page) + return PageGuard(page); + } + +-bool __set_page_guard(struct zone *zone, struct page *page, unsigned int order, +- int migratetype); ++bool __set_page_guard(struct zone *zone, struct page *page, unsigned int order); + static inline bool set_page_guard(struct zone *zone, struct page *page, +- unsigned int order, int migratetype) ++ unsigned int order) + { + if (!debug_guardpage_enabled()) + return false; +- return __set_page_guard(zone, page, order, migratetype); ++ return __set_page_guard(zone, page, order); + } + +-void __clear_page_guard(struct zone *zone, struct page *page, unsigned int order, +- int migratetype); ++void __clear_page_guard(struct zone *zone, struct page *page, unsigned int order); + static inline void clear_page_guard(struct zone *zone, struct page *page, +- unsigned int order, int migratetype) ++ unsigned int order) + { + if (!debug_guardpage_enabled()) + return; +- __clear_page_guard(zone, page, order, migratetype); ++ __clear_page_guard(zone, page, order); + } + + #else /* CONFIG_DEBUG_PAGEALLOC */ +@@ -3846,9 +3844,9 @@ static inline unsigned int debug_guardpage_minorder(void) { return 0; } + static inline bool debug_guardpage_enabled(void) { return false; } + static inline bool page_is_guard(struct page *page) { return false; } + static inline bool set_page_guard(struct zone *zone, struct page *page, +- unsigned int order, int migratetype) { return false; } ++ unsigned int order) { return false; } + static inline void clear_page_guard(struct zone *zone, struct page *page, +- unsigned int order, int migratetype) {} ++ unsigned int order) {} + #endif /* 
CONFIG_DEBUG_PAGEALLOC */ + + #ifdef __HAVE_ARCH_GATE_AREA +diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h +index 3cee238de7c8..18bee72ebc71 100644 +--- a/include/linux/mmzone.h ++++ b/include/linux/mmzone.h +@@ -865,6 +865,7 @@ struct zone { + unsigned long watermark_boost; + + unsigned long nr_reserved_highatomic; ++ unsigned long nr_free_highatomic; + + /* + * We don't know if the memory that we're going to allocate will be +diff --git a/include/linux/msi.h b/include/linux/msi.h +index 7354ffb14856..5fd8a6caae98 100644 +--- a/include/linux/msi.h ++++ b/include/linux/msi.h +@@ -205,15 +205,12 @@ struct msi_desc { + union { + struct pci_msi_desc pci; + struct msi_desc_data data; +- KABI_RESERVE(1) +- KABI_RESERVE(2) +- KABI_RESERVE(3) +- KABI_RESERVE(4) ++ KABI_EXTEND_WITH_SIZE(KABI_RESERVE(1), 5) + }; ++ KABI_RESERVE(2) ++ KABI_RESERVE(3) ++ KABI_RESERVE(4) + KABI_RESERVE(5) +- KABI_RESERVE(6) +- KABI_RESERVE(7) +- KABI_RESERVE(8) + }; + + /* +diff --git a/include/linux/page-isolation.h b/include/linux/page-isolation.h +index 4ac34392823a..c16db0067090 100644 +--- a/include/linux/page-isolation.h ++++ b/include/linux/page-isolation.h +@@ -34,8 +34,9 @@ static inline bool is_migrate_isolate(int migratetype) + #define REPORT_FAILURE 0x2 + + void set_pageblock_migratetype(struct page *page, int migratetype); +-int move_freepages_block(struct zone *zone, struct page *page, +- int migratetype, int *num_movable); ++ ++bool move_freepages_block_isolate(struct zone *zone, struct page *page, ++ int migratetype); + + int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn, + int migratetype, int flags, gfp_t gfp_flags); +diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h +index 429627abfef4..e44e377661f2 100644 +--- a/include/linux/pagemap.h ++++ b/include/linux/pagemap.h +@@ -203,12 +203,21 @@ enum mapping_flags { + AS_EXITING = 4, /* final truncate in progress */ + /* writeback related tags are not used */ + AS_NO_WRITEBACK_TAGS = 5, +- AS_LARGE_FOLIO_SUPPORT = 6, +- AS_RELEASE_ALWAYS, /* Call ->release_folio(), even if no private data */ +- AS_STABLE_WRITES, /* must wait for writeback before modifying ++ AS_RELEASE_ALWAYS = 6, /* Call ->release_folio(), even if no private data */ ++ AS_STABLE_WRITES = 7, /* must wait for writeback before modifying + folio contents */ ++ AS_INACCESSIBLE = 8, /* Do not attempt direct R/W access to the mapping */ ++ /* Bits 16-25 are used for FOLIO_ORDER */ ++ AS_FOLIO_ORDER_BITS = 5, ++ AS_FOLIO_ORDER_MIN = 16, ++ AS_FOLIO_ORDER_MAX = AS_FOLIO_ORDER_MIN + AS_FOLIO_ORDER_BITS, + }; + ++#define AS_FOLIO_ORDER_BITS_MASK ((1u << AS_FOLIO_ORDER_BITS) - 1) ++#define AS_FOLIO_ORDER_MIN_MASK (AS_FOLIO_ORDER_BITS_MASK << AS_FOLIO_ORDER_MIN) ++#define AS_FOLIO_ORDER_MAX_MASK (AS_FOLIO_ORDER_BITS_MASK << AS_FOLIO_ORDER_MAX) ++#define AS_FOLIO_ORDER_MASK (AS_FOLIO_ORDER_MIN_MASK | AS_FOLIO_ORDER_MAX_MASK) ++ + /** + * mapping_set_error - record a writeback error in the address_space + * @mapping: the mapping in which an error should be set +@@ -348,9 +357,51 @@ static inline void mapping_set_gfp_mask(struct address_space *m, gfp_t mask) + #define MAX_XAS_ORDER (XA_CHUNK_SHIFT * 2 - 1) + #define MAX_PAGECACHE_ORDER min(MAX_XAS_ORDER, PREFERRED_MAX_PAGECACHE_ORDER) + ++/* ++ * mapping_set_folio_order_range() - Set the orders supported by a file. ++ * @mapping: The address space of the file. ++ * @min: Minimum folio order (between 0-MAX_PAGECACHE_ORDER inclusive). 
++ * @max: Maximum folio order (between @min-MAX_PAGECACHE_ORDER inclusive). ++ * ++ * The filesystem should call this function in its inode constructor to ++ * indicate which base size (min) and maximum size (max) of folio the VFS ++ * can use to cache the contents of the file. This should only be used ++ * if the filesystem needs special handling of folio sizes (ie there is ++ * something the core cannot know). ++ * Do not tune it based on, eg, i_size. ++ * ++ * Context: This should not be called while the inode is active as it ++ * is non-atomic. ++ */ ++static inline void mapping_set_folio_order_range(struct address_space *mapping, ++ unsigned int min, ++ unsigned int max) ++{ ++ if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) ++ return; ++ ++ if (min > MAX_PAGECACHE_ORDER) ++ min = MAX_PAGECACHE_ORDER; ++ ++ if (max > MAX_PAGECACHE_ORDER) ++ max = MAX_PAGECACHE_ORDER; ++ ++ if (max < min) ++ max = min; ++ ++ mapping->flags = (mapping->flags & ~AS_FOLIO_ORDER_MASK) | ++ (min << AS_FOLIO_ORDER_MIN) | (max << AS_FOLIO_ORDER_MAX); ++} ++ ++static inline void mapping_set_folio_min_order(struct address_space *mapping, ++ unsigned int min) ++{ ++ mapping_set_folio_order_range(mapping, min, MAX_PAGECACHE_ORDER); ++} ++ + /** + * mapping_set_large_folios() - Indicate the file supports large folios. +- * @mapping: The file. ++ * @mapping: The address space of the file. + * + * The filesystem should call this function in its inode constructor to + * indicate that the VFS can use large folios to cache the contents of +@@ -361,7 +412,23 @@ static inline void mapping_set_gfp_mask(struct address_space *m, gfp_t mask) + */ + static inline void mapping_set_large_folios(struct address_space *mapping) + { +- __set_bit(AS_LARGE_FOLIO_SUPPORT, &mapping->flags); ++ mapping_set_folio_order_range(mapping, 0, MAX_PAGECACHE_ORDER); ++} ++ ++static inline unsigned int ++mapping_max_folio_order(const struct address_space *mapping) ++{ ++ if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) ++ return 0; ++ return (mapping->flags & AS_FOLIO_ORDER_MAX_MASK) >> AS_FOLIO_ORDER_MAX; ++} ++ ++static inline unsigned int ++mapping_min_folio_order(const struct address_space *mapping) ++{ ++ if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) ++ return 0; ++ return (mapping->flags & AS_FOLIO_ORDER_MIN_MASK) >> AS_FOLIO_ORDER_MIN; + } + + /** +@@ -375,7 +442,7 @@ static inline void mapping_set_large_folios(struct address_space *mapping) + static inline void mapping_clear_large_folios(struct address_space *mapping) + { + WARN_ON_ONCE(!rwsem_is_locked(&mapping->invalidate_lock)); +- __clear_bit(AS_LARGE_FOLIO_SUPPORT, &mapping->flags); ++ mapping_set_folio_order_range(mapping, 0, 0); + } + + /* +@@ -384,20 +451,17 @@ static inline void mapping_clear_large_folios(struct address_space *mapping) + */ + static inline bool mapping_large_folio_support(struct address_space *mapping) + { +- /* AS_LARGE_FOLIO_SUPPORT is only reasonable for pagecache folios */ ++ /* AS_FOLIO_ORDER is only reasonable for pagecache folios */ + VM_WARN_ONCE((unsigned long)mapping & PAGE_MAPPING_ANON, + "Anonymous mapping always supports large folio"); + +- return IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && +- test_bit(AS_LARGE_FOLIO_SUPPORT, &mapping->flags); ++ return mapping_max_folio_order(mapping) > 0; + } + + /* Return the maximum folio size for this pagecache mapping, in bytes. 
*/ +-static inline size_t mapping_max_folio_size(struct address_space *mapping) ++static inline size_t mapping_max_folio_size(const struct address_space *mapping) + { +- if (mapping_large_folio_support(mapping)) +- return PAGE_SIZE << MAX_PAGECACHE_ORDER; +- return PAGE_SIZE; ++ return PAGE_SIZE << mapping_max_folio_order(mapping); + } + + static inline int filemap_nr_thps(struct address_space *mapping) +diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h +index 89f2a02db563..fe692e9bd0b2 100644 +--- a/include/linux/perf_event.h ++++ b/include/linux/perf_event.h +@@ -1010,6 +1010,14 @@ struct perf_cpu_pmu_context { + struct hrtimer hrtimer; + ktime_t hrtimer_interval; + unsigned int hrtimer_active; ++ KABI_RESERVE(1) ++ KABI_RESERVE(2) ++ KABI_RESERVE(3) ++ KABI_RESERVE(4) ++ KABI_RESERVE(5) ++ KABI_RESERVE(6) ++ KABI_RESERVE(7) ++ KABI_RESERVE(8) + }; + + /** +@@ -1031,6 +1039,14 @@ struct perf_cpu_context { + int heap_size; + struct perf_event **heap; + struct perf_event *heap_default[2]; ++ KABI_RESERVE(1) ++ KABI_RESERVE(2) ++ KABI_RESERVE(3) ++ KABI_RESERVE(4) ++ KABI_RESERVE(5) ++ KABI_RESERVE(6) ++ KABI_RESERVE(7) ++ KABI_RESERVE(8) + }; + + struct perf_output_handle { +diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h +index 8e4d9bbdaa40..09a2b2625202 100644 +--- a/include/linux/sched/topology.h ++++ b/include/linux/sched/topology.h +@@ -85,6 +85,8 @@ struct sched_domain_shared { + #ifdef CONFIG_SCHED_STEAL + struct sparsemask *cfs_overload_cpus; + #endif ++ KABI_RESERVE(1) ++ KABI_RESERVE(2) + }; + + struct sched_domain { +@@ -154,6 +156,9 @@ struct sched_domain { + }; + struct sched_domain_shared *shared; + ++ KABI_RESERVE(1) ++ KABI_RESERVE(2) ++ + unsigned int span_weight; + /* + * Span of all CPUs in this domain. +diff --git a/include/linux/seq_file.h b/include/linux/seq_file.h +index 234bcdb1fba4..cf4a2258df85 100644 +--- a/include/linux/seq_file.h ++++ b/include/linux/seq_file.h +@@ -27,6 +27,8 @@ struct seq_file { + int poll_event; + const struct file *file; + void *private; ++ ++ KABI_RESERVE(1) + }; + + struct seq_operations { +@@ -34,6 +36,8 @@ struct seq_operations { + void (*stop) (struct seq_file *m, void *v); + void * (*next) (struct seq_file *m, void *v, loff_t *pos); + int (*show) (struct seq_file *m, void *v); ++ ++ KABI_RESERVE(1) + }; + + #define SEQ_SKIP 1 +diff --git a/include/linux/stat.h b/include/linux/stat.h +index 52150570d37a..d342e89b7aaa 100644 +--- a/include/linux/stat.h ++++ b/include/linux/stat.h +@@ -53,6 +53,11 @@ struct kstat { + u32 dio_mem_align; + u32 dio_offset_align; + u64 change_cookie; ++ ++ KABI_RESERVE(1) ++ KABI_RESERVE(2) ++ KABI_RESERVE(3) ++ KABI_RESERVE(4) + }; + + /* These definitions are internal to the kernel for now. Mainly used by nfsd. */ +diff --git a/include/linux/swap.h b/include/linux/swap.h +index bea0c0f1f640..33396153afc0 100644 +--- a/include/linux/swap.h ++++ b/include/linux/swap.h +@@ -255,22 +255,24 @@ enum { + * free clusters are organized into a list. We fetch an entry from the list to + * get a free cluster. + * +- * The data field stores next cluster if the cluster is free or cluster usage +- * counter otherwise. The flags field determines if a cluster is free. This is +- * protected by swap_info_struct.lock. ++ * The flags field determines if a cluster is free. This is ++ * protected by cluster lock. 
+ */ + struct swap_cluster_info { + spinlock_t lock; /* + * Protect swap_cluster_info fields +- * and swap_info_struct->swap_map +- * elements correspond to the swap +- * cluster ++ * other than list, and swap_info_struct->swap_map ++ * elements corresponding to the swap cluster. + */ +- unsigned int data:24; +- unsigned int flags:8; ++ u16 count; ++ u8 flags; ++ u8 order; ++ struct list_head list; + }; + #define CLUSTER_FLAG_FREE 1 /* This cluster is free */ +-#define CLUSTER_FLAG_NEXT_NULL 2 /* This cluster has no next cluster */ ++#define CLUSTER_FLAG_NONFULL 2 /* This cluster is on nonfull list */ ++#define CLUSTER_FLAG_FRAG 4 /* This cluster is on nonfull list */ ++#define CLUSTER_FLAG_FULL 8 /* This cluster is on full list */ + + /* + * The first page in the swap file is the swap header, which is always marked +@@ -295,11 +297,6 @@ struct percpu_cluster { + unsigned int next[SWAP_NR_ORDERS]; /* Likely next allocation offset */ + }; + +-struct swap_cluster_list { +- struct swap_cluster_info head; +- struct swap_cluster_info tail; +-}; +- + /* + * The in-memory structure used to track swap areas. + */ +@@ -312,7 +309,13 @@ struct swap_info_struct { + unsigned int max; /* extent of the swap_map */ + unsigned char *swap_map; /* vmalloc'ed array of usage counts */ + struct swap_cluster_info *cluster_info; /* cluster info. Only for SSD */ +- struct swap_cluster_list free_clusters; /* free clusters list */ ++ struct list_head free_clusters; /* free clusters list */ ++ struct list_head full_clusters; /* full clusters list */ ++ struct list_head nonfull_clusters[SWAP_NR_ORDERS]; ++ /* list of cluster that contains at least one free slot */ ++ struct list_head frag_clusters[SWAP_NR_ORDERS]; ++ /* list of cluster that are fragmented or contented */ ++ unsigned int frag_cluster_nr[SWAP_NR_ORDERS]; + unsigned int lowest_bit; /* index of first free in swap_map */ + unsigned int highest_bit; /* index of last free in swap_map */ + unsigned int pages; /* total of usable pages of swap */ +@@ -345,7 +348,8 @@ struct swap_info_struct { + * list. 
+ */ + struct work_struct discard_work; /* discard worker */ +- struct swap_cluster_list discard_clusters; /* discard clusters list */ ++ struct work_struct reclaim_work; /* reclaim worker */ ++ struct list_head discard_clusters; /* discard clusters list */ + KABI_RESERVE(1) + KABI_RESERVE(2) + KABI_RESERVE(3) +diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h +index f46e0ca0169c..86d0868b584a 100644 +--- a/include/linux/uprobes.h ++++ b/include/linux/uprobes.h +@@ -47,6 +47,7 @@ struct uprobe_consumer { + + #ifdef CONFIG_UPROBES + #include ++#include + + enum uprobe_task_state { + UTASK_RUNNING, +@@ -78,6 +79,14 @@ struct uprobe_task { + + struct return_instance *return_instances; + unsigned int depth; ++ KABI_RESERVE(1) ++ KABI_RESERVE(2) ++ KABI_RESERVE(3) ++ KABI_RESERVE(4) ++ KABI_RESERVE(5) ++ KABI_RESERVE(6) ++ KABI_RESERVE(7) ++ KABI_RESERVE(8) + }; + + struct return_instance { +diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h +index 343906a98d6e..735eae6e272c 100644 +--- a/include/linux/vmstat.h ++++ b/include/linux/vmstat.h +@@ -487,14 +487,6 @@ static inline void node_stat_sub_folio(struct folio *folio, + mod_node_page_state(folio_pgdat(folio), item, -folio_nr_pages(folio)); + } + +-static inline void __mod_zone_freepage_state(struct zone *zone, int nr_pages, +- int migratetype) +-{ +- __mod_zone_page_state(zone, NR_FREE_PAGES, nr_pages); +- if (is_migrate_cma(migratetype)) +- __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, nr_pages); +-} +- + extern const char * const vmstat_text[]; + + static inline const char *zone_stat_name(enum zone_stat_item item) +diff --git a/include/linux/zswap.h b/include/linux/zswap.h +index 2a60ce39cfde..a13d2d2d9131 100644 +--- a/include/linux/zswap.h ++++ b/include/linux/zswap.h +@@ -12,7 +12,7 @@ extern atomic_t zswap_stored_pages; + + bool zswap_store(struct folio *folio); + bool zswap_load(struct folio *folio); +-void zswap_invalidate(int type, pgoff_t offset); ++void zswap_invalidate(swp_entry_t swp); + void zswap_swapon(int type); + void zswap_swapoff(int type); + +@@ -28,7 +28,7 @@ static inline bool zswap_load(struct folio *folio) + return false; + } + +-static inline void zswap_invalidate(int type, pgoff_t offset) {} ++static inline void zswap_invalidate(swp_entry_t swp) {} + static inline void zswap_swapon(int type) {} + static inline void zswap_swapoff(int type) {} + +diff --git a/include/net/flow.h b/include/net/flow.h +index 0cc5f2ef1000..72d2ea2374ba 100644 +--- a/include/net/flow.h ++++ b/include/net/flow.h +@@ -46,6 +46,8 @@ struct flowi_common { + + KABI_RESERVE(1) + KABI_RESERVE(2) ++ KABI_RESERVE(3) ++ KABI_RESERVE(4) + }; + + union flowi_uli { +diff --git a/include/net/netns/netfilter.h b/include/net/netns/netfilter.h +index 4b77a9b031b6..963588269637 100644 +--- a/include/net/netns/netfilter.h ++++ b/include/net/netns/netfilter.h +@@ -34,5 +34,7 @@ struct netns_nf { + + KABI_RESERVE(1) + KABI_RESERVE(2) ++ KABI_RESERVE(3) ++ KABI_RESERVE(4) + }; + #endif +diff --git a/include/net/netns/xfrm.h b/include/net/netns/xfrm.h +index a0c1359cc7eb..af7f20ef4823 100644 +--- a/include/net/netns/xfrm.h ++++ b/include/net/netns/xfrm.h +@@ -87,6 +87,8 @@ struct netns_xfrm { + + KABI_RESERVE(1) + KABI_RESERVE(2) ++ KABI_RESERVE(3) ++ KABI_RESERVE(4) + }; + + #endif +diff --git a/include/net/xdp.h b/include/net/xdp.h +index c283668458ca..ebebadc56cd9 100644 +--- a/include/net/xdp.h ++++ b/include/net/xdp.h +@@ -54,6 +54,9 @@ enum xdp_mem_type { + struct xdp_mem_info { + u32 type; /* enum xdp_mem_type, but known size type 
*/ + u32 id; ++ ++ KABI_RESERVE(1); ++ KABI_RESERVE(2); + }; + + struct page_pool; +@@ -74,6 +77,9 @@ struct xdp_rxq_info { + + struct xdp_txq_info { + struct net_device *dev; ++ ++ KABI_RESERVE(1); ++ KABI_RESERVE(2); + }; + + enum xdp_buff_flags { +@@ -92,6 +98,11 @@ struct xdp_buff { + struct xdp_txq_info *txq; + u32 frame_sz; /* frame size to deduce data_hard_end/reserved tailroom*/ + u32 flags; /* supported values defined in xdp_buff_flags */ ++ ++ KABI_RESERVE(1); ++ KABI_RESERVE(2); ++ KABI_RESERVE(3); ++ KABI_RESERVE(4); + }; + + static __always_inline bool xdp_buff_has_frags(struct xdp_buff *xdp) +@@ -181,6 +192,11 @@ struct xdp_frame { + struct net_device *dev_rx; /* used by cpumap */ + u32 frame_sz; + u32 flags; /* supported values defined in xdp_buff_flags */ ++ ++ KABI_RESERVE(1); ++ KABI_RESERVE(2); ++ KABI_RESERVE(3); ++ KABI_RESERVE(4); + }; + + static __always_inline bool xdp_frame_has_frags(struct xdp_frame *frame) +@@ -198,6 +214,9 @@ struct xdp_frame_bulk { + int count; + void *xa; + void *q[XDP_BULK_QUEUE_SIZE]; ++ ++ KABI_RESERVE(1); ++ KABI_RESERVE(2); + }; + + static __always_inline void xdp_frame_bulk_init(struct xdp_frame_bulk *bq) +diff --git a/include/net/xfrm.h b/include/net/xfrm.h +index c875faf98492..b9dec5f9c973 100644 +--- a/include/net/xfrm.h ++++ b/include/net/xfrm.h +@@ -294,6 +294,8 @@ struct xfrm_state { + + KABI_RESERVE(1) + KABI_RESERVE(2) ++ KABI_RESERVE(3) ++ KABI_RESERVE(4) + }; + + static inline struct net *xs_net(struct xfrm_state *x) +@@ -562,6 +564,8 @@ struct xfrm_policy { + + KABI_RESERVE(1) + KABI_RESERVE(2) ++ KABI_RESERVE(3) ++ KABI_RESERVE(4) + }; + + static inline struct net *xp_net(const struct xfrm_policy *xp) +diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h +index 482647774bf5..a660cb68c853 100644 +--- a/include/uapi/linux/bpf.h ++++ b/include/uapi/linux/bpf.h +@@ -6573,6 +6573,15 @@ struct bpf_link_info { + __u64 config; + __u32 type; + } event; /* BPF_PERF_EVENT_EVENT */ ++ struct { ++ __u64:64; ++ __u32:32; ++ __u32:32; ++ __u64:64; ++ __u64:64; ++ __u64:64; ++ __u64:64; ++ } kabi_reserve; + }; + } perf_event; + struct { +diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h +index 96a9bd2c26f0..f5fb12890645 100644 +--- a/kernel/cgroup/cgroup-internal.h ++++ b/kernel/cgroup/cgroup-internal.h +@@ -170,7 +170,8 @@ extern struct list_head cgroup_roots; + + /* iterate across the hierarchies */ + #define for_each_root(root) \ +- list_for_each_entry((root), &cgroup_roots, root_list) ++ list_for_each_entry_rcu((root), &cgroup_roots, root_list, \ ++ lockdep_is_held(&cgroup_mutex)) + + /** + * for_each_subsys - iterate all enabled cgroup subsystems +diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c +index 52fe6ba2fefd..c26a9b3a3576 100644 +--- a/kernel/cgroup/cgroup.c ++++ b/kernel/cgroup/cgroup.c +@@ -1315,7 +1315,7 @@ static void cgroup_exit_root_id(struct cgroup_root *root) + + void cgroup_free_root(struct cgroup_root *root) + { +- kfree(root); ++ kfree_rcu(root, rcu); + } + + static void cgroup_destroy_root(struct cgroup_root *root) +@@ -1348,7 +1348,7 @@ static void cgroup_destroy_root(struct cgroup_root *root) + spin_unlock_irq(&css_set_lock); + + if (!list_empty(&root->root_list)) { +- list_del(&root->root_list); ++ list_del_rcu(&root->root_list); + cgroup_root_count--; + } + +@@ -1388,7 +1388,15 @@ static inline struct cgroup *__cset_cgroup_from_root(struct css_set *cset, + } + } + +- BUG_ON(!res_cgroup); ++ /* ++ * If cgroup_mutex is not held, the cgrp_cset_link will be freed ++ 
* before we remove the cgroup root from the root_list. Consequently, ++ * when accessing a cgroup root, the cset_link may have already been ++ * freed, resulting in a NULL res_cgroup. However, by holding the ++ * cgroup_mutex, we ensure that res_cgroup can't be NULL. ++ * If we don't hold cgroup_mutex in the caller, we must do the NULL ++ * check. ++ */ + return res_cgroup; + } + +@@ -1447,7 +1455,6 @@ static struct cgroup *current_cgns_cgroup_dfl(void) + static struct cgroup *cset_cgroup_from_root(struct css_set *cset, + struct cgroup_root *root) + { +- lockdep_assert_held(&cgroup_mutex); + lockdep_assert_held(&css_set_lock); + + return __cset_cgroup_from_root(cset, root); +@@ -1455,7 +1462,9 @@ static struct cgroup *cset_cgroup_from_root(struct css_set *cset, + + /* + * Return the cgroup for "task" from the given hierarchy. Must be +- * called with cgroup_mutex and css_set_lock held. ++ * called with css_set_lock held to prevent task's groups from being modified. ++ * Must be called with either cgroup_mutex or rcu read lock to prevent the ++ * cgroup root from being destroyed. + */ + struct cgroup *task_cgroup_from_root(struct task_struct *task, + struct cgroup_root *root) +@@ -2030,7 +2039,7 @@ void init_cgroup_root(struct cgroup_fs_context *ctx) + struct cgroup_root *root = ctx->root; + struct cgroup *cgrp = &root->cgrp; + +- INIT_LIST_HEAD(&root->root_list); ++ INIT_LIST_HEAD_RCU(&root->root_list); + atomic_set(&root->nr_cgrps, 1); + cgrp->root = root; + init_cgroup_housekeeping(cgrp); +@@ -2114,7 +2123,7 @@ int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask) + * care of subsystems' refcounts, which are explicitly dropped in + * the failure exit path. + */ +- list_add(&root->root_list, &cgroup_roots); ++ list_add_rcu(&root->root_list, &cgroup_roots); + cgroup_root_count++; + + /* +diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c +index 2c9e50f09fc1..7ea0a6d00519 100644 +--- a/kernel/cgroup/cpuset.c ++++ b/kernel/cgroup/cpuset.c +@@ -21,6 +21,7 @@ + * License. See the file COPYING in the main directory of the Linux + * distribution for more details. + */ ++#include "cgroup-internal.h" + + #include + #include +@@ -210,11 +211,6 @@ struct cpuset { + + /* Remote partition silbling list anchored at remote_children */ + struct list_head remote_sibling; +- +- KABI_RESERVE(1) +- KABI_RESERVE(2) +- KABI_RESERVE(3) +- KABI_RESERVE(4) + }; + + /* +@@ -5185,40 +5181,20 @@ int proc_cpuset_show(struct seq_file *m, struct pid_namespace *ns, + char *buf; + struct cgroup_subsys_state *css; + int retval; +- struct cgroup *root_cgroup = NULL; + + retval = -ENOMEM; + buf = kmalloc(PATH_MAX, GFP_KERNEL); + if (!buf) + goto out; + +- css = task_get_css(tsk, cpuset_cgrp_id); + rcu_read_lock(); +- /* +- * When the cpuset subsystem is mounted on the legacy hierarchy, +- * the top_cpuset.css->cgroup does not hold a reference count of +- * cgroup_root.cgroup. This makes accessing css->cgroup very +- * dangerous because when the cpuset subsystem is remounted to the +- * default hierarchy, the cgroup_root.cgroup that css->cgroup points +- * to will be released, leading to a UAF issue. To avoid this problem, +- * get the reference count of top_cpuset.css->cgroup first. +- * +- * This is ugly!! 
+- */ +- if (css == &top_cpuset.css) { +- root_cgroup = css->cgroup; +- if (!css_tryget_online(&root_cgroup->self)) { +- rcu_read_unlock(); +- retval = -EBUSY; +- goto out_free; +- } +- } ++ spin_lock_irq(&css_set_lock); ++ css = task_css(tsk, cpuset_cgrp_id); ++ retval = cgroup_path_ns_locked(css->cgroup, buf, PATH_MAX, ++ current->nsproxy->cgroup_ns); ++ spin_unlock_irq(&css_set_lock); + rcu_read_unlock(); +- retval = cgroup_path_ns(css->cgroup, buf, PATH_MAX, +- current->nsproxy->cgroup_ns); +- css_put(css); +- if (root_cgroup) +- css_put(&root_cgroup->self); ++ + if (retval >= PATH_MAX) + retval = -ENAMETOOLONG; + if (retval < 0) +diff --git a/kernel/events/internal.h b/kernel/events/internal.h +index d2e6e6144c54..d1ffa00b91b6 100644 +--- a/kernel/events/internal.h ++++ b/kernel/events/internal.h +@@ -5,6 +5,7 @@ + #include + #include + #include ++#include + + /* Buffer handling */ + +@@ -54,6 +55,15 @@ struct perf_buffer { + void **aux_pages; + void *aux_priv; + ++ KABI_RESERVE(1) ++ KABI_RESERVE(2) ++ KABI_RESERVE(3) ++ KABI_RESERVE(4) ++ KABI_RESERVE(5) ++ KABI_RESERVE(6) ++ KABI_RESERVE(7) ++ KABI_RESERVE(8) ++ + struct perf_event_mmap_page *user_page; + void *data_pages[]; + }; +diff --git a/mm/debug_page_alloc.c b/mm/debug_page_alloc.c +index f9d145730fd1..03a810927d0a 100644 +--- a/mm/debug_page_alloc.c ++++ b/mm/debug_page_alloc.c +@@ -32,8 +32,7 @@ static int __init debug_guardpage_minorder_setup(char *buf) + } + early_param("debug_guardpage_minorder", debug_guardpage_minorder_setup); + +-bool __set_page_guard(struct zone *zone, struct page *page, unsigned int order, +- int migratetype) ++bool __set_page_guard(struct zone *zone, struct page *page, unsigned int order) + { + if (order >= debug_guardpage_minorder()) + return false; +@@ -41,19 +40,12 @@ bool __set_page_guard(struct zone *zone, struct page *page, unsigned int order, + __SetPageGuard(page); + INIT_LIST_HEAD(&page->buddy_list); + set_page_private(page, order); +- /* Guard pages are not available for any usage */ +- if (!is_migrate_isolate(migratetype)) +- __mod_zone_freepage_state(zone, -(1 << order), migratetype); + + return true; + } + +-void __clear_page_guard(struct zone *zone, struct page *page, unsigned int order, +- int migratetype) ++void __clear_page_guard(struct zone *zone, struct page *page, unsigned int order) + { + __ClearPageGuard(page); +- + set_page_private(page, 0); +- if (!is_migrate_isolate(migratetype)) +- __mod_zone_freepage_state(zone, (1 << order), migratetype); + } +diff --git a/mm/internal.h b/mm/internal.h +index 0478e5dab55b..8742aafde387 100644 +--- a/mm/internal.h ++++ b/mm/internal.h +@@ -693,10 +693,6 @@ extern void *memmap_alloc(phys_addr_t size, phys_addr_t align, + void memmap_init_range(unsigned long, int, unsigned long, unsigned long, + unsigned long, enum meminit_context, struct vmem_altmap *, int); + +- +-int split_free_page(struct page *free_page, +- unsigned int order, unsigned long split_pfn_offset); +- + #if defined CONFIG_COMPACTION || defined CONFIG_CMA + + #define MAX_PAGE_ORDER MAX_ORDER +@@ -1175,11 +1171,6 @@ static inline bool is_migrate_highatomic(enum migratetype migratetype) + return migratetype == MIGRATE_HIGHATOMIC; + } + +-static inline bool is_migrate_highatomic_page(struct page *page) +-{ +- return get_pageblock_migratetype(page) == MIGRATE_HIGHATOMIC; +-} +- + void setup_zone_pageset(struct zone *zone); + + struct migration_target_control { +diff --git a/mm/page_alloc.c b/mm/page_alloc.c +index 36cd38df0614..7734245d7870 100644 +--- a/mm/page_alloc.c ++++ 
b/mm/page_alloc.c +@@ -207,24 +207,6 @@ EXPORT_SYMBOL(node_states); + + gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK; + +-/* +- * A cached value of the page's pageblock's migratetype, used when the page is +- * put on a pcplist. Used to avoid the pageblock migratetype lookup when +- * freeing from pcplists in most cases, at the cost of possibly becoming stale. +- * Also the migratetype set in the page does not necessarily match the pcplist +- * index, e.g. page might have MIGRATE_CMA set but be on a pcplist with any +- * other index - this ensures that it will be put on the correct CMA freelist. +- */ +-static inline int get_pcppage_migratetype(struct page *page) +-{ +- return page->index; +-} +- +-static inline void set_pcppage_migratetype(struct page *page, int migratetype) +-{ +- page->index = migratetype; +-} +- + #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE + unsigned int pageblock_order __read_mostly; + #endif +@@ -654,23 +636,38 @@ compaction_capture(struct capture_control *capc, struct page *page, + } + #endif /* CONFIG_COMPACTION */ + +-/* Used for pages not on another list */ +-static inline void add_to_free_list(struct page *page, struct zone *zone, +- unsigned int order, int migratetype) ++static inline void account_freepages(struct zone *zone, int nr_pages, ++ int migratetype) + { +- struct free_area *area = &zone->free_area[order]; ++ lockdep_assert_held(&zone->lock); + +- list_add(&page->buddy_list, &area->free_list[migratetype]); +- area->nr_free++; ++ if (is_migrate_isolate(migratetype)) ++ return; ++ ++ __mod_zone_page_state(zone, NR_FREE_PAGES, nr_pages); ++ ++ if (is_migrate_cma(migratetype)) ++ __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, nr_pages); ++ else if (is_migrate_highatomic(migratetype)) ++ WRITE_ONCE(zone->nr_free_highatomic, ++ zone->nr_free_highatomic + nr_pages); + } + + /* Used for pages not on another list */ +-static inline void add_to_free_list_tail(struct page *page, struct zone *zone, +- unsigned int order, int migratetype) ++static inline void __add_to_free_list(struct page *page, struct zone *zone, ++ unsigned int order, int migratetype, ++ bool tail) + { + struct free_area *area = &zone->free_area[order]; + +- list_add_tail(&page->buddy_list, &area->free_list[migratetype]); ++ VM_WARN_ONCE(get_pageblock_migratetype(page) != migratetype, ++ "page type is %lu, passed migratetype is %d (nr=%d)\n", ++ get_pageblock_migratetype(page), migratetype, 1 << order); ++ ++ if (tail) ++ list_add_tail(&page->buddy_list, &area->free_list[migratetype]); ++ else ++ list_add(&page->buddy_list, &area->free_list[migratetype]); + area->nr_free++; + } + +@@ -680,16 +677,28 @@ static inline void add_to_free_list_tail(struct page *page, struct zone *zone, + * allocation again (e.g., optimization for memory onlining). 
+ */ + static inline void move_to_free_list(struct page *page, struct zone *zone, +- unsigned int order, int migratetype) ++ unsigned int order, int old_mt, int new_mt) + { + struct free_area *area = &zone->free_area[order]; + +- list_move_tail(&page->buddy_list, &area->free_list[migratetype]); ++ /* Free page moving can fail, so it happens before the type update */ ++ VM_WARN_ONCE(get_pageblock_migratetype(page) != old_mt, ++ "page type is %lu, passed migratetype is %d (nr=%d)\n", ++ get_pageblock_migratetype(page), old_mt, 1 << order); ++ ++ list_move_tail(&page->buddy_list, &area->free_list[new_mt]); ++ ++ account_freepages(zone, -(1 << order), old_mt); ++ account_freepages(zone, 1 << order, new_mt); + } + +-static inline void del_page_from_free_list(struct page *page, struct zone *zone, +- unsigned int order) ++static inline void __del_page_from_free_list(struct page *page, struct zone *zone, ++ unsigned int order, int migratetype) + { ++ VM_WARN_ONCE(get_pageblock_migratetype(page) != migratetype, ++ "page type is %lu, passed migratetype is %d (nr=%d)\n", ++ get_pageblock_migratetype(page), migratetype, 1 << order); ++ + /* clear reported state and update reported page count */ + if (page_reported(page)) + __ClearPageReported(page); +@@ -700,6 +709,13 @@ static inline void del_page_from_free_list(struct page *page, struct zone *zone, + zone->free_area[order].nr_free--; + } + ++static inline void del_page_from_free_list(struct page *page, struct zone *zone, ++ unsigned int order, int migratetype) ++{ ++ __del_page_from_free_list(page, zone, order, migratetype); ++ account_freepages(zone, -(1 << order), migratetype); ++} ++ + static inline struct page *get_page_from_free_area(struct free_area *area, + int migratetype) + { +@@ -771,16 +787,16 @@ static inline void __free_one_page(struct page *page, + VM_BUG_ON_PAGE(page->flags & PAGE_FLAGS_CHECK_AT_PREP, page); + + VM_BUG_ON(migratetype == -1); +- if (likely(!is_migrate_isolate(migratetype))) +- __mod_zone_freepage_state(zone, 1 << order, migratetype); +- + VM_BUG_ON_PAGE(pfn & ((1 << order) - 1), page); + VM_BUG_ON_PAGE(bad_range(zone, page), page); + ++ account_freepages(zone, 1 << order, migratetype); ++ + while (order < MAX_ORDER) { ++ int buddy_mt = migratetype; ++ + if (compaction_capture(capc, page, order, migratetype)) { +- __mod_zone_freepage_state(zone, -(1 << order), +- migratetype); ++ account_freepages(zone, -(1 << order), migratetype); + return; + } + +@@ -795,11 +811,11 @@ static inline void __free_one_page(struct page *page, + * pageblock isolation could cause incorrect freepage or CMA + * accounting or HIGHATOMIC accounting. + */ +- int buddy_mt = get_pfnblock_migratetype(buddy, buddy_pfn); ++ buddy_mt = get_pfnblock_migratetype(buddy, buddy_pfn); + +- if (migratetype != buddy_mt +- && (!migratetype_is_mergeable(migratetype) || +- !migratetype_is_mergeable(buddy_mt))) ++ if (migratetype != buddy_mt && ++ (!migratetype_is_mergeable(migratetype) || ++ !migratetype_is_mergeable(buddy_mt))) + goto done_merging; + } + +@@ -808,9 +824,19 @@ static inline void __free_one_page(struct page *page, + * merge with it and move up one order. + */ + if (page_is_guard(buddy)) +- clear_page_guard(zone, buddy, order, migratetype); ++ clear_page_guard(zone, buddy, order); + else +- del_page_from_free_list(buddy, zone, order); ++ __del_page_from_free_list(buddy, zone, order, buddy_mt); ++ ++ if (unlikely(buddy_mt != migratetype)) { ++ /* ++ * Match buddy type. 
This ensures that an ++ * expand() down the line puts the sub-blocks ++ * on the right freelists. ++ */ ++ set_pageblock_migratetype(buddy, migratetype); ++ } ++ + combined_pfn = buddy_pfn & pfn; + page = page + (combined_pfn - pfn); + pfn = combined_pfn; +@@ -827,74 +853,13 @@ static inline void __free_one_page(struct page *page, + else + to_tail = buddy_merge_likely(pfn, buddy_pfn, page, order); + +- if (to_tail) +- add_to_free_list_tail(page, zone, order, migratetype); +- else +- add_to_free_list(page, zone, order, migratetype); ++ __add_to_free_list(page, zone, order, migratetype, to_tail); + + /* Notify page reporting subsystem of freed page */ + if (!(fpi_flags & FPI_SKIP_REPORT_NOTIFY)) + page_reporting_notify_free(order); + } + +-/** +- * split_free_page() -- split a free page at split_pfn_offset +- * @free_page: the original free page +- * @order: the order of the page +- * @split_pfn_offset: split offset within the page +- * +- * Return -ENOENT if the free page is changed, otherwise 0 +- * +- * It is used when the free page crosses two pageblocks with different migratetypes +- * at split_pfn_offset within the page. The split free page will be put into +- * separate migratetype lists afterwards. Otherwise, the function achieves +- * nothing. +- */ +-int split_free_page(struct page *free_page, +- unsigned int order, unsigned long split_pfn_offset) +-{ +- struct zone *zone = page_zone(free_page); +- unsigned long free_page_pfn = page_to_pfn(free_page); +- unsigned long pfn; +- unsigned long flags; +- int free_page_order; +- int mt; +- int ret = 0; +- +- if (split_pfn_offset == 0) +- return ret; +- +- spin_lock_irqsave(&zone->lock, flags); +- +- if (!PageBuddy(free_page) || buddy_order(free_page) != order) { +- ret = -ENOENT; +- goto out; +- } +- +- mt = get_pfnblock_migratetype(free_page, free_page_pfn); +- if (likely(!is_migrate_isolate(mt))) +- __mod_zone_freepage_state(zone, -(1UL << order), mt); +- +- del_page_from_free_list(free_page, zone, order); +- for (pfn = free_page_pfn; +- pfn < free_page_pfn + (1UL << order);) { +- int mt = get_pfnblock_migratetype(pfn_to_page(pfn), pfn); +- +- free_page_order = min_t(unsigned int, +- pfn ? __ffs(pfn) : order, +- __fls(split_pfn_offset)); +- __free_one_page(pfn_to_page(pfn), pfn, zone, free_page_order, +- mt, FPI_NONE); +- pfn += 1UL << free_page_order; +- split_pfn_offset -= (1UL << free_page_order); +- /* we have done the first part, now switch to second part */ +- if (split_pfn_offset == 0) +- split_pfn_offset = (1UL << order) - (pfn - free_page_pfn); +- } +-out: +- spin_unlock_irqrestore(&zone->lock, flags); +- return ret; +-} + /* + * A bad page could be due to a number of fields. Instead of multiple branches, + * try and check multiple fields with one check. 
The caller must do a detailed +@@ -1186,7 +1151,6 @@ static void free_pcppages_bulk(struct zone *zone, int count, + { + unsigned long flags; + unsigned int order; +- bool isolated_pageblocks; + struct page *page; + + /* +@@ -1199,7 +1163,6 @@ static void free_pcppages_bulk(struct zone *zone, int count, + pindex = pindex - 1; + + spin_lock_irqsave(&zone->lock, flags); +- isolated_pageblocks = has_isolate_pageblock(zone); + + while (count > 0) { + struct list_head *list; +@@ -1215,23 +1178,19 @@ static void free_pcppages_bulk(struct zone *zone, int count, + order = pindex_to_order(pindex); + nr_pages = 1 << order; + do { ++ unsigned long pfn; + int mt; + + page = list_last_entry(list, struct page, pcp_list); +- mt = get_pcppage_migratetype(page); ++ pfn = page_to_pfn(page); ++ mt = get_pfnblock_migratetype(page, pfn); + + /* must delete to avoid corrupting pcp list */ + list_del(&page->pcp_list); + count -= nr_pages; + pcp->count -= nr_pages; + +- /* MIGRATE_ISOLATE page should not go to pcplists */ +- VM_BUG_ON_PAGE(is_migrate_isolate(mt), page); +- /* Pageblock could have been isolated meanwhile */ +- if (unlikely(isolated_pageblocks)) +- mt = get_pageblock_migratetype(page); +- +- __free_one_page(page, page_to_pfn(page), zone, order, mt, FPI_NONE); ++ __free_one_page(page, pfn, zone, order, mt, FPI_NONE); + trace_mm_page_pcpu_drain(page, order, mt); + } while (count > 0 && !list_empty(list)); + } +@@ -1239,18 +1198,15 @@ static void free_pcppages_bulk(struct zone *zone, int count, + spin_unlock_irqrestore(&zone->lock, flags); + } + +-static void free_one_page(struct zone *zone, +- struct page *page, unsigned long pfn, +- unsigned int order, +- int migratetype, fpi_t fpi_flags) ++static void free_one_page(struct zone *zone, struct page *page, ++ unsigned long pfn, unsigned int order, ++ fpi_t fpi_flags) + { + unsigned long flags; ++ int migratetype; + + spin_lock_irqsave(&zone->lock, flags); +- if (unlikely(has_isolate_pageblock(zone) || +- is_migrate_isolate(migratetype))) { +- migratetype = get_pfnblock_migratetype(page, pfn); +- } ++ migratetype = get_pfnblock_migratetype(page, pfn); + __free_one_page(page, pfn, zone, order, migratetype, fpi_flags); + spin_unlock_irqrestore(&zone->lock, flags); + } +@@ -1258,28 +1214,13 @@ static void free_one_page(struct zone *zone, + static void __free_pages_ok(struct page *page, unsigned int order, + fpi_t fpi_flags) + { +- unsigned long flags; +- int migratetype; + unsigned long pfn = page_to_pfn(page); + struct zone *zone = page_zone(page); + + if (!free_pages_prepare(page, order)) + return; + +- /* +- * Calling get_pfnblock_migratetype() without spin_lock_irqsave() here +- * is used to avoid calling get_pfnblock_migratetype() under the lock. +- * This will reduce the lock holding time. 
+- */ +- migratetype = get_pfnblock_migratetype(page, pfn); +- +- spin_lock_irqsave(&zone->lock, flags); +- if (unlikely(has_isolate_pageblock(zone) || +- is_migrate_isolate(migratetype))) { +- migratetype = get_pfnblock_migratetype(page, pfn); +- } +- __free_one_page(page, pfn, zone, order, migratetype, fpi_flags); +- spin_unlock_irqrestore(&zone->lock, flags); ++ free_one_page(zone, page, pfn, order, fpi_flags); + + __count_vm_events(PGFREE, 1 << order); + } +@@ -1386,10 +1327,11 @@ struct page *__pageblock_pfn_to_page(unsigned long start_pfn, + * + * -- nyc + */ +-static inline void expand(struct zone *zone, struct page *page, +- int low, int high, int migratetype) ++static inline unsigned int expand(struct zone *zone, struct page *page, int low, ++ int high, int migratetype) + { +- unsigned long size = 1 << high; ++ unsigned int size = 1 << high; ++ unsigned int nr_added = 0; + + while (high > low) { + high--; +@@ -1402,12 +1344,26 @@ static inline void expand(struct zone *zone, struct page *page, + * Corresponding page table entries will not be touched, + * pages will stay not present in virtual address space + */ +- if (set_page_guard(zone, &page[size], high, migratetype)) ++ if (set_page_guard(zone, &page[size], high)) + continue; + +- add_to_free_list(&page[size], zone, high, migratetype); ++ __add_to_free_list(&page[size], zone, high, migratetype, false); + set_buddy_order(&page[size], high); ++ nr_added += size; + } ++ ++ return nr_added; ++} ++ ++static __always_inline void page_del_and_expand(struct zone *zone, ++ struct page *page, int low, ++ int high, int migratetype) ++{ ++ int nr_pages = 1 << high; ++ ++ __del_page_from_free_list(page, zone, high, migratetype); ++ nr_pages -= expand(zone, page, low, high, migratetype); ++ account_freepages(zone, -nr_pages, migratetype); + } + + static void check_new_page_bad(struct page *page) +@@ -1596,9 +1552,9 @@ struct page *__rmqueue_smallest(struct zone *zone, unsigned int order, + page = get_page_from_free_area(area, migratetype); + if (!page) + continue; +- del_page_from_free_list(page, zone, current_order); +- expand(zone, page, order, current_order, migratetype); +- set_pcppage_migratetype(page, migratetype); ++ ++ page_del_and_expand(zone, page, order, current_order, ++ migratetype); + trace_mm_page_alloc_zone_locked(page, order, migratetype, + pcp_allowed_order(order) && + migratetype < MIGRATE_PCPTYPES); +@@ -1633,30 +1589,23 @@ static inline struct page *__rmqueue_cma_fallback(struct zone *zone, + #endif + + /* +- * Move the free pages in a range to the freelist tail of the requested type. +- * Note that start_page and end_pages are not aligned on a pageblock +- * boundary. If alignment is required, use move_freepages_block() ++ * Change the type of a block and move all its free pages to that ++ * type's freelist. + */ +-static int move_freepages(struct zone *zone, +- unsigned long start_pfn, unsigned long end_pfn, +- int migratetype, int *num_movable) ++static int __move_freepages_block(struct zone *zone, unsigned long start_pfn, ++ int old_mt, int new_mt) + { + struct page *page; +- unsigned long pfn; ++ unsigned long pfn, end_pfn; + unsigned int order; + int pages_moved = 0; + +- for (pfn = start_pfn; pfn <= end_pfn;) { ++ VM_WARN_ON(start_pfn & (pageblock_nr_pages - 1)); ++ end_pfn = pageblock_end_pfn(start_pfn); ++ ++ for (pfn = start_pfn; pfn < end_pfn;) { + page = pfn_to_page(pfn); + if (!PageBuddy(page)) { +- /* +- * We assume that pages that could be isolated for +- * migration are movable. 
But we don't actually try +- * isolating, as that would be expensive. +- */ +- if (num_movable && +- (PageLRU(page) || __PageMovable(page))) +- (*num_movable)++; + pfn++; + continue; + } +@@ -1666,35 +1615,186 @@ static int move_freepages(struct zone *zone, + VM_BUG_ON_PAGE(page_zone(page) != zone, page); + + order = buddy_order(page); +- move_to_free_list(page, zone, order, migratetype); ++ ++ move_to_free_list(page, zone, order, old_mt, new_mt); ++ + pfn += 1 << order; + pages_moved += 1 << order; + } + ++ set_pageblock_migratetype(pfn_to_page(start_pfn), new_mt); ++ + return pages_moved; + } + +-int move_freepages_block(struct zone *zone, struct page *page, +- int migratetype, int *num_movable) ++static bool prep_move_freepages_block(struct zone *zone, struct page *page, ++ unsigned long *start_pfn, ++ int *num_free, int *num_movable) + { +- unsigned long start_pfn, end_pfn, pfn; ++ unsigned long pfn, start, end; ++ ++ pfn = page_to_pfn(page); ++ start = pageblock_start_pfn(pfn); ++ end = pageblock_end_pfn(pfn); ++ ++ /* ++ * The caller only has the lock for @zone, don't touch ranges ++ * that straddle into other zones. While we could move part of ++ * the range that's inside the zone, this call is usually ++ * accompanied by other operations such as migratetype updates ++ * which also should be locked. ++ */ ++ if (!zone_spans_pfn(zone, start)) ++ return false; ++ if (!zone_spans_pfn(zone, end - 1)) ++ return false; ++ ++ *start_pfn = start; + +- if (num_movable) ++ if (num_free) { ++ *num_free = 0; + *num_movable = 0; ++ for (pfn = start; pfn < end;) { ++ page = pfn_to_page(pfn); ++ if (PageBuddy(page)) { ++ int nr = 1 << buddy_order(page); + +- pfn = page_to_pfn(page); +- start_pfn = pageblock_start_pfn(pfn); +- end_pfn = pageblock_end_pfn(pfn) - 1; ++ *num_free += nr; ++ pfn += nr; ++ continue; ++ } ++ /* ++ * We assume that pages that could be isolated for ++ * migration are movable. But we don't actually try ++ * isolating, as that would be expensive. ++ */ ++ if (PageLRU(page) || __PageMovable(page)) ++ (*num_movable)++; ++ pfn++; ++ } ++ } + +- /* Do not cross zone boundaries */ +- if (!zone_spans_pfn(zone, start_pfn)) +- start_pfn = pfn; +- if (!zone_spans_pfn(zone, end_pfn)) +- return 0; ++ return true; ++} ++ ++static int move_freepages_block(struct zone *zone, struct page *page, ++ int old_mt, int new_mt) ++{ ++ unsigned long start_pfn; + +- return move_freepages(zone, start_pfn, end_pfn, migratetype, +- num_movable); ++ if (!prep_move_freepages_block(zone, page, &start_pfn, NULL, NULL)) ++ return -1; ++ ++ return __move_freepages_block(zone, start_pfn, old_mt, new_mt); ++} ++ ++#ifdef CONFIG_MEMORY_ISOLATION ++/* Look for a buddy that straddles start_pfn */ ++static unsigned long find_large_buddy(unsigned long start_pfn) ++{ ++ int order = 0; ++ struct page *page; ++ unsigned long pfn = start_pfn; ++ ++ while (!PageBuddy(page = pfn_to_page(pfn))) { ++ /* Nothing found */ ++ if (++order > MAX_PAGE_ORDER) ++ return start_pfn; ++ pfn &= ~0UL << order; ++ } ++ ++ /* ++ * Found a preceding buddy, but does it straddle? 
++ */ ++ if (pfn + (1 << buddy_order(page)) > start_pfn) ++ return pfn; ++ ++ /* Nothing found */ ++ return start_pfn; ++} ++ ++/* Split a multi-block free page into its individual pageblocks */ ++static void split_large_buddy(struct zone *zone, struct page *page, ++ unsigned long pfn, int order) ++{ ++ unsigned long end_pfn = pfn + (1 << order); ++ ++ VM_WARN_ON_ONCE(order <= pageblock_order); ++ VM_WARN_ON_ONCE(pfn & (pageblock_nr_pages - 1)); ++ ++ /* Caller removed page from freelist, buddy info cleared! */ ++ VM_WARN_ON_ONCE(PageBuddy(page)); ++ ++ while (pfn != end_pfn) { ++ int mt = get_pfnblock_migratetype(page, pfn); ++ ++ __free_one_page(page, pfn, zone, pageblock_order, mt, FPI_NONE); ++ pfn += pageblock_nr_pages; ++ page = pfn_to_page(pfn); ++ } ++} ++ ++/** ++ * move_freepages_block_isolate - move free pages in block for page isolation ++ * @zone: the zone ++ * @page: the pageblock page ++ * @migratetype: migratetype to set on the pageblock ++ * ++ * This is similar to move_freepages_block(), but handles the special ++ * case encountered in page isolation, where the block of interest ++ * might be part of a larger buddy spanning multiple pageblocks. ++ * ++ * Unlike the regular page allocator path, which moves pages while ++ * stealing buddies off the freelist, page isolation is interested in ++ * arbitrary pfn ranges that may have overlapping buddies on both ends. ++ * ++ * This function handles that. Straddling buddies are split into ++ * individual pageblocks. Only the block of interest is moved. ++ * ++ * Returns %true if pages could be moved, %false otherwise. ++ */ ++bool move_freepages_block_isolate(struct zone *zone, struct page *page, ++ int migratetype) ++{ ++ unsigned long start_pfn, pfn; ++ ++ if (!prep_move_freepages_block(zone, page, &start_pfn, NULL, NULL)) ++ return false; ++ ++ /* No splits needed if buddies can't span multiple blocks */ ++ if (pageblock_order == MAX_PAGE_ORDER) ++ goto move; ++ ++ /* We're a tail block in a larger buddy */ ++ pfn = find_large_buddy(start_pfn); ++ if (pfn != start_pfn) { ++ struct page *buddy = pfn_to_page(pfn); ++ int order = buddy_order(buddy); ++ ++ del_page_from_free_list(buddy, zone, order, ++ get_pfnblock_migratetype(buddy, pfn)); ++ set_pageblock_migratetype(page, migratetype); ++ split_large_buddy(zone, buddy, pfn, order); ++ return true; ++ } ++ ++ /* We're the starting block of a larger buddy */ ++ if (PageBuddy(page) && buddy_order(page) > pageblock_order) { ++ int order = buddy_order(page); ++ ++ del_page_from_free_list(page, zone, order, ++ get_pfnblock_migratetype(page, pfn)); ++ set_pageblock_migratetype(page, migratetype); ++ split_large_buddy(zone, page, pfn, order); ++ return true; ++ } ++move: ++ __move_freepages_block(zone, start_pfn, ++ get_pfnblock_migratetype(page, start_pfn), ++ migratetype); ++ return true; + } ++#endif /* CONFIG_MEMORY_ISOLATION */ + + static void change_pageblock_range(struct page *pageblock_page, + int start_order, int migratetype) +@@ -1778,33 +1878,40 @@ static inline bool boost_watermark(struct zone *zone) + } + + /* +- * This function implements actual steal behaviour. If order is large enough, +- * we can steal whole pageblock. If not, we first move freepages in this +- * pageblock to our migratetype and determine how many already-allocated pages +- * are there in the pageblock with a compatible migratetype. 
If at least half +- * of pages are free or compatible, we can change migratetype of the pageblock +- * itself, so pages freed in the future will be put on the correct free list. ++ * This function implements actual steal behaviour. If order is large enough, we ++ * can claim the whole pageblock for the requested migratetype. If not, we check ++ * the pageblock for constituent pages; if at least half of the pages are free ++ * or compatible, we can still claim the whole block, so pages freed in the ++ * future will be put on the correct free list. Otherwise, we isolate exactly ++ * the order we need from the fallback block and leave its migratetype alone. + */ +-static void steal_suitable_fallback(struct zone *zone, struct page *page, +- unsigned int alloc_flags, int start_type, bool whole_block) ++static struct page * ++steal_suitable_fallback(struct zone *zone, struct page *page, ++ int current_order, int order, int start_type, ++ unsigned int alloc_flags, bool whole_block) + { +- unsigned int current_order = buddy_order(page); + int free_pages, movable_pages, alike_pages; +- int old_block_type; ++ unsigned long start_pfn; ++ int block_type; + +- old_block_type = get_pageblock_migratetype(page); ++ block_type = get_pageblock_migratetype(page); + + /* + * This can happen due to races and we want to prevent broken + * highatomic accounting. + */ +- if (is_migrate_highatomic(old_block_type)) ++ if (is_migrate_highatomic(block_type)) + goto single_page; + + /* Take ownership for orders >= pageblock_order */ + if (current_order >= pageblock_order) { ++ unsigned int nr_added; ++ ++ del_page_from_free_list(page, zone, current_order, block_type); + change_pageblock_range(page, current_order, start_type); +- goto single_page; ++ nr_added = expand(zone, page, order, current_order, start_type); ++ account_freepages(zone, nr_added, start_type); ++ return page; + } + + /* +@@ -1819,10 +1926,9 @@ static void steal_suitable_fallback(struct zone *zone, struct page *page, + if (!whole_block) + goto single_page; + +- free_pages = move_freepages_block(zone, page, start_type, +- &movable_pages); + /* moving whole block can fail due to zone boundary conditions */ +- if (!free_pages) ++ if (!prep_move_freepages_block(zone, page, &start_pfn, &free_pages, ++ &movable_pages)) + goto single_page; + + /* +@@ -1840,7 +1946,7 @@ static void steal_suitable_fallback(struct zone *zone, struct page *page, + * vice versa, be conservative since we can't distinguish the + * exact migratetype of non-movable pages. + */ +- if (old_block_type == MIGRATE_MOVABLE) ++ if (block_type == MIGRATE_MOVABLE) + alike_pages = pageblock_nr_pages + - (free_pages + movable_pages); + else +@@ -1851,13 +1957,14 @@ static void steal_suitable_fallback(struct zone *zone, struct page *page, + * compatible migratability as our allocation, claim the whole block. 
+ */ + if (free_pages + alike_pages >= (1 << (pageblock_order-1)) || +- page_group_by_mobility_disabled) +- set_pageblock_migratetype(page, start_type); +- +- return; ++ page_group_by_mobility_disabled) { ++ __move_freepages_block(zone, start_pfn, block_type, start_type); ++ return __rmqueue_smallest(zone, order, start_type); ++ } + + single_page: +- move_to_free_list(page, zone, current_order, start_type); ++ page_del_and_expand(zone, page, order, current_order, block_type); ++ return page; + } + + /* +@@ -1895,10 +2002,12 @@ int find_suitable_fallback(struct free_area *area, unsigned int order, + } + + /* +- * Reserve a pageblock for exclusive use of high-order atomic allocations if +- * there are no empty page blocks that contain a page with a suitable order ++ * Reserve the pageblock(s) surrounding an allocation request for ++ * exclusive use of high-order atomic allocations if there are no ++ * empty page blocks that contain a page with a suitable order + */ +-static void reserve_highatomic_pageblock(struct page *page, struct zone *zone) ++static void reserve_highatomic_pageblock(struct page *page, int order, ++ struct zone *zone) + { + int mt; + unsigned long max_managed, flags; +@@ -1924,10 +2033,16 @@ static void reserve_highatomic_pageblock(struct page *page, struct zone *zone) + /* Yoink! */ + mt = get_pageblock_migratetype(page); + /* Only reserve normal pageblocks (i.e., they can merge with others) */ +- if (migratetype_is_mergeable(mt)) { ++ if (!migratetype_is_mergeable(mt)) ++ goto out_unlock; ++ ++ if (order < pageblock_order) { ++ if (move_freepages_block(zone, page, mt, MIGRATE_HIGHATOMIC) == -1) ++ goto out_unlock; + zone->nr_reserved_highatomic += pageblock_nr_pages; +- set_pageblock_migratetype(page, MIGRATE_HIGHATOMIC); +- move_freepages_block(zone, page, MIGRATE_HIGHATOMIC, NULL); ++ } else { ++ change_pageblock_range(page, order, MIGRATE_HIGHATOMIC); ++ zone->nr_reserved_highatomic += 1 << order; + } + + out_unlock: +@@ -1940,7 +2055,7 @@ static void reserve_highatomic_pageblock(struct page *page, struct zone *zone) + * intense memory pressure but failed atomic allocations should be easier + * to recover from than an OOM. + * +- * If @force is true, try to unreserve a pageblock even though highatomic ++ * If @force is true, try to unreserve pageblocks even though highatomic + * pageblock is exhausted. + */ + static bool unreserve_highatomic_pageblock(const struct alloc_context *ac, +@@ -1952,7 +2067,7 @@ static bool unreserve_highatomic_pageblock(const struct alloc_context *ac, + struct zone *zone; + struct page *page; + int order; +- bool ret; ++ int ret; + + for_each_zone_zonelist_nodemask(zone, z, zonelist, ac->highest_zoneidx, + ac->nodemask) { +@@ -1967,11 +2082,13 @@ static bool unreserve_highatomic_pageblock(const struct alloc_context *ac, + spin_lock_irqsave(&zone->lock, flags); + for (order = 0; order < NR_PAGE_ORDERS; order++) { + struct free_area *area = &(zone->free_area[order]); ++ int mt; + + page = get_page_from_free_area(area, MIGRATE_HIGHATOMIC); + if (!page) + continue; + ++ mt = get_pageblock_migratetype(page); + /* + * In page freeing path, migratetype change is racy so + * we can counter several free pages in a pageblock +@@ -1979,7 +2096,8 @@ static bool unreserve_highatomic_pageblock(const struct alloc_context *ac, + * from highatomic to ac->migratetype. So we should + * adjust the count once. 
+ */ +- if (is_migrate_highatomic_page(page)) { ++ if (is_migrate_highatomic(mt)) { ++ unsigned long size; + /* + * It should never happen but changes to + * locking could inadvertently allow a per-cpu +@@ -1987,9 +2105,9 @@ static bool unreserve_highatomic_pageblock(const struct alloc_context *ac, + * while unreserving so be safe and watch for + * underflows. + */ +- zone->nr_reserved_highatomic -= min( +- pageblock_nr_pages, +- zone->nr_reserved_highatomic); ++ size = max(pageblock_nr_pages, 1UL << order); ++ size = min(size, zone->nr_reserved_highatomic); ++ zone->nr_reserved_highatomic -= size; + } + + /* +@@ -2001,10 +2119,22 @@ static bool unreserve_highatomic_pageblock(const struct alloc_context *ac, + * of pageblocks that cannot be completely freed + * may increase. + */ +- set_pageblock_migratetype(page, ac->migratetype); +- ret = move_freepages_block(zone, page, ac->migratetype, +- NULL); +- if (ret) { ++ if (order < pageblock_order) ++ ret = move_freepages_block(zone, page, mt, ++ ac->migratetype); ++ else { ++ move_to_free_list(page, zone, order, mt, ++ ac->migratetype); ++ change_pageblock_range(page, order, ++ ac->migratetype); ++ ret = 1; ++ } ++ /* ++ * Reserving the block(s) already succeeded, ++ * so this should not fail on zone boundaries. ++ */ ++ WARN_ON_ONCE(ret == -1); ++ if (ret > 0) { + spin_unlock_irqrestore(&zone->lock, flags); + return ret; + } +@@ -2025,7 +2155,7 @@ static bool unreserve_highatomic_pageblock(const struct alloc_context *ac, + * deviation from the rest of this file, to make the for loop + * condition simpler. + */ +-static __always_inline bool ++static __always_inline struct page * + __rmqueue_fallback(struct zone *zone, int order, int start_migratetype, + unsigned int alloc_flags) + { +@@ -2072,7 +2202,7 @@ __rmqueue_fallback(struct zone *zone, int order, int start_migratetype, + goto do_steal; + } + +- return false; ++ return NULL; + + find_smallest: + for (current_order = order; current_order < NR_PAGE_ORDERS; current_order++) { +@@ -2092,14 +2222,14 @@ __rmqueue_fallback(struct zone *zone, int order, int start_migratetype, + do_steal: + page = get_page_from_free_area(area, fallback_mt); + +- steal_suitable_fallback(zone, page, alloc_flags, start_migratetype, +- can_steal); ++ /* take off list, maybe claim block, expand remainder */ ++ page = steal_suitable_fallback(zone, page, current_order, order, ++ start_migratetype, alloc_flags, can_steal); + + trace_mm_page_alloc_extfrag(page, order, current_order, + start_migratetype, fallback_mt); + +- return true; +- ++ return page; + } + + /* +@@ -2126,15 +2256,15 @@ __rmqueue(struct zone *zone, unsigned int order, int migratetype, + return page; + } + } +-retry: ++ + page = __rmqueue_smallest(zone, order, migratetype); + if (unlikely(!page)) { + if (alloc_flags & ALLOC_CMA) + page = __rmqueue_cma_fallback(zone, order); + +- if (!page && __rmqueue_fallback(zone, order, migratetype, +- alloc_flags)) +- goto retry; ++ if (!page) ++ page = __rmqueue_fallback(zone, order, migratetype, ++ alloc_flags); + } + return page; + } +@@ -2169,12 +2299,7 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, + * pages are ordered properly. 
+ */ + list_add_tail(&page->pcp_list, list); +- if (is_migrate_cma(get_pcppage_migratetype(page))) +- __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, +- -(1 << order)); + } +- +- __mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order)); + spin_unlock_irqrestore(&zone->lock, flags); + + return i; +@@ -2369,19 +2494,6 @@ void drain_all_pages(struct zone *zone) + __drain_all_pages(zone, false); + } + +-static bool free_unref_page_prepare(struct page *page, unsigned long pfn, +- unsigned int order) +-{ +- int migratetype; +- +- if (!free_pages_prepare(page, order)) +- return false; +- +- migratetype = get_pfnblock_migratetype(page, pfn); +- set_pcppage_migratetype(page, migratetype); +- return true; +-} +- + static int nr_pcp_free(struct per_cpu_pages *pcp, int batch, int high, bool free_high) + { + int min_nr_free, max_nr_free; +@@ -2512,7 +2624,7 @@ void free_unref_page(struct page *page, unsigned int order) + struct per_cpu_pages *pcp; + struct zone *zone; + unsigned long pfn = page_to_pfn(page); +- int migratetype, pcpmigratetype; ++ int migratetype; + + if (page_from_dynamic_pool(page)) { + dynamic_pool_free_page(page); +@@ -2524,7 +2636,7 @@ void free_unref_page(struct page *page, unsigned int order) + return; + } + +- if (!free_unref_page_prepare(page, pfn, order)) ++ if (!free_pages_prepare(page, order)) + return; + + /* +@@ -2534,23 +2646,23 @@ void free_unref_page(struct page *page, unsigned int order) + * get those areas back if necessary. Otherwise, we may have to free + * excessively into the page allocator + */ +- migratetype = pcpmigratetype = get_pcppage_migratetype(page); ++ migratetype = get_pfnblock_migratetype(page, pfn); + if (unlikely(migratetype >= MIGRATE_PCPTYPES)) { + if (unlikely(is_migrate_isolate(migratetype))) { +- free_one_page(page_zone(page), page, pfn, order, migratetype, FPI_NONE); ++ free_one_page(page_zone(page), page, pfn, order, FPI_NONE); + return; + } +- pcpmigratetype = MIGRATE_MOVABLE; ++ migratetype = MIGRATE_MOVABLE; + } + + zone = page_zone(page); + pcp_trylock_prepare(UP_flags); + pcp = pcp_spin_trylock(zone->per_cpu_pageset); + if (pcp) { +- free_unref_page_commit(zone, pcp, page, pcpmigratetype, order); ++ free_unref_page_commit(zone, pcp, page, migratetype, order); + pcp_spin_unlock(pcp); + } else { +- free_one_page(zone, page, pfn, order, migratetype, FPI_NONE); ++ free_one_page(zone, page, pfn, order, FPI_NONE); + } + pcp_trylock_finish(UP_flags); + } +@@ -2563,7 +2675,7 @@ void free_unref_folios(struct folio_batch *folios) + unsigned long __maybe_unused UP_flags; + struct per_cpu_pages *pcp = NULL; + struct zone *locked_zone = NULL; +- int i, j, migratetype; ++ int i, j; + + /* Prepare folios for freeing */ + for (i = 0, j = 0; i < folios->nr; i++) { +@@ -2577,18 +2689,15 @@ void free_unref_folios(struct folio_batch *folios) + } + + folio_undo_large_rmappable(folio); +- if (!free_unref_page_prepare(&folio->page, pfn, order)) ++ if (!free_pages_prepare(&folio->page, order)) + continue; +- + /* +- * Free isolated folios and orders not handled on the PCP +- * directly to the allocator, see comment in free_unref_page. ++ * Free orders not handled on the PCP directly to the ++ * allocator. 
+ */ +- migratetype = get_pcppage_migratetype(&folio->page); +- if (!pcp_allowed_order(order) || +- is_migrate_isolate(migratetype)) { +- free_one_page(folio_zone(folio), &folio->page, pfn, +- order, migratetype, FPI_NONE); ++ if (!pcp_allowed_order(order)) { ++ free_one_page(folio_zone(folio), &folio->page, ++ pfn, order, FPI_NONE); + continue; + } + folio->private = (void *)(unsigned long)order; +@@ -2601,16 +2710,31 @@ void free_unref_folios(struct folio_batch *folios) + for (i = 0; i < folios->nr; i++) { + struct folio *folio = folios->folios[i]; + struct zone *zone = folio_zone(folio); ++ unsigned long pfn = folio_pfn(folio); + unsigned int order = (unsigned long)folio->private; ++ int migratetype; + + folio->private = NULL; +- migratetype = get_pcppage_migratetype(&folio->page); ++ migratetype = get_pfnblock_migratetype(&folio->page, pfn); + + /* Different zone requires a different pcp lock */ +- if (zone != locked_zone) { ++ if (zone != locked_zone || ++ is_migrate_isolate(migratetype)) { + if (pcp) { + pcp_spin_unlock(pcp); + pcp_trylock_finish(UP_flags); ++ locked_zone = NULL; ++ pcp = NULL; ++ } ++ ++ /* ++ * Free isolated pages directly to the ++ * allocator, see comment in free_unref_page. ++ */ ++ if (is_migrate_isolate(migratetype)) { ++ free_one_page(zone, &folio->page, pfn, ++ order, FPI_NONE); ++ continue; + } + + /* +@@ -2621,10 +2745,8 @@ void free_unref_folios(struct folio_batch *folios) + pcp = pcp_spin_trylock(zone->per_cpu_pageset); + if (unlikely(!pcp)) { + pcp_trylock_finish(UP_flags); +- free_one_page(zone, &folio->page, +- folio_pfn(folio), order, +- migratetype, FPI_NONE); +- locked_zone = NULL; ++ free_one_page(zone, &folio->page, pfn, ++ order, FPI_NONE); + continue; + } + locked_zone = zone; +@@ -2687,11 +2809,9 @@ int __isolate_free_page(struct page *page, unsigned int order) + watermark = zone->_watermark[WMARK_MIN] + (1UL << order); + if (!zone_watermark_ok(zone, 0, watermark, 0, ALLOC_CMA)) + return 0; +- +- __mod_zone_freepage_state(zone, -(1UL << order), mt); + } + +- del_page_from_free_list(page, zone, order); ++ del_page_from_free_list(page, zone, order, mt); + + /* + * Set the pageblock if the isolated page is at least half of a +@@ -2706,8 +2826,8 @@ int __isolate_free_page(struct page *page, unsigned int order) + * with others) + */ + if (migratetype_is_mergeable(mt)) +- set_pageblock_migratetype(page, +- MIGRATE_MOVABLE); ++ move_freepages_block(zone, page, mt, ++ MIGRATE_MOVABLE); + } + } + +@@ -2791,8 +2911,6 @@ struct page *rmqueue_buddy(struct zone *preferred_zone, struct zone *zone, + return NULL; + } + } +- __mod_zone_freepage_state(zone, -(1 << order), +- get_pcppage_migratetype(page)); + spin_unlock_irqrestore(&zone->lock, flags); + } while (check_new_pages(page, order)); + +@@ -2974,11 +3092,10 @@ static inline long __zone_watermark_unusable_free(struct zone *z, + + /* + * If the caller does not have rights to reserves below the min +- * watermark then subtract the high-atomic reserves. This will +- * over-estimate the size of the atomic reserve but it avoids a search. ++ * watermark then subtract the free pages reserved for highatomic. 
+ */ + if (likely(!(alloc_flags & ALLOC_RESERVES))) +- unusable_free += z->nr_reserved_highatomic; ++ unusable_free += READ_ONCE(z->nr_free_highatomic); + + #ifdef CONFIG_CMA + /* If allocation can't use CMA areas don't use free CMA pages */ +@@ -3360,7 +3477,7 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags, + * if the pageblock should be reserved for the future + */ + if (unlikely(alloc_flags & ALLOC_HIGHATOMIC)) +- reserve_highatomic_pageblock(page, zone); ++ reserve_highatomic_pageblock(page, order, zone); + + return page; + } else { +@@ -6570,7 +6687,6 @@ int alloc_contig_range(unsigned long start, unsigned long end, + unsigned migratetype, gfp_t gfp_mask) + { + unsigned long outer_start, outer_end; +- int order; + int ret = 0; + + struct compact_control cc = { +@@ -6643,29 +6759,7 @@ int alloc_contig_range(unsigned long start, unsigned long end, + * We don't have to hold zone->lock here because the pages are + * isolated thus they won't get removed from buddy. + */ +- +- order = 0; +- outer_start = start; +- while (!PageBuddy(pfn_to_page(outer_start))) { +- if (++order > MAX_ORDER) { +- outer_start = start; +- break; +- } +- outer_start &= ~0UL << order; +- } +- +- if (outer_start != start) { +- order = buddy_order(pfn_to_page(outer_start)); +- +- /* +- * outer_start page could be small order buddy page and +- * it doesn't include start page. Adjust outer_start +- * in this case to report failed page properly +- * on tracepoint in test_pages_isolated() +- */ +- if (outer_start + (1UL << order) <= start) +- outer_start = start; +- } ++ outer_start = find_large_buddy(start); + + /* Make sure the range is really isolated. */ + if (test_pages_isolated(outer_start, end, 0)) { +@@ -6899,8 +6993,9 @@ void __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn) + + BUG_ON(page_count(page)); + BUG_ON(!PageBuddy(page)); ++ VM_WARN_ON(get_pageblock_migratetype(page) != MIGRATE_ISOLATE); + order = buddy_order(page); +- del_page_from_free_list(page, zone, order); ++ del_page_from_free_list(page, zone, order, MIGRATE_ISOLATE); + pfn += (1 << order); + } + spin_unlock_irqrestore(&zone->lock, flags); +@@ -6928,6 +7023,14 @@ bool is_free_buddy_page(struct page *page) + EXPORT_SYMBOL(is_free_buddy_page); + + #ifdef CONFIG_MEMORY_FAILURE ++static inline void add_to_free_list(struct page *page, struct zone *zone, ++ unsigned int order, int migratetype, ++ bool tail) ++{ ++ __add_to_free_list(page, zone, order, migratetype, tail); ++ account_freepages(zone, 1 << order, migratetype); ++} ++ + /* + * Break down a higher-order page in sub-pages, and keep our target out of + * buddy allocator. 
+@@ -6937,28 +7040,24 @@ static void break_down_buddy_pages(struct zone *zone, struct page *page, + int migratetype) + { + unsigned long size = 1 << high; +- struct page *current_buddy, *next_page; ++ struct page *current_buddy; + + while (high > low) { + high--; + size >>= 1; + + if (target >= &page[size]) { +- next_page = page + size; + current_buddy = page; ++ page = page + size; + } else { +- next_page = page; + current_buddy = page + size; + } +- page = next_page; + +- if (set_page_guard(zone, current_buddy, high, migratetype)) ++ if (set_page_guard(zone, current_buddy, high)) + continue; + +- if (current_buddy != target) { +- add_to_free_list(current_buddy, zone, high, migratetype); +- set_buddy_order(current_buddy, high); +- } ++ add_to_free_list(current_buddy, zone, high, migratetype, false); ++ set_buddy_order(current_buddy, high); + } + } + +@@ -6983,12 +7082,11 @@ bool take_page_off_buddy(struct page *page) + int migratetype = get_pfnblock_migratetype(page_head, + pfn_head); + +- del_page_from_free_list(page_head, zone, page_order); ++ del_page_from_free_list(page_head, zone, page_order, ++ migratetype); + break_down_buddy_pages(zone, page_head, page, 0, + page_order, migratetype); + SetPageHWPoisonTakenOff(page); +- if (!is_migrate_isolate(migratetype)) +- __mod_zone_freepage_state(zone, -1, migratetype); + ret = true; + break; + } +@@ -7005,13 +7103,14 @@ bool take_page_off_buddy(struct page *page) + bool put_page_back_buddy(struct page *page) + { + struct zone *zone = page_zone(page); +- unsigned long pfn = page_to_pfn(page); + unsigned long flags; +- int migratetype = get_pfnblock_migratetype(page, pfn); + bool ret = false; + + spin_lock_irqsave(&zone->lock, flags); + if (put_page_testzero(page)) { ++ unsigned long pfn = page_to_pfn(page); ++ int migratetype = get_pfnblock_migratetype(page, pfn); ++ + ClearPageHWPoisonTakenOff(page); + __free_one_page(page, pfn, zone, 0, migratetype, FPI_NONE); + if (TestClearPageHWPoison(page)) { +@@ -7092,7 +7191,7 @@ static bool try_to_accept_memory_one(struct zone *zone) + list_del(&page->lru); + last = list_empty(&zone->unaccepted_pages); + +- __mod_zone_freepage_state(zone, -MAX_ORDER_NR_PAGES, MIGRATE_MOVABLE); ++ account_freepages(zone, -MAX_ORDER_NR_PAGES, MIGRATE_MOVABLE); + __mod_zone_page_state(zone, NR_UNACCEPTED, -MAX_ORDER_NR_PAGES); + spin_unlock_irqrestore(&zone->lock, flags); + +@@ -7150,7 +7249,7 @@ static bool __free_unaccepted(struct page *page) + spin_lock_irqsave(&zone->lock, flags); + first = list_empty(&zone->unaccepted_pages); + list_add_tail(&page->lru, &zone->unaccepted_pages); +- __mod_zone_freepage_state(zone, MAX_ORDER_NR_PAGES, MIGRATE_MOVABLE); ++ account_freepages(zone, MAX_ORDER_NR_PAGES, MIGRATE_MOVABLE); + __mod_zone_page_state(zone, NR_UNACCEPTED, MAX_ORDER_NR_PAGES); + spin_unlock_irqrestore(&zone->lock, flags); + +diff --git a/mm/page_isolation.c b/mm/page_isolation.c +index 03381be87b28..cf7f1922fc3e 100644 +--- a/mm/page_isolation.c ++++ b/mm/page_isolation.c +@@ -179,15 +179,11 @@ static int set_migratetype_isolate(struct page *page, int migratetype, int isol_ + unmovable = has_unmovable_pages(check_unmovable_start, check_unmovable_end, + migratetype, isol_flags); + if (!unmovable) { +- unsigned long nr_pages; +- int mt = get_pageblock_migratetype(page); +- +- set_pageblock_migratetype(page, MIGRATE_ISOLATE); ++ if (!move_freepages_block_isolate(zone, page, MIGRATE_ISOLATE)) { ++ spin_unlock_irqrestore(&zone->lock, flags); ++ return -EBUSY; ++ } + zone->nr_isolate_pageblock++; +- nr_pages = 
move_freepages_block(zone, page, MIGRATE_ISOLATE, +- NULL); +- +- __mod_zone_freepage_state(zone, -nr_pages, mt); + spin_unlock_irqrestore(&zone->lock, flags); + return 0; + } +@@ -207,7 +203,7 @@ static int set_migratetype_isolate(struct page *page, int migratetype, int isol_ + static void unset_migratetype_isolate(struct page *page, int migratetype) + { + struct zone *zone; +- unsigned long flags, nr_pages; ++ unsigned long flags; + bool isolated_page = false; + unsigned int order; + struct page *buddy; +@@ -253,12 +249,15 @@ static void unset_migratetype_isolate(struct page *page, int migratetype) + * allocation. + */ + if (!isolated_page) { +- nr_pages = move_freepages_block(zone, page, migratetype, NULL); +- __mod_zone_freepage_state(zone, nr_pages, migratetype); +- } +- set_pageblock_migratetype(page, migratetype); +- if (isolated_page) ++ /* ++ * Isolating this block already succeeded, so this ++ * should not fail on zone boundaries. ++ */ ++ WARN_ON_ONCE(!move_freepages_block_isolate(zone, page, migratetype)); ++ } else { ++ set_pageblock_migratetype(page, migratetype); + __putback_isolated_page(page, order, migratetype); ++ } + zone->nr_isolate_pageblock--; + out: + spin_unlock_irqrestore(&zone->lock, flags); +@@ -370,108 +369,52 @@ static int isolate_single_pageblock(unsigned long boundary_pfn, int flags, + + VM_BUG_ON(!page); + pfn = page_to_pfn(page); +- /* +- * start_pfn is MAX_ORDER_NR_PAGES aligned, if there is any +- * free pages in [start_pfn, boundary_pfn), its head page will +- * always be in the range. +- */ ++ + if (PageBuddy(page)) { + int order = buddy_order(page); + +- if (pfn + (1UL << order) > boundary_pfn) { +- /* free page changed before split, check it again */ +- if (split_free_page(page, order, boundary_pfn - pfn)) +- continue; +- } ++ /* move_freepages_block_isolate() handled this */ ++ VM_WARN_ON_ONCE(pfn + (1 << order) > boundary_pfn); + + pfn += 1UL << order; + continue; + } ++ + /* +- * migrate compound pages then let the free page handling code +- * above do the rest. If migration is not possible, just fail. ++ * If a compound page is straddling our block, attempt ++ * to migrate it out of the way. ++ * ++ * We don't have to worry about this creating a large ++ * free page that straddles into our block: gigantic ++ * pages are freed as order-0 chunks, and LRU pages ++ * (currently) do not exceed pageblock_order. ++ * ++ * The block of interest has already been marked ++ * MIGRATE_ISOLATE above, so when migration is done it ++ * will free its pages onto the correct freelists. + */ + if (PageCompound(page)) { + struct page *head = compound_head(page); + unsigned long head_pfn = page_to_pfn(head); + unsigned long nr_pages = compound_nr(head); + +- if (head_pfn + nr_pages <= boundary_pfn) { ++ if (head_pfn + nr_pages <= boundary_pfn || ++ PageHuge(page)) { + pfn = head_pfn + nr_pages; + continue; + } +-#if defined CONFIG_COMPACTION || defined CONFIG_CMA ++ + /* +- * hugetlb, lru compound (THP), and movable compound pages +- * can be migrated. Otherwise, fail the isolation. ++ * These pages are movable too, but they're ++ * not expected to exceed pageblock_order. ++ * ++ * Let us know when they do, so we can add ++ * proper free and split handling for them. 
+ */ +- if (PageHuge(page) || PageLRU(page) || __PageMovable(page)) { +- int order; +- unsigned long outer_pfn; +- int page_mt = get_pageblock_migratetype(page); +- bool isolate_page = !is_migrate_isolate_page(page); +- struct compact_control cc = { +- .nr_migratepages = 0, +- .order = -1, +- .zone = page_zone(pfn_to_page(head_pfn)), +- .mode = MIGRATE_SYNC, +- .ignore_skip_hint = true, +- .no_set_skip_hint = true, +- .gfp_mask = gfp_flags, +- .alloc_contig = true, +- }; +- INIT_LIST_HEAD(&cc.migratepages); +- +- /* +- * XXX: mark the page as MIGRATE_ISOLATE so that +- * no one else can grab the freed page after migration. +- * Ideally, the page should be freed as two separate +- * pages to be added into separate migratetype free +- * lists. +- */ +- if (isolate_page) { +- ret = set_migratetype_isolate(page, page_mt, +- flags, head_pfn, head_pfn + nr_pages); +- if (ret) +- goto failed; +- } +- +- ret = __alloc_contig_migrate_range(&cc, head_pfn, +- head_pfn + nr_pages, page_mt); ++ VM_WARN_ON_ONCE_PAGE(PageLRU(page), page); ++ VM_WARN_ON_ONCE_PAGE(__PageMovable(page), page); + +- /* +- * restore the page's migratetype so that it can +- * be split into separate migratetype free lists +- * later. +- */ +- if (isolate_page) +- unset_migratetype_isolate(page, page_mt); +- +- if (ret) +- goto failed; +- /* +- * reset pfn to the head of the free page, so +- * that the free page handling code above can split +- * the free page to the right migratetype list. +- * +- * head_pfn is not used here as a hugetlb page order +- * can be bigger than MAX_ORDER, but after it is +- * freed, the free page order is not. Use pfn within +- * the range to find the head of the free page. +- */ +- order = 0; +- outer_pfn = pfn; +- while (!PageBuddy(pfn_to_page(outer_pfn))) { +- /* stop if we cannot find the free page */ +- if (++order > MAX_ORDER) +- goto failed; +- outer_pfn &= ~0UL << order; +- } +- pfn = outer_pfn; +- continue; +- } else +-#endif +- goto failed; ++ goto failed; + } + + pfn++; +diff --git a/mm/readahead.c b/mm/readahead.c +index 438f142a3e74..c13c130efcca 100644 +--- a/mm/readahead.c ++++ b/mm/readahead.c +@@ -513,10 +513,10 @@ void page_cache_ra_order(struct readahead_control *ractl, + + limit = min(limit, index + ra->size - 1); + +- if (new_order < MAX_PAGECACHE_ORDER) ++ if (new_order < mapping_max_folio_order(mapping)) + new_order += 2; + +- new_order = min_t(unsigned int, MAX_PAGECACHE_ORDER, new_order); ++ new_order = min(mapping_max_folio_order(mapping), new_order); + new_order = min_t(unsigned int, new_order, ilog2(ra->size)); + + /* See comment in page_cache_ra_unbounded() */ +diff --git a/mm/swap_slots.c b/mm/swap_slots.c +index 7af3b93d4c8c..5579eed7065f 100644 +--- a/mm/swap_slots.c ++++ b/mm/swap_slots.c +@@ -34,6 +34,7 @@ + #include + #include + #include ++#include + + static DEFINE_PER_CPU(struct swap_slots_cache, swp_slots); + #ifdef CONFIG_MEMCG_SWAP_QOS +@@ -394,6 +395,9 @@ void free_swap_slot(swp_entry_t entry) + { + struct swap_slots_cache *cache; + ++ /* Large folio swap slot is not covered. 
*/ ++ zswap_invalidate(entry); ++ + cache = raw_cpu_ptr(&swp_slots); + if (likely(use_swap_slot_cache && cache->slots_ret)) { + spin_lock_irq(&cache->free_lock); +diff --git a/mm/swapfile.c b/mm/swapfile.c +index 3af5b6ebb241..3b48159820f2 100644 +--- a/mm/swapfile.c ++++ b/mm/swapfile.c +@@ -52,6 +52,15 @@ + static bool swap_count_continued(struct swap_info_struct *, pgoff_t, + unsigned char); + static void free_swap_count_continuations(struct swap_info_struct *); ++static void swap_entry_range_free(struct swap_info_struct *si, swp_entry_t entry, ++ unsigned int nr_pages); ++static void swap_range_alloc(struct swap_info_struct *si, unsigned long offset, ++ unsigned int nr_entries); ++static bool folio_swapcache_freeable(struct folio *folio); ++static struct swap_cluster_info *lock_cluster_or_swap_info( ++ struct swap_info_struct *si, unsigned long offset); ++static void unlock_cluster_or_swap_info(struct swap_info_struct *si, ++ struct swap_cluster_info *ci); + + static DEFINE_SPINLOCK(swap_lock); + static unsigned int nr_swapfiles; +@@ -126,8 +135,25 @@ static inline unsigned char swap_count(unsigned char ent) + * corresponding page + */ + #define TTRS_UNMAPPED 0x2 +-/* Reclaim the swap entry if swap is getting full*/ ++/* Reclaim the swap entry if swap is getting full */ + #define TTRS_FULL 0x4 ++/* Reclaim directly, bypass the slot cache and don't touch device lock */ ++#define TTRS_DIRECT 0x8 ++ ++static bool swap_is_has_cache(struct swap_info_struct *si, ++ unsigned long offset, int nr_pages) ++{ ++ unsigned char *map = si->swap_map + offset; ++ unsigned char *map_end = map + nr_pages; ++ ++ do { ++ VM_BUG_ON(!(*map & SWAP_HAS_CACHE)); ++ if (*map != SWAP_HAS_CACHE) ++ return false; ++ } while (++map < map_end); ++ ++ return true; ++} + + /* + * returns number of pages in the folio that backs the swap entry. If positive, +@@ -138,12 +164,19 @@ static int __try_to_reclaim_swap(struct swap_info_struct *si, + unsigned long offset, unsigned long flags) + { + swp_entry_t entry = swp_entry(si->type, offset); ++ struct address_space *address_space = swap_address_space(entry); ++ struct swap_cluster_info *ci; + struct folio *folio; +- int ret = 0; ++ int ret, nr_pages; ++ bool need_reclaim; + +- folio = filemap_get_folio(swap_address_space(entry), offset); ++ folio = filemap_get_folio(address_space, offset); + if (IS_ERR(folio)) + return 0; ++ ++ nr_pages = folio_nr_pages(folio); ++ ret = -nr_pages; ++ + /* + * When this function is called from scan_swap_map_slots() and it's + * called by vmscan.c at reclaiming folios. So we hold a folio lock +@@ -151,14 +184,54 @@ static int __try_to_reclaim_swap(struct swap_info_struct *si, + * case and you should use folio_free_swap() with explicit folio_lock() + * in usual operations. 
+ */ +- if (folio_trylock(folio)) { +- if ((flags & TTRS_ANYWAY) || +- ((flags & TTRS_UNMAPPED) && !folio_mapped(folio)) || +- ((flags & TTRS_FULL) && mem_cgroup_swap_full(folio))) +- ret = folio_free_swap(folio); +- folio_unlock(folio); ++ if (!folio_trylock(folio)) ++ goto out; ++ ++ /* offset could point to the middle of a large folio */ ++ entry = folio->swap; ++ offset = swp_offset(entry); ++ ++ need_reclaim = ((flags & TTRS_ANYWAY) || ++ ((flags & TTRS_UNMAPPED) && !folio_mapped(folio)) || ++ ((flags & TTRS_FULL) && mem_cgroup_swap_full(folio))); ++ if (!need_reclaim || !folio_swapcache_freeable(folio)) ++ goto out_unlock; ++ ++ /* ++ * It's safe to delete the folio from swap cache only if the folio's ++ * swap_map is HAS_CACHE only, which means the slots have no page table ++ * reference or pending writeback, and can't be allocated to others. ++ */ ++ ci = lock_cluster_or_swap_info(si, offset); ++ need_reclaim = swap_is_has_cache(si, offset, nr_pages); ++ unlock_cluster_or_swap_info(si, ci); ++ if (!need_reclaim) ++ goto out_unlock; ++ ++ if (!(flags & TTRS_DIRECT)) { ++ /* Free through slot cache */ ++ delete_from_swap_cache(folio); ++ folio_set_dirty(folio); ++ ret = nr_pages; ++ goto out_unlock; + } +- ret = ret ? folio_nr_pages(folio) : -folio_nr_pages(folio); ++ ++ xa_lock_irq(&address_space->i_pages); ++ __delete_from_swap_cache(folio, entry, NULL); ++ xa_unlock_irq(&address_space->i_pages); ++ folio_ref_sub(folio, nr_pages); ++ folio_set_dirty(folio); ++ ++ spin_lock(&si->lock); ++ /* Only sinple page folio can be backed by zswap */ ++ if (nr_pages == 1) ++ zswap_invalidate(entry); ++ swap_entry_range_free(si, entry, nr_pages); ++ spin_unlock(&si->lock); ++ ret = nr_pages; ++out_unlock: ++ folio_unlock(folio); ++out: + folio_put(folio); + return ret; + } +@@ -289,62 +362,21 @@ static void discard_swap_cluster(struct swap_info_struct *si, + #endif + #define LATENCY_LIMIT 256 + +-static inline void cluster_set_flag(struct swap_cluster_info *info, +- unsigned int flag) +-{ +- info->flags = flag; +-} +- +-static inline unsigned int cluster_count(struct swap_cluster_info *info) +-{ +- return info->data; +-} +- +-static inline void cluster_set_count(struct swap_cluster_info *info, +- unsigned int c) +-{ +- info->data = c; +-} +- +-static inline void cluster_set_count_flag(struct swap_cluster_info *info, +- unsigned int c, unsigned int f) +-{ +- info->flags = f; +- info->data = c; +-} +- +-static inline unsigned int cluster_next(struct swap_cluster_info *info) +-{ +- return info->data; +-} +- +-static inline void cluster_set_next(struct swap_cluster_info *info, +- unsigned int n) +-{ +- info->data = n; +-} +- +-static inline void cluster_set_next_flag(struct swap_cluster_info *info, +- unsigned int n, unsigned int f) +-{ +- info->flags = f; +- info->data = n; +-} +- + static inline bool cluster_is_free(struct swap_cluster_info *info) + { + return info->flags & CLUSTER_FLAG_FREE; + } + +-static inline bool cluster_is_null(struct swap_cluster_info *info) ++static inline unsigned int cluster_index(struct swap_info_struct *si, ++ struct swap_cluster_info *ci) + { +- return info->flags & CLUSTER_FLAG_NEXT_NULL; ++ return ci - si->cluster_info; + } + +-static inline void cluster_set_null(struct swap_cluster_info *info) ++static inline unsigned int cluster_offset(struct swap_info_struct *si, ++ struct swap_cluster_info *ci) + { +- info->flags = CLUSTER_FLAG_NEXT_NULL; +- info->data = 0; ++ return cluster_index(si, ci) * SWAPFILE_CLUSTER; + } + + static inline struct swap_cluster_info 
*lock_cluster(struct swap_info_struct *si, +@@ -393,65 +425,11 @@ static inline void unlock_cluster_or_swap_info(struct swap_info_struct *si, + spin_unlock(&si->lock); + } + +-static inline bool cluster_list_empty(struct swap_cluster_list *list) +-{ +- return cluster_is_null(&list->head); +-} +- +-static inline unsigned int cluster_list_first(struct swap_cluster_list *list) +-{ +- return cluster_next(&list->head); +-} +- +-static void cluster_list_init(struct swap_cluster_list *list) +-{ +- cluster_set_null(&list->head); +- cluster_set_null(&list->tail); +-} +- +-static void cluster_list_add_tail(struct swap_cluster_list *list, +- struct swap_cluster_info *ci, +- unsigned int idx) +-{ +- if (cluster_list_empty(list)) { +- cluster_set_next_flag(&list->head, idx, 0); +- cluster_set_next_flag(&list->tail, idx, 0); +- } else { +- struct swap_cluster_info *ci_tail; +- unsigned int tail = cluster_next(&list->tail); +- +- /* +- * Nested cluster lock, but both cluster locks are +- * only acquired when we held swap_info_struct->lock +- */ +- ci_tail = ci + tail; +- spin_lock_nested(&ci_tail->lock, SINGLE_DEPTH_NESTING); +- cluster_set_next(ci_tail, idx); +- spin_unlock(&ci_tail->lock); +- cluster_set_next_flag(&list->tail, idx, 0); +- } +-} +- +-static unsigned int cluster_list_del_first(struct swap_cluster_list *list, +- struct swap_cluster_info *ci) +-{ +- unsigned int idx; +- +- idx = cluster_next(&list->head); +- if (cluster_next(&list->tail) == idx) { +- cluster_set_null(&list->head); +- cluster_set_null(&list->tail); +- } else +- cluster_set_next_flag(&list->head, +- cluster_next(&ci[idx]), 0); +- +- return idx; +-} +- + /* Add a cluster to discard list and schedule it to do discard */ + static void swap_cluster_schedule_discard(struct swap_info_struct *si, +- unsigned int idx) ++ struct swap_cluster_info *ci) + { ++ unsigned int idx = cluster_index(si, ci); + /* + * If scan_swap_map_slots() can't find a free cluster, it will check + * si->swap_map directly. 
To make sure the discarding cluster isn't +@@ -461,17 +439,23 @@ static void swap_cluster_schedule_discard(struct swap_info_struct *si, + memset(si->swap_map + idx * SWAPFILE_CLUSTER, + SWAP_MAP_BAD, SWAPFILE_CLUSTER); + +- cluster_list_add_tail(&si->discard_clusters, si->cluster_info, idx); +- ++ VM_BUG_ON(ci->flags & CLUSTER_FLAG_FREE); ++ list_move_tail(&ci->list, &si->discard_clusters); ++ ci->flags = 0; + schedule_work(&si->discard_work); + } + +-static void __free_cluster(struct swap_info_struct *si, unsigned long idx) ++static void __free_cluster(struct swap_info_struct *si, struct swap_cluster_info *ci) + { +- struct swap_cluster_info *ci = si->cluster_info; ++ lockdep_assert_held(&si->lock); ++ lockdep_assert_held(&ci->lock); + +- cluster_set_flag(ci + idx, CLUSTER_FLAG_FREE); +- cluster_list_add_tail(&si->free_clusters, ci, idx); ++ if (ci->flags) ++ list_move_tail(&ci->list, &si->free_clusters); ++ else ++ list_add_tail(&ci->list, &si->free_clusters); ++ ci->flags = CLUSTER_FLAG_FREE; ++ ci->order = 0; + } + + /* +@@ -480,24 +464,24 @@ static void __free_cluster(struct swap_info_struct *si, unsigned long idx) + */ + static void swap_do_scheduled_discard(struct swap_info_struct *si) + { +- struct swap_cluster_info *info, *ci; ++ struct swap_cluster_info *ci; + unsigned int idx; + +- info = si->cluster_info; +- +- while (!cluster_list_empty(&si->discard_clusters)) { +- idx = cluster_list_del_first(&si->discard_clusters, info); ++ while (!list_empty(&si->discard_clusters)) { ++ ci = list_first_entry(&si->discard_clusters, struct swap_cluster_info, list); ++ list_del(&ci->list); ++ idx = cluster_index(si, ci); + spin_unlock(&si->lock); + + discard_swap_cluster(si, idx * SWAPFILE_CLUSTER, + SWAPFILE_CLUSTER); + + spin_lock(&si->lock); +- ci = lock_cluster(si, idx * SWAPFILE_CLUSTER); +- __free_cluster(si, idx); ++ spin_lock(&ci->lock); ++ __free_cluster(si, ci); + memset(si->swap_map + idx * SWAPFILE_CLUSTER, + 0, SWAPFILE_CLUSTER); +- unlock_cluster(ci); ++ spin_unlock(&ci->lock); + } + } + +@@ -520,20 +504,15 @@ static void swap_users_ref_free(struct percpu_ref *ref) + complete(&si->comp); + } + +-static void alloc_cluster(struct swap_info_struct *si, unsigned long idx) ++static void free_cluster(struct swap_info_struct *si, struct swap_cluster_info *ci) + { +- struct swap_cluster_info *ci = si->cluster_info; ++ VM_BUG_ON(ci->count != 0); ++ lockdep_assert_held(&si->lock); ++ lockdep_assert_held(&ci->lock); + +- VM_BUG_ON(cluster_list_first(&si->free_clusters) != idx); +- cluster_list_del_first(&si->free_clusters, ci); +- cluster_set_count_flag(ci + idx, 0, 0); +-} +- +-static void free_cluster(struct swap_info_struct *si, unsigned long idx) +-{ +- struct swap_cluster_info *ci = si->cluster_info + idx; ++ if (ci->flags & CLUSTER_FLAG_FRAG) ++ si->frag_cluster_nr[ci->order]--; + +- VM_BUG_ON(cluster_count(ci) != 0); + /* + * If the swap is discardable, prepare discard the cluster + * instead of free it immediately. The cluster will be freed +@@ -541,160 +520,374 @@ static void free_cluster(struct swap_info_struct *si, unsigned long idx) + */ + if ((si->flags & (SWP_WRITEOK | SWP_PAGE_DISCARD)) == + (SWP_WRITEOK | SWP_PAGE_DISCARD)) { +- swap_cluster_schedule_discard(si, idx); ++ swap_cluster_schedule_discard(si, ci); + return; + } + +- __free_cluster(si, idx); ++ __free_cluster(si, ci); + } + + /* +- * The cluster corresponding to page_nr will be used. The cluster will be +- * removed from free cluster list and its usage counter will be increased by +- * count. 
++ * The cluster corresponding to page_nr will be used. The cluster will not be ++ * added to free cluster list and its usage counter will be increased by 1. ++ * Only used for initialization. + */ +-static void add_cluster_info_page(struct swap_info_struct *p, +- struct swap_cluster_info *cluster_info, unsigned long page_nr, +- unsigned long count) ++static void inc_cluster_info_page(struct swap_info_struct *p, ++ struct swap_cluster_info *cluster_info, unsigned long page_nr) + { + unsigned long idx = page_nr / SWAPFILE_CLUSTER; ++ struct swap_cluster_info *ci; + + if (!cluster_info) + return; +- if (cluster_is_free(&cluster_info[idx])) +- alloc_cluster(p, idx); + +- VM_BUG_ON(cluster_count(&cluster_info[idx]) + count > SWAPFILE_CLUSTER); +- cluster_set_count(&cluster_info[idx], +- cluster_count(&cluster_info[idx]) + count); +-} ++ ci = cluster_info + idx; ++ ci->count++; + +-/* +- * The cluster corresponding to page_nr will be used. The cluster will be +- * removed from free cluster list and its usage counter will be increased by 1. +- */ +-static void inc_cluster_info_page(struct swap_info_struct *p, +- struct swap_cluster_info *cluster_info, unsigned long page_nr) +-{ +- add_cluster_info_page(p, cluster_info, page_nr, 1); ++ VM_BUG_ON(ci->count > SWAPFILE_CLUSTER); ++ VM_BUG_ON(ci->flags); + } + + /* +- * The cluster corresponding to page_nr decreases one usage. If the usage +- * counter becomes 0, which means no page in the cluster is in using, we can +- * optionally discard the cluster and add it to free cluster list. ++ * The cluster ci decreases @nr_pages usage. If the usage counter becomes 0, ++ * which means no page in the cluster is in use, we can optionally discard ++ * the cluster and add it to free cluster list. + */ + static void dec_cluster_info_page(struct swap_info_struct *p, +- struct swap_cluster_info *cluster_info, unsigned long page_nr) ++ struct swap_cluster_info *ci, int nr_pages) + { +- unsigned long idx = page_nr / SWAPFILE_CLUSTER; +- +- if (!cluster_info) ++ if (!p->cluster_info) + return; + +- VM_BUG_ON(cluster_count(&cluster_info[idx]) == 0); +- cluster_set_count(&cluster_info[idx], +- cluster_count(&cluster_info[idx]) - 1); ++ VM_BUG_ON(ci->count < nr_pages); ++ VM_BUG_ON(cluster_is_free(ci)); ++ lockdep_assert_held(&p->lock); ++ lockdep_assert_held(&ci->lock); ++ ci->count -= nr_pages; ++ ++ if (!ci->count) { ++ free_cluster(p, ci); ++ return; ++ } + +- if (cluster_count(&cluster_info[idx]) == 0) +- free_cluster(p, idx); ++ if (!(ci->flags & CLUSTER_FLAG_NONFULL)) { ++ VM_BUG_ON(ci->flags & CLUSTER_FLAG_FREE); ++ if (ci->flags & CLUSTER_FLAG_FRAG) ++ p->frag_cluster_nr[ci->order]--; ++ list_move_tail(&ci->list, &p->nonfull_clusters[ci->order]); ++ ci->flags = CLUSTER_FLAG_NONFULL; ++ } + } + +-/* +- * It's possible scan_swap_map_slots() uses a free cluster in the middle of free +- * cluster list. Avoiding such abuse to avoid list corruption. 
+- */ +-static bool +-scan_swap_map_ssd_cluster_conflict(struct swap_info_struct *si, +- unsigned long offset, int order) ++static bool cluster_reclaim_range(struct swap_info_struct *si, ++ struct swap_cluster_info *ci, ++ unsigned long start, unsigned long end) + { +- struct percpu_cluster *percpu_cluster; +- bool conflict; ++ unsigned char *map = si->swap_map; ++ unsigned long offset; + +- offset /= SWAPFILE_CLUSTER; +- conflict = !cluster_list_empty(&si->free_clusters) && +- offset != cluster_list_first(&si->free_clusters) && +- cluster_is_free(&si->cluster_info[offset]); ++ spin_unlock(&ci->lock); ++ spin_unlock(&si->lock); + +- if (!conflict) +- return false; ++ for (offset = start; offset < end; offset++) { ++ switch (READ_ONCE(map[offset])) { ++ case 0: ++ continue; ++ case SWAP_HAS_CACHE: ++ if (__try_to_reclaim_swap(si, offset, TTRS_ANYWAY | TTRS_DIRECT) > 0) ++ continue; ++ goto out; ++ default: ++ goto out; ++ } ++ } ++out: ++ spin_lock(&si->lock); ++ spin_lock(&ci->lock); ++ ++ /* ++ * Recheck the range no matter reclaim succeeded or not, the slot ++ * could have been be freed while we are not holding the lock. ++ */ ++ for (offset = start; offset < end; offset++) ++ if (READ_ONCE(map[offset])) ++ return false; + +- percpu_cluster = this_cpu_ptr(si->percpu_cluster); +- percpu_cluster->next[order] = SWAP_NEXT_INVALID; + return true; + } + +-static inline bool swap_range_empty(char *swap_map, unsigned int start, +- unsigned int nr_pages) ++static bool cluster_scan_range(struct swap_info_struct *si, ++ struct swap_cluster_info *ci, ++ unsigned long start, unsigned int nr_pages) + { +- unsigned int i; ++ unsigned long offset, end = start + nr_pages; ++ unsigned char *map = si->swap_map; ++ bool need_reclaim = false; + +- for (i = 0; i < nr_pages; i++) { +- if (swap_map[start + i]) ++ for (offset = start; offset < end; offset++) { ++ switch (READ_ONCE(map[offset])) { ++ case 0: ++ continue; ++ case SWAP_HAS_CACHE: ++ if (!vm_swap_full()) ++ return false; ++ need_reclaim = true; ++ continue; ++ default: + return false; ++ } + } + ++ if (need_reclaim) ++ return cluster_reclaim_range(si, ci, start, end); ++ + return true; + } + ++static bool cluster_alloc_range(struct swap_info_struct *si, struct swap_cluster_info *ci, ++ unsigned int start, unsigned char usage, ++ unsigned int order) ++{ ++ unsigned int nr_pages = 1 << order; ++ ++ if (!(si->flags & SWP_WRITEOK)) ++ return false; ++ ++ if (cluster_is_free(ci)) { ++ if (nr_pages < SWAPFILE_CLUSTER) { ++ list_move_tail(&ci->list, &si->nonfull_clusters[order]); ++ ci->flags = CLUSTER_FLAG_NONFULL; ++ } ++ ci->order = order; ++ } ++ ++ memset(si->swap_map + start, usage, nr_pages); ++ swap_range_alloc(si, start, nr_pages); ++ ci->count += nr_pages; ++ ++ if (ci->count == SWAPFILE_CLUSTER) { ++ VM_BUG_ON(!(ci->flags & ++ (CLUSTER_FLAG_FREE | CLUSTER_FLAG_NONFULL | CLUSTER_FLAG_FRAG))); ++ if (ci->flags & CLUSTER_FLAG_FRAG) ++ si->frag_cluster_nr[ci->order]--; ++ list_move_tail(&ci->list, &si->full_clusters); ++ ci->flags = CLUSTER_FLAG_FULL; ++ } ++ ++ return true; ++} ++ ++static unsigned int alloc_swap_scan_cluster(struct swap_info_struct *si, unsigned long offset, ++ unsigned int *foundp, unsigned int order, ++ unsigned char usage) ++{ ++ unsigned long start = offset & ~(SWAPFILE_CLUSTER - 1); ++ unsigned long end = min(start + SWAPFILE_CLUSTER, si->max); ++ unsigned int nr_pages = 1 << order; ++ struct swap_cluster_info *ci; ++ ++ if (end < nr_pages) ++ return SWAP_NEXT_INVALID; ++ end -= nr_pages; ++ ++ ci = lock_cluster(si, offset); 
++ if (ci->count + nr_pages > SWAPFILE_CLUSTER) { ++ offset = SWAP_NEXT_INVALID; ++ goto done; ++ } ++ ++ while (offset <= end) { ++ if (cluster_scan_range(si, ci, offset, nr_pages)) { ++ if (!cluster_alloc_range(si, ci, offset, usage, order)) { ++ offset = SWAP_NEXT_INVALID; ++ goto done; ++ } ++ *foundp = offset; ++ if (ci->count == SWAPFILE_CLUSTER) { ++ offset = SWAP_NEXT_INVALID; ++ goto done; ++ } ++ offset += nr_pages; ++ break; ++ } ++ offset += nr_pages; ++ } ++ if (offset > end) ++ offset = SWAP_NEXT_INVALID; ++done: ++ unlock_cluster(ci); ++ return offset; ++} ++ ++/* Return true if reclaimed a whole cluster */ ++static void swap_reclaim_full_clusters(struct swap_info_struct *si, bool force) ++{ ++ long to_scan = 1; ++ unsigned long offset, end; ++ struct swap_cluster_info *ci; ++ unsigned char *map = si->swap_map; ++ int nr_reclaim; ++ ++ if (force) ++ to_scan = si->inuse_pages / SWAPFILE_CLUSTER; ++ ++ while (!list_empty(&si->full_clusters)) { ++ ci = list_first_entry(&si->full_clusters, struct swap_cluster_info, list); ++ list_move_tail(&ci->list, &si->full_clusters); ++ offset = cluster_offset(si, ci); ++ end = min(si->max, offset + SWAPFILE_CLUSTER); ++ to_scan--; ++ ++ spin_unlock(&si->lock); ++ while (offset < end) { ++ if (READ_ONCE(map[offset]) == SWAP_HAS_CACHE) { ++ nr_reclaim = __try_to_reclaim_swap(si, offset, ++ TTRS_ANYWAY | TTRS_DIRECT); ++ if (nr_reclaim) { ++ offset += abs(nr_reclaim); ++ continue; ++ } ++ } ++ offset++; ++ } ++ spin_lock(&si->lock); ++ ++ if (to_scan <= 0) ++ break; ++ } ++} ++ ++static void swap_reclaim_work(struct work_struct *work) ++{ ++ struct swap_info_struct *si; ++ ++ si = container_of(work, struct swap_info_struct, reclaim_work); ++ ++ spin_lock(&si->lock); ++ swap_reclaim_full_clusters(si, true); ++ spin_unlock(&si->lock); ++} ++ + /* + * Try to get swap entries with specified order from current cpu's swap entry + * pool (a cluster). This might involve allocating a new cluster for current CPU + * too. + */ +-static bool scan_swap_map_try_ssd_cluster(struct swap_info_struct *si, +- unsigned long *offset, unsigned long *scan_base, int order) ++static unsigned long cluster_alloc_swap_entry(struct swap_info_struct *si, int order, ++ unsigned char usage) + { +- unsigned int nr_pages = 1 << order; + struct percpu_cluster *cluster; + struct swap_cluster_info *ci; +- unsigned int tmp, max; ++ unsigned int offset, found = 0; + + new_cluster: ++ lockdep_assert_held(&si->lock); + cluster = this_cpu_ptr(si->percpu_cluster); +- tmp = cluster->next[order]; +- if (tmp == SWAP_NEXT_INVALID) { +- if (!cluster_list_empty(&si->free_clusters)) { +- tmp = cluster_next(&si->free_clusters.head) * +- SWAPFILE_CLUSTER; +- } else if (!cluster_list_empty(&si->discard_clusters)) { +- /* +- * we don't have free cluster but have some clusters in +- * discarding, do discard now and reclaim them, then +- * reread cluster_next_cpu since we dropped si->lock +- */ +- swap_do_scheduled_discard(si); +- *scan_base = this_cpu_read(*si->cluster_next_cpu); +- *offset = *scan_base; +- goto new_cluster; +- } else +- return false; ++ offset = cluster->next[order]; ++ if (offset) { ++ offset = alloc_swap_scan_cluster(si, offset, &found, order, usage); ++ if (found) ++ goto done; + } + +- /* +- * Other CPUs can use our cluster if they can't find a free cluster, +- * check if there is still free entry in the cluster, maintaining +- * natural alignment. 
+- */ +- max = min_t(unsigned long, si->max, ALIGN(tmp + 1, SWAPFILE_CLUSTER)); +- if (tmp < max) { +- ci = lock_cluster(si, tmp); +- while (tmp < max) { +- if (swap_range_empty(si->swap_map, tmp, nr_pages)) ++ if (!list_empty(&si->free_clusters)) { ++ ci = list_first_entry(&si->free_clusters, struct swap_cluster_info, list); ++ offset = alloc_swap_scan_cluster(si, cluster_offset(si, ci), &found, order, usage); ++ /* ++ * Either we didn't touch the cluster due to swapoff, ++ * or the allocation must success. ++ */ ++ VM_BUG_ON((si->flags & SWP_WRITEOK) && !found); ++ goto done; ++ } ++ ++ /* Try reclaim from full clusters if free clusters list is drained */ ++ if (vm_swap_full()) ++ swap_reclaim_full_clusters(si, false); ++ ++ if (order < PMD_ORDER) { ++ unsigned int frags = 0; ++ ++ while (!list_empty(&si->nonfull_clusters[order])) { ++ ci = list_first_entry(&si->nonfull_clusters[order], ++ struct swap_cluster_info, list); ++ list_move_tail(&ci->list, &si->frag_clusters[order]); ++ ci->flags = CLUSTER_FLAG_FRAG; ++ si->frag_cluster_nr[order]++; ++ offset = alloc_swap_scan_cluster(si, cluster_offset(si, ci), ++ &found, order, usage); ++ frags++; ++ if (found) + break; +- tmp += nr_pages; + } +- unlock_cluster(ci); ++ ++ if (!found) { ++ /* ++ * Nonfull clusters are moved to frag tail if we reached ++ * here, count them too, don't over scan the frag list. ++ */ ++ while (frags < si->frag_cluster_nr[order]) { ++ ci = list_first_entry(&si->frag_clusters[order], ++ struct swap_cluster_info, list); ++ /* ++ * Rotate the frag list to iterate, they were all failing ++ * high order allocation or moved here due to per-CPU usage, ++ * this help keeping usable cluster ahead. ++ */ ++ list_move_tail(&ci->list, &si->frag_clusters[order]); ++ offset = alloc_swap_scan_cluster(si, cluster_offset(si, ci), ++ &found, order, usage); ++ frags++; ++ if (found) ++ break; ++ } ++ } + } +- if (tmp >= max) { +- cluster->next[order] = SWAP_NEXT_INVALID; ++ ++ if (found) ++ goto done; ++ ++ if (!list_empty(&si->discard_clusters)) { ++ /* ++ * we don't have free cluster but have some clusters in ++ * discarding, do discard now and reclaim them, then ++ * reread cluster_next_cpu since we dropped si->lock ++ */ ++ swap_do_scheduled_discard(si); + goto new_cluster; + } +- *offset = tmp; +- *scan_base = tmp; +- tmp += nr_pages; +- cluster->next[order] = tmp < max ? tmp : SWAP_NEXT_INVALID; +- return true; ++ ++ if (order) ++ goto done; ++ ++ /* Order 0 stealing from higher order */ ++ for (int o = 1; o < SWAP_NR_ORDERS; o++) { ++ /* ++ * Clusters here have at least one usable slots and can't fail order 0 ++ * allocation, but reclaim may drop si->lock and race with another user. 
++ */ ++ while (!list_empty(&si->frag_clusters[o])) { ++ ci = list_first_entry(&si->frag_clusters[o], ++ struct swap_cluster_info, list); ++ offset = alloc_swap_scan_cluster(si, cluster_offset(si, ci), ++ &found, 0, usage); ++ if (found) ++ goto done; ++ } ++ ++ while (!list_empty(&si->nonfull_clusters[o])) { ++ ci = list_first_entry(&si->nonfull_clusters[o], ++ struct swap_cluster_info, list); ++ offset = alloc_swap_scan_cluster(si, cluster_offset(si, ci), ++ &found, 0, usage); ++ if (found) ++ goto done; ++ } ++ } ++ ++done: ++ cluster->next[order] = offset; ++ return found; + } + + static void __del_from_avail_list(struct swap_info_struct *p) +@@ -727,6 +920,9 @@ static void swap_range_alloc(struct swap_info_struct *si, unsigned long offset, + si->lowest_bit = si->max; + si->highest_bit = 0; + del_from_avail_list(si); ++ ++ if (si->cluster_info && vm_swap_full()) ++ schedule_work(&si->reclaim_work); + } + } + +@@ -765,7 +961,6 @@ static void swap_range_free(struct swap_info_struct *si, unsigned long offset, + swap_slot_free_notify = NULL; + while (offset <= end) { + arch_swap_invalidate_page(si->type, offset); +- zswap_invalidate(si->type, offset); + if (swap_slot_free_notify) + swap_slot_free_notify(si->bdev, offset); + offset++; +@@ -816,11 +1011,33 @@ static bool swap_offset_available_and_locked(struct swap_info_struct *si, + return false; + } + ++static int cluster_alloc_swap(struct swap_info_struct *si, ++ unsigned char usage, int nr, ++ swp_entry_t slots[], int order) ++{ ++ int n_ret = 0; ++ ++ VM_BUG_ON(!si->cluster_info); ++ ++ si->flags += SWP_SCANNING; ++ ++ while (n_ret < nr) { ++ unsigned long offset = cluster_alloc_swap_entry(si, order, usage); ++ ++ if (!offset) ++ break; ++ slots[n_ret++] = swp_entry(si->type, offset); ++ } ++ ++ si->flags -= SWP_SCANNING; ++ ++ return n_ret; ++} ++ + static int scan_swap_map_slots(struct swap_info_struct *si, + unsigned char usage, int nr, + swp_entry_t slots[], int order) + { +- struct swap_cluster_info *ci; + unsigned long offset; + unsigned long scan_base; + unsigned long last_in_cluster = 0; +@@ -859,26 +1076,16 @@ static int scan_swap_map_slots(struct swap_info_struct *si, + return 0; + } + ++ if (si->cluster_info) ++ return cluster_alloc_swap(si, usage, nr, slots, order); ++ + si->flags += SWP_SCANNING; +- /* +- * Use percpu scan base for SSD to reduce lock contention on +- * cluster and swap cache. For HDD, sequential access is more +- * important. +- */ +- if (si->flags & SWP_SOLIDSTATE) +- scan_base = this_cpu_read(*si->cluster_next_cpu); +- else +- scan_base = si->cluster_next; ++ ++ /* For HDD, sequential access is more important. */ ++ scan_base = si->cluster_next; + offset = scan_base; + +- /* SSD algorithm */ +- if (si->cluster_info) { +- if (!scan_swap_map_try_ssd_cluster(si, &offset, &scan_base, order)) { +- if (order > 0) +- goto no_page; +- goto scan; +- } +- } else if (unlikely(!si->cluster_nr--)) { ++ if (unlikely(!si->cluster_nr--)) { + if (si->pages - si->inuse_pages < SWAPFILE_CLUSTER) { + si->cluster_nr = SWAPFILE_CLUSTER - 1; + goto checks; +@@ -889,8 +1096,6 @@ static int scan_swap_map_slots(struct swap_info_struct *si, + /* + * If seek is expensive, start searching for new cluster from + * start of partition, to minimize the span of allocated swap. +- * If seek is cheap, that is the SWP_SOLIDSTATE si->cluster_info +- * case, just handled by scan_swap_map_try_ssd_cluster() above. 
+ */ + scan_base = offset = si->lowest_bit; + last_in_cluster = offset + SWAPFILE_CLUSTER - 1; +@@ -918,19 +1123,6 @@ static int scan_swap_map_slots(struct swap_info_struct *si, + } + + checks: +- if (si->cluster_info) { +- while (scan_swap_map_ssd_cluster_conflict(si, offset, order)) { +- /* take a break if we already got some slots */ +- if (n_ret) +- goto done; +- if (!scan_swap_map_try_ssd_cluster(si, &offset, +- &scan_base, order)) { +- if (order > 0) +- goto no_page; +- goto scan; +- } +- } +- } + if (!(si->flags & SWP_WRITEOK)) + goto no_page; + if (!si->highest_bit) +@@ -938,13 +1130,11 @@ static int scan_swap_map_slots(struct swap_info_struct *si, + if (offset > si->highest_bit) + scan_base = offset = si->lowest_bit; + +- ci = lock_cluster(si, offset); + /* reuse swap entry of cache-only swap if not busy. */ + if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) { + int swap_was_freed; +- unlock_cluster(ci); + spin_unlock(&si->lock); +- swap_was_freed = __try_to_reclaim_swap(si, offset, TTRS_ANYWAY); ++ swap_was_freed = __try_to_reclaim_swap(si, offset, TTRS_ANYWAY | TTRS_DIRECT); + spin_lock(&si->lock); + /* entry was freed successfully, try to use this again */ + if (swap_was_freed > 0) +@@ -953,15 +1143,12 @@ static int scan_swap_map_slots(struct swap_info_struct *si, + } + + if (si->swap_map[offset]) { +- unlock_cluster(ci); + if (!n_ret) + goto scan; + else + goto done; + } + memset(si->swap_map + offset, usage, nr_pages); +- add_cluster_info_page(si, si->cluster_info, offset, nr_pages); +- unlock_cluster(ci); + + swap_range_alloc(si, offset, nr_pages); + slots[n_ret++] = swp_entry(si->type, offset); +@@ -982,13 +1169,7 @@ static int scan_swap_map_slots(struct swap_info_struct *si, + latency_ration = LATENCY_LIMIT; + } + +- /* try to get more slots in cluster */ +- if (si->cluster_info) { +- if (scan_swap_map_try_ssd_cluster(si, &offset, &scan_base, order)) +- goto checks; +- if (order > 0) +- goto done; +- } else if (si->cluster_nr && !si->swap_map[++offset]) { ++ if (si->cluster_nr && !si->swap_map[++offset]) { + /* non-ssd case, still more slots in cluster? */ + --si->cluster_nr; + goto checks; +@@ -1049,19 +1230,6 @@ static int scan_swap_map_slots(struct swap_info_struct *si, + return n_ret; + } + +-static void swap_free_cluster(struct swap_info_struct *si, unsigned long idx) +-{ +- unsigned long offset = idx * SWAPFILE_CLUSTER; +- struct swap_cluster_info *ci; +- +- ci = lock_cluster(si, offset); +- memset(si->swap_map + offset, 0, SWAPFILE_CLUSTER); +- cluster_set_count_flag(ci, 0, 0); +- free_cluster(si, idx); +- unlock_cluster(ci); +- swap_range_free(si, offset, SWAPFILE_CLUSTER); +-} +- + #ifdef CONFIG_MEMCG_SWAP_QOS + int write_swapfile_for_memcg(struct address_space *mapping, int *swap_type) + { +@@ -1409,21 +1577,28 @@ static unsigned char __swap_entry_free(struct swap_info_struct *p, + return usage; + } + +-static void swap_entry_free(struct swap_info_struct *p, swp_entry_t entry) ++/* ++ * Drop the last HAS_CACHE flag of swap entries, caller have to ++ * ensure all entries belong to the same cgroup. 
++ */ ++static void swap_entry_range_free(struct swap_info_struct *p, swp_entry_t entry, ++ unsigned int nr_pages) + { +- struct swap_cluster_info *ci; + unsigned long offset = swp_offset(entry); +- unsigned char count; ++ unsigned char *map = p->swap_map + offset; ++ unsigned char *map_end = map + nr_pages; ++ struct swap_cluster_info *ci; + + ci = lock_cluster(p, offset); +- count = p->swap_map[offset]; +- VM_BUG_ON(count != SWAP_HAS_CACHE); +- p->swap_map[offset] = 0; +- dec_cluster_info_page(p, p->cluster_info, offset); ++ do { ++ VM_BUG_ON(*map != SWAP_HAS_CACHE); ++ *map = 0; ++ } while (++map < map_end); ++ dec_cluster_info_page(p, ci, nr_pages); + unlock_cluster(ci); + +- mem_cgroup_uncharge_swap(entry, 1); +- swap_range_free(p, offset, 1); ++ mem_cgroup_uncharge_swap(entry, nr_pages); ++ swap_range_free(p, offset, nr_pages); + } + + static void cluster_swap_free_nr(struct swap_info_struct *sis, +@@ -1484,12 +1659,8 @@ void swap_free_nr(swp_entry_t entry, int nr_pages) + void put_swap_folio(struct folio *folio, swp_entry_t entry) + { + unsigned long offset = swp_offset(entry); +- unsigned long idx = offset / SWAPFILE_CLUSTER; + struct swap_cluster_info *ci; + struct swap_info_struct *si; +- unsigned char *map; +- unsigned int i, free_entries = 0; +- unsigned char val; + int size = 1 << swap_entry_order(folio_order(folio)); + + si = _swap_info_get(entry); +@@ -1497,24 +1668,14 @@ void put_swap_folio(struct folio *folio, swp_entry_t entry) + return; + + ci = lock_cluster_or_swap_info(si, offset); +- if (size == SWAPFILE_CLUSTER) { +- map = si->swap_map + offset; +- for (i = 0; i < SWAPFILE_CLUSTER; i++) { +- val = map[i]; +- VM_BUG_ON(!(val & SWAP_HAS_CACHE)); +- if (val == SWAP_HAS_CACHE) +- free_entries++; +- } +- if (free_entries == SWAPFILE_CLUSTER) { +- unlock_cluster_or_swap_info(si, ci); +- spin_lock(&si->lock); +- mem_cgroup_uncharge_swap(entry, SWAPFILE_CLUSTER); +- swap_free_cluster(si, idx); +- spin_unlock(&si->lock); +- return; +- } ++ if (size > 1 && swap_is_has_cache(si, offset, size)) { ++ unlock_cluster_or_swap_info(si, ci); ++ spin_lock(&si->lock); ++ swap_entry_range_free(si, entry, size); ++ spin_unlock(&si->lock); ++ return; + } +- for (i = 0; i < size; i++, entry.val++) { ++ for (int i = 0; i < size; i++, entry.val++) { + if (!__swap_entry_free_locked(si, offset + i, SWAP_HAS_CACHE)) { + unlock_cluster_or_swap_info(si, ci); + free_swap_slot(entry); +@@ -1554,7 +1715,7 @@ void swapcache_free_entries(swp_entry_t *entries, int n) + for (i = 0; i < n; ++i) { + p = swap_info_get_cont(entries[i], prev); + if (p) +- swap_entry_free(p, entries[i]); ++ swap_entry_range_free(p, entries[i], 1); + prev = p; + } + if (p) +@@ -1674,16 +1835,7 @@ static bool folio_swapped(struct folio *folio) + return swap_page_trans_huge_swapped(si, entry, folio_order(folio)); + } + +-/** +- * folio_free_swap() - Free the swap space used for this folio. +- * @folio: The folio to remove. +- * +- * If swap is getting full, or if there are no more mappings of this folio, +- * then call folio_free_swap to free its swap space. +- * +- * Return: true if we were able to release the swap space. 
+- */ +-bool folio_free_swap(struct folio *folio) ++static bool folio_swapcache_freeable(struct folio *folio) + { + VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); + +@@ -1691,8 +1843,6 @@ bool folio_free_swap(struct folio *folio) + return false; + if (folio_test_writeback(folio)) + return false; +- if (folio_swapped(folio)) +- return false; + + /* + * Once hibernation has begun to create its image of memory, +@@ -1712,6 +1862,25 @@ bool folio_free_swap(struct folio *folio) + if (pm_suspended_storage()) + return false; + ++ return true; ++} ++ ++/** ++ * folio_free_swap() - Free the swap space used for this folio. ++ * @folio: The folio to remove. ++ * ++ * If swap is getting full, or if there are no more mappings of this folio, ++ * then call folio_free_swap to free its swap space. ++ * ++ * Return: true if we were able to release the swap space. ++ */ ++bool folio_free_swap(struct folio *folio) ++{ ++ if (!folio_swapcache_freeable(folio)) ++ return false; ++ if (folio_swapped(folio)) ++ return false; ++ + delete_from_swap_cache(folio); + folio_set_dirty(folio); + return true; +@@ -1788,7 +1957,7 @@ void free_swap_and_cache_nr(swp_entry_t entry, int nr) + * to the next boundary. + */ + nr = __try_to_reclaim_swap(si, offset, +- TTRS_UNMAPPED | TTRS_FULL); ++ TTRS_UNMAPPED | TTRS_FULL); + if (nr == 0) + nr = 1; + else if (nr < 0) +@@ -2686,6 +2855,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) + wait_for_completion(&p->comp); + + flush_work(&p->discard_work); ++ flush_work(&p->reclaim_work); + + destroy_swap_extents(p); + if (p->flags & SWP_CONTINUED) +@@ -3114,8 +3284,15 @@ static int setup_swap_map_and_extents(struct swap_info_struct *p, + + nr_good_pages = maxpages - 1; /* omit header page */ + +- cluster_list_init(&p->free_clusters); +- cluster_list_init(&p->discard_clusters); ++ INIT_LIST_HEAD(&p->free_clusters); ++ INIT_LIST_HEAD(&p->full_clusters); ++ INIT_LIST_HEAD(&p->discard_clusters); ++ ++ for (i = 0; i < SWAP_NR_ORDERS; i++) { ++ INIT_LIST_HEAD(&p->nonfull_clusters[i]); ++ INIT_LIST_HEAD(&p->frag_clusters[i]); ++ p->frag_cluster_nr[i] = 0; ++ } + + for (i = 0; i < swap_header->info.nr_badpages; i++) { + unsigned int page_nr = swap_header->info.badpages[i]; +@@ -3158,7 +3335,6 @@ static int setup_swap_map_and_extents(struct swap_info_struct *p, + if (!cluster_info) + return nr_extents; + +- + /* + * Reduce false cache line sharing between cluster_info and + * sharing same address space. 
+@@ -3166,14 +3342,18 @@ static int setup_swap_map_and_extents(struct swap_info_struct *p, + for (k = 0; k < SWAP_CLUSTER_COLS; k++) { + j = (k + col) % SWAP_CLUSTER_COLS; + for (i = 0; i < DIV_ROUND_UP(nr_clusters, SWAP_CLUSTER_COLS); i++) { ++ struct swap_cluster_info *ci; + idx = i * SWAP_CLUSTER_COLS + j; ++ ci = cluster_info + idx; + if (idx >= nr_clusters) + continue; +- if (cluster_count(&cluster_info[idx])) ++ if (ci->count) { ++ ci->flags = CLUSTER_FLAG_NONFULL; ++ list_add_tail(&ci->list, &p->nonfull_clusters[0]); + continue; +- cluster_set_flag(&cluster_info[idx], CLUSTER_FLAG_FREE); +- cluster_list_add_tail(&p->free_clusters, cluster_info, +- idx); ++ } ++ ci->flags = CLUSTER_FLAG_FREE; ++ list_add_tail(&ci->list, &p->free_clusters); + } + } + return nr_extents; +@@ -3212,6 +3392,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) + return PTR_ERR(p); + + INIT_WORK(&p->discard_work, swap_discard_work); ++ INIT_WORK(&p->reclaim_work, swap_reclaim_work); + + name = getname(specialfile); + if (IS_ERR(name)) { +diff --git a/mm/zswap.c b/mm/zswap.c +index 69681b9173fd..5acda5b906bc 100644 +--- a/mm/zswap.c ++++ b/mm/zswap.c +@@ -1482,9 +1482,10 @@ bool zswap_load(struct folio *folio) + return ret; + } + +-void zswap_invalidate(int type, pgoff_t offset) ++void zswap_invalidate(swp_entry_t swp) + { +- struct zswap_tree *tree = zswap_trees[type]; ++ pgoff_t offset = swp_offset(swp); ++ struct zswap_tree *tree = zswap_trees[swp_type(swp)]; + struct zswap_entry *entry; + + /* find */ +diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c +index ca8cc0988b61..bd032ac2376e 100644 +--- a/net/ipv4/inet_connection_sock.c ++++ b/net/ipv4/inet_connection_sock.c +@@ -1124,7 +1124,7 @@ static void reqsk_timer_handler(struct timer_list *t) + + drop: + __inet_csk_reqsk_queue_drop(sk_listener, oreq, true); +- reqsk_put(req); ++ reqsk_put(oreq); + } + + static bool reqsk_queue_hash_req(struct request_sock *req, +diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h +index c112c6f7c766..9b302242be6c 100644 +--- a/tools/include/uapi/linux/bpf.h ++++ b/tools/include/uapi/linux/bpf.h +@@ -6576,6 +6576,15 @@ struct bpf_link_info { + __u64 config; + __u32 type; + } event; /* BPF_PERF_EVENT_EVENT */ ++ struct { ++ __u64:64; ++ __u32:32; ++ __u32:32; ++ __u64:64; ++ __u64:64; ++ __u64:64; ++ __u64:64; ++ } kabi_reserve; + }; + } perf_event; + struct { diff --git a/kernel.spec b/kernel.spec index 884d5480d44cdbbbe954315eb3b1ef5ac30ee3c9..2fd5f70e4d2334fbcc245ef00a90aeccce2c1749 100644 --- a/kernel.spec +++ b/kernel.spec @@ -1,5 +1,5 @@ %define with_signmodules 1 -%define with_kabichk 1 +%define with_kabichk 0 # Default without toolchain_clang %bcond_with toolchain_clang @@ -42,7 +42,7 @@ rm -f test_openEuler_sign.ko test_openEuler_sign.ko.sig %global upstream_sublevel 0 %global devel_release 68 %global maintenance_release .0.0 -%global pkg_release .73 +%global pkg_release .74 %global openeuler_lts 1 %global openeuler_major 2403 @@ -128,6 +128,7 @@ Patch0001: 0001-riscv-kernel.patch Patch0002: 0002-cpupower-clang-compile-support.patch Patch0003: 0003-x86_energy_perf_policy-clang-compile-support.patch Patch0004: 0004-turbostat-clang-compile-support.patch +Patch0006: 0006-kabi_test.patch #BuildRequires: BuildRequires: module-init-tools, patch >= 2.5.4, bash >= 2.03, tar @@ -369,6 +370,8 @@ Applypatches series.conf %{_builddir}/kernel-%{version}/linux-%{KernelVer} %patch0004 -p1 %endif +%patch0006 -p1 + find . 
\( -name "*.orig" -o -name "*~" \) -exec rm -f {} \; >/dev/null find . -name .gitignore -exec rm -f {} \; >/dev/null @@ -1089,6 +1092,9 @@ fi %endif %changelog +* Wed Dec 18 2024 Zheng Zengkai - 6.6.0-68.0.0.74 +- kabi: add kabi test patch for performance testing + * Tue Dec 17 2024 Xie XiuQi - 6.6.0-68.0.0.73 - kabi: add kabi_ext1 list for checking - check-kabi: fix kabi check failed when no namespace
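Reviewer note on the kABI padding exercised above: the kabi_test patch grows several exported structs only through KABI_RESERVE() placeholder slots while kABI checking is temporarily switched off in kernel.spec. The sketch below illustrates, under simplified assumed macro definitions (the real macros ship in the kernel tree, typically include/linux/kabi.h, and carry extra size checks), how a reserved slot can later be repurposed with KABI_USE() without changing structure size or member offsets; struct example_v1/example_v2 and the cookie field are hypothetical names used only for this illustration.

/*
 * Minimal sketch of the reserve/use padding pattern, with assumed
 * simplified macro definitions (not the kernel's real kabi.h).
 */
#include <stdint.h>
#include <stddef.h>
#include <assert.h>

/* Assumption: each reserve slot is an unused 64-bit placeholder member. */
#define KABI_RESERVE(n)   uint64_t kabi_reserved##n;
/* Repurposing a slot overlays the new member on the old placeholder. */
#define KABI_USE(n, _new) union { _new; uint64_t kabi_reserved##n; };

struct example_v1 {          /* shipped ABI: two spare slots */
	int state;
	KABI_RESERVE(1)
	KABI_RESERVE(2)
};

struct example_v2 {          /* later update: slot 1 now carries a cookie */
	int state;
	KABI_USE(1, uint32_t cookie)
	KABI_RESERVE(2)
};

int main(void)
{
	/* Size and remaining offsets are unchanged, so modules built
	 * against the v1 layout still see a compatible structure. */
	assert(sizeof(struct example_v1) == sizeof(struct example_v2));
	assert(offsetof(struct example_v1, kabi_reserved2) ==
	       offsetof(struct example_v2, kabi_reserved2));
	return 0;
}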