From 81205c6327f37b1617d25605abcf2ce510368b12 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 22 May 2024 16:01:55 +0800 Subject: [PATCH 01/37] mm: make folios_put() the basis of release_pages() mainline inclusion from mainline-v6.9-rc1 commit 99fbb6bfc16f202adc411ad5d353db214750d121 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I9R3AY CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=99fbb6bfc16f202adc411ad5d353db214750d121 -------------------------------- Patch series "Rearrange batched folio freeing", v3. Other than the obvious "remove calls to compound_head" changes, the fundamental belief here is that iterating a linked list is much slower than iterating an array (5-15x slower in my testing). There's also an associated belief that since we iterate the batch of folios three times, we do better when the array is small (ie 15 entries) than we do with a batch that is hundreds of entries long, which only gives us the opportunity for the first pages to fall out of cache by the time we get to the end. It is possible we should increase the size of folio_batch. Hopefully the bots let us know if this introduces any performance regressions. This patch (of 3): By making release_pages() call folios_put(), we can get rid of the calls to compound_head() for the callers that already know they have folios. We can also get rid of the lock_batch tracking as we know the size of the batch is limited by folio_batch. This does reduce the maximum number of pages for which the lruvec lock is held, from SWAP_CLUSTER_MAX (32) to PAGEVEC_SIZE (15). I do not expect this to make a significant difference, but if it does, we can increase PAGEVEC_SIZE to 31. Link: https://lkml.kernel.org/r/20240227174254.710559-1-willy@infradead.org Link: https://lkml.kernel.org/r/20240227174254.710559-2-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Cc: David Hildenbrand Cc: Mel Gorman Cc: Ryan Roberts Signed-off-by: Andrew Morton Signed-off-by: Liu Shixin --- include/linux/mm.h | 16 +++++--- mm/mlock.c | 3 +- mm/swap.c | 100 ++++++++++++++++++++++++++------------------- 3 files changed, 70 insertions(+), 49 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 49f4fac2dcf7..ca70fe2405c7 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -37,6 +37,7 @@ struct anon_vma; struct anon_vma_chain; struct user_struct; struct pt_regs; +struct folio_batch; extern int sysctl_page_lock_unfairness; @@ -1532,6 +1533,8 @@ static inline void folio_put_refs(struct folio *folio, int refs) __folio_put(folio); } +void folios_put_refs(struct folio_batch *folios, unsigned int *refs); + /* * union release_pages_arg - an array of pages or folios * @@ -1554,18 +1557,19 @@ void release_pages(release_pages_arg, int nr); /** * folios_put - Decrement the reference count on an array of folios. * @folios: The folios. - * @nr: How many folios there are. * - * Like folio_put(), but for an array of folios. This is more efficient - * than writing the loop yourself as it will optimise the locks which - * need to be taken if the folios are freed. + * Like folio_put(), but for a batch of folios. This is more efficient + * than writing the loop yourself as it will optimise the locks which need + * to be taken if the folios are freed. The folios batch is returned + * empty and ready to be reused for another batch; there is no need to + * reinitialise it. * * Context: May be called in process or interrupt context, but not in NMI * context. 
May be called while holding a spinlock. */ -static inline void folios_put(struct folio **folios, unsigned int nr) +static inline void folios_put(struct folio_batch *folios) { - release_pages(folios, nr); + folios_put_refs(folios, NULL); } static inline void put_page(struct page *page) diff --git a/mm/mlock.c b/mm/mlock.c index f79d8262c1a0..d0b06ea3b721 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -206,8 +206,7 @@ static void mlock_folio_batch(struct folio_batch *fbatch) if (lruvec) unlock_page_lruvec_irq(lruvec); - folios_put(fbatch->folios, folio_batch_count(fbatch)); - folio_batch_reinit(fbatch); + folios_put(fbatch); } void mlock_drain_local(void) diff --git a/mm/swap.c b/mm/swap.c index e5380d732c0d..3d51f8c72017 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -89,7 +89,7 @@ static void __page_cache_release(struct folio *folio) __folio_clear_lru_flags(folio); unlock_page_lruvec_irqrestore(lruvec, flags); } - /* See comment on folio_test_mlocked in release_pages() */ + /* See comment on folio_test_mlocked in folios_put() */ if (unlikely(folio_test_mlocked(folio))) { long nr_pages = folio_nr_pages(folio); @@ -175,7 +175,7 @@ static void lru_add_fn(struct lruvec *lruvec, struct folio *folio) * while the LRU lock is held. * * (That is not true of __page_cache_release(), and not necessarily - * true of release_pages(): but those only clear the mlocked flag after + * true of folios_put(): but those only clear the mlocked flag after * folio_put_testzero() has excluded any other users of the folio.) */ if (folio_evictable(folio)) { @@ -221,8 +221,7 @@ static void folio_batch_move_lru(struct folio_batch *fbatch, move_fn_t move_fn) if (lruvec) unlock_page_lruvec_irqrestore(lruvec, flags); - folios_put(fbatch->folios, folio_batch_count(fbatch)); - folio_batch_reinit(fbatch); + folios_put(fbatch); } static void folio_batch_add_and_move(struct folio_batch *fbatch, @@ -946,47 +945,30 @@ void lru_cache_disable(void) } /** - * release_pages - batched put_page() - * @arg: array of pages to release - * @nr: number of pages + * folios_put_refs - Reduce the reference count on a batch of folios. + * @folios: The folios. + * @refs: The number of refs to subtract from each folio. * - * Decrement the reference count on all the pages in @arg. If it - * fell to zero, remove the page from the LRU and free it. + * Like folio_put(), but for a batch of folios. This is more efficient + * than writing the loop yourself as it will optimise the locks which need + * to be taken if the folios are freed. The folios batch is returned + * empty and ready to be reused for another batch; there is no need + * to reinitialise it. If @refs is NULL, we subtract one from each + * folio refcount. * - * Note that the argument can be an array of pages, encoded pages, - * or folio pointers. We ignore any encoded bits, and turn any of - * them into just a folio that gets free'd. + * Context: May be called in process or interrupt context, but not in NMI + * context. May be called while holding a spinlock. */ -void release_pages(release_pages_arg arg, int nr) +void folios_put_refs(struct folio_batch *folios, unsigned int *refs) { int i; - struct encoded_page **encoded = arg.encoded_pages; LIST_HEAD(pages_to_free); struct lruvec *lruvec = NULL; unsigned long flags = 0; - unsigned int lock_batch; - for (i = 0; i < nr; i++) { - unsigned int nr_refs = 1; - struct folio *folio; - - /* Turn any of the argument types into a folio */ - folio = page_folio(encoded_page_ptr(encoded[i])); - - /* Is our next entry actually "nr_pages" -> "nr_refs" ? 
*/ - if (unlikely(encoded_page_flags(encoded[i]) & - ENCODED_PAGE_BIT_NR_PAGES_NEXT)) - nr_refs = encoded_nr_pages(encoded[++i]); - - /* - * Make sure the IRQ-safe lock-holding time does not get - * excessive with a continuous string of pages from the - * same lruvec. The lock is held only if lruvec != NULL. - */ - if (lruvec && ++lock_batch == SWAP_CLUSTER_MAX) { - unlock_page_lruvec_irqrestore(lruvec, flags); - lruvec = NULL; - } + for (i = 0; i < folios->nr; i++) { + struct folio *folio = folios->folios[i]; + unsigned int nr_refs = refs ? refs[i] : 1; if (is_huge_zero_page(&folio->page)) continue; @@ -1016,13 +998,8 @@ void release_pages(release_pages_arg arg, int nr) } if (folio_test_lru(folio)) { - struct lruvec *prev_lruvec = lruvec; - lruvec = folio_lruvec_relock_irqsave(folio, lruvec, &flags); - if (prev_lruvec != lruvec) - lock_batch = 0; - lruvec_del_folio(lruvec, folio); __folio_clear_lru_flags(folio); } @@ -1046,6 +1023,47 @@ void release_pages(release_pages_arg arg, int nr) mem_cgroup_uncharge_list(&pages_to_free); free_unref_page_list(&pages_to_free); + folio_batch_reinit(folios); +} +EXPORT_SYMBOL(folios_put_refs); + +/** + * release_pages - batched put_page() + * @arg: array of pages to release + * @nr: number of pages + * + * Decrement the reference count on all the pages in @arg. If it + * fell to zero, remove the page from the LRU and free it. + * + * Note that the argument can be an array of pages, encoded pages, + * or folio pointers. We ignore any encoded bits, and turn any of + * them into just a folio that gets free'd. + */ +void release_pages(release_pages_arg arg, int nr) +{ + struct folio_batch fbatch; + int refs[PAGEVEC_SIZE]; + struct encoded_page **encoded = arg.encoded_pages; + int i; + + folio_batch_init(&fbatch); + for (i = 0; i < nr; i++) { + /* Turn any of the argument types into a folio */ + struct folio *folio = page_folio(encoded_page_ptr(encoded[i])); + + /* Is our next entry actually "nr_pages" -> "nr_refs" ? */ + refs[fbatch.nr] = 1; + if (unlikely(encoded_page_flags(encoded[i]) & + ENCODED_PAGE_BIT_NR_PAGES_NEXT)) + refs[fbatch.nr] = encoded_nr_pages(encoded[++i]); + + if (folio_batch_add(&fbatch, folio) > 0) + continue; + folios_put_refs(&fbatch, refs); + } + + if (fbatch.nr) + folios_put_refs(&fbatch, refs); } EXPORT_SYMBOL(release_pages); -- Gitee From d2a97c8eaf3c3c6d5b544b6614503424af621452 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 22 May 2024 16:01:56 +0800 Subject: [PATCH 02/37] mm: convert free_unref_page_list() to use folios mainline inclusion from mainline-v6.9-rc1 commit 7c76d92253dbb7c53ba03a4cd6639113cd1f7d3a category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I9R3AY CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=7c76d92253dbb7c53ba03a4cd6639113cd1f7d3a -------------------------------- Most of its callees are not yet ready to accept a folio, but we know all of the pages passed in are actually folios because they're linked through ->lru. Link: https://lkml.kernel.org/r/20240227174254.710559-3-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Ryan Roberts Cc: David Hildenbrand Cc: Mel Gorman Signed-off-by: Andrew Morton Conflicts: mm/page_alloc.c [ Context conflicts with dynamic pool. 
] Signed-off-by: Liu Shixin --- mm/page_alloc.c | 44 +++++++++++++++++++++++--------------------- 1 file changed, 23 insertions(+), 21 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index f225f412e71d..27f13b34760a 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2559,24 +2559,24 @@ void free_unref_page(struct page *page, unsigned int order) void free_unref_page_list(struct list_head *list) { unsigned long __maybe_unused UP_flags; - struct page *page, *next; + struct folio *folio, *next; struct per_cpu_pages *pcp = NULL; struct zone *locked_zone = NULL; int batch_count = 0; int migratetype; /* Prepare pages for freeing */ - list_for_each_entry_safe(page, next, list, lru) { - unsigned long pfn = page_to_pfn(page); + list_for_each_entry_safe(folio, next, list, lru) { + unsigned long pfn = folio_pfn(folio); - if (page_from_dynamic_pool(page)) { - list_del(&page->lru); - dynamic_pool_free_page(page); + if (page_from_dynamic_pool(&folio->page)) { + list_del(&folio->lru); + dynamic_pool_free_page(&folio->page); continue; } - if (!free_unref_page_prepare(page, pfn, 0)) { - list_del(&page->lru); + if (!free_unref_page_prepare(&folio->page, pfn, 0)) { + list_del(&folio->lru); continue; } @@ -2584,24 +2584,25 @@ void free_unref_page_list(struct list_head *list) * Free isolated pages directly to the allocator, see * comment in free_unref_page. */ - migratetype = get_pcppage_migratetype(page); + migratetype = get_pcppage_migratetype(&folio->page); if (unlikely(is_migrate_isolate(migratetype))) { - list_del(&page->lru); - free_one_page(page_zone(page), page, pfn, 0, migratetype, FPI_NONE); + list_del(&folio->lru); + free_one_page(folio_zone(folio), &folio->page, pfn, + 0, migratetype, FPI_NONE); continue; } } - list_for_each_entry_safe(page, next, list, lru) { - struct zone *zone = page_zone(page); + list_for_each_entry_safe(folio, next, list, lru) { + struct zone *zone = folio_zone(folio); - list_del(&page->lru); - migratetype = get_pcppage_migratetype(page); + list_del(&folio->lru); + migratetype = get_pcppage_migratetype(&folio->page); /* * Either different zone requiring a different pcp lock or * excessive lock hold times when freeing a large list of - * pages. + * folios. */ if (zone != locked_zone || batch_count == SWAP_CLUSTER_MAX) { if (pcp) { @@ -2612,15 +2613,16 @@ void free_unref_page_list(struct list_head *list) batch_count = 0; /* - * trylock is necessary as pages may be getting freed + * trylock is necessary as folios may be getting freed * from IRQ or SoftIRQ context after an IO completion. 
*/ pcp_trylock_prepare(UP_flags); pcp = pcp_spin_trylock(zone->per_cpu_pageset); if (unlikely(!pcp)) { pcp_trylock_finish(UP_flags); - free_one_page(zone, page, page_to_pfn(page), - 0, migratetype, FPI_NONE); + free_one_page(zone, &folio->page, + folio_pfn(folio), 0, + migratetype, FPI_NONE); locked_zone = NULL; continue; } @@ -2634,8 +2636,8 @@ void free_unref_page_list(struct list_head *list) if (unlikely(migratetype >= MIGRATE_PCPTYPES)) migratetype = MIGRATE_MOVABLE; - trace_mm_page_free_batched(page); - free_unref_page_commit(zone, pcp, page, migratetype, 0); + trace_mm_page_free_batched(&folio->page); + free_unref_page_commit(zone, pcp, &folio->page, migratetype, 0); batch_count++; } -- Gitee From 6f71b4ef4f100f093a22432e5d152c1308f3a07f Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 22 May 2024 16:01:57 +0800 Subject: [PATCH 03/37] mm: add free_unref_folios() mainline inclusion from mainline-v6.9-rc1 commit 90491d87dd46a4c843dae775b9e72c91624c5a7b category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I9R3AY CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=90491d87dd46a4c843dae775b9e72c91624c5a7b -------------------------------- Iterate over a folio_batch rather than a linked list. This is easier for the CPU to prefetch and has a batch count naturally built in so we don't need to track it. Again, this lowers the maximum lock hold time from 32 folios to 15, but I do not expect this to have a significant effect. Link: https://lkml.kernel.org/r/20240227174254.710559-4-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Cc: David Hildenbrand Cc: Mel Gorman Cc: Ryan Roberts Signed-off-by: Andrew Morton Conflicts: mm/page_alloc.c [ Conflicts with dynamic pool, fix the code in free_unref_page_list(). 
] Signed-off-by: Liu Shixin --- mm/internal.h | 5 +++-- mm/page_alloc.c | 60 +++++++++++++++++++++++++++++-------------------- 2 files changed, 39 insertions(+), 26 deletions(-) diff --git a/mm/internal.h b/mm/internal.h index 6451747b7160..9d02b5a630e4 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -635,8 +635,9 @@ extern void dpool_prep_new_page(struct page *page, unsigned int order, #endif extern int user_min_free_kbytes; -extern void free_unref_page(struct page *page, unsigned int order); -extern void free_unref_page_list(struct list_head *list); +void free_unref_page(struct page *page, unsigned int order); +void free_unref_folios(struct folio_batch *fbatch); +void free_unref_page_list(struct list_head *list); extern void zone_pcp_reset(struct zone *zone); extern void zone_pcp_disable(struct zone *zone); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 27f13b34760a..4e0a11a0b62a 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -32,6 +32,7 @@ #include #include #include +#include #include #include #include @@ -2554,64 +2555,57 @@ void free_unref_page(struct page *page, unsigned int order) } /* - * Free a list of 0-order pages + * Free a batch of 0-order pages */ -void free_unref_page_list(struct list_head *list) +void free_unref_folios(struct folio_batch *folios) { unsigned long __maybe_unused UP_flags; - struct folio *folio, *next; struct per_cpu_pages *pcp = NULL; struct zone *locked_zone = NULL; - int batch_count = 0; - int migratetype; + int i, j, migratetype; - /* Prepare pages for freeing */ - list_for_each_entry_safe(folio, next, list, lru) { + /* Prepare folios for freeing */ + for (i = 0, j = 0; i < folios->nr; i++) { + struct folio *folio = folios->folios[i]; unsigned long pfn = folio_pfn(folio); if (page_from_dynamic_pool(&folio->page)) { - list_del(&folio->lru); dynamic_pool_free_page(&folio->page); continue; } - if (!free_unref_page_prepare(&folio->page, pfn, 0)) { - list_del(&folio->lru); + if (!free_unref_page_prepare(&folio->page, pfn, 0)) continue; - } /* - * Free isolated pages directly to the allocator, see + * Free isolated folios directly to the allocator, see * comment in free_unref_page. */ migratetype = get_pcppage_migratetype(&folio->page); if (unlikely(is_migrate_isolate(migratetype))) { - list_del(&folio->lru); free_one_page(folio_zone(folio), &folio->page, pfn, 0, migratetype, FPI_NONE); continue; } + if (j != i) + folios->folios[j] = folio; + j++; } + folios->nr = j; - list_for_each_entry_safe(folio, next, list, lru) { + for (i = 0; i < folios->nr; i++) { + struct folio *folio = folios->folios[i]; struct zone *zone = folio_zone(folio); - list_del(&folio->lru); migratetype = get_pcppage_migratetype(&folio->page); - /* - * Either different zone requiring a different pcp lock or - * excessive lock hold times when freeing a large list of - * folios. - */ - if (zone != locked_zone || batch_count == SWAP_CLUSTER_MAX) { + /* Different zone requires a different pcp lock */ + if (zone != locked_zone) { if (pcp) { pcp_spin_unlock(pcp); pcp_trylock_finish(UP_flags); } - batch_count = 0; - /* * trylock is necessary as folios may be getting freed * from IRQ or SoftIRQ context after an IO completion. 
@@ -2638,13 +2632,31 @@ void free_unref_page_list(struct list_head *list) trace_mm_page_free_batched(&folio->page); free_unref_page_commit(zone, pcp, &folio->page, migratetype, 0); - batch_count++; } if (pcp) { pcp_spin_unlock(pcp); pcp_trylock_finish(UP_flags); } + folio_batch_reinit(folios); +} + +void free_unref_page_list(struct list_head *list) +{ + struct folio_batch fbatch; + + folio_batch_init(&fbatch); + while (!list_empty(list)) { + struct folio *folio = list_first_entry(list, struct folio, lru); + + list_del(&folio->lru); + if (folio_batch_add(&fbatch, folio) > 0) + continue; + free_unref_folios(&fbatch); + } + + if (fbatch.nr) + free_unref_folios(&fbatch); } /* -- Gitee From 5dafe154559d847bf19f2876382d949cc5e386b0 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 22 May 2024 16:01:58 +0800 Subject: [PATCH 04/37] mm: use folios_put() in __folio_batch_release() mainline inclusion from mainline-v6.9-rc1 commit 6871cc5742f411bf8ebbcb78b4afeb992d888228 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I9R3AY CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=6871cc5742f411bf8ebbcb78b4afeb992d888228 -------------------------------- There's no need to indirect through release_pages() and iterate over this batch of folios an extra time; we can just use the batch that we have. Link: https://lkml.kernel.org/r/20240227174254.710559-5-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Ryan Roberts Cc: David Hildenbrand Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Liu Shixin --- mm/swap.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/mm/swap.c b/mm/swap.c index 3d51f8c72017..1cfb7b897ebd 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -1083,8 +1083,7 @@ void __folio_batch_release(struct folio_batch *fbatch) lru_add_drain(); fbatch->percpu_pvec_drained = true; } - release_pages(fbatch->folios, folio_batch_count(fbatch)); - folio_batch_reinit(fbatch); + folios_put(fbatch); } EXPORT_SYMBOL(__folio_batch_release); -- Gitee From 81a628bfc7ca3e5d8c0c96aa1f5c36a8e16a4db9 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 22 May 2024 16:01:59 +0800 Subject: [PATCH 05/37] memcg: add mem_cgroup_uncharge_folios() mainline inclusion from mainline-v6.9-rc1 commit 4882c80975e2bf7241a5b043eb1dbe8df2726a29 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I9R3AY CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=4882c80975e2bf7241a5b043eb1dbe8df2726a29 -------------------------------- Almost identical to mem_cgroup_uncharge_list(), except it takes a folio_batch instead of a list_head. 
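As a rough usage sketch (illustrative only, not part of this patch): a caller
that has gathered folios into a batch pairs this with free_unref_folios()
along the lines of

	struct folio_batch fbatch;

	folio_batch_init(&fbatch);
	/* ... fill the batch with folio_batch_add() ... */
	mem_cgroup_uncharge_folios(&fbatch);	/* no-op if memcg is disabled */
	free_unref_folios(&fbatch);		/* reinitialises the batch for reuse */

The following patches use exactly this pairing in folios_put_refs(),
shrink_folio_list() and move_folios_to_lru().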
Link: https://lkml.kernel.org/r/20240227174254.710559-6-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Ryan Roberts Cc: David Hildenbrand Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Liu Shixin --- include/linux/memcontrol.h | 14 ++++++++++++-- mm/memcontrol.c | 13 +++++++++++++ 2 files changed, 25 insertions(+), 2 deletions(-) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 8c199fe368c2..d137a1286777 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -837,10 +837,16 @@ static inline void mem_cgroup_uncharge_list(struct list_head *page_list) __mem_cgroup_uncharge_list(page_list); } -void mem_cgroup_cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages); +void __mem_cgroup_uncharge_folios(struct folio_batch *folios); +static inline void mem_cgroup_uncharge_folios(struct folio_batch *folios) +{ + if (mem_cgroup_disabled()) + return; + __mem_cgroup_uncharge_folios(folios); +} +void mem_cgroup_cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages); void mem_cgroup_replace_folio(struct folio *old, struct folio *new); - void mem_cgroup_migrate(struct folio *old, struct folio *new); /** @@ -1425,6 +1431,10 @@ static inline void mem_cgroup_uncharge_list(struct list_head *page_list) { } +static inline void mem_cgroup_uncharge_folios(struct folio_batch *folios) +{ +} + static inline void mem_cgroup_cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages) { diff --git a/mm/memcontrol.c b/mm/memcontrol.c index fff8b9322521..a3824f608254 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -33,6 +33,7 @@ #include #include #include +#include #include #include #include @@ -8601,6 +8602,18 @@ void __mem_cgroup_uncharge_list(struct list_head *page_list) uncharge_batch(&ug); } +void __mem_cgroup_uncharge_folios(struct folio_batch *folios) +{ + struct uncharge_gather ug; + unsigned int i; + + uncharge_gather_clear(&ug); + for (i = 0; i < folios->nr; i++) + uncharge_folio(folios->folios[i], &ug); + if (ug.memcg) + uncharge_batch(&ug); +} + /** * mem_cgroup_replace_folio - Charge a folio's replacement. * @old: Currently circulating folio. -- Gitee From ee6e025b45a2fed88e5892a894acbd46ca7910ef Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 22 May 2024 16:02:00 +0800 Subject: [PATCH 06/37] mm: remove use of folio list from folios_put() mainline inclusion from mainline-v6.9-rc1 commit 7c33b8c4229af19797c78de48827ca70228c1f47 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I9R3AY CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=7c33b8c4229af19797c78de48827ca70228c1f47 -------------------------------- Instead of putting the interesting folios on a list, delete the uninteresting one from the folio_batch. 
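The batch is compacted in place with the usual two-index squeeze; as a
generic sketch (keep() is a hypothetical stand-in for the real tests in the
hunk below):

	unsigned int i, j;

	for (i = 0, j = 0; i < folios->nr; i++) {
		struct folio *folio = folios->folios[i];

		if (!keep(folio))		/* hypothetical predicate */
			continue;		/* drop this entry */
		if (j != i)
			folios->folios[j] = folio;
		j++;
	}
	folios->nr = j;				/* only the kept folios remain */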
Link: https://lkml.kernel.org/r/20240227174254.710559-7-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Ryan Roberts Cc: David Hildenbrand Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Liu Shixin --- mm/swap.c | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/mm/swap.c b/mm/swap.c index 1cfb7b897ebd..ee8b131bf32c 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -961,12 +961,11 @@ void lru_cache_disable(void) */ void folios_put_refs(struct folio_batch *folios, unsigned int *refs) { - int i; - LIST_HEAD(pages_to_free); + int i, j; struct lruvec *lruvec = NULL; unsigned long flags = 0; - for (i = 0; i < folios->nr; i++) { + for (i = 0, j = 0; i < folios->nr; i++) { struct folio *folio = folios->folios[i]; unsigned int nr_refs = refs ? refs[i] : 1; @@ -1016,14 +1015,20 @@ void folios_put_refs(struct folio_batch *folios, unsigned int *refs) count_vm_event(UNEVICTABLE_PGCLEARED); } - list_add(&folio->lru, &pages_to_free); + if (j != i) + folios->folios[j] = folio; + j++; } if (lruvec) unlock_page_lruvec_irqrestore(lruvec, flags); + if (!j) { + folio_batch_reinit(folios); + return; + } - mem_cgroup_uncharge_list(&pages_to_free); - free_unref_page_list(&pages_to_free); - folio_batch_reinit(folios); + folios->nr = j; + mem_cgroup_uncharge_folios(folios); + free_unref_folios(folios); } EXPORT_SYMBOL(folios_put_refs); -- Gitee From b760d26906dda11126915e62d36e26d9c45655b4 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 22 May 2024 16:02:01 +0800 Subject: [PATCH 07/37] mm: use free_unref_folios() in put_pages_list() mainline inclusion from mainline-v6.9-rc1 commit 24835f899c0129a4733e899e4da20e2e72f40bd9 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I9R3AY CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=24835f899c0129a4733e899e4da20e2e72f40bd9 -------------------------------- Break up the list of folios into batches here so that the folios are more likely to be cache hot when doing the rest of the processing. 
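The shape of the conversion is the same list-to-batch pattern already used
for free_unref_page_list() earlier in the series (sketch only; the hunk below
additionally handles folio_put_testzero() and large folios):

	struct folio_batch fbatch;

	folio_batch_init(&fbatch);
	while (!list_empty(pages)) {
		struct folio *folio = list_first_entry(pages, struct folio, lru);

		list_del(&folio->lru);
		/* folio_batch_add() returns the space left in the batch */
		if (folio_batch_add(&fbatch, folio) > 0)
			continue;
		free_unref_folios(&fbatch);	/* flush a full batch */
	}
	if (folio_batch_count(&fbatch))
		free_unref_folios(&fbatch);	/* flush the remainder */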
Link: https://lkml.kernel.org/r/20240227174254.710559-8-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Cc: David Hildenbrand Cc: Mel Gorman Cc: Ryan Roberts Signed-off-by: Andrew Morton Signed-off-by: Liu Shixin --- mm/swap.c | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/mm/swap.c b/mm/swap.c index ee8b131bf32c..ad3f2e9448a4 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -138,22 +138,25 @@ EXPORT_SYMBOL(__folio_put); */ void put_pages_list(struct list_head *pages) { - struct folio *folio, *next; + struct folio_batch fbatch; + struct folio *folio; - list_for_each_entry_safe(folio, next, pages, lru) { - if (!folio_put_testzero(folio)) { - list_del(&folio->lru); + folio_batch_init(&fbatch); + list_for_each_entry(folio, pages, lru) { + if (!folio_put_testzero(folio)) continue; - } if (folio_test_large(folio)) { - list_del(&folio->lru); __folio_put_large(folio); continue; } /* LRU flag must be clear because it's passed using the lru */ + if (folio_batch_add(&fbatch, folio) > 0) + continue; + free_unref_folios(&fbatch); } - free_unref_page_list(pages); + if (fbatch.nr) + free_unref_folios(&fbatch); INIT_LIST_HEAD(pages); } EXPORT_SYMBOL(put_pages_list); -- Gitee From f9d5943bb0673133d65bc9de8ea99dd44b296235 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 22 May 2024 16:02:02 +0800 Subject: [PATCH 08/37] mm: use __page_cache_release() in folios_put() mainline inclusion from mainline-v6.9-rc1 commit f1ee018baee9f4e724e08859c2559323be768be3 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I9R3AY CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=f1ee018baee9f4e724e08859c2559323be768be3 -------------------------------- Pass a pointer to the lruvec so we can take advantage of the folio_lruvec_relock_irqsave(). Adjust the calling convention of folio_lruvec_relock_irqsave() to suit and add a page_cache_release() wrapper. 
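With the adjusted calling convention the caller keeps a (possibly NULL)
lruvec pointer and the helper relocks it in place, roughly:

	struct lruvec *lruvec = NULL;
	unsigned long flags;

	/* for each folio in the batch: */
	folio_lruvec_relock_irqsave(folio, &lruvec, &flags);
	/* lruvec now points to this folio's locked lruvec */

	/* after the last folio: */
	if (lruvec)
		unlock_page_lruvec_irqrestore(lruvec, flags);

(sketch of the intended use; see folio_batch_move_lru() and folios_put_refs()
in the diff below).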
Link: https://lkml.kernel.org/r/20240227174254.710559-9-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Cc: David Hildenbrand Cc: Mel Gorman Cc: Ryan Roberts Signed-off-by: Andrew Morton Signed-off-by: Liu Shixin --- include/linux/memcontrol.h | 16 +++++----- mm/swap.c | 62 ++++++++++++++++++-------------------- 2 files changed, 37 insertions(+), 41 deletions(-) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index d137a1286777..7381a34d9c17 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -1839,18 +1839,18 @@ static inline struct lruvec *folio_lruvec_relock_irq(struct folio *folio, return folio_lruvec_lock_irq(folio); } -/* Don't lock again iff page's lruvec locked */ -static inline struct lruvec *folio_lruvec_relock_irqsave(struct folio *folio, - struct lruvec *locked_lruvec, unsigned long *flags) +/* Don't lock again iff folio's lruvec locked */ +static inline void folio_lruvec_relock_irqsave(struct folio *folio, + struct lruvec **lruvecp, unsigned long *flags) { - if (locked_lruvec) { - if (folio_matches_lruvec(folio, locked_lruvec)) - return locked_lruvec; + if (*lruvecp) { + if (folio_matches_lruvec(folio, *lruvecp)) + return; - unlock_page_lruvec_irqrestore(locked_lruvec, *flags); + unlock_page_lruvec_irqrestore(*lruvecp, *flags); } - return folio_lruvec_lock_irqsave(folio, flags); + *lruvecp = folio_lruvec_lock_irqsave(folio, flags); } #ifdef CONFIG_CGROUP_WRITEBACK diff --git a/mm/swap.c b/mm/swap.c index ad3f2e9448a4..dce5ea67ae05 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -74,22 +74,21 @@ static DEFINE_PER_CPU(struct cpu_fbatches, cpu_fbatches) = { .lock = INIT_LOCAL_LOCK(lock), }; -/* - * This path almost never happens for VM activity - pages are normally freed - * in batches. But it gets used by networking - and for compound pages. - */ -static void __page_cache_release(struct folio *folio) +static void __page_cache_release(struct folio *folio, struct lruvec **lruvecp, + unsigned long *flagsp) { if (folio_test_lru(folio)) { - struct lruvec *lruvec; - unsigned long flags; - - lruvec = folio_lruvec_lock_irqsave(folio, &flags); - lruvec_del_folio(lruvec, folio); + folio_lruvec_relock_irqsave(folio, lruvecp, flagsp); + lruvec_del_folio(*lruvecp, folio); __folio_clear_lru_flags(folio); - unlock_page_lruvec_irqrestore(lruvec, flags); } - /* See comment on folio_test_mlocked in folios_put() */ + + /* + * In rare cases, when truncation or holepunching raced with + * munlock after VM_LOCKED was cleared, Mlocked may still be + * found set here. This does not indicate a problem, unless + * "unevictable_pgs_cleared" appears worryingly large. + */ if (unlikely(folio_test_mlocked(folio))) { long nr_pages = folio_nr_pages(folio); @@ -99,9 +98,23 @@ static void __page_cache_release(struct folio *folio) } } +/* + * This path almost never happens for VM activity - pages are normally freed + * in batches. But it gets used by networking - and for compound pages. + */ +static void page_cache_release(struct folio *folio) +{ + struct lruvec *lruvec = NULL; + unsigned long flags; + + __page_cache_release(folio, &lruvec, &flags); + if (lruvec) + unlock_page_lruvec_irqrestore(lruvec, flags); +} + static void __folio_put_small(struct folio *folio) { - __page_cache_release(folio); + page_cache_release(folio); mem_cgroup_uncharge(folio); free_unref_page(&folio->page, 0); } @@ -115,7 +128,7 @@ static void __folio_put_large(struct folio *folio) * be called for hugetlb (it has a separate hugetlb_cgroup.) 
*/ if (!folio_test_hugetlb(folio)) - __page_cache_release(folio); + page_cache_release(folio); destroy_large_folio(folio); } @@ -216,7 +229,7 @@ static void folio_batch_move_lru(struct folio_batch *fbatch, move_fn_t move_fn) if (move_fn != lru_add_fn && !folio_test_clear_lru(folio)) continue; - lruvec = folio_lruvec_relock_irqsave(folio, lruvec, &flags); + folio_lruvec_relock_irqsave(folio, &lruvec, &flags); move_fn(lruvec, folio); folio_set_lru(folio); @@ -999,24 +1012,7 @@ void folios_put_refs(struct folio_batch *folios, unsigned int *refs) continue; } - if (folio_test_lru(folio)) { - lruvec = folio_lruvec_relock_irqsave(folio, lruvec, - &flags); - lruvec_del_folio(lruvec, folio); - __folio_clear_lru_flags(folio); - } - - /* - * In rare cases, when truncation or holepunching raced with - * munlock after VM_LOCKED was cleared, Mlocked may still be - * found set here. This does not indicate a problem, unless - * "unevictable_pgs_cleared" appears worryingly large. - */ - if (unlikely(folio_test_mlocked(folio))) { - __folio_clear_mlocked(folio); - zone_stat_sub_folio(folio, NR_MLOCK); - count_vm_event(UNEVICTABLE_PGCLEARED); - } + __page_cache_release(folio, &lruvec, &flags); if (j != i) folios->folios[j] = folio; -- Gitee From e34d001c5b810227f944839f27b1cd3d1a3b8a1d Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 22 May 2024 16:02:03 +0800 Subject: [PATCH 09/37] mm: handle large folios in free_unref_folios() mainline inclusion from mainline-v6.9-rc1 commit 31b2ff82aefb33ce92496a1becddd6ce51060db2 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I9R3AY CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=31b2ff82aefb33ce92496a1becddd6ce51060db2 -------------------------------- Call folio_undo_large_rmappable() if needed. free_unref_page_prepare() destroys the ability to call folio_order(), so stash the order in folio->private for the benefit of the second loop. Link: https://lkml.kernel.org/r/20240227174254.710559-10-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Cc: David Hildenbrand Cc: Mel Gorman Cc: Ryan Roberts Signed-off-by: Andrew Morton Conflicts: mm/page_alloc.c [ Conflicts with dynamic pool, fix the code in free_unref_folios(). ] Signed-off-by: Liu Shixin --- mm/page_alloc.c | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 4e0a11a0b62a..b09e69f14925 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2555,7 +2555,7 @@ void free_unref_page(struct page *page, unsigned int order) } /* - * Free a batch of 0-order pages + * Free a batch of folios */ void free_unref_folios(struct folio_batch *folios) { @@ -2568,25 +2568,30 @@ void free_unref_folios(struct folio_batch *folios) for (i = 0, j = 0; i < folios->nr; i++) { struct folio *folio = folios->folios[i]; unsigned long pfn = folio_pfn(folio); + unsigned int order = folio_order(folio); if (page_from_dynamic_pool(&folio->page)) { dynamic_pool_free_page(&folio->page); continue; } - if (!free_unref_page_prepare(&folio->page, pfn, 0)) + if (order > 0 && folio_test_large_rmappable(folio)) + folio_undo_large_rmappable(folio); + if (!free_unref_page_prepare(&folio->page, pfn, order)) continue; /* - * Free isolated folios directly to the allocator, see - * comment in free_unref_page. + * Free isolated folios and orders not handled on the PCP + * directly to the allocator, see comment in free_unref_page. 
*/ migratetype = get_pcppage_migratetype(&folio->page); - if (unlikely(is_migrate_isolate(migratetype))) { + if (!pcp_allowed_order(order) || + is_migrate_isolate(migratetype)) { free_one_page(folio_zone(folio), &folio->page, pfn, - 0, migratetype, FPI_NONE); + order, migratetype, FPI_NONE); continue; } + folio->private = (void *)(unsigned long)order; if (j != i) folios->folios[j] = folio; j++; @@ -2596,7 +2601,9 @@ void free_unref_folios(struct folio_batch *folios) for (i = 0; i < folios->nr; i++) { struct folio *folio = folios->folios[i]; struct zone *zone = folio_zone(folio); + unsigned int order = (unsigned long)folio->private; + folio->private = NULL; migratetype = get_pcppage_migratetype(&folio->page); /* Different zone requires a different pcp lock */ @@ -2615,7 +2622,7 @@ void free_unref_folios(struct folio_batch *folios) if (unlikely(!pcp)) { pcp_trylock_finish(UP_flags); free_one_page(zone, &folio->page, - folio_pfn(folio), 0, + folio_pfn(folio), order, migratetype, FPI_NONE); locked_zone = NULL; continue; @@ -2631,7 +2638,8 @@ void free_unref_folios(struct folio_batch *folios) migratetype = MIGRATE_MOVABLE; trace_mm_page_free_batched(&folio->page); - free_unref_page_commit(zone, pcp, &folio->page, migratetype, 0); + free_unref_page_commit(zone, pcp, &folio->page, migratetype, + order); } if (pcp) { -- Gitee From 47fe258983a4647e11168e0a2ede8e2e8b617a31 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 22 May 2024 16:02:04 +0800 Subject: [PATCH 10/37] mm: allow non-hugetlb large folios to be batch processed mainline inclusion from mainline-v6.9-rc1 commit f77171d241e379ea93448a53d58104191e02135c category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I9R3AY CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=f77171d241e379ea93448a53d58104191e02135c -------------------------------- Hugetlb folios still get special treatment, but normal large folios can now be freed by free_unref_folios(). This should have a reasonable performance impact, TBD. Link: https://lkml.kernel.org/r/20240227174254.710559-11-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Ryan Roberts Cc: David Hildenbrand Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Liu Shixin --- mm/swap.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/mm/swap.c b/mm/swap.c index dce5ea67ae05..6b697d33fa5b 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -1003,12 +1003,13 @@ void folios_put_refs(struct folio_batch *folios, unsigned int *refs) if (!folio_ref_sub_and_test(folio, nr_refs)) continue; - if (folio_test_large(folio)) { + /* hugetlb has its own memcg */ + if (folio_test_hugetlb(folio)) { if (lruvec) { unlock_page_lruvec_irqrestore(lruvec, flags); lruvec = NULL; } - __folio_put_large(folio); + free_huge_folio(folio); continue; } -- Gitee From c263ce1016fe71ae59ee734b11cfd518910a9d4c Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 22 May 2024 16:02:05 +0800 Subject: [PATCH 11/37] mm: free folios in a batch in shrink_folio_list() mainline inclusion from mainline-v6.9-rc1 commit bc2ff4cbc3294c01f29449405c42ee26ee0e1f59 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I9R3AY CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=bc2ff4cbc3294c01f29449405c42ee26ee0e1f59 -------------------------------- Use free_unref_page_batch() to free the folios. 
This may increase the number of IPIs from calling try_to_unmap_flush() more often, but that's going to be very workload-dependent. It may even reduce the number of IPIs as we now batch-free large folios instead of freeing them one at a time. Link: https://lkml.kernel.org/r/20240227174254.710559-12-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Cc: Mel Gorman Cc: David Hildenbrand Cc: Ryan Roberts Signed-off-by: Andrew Morton Signed-off-by: Liu Shixin --- mm/vmscan.c | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index 34614bb7062d..f3ed46d5c7d7 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1711,14 +1711,15 @@ static unsigned int shrink_folio_list(struct list_head *folio_list, struct pglist_data *pgdat, struct scan_control *sc, struct reclaim_stat *stat, bool ignore_references) { + struct folio_batch free_folios; LIST_HEAD(ret_folios); - LIST_HEAD(free_folios); LIST_HEAD(demote_folios); unsigned int nr_reclaimed = 0; unsigned int pgactivate = 0; bool do_demote_pass; struct swap_iocb *plug = NULL; + folio_batch_init(&free_folios); memset(stat, 0, sizeof(*stat)); cond_resched(); do_demote_pass = can_demote(pgdat->node_id, sc); @@ -2134,14 +2135,11 @@ static unsigned int shrink_folio_list(struct list_head *folio_list, */ nr_reclaimed += nr_pages; - /* - * Is there need to periodically free_folio_list? It would - * appear not as the counts should be low - */ - if (unlikely(folio_test_large(folio))) - destroy_large_folio(folio); - else - list_add(&folio->lru, &free_folios); + if (folio_batch_add(&free_folios, folio) == 0) { + mem_cgroup_uncharge_folios(&free_folios); + try_to_unmap_flush(); + free_unref_folios(&free_folios); + } continue; activate_locked_split: @@ -2205,9 +2203,9 @@ static unsigned int shrink_folio_list(struct list_head *folio_list, pgactivate = stat->nr_activate[0] + stat->nr_activate[1]; - mem_cgroup_uncharge_list(&free_folios); + mem_cgroup_uncharge_folios(&free_folios); try_to_unmap_flush(); - free_unref_page_list(&free_folios); + free_unref_folios(&free_folios); list_splice(&ret_folios, folio_list); count_vm_events(PGACTIVATE, pgactivate); -- Gitee From 533739e8db72c3efc44c2df56c9c1139d7193812 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 22 May 2024 16:02:06 +0800 Subject: [PATCH 12/37] mm: free folios directly in move_folios_to_lru() mainline inclusion from mainline-v6.9-rc1 commit 29f3843026cf83414a8bc319c97c1d09a6c33f4e category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I9R3AY CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=29f3843026cf83414a8bc319c97c1d09a6c33f4e -------------------------------- The few folios which can't be moved to the LRU list (because their refcount dropped to zero) used to be returned to the caller to dispose of. Make this simpler to call by freeing the folios directly through free_unref_folios(). 
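When the batch fills up it is flushed with the lru_lock temporarily dropped,
i.e. roughly:

	if (folio_batch_add(&free_folios, folio) == 0) {
		spin_unlock_irq(&lruvec->lru_lock);
		mem_cgroup_uncharge_folios(&free_folios);
		free_unref_folios(&free_folios);
		spin_lock_irq(&lruvec->lru_lock);
	}

with a final flush after the loop, again outside the lock.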
Link: https://lkml.kernel.org/r/20240227174254.710559-13-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Cc: David Hildenbrand Cc: Mel Gorman Cc: Ryan Roberts Signed-off-by: Andrew Morton Signed-off-by: Liu Shixin --- mm/vmscan.c | 32 ++++++++++++-------------------- 1 file changed, 12 insertions(+), 20 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index f3ed46d5c7d7..d88f9b8fd5d3 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2504,7 +2504,6 @@ static int too_many_isolated(struct pglist_data *pgdat, int file, /* * move_folios_to_lru() moves folios from private @list to appropriate LRU list. - * On return, @list is reused as a list of folios to be freed by the caller. * * Returns the number of pages moved to the given lruvec. */ @@ -2512,8 +2511,9 @@ static unsigned int move_folios_to_lru(struct lruvec *lruvec, struct list_head *list) { int nr_pages, nr_moved = 0; - LIST_HEAD(folios_to_free); + struct folio_batch free_folios; + folio_batch_init(&free_folios); while (!list_empty(list)) { struct folio *folio = lru_to_folio(list); @@ -2542,12 +2542,12 @@ static unsigned int move_folios_to_lru(struct lruvec *lruvec, if (unlikely(folio_put_testzero(folio))) { __folio_clear_lru_flags(folio); - if (unlikely(folio_test_large(folio))) { + if (folio_batch_add(&free_folios, folio) == 0) { spin_unlock_irq(&lruvec->lru_lock); - destroy_large_folio(folio); + mem_cgroup_uncharge_folios(&free_folios); + free_unref_folios(&free_folios); spin_lock_irq(&lruvec->lru_lock); - } else - list_add(&folio->lru, &folios_to_free); + } continue; } @@ -2564,10 +2564,12 @@ static unsigned int move_folios_to_lru(struct lruvec *lruvec, workingset_age_nonresident(lruvec, nr_pages); } - /* - * To save our caller's stack, now use input list for pages to free. - */ - list_splice(&folios_to_free, list); + if (free_folios.nr) { + spin_unlock_irq(&lruvec->lru_lock); + mem_cgroup_uncharge_folios(&free_folios); + free_unref_folios(&free_folios); + spin_lock_irq(&lruvec->lru_lock); + } return nr_moved; } @@ -2646,8 +2648,6 @@ static unsigned long shrink_inactive_list(unsigned long nr_to_scan, spin_unlock_irq(&lruvec->lru_lock); lru_note_cost(lruvec, file, stat.nr_pageout, nr_scanned - nr_reclaimed); - mem_cgroup_uncharge_list(&folio_list); - free_unref_page_list(&folio_list); /* * If dirty folios are scanned that are not queued for IO, it @@ -2788,8 +2788,6 @@ static void shrink_active_list(unsigned long nr_to_scan, nr_activate = move_folios_to_lru(lruvec, &l_active); nr_deactivate = move_folios_to_lru(lruvec, &l_inactive); - /* Keep all free folios in l_active list */ - list_splice(&l_inactive, &l_active); __count_vm_events(PGDEACTIVATE, nr_deactivate); __count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE, nr_deactivate); @@ -2799,8 +2797,6 @@ static void shrink_active_list(unsigned long nr_to_scan, if (nr_rotated) lru_note_cost(lruvec, file, 0, nr_rotated); - mem_cgroup_uncharge_list(&l_active); - free_unref_page_list(&l_active); trace_mm_vmscan_lru_shrink_active(pgdat->node_id, nr_taken, nr_activate, nr_deactivate, nr_rotated, sc->priority, file); } @@ -5289,10 +5285,6 @@ static int evict_folios(struct lruvec *lruvec, struct scan_control *sc, int swap spin_unlock_irq(&lruvec->lru_lock); - mem_cgroup_uncharge_list(&list); - free_unref_page_list(&list); - - INIT_LIST_HEAD(&list); list_splice_init(&clean, &list); if (!list_empty(&list)) { -- Gitee From b93c893bca03a1beb5b131d47e168e52bcad3a37 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 22 May 2024 16:02:07 +0800 Subject: [PATCH 13/37] 
memcg: remove mem_cgroup_uncharge_list() mainline inclusion from mainline-v6.9-rc1 commit be5a9e17a2ccbecfb7020aa1938e2c62d8a9189c category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I9R3AY CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=be5a9e17a2ccbecfb7020aa1938e2c62d8a9189c -------------------------------- All users have been converted to mem_cgroup_uncharge_folios() so we can remove this API. Link: https://lkml.kernel.org/r/20240227174254.710559-14-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Ryan Roberts Cc: David Hildenbrand Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Liu Shixin --- include/linux/memcontrol.h | 12 ------------ mm/memcontrol.c | 19 ------------------- 2 files changed, 31 deletions(-) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 7381a34d9c17..b2a80e089a0a 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -829,14 +829,6 @@ static inline void mem_cgroup_uncharge(struct folio *folio) __mem_cgroup_uncharge(folio); } -void __mem_cgroup_uncharge_list(struct list_head *page_list); -static inline void mem_cgroup_uncharge_list(struct list_head *page_list) -{ - if (mem_cgroup_disabled()) - return; - __mem_cgroup_uncharge_list(page_list); -} - void __mem_cgroup_uncharge_folios(struct folio_batch *folios); static inline void mem_cgroup_uncharge_folios(struct folio_batch *folios) { @@ -1427,10 +1419,6 @@ static inline void mem_cgroup_uncharge(struct folio *folio) { } -static inline void mem_cgroup_uncharge_list(struct list_head *page_list) -{ -} - static inline void mem_cgroup_uncharge_folios(struct folio_batch *folios) { } diff --git a/mm/memcontrol.c b/mm/memcontrol.c index a3824f608254..9951efb06d7b 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -8583,25 +8583,6 @@ void __mem_cgroup_uncharge(struct folio *folio) uncharge_batch(&ug); } -/** - * __mem_cgroup_uncharge_list - uncharge a list of page - * @page_list: list of pages to uncharge - * - * Uncharge a list of pages previously charged with - * __mem_cgroup_charge(). - */ -void __mem_cgroup_uncharge_list(struct list_head *page_list) -{ - struct uncharge_gather ug; - struct folio *folio; - - uncharge_gather_clear(&ug); - list_for_each_entry(folio, page_list, lru) - uncharge_folio(folio, &ug); - if (ug.memcg) - uncharge_batch(&ug); -} - void __mem_cgroup_uncharge_folios(struct folio_batch *folios) { struct uncharge_gather ug; -- Gitee From 94d0c887383696b5330855609c84fdc5b6017192 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 22 May 2024 16:02:08 +0800 Subject: [PATCH 14/37] mm: remove free_unref_page_list() mainline inclusion from mainline-v6.9-rc1 commit 8b7b0a5eee22e3cd0468944d0720120c36340a2b category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I9R3AY CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=8b7b0a5eee22e3cd0468944d0720120c36340a2b -------------------------------- All callers now use free_unref_folios() so we can delete this function. 
Link: https://lkml.kernel.org/r/20240227174254.710559-15-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Ryan Roberts Cc: David Hildenbrand Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Liu Shixin --- mm/internal.h | 1 - mm/page_alloc.c | 18 ------------------ 2 files changed, 19 deletions(-) diff --git a/mm/internal.h b/mm/internal.h index 9d02b5a630e4..713b8f8fb5c2 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -637,7 +637,6 @@ extern int user_min_free_kbytes; void free_unref_page(struct page *page, unsigned int order); void free_unref_folios(struct folio_batch *fbatch); -void free_unref_page_list(struct list_head *list); extern void zone_pcp_reset(struct zone *zone); extern void zone_pcp_disable(struct zone *zone); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index b09e69f14925..315c3fdaa62b 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2649,24 +2649,6 @@ void free_unref_folios(struct folio_batch *folios) folio_batch_reinit(folios); } -void free_unref_page_list(struct list_head *list) -{ - struct folio_batch fbatch; - - folio_batch_init(&fbatch); - while (!list_empty(list)) { - struct folio *folio = list_first_entry(list, struct folio, lru); - - list_del(&folio->lru); - if (folio_batch_add(&fbatch, folio) > 0) - continue; - free_unref_folios(&fbatch); - } - - if (fbatch.nr) - free_unref_folios(&fbatch); -} - /* * split_page takes a non-compound higher-order page, and splits it into * n (1< Date: Wed, 22 May 2024 16:02:09 +0800 Subject: [PATCH 15/37] mm: remove lru_to_page() mainline inclusion from mainline-v6.9-rc1 commit f39ec4dcb9e9b03b2a280829b8c15e3ae607398c category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I9R3AY CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=f39ec4dcb9e9b03b2a280829b8c15e3ae607398c -------------------------------- The last user was removed over a year ago; remove the definition. Link: https://lkml.kernel.org/r/20240227174254.710559-16-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Cc: David Hildenbrand Cc: Mel Gorman Cc: Ryan Roberts Signed-off-by: Andrew Morton Signed-off-by: Liu Shixin --- include/linux/mm.h | 1 - 1 file changed, 1 deletion(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index ca70fe2405c7..0d570b3eb301 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -228,7 +228,6 @@ int overcommit_policy_handler(struct ctl_table *, int, void *, size_t *, /* test whether an address (unsigned long or pointer) is aligned to PAGE_SIZE */ #define PAGE_ALIGNED(addr) IS_ALIGNED((unsigned long)(addr), PAGE_SIZE) -#define lru_to_page(head) (list_entry((head)->prev, struct page, lru)) static inline struct folio *lru_to_folio(struct list_head *head) { return list_entry((head)->prev, struct folio, lru); -- Gitee From be773a95969ec987e348fbc706da5728a990e201 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 22 May 2024 16:02:10 +0800 Subject: [PATCH 16/37] mm: convert free_pages_and_swap_cache() to use folios_put() mainline inclusion from mainline-v6.9-rc1 commit 4907e80b76af004b6af42f0d4131e23ac73bc07c category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I9R3AY CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=4907e80b76af004b6af42f0d4131e23ac73bc07c -------------------------------- Process the pages in batch-sized quantities instead of all-at-once. 
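The refs[] array is indexed in lockstep with the batch: refs[folios.nr] is
filled in just before folio_batch_add(), and because folios_put_refs()
reinitialises the batch, both restart from slot 0 together. Schematically
(nr_refs_for_this_entry is a placeholder):

	refs[folios.nr] = nr_refs_for_this_entry;
	if (folio_batch_add(&folios, folio) == 0)
		folios_put_refs(&folios, refs);		/* resets folios.nr to 0 */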
Link: https://lkml.kernel.org/r/20240227174254.710559-17-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Cc: David Hildenbrand Cc: Mel Gorman Cc: Ryan Roberts Signed-off-by: Andrew Morton Signed-off-by: Liu Shixin --- mm/swap_state.c | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/mm/swap_state.c b/mm/swap_state.c index 40b84dc47974..d93ae705998d 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include @@ -310,21 +311,25 @@ void free_page_and_swap_cache(struct page *page) */ void free_pages_and_swap_cache(struct encoded_page **pages, int nr) { + struct folio_batch folios; + unsigned int refs[PAGEVEC_SIZE]; + lru_add_drain(); + folio_batch_init(&folios); for (int i = 0; i < nr; i++) { - struct page *page = encoded_page_ptr(pages[i]); + struct folio *folio = page_folio(encoded_page_ptr(pages[i])); - /* - * Skip over the "nr_pages" entry. It's sufficient to call - * free_swap_cache() only once per folio. - */ + free_swap_cache(&folio->page); + refs[folios.nr] = 1; if (unlikely(encoded_page_flags(pages[i]) & ENCODED_PAGE_BIT_NR_PAGES_NEXT)) - i++; + refs[folios.nr] = encoded_nr_pages(pages[++i]); - free_swap_cache(page); + if (folio_batch_add(&folios, folio) == 0) + folios_put_refs(&folios, refs); } - release_pages(pages, nr); + if (folios.nr) + folios_put_refs(&folios, refs); } static inline bool swap_use_vma_readahead(void) -- Gitee From 6a9c64d0101bdaece7d1951973ec5481ae688c52 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 22 May 2024 16:02:11 +0800 Subject: [PATCH 17/37] mm: use a folio in __collapse_huge_page_copy_succeeded() mainline inclusion from mainline-v6.9-rc1 commit d4111eecdc3c2a5eabafcc467dbfce0e216fa485 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I9R3AY CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=d4111eecdc3c2a5eabafcc467dbfce0e216fa485 -------------------------------- These pages are all chained together through the lru list, so we know they're folios. Use the folio APIs to save three hidden calls to compound_head(). Link: https://lkml.kernel.org/r/20240227174254.710559-18-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Cc: David Hildenbrand Cc: Mel Gorman Cc: Ryan Roberts Signed-off-by: Andrew Morton Conflicts: mm/khugepaged.c [ Context conflicts with memory reliable. 
] Signed-off-by: Liu Shixin --- mm/khugepaged.c | 30 ++++++++++++++---------------- 1 file changed, 14 insertions(+), 16 deletions(-) diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 7d329e9eeec8..75dcd160735b 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -695,9 +695,7 @@ static void __collapse_huge_page_copy_succeeded(pte_t *pte, spinlock_t *ptl, struct list_head *compound_pagelist) { - struct folio *src_folio; - struct page *src_page; - struct page *tmp; + struct folio *src, *tmp; pte_t *_pte; pte_t pteval; @@ -716,10 +714,11 @@ static void __collapse_huge_page_copy_succeeded(pte_t *pte, ksm_might_unmap_zero_page(vma->vm_mm, pteval); } } else { - src_page = pte_page(pteval); - src_folio = page_folio(src_page); - if (!folio_test_large(src_folio)) - release_pte_folio(src_folio); + struct page *src_page = pte_page(pteval); + + src = page_folio(src_page); + if (!folio_test_large(src)) + release_pte_folio(src); /* * ptl mostly unnecessary, but preempt has to * be disabled to update the per-cpu stats @@ -728,20 +727,19 @@ static void __collapse_huge_page_copy_succeeded(pte_t *pte, spin_lock(ptl); ptep_clear(vma->vm_mm, address, _pte); add_reliable_page_counter(src_page, vma->vm_mm, 1); - folio_remove_rmap_pte(src_folio, src_page, vma); + folio_remove_rmap_pte(src, src_page, vma); spin_unlock(ptl); free_page_and_swap_cache(src_page); } } - list_for_each_entry_safe(src_page, tmp, compound_pagelist, lru) { - list_del(&src_page->lru); - mod_node_page_state(page_pgdat(src_page), - NR_ISOLATED_ANON + page_is_file_lru(src_page), - -compound_nr(src_page)); - unlock_page(src_page); - free_swap_cache(src_page); - putback_lru_page(src_page); + list_for_each_entry_safe(src, tmp, compound_pagelist, lru) { + list_del(&src->lru); + node_stat_sub_folio(src, NR_ISOLATED_ANON + + folio_is_file_lru(src)); + folio_unlock(src); + free_swap_cache(&src->page); + folio_putback_lru(src); } } -- Gitee From 6a7ecc2ae3dd58d733c652b1e827fff90bc6c58f Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 22 May 2024 16:02:12 +0800 Subject: [PATCH 18/37] mm: convert free_swap_cache() to take a folio mainline inclusion from mainline-v6.9-rc1 commit 63b774993dd02b17127cb404b7362fc436632995 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I9R3AY CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=63b774993dd02b17127cb404b7362fc436632995 -------------------------------- All but one caller already has a folio, so convert free_page_and_swap_cache() to have a folio and remove the call to page_folio(). 
Link: https://lkml.kernel.org/r/20240227174254.710559-19-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Ryan Roberts Reviewed-by: David Hildenbrand Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Liu Shixin --- include/linux/swap.h | 8 ++++---- mm/khugepaged.c | 2 +- mm/memory.c | 2 +- mm/swap_state.c | 12 ++++++------ 4 files changed, 12 insertions(+), 12 deletions(-) diff --git a/include/linux/swap.h b/include/linux/swap.h index 13cd68b5f5e2..54fa8f4558c7 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -482,9 +482,9 @@ static inline unsigned long total_swapcache_pages(void) return global_node_page_state(NR_SWAPCACHE); } -extern void free_swap_cache(struct page *page); -extern void free_page_and_swap_cache(struct page *); -extern void free_pages_and_swap_cache(struct encoded_page **, int); +void free_swap_cache(struct folio *folio); +void free_page_and_swap_cache(struct page *); +void free_pages_and_swap_cache(struct encoded_page **, int); /* linux/mm/swapfile.c */ extern atomic_long_t nr_swap_pages; extern long total_swap_pages; @@ -577,7 +577,7 @@ static inline void free_swap_and_cache_nr(swp_entry_t entry, int nr) { } -static inline void free_swap_cache(struct page *page) +static inline void free_swap_cache(struct folio *folio) { } diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 75dcd160735b..5f999528ec30 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -738,7 +738,7 @@ static void __collapse_huge_page_copy_succeeded(pte_t *pte, node_stat_sub_folio(src, NR_ISOLATED_ANON + folio_is_file_lru(src)); folio_unlock(src); - free_swap_cache(&src->page); + free_swap_cache(src); folio_putback_lru(src); } } diff --git a/mm/memory.c b/mm/memory.c index 4ef917a182f9..ed275401e695 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3390,7 +3390,7 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf) folio_put(new_folio); if (old_folio) { if (page_copied) - free_swap_cache(&old_folio->page); + free_swap_cache(old_folio); folio_put(old_folio); } diff --git a/mm/swap_state.c b/mm/swap_state.c index d93ae705998d..94c9f171e94d 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -283,10 +283,8 @@ void clear_shadow_from_swap_cache(int type, unsigned long begin, * folio_free_swap() _with_ the lock. 
* - Marcelo */ -void free_swap_cache(struct page *page) +void free_swap_cache(struct folio *folio) { - struct folio *folio = page_folio(page); - if (folio_test_swapcache(folio) && !folio_mapped(folio) && folio_trylock(folio)) { folio_free_swap(folio); @@ -300,9 +298,11 @@ void free_swap_cache(struct page *page) */ void free_page_and_swap_cache(struct page *page) { - free_swap_cache(page); + struct folio *folio = page_folio(page); + + free_swap_cache(folio); if (!is_huge_zero_page(page)) - put_page(page); + folio_put(folio); } /* @@ -319,7 +319,7 @@ void free_pages_and_swap_cache(struct encoded_page **pages, int nr) for (int i = 0; i < nr; i++) { struct folio *folio = page_folio(encoded_page_ptr(pages[i])); - free_swap_cache(&folio->page); + free_swap_cache(folio); refs[folios.nr] = 1; if (unlikely(encoded_page_flags(pages[i]) & ENCODED_PAGE_BIT_NR_PAGES_NEXT)) -- Gitee From bd12e033ace86b03f7b946cbf11b58181990b0cf Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 22 May 2024 16:02:13 +0800 Subject: [PATCH 19/37] mm: remove folio from deferred split list before uncharging it mainline inclusion from mainline-v6.9-rc1 commit 47932e7048df9156e96133ee90fb3e9df68dbd15 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I9R3AY CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=47932e7048df9156e96133ee90fb3e9df68dbd15 -------------------------------- When freeing a large folio, we must remove it from the deferred split list before we uncharge it as each memcg has its own deferred split list (with associated lock) and removing a folio from the deferred split list while holding the wrong lock will corrupt that list and cause various related problems. Link: https://lore.kernel.org/linux-mm/367a14f7-340e-4b29-90ae-bc3fcefdd5f4@arm.com/ Link: https://lkml.kernel.org/r/20240311191835.312162-1-willy@infradead.org Fixes: f77171d241e3 (mm: allow non-hugetlb large folios to be batch processed) Fixes: 29f3843026cf (mm: free folios directly in move_folios_to_lru()) Fixes: bc2ff4cbc329 (mm: free folios in a batch in shrink_folio_list()) Signed-off-by: Matthew Wilcox (Oracle) Debugged-by: Ryan Roberts Tested-by: Ryan Roberts Signed-off-by: Andrew Morton Signed-off-by: Liu Shixin --- mm/swap.c | 3 +++ mm/vmscan.c | 6 ++++++ 2 files changed, 9 insertions(+) diff --git a/mm/swap.c b/mm/swap.c index 6b697d33fa5b..e43a5911b170 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -1012,6 +1012,9 @@ void folios_put_refs(struct folio_batch *folios, unsigned int *refs) free_huge_folio(folio); continue; } + if (folio_test_large(folio) && + folio_test_large_rmappable(folio)) + folio_undo_large_rmappable(folio); __page_cache_release(folio, &lruvec, &flags); diff --git a/mm/vmscan.c b/mm/vmscan.c index d88f9b8fd5d3..44154c63ec6c 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2135,6 +2135,9 @@ static unsigned int shrink_folio_list(struct list_head *folio_list, */ nr_reclaimed += nr_pages; + if (folio_test_large(folio) && + folio_test_large_rmappable(folio)) + folio_undo_large_rmappable(folio); if (folio_batch_add(&free_folios, folio) == 0) { mem_cgroup_uncharge_folios(&free_folios); try_to_unmap_flush(); @@ -2542,6 +2545,9 @@ static unsigned int move_folios_to_lru(struct lruvec *lruvec, if (unlikely(folio_put_testzero(folio))) { __folio_clear_lru_flags(folio); + if (folio_test_large(folio) && + folio_test_large_rmappable(folio)) + folio_undo_large_rmappable(folio); if (folio_batch_add(&free_folios, folio) == 0) { spin_unlock_irq(&lruvec->lru_lock); 
mem_cgroup_uncharge_folios(&free_folios); -- Gitee From 2553aac153346f93e7afa5289ec5de69a634c314 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 22 May 2024 16:02:14 +0800 Subject: [PATCH 20/37] mm: fix list corruption in put_pages_list mainline inclusion from mainline-v6.9-rc1 commit b555895c313511830762dbb2f469587a822c1759 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I9R3AY CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=b555895c313511830762dbb2f469587a822c1759 -------------------------------- My recent change to put_pages_list() dereferences folio->lru.next after returning the folio to the page allocator. Usually this is now on the pcp list with other free folios, so we try to free an already-free folio. This only happens with lists that have more than 15 entries, so it wasn't immediately discovered. Revert to using list_for_each_safe() so we dereference lru.next before disposing of the folio. Link: https://lkml.kernel.org/r/20240306212749.1823380-1-willy@infradead.org Fixes: 24835f899c01 ("mm: use free_unref_folios() in put_pages_list()") Signed-off-by: Matthew Wilcox (Oracle) Reported-by: "Borah, Chaitanya Kumar" Closes: https://lore.kernel.org/intel-gfx/SJ1PR11MB61292145F3B79DA58ADDDA63B9232@SJ1PR11MB6129.namprd11.prod.outlook.com/ Signed-off-by: Andrew Morton Signed-off-by: Liu Shixin --- mm/swap.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/swap.c b/mm/swap.c index e43a5911b170..500a09a48dfd 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -152,10 +152,10 @@ EXPORT_SYMBOL(__folio_put); void put_pages_list(struct list_head *pages) { struct folio_batch fbatch; - struct folio *folio; + struct folio *folio, *next; folio_batch_init(&fbatch); - list_for_each_entry(folio, pages, lru) { + list_for_each_entry_safe(folio, next, pages, lru) { if (!folio_put_testzero(folio)) continue; if (folio_test_large(folio)) { -- Gitee From 421edb8f5b3d5485d83e48e5b218ad68ea79d182 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 22 May 2024 16:02:15 +0800 Subject: [PATCH 21/37] mm: increase folio batch size mainline inclusion from mainline-v6.9-rc2 commit 9cecde80aae0fb0aa44425575d5aca71bc646d89 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I9R3AY CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=9cecde80aae0fb0aa44425575d5aca71bc646d89 -------------------------------- On a 104 thread, 2 socket Skylake system, Intel report a 4.7% performance reduction with will-it-scale page_fault2. This was due to reducing the size of the batch from 32 to 15. Increasing the folio batch size from 15 to 31 gives a performance increase of 12.5% relative to the original, or 17.2% relative to the reduced performance commit. The penalty of this commit is an additional 128 bytes of stack usage. Six folio_batches are also allocated from percpu memory in cpu_fbatches so that will be an additional 768 bytes of percpu memory (per CPU). 
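The arithmetic behind those numbers, assuming a 64-bit build where the folio_batch header pads out to one pointer-sized word (an approximation of the real layout, not the exact struct definition):

    #include <stdio.h>

    #define PTR_SIZE    8UL     /* sizeof(struct folio *) on 64-bit */
    #define HEADER_SIZE 8UL     /* nr + drained flag, padded */

    static unsigned long batch_bytes(unsigned long nr_pointers)
    {
            return HEADER_SIZE + nr_pointers * PTR_SIZE;
    }

    int main(void)
    {
            unsigned long old = batch_bytes(15);    /* 128, a power of two */
            unsigned long new = batch_bytes(31);    /* 256, still a power of two */

            printf("old batch: %lu bytes, new batch: %lu bytes\n", old, new);
            printf("extra stack per on-stack batch: %lu bytes\n", new - old);
            printf("extra percpu for 6 batches in cpu_fbatches: %lu bytes/CPU\n",
                   6 * (new - old));
            return 0;
    }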
Tim Chen originally submitted a patch like this in 2020: https://lore.kernel.org/linux-mm/d1cc9f12a8ad6c2a52cb600d93b06b064f2bbc57.1593205965.git.tim.c.chen@linux.intel.com/ Link: https://lkml.kernel.org/r/20240315140823.2478146-1-willy@infradead.org Fixes: 99fbb6bfc16f ("mm: make folios_put() the basis of release_pages()") Signed-off-by: Matthew Wilcox (Oracle) Tested-by: Yujie Liu Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-lkp/202403151058.7048f6a8-oliver.sang@intel.com Signed-off-by: Andrew Morton Signed-off-by: Liu Shixin --- include/linux/pagevec.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/linux/pagevec.h b/include/linux/pagevec.h index 87cc678adc85..67f10b8810a8 100644 --- a/include/linux/pagevec.h +++ b/include/linux/pagevec.h @@ -11,8 +11,8 @@ #include -/* 15 pointers + header align the folio_batch structure to a power of two */ -#define PAGEVEC_SIZE 15 +/* 31 pointers + header align the folio_batch structure to a power of two */ +#define PAGEVEC_SIZE 31 struct folio; -- Gitee From 76934cf0ea23f78157834ea5e1d16737337d5d14 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 22 May 2024 16:02:16 +0800 Subject: [PATCH 22/37] mm: free non-hugetlb large folios in a batch mainline inclusion from mainline-v6.10-rc1 commit 2f166704290eadec480209cf28060f154184afe9 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I9R3AY CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=2f166704290eadec480209cf28060f154184afe9 -------------------------------- Patch series "Clean up __folio_put()". With all the changes over the last few years, __folio_put_small and __folio_put_large have become almost identical to each other ... except you can't tell because they're spread over two files. Rearrange it all so that you can tell, and then inline them both into __folio_put(). This patch (of 5): free_unref_folios() can now handle non-hugetlb large folios, so keep normal large folios in the batch. hugetlb folios still need to be handled specially. 
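The resulting shape of the free path is: special-case the folios that cannot go through the batch, accumulate everything else, and flush whenever the batch fills. A self-contained sketch of that dispatch-and-batch pattern, using stand-in types rather than kernel ones:

    #include <stdbool.h>
    #include <stdio.h>

    #define BATCH_SIZE 31

    struct item { int id; bool is_hugetlb; };

    static void free_special(struct item *it)
    {
            printf("special free %d\n", it->id);
    }

    static void flush_batch(struct item **batch, int nr)
    {
            printf("batched free of %d items (first id %d)\n", nr, batch[0]->id);
    }

    static void put_items(struct item *items, int n)
    {
            struct item *batch[BATCH_SIZE];
            int nr = 0;

            for (int i = 0; i < n; i++) {
                    if (items[i].is_hugetlb) {      /* still handled out of band */
                            free_special(&items[i]);
                            continue;
                    }
                    batch[nr++] = &items[i];        /* large or small, batch it */
                    if (nr == BATCH_SIZE) {
                            flush_batch(batch, nr);
                            nr = 0;
                    }
            }
            if (nr)
                    flush_batch(batch, nr);
    }

    int main(void)
    {
            struct item items[40];

            for (int i = 0; i < 40; i++)
                    items[i] = (struct item){ .id = i, .is_hugetlb = (i % 13 == 0) };
            put_items(items, 40);
            return 0;
    }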
[peterx@redhat.com: fix panic] Link: https://lkml.kernel.org/r/ZikjPB0Dt5HA8-uL@x1n Link: https://lkml.kernel.org/r/20240405153228.2563754-1-willy@infradead.org Link: https://lkml.kernel.org/r/20240405153228.2563754-2-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Peter Xu Reviewed-by: Zi Yan Signed-off-by: Andrew Morton Signed-off-by: Liu Shixin --- mm/swap.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/swap.c b/mm/swap.c index 500a09a48dfd..f5307db46062 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -158,8 +158,8 @@ void put_pages_list(struct list_head *pages) list_for_each_entry_safe(folio, next, pages, lru) { if (!folio_put_testzero(folio)) continue; - if (folio_test_large(folio)) { - __folio_put_large(folio); + if (folio_test_hugetlb(folio)) { + free_huge_folio(folio); continue; } /* LRU flag must be clear because it's passed using the lru */ -- Gitee From 62b208c4859c499e95a584bf6d37f6cb6b7382fc Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 22 May 2024 16:02:17 +0800 Subject: [PATCH 23/37] mm: combine free_the_page() and free_unref_page() mainline inclusion from mainline-v6.10-rc1 commit 5b8d75913a0ed9deb16140c0aa880c4d6db2dc62 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I9R3AY CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=5b8d75913a0ed9deb16140c0aa880c4d6db2dc62 -------------------------------- The pcp_allowed_order() check in free_the_page() was only being skipped by __folio_put_small() which is about to be rearranged. Link: https://lkml.kernel.org/r/20240405153228.2563754-3-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Zi Yan Signed-off-by: Andrew Morton Conflicts: mm/page_alloc.c [ Context conflicts with commit 17edeb5d3f76 and cc92eba1c88b. ] Signed-off-by: Liu Shixin --- mm/page_alloc.c | 25 +++++++++++-------------- 1 file changed, 11 insertions(+), 14 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 315c3fdaa62b..19ba7cf4e80d 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -567,14 +567,6 @@ static inline bool pcp_allowed_order(unsigned int order) return false; } -static inline void free_the_page(struct page *page, unsigned int order) -{ - if (pcp_allowed_order(order)) /* Via pcp? */ - free_unref_page(page, order); - else - __free_pages_ok(page, order, FPI_NONE); -} - /* * Higher-order pages are called "compound pages". 
They are structured thusly: * @@ -610,7 +602,7 @@ void destroy_large_folio(struct folio *folio) folio_undo_large_rmappable(folio); mem_cgroup_uncharge(folio); - free_the_page(&folio->page, folio_order(folio)); + free_unref_page(&folio->page, folio_order(folio)); } static inline void set_buddy_order(struct page *page, unsigned int order) @@ -2523,6 +2515,11 @@ void free_unref_page(struct page *page, unsigned int order) return; } + if (!pcp_allowed_order(order)) { + __free_pages_ok(page, order, FPI_NONE); + return; + } + if (!free_unref_page_prepare(page, pfn, order)) return; @@ -4909,10 +4906,10 @@ void __free_pages(struct page *page, unsigned int order) int head = PageHead(page); if (put_page_testzero(page)) - free_the_page(page, order); + free_unref_page(page, order); else if (!head) while (order-- > 0) - free_the_page(page + (1 << order), order); + free_unref_page(page + (1 << order), order); } EXPORT_SYMBOL(__free_pages); @@ -4963,7 +4960,7 @@ void __page_frag_cache_drain(struct page *page, unsigned int count) VM_BUG_ON_PAGE(page_ref_count(page) == 0, page); if (page_ref_sub_and_test(page, count)) - free_the_page(page, compound_order(page)); + free_unref_page(page, compound_order(page)); } EXPORT_SYMBOL(__page_frag_cache_drain); @@ -5004,7 +5001,7 @@ void *page_frag_alloc_align(struct page_frag_cache *nc, goto refill; if (unlikely(nc->pfmemalloc)) { - free_the_page(page, compound_order(page)); + free_unref_page(page, compound_order(page)); goto refill; } @@ -5048,7 +5045,7 @@ void page_frag_free(void *addr) struct page *page = virt_to_head_page(addr); if (unlikely(put_page_testzero(page))) - free_the_page(page, compound_order(page)); + free_unref_page(page, compound_order(page)); } EXPORT_SYMBOL(page_frag_free); -- Gitee From dc6b4c72768648cfb126f03b3c08a7f22f6e3f7a Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 22 May 2024 16:02:18 +0800 Subject: [PATCH 24/37] mm: inline destroy_large_folio() into __folio_put_large() mainline inclusion from mainline-v6.10-rc1 commit 2542b1ac9a46ac58f9565de0048457956898d481 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I9R3AY CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=2542b1ac9a46ac58f9565de0048457956898d481 -------------------------------- destroy_large_folio() has only one caller, move its contents there. Link: https://lkml.kernel.org/r/20240405153228.2563754-4-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Zi Yan Signed-off-by: Andrew Morton Signed-off-by: Liu Shixin --- include/linux/mm.h | 2 -- mm/page_alloc.c | 14 -------------- mm/swap.c | 13 ++++++++++--- 3 files changed, 10 insertions(+), 19 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 0d570b3eb301..9b71b877c8d3 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1321,8 +1321,6 @@ void folio_copy(struct folio *dst, struct folio *src); unsigned long nr_free_buffer_pages(void); -void destroy_large_folio(struct folio *folio); - /* Returns the number of bytes in this potentially compound page. 
*/ static inline unsigned long page_size(struct page *page) { diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 19ba7cf4e80d..9780cc3c8365 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -591,20 +591,6 @@ void prep_compound_page(struct page *page, unsigned int order) prep_compound_head(page, order); } -void destroy_large_folio(struct folio *folio) -{ - if (folio_test_hugetlb(folio)) { - free_huge_folio(folio); - return; - } - - if (folio_test_large_rmappable(folio)) - folio_undo_large_rmappable(folio); - - mem_cgroup_uncharge(folio); - free_unref_page(&folio->page, folio_order(folio)); -} - static inline void set_buddy_order(struct page *page, unsigned int order) { set_page_private(page, order); diff --git a/mm/swap.c b/mm/swap.c index f5307db46062..862a8e69def6 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -127,9 +127,16 @@ static void __folio_put_large(struct folio *folio) * (it's never listed to any LRU lists) and no memcg routines should * be called for hugetlb (it has a separate hugetlb_cgroup.) */ - if (!folio_test_hugetlb(folio)) - page_cache_release(folio); - destroy_large_folio(folio); + if (folio_test_hugetlb(folio)) { + free_huge_folio(folio); + return; + } + + page_cache_release(folio); + if (folio_test_large_rmappable(folio)) + folio_undo_large_rmappable(folio); + mem_cgroup_uncharge(folio); + free_unref_page(&folio->page, folio_order(folio)); } void __folio_put(struct folio *folio) -- Gitee From 08a8dae40bacd0f0180116b472d3abdf4bde78c4 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 22 May 2024 16:02:19 +0800 Subject: [PATCH 25/37] mm: combine __folio_put_small, __folio_put_large and __folio_put mainline inclusion from mainline-v6.10-rc1 commit 79a48287515848c18a49d75c1fdf176c82bb13cf category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I9R3AY CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=79a48287515848c18a49d75c1fdf176c82bb13cf -------------------------------- It's now obvious that __folio_put_small() and __folio_put_large() do almost exactly the same thing. Inline them both into __folio_put(). Link: https://lkml.kernel.org/r/20240405153228.2563754-5-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Zi Yan Signed-off-by: Andrew Morton Signed-off-by: Liu Shixin --- mm/swap.c | 32 ++++++-------------------------- 1 file changed, 6 insertions(+), 26 deletions(-) diff --git a/mm/swap.c b/mm/swap.c index 862a8e69def6..f6921d3b4862 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -112,42 +112,22 @@ static void page_cache_release(struct folio *folio) unlock_page_lruvec_irqrestore(lruvec, flags); } -static void __folio_put_small(struct folio *folio) -{ - page_cache_release(folio); - mem_cgroup_uncharge(folio); - free_unref_page(&folio->page, 0); -} - -static void __folio_put_large(struct folio *folio) +void __folio_put(struct folio *folio) { - /* - * __page_cache_release() is supposed to be called for thp, not for - * hugetlb. This is because hugetlb page does never have PageLRU set - * (it's never listed to any LRU lists) and no memcg routines should - * be called for hugetlb (it has a separate hugetlb_cgroup.) 
- */ - if (folio_test_hugetlb(folio)) { + if (unlikely(folio_is_zone_device(folio))) { + free_zone_device_page(&folio->page); + return; + } else if (folio_test_hugetlb(folio)) { free_huge_folio(folio); return; } page_cache_release(folio); - if (folio_test_large_rmappable(folio)) + if (folio_test_large(folio) && folio_test_large_rmappable(folio)) folio_undo_large_rmappable(folio); mem_cgroup_uncharge(folio); free_unref_page(&folio->page, folio_order(folio)); } - -void __folio_put(struct folio *folio) -{ - if (unlikely(folio_is_zone_device(folio))) - free_zone_device_page(&folio->page); - else if (unlikely(folio_test_large(folio))) - __folio_put_large(folio); - else - __folio_put_small(folio); -} EXPORT_SYMBOL(__folio_put); /** -- Gitee From d5ae3c96ff011cb11fc6cd784a8b161e6d3517a7 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 22 May 2024 16:02:20 +0800 Subject: [PATCH 26/37] mm: convert free_zone_device_page to free_zone_device_folio mainline inclusion from mainline-v6.10-rc1 commit 9f100e3b37590828ae23b0210ee634d14b28b8e8 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I9R3AY CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=9f100e3b37590828ae23b0210ee634d14b28b8e8 -------------------------------- Both callers already have a folio; pass it in and save a few calls to compound_head(). Link: https://lkml.kernel.org/r/20240405153228.2563754-6-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Zi Yan Signed-off-by: Andrew Morton Signed-off-by: Liu Shixin --- mm/internal.h | 2 +- mm/memremap.c | 30 ++++++++++++++++-------------- mm/swap.c | 4 ++-- 3 files changed, 19 insertions(+), 17 deletions(-) diff --git a/mm/internal.h b/mm/internal.h index 713b8f8fb5c2..fad7d8bf2e32 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -1181,7 +1181,7 @@ void __vunmap_range_noflush(unsigned long start, unsigned long end); int numa_migrate_prep(struct folio *folio, struct vm_area_struct *vma, unsigned long addr, int page_nid, int *flags); -void free_zone_device_page(struct page *page); +void free_zone_device_folio(struct folio *folio); int migrate_device_coherent_page(struct page *page); /* diff --git a/mm/memremap.c b/mm/memremap.c index bee85560a243..7b7e59841250 100644 --- a/mm/memremap.c +++ b/mm/memremap.c @@ -468,21 +468,23 @@ struct dev_pagemap *get_dev_pagemap(unsigned long pfn, } EXPORT_SYMBOL_GPL(get_dev_pagemap); -void free_zone_device_page(struct page *page) +void free_zone_device_folio(struct folio *folio) { - if (WARN_ON_ONCE(!page->pgmap->ops || !page->pgmap->ops->page_free)) + if (WARN_ON_ONCE(!folio->page.pgmap->ops || + !folio->page.pgmap->ops->page_free)) return; - mem_cgroup_uncharge(page_folio(page)); + mem_cgroup_uncharge(folio); /* * Note: we don't expect anonymous compound pages yet. Once supported * and we could PTE-map them similar to THP, we'd have to clear * PG_anon_exclusive on all tail pages. */ - VM_BUG_ON_PAGE(PageAnon(page) && PageCompound(page), page); - if (PageAnon(page)) - __ClearPageAnonExclusive(page); + if (folio_test_anon(folio)) { + VM_BUG_ON_FOLIO(folio_test_large(folio), folio); + __ClearPageAnonExclusive(folio_page(folio, 0)); + } /* * When a device managed page is freed, the page->mapping field @@ -503,20 +505,20 @@ void free_zone_device_page(struct page *page) * * For other types of ZONE_DEVICE pages, migration is either * handled differently or not done at all, so there is no need - * to clear page->mapping. + * to clear folio->mapping. 
*/ - page->mapping = NULL; - page->pgmap->ops->page_free(page); + folio->mapping = NULL; + folio->page.pgmap->ops->page_free(folio_page(folio, 0)); - if (page->pgmap->type != MEMORY_DEVICE_PRIVATE && - page->pgmap->type != MEMORY_DEVICE_COHERENT) + if (folio->page.pgmap->type != MEMORY_DEVICE_PRIVATE && + folio->page.pgmap->type != MEMORY_DEVICE_COHERENT) /* - * Reset the page count to 1 to prepare for handing out the page + * Reset the refcount to 1 to prepare for handing out the page * again. */ - set_page_count(page, 1); + folio_set_count(folio, 1); else - put_dev_pagemap(page->pgmap); + put_dev_pagemap(folio->page.pgmap); } void zone_device_page_init(struct page *page) diff --git a/mm/swap.c b/mm/swap.c index f6921d3b4862..1c9e8f70d6b5 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -115,7 +115,7 @@ static void page_cache_release(struct folio *folio) void __folio_put(struct folio *folio) { if (unlikely(folio_is_zone_device(folio))) { - free_zone_device_page(&folio->page); + free_zone_device_folio(folio); return; } else if (folio_test_hugetlb(folio)) { free_huge_folio(folio); @@ -983,7 +983,7 @@ void folios_put_refs(struct folio_batch *folios, unsigned int *refs) if (put_devmap_managed_page_refs(&folio->page, nr_refs)) continue; if (folio_ref_sub_and_test(folio, nr_refs)) - free_zone_device_page(&folio->page); + free_zone_device_folio(folio); continue; } -- Gitee From 3c00cb424ac9fa4f8b3c67fb5ddc8c339e64fdee Mon Sep 17 00:00:00 2001 From: Richard Chang Date: Wed, 22 May 2024 16:02:21 +0800 Subject: [PATCH 27/37] mm: add alloc_contig_migrate_range allocation statistics mainline inclusion from mainline-v6.9-rc1 commit c8b36003121834cb77fcaf8a1ce0a454d7a97891 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I9R3AY CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=c8b36003121834cb77fcaf8a1ce0a454d7a97891 -------------------------------- alloc_contig_migrate_range has every information to be able to understand big contiguous allocation latency. For example, how many pages are migrated, how many times they were needed to unmap from page tables. This patch adds the trace event to collect the allocation statistics. In the field, it was quite useful to understand CMA allocation latency. 
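The statistics are gathered with the usual "only pay when the tracepoint is live" pattern: the counters are accumulated inside the migration retry loop behind an is-enabled check and emitted once at the end. A stand-alone sketch, with a plain boolean standing in for the static-key tracepoint test and fake per-pass numbers in place of real migration work:

    #include <stdbool.h>
    #include <stdio.h>

    static bool trace_enabled = true;       /* stands in for trace_..._enabled() */

    static void trace_migrate_range_info(unsigned long migrated,
                                         unsigned long reclaimed,
                                         unsigned long mapped)
    {
            printf("migrated=%lu reclaimed=%lu mapped=%lu\n",
                   migrated, reclaimed, mapped);
    }

    static int migrate_range(int passes)
    {
            unsigned long total_migrated = 0, total_reclaimed = 0, total_mapped = 0;

            for (int i = 0; i < passes; i++) {
                    unsigned long reclaimed = 2, mapped = 5, moved = 10; /* fake work */

                    if (trace_enabled) {    /* bookkeeping costs nothing when off */
                            total_reclaimed += reclaimed;
                            total_mapped += mapped;
                            total_migrated += moved;
                    }
            }
            trace_migrate_range_info(total_migrated, total_reclaimed, total_mapped);
            return 0;
    }

    int main(void)
    {
            return migrate_range(3);
    }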
[akpm@linux-foundation.org: a/trace_mm_alloc_config_migrate_range_info_enabled/trace_mm_alloc_contig_migrate_range_info_enabled] Link: https://lkml.kernel.org/r/20240228051127.2859472-1-richardycc@google.com Signed-off-by: Richard Chang Reviewed-by: Steven Rostedt (Google) Cc: "Masami Hiramatsu (Google)" Cc: Mathieu Desnoyers Cc: Minchan Kim Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton Signed-off-by: Liu Shixin --- include/trace/events/kmem.h | 38 +++++++++++++++++++++++++++++++++++++ mm/internal.h | 3 ++- mm/page_alloc.c | 32 ++++++++++++++++++++++++++----- mm/page_isolation.c | 2 +- 4 files changed, 68 insertions(+), 7 deletions(-) diff --git a/include/trace/events/kmem.h b/include/trace/events/kmem.h index be39ca5af0ba..a4e40ae6a8c8 100644 --- a/include/trace/events/kmem.h +++ b/include/trace/events/kmem.h @@ -304,6 +304,44 @@ TRACE_EVENT(mm_page_alloc_extfrag, __entry->change_ownership) ); +TRACE_EVENT(mm_alloc_contig_migrate_range_info, + + TP_PROTO(unsigned long start, + unsigned long end, + unsigned long nr_migrated, + unsigned long nr_reclaimed, + unsigned long nr_mapped, + int migratetype), + + TP_ARGS(start, end, nr_migrated, nr_reclaimed, nr_mapped, migratetype), + + TP_STRUCT__entry( + __field(unsigned long, start) + __field(unsigned long, end) + __field(unsigned long, nr_migrated) + __field(unsigned long, nr_reclaimed) + __field(unsigned long, nr_mapped) + __field(int, migratetype) + ), + + TP_fast_assign( + __entry->start = start; + __entry->end = end; + __entry->nr_migrated = nr_migrated; + __entry->nr_reclaimed = nr_reclaimed; + __entry->nr_mapped = nr_mapped; + __entry->migratetype = migratetype; + ), + + TP_printk("start=0x%lx end=0x%lx migratetype=%d nr_migrated=%lu nr_reclaimed=%lu nr_mapped=%lu", + __entry->start, + __entry->end, + __entry->migratetype, + __entry->nr_migrated, + __entry->nr_reclaimed, + __entry->nr_mapped) +); + /* * Required for uniquely and securely identifying mm in rss_stat tracepoint. */ diff --git a/mm/internal.h b/mm/internal.h index fad7d8bf2e32..6983493b997e 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -725,7 +725,8 @@ isolate_migratepages_range(struct compact_control *cc, unsigned long low_pfn, unsigned long end_pfn); int __alloc_contig_migrate_range(struct compact_control *cc, - unsigned long start, unsigned long end); + unsigned long start, unsigned long end, + int migratetype); /* Free whole pageblock and set its migration type to MIGRATE_CMA. */ void init_cma_reserved_pageblock(struct page *page); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 9780cc3c8365..f450bc4a4637 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -6458,9 +6458,14 @@ static void alloc_contig_dump_pages(struct list_head *page_list) } } -/* [start, end) must belong to a single zone. */ +/* + * [start, end) must belong to a single zone. + * @migratetype: using migratetype to filter the type of migration in + * trace_mm_alloc_contig_migrate_range_info. + */ int __alloc_contig_migrate_range(struct compact_control *cc, - unsigned long start, unsigned long end) + unsigned long start, unsigned long end, + int migratetype) { /* This function is based on compact_zone() from compaction.c. 
*/ unsigned int nr_reclaimed; @@ -6471,6 +6476,10 @@ int __alloc_contig_migrate_range(struct compact_control *cc, .nid = zone_to_nid(cc->zone), .gfp_mask = GFP_USER | __GFP_MOVABLE | __GFP_RETRY_MAYFAIL, }; + struct page *page; + unsigned long total_mapped = 0; + unsigned long total_migrated = 0; + unsigned long total_reclaimed = 0; lru_cache_disable(); @@ -6496,9 +6505,18 @@ int __alloc_contig_migrate_range(struct compact_control *cc, &cc->migratepages); cc->nr_migratepages -= nr_reclaimed; + if (trace_mm_alloc_contig_migrate_range_info_enabled()) { + total_reclaimed += nr_reclaimed; + list_for_each_entry(page, &cc->migratepages, lru) + total_mapped += page_mapcount(page); + } + ret = migrate_pages(&cc->migratepages, alloc_migration_target, NULL, (unsigned long)&mtc, cc->mode, MR_CONTIG_RANGE, NULL); + if (trace_mm_alloc_contig_migrate_range_info_enabled() && !ret) + total_migrated += cc->nr_migratepages; + /* * On -ENOMEM, migrate_pages() bails out right away. It is pointless * to retry again over this error, so do the same here. @@ -6512,9 +6530,13 @@ int __alloc_contig_migrate_range(struct compact_control *cc, if (!(cc->gfp_mask & __GFP_NOWARN) && ret == -EBUSY) alloc_contig_dump_pages(&cc->migratepages); putback_movable_pages(&cc->migratepages); - return ret; } - return 0; + + trace_mm_alloc_contig_migrate_range_info(start, end, migratetype, + total_migrated, + total_reclaimed, + total_mapped); + return (ret < 0) ? ret : 0; } /** @@ -6594,7 +6616,7 @@ int alloc_contig_range(unsigned long start, unsigned long end, * allocated. So, if we fall through be sure to clear ret so that * -EBUSY is not accidentally used or returned to caller. */ - ret = __alloc_contig_migrate_range(&cc, start, end); + ret = __alloc_contig_migrate_range(&cc, start, end, migratetype); if (ret && ret != -EBUSY) goto done; ret = 0; diff --git a/mm/page_isolation.c b/mm/page_isolation.c index fefc8a926944..09eb445cfde9 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c @@ -437,7 +437,7 @@ static int isolate_single_pageblock(unsigned long boundary_pfn, int flags, } ret = __alloc_contig_migrate_range(&cc, head_pfn, - head_pfn + nr_pages); + head_pfn + nr_pages, page_mt); /* * restore the page's migratetype so that it can -- Gitee From fb24953cdeb1aecb9428dfb55ec04ced7d371752 Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Wed, 22 May 2024 16:02:22 +0800 Subject: [PATCH 28/37] hugetlb: set hugetlb page flag before optimizing vmemmap mainline inclusion from mainline-v6.7-rc1 commit d8f5f7e445f02eb10dee1a0a992146314cf460f8 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I9R3AY CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=d8f5f7e445f02eb10dee1a0a992146314cf460f8 -------------------------------- Currently, vmemmap optimization of hugetlb pages is performed before the hugetlb flag (previously hugetlb destructor) is set identifying it as a hugetlb folio. This means there is a window of time where an ordinary folio does not have all associated vmemmap present. The core mm only expects vmemmap to be potentially optimized for hugetlb and device dax. This can cause problems in code such as memory error handling that may want to write to tail struct pages. There is only one call to perform hugetlb vmemmap optimization today. To fix this issue, simply set the hugetlb flag before that call. There was a similar issue in the free hugetlb path that was previously addressed. 
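The fix itself is purely an ordering change: publish the hugetlb flag before entering the state in which tail struct pages may be absent, so any concurrent observer that identifies the folio also knows its vmemmap may be gone. A schematic stand-alone model of that ordering (the struct and helpers are illustrative, not the hugetlb code; the race itself is only visible to a concurrent observer):

    #include <stdio.h>

    struct fake_folio {
            int is_hugetlb;         /* the identifying flag */
            int vmemmap_optimized;  /* tail struct pages may be absent */
    };

    /* e.g. memory error handling: only touch tail struct pages when safe */
    static int may_write_tail_pages(const struct fake_folio *f)
    {
            return !f->is_hugetlb;  /* hugetlb may have optimized vmemmap */
    }

    static void prep_new_huge_folio(struct fake_folio *f)
    {
            f->is_hugetlb = 1;              /* set the flag first ... */
            f->vmemmap_optimized = 1;       /* ... then drop the tail vmemmap */
            /*
             * With the old order (optimize first, flag second) a concurrent
             * may_write_tail_pages() could return 1 while the vmemmap was
             * already gone -- the window this patch closes.
             */
    }

    int main(void)
    {
            struct fake_folio f = { 0 };

            prep_new_huge_folio(&f);
            printf("safe to write tails: %d\n", may_write_tail_pages(&f));
            return 0;
    }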
The two routines that optimize or restore hugetlb vmemmap should only be passed hugetlb folios/pages. To catch any callers not following this rule, add VM_WARN_ON calls to the routines. In the hugetlb free code paths, some calls could be made to restore vmemmap after clearing the hugetlb flag. This was 'safe' as in these cases vmemmap was already present and the call was a NOOP. However, for consistency these calls where eliminated so that we can add the VM_WARN_ON checks. Link: https://lkml.kernel.org/r/20230829213734.69673-1-mike.kravetz@oracle.com Fixes: f41f2ed43ca5 ("mm: hugetlb: free the vmemmap pages associated with each HugeTLB page") Signed-off-by: Mike Kravetz Reviewed-by: Muchun Song Cc: James Houghton Cc: Miaohe Lin Cc: Michal Hocko Cc: Naoya Horiguchi Cc: Usama Arif Signed-off-by: Andrew Morton Conflicts: mm/hugetlb.c mm/hugetlb_vmemmap.c [ Conflicts with commit 738fe30dc41a since replace folio_set_hugetlb with __folio_set_hugetlb in mm/hugetlb.c. Context conflicts with commit cf082a0e7d7c in mm/hugetlb_vmemmap.c. ] Signed-off-by: Liu Shixin --- mm/hugetlb.c | 31 ++++++++++++++++++++++--------- mm/hugetlb_vmemmap.c | 3 +++ 2 files changed, 25 insertions(+), 9 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 6f90d0845c43..270ee50a9b5a 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1761,7 +1761,12 @@ static void __update_and_free_hugetlb_folio(struct hstate *h, if (folio_test_hugetlb_raw_hwp_unreliable(folio)) return; - if (hugetlb_vmemmap_restore(h, &folio->page)) { + /* + * If folio is not vmemmap optimized (!clear_dtor), then the folio + * is no longer identified as a hugetlb page. hugetlb_vmemmap_restore + * can only be passed hugetlb pages and will BUG otherwise. + */ + if (clear_dtor && hugetlb_vmemmap_restore(h, &folio->page)) { spin_lock_irq(&hugetlb_lock); /* * If we cannot allocate vmemmap pages, just refuse to free the @@ -1979,9 +1984,9 @@ static void __prep_account_new_huge_page(struct hstate *h, int nid) void __prep_new_hugetlb_folio(struct hstate *h, struct folio *folio) { + __folio_set_hugetlb(folio); hugetlb_vmemmap_optimize(h, &folio->page); INIT_LIST_HEAD(&folio->lru); - __folio_set_hugetlb(folio); hugetlb_set_folio_subpool(folio, NULL); set_hugetlb_cgroup(folio, NULL); set_hugetlb_cgroup_rsvd(folio, NULL); @@ -3786,13 +3791,21 @@ static int demote_free_hugetlb_folio(struct hstate *h, struct folio *folio) remove_hugetlb_folio_for_demote(h, folio, false); spin_unlock_irq(&hugetlb_lock); - rc = hugetlb_vmemmap_restore(h, &folio->page); - if (rc) { - /* Allocation of vmemmmap failed, we can not demote folio */ - spin_lock_irq(&hugetlb_lock); - folio_ref_unfreeze(folio, 1); - add_hugetlb_folio(h, folio, false); - return rc; + /* + * If vmemmap already existed for folio, the remove routine above would + * have cleared the hugetlb folio flag. Hence the folio is technically + * no longer a hugetlb folio. hugetlb_vmemmap_restore can only be + * passed hugetlb folios and will BUG otherwise. 
+ */ + if (folio_test_hugetlb(folio)) { + rc = hugetlb_vmemmap_restore(h, &folio->page); + if (rc) { + /* Allocation of vmemmmap failed, we can not demote folio */ + spin_lock_irq(&hugetlb_lock); + folio_ref_unfreeze(folio, 1); + add_hugetlb_folio(h, folio, false); + return rc; + } } /* diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c index fb0b05d4659a..149ab629855c 100644 --- a/mm/hugetlb_vmemmap.c +++ b/mm/hugetlb_vmemmap.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include "hugetlb_vmemmap.h" @@ -487,6 +488,7 @@ int hugetlb_vmemmap_restore(const struct hstate *h, struct page *head) unsigned long vmemmap_start = (unsigned long)head, vmemmap_end; unsigned long vmemmap_reuse; + VM_WARN_ON_ONCE(!PageHuge(head)); if (!HPageVmemmapOptimized(head)) return 0; @@ -584,6 +586,7 @@ void hugetlb_vmemmap_optimize(const struct hstate *h, struct page *head) unsigned long vmemmap_start = (unsigned long)head, vmemmap_end; unsigned long vmemmap_reuse; + VM_WARN_ON_ONCE(!PageHuge(head)); if (!vmemmap_should_optimize(h, head)) return; -- Gitee From 9966b579b1ebed9655edfa6c74e788357e86167e Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Wed, 22 May 2024 16:02:23 +0800 Subject: [PATCH 29/37] hugetlb: check for hugetlb folio before vmemmap_restore mainline inclusion from mainline-v6.7-rc1 commit 30a89adf872d2e46323840964c95dc0ae3bb5843 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I9R3AY CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=30a89adf872d2e46323840964c95dc0ae3bb5843 -------------------------------- In commit d8f5f7e445f0 ("hugetlb: set hugetlb page flag before optimizing vmemmap") checks were added to print a warning if hugetlb_vmemmap_restore was called on a non-hugetlb page. This was mostly due to ordering issues in the hugetlb page set up and tear down sequencees. One place missed was the routine dissolve_free_huge_page. Naoya Horiguchi noted: "I saw that VM_WARN_ON_ONCE() in hugetlb_vmemmap_restore is triggered when memory_failure() is called on a free hugetlb page with vmemmap optimization disabled (the warning is not triggered if vmemmap optimization is enabled). I think that we need check folio_test_hugetlb() before dissolve_free_huge_page() calls hugetlb_vmemmap_restore_folio()." Perform the check as suggested by Naoya. Link: https://lkml.kernel.org/r/20231017032140.GA3680@monkey Fixes: d8f5f7e445f0 ("hugetlb: set hugetlb page flag before optimizing vmemmap") Signed-off-by: Mike Kravetz Suggested-by: Naoya Horiguchi Tested-by: Naoya Horiguchi Cc: Anshuman Khandual Cc: Barry Song Cc: David Hildenbrand Cc: David Rientjes Cc: Joao Martins Cc: Matthew Wilcox (Oracle) Cc: Miaohe Lin Cc: Michal Hocko Cc: Muchun Song Cc: Oscar Salvador Cc: Xiongchun Duan Signed-off-by: Andrew Morton Signed-off-by: Liu Shixin --- mm/hugetlb.c | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 270ee50a9b5a..4c48bbe39407 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2411,17 +2411,23 @@ int dissolve_free_huge_page(struct page *page) * need to adjust max_huge_pages if the page is not freed. * Attempt to allocate vmemmmap here so that we can take * appropriate action on failure. + * + * The folio_test_hugetlb check here is because + * remove_hugetlb_folio will clear hugetlb folio flag for + * non-vmemmap optimized hugetlb folios. 
*/ - rc = hugetlb_vmemmap_restore(h, &folio->page); - if (!rc) { - update_and_free_hugetlb_folio(h, folio, false); - } else { - spin_lock_irq(&hugetlb_lock); - add_hugetlb_folio(h, folio, false); - h->max_huge_pages++; - spin_unlock_irq(&hugetlb_lock); - } + if (folio_test_hugetlb(folio)) { + rc = hugetlb_vmemmap_restore(h, &folio->page); + if (rc) { + spin_lock_irq(&hugetlb_lock); + add_hugetlb_folio(h, folio, false); + h->max_huge_pages++; + goto out; + } + } else + rc = 0; + update_and_free_hugetlb_folio(h, folio, false); return rc; } out: -- Gitee From 7d433815ccfea83468431c0f4f1b6941107b0348 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Wed, 22 May 2024 16:02:24 +0800 Subject: [PATCH 30/37] mm/hugetlb: fix DEBUG_LOCKS_WARN_ON(1) when dissolve_free_hugetlb_folio() mainline inclusion from mainline-v6.9-rc6 commit 52ccdde16b6540abe43b6f8d8e1e1ec90b0983af category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I9R3AY CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=52ccdde16b6540abe43b6f8d8e1e1ec90b0983af -------------------------------- When I did memory failure tests recently, below warning occurs: DEBUG_LOCKS_WARN_ON(1) WARNING: CPU: 8 PID: 1011 at kernel/locking/lockdep.c:232 __lock_acquire+0xccb/0x1ca0 Modules linked in: mce_inject hwpoison_inject CPU: 8 PID: 1011 Comm: bash Kdump: loaded Not tainted 6.9.0-rc3-next-20240410-00012-gdb69f219f4be #3 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.14.0-0-g155821a1990b-prebuilt.qemu.org 04/01/2014 RIP: 0010:__lock_acquire+0xccb/0x1ca0 RSP: 0018:ffffa7a1c7fe3bd0 EFLAGS: 00000082 RAX: 0000000000000000 RBX: eb851eb853975fcf RCX: ffffa1ce5fc1c9c8 RDX: 00000000ffffffd8 RSI: 0000000000000027 RDI: ffffa1ce5fc1c9c0 RBP: ffffa1c6865d3280 R08: ffffffffb0f570a8 R09: 0000000000009ffb R10: 0000000000000286 R11: ffffffffb0f2ad50 R12: ffffa1c6865d3d10 R13: ffffa1c6865d3c70 R14: 0000000000000000 R15: 0000000000000004 FS: 00007ff9f32aa740(0000) GS:ffffa1ce5fc00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007ff9f3134ba0 CR3: 00000008484e4000 CR4: 00000000000006f0 Call Trace: lock_acquire+0xbe/0x2d0 _raw_spin_lock_irqsave+0x3a/0x60 hugepage_subpool_put_pages.part.0+0xe/0xc0 free_huge_folio+0x253/0x3f0 dissolve_free_huge_page+0x147/0x210 __page_handle_poison+0x9/0x70 memory_failure+0x4e6/0x8c0 hard_offline_page_store+0x55/0xa0 kernfs_fop_write_iter+0x12c/0x1d0 vfs_write+0x380/0x540 ksys_write+0x64/0xe0 do_syscall_64+0xbc/0x1d0 entry_SYSCALL_64_after_hwframe+0x77/0x7f RIP: 0033:0x7ff9f3114887 RSP: 002b:00007ffecbacb458 EFLAGS: 00000246 ORIG_RAX: 0000000000000001 RAX: ffffffffffffffda RBX: 000000000000000c RCX: 00007ff9f3114887 RDX: 000000000000000c RSI: 0000564494164e10 RDI: 0000000000000001 RBP: 0000564494164e10 R08: 00007ff9f31d1460 R09: 000000007fffffff R10: 0000000000000000 R11: 0000000000000246 R12: 000000000000000c R13: 00007ff9f321b780 R14: 00007ff9f3217600 R15: 00007ff9f3216a00 Kernel panic - not syncing: kernel: panic_on_warn set ... 
CPU: 8 PID: 1011 Comm: bash Kdump: loaded Not tainted 6.9.0-rc3-next-20240410-00012-gdb69f219f4be #3 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.14.0-0-g155821a1990b-prebuilt.qemu.org 04/01/2014 Call Trace: panic+0x326/0x350 check_panic_on_warn+0x4f/0x50 __warn+0x98/0x190 report_bug+0x18e/0x1a0 handle_bug+0x3d/0x70 exc_invalid_op+0x18/0x70 asm_exc_invalid_op+0x1a/0x20 RIP: 0010:__lock_acquire+0xccb/0x1ca0 RSP: 0018:ffffa7a1c7fe3bd0 EFLAGS: 00000082 RAX: 0000000000000000 RBX: eb851eb853975fcf RCX: ffffa1ce5fc1c9c8 RDX: 00000000ffffffd8 RSI: 0000000000000027 RDI: ffffa1ce5fc1c9c0 RBP: ffffa1c6865d3280 R08: ffffffffb0f570a8 R09: 0000000000009ffb R10: 0000000000000286 R11: ffffffffb0f2ad50 R12: ffffa1c6865d3d10 R13: ffffa1c6865d3c70 R14: 0000000000000000 R15: 0000000000000004 lock_acquire+0xbe/0x2d0 _raw_spin_lock_irqsave+0x3a/0x60 hugepage_subpool_put_pages.part.0+0xe/0xc0 free_huge_folio+0x253/0x3f0 dissolve_free_huge_page+0x147/0x210 __page_handle_poison+0x9/0x70 memory_failure+0x4e6/0x8c0 hard_offline_page_store+0x55/0xa0 kernfs_fop_write_iter+0x12c/0x1d0 vfs_write+0x380/0x540 ksys_write+0x64/0xe0 do_syscall_64+0xbc/0x1d0 entry_SYSCALL_64_after_hwframe+0x77/0x7f RIP: 0033:0x7ff9f3114887 RSP: 002b:00007ffecbacb458 EFLAGS: 00000246 ORIG_RAX: 0000000000000001 RAX: ffffffffffffffda RBX: 000000000000000c RCX: 00007ff9f3114887 RDX: 000000000000000c RSI: 0000564494164e10 RDI: 0000000000000001 RBP: 0000564494164e10 R08: 00007ff9f31d1460 R09: 000000007fffffff R10: 0000000000000000 R11: 0000000000000246 R12: 000000000000000c R13: 00007ff9f321b780 R14: 00007ff9f3217600 R15: 00007ff9f3216a00 After git bisecting and digging into the code, I believe the root cause is that _deferred_list field of folio is unioned with _hugetlb_subpool field. In __update_and_free_hugetlb_folio(), folio->_deferred_list is initialized leading to corrupted folio->_hugetlb_subpool when folio is hugetlb. Later free_huge_folio() will use _hugetlb_subpool and above warning happens. But it is assumed hugetlb flag must have been cleared when calling folio_put() in update_and_free_hugetlb_folio(). This assumption is broken due to below race: CPU1 CPU2 dissolve_free_huge_page update_and_free_pages_bulk update_and_free_hugetlb_folio hugetlb_vmemmap_restore_folios folio_clear_hugetlb_vmemmap_optimized clear_flag = folio_test_hugetlb_vmemmap_optimized if (clear_flag) <-- False, it's already cleared. __folio_clear_hugetlb(folio) <-- Hugetlb is not cleared. folio_put free_huge_folio <-- free_the_page is expected. list_for_each_entry() __folio_clear_hugetlb <-- Too late. Fix this issue by checking whether folio is hugetlb directly instead of checking clear_flag to close the race window. Link: https://lkml.kernel.org/r/20240419085819.1901645-1-linmiaohe@huawei.com Fixes: 32c877191e02 ("hugetlb: do not clear hugetlb dtor until allocating vmemmap") Signed-off-by: Miaohe Lin Reviewed-by: Oscar Salvador Cc: Signed-off-by: Andrew Morton Signed-off-by: Liu Shixin --- mm/hugetlb.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 4c48bbe39407..ac288f7515c0 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1789,7 +1789,7 @@ static void __update_and_free_hugetlb_folio(struct hstate *h, * If vmemmap pages were allocated above, then we need to clear the * hugetlb destructor under the hugetlb lock. 
*/ - if (clear_dtor) { + if (folio_test_hugetlb(folio)) { spin_lock_irq(&hugetlb_lock); __clear_hugetlb_destructor(h, folio); spin_unlock_irq(&hugetlb_lock); -- Gitee From 9e4e4615dc0a77a2b77a1186d08b96700a23b4a0 Mon Sep 17 00:00:00 2001 From: Lucas Stach Date: Wed, 22 May 2024 16:02:25 +0800 Subject: [PATCH 31/37] mm: page_alloc: control latency caused by zone PCP draining mainline inclusion from mainline-v6.10-rc1 commit 55f77df7d715110299f12c27f4365bd6332d1adb category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I9R3AY CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=55f77df7d715110299f12c27f4365bd6332d1adb -------------------------------- Patch series "mm/treewide: Remove pXd_huge() API", v2. In previous work [1], we removed the pXd_large() API, which is arch specific. This patchset further removes the hugetlb pXd_huge() API. Hugetlb was never special on creating huge mappings when compared with other huge mappings. Having a standalone API just to detect such pgtable entries is more or less redundant, especially after the pXd_leaf() API set is introduced with/without CONFIG_HUGETLB_PAGE. When looking at this problem, a few issues are also exposed that we don't have a clear definition of the *_huge() variance API. This patchset started by cleaning these issues first, then replace all *_huge() users to use *_leaf(), then drop all *_huge() code. On x86/sparc, swap entries will be reported "true" in pXd_huge(), while for all the rest archs they're reported "false" instead. This part is done in patch 1-5, in which I suspect patch 1 can be seen as a bug fix, but I'll leave that to hmm experts to decide. Besides, there are three archs (arm, arm64, powerpc) that have slightly different definitions between the *_huge() v.s. *_leaf() variances. I tackled them separately so that it'll be easier for arch experts to chim in when necessary. This part is done in patch 6-9. The final patches 10-14 do the rest on the final removal, since *_leaf() will be the ultimate API in the future, and we seem to have quite some confusions on how *_huge() APIs can be defined, provide a rich comment for *_leaf() API set to define them properly to avoid future misuse, and hopefully that'll also help new archs to start support huge mappings and avoid traps (like either swap entries, or PROT_NONE entry checks). [1] https://lore.kernel.org/r/20240305043750.93762-1-peterx@redhat.com This patch (of 14): When the complete PCP is drained a much larger number of pages than the usual batch size might be freed at once, causing large IRQ and preemption latency spikes, as they are all freed while holding the pcp and zone spinlocks. To avoid those latency spikes, limit the number of pages freed in a single bulk operation to common batch limits. Link: https://lkml.kernel.org/r/20240318200404.448346-1-peterx@redhat.com Link: https://lkml.kernel.org/r/20240318200736.2835502-1-l.stach@pengutronix.de Signed-off-by: Lucas Stach Signed-off-by: Peter Xu Cc: Christophe Leroy Cc: Jason Gunthorpe Cc: "Matthew Wilcox (Oracle)" Cc: Mike Rapoport (IBM) Cc: Muchun Song Cc: Alistair Popple Cc: Andreas Larsson Cc: "Aneesh Kumar K.V" Cc: Arnd Bergmann Cc: Bjorn Andersson Cc: Borislav Petkov Cc: Catalin Marinas Cc: Dave Hansen Cc: David S. Miller Cc: Fabio Estevam Cc: Ingo Molnar Cc: Konrad Dybcio Cc: Krzysztof Kozlowski Cc: Mark Salter Cc: Michael Ellerman Cc: Naoya Horiguchi Cc: "Naveen N. 
Rao" Cc: Nicholas Piggin Cc: Russell King Cc: Shawn Guo Cc: Thomas Gleixner Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Liu Shixin --- mm/page_alloc.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index f450bc4a4637..2225e08d1c8e 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2228,12 +2228,15 @@ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp) */ static void drain_pages_zone(unsigned int cpu, struct zone *zone) { - struct per_cpu_pages *pcp; + struct per_cpu_pages *pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu); + int count = READ_ONCE(pcp->count); + + while (count) { + int to_drain = min(count, pcp->batch << CONFIG_PCP_BATCH_SCALE_MAX); + count -= to_drain; - pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu); - if (pcp->count) { spin_lock(&pcp->lock); - free_pcppages_bulk(zone, pcp->count, pcp, 0); + free_pcppages_bulk(zone, to_drain, pcp, 0); spin_unlock(&pcp->lock); } } -- Gitee From 2835d46054bd352d1cfbac90245487683e9c1caf Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 22 May 2024 16:02:26 +0800 Subject: [PATCH 32/37] mm: always initialise folio->_deferred_list mainline inclusion from mainline-v6.10-rc1 commit b7b098cf00a2b65d5654a86dc8edf82f125289c1 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I9R3AY CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=b7b098cf00a2b65d5654a86dc8edf82f125289c1 -------------------------------- Patch series "Various significant MM patches". These patches all interact in annoying ways which make it tricky to send them out in any way other than a big batch, even though there's not really an overarching theme to connect them. The big effects of this patch series are: - folio_test_hugetlb() becomes reliable, even when called without a page reference - We free up PG_slab, and we could always use more page flags - We no longer need to check PageSlab before calling page_mapcount() This patch (of 9): For compound pages which are at least order-2 (and hence have a deferred_list), initialise it and then we can check at free that the page is not part of a deferred list. We recently found this useful to rule out a source of corruption. 
[peterx@redhat.com: always initialise folio->_deferred_list] Link: https://lkml.kernel.org/r/20240417211836.2742593-2-peterx@redhat.com Link: https://lkml.kernel.org/r/20240321142448.1645400-1-willy@infradead.org Link: https://lkml.kernel.org/r/20240321142448.1645400-2-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Peter Xu Reviewed-by: David Hildenbrand Acked-by: Vlastimil Babka Cc: Miaohe Lin Cc: Muchun Song Cc: Oscar Salvador Signed-off-by: Andrew Morton Signed-off-by: Liu Shixin --- mm/huge_memory.c | 2 -- mm/hugetlb.c | 3 ++- mm/internal.h | 2 ++ mm/memcontrol.c | 3 +++ mm/page_alloc.c | 9 +++++---- 5 files changed, 12 insertions(+), 7 deletions(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 763bb25e4f99..04ca43ce7a36 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -952,8 +952,6 @@ void folio_prep_large_rmappable(struct folio *folio) { if (!folio || !folio_test_large(folio)) return; - if (folio_order(folio) > 1) - INIT_LIST_HEAD(&folio->_deferred_list); folio_set_large_rmappable(folio); } diff --git a/mm/hugetlb.c b/mm/hugetlb.c index ac288f7515c0..7cfe80bb2cb7 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1804,7 +1804,8 @@ static void __update_and_free_hugetlb_folio(struct hstate *h, destroy_compound_gigantic_folio(folio, huge_page_order(h)); free_gigantic_folio(folio, huge_page_order(h)); } else { - __free_pages(&folio->page, huge_page_order(h)); + INIT_LIST_HEAD(&folio->_deferred_list); + folio_put(folio); } } diff --git a/mm/internal.h b/mm/internal.h index 6983493b997e..65e06f06d26b 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -610,6 +610,8 @@ static inline void prep_compound_head(struct page *page, unsigned int order) atomic_set(&folio->_entire_mapcount, -1); atomic_set(&folio->_nr_pages_mapped, 0); atomic_set(&folio->_pincount, 0); + if (order > 1) + INIT_LIST_HEAD(&folio->_deferred_list); } static inline void prep_compound_tail(struct page *head, int tail_idx) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 9951efb06d7b..f903714eacb1 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -8518,6 +8518,9 @@ static void uncharge_folio(struct folio *folio, struct uncharge_gather *ug) struct obj_cgroup *objcg; VM_BUG_ON_FOLIO(folio_test_lru(folio), folio); + VM_BUG_ON_FOLIO(folio_order(folio) > 1 && + !folio_test_hugetlb(folio) && + !list_empty(&folio->_deferred_list), folio); /* * Nobody should be changing or seriously looking at diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 2225e08d1c8e..5adb0fd95131 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -986,10 +986,11 @@ static int free_tail_page_prepare(struct page *head_page, struct page *page) } break; case 2: - /* - * the second tail page: ->mapping is - * deferred_list.next -- ignore value. 
- */ + /* the second tail page: deferred_list overlaps ->mapping */ + if (unlikely(!list_empty(&folio->_deferred_list))) { + bad_page(page, "on deferred list"); + goto out; + } break; default: if (page->mapping != TAIL_MAPPING) { -- Gitee From 97824705ab0536c0506ed1fd4953e623613eb49d Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Wed, 22 May 2024 16:02:27 +0800 Subject: [PATCH 33/37] mm/memory: change vmf_anon_prepare() to be non-static mainline inclusion from mainline-v6.9-rc1 commit 997f0ecb11da15602a3d34e10f9ca8418db794d0 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I9R3AY CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=997f0ecb11da15602a3d34e10f9ca8418db794d0 -------------------------------- Patch series "Handle hugetlb faults under the VMA lock", v2. It is generally safe to handle hugetlb faults under the VMA lock. The only time this is unsafe is when no anon_vma has been allocated to this vma yet, so we can use vmf_anon_prepare() instead of anon_vma_prepare() to bailout if necessary. This should only happen for the first hugetlb page in the vma. Additionally, this patchset begins to use struct vm_fault within hugetlb_fault(). This works towards cleaning up hugetlb code, and should significantly reduce the number of arguments passed to functions. The last patch in this series may cause ltp hugemmap10 to "fail". This is because vmf_anon_prepare() may bailout with no anon_vma under the VMA lock after allocating a folio for the hugepage. In free_huge_folio(), this folio is completely freed on bailout iff there is a surplus of hugetlb pages. This will remove a folio off the freelist and decrement the number of hugepages while ltp expects these counters to remain unchanged on failure. The rest of the ltp testcases pass. This patch (of 2): In order to handle hugetlb faults under the VMA lock, hugetlb can use vmf_anon_prepare() to ensure we can safely prepare an anon_vma. Change it to be a non-static function so it can be used within hugetlb as well. 
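The contract the rest of the series builds on is: if the VMA has no anon_vma yet and the fault only holds the per-VMA lock, bail out with RETRY so the fault is repeated under the mmap_lock. A simplified stand-alone model of that contract (the enums and structs are illustrative, not the kernel's):

    #include <stdbool.h>
    #include <stdio.h>

    enum fault_ret { FAULT_OK = 0, FAULT_RETRY = 1, FAULT_OOM = 2 };

    struct vma   { bool has_anon_vma; };
    struct fault { struct vma *vma; bool holds_only_vma_lock; };

    static enum fault_ret vmf_anon_prepare(struct fault *vmf)
    {
            if (vmf->vma->has_anon_vma)
                    return FAULT_OK;        /* fast path, nothing to set up */
            if (vmf->holds_only_vma_lock)
                    return FAULT_RETRY;     /* need mmap_lock to look at neighbours */
            vmf->vma->has_anon_vma = true;  /* __anon_vma_prepare() stand-in */
            return FAULT_OK;
    }

    int main(void)
    {
            struct vma v = { .has_anon_vma = false };
            struct fault f = { .vma = &v, .holds_only_vma_lock = true };

            printf("first attempt: %d (retry)\n", vmf_anon_prepare(&f));
            f.holds_only_vma_lock = false;  /* retried with mmap_lock */
            printf("second attempt: %d (ok)\n", vmf_anon_prepare(&f));
            return 0;
    }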
Link: https://lkml.kernel.org/r/20240221234732.187629-6-vishal.moola@gmail.com Link: https://lkml.kernel.org/r/20240221234732.187629-2-vishal.moola@gmail.com Signed-off-by: Vishal Moola (Oracle) Reviewed-by: Matthew Wilcox (Oracle) Cc: Muchun Song Signed-off-by: Andrew Morton Signed-off-by: Liu Shixin --- mm/internal.h | 1 + mm/memory.c | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/mm/internal.h b/mm/internal.h index 65e06f06d26b..0ecbaa392054 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -282,6 +282,7 @@ static inline void wake_throttle_isolated(pg_data_t *pgdat) wake_up(wqh); } +vm_fault_t vmf_anon_prepare(struct vm_fault *vmf); vm_fault_t do_swap_page(struct vm_fault *vmf); void folio_rotate_reclaimable(struct folio *folio); bool __folio_end_writeback(struct folio *folio); diff --git a/mm/memory.c b/mm/memory.c index ed275401e695..776f59299900 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3215,7 +3215,7 @@ static inline vm_fault_t vmf_can_call_fault(const struct vm_fault *vmf) return VM_FAULT_RETRY; } -static vm_fault_t vmf_anon_prepare(struct vm_fault *vmf) +vm_fault_t vmf_anon_prepare(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; -- Gitee From be48be9e1c56d6791a78f6ecddf3a104ccd38ca2 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 22 May 2024 16:02:28 +0800 Subject: [PATCH 34/37] mm: assert the mmap_lock is held in __anon_vma_prepare() mainline inclusion from mainline-v6.10-rc1 commit 3be51060599ff01899b6d8c3f8aca456506cf5ea category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I9R3AY CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=3be51060599ff01899b6d8c3f8aca456506cf5ea -------------------------------- Patch series "Improve anon_vma scalability for anon VMAs". We have a 3x throughput improvement reported by Intel's kernel test robot: https://lore.kernel.org/all/202404261055.c5e24608-oliver.sang@intel.com/ This is from delaying taking the mmap_lock for page faults until we actually need the mmap_lock in order to assign an anon_vma to the vma. It cleans up the page fault path a little by making the anon fault handler more similar to the file fault handler. This patch (of 4): Convert the comment into an assertion. Link: https://lkml.kernel.org/r/20240426144506.1290619-1-willy@infradead.org Link: https://lkml.kernel.org/r/20240426144506.1290619-2-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Suren Baghdasaryan Reviewed-by: David Hildenbrand Cc: Jann Horn Signed-off-by: Andrew Morton Signed-off-by: Liu Shixin --- mm/rmap.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/mm/rmap.c b/mm/rmap.c index 88345e743c4f..27f8881be2ad 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -182,8 +182,6 @@ static void anon_vma_chain_link(struct vm_area_struct *vma, * for the new allocation. At the same time, we do not want * to do any locking for the common case of already having * an anon_vma. - * - * This must be called with the mmap_lock held for reading. 
*/ int __anon_vma_prepare(struct vm_area_struct *vma) { @@ -191,6 +189,7 @@ int __anon_vma_prepare(struct vm_area_struct *vma) struct anon_vma *anon_vma, *allocated; struct anon_vma_chain *avc; + mmap_assert_locked(mm); might_sleep(); avc = anon_vma_chain_alloc(GFP_KERNEL); -- Gitee From 098c90dcd2b7cf56ca1e190b14508d2c2ee3584a Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 22 May 2024 16:02:29 +0800 Subject: [PATCH 35/37] mm: delay the check for a NULL anon_vma mainline inclusion from mainline-v6.10-rc1 commit a373baed5a9dca65a4d9fa55e61800a18c9936f1 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I9R3AY CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=a373baed5a9dca65a4d9fa55e61800a18c9936f1 -------------------------------- Instead of checking the anon_vma early in the fault path where all page faults pay the cost, delay it until we know we're going to need the anon_vma to be filled in. This will have a slight negative effect on the first fault in an anonymous VMA, but it shortens every other page fault. It also makes the code slightly cleaner as the anon and file backed fault handling look more similar. The Intel kernel test bot reports a 3x improvement in vm-scalability throughput with the small-allocs-mt test. This is clearly an extreme situation that won't be replicated in any real-world workload, but it's a nice win. https://lore.kernel.org/all/202404261055.c5e24608-oliver.sang@intel.com/ Link: https://lkml.kernel.org/r/20240426144506.1290619-3-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Suren Baghdasaryan Cc: David Hildenbrand Cc: Jann Horn Signed-off-by: Andrew Morton Signed-off-by: Liu Shixin --- mm/huge_memory.c | 6 ++++-- mm/memory.c | 29 ++++++++++++++++++----------- 2 files changed, 22 insertions(+), 13 deletions(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 04ca43ce7a36..18ec0e138b03 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1225,11 +1225,13 @@ vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf) gfp_t gfp; struct folio *folio; unsigned long haddr = vmf->address & HPAGE_PMD_MASK; + vm_fault_t ret; if (!thp_vma_suitable_order(vma, haddr, PMD_ORDER)) return VM_FAULT_FALLBACK; - if (unlikely(anon_vma_prepare(vma))) - return VM_FAULT_OOM; + ret = vmf_anon_prepare(vmf); + if (ret) + return ret; khugepaged_enter_vma(vma, vma->vm_flags); if (!(vmf->flags & FAULT_FLAG_WRITE) && diff --git a/mm/memory.c b/mm/memory.c index 776f59299900..0a3e0f680f90 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3215,6 +3215,21 @@ static inline vm_fault_t vmf_can_call_fault(const struct vm_fault *vmf) return VM_FAULT_RETRY; } +/** + * vmf_anon_prepare - Prepare to handle an anonymous fault. + * @vmf: The vm_fault descriptor passed from the fault handler. + * + * When preparing to insert an anonymous page into a VMA from a + * fault handler, call this function rather than anon_vma_prepare(). + * If this vma does not already have an associated anon_vma and we are + * only protected by the per-VMA lock, the caller must retry with the + * mmap_lock held. __anon_vma_prepare() will look at adjacent VMAs to + * determine if this VMA can share its anon_vma, and that's not safe to + * do with only the per-VMA lock held for this VMA. + * + * Return: 0 if fault handling can proceed. Any other value should be + * returned to the caller. 
+ */ vm_fault_t vmf_anon_prepare(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; @@ -4418,8 +4433,9 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf) } /* Allocate our own private page. */ - if (unlikely(anon_vma_prepare(vma))) - goto oom; + ret = vmf_anon_prepare(vmf); + if (ret) + return ret; /* Returns NULL on OOM or ERR_PTR(-EAGAIN) if we must retry the fault */ folio = alloc_anon_folio(vmf); if (IS_ERR(folio)) @@ -5803,15 +5819,6 @@ struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm, if (!vma_start_read(vma)) goto inval; - /* - * find_mergeable_anon_vma uses adjacent vmas which are not locked. - * This check must happen after vma_start_read(); otherwise, a - * concurrent mremap() with MREMAP_DONTUNMAP could dissociate the VMA - * from its anon_vma. - */ - if (unlikely(vma_is_anonymous(vma) && !vma->anon_vma)) - goto inval_end_read; - /* Check since vm_start/vm_end might change before we lock the VMA */ if (unlikely(address < vma->vm_start || address >= vma->vm_end)) goto inval_end_read; -- Gitee From 56379aed5215336e548fbb6112967d2b03befa37 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 22 May 2024 16:02:30 +0800 Subject: [PATCH 36/37] mm: optimise vmf_anon_prepare() for VMAs without an anon_vma mainline inclusion from mainline-v6.10-rc1 commit 737019cf6ac5babb75645ad324aeead7bc04749d category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I9R3AY CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=737019cf6ac5babb75645ad324aeead7bc04749d -------------------------------- If the mmap_lock can be taken for read, we can call __anon_vma_prepare() while holding it, saving ourselves a trip back through the fault handler. Link: https://lkml.kernel.org/r/20240426144506.1290619-5-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Jann Horn Reviewed-by: Suren Baghdasaryan Reviewed-by: David Hildenbrand Signed-off-by: Andrew Morton Signed-off-by: Liu Shixin --- mm/memory.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index 0a3e0f680f90..fa4d1b499511 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3233,16 +3233,21 @@ static inline vm_fault_t vmf_can_call_fault(const struct vm_fault *vmf) vm_fault_t vmf_anon_prepare(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; + vm_fault_t ret = 0; if (likely(vma->anon_vma)) return 0; if (vmf->flags & FAULT_FLAG_VMA_LOCK) { - vma_end_read(vma); - return VM_FAULT_RETRY; + if (!mmap_read_trylock(vma->vm_mm)) { + vma_end_read(vma); + return VM_FAULT_RETRY; + } } if (__anon_vma_prepare(vma)) - return VM_FAULT_OOM; - return 0; + ret = VM_FAULT_OOM; + if (vmf->flags & FAULT_FLAG_VMA_LOCK) + mmap_read_unlock(vma->vm_mm); + return ret; } /* -- Gitee From 3616bd3a540490276028188e6e8229bd7b2febf4 Mon Sep 17 00:00:00 2001 From: Liu Shixin Date: Wed, 22 May 2024 16:02:31 +0800 Subject: [PATCH 37/37] mm/dynamic_pool: clear PG_hugetlb when demoting hugepages hulk inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I9R3AY CVE: NA -------------------------------- Before commit 738fe30dc41a, PG_hugetlb was recorded in page->flags and was cleared by clear_compound_page(). Now that PG_hugetlb is recorded in page_type instead, clear_compound_page() no longer clears it, so we have to clear it explicitly with __folio_clear_hugetlb(). 
Fixes: 738fe30dc41a ("mm: turn folio_test_hugetlb into a PageType") Signed-off-by: Liu Shixin --- mm/dynamic_pool.c | 1 + 1 file changed, 1 insertion(+) diff --git a/mm/dynamic_pool.c b/mm/dynamic_pool.c index b1590362c2c9..41a627431ea6 100644 --- a/mm/dynamic_pool.c +++ b/mm/dynamic_pool.c @@ -269,6 +269,7 @@ static int dpool_demote_huge_page(struct pages_pool *src_pool, __ClearPageDpool(page); src_pool->free_pages--; + __folio_clear_hugetlb(page_folio(page)); clear_compound_page(page_folio(page), PMD_ORDER); for (i = 0; i < nr_pages; i++) { subpage = folio_page(folio, i); -- Gitee
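
Taken together, the vmf_anon_prepare() hunks in patches 35/37 and 36/37 above leave the function looking roughly like the sketch below. This is only a reassembly of the diff fragments shown in this series, with comments added for illustration; the exact placement and context in mm/memory.c are not reproduced here.

	vm_fault_t vmf_anon_prepare(struct vm_fault *vmf)
	{
		struct vm_area_struct *vma = vmf->vma;
		vm_fault_t ret = 0;

		/* Fast path: this VMA already has an anon_vma assigned. */
		if (likely(vma->anon_vma))
			return 0;
		if (vmf->flags & FAULT_FLAG_VMA_LOCK) {
			/*
			 * __anon_vma_prepare() may look at adjacent VMAs, which
			 * is only safe under the mmap_lock.  Give up and retry
			 * the fault only if the lock cannot be taken for read.
			 */
			if (!mmap_read_trylock(vma->vm_mm)) {
				vma_end_read(vma);
				return VM_FAULT_RETRY;
			}
		}
		if (__anon_vma_prepare(vma))
			ret = VM_FAULT_OOM;
		if (vmf->flags & FAULT_FLAG_VMA_LOCK)
			mmap_read_unlock(vma->vm_mm);
		return ret;
	}

The mmap_read_trylock() is the point of patch 36/37: when the mmap_lock happens to be free, the anon_vma can be set up during this fault instead of returning VM_FAULT_RETRY and paying for a second trip through the fault handler.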