From c9ed1c54d009e5ec8c250beee95ad0bbb74dff3d Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 20 Dec 2024 08:47:40 -0700 Subject: [PATCH 01/11] mm/filemap: use page_cache_sync_ra() to kick off read-ahead ANBZ: #27274 commit f598cdaafc370a797ae883d370a7c18c1ffc43ef upstream. Rather than use the page_cache_sync_readahead() helper, define our own ractl and use page_cache_sync_ra() directly. In preparation for needing to modify ractl inside filemap_get_pages(). No functional changes in this patch. [Backport Note] In generic_file_buffered_read_get_pages(), we explicitly define our own ractl structure in both synchronous and asynchronous readahead, which ensures that subsequent logic can flexibly adjust or modify ractl as needed. Link: https://lkml.kernel.org/r/20241220154831.1086649-3-axboe@kernel.dk Signed-off-by: Jens Axboe Reviewed-by: Kirill A. Shutemov Reviewed-by: Christoph Hellwig Reviewed-by: Matthew Wilcox (Oracle) Cc: Brian Foster Cc: Chris Mason Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Weilin Tong --- mm/filemap.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/mm/filemap.c b/mm/filemap.c index fb6f25462df6..d4783139937b 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2401,7 +2401,6 @@ static int generic_file_buffered_read_get_pages(struct kiocb *iocb, { struct file *filp = iocb->ki_filp; struct address_space *mapping = filp->f_mapping; - struct file_ra_state *ra = &filp->f_ra; pgoff_t index = iocb->ki_pos >> PAGE_SHIFT; pgoff_t last_index = (iocb->ki_pos + iter->count + PAGE_SIZE-1) >> PAGE_SHIFT; int i, j, nr_got, err = 0; @@ -2418,7 +2417,8 @@ static int generic_file_buffered_read_get_pages(struct kiocb *iocb, if (iocb->ki_flags & IOCB_NOIO) return -EAGAIN; - page_cache_sync_readahead(mapping, ra, filp, index, last_index - index); + DEFINE_READAHEAD(ractl, filp, &filp->f_ra, mapping, index); + page_cache_sync_ra(&ractl, last_index - index); nr_got = find_get_pages_contig(mapping, index, nr, pages); if (nr_got) @@ -2444,8 +2444,8 @@ static int generic_file_buffered_read_get_pages(struct kiocb *iocb, err = -EAGAIN; break; } - page_cache_async_readahead(mapping, ra, filp, page, - pg_index, last_index - pg_index); + DEFINE_READAHEAD(ractl_async, filp, &filp->f_ra, mapping, pg_index); + page_cache_async_ra(&ractl_async, page, last_index - pg_index); } if (!PageUptodate(page)) { -- Gitee From fb347a213e3d2d4a94f305b4f0c6ffd8a29ca742 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 20 Dec 2024 08:47:41 -0700 Subject: [PATCH 02/11] mm/readahead: add folio allocation helper ANBZ: #27274 commit 1963de79d3a3bc12b7a17a922d508b733ca8fa9e upstream. Just a wrapper around filemap_alloc_folio() for now, but add it in preparation for modifying the folio based on the 'ractl' being passed in. No functional changes in this patch. [Backport Note] Use ractl_alloc_page to wrap __page_cache_alloc in preparation for modifying the folio based on the 'ractl' being passed in. Link: https://lkml.kernel.org/r/20241220154831.1086649-4-axboe@kernel.dk Signed-off-by: Jens Axboe Reviewed-by: Kirill A. 
Shutemov Reviewed-by: Matthew Wilcox (Oracle) Cc: Brian Foster Cc: Chris Mason Cc: Christoph Hellwig Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Weilin Tong --- mm/readahead.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/mm/readahead.c b/mm/readahead.c index f5fa99d71a12..240c45fbf8fe 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -161,6 +161,12 @@ static void read_pages(struct readahead_control *rac, struct list_head *pages, rac->_index++; } +static struct page *ractl_alloc_page(struct readahead_control *ractl, + gfp_t gfp_mask) +{ + return __page_cache_alloc(gfp_mask); +} + /** * page_cache_ra_unbounded - Start unchecked readahead. * @ractl: Readahead control. @@ -217,7 +223,7 @@ void page_cache_ra_unbounded(struct readahead_control *ractl, continue; } - page = __page_cache_alloc(gfp_mask); + page = ractl_alloc_page(ractl, gfp_mask); if (!page) break; if (mapping->a_ops->readpages) { @@ -694,7 +700,7 @@ void readahead_expand(struct readahead_control *ractl, if (page && !xa_is_value(page)) return; /* Page apparently present */ - page = __page_cache_alloc(gfp_mask); + page = ractl_alloc_page(ractl, gfp_mask); if (!page) return; if (add_to_page_cache_lru(page, mapping, index, gfp_mask) < 0) { @@ -717,7 +723,7 @@ void readahead_expand(struct readahead_control *ractl, if (page && !xa_is_value(page)) return; /* Page apparently present */ - page = __page_cache_alloc(gfp_mask); + page = ractl_alloc_page(ractl, gfp_mask); if (!page) return; if (add_to_page_cache_lru(page, mapping, index, gfp_mask) < 0) { -- Gitee From 22b8366de0a257d471c9311abc7d81f481a3382a Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 20 Dec 2024 08:47:42 -0700 Subject: [PATCH 03/11] mm: add PG_dropbehind folio flag ANBZ: #27274 commit cceba6f7e46c48deca433030d80fc34599fb9fd8 upstream. Add a folio flag that file IO can use to indicate that the cached IO being done should be dropped from the page cache upon completion. Link: https://lkml.kernel.org/r/20241220154831.1086649-5-axboe@kernel.dk Signed-off-by: Jens Axboe Reviewed-by: Kirill A. Shutemov Cc: Brian Foster Cc: Chris Mason Cc: Christoph Hellwig Cc: Johannes Weiner Cc: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton Signed-off-by: Weilin Tong --- include/linux/page-flags.h | 5 +++++ include/trace/events/mmflags.h | 3 ++- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index d2022ec30f99..7e9a5c5be7f2 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -124,6 +124,7 @@ enum pageflags { PG_reclaim, /* To be reclaimed asap */ PG_swapbacked, /* Page is backed by RAM/swap */ PG_unevictable, /* Page is "unevictable" */ + PG_dropbehind, /* drop pages on IO completion */ #ifdef CONFIG_MMU PG_mlocked, /* Page is vma mlocked */ #endif @@ -483,6 +484,10 @@ PAGEFLAG(Reclaim, reclaim, PF_NO_TAIL) PAGEFLAG(Readahead, reclaim, PF_NO_COMPOUND) TESTCLEARFLAG(Readahead, reclaim, PF_NO_COMPOUND) +PAGEFLAG(Dropbehind, dropbehind, PF_NO_COMPOUND) + TESTCLEARFLAG(Dropbehind, dropbehind, PF_HEAD) + __SETPAGEFLAG(Dropbehind, dropbehind, PF_HEAD) + #ifdef CONFIG_HIGHMEM /* * Must use a macro here due to header dependency issues. 
page_zone() is not diff --git a/include/trace/events/mmflags.h b/include/trace/events/mmflags.h index 8b93b136e101..b6c0043bcee9 100644 --- a/include/trace/events/mmflags.h +++ b/include/trace/events/mmflags.h @@ -120,7 +120,8 @@ {1UL << PG_mappedtodisk, "mappedtodisk" }, \ {1UL << PG_reclaim, "reclaim" }, \ {1UL << PG_swapbacked, "swapbacked" }, \ - {1UL << PG_unevictable, "unevictable" } \ + {1UL << PG_unevictable, "unevictable" }, \ + {1UL << PG_dropbehind, "dropbehind" } \ IF_HAVE_PG_MLOCK(PG_mlocked, "mlocked" ) \ IF_HAVE_PG_UNCACHED(PG_uncached, "uncached" ) \ IF_HAVE_PG_HWPOISON(PG_hwpoison, "hwpoison" ) \ -- Gitee From cdb4b763273e9a5de9ca408dfa68104889d9c6fb Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 20 Dec 2024 08:47:43 -0700 Subject: [PATCH 04/11] mm/readahead: add readahead_control->dropbehind member ANBZ: #27274 commit 77d075221ae777296e2b18a0a4f5fea6f75daf2c upstream. If ractl->dropbehind is set to true, then folios created are marked as dropbehind as well. Link: https://lkml.kernel.org/r/20241220154831.1086649-6-axboe@kernel.dk Signed-off-by: Jens Axboe Reviewed-by: Kirill A. Shutemov Cc: Brian Foster Cc: Chris Mason Cc: Christoph Hellwig Cc: Johannes Weiner Cc: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton Signed-off-by: Weilin Tong --- include/linux/pagemap.h | 1 + mm/readahead.c | 8 +++++++- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 46ea8cb4110b..d9fa8811e9ec 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -841,6 +841,7 @@ struct readahead_control { pgoff_t _index; unsigned int _nr_pages; unsigned int _batch_count; + bool dropbehind; CK_KABI_RESERVE(1) CK_KABI_RESERVE(2) diff --git a/mm/readahead.c b/mm/readahead.c index 240c45fbf8fe..ebf2f4fe0e22 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -164,7 +164,13 @@ static void read_pages(struct readahead_control *rac, struct list_head *pages, static struct page *ractl_alloc_page(struct readahead_control *ractl, gfp_t gfp_mask) { - return __page_cache_alloc(gfp_mask); + struct page *page; + + page = __page_cache_alloc(gfp_mask); + if (page && ractl->dropbehind) + __SetPageDropbehind(page); + + return page; } /** -- Gitee From 59dcb5c767af5713a94332dcd61c5d2c5856af35 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 20 Dec 2024 08:47:44 -0700 Subject: [PATCH 05/11] mm/truncate: add folio_unmap_invalidate() helper ANBZ: #27274 commit 4a9e23159fd37677efc0c2c53e3b45a5d260a90a upstream. Add a folio_unmap_invalidate() helper, which unmaps and invalidates a given folio. The caller must already have locked the folio. Embed the old invalidate_complete_folio2() helper in there as well, as nobody else calls it. Use this new helper in invalidate_inode_pages2_range(), rather than duplicate the code there. In preparation for using this elsewhere as well, have it take a gfp_t mask rather than assume GFP_KERNEL is the right choice. This bubbles back to invalidate_complete_folio2() as well. [Backport Note] In 5.10, we add page_unmap_invalidate() helper. Link: https://lkml.kernel.org/r/20241220154831.1086649-7-axboe@kernel.dk Signed-off-by: Jens Axboe Cc: Brian Foster Cc: Chris Mason Cc: Christoph Hellwig Cc: Johannes Weiner Cc: Kirill A. 
Shutemov Cc: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton Signed-off-by: Weilin Tong --- mm/internal.h | 2 ++ mm/truncate.c | 50 +++++++++++++++++++++++++++----------------------- 2 files changed, 29 insertions(+), 23 deletions(-) diff --git a/mm/internal.h b/mm/internal.h index 85a392d6b1c3..2acb3e1d742b 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -50,6 +50,8 @@ void unmap_page_range(struct mmu_gather *tlb, struct vm_area_struct *vma, unsigned long addr, unsigned long end, struct zap_details *details); +int page_unmap_invalidate(struct address_space *mapping, struct page *page, + gfp_t gfp); void do_page_cache_ra(struct readahead_control *, unsigned long nr_to_read, unsigned long lookahead_size); diff --git a/mm/truncate.c b/mm/truncate.c index 4b85a4d5d060..c501acde923b 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -668,6 +668,15 @@ void invalidate_mapping_pagevec(struct address_space *mapping, __invalidate_mapping_pages(mapping, start, end, nr_pagevec); } +static int do_launder_page(struct address_space *mapping, struct page *page) +{ + if (!PageDirty(page)) + return 0; + if (page->mapping != mapping || mapping->a_ops->launder_page == NULL) + return 0; + return mapping->a_ops->launder_page(page); +} + /* * This is like invalidate_complete_page(), except it ignores the page's * refcount. We do this because invalidate_inode_pages2() needs stronger @@ -675,16 +684,28 @@ void invalidate_mapping_pagevec(struct address_space *mapping, * shrink_page_list() has a temp ref on them, or because they're transiently * sitting in the lru_cache_add() pagevecs. */ -static int -invalidate_complete_page2(struct address_space *mapping, struct page *page) +int page_unmap_invalidate(struct address_space *mapping, struct page *page, + gfp_t gfp) { unsigned long flags; + int ret; - if (page->mapping != mapping) - return 0; + VM_BUG_ON_PAGE(!PageLocked(page), page); - if (page_has_private(page) && !try_to_release_page(page, GFP_KERNEL)) + if (PageDirty(page)) return 0; + if (page_mapped(page)) + unmap_mapping_page(page); + BUG_ON(page_mapped(page)); + + ret = do_launder_page(mapping, page); + if (ret) + return ret; + if (page->mapping != mapping) + return -EBUSY; + + if (page_has_private(page) && !try_to_release_page(page, gfp)) + return -EBUSY; xa_lock_irqsave(&mapping->i_pages, flags); if (PageDirty(page)) @@ -712,15 +733,6 @@ invalidate_complete_page2(struct address_space *mapping, struct page *page) return 0; } -static int do_launder_page(struct address_space *mapping, struct page *page) -{ - if (!PageDirty(page)) - return 0; - if (page->mapping != mapping || mapping->a_ops->launder_page == NULL) - return 0; - return mapping->a_ops->launder_page(page); -} - /** * invalidate_inode_pages2_range - remove range of pages from an address_space * @mapping: the address_space @@ -784,15 +796,7 @@ int invalidate_inode_pages2_range(struct address_space *mapping, } wait_on_page_writeback(page); - if (page_mapped(page)) - unmap_mapping_page(page); - BUG_ON(page_mapped(page)); - - ret2 = do_launder_page(mapping, page); - if (ret2 == 0) { - if (!invalidate_complete_page2(mapping, page)) - ret2 = -EBUSY; - } + ret2 = page_unmap_invalidate(mapping, page, GFP_KERNEL); if (ret2 < 0) ret = ret2; unlock_page(page); -- Gitee From b4e6982aa64563e86948fb4cec08a11aba7ed3c1 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 20 Dec 2024 08:47:45 -0700 Subject: [PATCH 06/11] fs: add RWF_DONTCACHE iocb and FOP_DONTCACHE file_operations flag ANBZ: #27274 commit b9f958d4f146bd11be33a5f2bc3ced50f86d6b23 upstream. 
If a file system supports uncached buffered IO, it may set FOP_DONTCACHE and enable support for RWF_DONTCACHE. If RWF_DONTCACHE is attempted without the file system supporting it, it'll get errored with -EOPNOTSUPP. [Backport Note] The 5.10 kernel does not have fop_flags in struct file_operations, so we add it and use only FOP_DONTCACHE for now. mmap_supported_flags is kept as-is for its existing users. Link: https://lkml.kernel.org/r/20241220154831.1086649-8-axboe@kernel.dk Signed-off-by: Jens Axboe Cc: Brian Foster Cc: Chris Mason Cc: Christoph Hellwig Cc: Johannes Weiner Cc: Kirill A. Shutemov Cc: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton Signed-off-by: Weilin Tong --- include/linux/fs.h | 14 ++++++++++++++ include/uapi/linux/fs.h | 5 ++++- 2 files changed, 18 insertions(+), 1 deletion(-) diff --git a/include/linux/fs.h b/include/linux/fs.h index 8e9d1c945dc2..bfaaa8465216 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -312,6 +312,7 @@ enum rw_hint { #define IOCB_SYNC (__force int) RWF_SYNC #define IOCB_NOWAIT (__force int) RWF_NOWAIT #define IOCB_APPEND (__force int) RWF_APPEND +#define IOCB_DONTCACHE (__force int) RWF_DONTCACHE /* non-RWF related bits - start at 16 */ #define IOCB_EVENTFD (1 << 16) @@ -1964,8 +1965,11 @@ struct dir_context { struct iov_iter; struct io_uring_cmd; +typedef unsigned int __bitwise fop_flags_t; + struct file_operations { struct module *owner; + fop_flags_t fop_flags; loff_t (*llseek) (struct file *, loff_t, int); ssize_t (*read) (struct file *, char __user *, size_t, loff_t *); ssize_t (*write) (struct file *, const char __user *, size_t, loff_t *); @@ -2015,6 +2019,8 @@ struct file_operations { CK_KABI_RESERVE(4) } __randomize_layout; +#define FOP_DONTCACHE ((__force fop_flags_t)(1 << 7)) + struct inode_operations { struct dentry * (*lookup) (struct inode *,struct dentry *, unsigned int); const char * (*get_link) (struct dentry *, struct inode *, struct delayed_call *); @@ -3462,6 +3468,14 @@ static inline int kiocb_set_rw_flags(struct kiocb *ki, rwf_t flags) return -EOPNOTSUPP; kiocb_flags |= IOCB_NOIO; } + if (flags & RWF_DONTCACHE) { + /* file system must support it */ + if (!(ki->ki_filp->f_op->fop_flags & FOP_DONTCACHE)) + return -EOPNOTSUPP; + /* DAX mappings not supported */ + if (IS_DAX(ki->ki_filp->f_mapping->host)) + return -EOPNOTSUPP; + } kiocb_flags |= (__force int) (flags & RWF_SUPPORTED); if (flags & RWF_SYNC) kiocb_flags |= IOCB_DSYNC; diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h index f44eb0a04afd..8a505280508c 100644 --- a/include/uapi/linux/fs.h +++ b/include/uapi/linux/fs.h @@ -300,8 +300,11 @@ typedef int __bitwise __kernel_rwf_t; /* per-IO O_APPEND */ #define RWF_APPEND ((__force __kernel_rwf_t)0x00000010) +/* buffered IO that drops the cache after reading or writing data */ +#define RWF_DONTCACHE ((__force __kernel_rwf_t)0x00000080) + /* mask of flags supported by the kernel */ #define RWF_SUPPORTED (RWF_HIPRI | RWF_DSYNC | RWF_SYNC | RWF_NOWAIT |\ - RWF_APPEND) + RWF_APPEND | RWF_DONTCACHE) #endif /* _UAPI_LINUX_FS_H */ -- Gitee From 0598f74a16f25173df141842e44d1026417808c4 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 20 Dec 2024 08:47:46 -0700 Subject: [PATCH 07/11] mm/filemap: add read support for RWF_DONTCACHE ANBZ: #27274 commit 8026e49bff9b151609da4cae20e9da7f1833dde6 upstream. Add RWF_DONTCACHE as a read operation flag, which means that any data read will be removed from the page cache upon completion. 
Uses the page cache to synchronize, and simply prunes folios that were instantiated when the operation completes. While it would be possible to use private pages for this, using the page cache as synchronization is handy for a variety of reasons: 1) No special truncate magic is needed 2) Async buffered reads need some place to serialize, using the page cache is a lot easier than writing extra code for this 3) The pruning cost is pretty reasonable and the code to support this is much simpler as a result. You can think of uncached buffered IO as being the much more attractive cousin of O_DIRECT - it has none of the restrictions of O_DIRECT. Yes, it will copy the data, but unlike regular buffered IO, it doesn't run into the unpredictability of the page cache in terms of reclaim. As an example, on a test box with 32 drives, reading them with buffered IO looks as follows: Reading bs 65536, uncached 0 1s: 145945MB/sec 2s: 158067MB/sec 3s: 157007MB/sec 4s: 148622MB/sec 5s: 118824MB/sec 6s: 70494MB/sec 7s: 41754MB/sec 8s: 90811MB/sec 9s: 92204MB/sec 10s: 95178MB/sec 11s: 95488MB/sec 12s: 95552MB/sec 13s: 96275MB/sec where it's quite easy to see where the page cache filled up, and performance went from good to erratic, and finally settles at a much lower rate. Looking at top while this is ongoing, we see: PID USER PR NI VIRT RES SHR S %CPU %MEM TIME+ COMMAND 7535 root 20 0 267004 0 0 S 3199 0.0 8:40.65 uncached 3326 root 20 0 0 0 0 R 100.0 0.0 0:16.40 kswapd4 3327 root 20 0 0 0 0 R 100.0 0.0 0:17.22 kswapd5 3328 root 20 0 0 0 0 R 100.0 0.0 0:13.29 kswapd6 3332 root 20 0 0 0 0 R 100.0 0.0 0:11.11 kswapd10 3339 root 20 0 0 0 0 R 100.0 0.0 0:16.25 kswapd17 3348 root 20 0 0 0 0 R 100.0 0.0 0:16.40 kswapd26 3343 root 20 0 0 0 0 R 100.0 0.0 0:16.30 kswapd21 3344 root 20 0 0 0 0 R 100.0 0.0 0:11.92 kswapd22 3349 root 20 0 0 0 0 R 100.0 0.0 0:16.28 kswapd27 3352 root 20 0 0 0 0 R 99.7 0.0 0:11.89 kswapd30 3353 root 20 0 0 0 0 R 96.7 0.0 0:16.04 kswapd31 3329 root 20 0 0 0 0 R 96.4 0.0 0:11.41 kswapd7 3345 root 20 0 0 0 0 R 96.4 0.0 0:13.40 kswapd23 3330 root 20 0 0 0 0 S 91.1 0.0 0:08.28 kswapd8 3350 root 20 0 0 0 0 S 86.8 0.0 0:11.13 kswapd28 3325 root 20 0 0 0 0 S 76.3 0.0 0:07.43 kswapd3 3341 root 20 0 0 0 0 S 74.7 0.0 0:08.85 kswapd19 3334 root 20 0 0 0 0 S 71.7 0.0 0:10.04 kswapd12 3351 root 20 0 0 0 0 R 60.5 0.0 0:09.59 kswapd29 3323 root 20 0 0 0 0 R 57.6 0.0 0:11.50 kswapd1 [...] which is just showing a partial list of the 32 kswapd threads that are running mostly full tilt, burning ~28 full CPU cores. If the same test case is run with RWF_DONTCACHE set for the buffered read, the output looks as follows: Reading bs 65536, uncached 0 1s: 153144MB/sec 2s: 156760MB/sec 3s: 158110MB/sec 4s: 158009MB/sec 5s: 158043MB/sec 6s: 157638MB/sec 7s: 157999MB/sec 8s: 158024MB/sec 9s: 157764MB/sec 10s: 157477MB/sec 11s: 157417MB/sec 12s: 157455MB/sec 13s: 157233MB/sec 14s: 156692MB/sec which is just chugging along at ~155GB/sec of read performance. Looking at top, we see: PID USER PR NI VIRT RES SHR S %CPU %MEM TIME+ COMMAND 7961 root 20 0 267004 0 0 S 3180 0.0 5:37.95 uncached 8024 axboe 20 0 14292 4096 0 R 1.0 0.0 0:00.13 top where just the test app is using CPU, no reclaim is taking place outside of the main thread. Not only is performance 65% better, it's also using half the CPU to do it. Link: https://lkml.kernel.org/r/20241220154831.1086649-9-axboe@kernel.dk Signed-off-by: Jens Axboe Cc: Brian Foster Cc: Chris Mason Cc: Christoph Hellwig Cc: Johannes Weiner Cc: Kirill A. 
Shutemov Cc: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton Signed-off-by: Weilin Tong --- mm/filemap.c | 31 +++++++++++++++++++++++++++++-- mm/swap.c | 3 +++ 2 files changed, 32 insertions(+), 2 deletions(-) diff --git a/mm/filemap.c b/mm/filemap.c index d4783139937b..4c316e11db8a 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2383,6 +2383,8 @@ generic_file_buffered_read_no_cached_page(struct kiocb *iocb, page = page_cache_alloc(mapping); if (!page) return ERR_PTR(-ENOMEM); + if (iocb->ki_flags & IOCB_DONTCACHE) + __SetPageDropbehind(page); error = add_to_page_cache_lru(page, mapping, index, mapping_gfp_constraint(mapping, GFP_KERNEL)); @@ -2418,6 +2420,9 @@ static int generic_file_buffered_read_get_pages(struct kiocb *iocb, return -EAGAIN; DEFINE_READAHEAD(ractl, filp, &filp->f_ra, mapping, index); + if (iocb->ki_flags & IOCB_DONTCACHE) + ractl.dropbehind = 1; + page_cache_sync_ra(&ractl, last_index - index); nr_got = find_get_pages_contig(mapping, index, nr, pages); @@ -2445,6 +2450,9 @@ static int generic_file_buffered_read_get_pages(struct kiocb *iocb, break; } DEFINE_READAHEAD(ractl_async, filp, &filp->f_ra, mapping, pg_index); + if (iocb->ki_flags & IOCB_DONTCACHE) + ractl_async.dropbehind = 1; + page_cache_async_ra(&ractl_async, page, last_index - pg_index); } @@ -2487,6 +2495,21 @@ static inline bool pos_same_page(loff_t pos1, loff_t pos2, struct page *page) return (pos1 >> shift == pos2 >> shift); } + +static void filemap_end_dropbehind_read(struct address_space *mapping, + struct page *page) +{ + if (!PageDropbehind(page)) + return; + if (PageWriteback(page) || PageDirty(page)) + return; + if (trylock_page(page)) { + if (TestClearPageDropbehind(page)) + page_unmap_invalidate(mapping, page, 0); + unlock_page(page); + } +} + /** * generic_file_buffered_read - generic file read routine * @iocb: the iocb to read @@ -2610,8 +2633,12 @@ ssize_t generic_file_buffered_read(struct kiocb *iocb, } } put_pages: - for (i = 0; i < pg_nr; i++) - put_page(pages[i]); + for (i = 0; i < pg_nr; i++) { + struct page *page = pages[i]; + + filemap_end_dropbehind_read(mapping, page); + put_page(page); + } } while (iov_iter_count(iter) && iocb->ki_pos < isize && !error); file_accessed(filp); diff --git a/mm/swap.c b/mm/swap.c index 299d09b2d285..86506c16d0b8 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -458,6 +458,9 @@ void mark_page_accessed(struct page *page) { page = compound_head(page); + if (PageDropbehind(page)) + return; + if (lru_gen_enabled()) { page_inc_refs(page); return; -- Gitee From 0577bcd0f6969ba02f9e0885778a5cfafff4d512 Mon Sep 17 00:00:00 2001 From: Jingbo Xu Date: Tue, 18 Feb 2025 19:47:37 +0800 Subject: [PATCH 08/11] mm/truncate: don't skip dirty page in folio_unmap_invalidate() ANBZ: #27274 ANBZ: #18912 cherry-picked from https://lore.kernel.org/all/20250218120209.88093-3-jefflexu@linux.alibaba.com/ ... otherwise this is a behavior change for the previous callers of invalidate_complete_folio2(), e.g. the page invalidation routine. 
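[Usage Example] The following is an illustrative userspace sketch, not part of this series, showing how a reader would opt into the uncached read path wired up in the patches above via preadv2(2), falling back to a plain buffered read where the kernel or file system lacks support. The local RWF_DONTCACHE define mirrors the value added to include/uapi/linux/fs.h in PATCH 06 and is only needed while libc headers do not yet carry the flag; preadv2() needs glibc 2.26 or newer.

#define _GNU_SOURCE
#include <stdio.h>
#include <errno.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/uio.h>

#ifndef RWF_DONTCACHE
#define RWF_DONTCACHE 0x00000080	/* mirrors include/uapi/linux/fs.h in PATCH 06 */
#endif

int main(int argc, char **argv)
{
	static char buf[65536];
	struct iovec iov = { .iov_base = buf, .iov_len = sizeof(buf) };
	off_t off = 0;
	int fd;

	if (argc < 2) {
		fprintf(stderr, "usage: %s <file>\n", argv[0]);
		return 1;
	}
	fd = open(argv[1], O_RDONLY);
	if (fd < 0) {
		perror("open");
		return 1;
	}
	for (;;) {
		/* uncached buffered read: pages are pruned once the read completes */
		ssize_t ret = preadv2(fd, &iov, 1, off, RWF_DONTCACHE);

		if (ret < 0 && errno == EOPNOTSUPP) {
			/* kernel or file system lacks support: plain buffered read */
			ret = preadv2(fd, &iov, 1, off, 0);
		}
		if (ret < 0) {
			perror("preadv2");
			close(fd);
			return 1;
		}
		if (ret == 0)
			break;	/* EOF */
		off += ret;
	}
	close(fd);
	return 0;
}

The EOPNOTSUPP fallback also covers kernels without this series, which reject unknown RWF_* bits with the same error.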
Fixes: 4a9e23159fd3 ("mm/truncate: add folio_unmap_invalidate() helper") Signed-off-by: Jingbo Xu Reviewed-by: Joseph Qi Link: https://gitee.com/anolis/cloud-kernel/pulls/4656 Signed-off-by: Weilin Tong --- mm/truncate.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/mm/truncate.c b/mm/truncate.c index c501acde923b..149aa122a04d 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -692,8 +692,6 @@ int page_unmap_invalidate(struct address_space *mapping, struct page *page, VM_BUG_ON_PAGE(!PageLocked(page), page); - if (PageDirty(page)) - return 0; if (page_mapped(page)) unmap_mapping_page(page); BUG_ON(page_mapped(page)); -- Gitee From 28f06f23a9f2c917a559fe95c8f0f72a0681f56e Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 4 Feb 2025 11:40:00 -0700 Subject: [PATCH 09/11] xfs: flag as supporting FOP_DONTCACHE ANBZ: #27274 commit 974c5e6139db30fae668e44c381d13bcc63b65fa upstream. Read side was already fully supported, and with the write side appropriately punted to the worker queue, all that's needed now is setting FOP_DONTCACHE in the file_operations structure to enable full support for read and write uncached IO. This provides similar benefits to using RWF_DONTCACHE with reads. Testing buffered writes on 32 files: writing bs 65536, uncached 0 1s: 196035MB/sec 2s: 132308MB/sec 3s: 132438MB/sec 4s: 116528MB/sec 5s: 103898MB/sec 6s: 108893MB/sec 7s: 99678MB/sec 8s: 106545MB/sec 9s: 106826MB/sec 10s: 101544MB/sec 11s: 111044MB/sec 12s: 124257MB/sec 13s: 116031MB/sec 14s: 114540MB/sec 15s: 115011MB/sec 16s: 115260MB/sec 17s: 116068MB/sec 18s: 116096MB/sec where it's quite obvious where the page cache filled, and performance dropped from to about half of where it started, settling in at around 115GB/sec. Meanwhile, 32 kswapds were running full steam trying to reclaim pages. Running the same test with uncached buffered writes: writing bs 65536, uncached 1 1s: 198974MB/sec 2s: 189618MB/sec 3s: 193601MB/sec 4s: 188582MB/sec 5s: 193487MB/sec 6s: 188341MB/sec 7s: 194325MB/sec 8s: 188114MB/sec 9s: 192740MB/sec 10s: 189206MB/sec 11s: 193442MB/sec 12s: 189659MB/sec 13s: 191732MB/sec 14s: 190701MB/sec 15s: 191789MB/sec 16s: 191259MB/sec 17s: 190613MB/sec 18s: 191951MB/sec and the behavior is fully predictable, performing the same throughout even after the page cache would otherwise have fully filled with dirty data. It's also about 65% faster, and using half the CPU of the system compared to the normal buffered write. Signed-off-by: Jens Axboe Link: https://lore.kernel.org/r/20250204184047.356762-3-axboe@kernel.dk Reviewed-by: Christoph Hellwig Reviewed-by: "Darrick J. Wong" Signed-off-by: Christian Brauner Signed-off-by: Weilin Tong --- fs/xfs/xfs_file.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index ba9e2fa4f1c0..e2449295baaa 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -1473,6 +1473,7 @@ const struct file_operations xfs_file_operations = { .fallocate = xfs_file_fallocate, .fadvise = xfs_file_fadvise, .remap_file_range = xfs_file_remap_range, + .fop_flags = FOP_DONTCACHE, }; const struct file_operations xfs_dir_file_operations = { -- Gitee From 1d514bb5a6a70b708a66813e16ffdde90d791690 Mon Sep 17 00:00:00 2001 From: Weilin Tong Date: Thu, 4 Dec 2025 16:04:50 +0800 Subject: [PATCH 10/11] anolis: mm: support uncached buffered read in ext4 ANBZ: #27274 Set FOP_DONTCACHE in ext4_file_operations to declare support for uncached buffered I/O. Ref: ae21c0c0ac56aa734327e9c8b7dfef4270ab54d4 upstream. 
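[Illustration] As the xfs hunk above and the ext4 hunk below show, opting a file system into RWF_DONTCACHE on this branch is a single fop_flags assignment. A hedged sketch for a hypothetical file system follows; "examplefs" and its operations table are placeholders rather than real symbols, and the generic_file_* helpers are only appropriate where the read path actually goes through generic_file_buffered_read(), as it does for the generic code patched in this series.

#include <linux/fs.h>
#include <linux/module.h>

/*
 * "examplefs" is a placeholder, not a real file system. Opting in only
 * requires setting FOP_DONTCACHE, provided ->read_iter() ends up in
 * generic_file_buffered_read() so the dropbehind handling above applies.
 */
static const struct file_operations examplefs_file_operations = {
	.owner		= THIS_MODULE,
	.llseek		= generic_file_llseek,
	.read_iter	= generic_file_read_iter,
	.write_iter	= generic_file_write_iter,
	.mmap		= generic_file_mmap,
	.splice_read	= generic_file_splice_read,
	.fop_flags	= FOP_DONTCACHE,	/* advertise RWF_DONTCACHE support */
};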
Signed-off-by: Weilin Tong --- fs/ext4/file.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 56be85edd62f..41dac77e91e7 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -910,6 +910,7 @@ const struct file_operations ext4_file_operations = { .splice_read = generic_file_splice_read, .splice_write = iter_file_splice_write, .fallocate = ext4_fallocate, + .fop_flags = FOP_DONTCACHE, }; const struct inode_operations ext4_file_inode_operations = { -- Gitee From 0d55b79c089b4513ce2b2af8b56f69bd48de3c8f Mon Sep 17 00:00:00 2001 From: Weilin Tong Date: Thu, 4 Dec 2025 16:20:35 +0800 Subject: [PATCH 11/11] anolis: mm: gate dropbehind invalidate on page !dirty && !writeback ANBZ: #27274 It's possible for the page to either get marked for writeback or redirtied. Add a helper, filemap_end_dropbehind(), which guards the page_unmap_invalidate() call behind check for the page being both non-dirty and not under writeback AFTER the page lock has been acquired. Use this helper for read invalidation. Ref: commit 095f627add86a6ddda2c2cfd563b0ee05d0172b2 upstream. commit 25b065a744ff0c1099bb357be1c40030b5a14c07 upstream. Signed-off-by: Weilin Tong --- mm/filemap.c | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/mm/filemap.c b/mm/filemap.c index 4c316e11db8a..0916eb2f0999 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1589,6 +1589,16 @@ void page_endio(struct page *page, bool is_write, int err) } EXPORT_SYMBOL_GPL(page_endio); +static void filemap_end_dropbehind(struct page *page) +{ + struct address_space *mapping = page->mapping; + + VM_BUG_ON_PAGE(!PageLocked(page), page); + + if (mapping && !PageWriteback(page) && !PageDirty(page)) + page_unmap_invalidate(mapping, page, 0); +} + /** * __lock_page - get a lock on the page, assuming we need to sleep to get it * @__page: the page to lock @@ -2496,8 +2506,7 @@ static inline bool pos_same_page(loff_t pos1, loff_t pos2, struct page *page) } -static void filemap_end_dropbehind_read(struct address_space *mapping, - struct page *page) +static void filemap_end_dropbehind_read(struct page *page) { if (!PageDropbehind(page)) return; @@ -2505,7 +2514,7 @@ static void filemap_end_dropbehind_read(struct address_space *mapping, return; if (trylock_page(page)) { if (TestClearPageDropbehind(page)) - page_unmap_invalidate(mapping, page, 0); + filemap_end_dropbehind(page); unlock_page(page); } } @@ -2636,7 +2645,7 @@ ssize_t generic_file_buffered_read(struct kiocb *iocb, for (i = 0; i < pg_nr; i++) { struct page *page = pages[i]; - filemap_end_dropbehind_read(mapping, page); + filemap_end_dropbehind_read(page); put_page(page); } } while (iov_iter_count(iter) && iocb->ki_pos < isize && !error); -- Gitee
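[Verification Sketch] Not part of the series: a small userspace check of the end-to-end behaviour, reading a file with RWF_DONTCACHE and then asking mincore(2) how many of its pages remain resident. With the series applied and the file system advertising FOP_DONTCACHE, the resident count should stay low, whereas a plain buffered read of a cold file leaves most pages cached; the exact number is timing-dependent and can be inflated by other users of the file, so treat it as indicative only. The fallback RWF_DONTCACHE define again mirrors the uapi value added in PATCH 06.

#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <sys/uio.h>

#ifndef RWF_DONTCACHE
#define RWF_DONTCACHE 0x00000080
#endif

/* Count file pages currently resident in the page cache via mincore(). */
static long resident_pages(int fd, size_t len)
{
	long page_size = sysconf(_SC_PAGESIZE);
	size_t nr_pages = (len + page_size - 1) / page_size;
	unsigned char *vec;
	long resident = 0;
	void *map;

	map = mmap(NULL, len, PROT_READ, MAP_SHARED, fd, 0);
	if (map == MAP_FAILED)
		return -1;
	vec = calloc(nr_pages, 1);
	if (vec && mincore(map, len, vec) == 0) {
		for (size_t i = 0; i < nr_pages; i++)
			resident += vec[i] & 1;
	}
	free(vec);
	munmap(map, len);
	return resident;
}

int main(int argc, char **argv)
{
	static char buf[1 << 16];
	struct iovec iov = { .iov_base = buf, .iov_len = sizeof(buf) };
	struct stat st;
	off_t off = 0;
	int fd;

	if (argc < 2 || (fd = open(argv[1], O_RDONLY)) < 0) {
		fprintf(stderr, "usage: %s <file>\n", argv[0]);
		return 1;
	}
	if (fstat(fd, &st) < 0 || st.st_size == 0) {
		close(fd);
		return 1;
	}
	while (off < st.st_size) {
		ssize_t ret = preadv2(fd, &iov, 1, off, RWF_DONTCACHE);

		if (ret <= 0)
			break;
		off += ret;
	}
	printf("resident pages after uncached read: %ld\n",
	       resident_pages(fd, st.st_size));
	close(fd);
	return 0;
}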