From c9ed1c54d009e5ec8c250beee95ad0bbb74dff3d Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 20 Dec 2024 08:47:40 -0700 Subject: [PATCH 01/11] mm/filemap: use page_cache_sync_ra() to kick off read-ahead ANBZ: #27274 commit f598cdaafc370a797ae883d370a7c18c1ffc43ef upstream. Rather than use the page_cache_sync_readahead() helper, define our own ractl and use page_cache_sync_ra() directly. In preparation for needing to modify ractl inside filemap_get_pages(). No functional changes in this patch. [Backport Note] In generic_file_buffered_read_get_pages(), we explicitly define our own ractl structure in both synchronous and asynchronous readahead, which ensures that subsequent logic can flexibly adjust or modify ractl as needed. Link: https://lkml.kernel.org/r/20241220154831.1086649-3-axboe@kernel.dk Signed-off-by: Jens Axboe Reviewed-by: Kirill A. Shutemov Reviewed-by: Christoph Hellwig Reviewed-by: Matthew Wilcox (Oracle) Cc: Brian Foster Cc: Chris Mason Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Weilin Tong --- mm/filemap.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/mm/filemap.c b/mm/filemap.c index fb6f25462df6..d4783139937b 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2401,7 +2401,6 @@ static int generic_file_buffered_read_get_pages(struct kiocb *iocb, { struct file *filp = iocb->ki_filp; struct address_space *mapping = filp->f_mapping; - struct file_ra_state *ra = &filp->f_ra; pgoff_t index = iocb->ki_pos >> PAGE_SHIFT; pgoff_t last_index = (iocb->ki_pos + iter->count + PAGE_SIZE-1) >> PAGE_SHIFT; int i, j, nr_got, err = 0; @@ -2418,7 +2417,8 @@ static int generic_file_buffered_read_get_pages(struct kiocb *iocb, if (iocb->ki_flags & IOCB_NOIO) return -EAGAIN; - page_cache_sync_readahead(mapping, ra, filp, index, last_index - index); + DEFINE_READAHEAD(ractl, filp, &filp->f_ra, mapping, index); + page_cache_sync_ra(&ractl, last_index - index); nr_got = find_get_pages_contig(mapping, index, nr, pages); if (nr_got) @@ -2444,8 +2444,8 @@ static int generic_file_buffered_read_get_pages(struct kiocb *iocb, err = -EAGAIN; break; } - page_cache_async_readahead(mapping, ra, filp, page, - pg_index, last_index - pg_index); + DEFINE_READAHEAD(ractl_async, filp, &filp->f_ra, mapping, pg_index); + page_cache_async_ra(&ractl_async, page, last_index - pg_index); } if (!PageUptodate(page)) { -- Gitee From fb347a213e3d2d4a94f305b4f0c6ffd8a29ca742 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 20 Dec 2024 08:47:41 -0700 Subject: [PATCH 02/11] mm/readahead: add folio allocation helper ANBZ: #27274 commit 1963de79d3a3bc12b7a17a922d508b733ca8fa9e upstream. Just a wrapper around filemap_alloc_folio() for now, but add it in preparation for modifying the folio based on the 'ractl' being passed in. No functional changes in this patch. [Backport Note] Use ractl_alloc_page to wrap __page_cache_alloc in preparation for modifying the folio based on the 'ractl' being passed in. Link: https://lkml.kernel.org/r/20241220154831.1086649-4-axboe@kernel.dk Signed-off-by: Jens Axboe Reviewed-by: Kirill A. 
Shutemov Reviewed-by: Matthew Wilcox (Oracle) Cc: Brian Foster Cc: Chris Mason Cc: Christoph Hellwig Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Weilin Tong --- mm/readahead.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/mm/readahead.c b/mm/readahead.c index f5fa99d71a12..240c45fbf8fe 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -161,6 +161,12 @@ static void read_pages(struct readahead_control *rac, struct list_head *pages, rac->_index++; } +static struct page *ractl_alloc_page(struct readahead_control *ractl, + gfp_t gfp_mask) +{ + return __page_cache_alloc(gfp_mask); +} + /** * page_cache_ra_unbounded - Start unchecked readahead. * @ractl: Readahead control. @@ -217,7 +223,7 @@ void page_cache_ra_unbounded(struct readahead_control *ractl, continue; } - page = __page_cache_alloc(gfp_mask); + page = ractl_alloc_page(ractl, gfp_mask); if (!page) break; if (mapping->a_ops->readpages) { @@ -694,7 +700,7 @@ void readahead_expand(struct readahead_control *ractl, if (page && !xa_is_value(page)) return; /* Page apparently present */ - page = __page_cache_alloc(gfp_mask); + page = ractl_alloc_page(ractl, gfp_mask); if (!page) return; if (add_to_page_cache_lru(page, mapping, index, gfp_mask) < 0) { @@ -717,7 +723,7 @@ void readahead_expand(struct readahead_control *ractl, if (page && !xa_is_value(page)) return; /* Page apparently present */ - page = __page_cache_alloc(gfp_mask); + page = ractl_alloc_page(ractl, gfp_mask); if (!page) return; if (add_to_page_cache_lru(page, mapping, index, gfp_mask) < 0) { -- Gitee From 22b8366de0a257d471c9311abc7d81f481a3382a Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 20 Dec 2024 08:47:42 -0700 Subject: [PATCH 03/11] mm: add PG_dropbehind folio flag ANBZ: #27274 commit cceba6f7e46c48deca433030d80fc34599fb9fd8 upstream. Add a folio flag that file IO can use to indicate that the cached IO being done should be dropped from the page cache upon completion. Link: https://lkml.kernel.org/r/20241220154831.1086649-5-axboe@kernel.dk Signed-off-by: Jens Axboe Reviewed-by: Kirill A. Shutemov Cc: Brian Foster Cc: Chris Mason Cc: Christoph Hellwig Cc: Johannes Weiner Cc: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton Signed-off-by: Weilin Tong --- include/linux/page-flags.h | 5 +++++ include/trace/events/mmflags.h | 3 ++- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index d2022ec30f99..7e9a5c5be7f2 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -124,6 +124,7 @@ enum pageflags { PG_reclaim, /* To be reclaimed asap */ PG_swapbacked, /* Page is backed by RAM/swap */ PG_unevictable, /* Page is "unevictable" */ + PG_dropbehind, /* drop pages on IO completion */ #ifdef CONFIG_MMU PG_mlocked, /* Page is vma mlocked */ #endif @@ -483,6 +484,10 @@ PAGEFLAG(Reclaim, reclaim, PF_NO_TAIL) PAGEFLAG(Readahead, reclaim, PF_NO_COMPOUND) TESTCLEARFLAG(Readahead, reclaim, PF_NO_COMPOUND) +PAGEFLAG(Dropbehind, dropbehind, PF_NO_COMPOUND) + TESTCLEARFLAG(Dropbehind, dropbehind, PF_HEAD) + __SETPAGEFLAG(Dropbehind, dropbehind, PF_HEAD) + #ifdef CONFIG_HIGHMEM /* * Must use a macro here due to header dependency issues. 
page_zone() is not diff --git a/include/trace/events/mmflags.h b/include/trace/events/mmflags.h index 8b93b136e101..b6c0043bcee9 100644 --- a/include/trace/events/mmflags.h +++ b/include/trace/events/mmflags.h @@ -120,7 +120,8 @@ {1UL << PG_mappedtodisk, "mappedtodisk" }, \ {1UL << PG_reclaim, "reclaim" }, \ {1UL << PG_swapbacked, "swapbacked" }, \ - {1UL << PG_unevictable, "unevictable" } \ + {1UL << PG_unevictable, "unevictable" }, \ + {1UL << PG_dropbehind, "dropbehind" } \ IF_HAVE_PG_MLOCK(PG_mlocked, "mlocked" ) \ IF_HAVE_PG_UNCACHED(PG_uncached, "uncached" ) \ IF_HAVE_PG_HWPOISON(PG_hwpoison, "hwpoison" ) \ -- Gitee From cdb4b763273e9a5de9ca408dfa68104889d9c6fb Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 20 Dec 2024 08:47:43 -0700 Subject: [PATCH 04/11] mm/readahead: add readahead_control->dropbehind member ANBZ: #27274 commit 77d075221ae777296e2b18a0a4f5fea6f75daf2c upstream. If ractl->dropbehind is set to true, then folios created are marked as dropbehind as well. Link: https://lkml.kernel.org/r/20241220154831.1086649-6-axboe@kernel.dk Signed-off-by: Jens Axboe Reviewed-by: Kirill A. Shutemov Cc: Brian Foster Cc: Chris Mason Cc: Christoph Hellwig Cc: Johannes Weiner Cc: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton Signed-off-by: Weilin Tong --- include/linux/pagemap.h | 1 + mm/readahead.c | 8 +++++++- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 46ea8cb4110b..d9fa8811e9ec 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -841,6 +841,7 @@ struct readahead_control { pgoff_t _index; unsigned int _nr_pages; unsigned int _batch_count; + bool dropbehind; CK_KABI_RESERVE(1) CK_KABI_RESERVE(2) diff --git a/mm/readahead.c b/mm/readahead.c index 240c45fbf8fe..ebf2f4fe0e22 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -164,7 +164,13 @@ static void read_pages(struct readahead_control *rac, struct list_head *pages, static struct page *ractl_alloc_page(struct readahead_control *ractl, gfp_t gfp_mask) { - return __page_cache_alloc(gfp_mask); + struct page *page; + + page = __page_cache_alloc(gfp_mask); + if (page && ractl->dropbehind) + __SetPageDropbehind(page); + + return page; } /** -- Gitee From 59dcb5c767af5713a94332dcd61c5d2c5856af35 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 20 Dec 2024 08:47:44 -0700 Subject: [PATCH 05/11] mm/truncate: add folio_unmap_invalidate() helper ANBZ: #27274 commit 4a9e23159fd37677efc0c2c53e3b45a5d260a90a upstream. Add a folio_unmap_invalidate() helper, which unmaps and invalidates a given folio. The caller must already have locked the folio. Embed the old invalidate_complete_folio2() helper in there as well, as nobody else calls it. Use this new helper in invalidate_inode_pages2_range(), rather than duplicate the code there. In preparation for using this elsewhere as well, have it take a gfp_t mask rather than assume GFP_KERNEL is the right choice. This bubbles back to invalidate_complete_folio2() as well. [Backport Note] In 5.10, we add page_unmap_invalidate() helper. Link: https://lkml.kernel.org/r/20241220154831.1086649-7-axboe@kernel.dk Signed-off-by: Jens Axboe Cc: Brian Foster Cc: Chris Mason Cc: Christoph Hellwig Cc: Johannes Weiner Cc: Kirill A. 
Shutemov Cc: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton Signed-off-by: Weilin Tong --- mm/internal.h | 2 ++ mm/truncate.c | 50 +++++++++++++++++++++++++++----------------------- 2 files changed, 29 insertions(+), 23 deletions(-) diff --git a/mm/internal.h b/mm/internal.h index 85a392d6b1c3..2acb3e1d742b 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -50,6 +50,8 @@ void unmap_page_range(struct mmu_gather *tlb, struct vm_area_struct *vma, unsigned long addr, unsigned long end, struct zap_details *details); +int page_unmap_invalidate(struct address_space *mapping, struct page *page, + gfp_t gfp); void do_page_cache_ra(struct readahead_control *, unsigned long nr_to_read, unsigned long lookahead_size); diff --git a/mm/truncate.c b/mm/truncate.c index 4b85a4d5d060..c501acde923b 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -668,6 +668,15 @@ void invalidate_mapping_pagevec(struct address_space *mapping, __invalidate_mapping_pages(mapping, start, end, nr_pagevec); } +static int do_launder_page(struct address_space *mapping, struct page *page) +{ + if (!PageDirty(page)) + return 0; + if (page->mapping != mapping || mapping->a_ops->launder_page == NULL) + return 0; + return mapping->a_ops->launder_page(page); +} + /* * This is like invalidate_complete_page(), except it ignores the page's * refcount. We do this because invalidate_inode_pages2() needs stronger @@ -675,16 +684,28 @@ void invalidate_mapping_pagevec(struct address_space *mapping, * shrink_page_list() has a temp ref on them, or because they're transiently * sitting in the lru_cache_add() pagevecs. */ -static int -invalidate_complete_page2(struct address_space *mapping, struct page *page) +int page_unmap_invalidate(struct address_space *mapping, struct page *page, + gfp_t gfp) { unsigned long flags; + int ret; - if (page->mapping != mapping) - return 0; + VM_BUG_ON_PAGE(!PageLocked(page), page); - if (page_has_private(page) && !try_to_release_page(page, GFP_KERNEL)) + if (PageDirty(page)) return 0; + if (page_mapped(page)) + unmap_mapping_page(page); + BUG_ON(page_mapped(page)); + + ret = do_launder_page(mapping, page); + if (ret) + return ret; + if (page->mapping != mapping) + return -EBUSY; + + if (page_has_private(page) && !try_to_release_page(page, gfp)) + return -EBUSY; xa_lock_irqsave(&mapping->i_pages, flags); if (PageDirty(page)) @@ -712,15 +733,6 @@ invalidate_complete_page2(struct address_space *mapping, struct page *page) return 0; } -static int do_launder_page(struct address_space *mapping, struct page *page) -{ - if (!PageDirty(page)) - return 0; - if (page->mapping != mapping || mapping->a_ops->launder_page == NULL) - return 0; - return mapping->a_ops->launder_page(page); -} - /** * invalidate_inode_pages2_range - remove range of pages from an address_space * @mapping: the address_space @@ -784,15 +796,7 @@ int invalidate_inode_pages2_range(struct address_space *mapping, } wait_on_page_writeback(page); - if (page_mapped(page)) - unmap_mapping_page(page); - BUG_ON(page_mapped(page)); - - ret2 = do_launder_page(mapping, page); - if (ret2 == 0) { - if (!invalidate_complete_page2(mapping, page)) - ret2 = -EBUSY; - } + ret2 = page_unmap_invalidate(mapping, page, GFP_KERNEL); if (ret2 < 0) ret = ret2; unlock_page(page); -- Gitee From b4e6982aa64563e86948fb4cec08a11aba7ed3c1 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 20 Dec 2024 08:47:45 -0700 Subject: [PATCH 06/11] fs: add RWF_DONTCACHE iocb and FOP_DONTCACHE file_operations flag ANBZ: #27274 commit b9f958d4f146bd11be33a5f2bc3ced50f86d6b23 upstream. 
If a file system supports uncached buffered IO, it may set FOP_DONTCACHE and enable support for RWF_DONTCACHE. If RWF_DONTCACHE is attempted without the file system supporting it, it'll get errored with -EOPNOTSUPP. [Backport Note] The 5.10 kernel does not have fop_flags in struct file_operations, so we add it and use only FOP_DONTCACHE for now. mmap_supported_flags is kept as-is for its existing users. Link: https://lkml.kernel.org/r/20241220154831.1086649-8-axboe@kernel.dk Signed-off-by: Jens Axboe Cc: Brian Foster Cc: Chris Mason Cc: Christoph Hellwig Cc: Johannes Weiner Cc: Kirill A. Shutemov Cc: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton Signed-off-by: Weilin Tong --- include/linux/fs.h | 14 ++++++++++++++ include/uapi/linux/fs.h | 5 ++++- 2 files changed, 18 insertions(+), 1 deletion(-) diff --git a/include/linux/fs.h b/include/linux/fs.h index 8e9d1c945dc2..bfaaa8465216 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -312,6 +312,7 @@ enum rw_hint { #define IOCB_SYNC (__force int) RWF_SYNC #define IOCB_NOWAIT (__force int) RWF_NOWAIT #define IOCB_APPEND (__force int) RWF_APPEND +#define IOCB_DONTCACHE (__force int) RWF_DONTCACHE /* non-RWF related bits - start at 16 */ #define IOCB_EVENTFD (1 << 16) @@ -1964,8 +1965,11 @@ struct dir_context { struct iov_iter; struct io_uring_cmd; +typedef unsigned int __bitwise fop_flags_t; + struct file_operations { struct module *owner; + fop_flags_t fop_flags; loff_t (*llseek) (struct file *, loff_t, int); ssize_t (*read) (struct file *, char __user *, size_t, loff_t *); ssize_t (*write) (struct file *, const char __user *, size_t, loff_t *); @@ -2015,6 +2019,8 @@ struct file_operations { CK_KABI_RESERVE(4) } __randomize_layout; +#define FOP_DONTCACHE ((__force fop_flags_t)(1 << 7)) + struct inode_operations { struct dentry * (*lookup) (struct inode *,struct dentry *, unsigned int); const char * (*get_link) (struct dentry *, struct inode *, struct delayed_call *); @@ -3462,6 +3468,14 @@ static inline int kiocb_set_rw_flags(struct kiocb *ki, rwf_t flags) return -EOPNOTSUPP; kiocb_flags |= IOCB_NOIO; } + if (flags & RWF_DONTCACHE) { + /* file system must support it */ + if (!(ki->ki_filp->f_op->fop_flags & FOP_DONTCACHE)) + return -EOPNOTSUPP; + /* DAX mappings not supported */ + if (IS_DAX(ki->ki_filp->f_mapping->host)) + return -EOPNOTSUPP; + } kiocb_flags |= (__force int) (flags & RWF_SUPPORTED); if (flags & RWF_SYNC) kiocb_flags |= IOCB_DSYNC; diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h index f44eb0a04afd..8a505280508c 100644 --- a/include/uapi/linux/fs.h +++ b/include/uapi/linux/fs.h @@ -300,8 +300,11 @@ typedef int __bitwise __kernel_rwf_t; /* per-IO O_APPEND */ #define RWF_APPEND ((__force __kernel_rwf_t)0x00000010) +/* buffered IO that drops the cache after reading or writing data */ +#define RWF_DONTCACHE ((__force __kernel_rwf_t)0x00000080) + /* mask of flags supported by the kernel */ #define RWF_SUPPORTED (RWF_HIPRI | RWF_DSYNC | RWF_SYNC | RWF_NOWAIT |\ - RWF_APPEND) + RWF_APPEND | RWF_DONTCACHE) #endif /* _UAPI_LINUX_FS_H */ -- Gitee From 0598f74a16f25173df141842e44d1026417808c4 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 20 Dec 2024 08:47:46 -0700 Subject: [PATCH 07/11] mm/filemap: add read support for RWF_DONTCACHE ANBZ: #27274 commit 8026e49bff9b151609da4cae20e9da7f1833dde6 upstream. Add RWF_DONTCACHE as a read operation flag, which means that any data read will be removed from the page cache upon completion. 
Uses the page cache to synchronize, and simply prunes folios that were instantiated when the operation completes. While it would be possible to use private pages for this, using the page cache as synchronization is handy for a variety of reasons: 1) No special truncate magic is needed 2) Async buffered reads need some place to serialize, using the page cache is a lot easier than writing extra code for this 3) The pruning cost is pretty reasonable and the code to support this is much simpler as a result. You can think of uncached buffered IO as being the much more attractive cousin of O_DIRECT - it has none of the restrictions of O_DIRECT. Yes, it will copy the data, but unlike regular buffered IO, it doesn't run into the unpredictability of the page cache in terms of reclaim. As an example, on a test box with 32 drives, reading them with buffered IO looks as follows: Reading bs 65536, uncached 0 1s: 145945MB/sec 2s: 158067MB/sec 3s: 157007MB/sec 4s: 148622MB/sec 5s: 118824MB/sec 6s: 70494MB/sec 7s: 41754MB/sec 8s: 90811MB/sec 9s: 92204MB/sec 10s: 95178MB/sec 11s: 95488MB/sec 12s: 95552MB/sec 13s: 96275MB/sec where it's quite easy to see where the page cache filled up, and performance went from good to erratic, and finally settles at a much lower rate. Looking at top while this is ongoing, we see: PID USER PR NI VIRT RES SHR S %CPU %MEM TIME+ COMMAND 7535 root 20 0 267004 0 0 S 3199 0.0 8:40.65 uncached 3326 root 20 0 0 0 0 R 100.0 0.0 0:16.40 kswapd4 3327 root 20 0 0 0 0 R 100.0 0.0 0:17.22 kswapd5 3328 root 20 0 0 0 0 R 100.0 0.0 0:13.29 kswapd6 3332 root 20 0 0 0 0 R 100.0 0.0 0:11.11 kswapd10 3339 root 20 0 0 0 0 R 100.0 0.0 0:16.25 kswapd17 3348 root 20 0 0 0 0 R 100.0 0.0 0:16.40 kswapd26 3343 root 20 0 0 0 0 R 100.0 0.0 0:16.30 kswapd21 3344 root 20 0 0 0 0 R 100.0 0.0 0:11.92 kswapd22 3349 root 20 0 0 0 0 R 100.0 0.0 0:16.28 kswapd27 3352 root 20 0 0 0 0 R 99.7 0.0 0:11.89 kswapd30 3353 root 20 0 0 0 0 R 96.7 0.0 0:16.04 kswapd31 3329 root 20 0 0 0 0 R 96.4 0.0 0:11.41 kswapd7 3345 root 20 0 0 0 0 R 96.4 0.0 0:13.40 kswapd23 3330 root 20 0 0 0 0 S 91.1 0.0 0:08.28 kswapd8 3350 root 20 0 0 0 0 S 86.8 0.0 0:11.13 kswapd28 3325 root 20 0 0 0 0 S 76.3 0.0 0:07.43 kswapd3 3341 root 20 0 0 0 0 S 74.7 0.0 0:08.85 kswapd19 3334 root 20 0 0 0 0 S 71.7 0.0 0:10.04 kswapd12 3351 root 20 0 0 0 0 R 60.5 0.0 0:09.59 kswapd29 3323 root 20 0 0 0 0 R 57.6 0.0 0:11.50 kswapd1 [...] which is just showing a partial list of the 32 kswapd threads that are running mostly full tilt, burning ~28 full CPU cores. If the same test case is run with RWF_DONTCACHE set for the buffered read, the output looks as follows: Reading bs 65536, uncached 0 1s: 153144MB/sec 2s: 156760MB/sec 3s: 158110MB/sec 4s: 158009MB/sec 5s: 158043MB/sec 6s: 157638MB/sec 7s: 157999MB/sec 8s: 158024MB/sec 9s: 157764MB/sec 10s: 157477MB/sec 11s: 157417MB/sec 12s: 157455MB/sec 13s: 157233MB/sec 14s: 156692MB/sec which is just chugging along at ~155GB/sec of read performance. Looking at top, we see: PID USER PR NI VIRT RES SHR S %CPU %MEM TIME+ COMMAND 7961 root 20 0 267004 0 0 S 3180 0.0 5:37.95 uncached 8024 axboe 20 0 14292 4096 0 R 1.0 0.0 0:00.13 top where just the test app is using CPU, no reclaim is taking place outside of the main thread. Not only is performance 65% better, it's also using half the CPU to do it. Link: https://lkml.kernel.org/r/20241220154831.1086649-9-axboe@kernel.dk Signed-off-by: Jens Axboe Cc: Brian Foster Cc: Chris Mason Cc: Christoph Hellwig Cc: Johannes Weiner Cc: Kirill A. 
Shutemov Cc: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton Signed-off-by: Weilin Tong --- mm/filemap.c | 31 +++++++++++++++++++++++++++++-- mm/swap.c | 3 +++ 2 files changed, 32 insertions(+), 2 deletions(-) diff --git a/mm/filemap.c b/mm/filemap.c index d4783139937b..4c316e11db8a 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2383,6 +2383,8 @@ generic_file_buffered_read_no_cached_page(struct kiocb *iocb, page = page_cache_alloc(mapping); if (!page) return ERR_PTR(-ENOMEM); + if (iocb->ki_flags & IOCB_DONTCACHE) + __SetPageDropbehind(page); error = add_to_page_cache_lru(page, mapping, index, mapping_gfp_constraint(mapping, GFP_KERNEL)); @@ -2418,6 +2420,9 @@ static int generic_file_buffered_read_get_pages(struct kiocb *iocb, return -EAGAIN; DEFINE_READAHEAD(ractl, filp, &filp->f_ra, mapping, index); + if (iocb->ki_flags & IOCB_DONTCACHE) + ractl.dropbehind = 1; + page_cache_sync_ra(&ractl, last_index - index); nr_got = find_get_pages_contig(mapping, index, nr, pages); @@ -2445,6 +2450,9 @@ static int generic_file_buffered_read_get_pages(struct kiocb *iocb, break; } DEFINE_READAHEAD(ractl_async, filp, &filp->f_ra, mapping, pg_index); + if (iocb->ki_flags & IOCB_DONTCACHE) + ractl_async.dropbehind = 1; + page_cache_async_ra(&ractl_async, page, last_index - pg_index); } @@ -2487,6 +2495,21 @@ static inline bool pos_same_page(loff_t pos1, loff_t pos2, struct page *page) return (pos1 >> shift == pos2 >> shift); } + +static void filemap_end_dropbehind_read(struct address_space *mapping, + struct page *page) +{ + if (!PageDropbehind(page)) + return; + if (PageWriteback(page) || PageDirty(page)) + return; + if (trylock_page(page)) { + if (TestClearPageDropbehind(page)) + page_unmap_invalidate(mapping, page, 0); + unlock_page(page); + } +} + /** * generic_file_buffered_read - generic file read routine * @iocb: the iocb to read @@ -2610,8 +2633,12 @@ ssize_t generic_file_buffered_read(struct kiocb *iocb, } } put_pages: - for (i = 0; i < pg_nr; i++) - put_page(pages[i]); + for (i = 0; i < pg_nr; i++) { + struct page *page = pages[i]; + + filemap_end_dropbehind_read(mapping, page); + put_page(page); + } } while (iov_iter_count(iter) && iocb->ki_pos < isize && !error); file_accessed(filp); diff --git a/mm/swap.c b/mm/swap.c index 299d09b2d285..86506c16d0b8 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -458,6 +458,9 @@ void mark_page_accessed(struct page *page) { page = compound_head(page); + if (PageDropbehind(page)) + return; + if (lru_gen_enabled()) { page_inc_refs(page); return; -- Gitee From 0577bcd0f6969ba02f9e0885778a5cfafff4d512 Mon Sep 17 00:00:00 2001 From: Jingbo Xu Date: Tue, 18 Feb 2025 19:47:37 +0800 Subject: [PATCH 08/11] mm/truncate: don't skip dirty page in folio_unmap_invalidate() ANBZ: #27274 ANBZ: #18912 cherry-picked from https://lore.kernel.org/all/20250218120209.88093-3-jefflexu@linux.alibaba.com/ ... otherwise this is a behavior change for the previous callers of invalidate_complete_folio2(), e.g. the page invalidation routine. 
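[Usage Example] The following is an illustrative userspace sketch, not part of this series, showing how a reader would opt into the uncached read path wired up in the patches above via preadv2(2), falling back to a plain buffered read where the kernel or file system lacks support. The local RWF_DONTCACHE define mirrors the value added to include/uapi/linux/fs.h in PATCH 06 and is only needed while libc headers do not yet carry the flag; preadv2() needs glibc 2.26 or newer.

#define _GNU_SOURCE
#include <stdio.h>
#include <errno.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/uio.h>

#ifndef RWF_DONTCACHE
#define RWF_DONTCACHE 0x00000080	/* mirrors include/uapi/linux/fs.h in PATCH 06 */
#endif

int main(int argc, char **argv)
{
	static char buf[65536];
	struct iovec iov = { .iov_base = buf, .iov_len = sizeof(buf) };
	off_t off = 0;
	int fd;

	if (argc < 2) {
		fprintf(stderr, "usage: %s <file>\n", argv[0]);
		return 1;
	}
	fd = open(argv[1], O_RDONLY);
	if (fd < 0) {
		perror("open");
		return 1;
	}
	for (;;) {
		/* uncached buffered read: pages are pruned once the read completes */
		ssize_t ret = preadv2(fd, &iov, 1, off, RWF_DONTCACHE);

		if (ret < 0 && errno == EOPNOTSUPP) {
			/* kernel or file system lacks support: plain buffered read */
			ret = preadv2(fd, &iov, 1, off, 0);
		}
		if (ret < 0) {
			perror("preadv2");
			close(fd);
			return 1;
		}
		if (ret == 0)
			break;	/* EOF */
		off += ret;
	}
	close(fd);
	return 0;
}

The EOPNOTSUPP fallback also covers kernels without this series, which reject unknown RWF_* bits with the same error.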
Fixes: 4a9e23159fd3 ("mm/truncate: add folio_unmap_invalidate() helper") Signed-off-by: Jingbo Xu Reviewed-by: Joseph Qi Link: https://gitee.com/anolis/cloud-kernel/pulls/4656 Signed-off-by: Weilin Tong --- mm/truncate.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/mm/truncate.c b/mm/truncate.c index c501acde923b..149aa122a04d 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -692,8 +692,6 @@ int page_unmap_invalidate(struct address_space *mapping, struct page *page, VM_BUG_ON_PAGE(!PageLocked(page), page); - if (PageDirty(page)) - return 0; if (page_mapped(page)) unmap_mapping_page(page); BUG_ON(page_mapped(page)); -- Gitee From 28f06f23a9f2c917a559fe95c8f0f72a0681f56e Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 4 Feb 2025 11:40:00 -0700 Subject: [PATCH 09/11] xfs: flag as supporting FOP_DONTCACHE ANBZ: #27274 commit 974c5e6139db30fae668e44c381d13bcc63b65fa upstream. Read side was already fully supported, and with the write side appropriately punted to the worker queue, all that's needed now is setting FOP_DONTCACHE in the file_operations structure to enable full support for read and write uncached IO. This provides similar benefits to using RWF_DONTCACHE with reads. Testing buffered writes on 32 files: writing bs 65536, uncached 0 1s: 196035MB/sec 2s: 132308MB/sec 3s: 132438MB/sec 4s: 116528MB/sec 5s: 103898MB/sec 6s: 108893MB/sec 7s: 99678MB/sec 8s: 106545MB/sec 9s: 106826MB/sec 10s: 101544MB/sec 11s: 111044MB/sec 12s: 124257MB/sec 13s: 116031MB/sec 14s: 114540MB/sec 15s: 115011MB/sec 16s: 115260MB/sec 17s: 116068MB/sec 18s: 116096MB/sec where it's quite obvious where the page cache filled, and performance dropped from to about half of where it started, settling in at around 115GB/sec. Meanwhile, 32 kswapds were running full steam trying to reclaim pages. Running the same test with uncached buffered writes: writing bs 65536, uncached 1 1s: 198974MB/sec 2s: 189618MB/sec 3s: 193601MB/sec 4s: 188582MB/sec 5s: 193487MB/sec 6s: 188341MB/sec 7s: 194325MB/sec 8s: 188114MB/sec 9s: 192740MB/sec 10s: 189206MB/sec 11s: 193442MB/sec 12s: 189659MB/sec 13s: 191732MB/sec 14s: 190701MB/sec 15s: 191789MB/sec 16s: 191259MB/sec 17s: 190613MB/sec 18s: 191951MB/sec and the behavior is fully predictable, performing the same throughout even after the page cache would otherwise have fully filled with dirty data. It's also about 65% faster, and using half the CPU of the system compared to the normal buffered write. Signed-off-by: Jens Axboe Link: https://lore.kernel.org/r/20250204184047.356762-3-axboe@kernel.dk Reviewed-by: Christoph Hellwig Reviewed-by: "Darrick J. Wong" Signed-off-by: Christian Brauner Signed-off-by: Weilin Tong --- fs/xfs/xfs_file.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index ba9e2fa4f1c0..e2449295baaa 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -1473,6 +1473,7 @@ const struct file_operations xfs_file_operations = { .fallocate = xfs_file_fallocate, .fadvise = xfs_file_fadvise, .remap_file_range = xfs_file_remap_range, + .fop_flags = FOP_DONTCACHE, }; const struct file_operations xfs_dir_file_operations = { -- Gitee From 1d514bb5a6a70b708a66813e16ffdde90d791690 Mon Sep 17 00:00:00 2001 From: Weilin Tong Date: Thu, 4 Dec 2025 16:04:50 +0800 Subject: [PATCH 10/11] anolis: mm: support uncached buffered read in ext4 ANBZ: #27274 Set FOP_DONTCACHE in ext4_file_operations to declare support for uncached buffered I/O. Ref: ae21c0c0ac56aa734327e9c8b7dfef4270ab54d4 upstream. 
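[Illustration] As the xfs hunk above and the ext4 hunk below show, opting a file system into RWF_DONTCACHE on this branch is a single fop_flags assignment. A hedged sketch for a hypothetical file system follows; "examplefs" and its operations table are placeholders rather than real symbols, and the generic_file_* helpers are only appropriate where the read path actually goes through generic_file_buffered_read(), as it does for the generic code patched in this series.

#include <linux/fs.h>
#include <linux/module.h>

/*
 * "examplefs" is a placeholder, not a real file system. Opting in only
 * requires setting FOP_DONTCACHE, provided ->read_iter() ends up in
 * generic_file_buffered_read() so the dropbehind handling above applies.
 */
static const struct file_operations examplefs_file_operations = {
	.owner		= THIS_MODULE,
	.llseek		= generic_file_llseek,
	.read_iter	= generic_file_read_iter,
	.write_iter	= generic_file_write_iter,
	.mmap		= generic_file_mmap,
	.splice_read	= generic_file_splice_read,
	.fop_flags	= FOP_DONTCACHE,	/* advertise RWF_DONTCACHE support */
};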
Signed-off-by: Weilin Tong --- fs/ext4/file.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 56be85edd62f..41dac77e91e7 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -910,6 +910,7 @@ const struct file_operations ext4_file_operations = { .splice_read = generic_file_splice_read, .splice_write = iter_file_splice_write, .fallocate = ext4_fallocate, + .fop_flags = FOP_DONTCACHE, }; const struct inode_operations ext4_file_inode_operations = { -- Gitee From 0d55b79c089b4513ce2b2af8b56f69bd48de3c8f Mon Sep 17 00:00:00 2001 From: Weilin Tong Date: Thu, 4 Dec 2025 16:20:35 +0800 Subject: [PATCH 11/11] anolis: mm: gate dropbehind invalidate on page !dirty && !writeback ANBZ: #27274 It's possible for the page to either get marked for writeback or redirtied. Add a helper, filemap_end_dropbehind(), which guards the page_unmap_invalidate() call behind check for the page being both non-dirty and not under writeback AFTER the page lock has been acquired. Use this helper for read invalidation. Ref: commit 095f627add86a6ddda2c2cfd563b0ee05d0172b2 upstream. commit 25b065a744ff0c1099bb357be1c40030b5a14c07 upstream. Signed-off-by: Weilin Tong --- mm/filemap.c | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/mm/filemap.c b/mm/filemap.c index 4c316e11db8a..0916eb2f0999 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1589,6 +1589,16 @@ void page_endio(struct page *page, bool is_write, int err) } EXPORT_SYMBOL_GPL(page_endio); +static void filemap_end_dropbehind(struct page *page) +{ + struct address_space *mapping = page->mapping; + + VM_BUG_ON_PAGE(!PageLocked(page), page); + + if (mapping && !PageWriteback(page) && !PageDirty(page)) + page_unmap_invalidate(mapping, page, 0); +} + /** * __lock_page - get a lock on the page, assuming we need to sleep to get it * @__page: the page to lock @@ -2496,8 +2506,7 @@ static inline bool pos_same_page(loff_t pos1, loff_t pos2, struct page *page) } -static void filemap_end_dropbehind_read(struct address_space *mapping, - struct page *page) +static void filemap_end_dropbehind_read(struct page *page) { if (!PageDropbehind(page)) return; @@ -2505,7 +2514,7 @@ static void filemap_end_dropbehind_read(struct address_space *mapping, return; if (trylock_page(page)) { if (TestClearPageDropbehind(page)) - page_unmap_invalidate(mapping, page, 0); + filemap_end_dropbehind(page); unlock_page(page); } } @@ -2636,7 +2645,7 @@ ssize_t generic_file_buffered_read(struct kiocb *iocb, for (i = 0; i < pg_nr; i++) { struct page *page = pages[i]; - filemap_end_dropbehind_read(mapping, page); + filemap_end_dropbehind_read(page); put_page(page); } } while (iov_iter_count(iter) && iocb->ki_pos < isize && !error); -- Gitee
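[Verification Sketch] Not part of the series: a small userspace check of the end-to-end behaviour, reading a file with RWF_DONTCACHE and then asking mincore(2) how many of its pages remain resident. With the series applied and the file system advertising FOP_DONTCACHE, the resident count should stay low, whereas a plain buffered read of a cold file leaves most pages cached; the exact number is timing-dependent and can be inflated by other users of the file, so treat it as indicative only. The fallback RWF_DONTCACHE define again mirrors the uapi value added in PATCH 06.

#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <sys/uio.h>

#ifndef RWF_DONTCACHE
#define RWF_DONTCACHE 0x00000080
#endif

/* Count file pages currently resident in the page cache via mincore(). */
static long resident_pages(int fd, size_t len)
{
	long page_size = sysconf(_SC_PAGESIZE);
	size_t nr_pages = (len + page_size - 1) / page_size;
	unsigned char *vec;
	long resident = 0;
	void *map;

	map = mmap(NULL, len, PROT_READ, MAP_SHARED, fd, 0);
	if (map == MAP_FAILED)
		return -1;
	vec = calloc(nr_pages, 1);
	if (vec && mincore(map, len, vec) == 0) {
		for (size_t i = 0; i < nr_pages; i++)
			resident += vec[i] & 1;
	}
	free(vec);
	munmap(map, len);
	return resident;
}

int main(int argc, char **argv)
{
	static char buf[1 << 16];
	struct iovec iov = { .iov_base = buf, .iov_len = sizeof(buf) };
	struct stat st;
	off_t off = 0;
	int fd;

	if (argc < 2 || (fd = open(argv[1], O_RDONLY)) < 0) {
		fprintf(stderr, "usage: %s <file>\n", argv[0]);
		return 1;
	}
	if (fstat(fd, &st) < 0 || st.st_size == 0) {
		close(fd);
		return 1;
	}
	while (off < st.st_size) {
		ssize_t ret = preadv2(fd, &iov, 1, off, RWF_DONTCACHE);

		if (ret <= 0)
			break;
		off += ret;
	}
	printf("resident pages after uncached read: %ld\n",
	       resident_pages(fd, st.st_size));
	close(fd);
	return 0;
}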