From 84d578d49c0df03df0a076ce7edc590f903ecb1d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 27 Oct 2023 17:31:14 +0800 Subject: [PATCH 1/6] mm: add PSI accounting around ->read_folio and ->readahead calls mainline inclusion from mainline-v6.1-rc1 commit 176042404ee6a96ba7e9054e1bda6220360a26ad category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I8BCRJ CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=176042404ee6a96ba7e9054e1bda6220360a26ad -------------------------------- PSI tries to account for the cost of bringing back in pages discarded by the MM LRU management. Currently the prime place for that is hooked into the bio submission path, which is a rather bad place: - it does not actually account I/O for non-block file systems, of which we have many - it adds overhead and a layering violation to the block layer Add the accounting into the two places in the core MM code that read pages into an address space by calling into ->read_folio and ->readahead so that the entire file system operations are covered, to broaden the coverage and allow removing the accounting in the block layer going forward. As psi_memstall_enter can deal with nested calls this will not lead to double accounting even while the bio annotations are still present. Signed-off-by: Christoph Hellwig Acked-by: Johannes Weiner Link: https://lore.kernel.org/r/20220915094200.139713-2-hch@lst.de Signed-off-by: Jens Axboe Conflicts: mm/filemap.c mm/readahead.c [ Replace folio_test_workingset with PageWorkingset, and skip nonexistent function. ] Signed-off-by: Liu Shixin --- include/linux/pagemap.h | 2 ++ mm/filemap.c | 7 +++++++ mm/readahead.c | 7 +++++++ 3 files changed, 16 insertions(+) diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 0bfa9cce6589..728550720e12 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -806,6 +806,8 @@ struct readahead_control { pgoff_t _index; unsigned int _nr_pages; unsigned int _batch_count; + bool _workingset; + unsigned long _pflags; }; #define DEFINE_READAHEAD(rac, f, m, i) \ diff --git a/mm/filemap.c b/mm/filemap.c index fd4aae06ff15..8beb7ccae51f 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2294,6 +2294,8 @@ generic_file_buffered_read_readpage(struct kiocb *iocb, struct page *page) { struct file_ra_state *ra = &filp->f_ra; + bool workingset = PageWorkingset(page); + unsigned long pflags; int error; if (iocb->ki_flags & (IOCB_NOIO | IOCB_NOWAIT)) { @@ -2308,8 +2310,13 @@ generic_file_buffered_read_readpage(struct kiocb *iocb, * PG_error will be set again if readpage fails. */ ClearPageError(page); + /* Start the actual read. The read will unlock the page. */ + if (unlikely(workingset)) + psi_memstall_enter(&pflags); error = mapping->a_ops->readpage(filp, page); + if (unlikely(workingset)) + psi_memstall_leave(&pflags); if (unlikely(error)) { put_page(page); diff --git a/mm/readahead.c b/mm/readahead.c index ed23d5dec123..a9e6169cb371 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include @@ -125,6 +126,8 @@ static void read_pages(struct readahead_control *rac, struct list_head *pages, if (!readahead_count(rac)) goto out; + if (unlikely(rac->_workingset)) + psi_memstall_enter(&rac->_pflags); blk_start_plug(&plug); if (aops->readahead) { @@ -149,6 +152,9 @@ static void read_pages(struct readahead_control *rac, struct list_head *pages, } blk_finish_plug(&plug); + if (unlikely(rac->_workingset)) + psi_memstall_leave(&rac->_pflags); + rac->_workingset = false; BUG_ON(!list_empty(pages)); BUG_ON(readahead_count(rac)); @@ -228,6 +234,7 @@ void page_cache_ra_unbounded(struct readahead_control *ractl, } if (i == nr_to_read - lookahead_size) SetPageReadahead(page); + ractl->_workingset |= PageWorkingset(page); ractl->_nr_pages++; } -- Gitee From e6203142310a2bda5fe5a3e36b44b660809211cc Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 27 Oct 2023 17:31:15 +0800 Subject: [PATCH 2/6] sched/psi: export psi_memstall_{enter,leave} mainline inclusion from mainline-v6.1-rc1 commit 527eb453bbfe65e5a55a90edfb1f30b477e36b8c category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I8BCRJ CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=527eb453bbfe65e5a55a90edfb1f30b477e36b8c -------------------------------- To properly account for all refaults from file system logic, file systems need to call psi_memstall_enter directly, so export it. Signed-off-by: Christoph Hellwig Acked-by: Johannes Weiner Link: https://lore.kernel.org/r/20220915094200.139713-3-hch@lst.de Signed-off-by: Jens Axboe Signed-off-by: Liu Shixin --- kernel/sched/psi.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index 11a43dccb7fc..33bf2afce83d 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -942,6 +942,7 @@ void psi_memstall_enter(unsigned long *flags) rq_unlock_irq(rq, &rf); } +EXPORT_SYMBOL_GPL(psi_memstall_enter); /** * psi_memstall_leave - mark the end of an memory stall section @@ -973,6 +974,7 @@ void psi_memstall_leave(unsigned long *flags) rq_unlock_irq(rq, &rf); } +EXPORT_SYMBOL_GPL(psi_memstall_leave); #ifdef CONFIG_CGROUPS int psi_cgroup_alloc(struct cgroup *cgroup) -- Gitee From e30a207aeea8824cdf4dcf66c19f4ccb98b4eb1a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 27 Oct 2023 17:31:16 +0800 Subject: [PATCH 3/6] btrfs: add manual PSI accounting for compressed reads mainline inclusion from mainline-v6.1-rc1 commit 4088a47e78f95a5fea683cf67e0be006b13831fd category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I8BCRJ CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=4088a47e78f95a5fea683cf67e0be006b13831fd -------------------------------- btrfs compressed reads try to always read the entire compressed chunk, even if only a subset is requested. Currently this is covered by the magic PSI accounting underneath submit_bio, but that is about to go away. Instead add manual psi_memstall_{enter,leave} annotations. Note that for readahead this really should be using readahead_expand, but the additionals reads are also done for plain ->read_folio where readahead_expand can't work, so this overall logic is left as-is for now. Signed-off-by: Christoph Hellwig Acked-by: David Sterba Acked-by: Johannes Weiner Link: https://lore.kernel.org/r/20220915094200.139713-4-hch@lst.de Signed-off-by: Jens Axboe Conflicts: fs/btrfs/compression.c Signed-off-by: Liu Shixin --- fs/btrfs/compression.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 646416b5940e..2c04dd767995 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include @@ -500,7 +501,8 @@ static u64 bio_end_offset(struct bio *bio) static noinline int add_ra_bio_pages(struct inode *inode, u64 compressed_end, - struct compressed_bio *cb) + struct compressed_bio *cb, + unsigned long *pflags) { unsigned long end_index; unsigned long pg_index; @@ -549,6 +551,9 @@ static noinline int add_ra_bio_pages(struct inode *inode, goto next; } + if (PageWorkingset(page)) + psi_memstall_enter(pflags); + end = last_offset + PAGE_SIZE - 1; /* * at this point, we have a locked page in the page cache @@ -630,6 +635,8 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, u64 cur_disk_byte = (u64)bio->bi_iter.bi_sector << 9; u64 em_len; u64 em_start; + /* Initialize to 1 to make skip psi_memstall_leave unless needed */ + unsigned long pflags = 1; struct extent_map *em; blk_status_t ret = BLK_STS_RESOURCE; int faili = 0; @@ -688,7 +695,7 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, faili = nr_pages - 1; cb->nr_pages = nr_pages; - add_ra_bio_pages(inode, em_start + em_len, cb); + add_ra_bio_pages(inode, em_start + em_len, cb, &pflags); /* include any pages we added in add_ra-bio_pages */ cb->len = bio->bi_iter.bi_size; @@ -767,6 +774,9 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, bio_endio(comp_bio); } + if (!pflags) + psi_memstall_leave(&pflags); + return 0; fail2: -- Gitee From 77ca330b816ef02ca178243a99a957c6583fab98 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 27 Oct 2023 17:31:17 +0800 Subject: [PATCH 4/6] erofs: add manual PSI accounting for the compressed address space mainline inclusion from mainline-v6.1-rc1 commit 99486c511f686c799bb4e60b79d79808bb9440f4 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I8BCRJ CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=99486c511f686c799bb4e60b79d79808bb9440f4 -------------------------------- erofs uses an additional address space for compressed data read from disk in addition to the one directly associated with the inode. Reading into the lower address space is open coded using add_to_page_cache_lru instead of using the filemap.c helper for page allocation micro-optimizations, which means it is not covered by the MM PSI annotations for ->read_folio and ->readahead, so add manual ones instead. Signed-off-by: Christoph Hellwig Acked-by: Johannes Weiner Acked-by: Gao Xiang Link: https://lore.kernel.org/r/20220915094200.139713-5-hch@lst.de Signed-off-by: Jens Axboe Signed-off-by: Liu Shixin --- fs/erofs/zdata.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index 8cb2cf612e49..004274bceae6 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -7,6 +7,7 @@ #include "zdata.h" #include "compress.h" #include +#include #include @@ -1165,6 +1166,8 @@ static void z_erofs_submit_queue(struct super_block *sb, pgoff_t last_index; unsigned int nr_bios = 0; struct bio *bio = NULL; + /* initialize to 1 to make skip psi_memstall_leave unless needed */ + unsigned long pflags = 1; bi_private = jobqueueset_init(sb, q, fgq, force_fg); qtail[JQ_BYPASS] = &q[JQ_BYPASS]->head; @@ -1203,10 +1206,15 @@ static void z_erofs_submit_queue(struct super_block *sb, if (bio && cur != last_index + 1) { submit_bio_retry: + if (!pflags) + psi_memstall_leave(&pflags); submit_bio(bio); bio = NULL; } + if (unlikely(PageWorkingset(page))) + psi_memstall_enter(&pflags); + if (!bio) { bio = bio_alloc(GFP_NOIO, BIO_MAX_PAGES); @@ -1234,8 +1242,11 @@ static void z_erofs_submit_queue(struct super_block *sb, move_to_bypass_jobqueue(pcl, qtail, owned_head); } while (owned_head != Z_EROFS_PCLUSTER_TAIL); - if (bio) + if (bio) { + if (!pflags) + psi_memstall_leave(&pflags); submit_bio(bio); + } /* * although background is preferred, no one is pending for submission. -- Gitee From b23e65fd8dfe586d52866083a2d80466340fb907 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 27 Oct 2023 17:31:18 +0800 Subject: [PATCH 5/6] block: remove PSI accounting from the bio layer mainline inclusion from mainline-v6.1-rc1 commit 118f3663fbc658e9ad6165e129076981c7b685c5 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I8BCRJ CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=118f3663fbc658e9ad6165e129076981c7b685c5 -------------------------------- PSI accounting is now done by the VM code, where it should have been since the beginning. Signed-off-by: Christoph Hellwig Acked-by: Johannes Weiner Link: https://lore.kernel.org/r/20220915094200.139713-6-hch@lst.de Signed-off-by: Jens Axboe Conflicts: block/bio.c block/blk-core.c Signed-off-by: Liu Shixin --- block/bio.c | 8 -------- block/blk-core.c | 19 ------------------- fs/direct-io.c | 2 -- include/linux/blk_types.h | 1 - 4 files changed, 30 deletions(-) diff --git a/block/bio.c b/block/bio.c index 8c64c93e96c8..b28b6a51fb2c 100644 --- a/block/bio.c +++ b/block/bio.c @@ -915,9 +915,6 @@ void __bio_add_page(struct bio *bio, struct page *page, bio->bi_iter.bi_size += len; bio->bi_vcnt++; - - if (!bio_flagged(bio, BIO_WORKINGSET) && unlikely(PageWorkingset(page))) - bio_set_flag(bio, BIO_WORKINGSET); } EXPORT_SYMBOL_GPL(__bio_add_page); @@ -1108,9 +1105,6 @@ static int __bio_iov_append_get_pages(struct bio *bio, struct iov_iter *iter) * fit into the bio, or are requested in @iter, whatever is smaller. If * MM encounters an error pinning the requested pages, it stops. Error * is returned only if 0 pages could be pinned. - * - * It's intended for direct IO, so doesn't do PSI tracking, the caller is - * responsible for setting BIO_WORKINGSET if necessary. */ int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) { @@ -1136,8 +1130,6 @@ int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) if (is_bvec) bio_set_flag(bio, BIO_NO_PAGE_REF); - /* don't account direct I/O as memory stall */ - bio_clear_flag(bio, BIO_WORKINGSET); return bio->bi_vcnt ? 0 : ret; } EXPORT_SYMBOL_GPL(bio_iov_iter_get_pages); diff --git a/block/blk-core.c b/block/blk-core.c index 4afdd568225b..502284017da1 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -39,7 +39,6 @@ #include #include #include -#include #include #include @@ -1108,24 +1107,6 @@ blk_qc_t submit_bio(struct bio *bio) } } - /* - * If we're reading data that is part of the userspace workingset, count - * submission time as memory stall. When the device is congested, or - * the submitting cgroup IO-throttled, submission can be a significant - * part of overall IO time. - */ - if (unlikely(bio_op(bio) == REQ_OP_READ && - bio_flagged(bio, BIO_WORKINGSET))) { - unsigned long pflags; - blk_qc_t ret; - - psi_memstall_enter(&pflags); - ret = submit_bio_noacct(bio); - psi_memstall_leave(&pflags); - - return ret; - } - return submit_bio_noacct(bio); } EXPORT_SYMBOL(submit_bio); diff --git a/fs/direct-io.c b/fs/direct-io.c index 9dafbb07dd6a..c64d4eb38995 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -426,8 +426,6 @@ static inline void dio_bio_submit(struct dio *dio, struct dio_submit *sdio) unsigned long flags; bio->bi_private = dio; - /* don't account direct I/O as memory stall */ - bio_clear_flag(bio, BIO_WORKINGSET); spin_lock_irqsave(&dio->bio_lock, flags); dio->refcount++; diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 1853ec569b72..ca4243cb9074 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -291,7 +291,6 @@ enum { BIO_NO_PAGE_REF, /* don't put release vec pages */ BIO_CLONED, /* doesn't own data */ BIO_BOUNCED, /* bio is a bounce bio */ - BIO_WORKINGSET, /* contains userspace workingset pages */ BIO_QUIET, /* Make BIO Quiet */ BIO_CHAIN, /* chained bio, ->bi_remaining in effect */ BIO_REFFED, /* bio has elevated ->bi_cnt */ -- Gitee From c689ddcb5b8b942c1200220d9a4b450809ece797 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Fri, 27 Oct 2023 17:31:19 +0800 Subject: [PATCH 6/6] fs: fix leaked psi pressure state mainline inclusion from mainline-v6.1-rc5 commit 82e60d00b753bb5cfecce22b8e952436b14d02a3 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I8BCRJ CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=82e60d00b753bb5cfecce22b8e952436b14d02a3 -------------------------------- When psi annotations were added to to btrfs compression reads, the psi state tracking over add_ra_bio_pages and btrfs_submit_compressed_read was faulty. A pressure state, once entered, is never left. This results in incorrectly elevated pressure, which triggers OOM kills. pflags record the *previous* memstall state when we enter a new one. The code tried to initialize pflags to 1, and then optimize the leave call when we either didn't enter a memstall, or were already inside a nested stall. However, there can be multiple PageWorkingset pages in the bio, at which point it's that path itself that enters repeatedly and overwrites pflags. This causes us to miss the exit. Enter the stall only once if needed, then unwind correctly. erofs has the same problem, fix that up too. And move the memstall exit past submit_bio() to restore submit accounting originally added by b8e24a9300b0 ("block: annotate refault stalls from IO submission"). Link: https://lkml.kernel.org/r/Y2UHRqthNUwuIQGS@cmpxchg.org Fixes: 4088a47e78f9 ("btrfs: add manual PSI accounting for compressed reads") Fixes: 99486c511f68 ("erofs: add manual PSI accounting for the compressed address space") Fixes: 118f3663fbc6 ("block: remove PSI accounting from the bio layer") Link: https://lore.kernel.org/r/d20a0a85-e415-cf78-27f9-77dd7a94bc8d@leemhuis.info/ Signed-off-by: Johannes Weiner Reported-by: Thorsten Leemhuis Tested-by: Thorsten Leemhuis Cc: Chao Yu Cc: Chris Mason Cc: Christoph Hellwig Cc: David Sterba Cc: Gao Xiang Cc: Jens Axboe Cc: Josef Bacik Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton Conflicts: fs/btrfs/compression.c Signed-off-by: Liu Shixin --- fs/btrfs/compression.c | 14 ++++++++------ fs/erofs/zdata.c | 18 +++++++++++------- 2 files changed, 19 insertions(+), 13 deletions(-) diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 2c04dd767995..a3ecff725688 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -502,7 +502,7 @@ static u64 bio_end_offset(struct bio *bio) static noinline int add_ra_bio_pages(struct inode *inode, u64 compressed_end, struct compressed_bio *cb, - unsigned long *pflags) + int *memstall, unsigned long *pflags) { unsigned long end_index; unsigned long pg_index; @@ -551,8 +551,10 @@ static noinline int add_ra_bio_pages(struct inode *inode, goto next; } - if (PageWorkingset(page)) + if (!*memstall && PageWorkingset(page)) { psi_memstall_enter(pflags); + *memstall = 1; + } end = last_offset + PAGE_SIZE - 1; /* @@ -635,8 +637,8 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, u64 cur_disk_byte = (u64)bio->bi_iter.bi_sector << 9; u64 em_len; u64 em_start; - /* Initialize to 1 to make skip psi_memstall_leave unless needed */ - unsigned long pflags = 1; + unsigned long pflags; + int memstall = 0; struct extent_map *em; blk_status_t ret = BLK_STS_RESOURCE; int faili = 0; @@ -695,7 +697,7 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, faili = nr_pages - 1; cb->nr_pages = nr_pages; - add_ra_bio_pages(inode, em_start + em_len, cb, &pflags); + add_ra_bio_pages(inode, em_start + em_len, cb, &memstall, &pflags); /* include any pages we added in add_ra-bio_pages */ cb->len = bio->bi_iter.bi_size; @@ -774,7 +776,7 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, bio_endio(comp_bio); } - if (!pflags) + if (memstall) psi_memstall_leave(&pflags); return 0; diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index 004274bceae6..92c41cdf256e 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -1166,8 +1166,8 @@ static void z_erofs_submit_queue(struct super_block *sb, pgoff_t last_index; unsigned int nr_bios = 0; struct bio *bio = NULL; - /* initialize to 1 to make skip psi_memstall_leave unless needed */ - unsigned long pflags = 1; + unsigned long pflags; + int memstall = 0; bi_private = jobqueueset_init(sb, q, fgq, force_fg); qtail[JQ_BYPASS] = &q[JQ_BYPASS]->head; @@ -1206,14 +1206,18 @@ static void z_erofs_submit_queue(struct super_block *sb, if (bio && cur != last_index + 1) { submit_bio_retry: - if (!pflags) - psi_memstall_leave(&pflags); submit_bio(bio); + if (memstall) { + psi_memstall_leave(&pflags); + memstall = 0; + } bio = NULL; } - if (unlikely(PageWorkingset(page))) + if (unlikely(PageWorkingset(page)) && !memstall) { psi_memstall_enter(&pflags); + memstall = 1; + } if (!bio) { bio = bio_alloc(GFP_NOIO, BIO_MAX_PAGES); @@ -1243,9 +1247,9 @@ static void z_erofs_submit_queue(struct super_block *sb, } while (owned_head != Z_EROFS_PCLUSTER_TAIL); if (bio) { - if (!pflags) - psi_memstall_leave(&pflags); submit_bio(bio); + if (memstall) + psi_memstall_leave(&pflags); } /* -- Gitee