From 756b72691f67daf2e250406a664de8fde0e1f9a5 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Wed, 21 Aug 2024 20:16:35 +0800 Subject: [PATCH 1/7] xfs: hoist multi-fsb allocation unit detection to a helper mainline inclusion from mainline-v6.10-rc1 commit 6b700a5be9b3b69419474622336c63fdc1cc3ca4 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I9DN5Z CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=6b700a5be9b3b69419474622336c63fdc1cc3ca4 -------------------------------- Replace the open-coded logic to decide if a file has a multi-fsb allocation unit to a helper to make the code easier to read. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig Conflicts: fs/xfs/xfs_bmap_util.c [ 5f57f7309d9ab9("xfs: create rt extent rounding helpers for realtime extent blocks") is not applied. ] Signed-off-by: Zhihao Cheng Signed-off-by: Zhihao Cheng --- fs/xfs/xfs_bmap_util.c | 4 ++-- fs/xfs/xfs_inode.h | 9 +++++++++ 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index ad4aba5002c1..a38f1de3fd66 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -689,7 +689,7 @@ xfs_can_free_eofblocks( * forever. */ end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)XFS_ISIZE(ip)); - if (XFS_IS_REALTIME_INODE(ip) && mp->m_sb.sb_rextsize > 1) + if (xfs_inode_has_bigrtalloc(ip)) end_fsb = roundup_64(end_fsb, mp->m_sb.sb_rextsize); last_fsb = XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes); if (last_fsb <= end_fsb) @@ -990,7 +990,7 @@ xfs_free_file_space( endoffset_fsb = XFS_B_TO_FSBT(mp, offset + len); /* We can only free complete realtime extents. 
*/ - if (XFS_IS_REALTIME_INODE(ip) && mp->m_sb.sb_rextsize > 1) { + if (xfs_inode_has_bigrtalloc(ip)) { startoffset_fsb = roundup_64(startoffset_fsb, mp->m_sb.sb_rextsize); endoffset_fsb = rounddown_64(endoffset_fsb, diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index 3beb470f1892..0d2c1e5046ed 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -305,6 +305,15 @@ static inline bool xfs_inode_has_large_extent_counts(struct xfs_inode *ip) return ip->i_diflags2 & XFS_DIFLAG2_NREXT64; } +/* + * Decide if this file is a realtime file whose data allocation unit is larger + * than a single filesystem block. + */ +static inline bool xfs_inode_has_bigrtalloc(struct xfs_inode *ip) +{ + return XFS_IS_REALTIME_INODE(ip) && ip->i_mount->m_sb.sb_rextsize > 1; +} + /* * Return the buftarg used for data allocations on a given inode. */ -- Gitee From a3a80cd0d0bebe20c11f077c1f4fbf243b273b24 Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Wed, 21 Aug 2024 20:16:36 +0800 Subject: [PATCH 2/7] xfs: reserve blocks for truncating large realtime inode mainline inclusion from mainline-v6.11-rc1 commit d048945150b798147b324f05f7e8c857772b0d3f category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I9DN5Z CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=d048945150b798147b324f05f7e8c857772b0d3f -------------------------------- When unaligned truncate down a big realtime file, xfs_truncate_page() only zeros out the tail EOF block, __xfs_bunmapi() should split the tail written extent and convert the later one that beyond EOF block to unwritten, but it couldn't work as expected now since the reserved block is zero in xfs_setattr_size(), this could expose stale data just after commit '943bc0882ceb ("iomap: don't increase i_size if it's not a write operation")'. 
If we truncate file that contains a large enough written extent: |< rxext >|< rtext >| ...WWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWW ^ (new EOF) ^ old EOF Since we only zero out the tail of the EOF block, and xfs_itruncate_extents()->..->__xfs_bunmapi() unmap the whole aligned extents, it becomes this state: |< rxext >| ...WWWzWWWWWWWWWWWWW ^ new EOF Then if we do an extending write like this, the blocks in the previous tail extent become stale: |< rxext >| ...WWWzSSSSSSSSSSSSS..........WWWWWWWWWWWWWWWWW ^ old EOF ^ append start ^ new EOF Fix this by reserving XFS_DIOSTRAT_SPACE_RES blocks for big realtime inode. Signed-off-by: Zhang Yi Link: https://lore.kernel.org/r/20240618142112.1315279-2-yi.zhang@huaweicloud.com Reviewed-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Christian Brauner Conflicts: fs/xfs/xfs_iops.c [ 3fed24fffc76dd("xfs: Replace xfs_isilocked with xfs_assert_ilocked") is not applied. ] Signed-off-by: Zhihao Cheng Signed-off-by: Zhihao Cheng --- fs/xfs/xfs_iops.c | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index b8ec045708c3..caba648e0ed2 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -17,6 +17,8 @@ #include "xfs_da_btree.h" #include "xfs_attr.h" #include "xfs_trans.h" +#include "xfs_trans_space.h" +#include "xfs_bmap_btree.h" #include "xfs_trace.h" #include "xfs_icache.h" #include "xfs_symlink.h" @@ -794,6 +796,7 @@ xfs_setattr_size( struct xfs_trans *tp; int error; uint lock_flags = 0; + uint resblks = 0; bool did_zeroing = false; ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL)); @@ -901,7 +904,17 @@ xfs_setattr_size( return error; } - error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, 0, 0, 0, &tp); + /* + * For realtime inode with more than one block rtextsize, we need the + * block reservation for bmap btree block allocations/splits that can + * happen since it could split the tail written extent and convert the + * right beyond EOF one to
unwritten. + */ + if (xfs_inode_has_bigrtalloc(ip)) + resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0); + + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, resblks, + 0, 0, &tp); if (error) return error; -- Gitee From 37c3a17b4b4f86c2908791665b6a87b930460b15 Mon Sep 17 00:00:00 2001 From: Zhihao Cheng Date: Wed, 21 Aug 2024 20:16:37 +0800 Subject: [PATCH 3/7] ext4: ext4_iomap_map_blocks: Fix null pointer dereference in nojournal mode hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I9DN5Z CVE: NA -------------------------------- The 'journal' could be NULL in nojournal mode, which causes a null-ptr-deref problem in ext4_iomap_map_blocks(). Fixes: 7f6416dcd4a3 ("ext4: implement writeback iomap path") Signed-off-by: Zhihao Cheng Signed-off-by: Zhihao Cheng --- fs/ext4/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index f9d5291db42b..9b877f4732f6 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -3878,7 +3878,7 @@ static int ext4_iomap_map_blocks(struct iomap_writepage_ctx *wpc, * ext4_count_free_blocks() is non-zero, a commit * should free up blocks. */ - if (ret == -ENOSPC && ext4_count_free_clusters(sb)) { + if (ret == -ENOSPC && journal && ext4_count_free_clusters(sb)) { jbd2_journal_force_commit_nested(journal); goto retry; } -- Gitee From b851791957c5f48f64f3139e2cd0e8de44069d23 Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Wed, 21 Aug 2024 20:16:38 +0800 Subject: [PATCH 4/7] iomap: don't mark blocks uptodate after partial zeroing hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I9DN5Z CVE: NA -------------------------------- In __iomap_write_begin(), if we unaligned buffered write data to a hole of a regular file, we only zero out the place where aligned to block size that we don't want to write, but mark the whole range uptodate if block size < folio size.
This is wrong since the not zeroed part will contain stale data and can be accessed by a concurrent buffered read easily (on the filesystem may not hold inode->i_rwsem) once we mark the range uptodate. At the same time, in the reading data branch, it's also unnecessary to set the just read range uptodate since we are going to set it immediately in __iomap_write_end(). Hence fix this by just dropping iomap_set_range_uptodate() in the zeroing out branch. Fixes: 9dc55f1389f9 ("iomap: add support for sub-pagesize buffered I/O without buffer heads") Reported-by: Matthew Wilcox Closes: https://lore.kernel.org/all/ZqsN5ouQTEc1KAzV@casper.infradead.org/ Signed-off-by: Zhang Yi Signed-off-by: Zhihao Cheng Signed-off-by: Zhihao Cheng --- fs/iomap/buffered-io.c | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index e705db1b53c9..42402a6de4eb 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -761,7 +761,6 @@ int __iomap_write_begin(const struct iomap_iter *iter, loff_t pos, if (status) return status; } - iomap_set_range_uptodate(folio, poff, plen); } while ((block_start += plen) < block_end); return 0; -- Gitee From b4cf4792d57107ec080dc6c1d57f98de6bcfdda9 Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Wed, 21 Aug 2024 20:16:39 +0800 Subject: [PATCH 5/7] iomap: reduce unnecessary state_lock when setting ifs uptodate and dirty bits hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I9DN5Z CVE: NA -------------------------------- When doing buffered write, we set uptodate and dirty bits of the written range separately, it holds the ifs->state_lock twice when blocksize < folio size, which is redundant. After large folio is supported, the spinlock could affect more about the performance, merge them could reduce some unnecessary locking overhead and gets some performance gain. Suggested-by: Dave Chinner Signed-off-by: Zhang Yi Reviewed-by: Darrick J.
Wong Conflicts: fs/iomap/buffered-io.c [ 15d09f865dc4("iomap: export __iomap_write_{begin|end}") is applied. ] Signed-off-by: Zhihao Cheng Signed-off-by: Zhihao Cheng --- fs/iomap/buffered-io.c | 38 +++++++++++++++++++++++++++++++++++--- 1 file changed, 35 insertions(+), 3 deletions(-) diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index 42402a6de4eb..414b7bdde787 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -211,6 +211,37 @@ bool iomap_is_fully_dirty(struct folio *folio, size_t from, size_t count) } EXPORT_SYMBOL_GPL(iomap_is_fully_dirty); +static void ifs_set_range_dirty_uptodate(struct folio *folio, + struct iomap_folio_state *ifs, size_t off, size_t len) +{ + struct inode *inode = folio->mapping->host; + unsigned int blks_per_folio = i_blocks_per_folio(inode, folio); + unsigned int first_blk = (off >> inode->i_blkbits); + unsigned int last_blk = (off + len - 1) >> inode->i_blkbits; + unsigned int nr_blks = last_blk - first_blk + 1; + unsigned long flags; + + spin_lock_irqsave(&ifs->state_lock, flags); + bitmap_set(ifs->state, first_blk, nr_blks); + if (ifs_is_fully_uptodate(folio, ifs)) + folio_mark_uptodate(folio); + bitmap_set(ifs->state, first_blk + blks_per_folio, nr_blks); + spin_unlock_irqrestore(&ifs->state_lock, flags); +} + +static void iomap_set_range_dirty_uptodate(struct folio *folio, + size_t off, size_t len) +{ + struct iomap_folio_state *ifs = folio->private; + + if (ifs) + ifs_set_range_dirty_uptodate(folio, ifs, off, len); + else + folio_mark_uptodate(folio); + + filemap_dirty_folio(folio->mapping, folio); +} + static struct iomap_folio_state *ifs_alloc(struct inode *inode, struct folio *folio, unsigned int flags) { @@ -867,6 +898,8 @@ static int iomap_write_begin(struct iomap_iter *iter, loff_t pos, bool __iomap_write_end(struct inode *inode, loff_t pos, size_t len, size_t copied, struct folio *folio) { + size_t from = offset_in_folio(folio, pos); + flush_dcache_folio(folio); /* @@ -882,9 +915,8 @@ bool 
__iomap_write_end(struct inode *inode, loff_t pos, size_t len, */ if (unlikely(copied < len && !folio_test_uptodate(folio))) return false; - iomap_set_range_uptodate(folio, offset_in_folio(folio, pos), len); - iomap_set_range_dirty(folio, offset_in_folio(folio, pos), copied); - filemap_dirty_folio(inode->i_mapping, folio); + + iomap_set_range_dirty_uptodate(folio, from, copied); return true; } EXPORT_SYMBOL_GPL(__iomap_write_end); -- Gitee From a4f7f98e865fcf4103e6ff54f683c594b08858a1 Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Wed, 21 Aug 2024 20:16:40 +0800 Subject: [PATCH 6/7] iomap: optimize setting uptodate bit hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I9DN5Z CVE: NA -------------------------------- When overwriting an already uptodate folio, we don't need to set the range uptodate again. This could save a barrier and some bit operations if block size < folio size. Suggested-by: Matthew Wilcox Signed-off-by: Zhang Yi Conflicts: fs/iomap/buffered-io.c [ 15d09f865dc4("iomap: export __iomap_write_{begin|end}") is applied. 
] Signed-off-by: Zhihao Cheng Signed-off-by: Zhihao Cheng --- fs/iomap/buffered-io.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index 414b7bdde787..09f410f5d637 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -916,7 +916,12 @@ bool __iomap_write_end(struct inode *inode, loff_t pos, size_t len, if (unlikely(copied < len && !folio_test_uptodate(folio))) return false; - iomap_set_range_dirty_uptodate(folio, from, copied); + if (folio_test_uptodate(folio)) { + iomap_set_range_dirty(folio, from, copied); + filemap_dirty_folio(folio->mapping, folio); + } else { + iomap_set_range_dirty_uptodate(folio, from, copied); + } return true; } EXPORT_SYMBOL_GPL(__iomap_write_end); -- Gitee From 9c47110538c51235159e0a1cbb7f183d9559849a Mon Sep 17 00:00:00 2001 From: Zhihao Cheng Date: Wed, 21 Aug 2024 20:16:41 +0800 Subject: [PATCH 7/7] iomap: improve iomap_folio_mkwrite_iter and ifs_clear_range_dirty hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I9DN5Z CVE: NA -------------------------------- Improve ifs_clear_range_dirty() by using shift operations. Improve iomap_folio_mkwrite_iter() by making folio dirty once.
Signed-off-by: Zhihao Cheng Signed-off-by: Zhihao Cheng --- fs/iomap/buffered-io.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index 09f410f5d637..fe64886e4eae 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -136,7 +136,8 @@ static void ifs_clear_range_dirty(struct folio *folio, { struct inode *inode = folio->mapping->host; unsigned int blks_per_folio = i_blocks_per_folio(inode, folio); - unsigned int first_blk = DIV_ROUND_UP(off, i_blocksize(inode)); + unsigned int first_blk = round_up(off, i_blocksize(inode)) >> + inode->i_blkbits; unsigned int last_blk = (off + len) >> inode->i_blkbits; unsigned int nr_blks = last_blk - first_blk; unsigned long flags; @@ -1559,7 +1560,6 @@ static loff_t iomap_folio_mkwrite_iter(struct iomap_iter *iter, ifs_alloc(iter->inode, folio, 0); iomap_set_range_dirty(folio, 0, length); - filemap_dirty_folio(iter->inode->i_mapping, folio); } return length; @@ -1583,6 +1583,8 @@ vm_fault_t iomap_page_mkwrite(struct vm_fault *vmf, const struct iomap_ops *ops) while ((ret = iomap_iter(&iter, ops)) > 0) iter.processed = iomap_folio_mkwrite_iter(&iter, folio); + if (iter.pos > folio_pos(folio)) + filemap_dirty_folio(folio->mapping, folio); if (ret < 0) goto out_unlock; folio_wait_stable(folio); -- Gitee