From 756b72691f67daf2e250406a664de8fde0e1f9a5 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Wed, 21 Aug 2024 20:16:35 +0800 Subject: [PATCH 1/7] xfs: hoist multi-fsb allocation unit detection to a helper mainline inclusion from mainline-v6.10-rc1 commit 6b700a5be9b3b69419474622336c63fdc1cc3ca4 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I9DN5Z CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=6b700a5be9b3b69419474622336c63fdc1cc3ca4 -------------------------------- Replace the open-coded logic to decide if a file has a multi-fsb allocation unit to a helper to make the code easier to read. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig Conflicts: fs/xfs/xfs_bmap_util.c [ 5f57f7309d9ab9("xfs: create rt extent rounding helpers for realtime extent blocks") is not applied. ] Signed-off-by: Zhihao Cheng Signed-off-by: Zhihao Cheng --- fs/xfs/xfs_bmap_util.c | 4 ++-- fs/xfs/xfs_inode.h | 9 +++++++++ 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index ad4aba5002c1..a38f1de3fd66 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -689,7 +689,7 @@ xfs_can_free_eofblocks( * forever. */ end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)XFS_ISIZE(ip)); - if (XFS_IS_REALTIME_INODE(ip) && mp->m_sb.sb_rextsize > 1) + if (xfs_inode_has_bigrtalloc(ip)) end_fsb = roundup_64(end_fsb, mp->m_sb.sb_rextsize); last_fsb = XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes); if (last_fsb <= end_fsb) @@ -990,7 +990,7 @@ xfs_free_file_space( endoffset_fsb = XFS_B_TO_FSBT(mp, offset + len); /* We can only free complete realtime extents. 
*/ - if (XFS_IS_REALTIME_INODE(ip) && mp->m_sb.sb_rextsize > 1) { + if (xfs_inode_has_bigrtalloc(ip)) { startoffset_fsb = roundup_64(startoffset_fsb, mp->m_sb.sb_rextsize); endoffset_fsb = rounddown_64(endoffset_fsb, diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index 3beb470f1892..0d2c1e5046ed 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -305,6 +305,15 @@ static inline bool xfs_inode_has_large_extent_counts(struct xfs_inode *ip) return ip->i_diflags2 & XFS_DIFLAG2_NREXT64; } +/* + * Decide if this file is a realtime file whose data allocation unit is larger + * than a single filesystem block. + */ +static inline bool xfs_inode_has_bigrtalloc(struct xfs_inode *ip) +{ + return XFS_IS_REALTIME_INODE(ip) && ip->i_mount->m_sb.sb_rextsize > 1; +} + /* * Return the buftarg used for data allocations on a given inode. */ -- Gitee From a3a80cd0d0bebe20c11f077c1f4fbf243b273b24 Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Wed, 21 Aug 2024 20:16:36 +0800 Subject: [PATCH 2/7] xfs: reserve blocks for truncating large realtime inode mainline inclusion from mainline-v6.11-rc1 commit d048945150b798147b324f05f7e8c857772b0d3f category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I9DN5Z CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=d048945150b798147b324f05f7e8c857772b0d3f -------------------------------- When unaligned truncate down a big realtime file, xfs_truncate_page() only zeros out the tail EOF block, __xfs_bunmapi() should split the tail written extent and convert the later one that beyond EOF block to unwritten, but it couldn't work as expected now since the reserved block is zero in xfs_setattr_size(), this could expose stale data just after commit '943bc0882ceb ("iomap: don't increase i_size if it's not a write operation")'. 
If we truncate file that contains a large enough written extent: |< rxext >|< rtext >| ...WWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWW ^ (new EOF) ^ old EOF Since we only zero out the tail of the EOF block, and xfs_itruncate_extents()->..->__xfs_bunmapi() unmap the whole aligned extents, it becomes this state: |< rxext >| ...WWWzWWWWWWWWWWWWW ^ new EOF Then if we do an extending write like this, the blocks in the previous tail extent become stale: |< rxext >| ...WWWzSSSSSSSSSSSSS..........WWWWWWWWWWWWWWWWW ^ old EOF ^ append start ^ new EOF Fix this by reserving XFS_DIOSTRAT_SPACE_RES blocks for big realtime inode. Signed-off-by: Zhang Yi Link: https://lore.kernel.org/r/20240618142112.1315279-2-yi.zhang@huaweicloud.com Reviewed-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Christian Brauner Conflicts: fs/xfs/xfs_iops.c [ 3fed24fffc76dd("xfs: Replace xfs_isilocked with xfs_assert_ilocked") is not applied. ] Signed-off-by: Zhihao Cheng Signed-off-by: Zhihao Cheng --- fs/xfs/xfs_iops.c | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index b8ec045708c3..caba648e0ed2 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -17,6 +17,8 @@ #include "xfs_da_btree.h" #include "xfs_attr.h" #include "xfs_trans.h" +#include "xfs_trans_space.h" +#include "xfs_bmap_btree.h" #include "xfs_trace.h" #include "xfs_icache.h" #include "xfs_symlink.h" @@ -794,6 +796,7 @@ xfs_setattr_size( struct xfs_trans *tp; int error; uint lock_flags = 0; + uint resblks = 0; bool did_zeroing = false; ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL)); @@ -901,7 +904,17 @@ xfs_setattr_size( return error; } - error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, 0, 0, 0, &tp); + /* + * For realtime inode with more than one block rtextsize, we need the + * block reservation for bmap btree block allocations/splits that can + * happen since it could split the tail written extent and convert the + * right beyond EOF one to
unwritten. + */ + if (xfs_inode_has_bigrtalloc(ip)) + resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0); + + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, resblks, + 0, 0, &tp); if (error) return error; -- Gitee From 37c3a17b4b4f86c2908791665b6a87b930460b15 Mon Sep 17 00:00:00 2001 From: Zhihao Cheng Date: Wed, 21 Aug 2024 20:16:37 +0800 Subject: [PATCH 3/7] ext4: ext4_iomap_map_blocks: Fix null pointer dereference in nojournal mode hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I9DN5Z CVE: NA -------------------------------- The 'journal' could be NULL in nojournal mode, which causes a null-ptr-deref problem in ext4_iomap_map_blocks(). Fixes: 7f6416dcd4a3 ("ext4: implement writeback iomap path") Signed-off-by: Zhihao Cheng Signed-off-by: Zhihao Cheng --- fs/ext4/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index f9d5291db42b..9b877f4732f6 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -3878,7 +3878,7 @@ static int ext4_iomap_map_blocks(struct iomap_writepage_ctx *wpc, * ext4_count_free_blocks() is non-zero, a commit * should free up blocks. */ - if (ret == -ENOSPC && ext4_count_free_clusters(sb)) { + if (ret == -ENOSPC && journal && ext4_count_free_clusters(sb)) { jbd2_journal_force_commit_nested(journal); goto retry; } -- Gitee From b851791957c5f48f64f3139e2cd0e8de44069d23 Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Wed, 21 Aug 2024 20:16:38 +0800 Subject: [PATCH 4/7] iomap: don't mark blocks uptodate after partial zeroing hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I9DN5Z CVE: NA -------------------------------- In __iomap_write_begin(), if we unaligned buffered write data to a hole of a regular file, we only zero out the place where aligned to block size that we don't want to write, but mark the whole range uptodate if block size < folio size.
This is wrong since the not zeroed part will contain stale data and can be accessed by a concurrent buffered read easily (on the filesystem may not hold inode->i_rwsem) once we mark the range uptodate. At the same time, in the reading data branch, it's also unnecessary to set the just read range uptodate since we are going to set it immediately in __iomap_write_end(). Hence fix this by just dropping iomap_set_range_uptodate() in the zeroing out branch. Fixes: 9dc55f1389f9 ("iomap: add support for sub-pagesize buffered I/O without buffer heads") Reported-by: Matthew Wilcox Closes: https://lore.kernel.org/all/ZqsN5ouQTEc1KAzV@casper.infradead.org/ Signed-off-by: Zhang Yi Signed-off-by: Zhihao Cheng Signed-off-by: Zhihao Cheng --- fs/iomap/buffered-io.c | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index e705db1b53c9..42402a6de4eb 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -761,7 +761,6 @@ int __iomap_write_begin(const struct iomap_iter *iter, loff_t pos, if (status) return status; } - iomap_set_range_uptodate(folio, poff, plen); } while ((block_start += plen) < block_end); return 0; -- Gitee From b4cf4792d57107ec080dc6c1d57f98de6bcfdda9 Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Wed, 21 Aug 2024 20:16:39 +0800 Subject: [PATCH 5/7] iomap: reduce unnecessary state_lock when setting ifs uptodate and dirty bits hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I9DN5Z CVE: NA -------------------------------- When doing buffered write, we set uptodate and dirty bits of the written range separately, it holds the ifs->state_lock twice when blocksize < folio size, which is redundant. After large folio is supported, the spinlock could affect more about the performance, merge them could reduce some unnecessary locking overhead and gets some performance gain. Suggested-by: Dave Chinner Signed-off-by: Zhang Yi Reviewed-by: Darrick J.
Wong Conflicts: fs/iomap/buffered-io.c [ 15d09f865dc4("iomap: export __iomap_write_{begin|end}") is applied. ] Signed-off-by: Zhihao Cheng Signed-off-by: Zhihao Cheng --- fs/iomap/buffered-io.c | 38 +++++++++++++++++++++++++++++++++++--- 1 file changed, 35 insertions(+), 3 deletions(-) diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index 42402a6de4eb..414b7bdde787 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -211,6 +211,37 @@ bool iomap_is_fully_dirty(struct folio *folio, size_t from, size_t count) } EXPORT_SYMBOL_GPL(iomap_is_fully_dirty); +static void ifs_set_range_dirty_uptodate(struct folio *folio, + struct iomap_folio_state *ifs, size_t off, size_t len) +{ + struct inode *inode = folio->mapping->host; + unsigned int blks_per_folio = i_blocks_per_folio(inode, folio); + unsigned int first_blk = (off >> inode->i_blkbits); + unsigned int last_blk = (off + len - 1) >> inode->i_blkbits; + unsigned int nr_blks = last_blk - first_blk + 1; + unsigned long flags; + + spin_lock_irqsave(&ifs->state_lock, flags); + bitmap_set(ifs->state, first_blk, nr_blks); + if (ifs_is_fully_uptodate(folio, ifs)) + folio_mark_uptodate(folio); + bitmap_set(ifs->state, first_blk + blks_per_folio, nr_blks); + spin_unlock_irqrestore(&ifs->state_lock, flags); +} + +static void iomap_set_range_dirty_uptodate(struct folio *folio, + size_t off, size_t len) +{ + struct iomap_folio_state *ifs = folio->private; + + if (ifs) + ifs_set_range_dirty_uptodate(folio, ifs, off, len); + else + folio_mark_uptodate(folio); + + filemap_dirty_folio(folio->mapping, folio); +} + static struct iomap_folio_state *ifs_alloc(struct inode *inode, struct folio *folio, unsigned int flags) { @@ -867,6 +898,8 @@ static int iomap_write_begin(struct iomap_iter *iter, loff_t pos, bool __iomap_write_end(struct inode *inode, loff_t pos, size_t len, size_t copied, struct folio *folio) { + size_t from = offset_in_folio(folio, pos); + flush_dcache_folio(folio); /* @@ -882,9 +915,8 @@ bool 
__iomap_write_end(struct inode *inode, loff_t pos, size_t len, */ if (unlikely(copied < len && !folio_test_uptodate(folio))) return false; - iomap_set_range_uptodate(folio, offset_in_folio(folio, pos), len); - iomap_set_range_dirty(folio, offset_in_folio(folio, pos), copied); - filemap_dirty_folio(inode->i_mapping, folio); + + iomap_set_range_dirty_uptodate(folio, from, copied); return true; } EXPORT_SYMBOL_GPL(__iomap_write_end); -- Gitee From a4f7f98e865fcf4103e6ff54f683c594b08858a1 Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Wed, 21 Aug 2024 20:16:40 +0800 Subject: [PATCH 6/7] iomap: optimize setting uptodate bit hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I9DN5Z CVE: NA -------------------------------- When overwriting an already uptodate folio, we don't need to set the range uptodate again. This could save a barrier and some bit operations if block size < folio size. Suggested-by: Matthew Wilcox Signed-off-by: Zhang Yi Conflicts: fs/iomap/buffered-io.c [ 15d09f865dc4("iomap: export __iomap_write_{begin|end}") is applied. 
] Signed-off-by: Zhihao Cheng Signed-off-by: Zhihao Cheng --- fs/iomap/buffered-io.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index 414b7bdde787..09f410f5d637 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -916,7 +916,12 @@ bool __iomap_write_end(struct inode *inode, loff_t pos, size_t len, if (unlikely(copied < len && !folio_test_uptodate(folio))) return false; - iomap_set_range_dirty_uptodate(folio, from, copied); + if (folio_test_uptodate(folio)) { + iomap_set_range_dirty(folio, from, copied); + filemap_dirty_folio(folio->mapping, folio); + } else { + iomap_set_range_dirty_uptodate(folio, from, copied); + } return true; } EXPORT_SYMBOL_GPL(__iomap_write_end); -- Gitee From 9c47110538c51235159e0a1cbb7f183d9559849a Mon Sep 17 00:00:00 2001 From: Zhihao Cheng Date: Wed, 21 Aug 2024 20:16:41 +0800 Subject: [PATCH 7/7] iomap: improve iomap_folio_mkwrite_iter and ifs_clear_range_dirty hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I9DN5Z CVE: NA -------------------------------- Improve ifs_clear_range_dirty() by using shift operations. Improve iomap_folio_mkwrite_iter() by making folio dirty once.
Signed-off-by: Zhihao Cheng Signed-off-by: Zhihao Cheng --- fs/iomap/buffered-io.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index 09f410f5d637..fe64886e4eae 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -136,7 +136,8 @@ static void ifs_clear_range_dirty(struct folio *folio, { struct inode *inode = folio->mapping->host; unsigned int blks_per_folio = i_blocks_per_folio(inode, folio); - unsigned int first_blk = DIV_ROUND_UP(off, i_blocksize(inode)); + unsigned int first_blk = round_up(off, i_blocksize(inode)) >> + inode->i_blkbits; unsigned int last_blk = (off + len) >> inode->i_blkbits; unsigned int nr_blks = last_blk - first_blk; unsigned long flags; @@ -1559,7 +1560,6 @@ static loff_t iomap_folio_mkwrite_iter(struct iomap_iter *iter, ifs_alloc(iter->inode, folio, 0); iomap_set_range_dirty(folio, 0, length); - filemap_dirty_folio(iter->inode->i_mapping, folio); } return length; @@ -1583,6 +1583,8 @@ vm_fault_t iomap_page_mkwrite(struct vm_fault *vmf, const struct iomap_ops *ops) while ((ret = iomap_iter(&iter, ops)) > 0) iter.processed = iomap_folio_mkwrite_iter(&iter, folio); + if (iter.pos > folio_pos(folio)) + filemap_dirty_folio(folio->mapping, folio); if (ret < 0) goto out_unlock; folio_wait_stable(folio); -- Gitee