From c804408112aef04354efc0bd6b3ab88d39449a0d Mon Sep 17 00:00:00 2001 From: Joseph Qi Date: Mon, 12 May 2025 15:33:22 +0800 Subject: [PATCH 1/7] anolis: ext4: add direct_IO for ext4_iomap_aops ANBZ: #20084 Add the missing .direct_IO for ext4_iomap_aops. Now just use the default noop_direct_IO like others. Fixes: e71c3e732afb ("ext4: add a new iomap aops for regular file's buffered IO path") Signed-off-by: Joseph Qi --- fs/ext4/inode.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 5082e11727f9..108e76e5becd 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -4022,6 +4022,7 @@ static const struct address_space_operations ext4_iomap_aops = { .bmap = ext4_bmap, .invalidate_folio = iomap_invalidate_folio, .release_folio = iomap_release_folio, + .direct_IO = noop_direct_IO, .migrate_folio = filemap_migrate_folio, .is_partially_uptodate = iomap_is_partially_uptodate, .error_remove_folio = generic_error_remove_folio, -- Gitee From a23326db09bf8d2cca55131ab7cddb61d4f3df2d Mon Sep 17 00:00:00 2001 From: Joseph Qi Date: Mon, 12 May 2025 16:04:21 +0800 Subject: [PATCH 2/7] anolis: ext4: writeback partial blocks before zeroing out range ANBZ: #20084 If we zero partial blocks, iomap_zero_iter() will skip zeroing out the IOMAP_UNWRITTEN srcmap, it works fine in xfs because this type means the block is pure unwritten and doesn't contain any delayed data. But it doesn't work in ext4, because IOMAP_UNWRITTEN may contain delayed data in ext4. For now it's hard to unify the meaning of this flag, so just fix it by writeback partial blocks before zeroing out. 
This is cherry-picked from: https://lore.kernel.org/linux-ext4/20240127015825.1608160-23-yi.zhang@huaweicloud.com/ Signed-off-by: Zhang Yi Signed-off-by: Joseph Qi --- fs/ext4/extents.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 3b59ac862ff8..52b8c9d7536e 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -4567,6 +4567,16 @@ static long ext4_zero_range(struct file *file, loff_t offset, new_size, flags); if (ret) return ret; + + ret = filemap_write_and_wait_range(file->f_mapping, + round_down(offset, 1 << blkbits), offset); + if (ret) + return ret; + + ret = filemap_write_and_wait_range(file->f_mapping, offset + len, + round_up((offset + len), 1 << blkbits)); + if (ret) + return ret; } ret = ext4_update_disksize_before_punch(inode, offset, len); -- Gitee From 2a6d59d4e2d3b1b0a52349d6fb74e338e06f9d14 Mon Sep 17 00:00:00 2001 From: Joseph Qi Date: Mon, 12 May 2025 16:12:31 +0800 Subject: [PATCH 3/7] anolis: ext4: fall back to buffer_head path for defrag ANBZ: #20084 Online defrag doesn't support iomap path yet, we have to fall back to buffer_head path for the inode which has been using iomap. Changing active inode is dangerous, before we start, we must hold the inode lock and the mapping->invalidate_lock, and writeback all dirty folios and drop the inode's pagecache. 
This is cherry-picked from: https://lore.kernel.org/linux-ext4/20240127015825.1608160-24-yi.zhang@huaweicloud.com/ Fixes: 52925b270812 ("ext4: disable online defrag when inode using iomap buffered I/O path") Signed-off-by: Zhang Yi Signed-off-by: Joseph Qi --- fs/ext4/move_extent.c | 48 ++++++++++++++++++++++++++++++++++++------- 1 file changed, 41 insertions(+), 7 deletions(-) diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c index d49c0b88597f..6a0d2824d4fb 100644 --- a/fs/ext4/move_extent.c +++ b/fs/ext4/move_extent.c @@ -545,6 +545,34 @@ mext_check_arguments(struct inode *orig_inode, return 0; } +/* + * Disable buffered iomap path for the inode that requiring move extents, + * fallback to buffer_head path. + */ +static int ext4_disable_buffered_iomap_aops(struct inode *inode) +{ + int err; + + /* + * The buffered_head aops don't know how to handle folios + * dirtied by iomap, so before falling back, flush all dirty + * folios the inode has. + */ + filemap_invalidate_lock(inode->i_mapping); + err = filemap_write_and_wait(inode->i_mapping); + if (err < 0) { + filemap_invalidate_unlock(inode->i_mapping); + return err; + } + truncate_inode_pages(inode->i_mapping, 0); + + ext4_clear_inode_state(inode, EXT4_STATE_BUFFERED_IOMAP); + ext4_set_aops(inode); + filemap_invalidate_unlock(inode->i_mapping); + + return 0; +} + /** * ext4_move_extents - Exchange the specified range of a file * @@ -609,13 +637,6 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, __u64 orig_blk, return -EOPNOTSUPP; } - if (ext4_test_inode_state(orig_inode, EXT4_STATE_BUFFERED_IOMAP) || - ext4_test_inode_state(donor_inode, EXT4_STATE_BUFFERED_IOMAP)) { - ext4_msg(orig_inode->i_sb, KERN_ERR, - "Online defrag not supported for inode with iomap buffered IO path"); - return -EOPNOTSUPP; - } - /* Protect orig and donor inodes against a truncate */ lock_two_nondirectories(orig_inode, donor_inode); @@ -623,6 +644,18 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, 
__u64 orig_blk, inode_dio_wait(orig_inode); inode_dio_wait(donor_inode); + /* Fallback to buffer_head aops for inodes with buffered iomap aops */ + if (ext4_test_inode_state(orig_inode, EXT4_STATE_BUFFERED_IOMAP)) { + ret = ext4_disable_buffered_iomap_aops(orig_inode); + if (ret) + goto out_unlock; + } + if (ext4_test_inode_state(donor_inode, EXT4_STATE_BUFFERED_IOMAP)) { + ret = ext4_disable_buffered_iomap_aops(donor_inode); + if (ret) + goto out_unlock; + } + /* Protect extent tree against block allocations via delalloc */ ext4_double_down_write_data_sem(orig_inode, donor_inode); /* Check the filesystem environment whether move_extent can be done */ @@ -706,6 +739,7 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, __u64 orig_blk, ext4_free_ext_path(path); ext4_double_up_write_data_sem(orig_inode, donor_inode); +out_unlock: unlock_two_nondirectories(orig_inode, donor_inode); return ret; -- Gitee From 36fc6e017e688556d9211cc0aa688e01f2a65e40 Mon Sep 17 00:00:00 2001 From: Joseph Qi Date: Mon, 12 May 2025 16:25:12 +0800 Subject: [PATCH 4/7] anolis: filemap: support disable large folios on active inode ANBZ: #20084 Since commit 730633f0b7f9 ("mm: Protect operations adding pages to page cache with invalidate_lock"), mapping->invalidate_lock can protect us from adding new folios into page cache. So it's possible to disable active inodes' large folios support, even through it might be dangerous. Filesystems can disable it under mapping->invalidate_lock and drop all page cache before dropping AS_LARGE_FOLIO_SUPPORT. 
This is cherry-picked from: https://lore.kernel.org/linux-ext4/20240127015825.1608160-26-yi.zhang@huaweicloud.com/ Signed-off-by: Zhang Yi Signed-off-by: Joseph Qi --- fs/ext4/move_extent.c | 1 + include/linux/pagemap.h | 14 ++++++++++++++ mm/readahead.c | 7 +++++++ 3 files changed, 22 insertions(+) diff --git a/fs/ext4/move_extent.c index 6a0d2824d4fb..26656d3d8e90 100644 --- a/fs/ext4/move_extent.c +++ b/fs/ext4/move_extent.c @@ -567,6 +567,7 @@ static int ext4_disable_buffered_iomap_aops(struct inode *inode) truncate_inode_pages(inode->i_mapping, 0); ext4_clear_inode_state(inode, EXT4_STATE_BUFFERED_IOMAP); + mapping_clear_large_folios(inode->i_mapping); ext4_set_aops(inode); filemap_invalidate_unlock(inode->i_mapping); diff --git a/include/linux/pagemap.h index 623159bc6cc8..28155f6b75ee 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -391,6 +391,20 @@ static inline void mapping_set_large_folios(struct address_space *mapping) __set_bit(AS_LARGE_FOLIO_SUPPORT, &mapping->flags); } +/** + * mapping_clear_large_folios() - Disable large folio support for the file. + * @mapping: The file. + * + * The filesystem has to make sure the file is in atomic context and all + * cached folios have been cleared under mapping->invalidate_lock before + * calling this function. + */ +static inline void mapping_clear_large_folios(struct address_space *mapping) +{ + WARN_ON_ONCE(!rwsem_is_locked(&mapping->invalidate_lock)); + __clear_bit(AS_LARGE_FOLIO_SUPPORT, &mapping->flags); +} + /* * Large folio support currently depends on THP. These dependencies are * being worked on but are not yet fixed. 
diff --git a/mm/readahead.c b/mm/readahead.c index 222f71c4d40e..1a93d750f610 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -549,6 +549,13 @@ void page_cache_ra_order(struct readahead_control *ractl, /* See comment in page_cache_ra_unbounded() */ nofs = memalloc_nofs_save(); filemap_invalidate_lock_shared(mapping); + + if (unlikely(!mapping_large_folio_support(mapping))) { + filemap_invalidate_unlock_shared(mapping); + memalloc_nofs_restore(nofs); + goto fallback; + } + while (index <= limit) { unsigned int order = new_order; -- Gitee From f498b06cfe6f3b5cde6fadfbbddf4cb72d04953f Mon Sep 17 00:00:00 2001 From: Joseph Qi Date: Wed, 14 May 2025 10:17:14 +0800 Subject: [PATCH 5/7] anolis: iomap: correct the range of a partial dirty clear ANBZ: #20084 The block range calculation in ifs_clear_range_dirty() is incorrect when partial clear a range in a folio. We can't clear the dirty bit of the first block or the last block if the start or end offset is blocksize unaligned, this has not yet caused any issue since we always clear a whole folio in iomap_writepage_map()->iomap_clear_range_dirty(). Fix this by round up the first block and round down the last block and correct the calculation of nr_blks. 
This is cherry-picked from: https://mailweb.openeuler.org/hyperkitty/list/kernel@openeuler.org/message/I3UAW4Z5SJGRJFSK2MIOMPEGQG5P7SPP/ Signed-off-by: Zhang Yi Signed-off-by: Joseph Qi --- fs/iomap/buffered-io.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index 2f91ac685a4d..ee5bd42fb927 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -135,11 +135,14 @@ static void ifs_clear_range_dirty(struct folio *folio, { struct inode *inode = folio->mapping->host; unsigned int blks_per_folio = i_blocks_per_folio(inode, folio); - unsigned int first_blk = (off >> inode->i_blkbits); - unsigned int last_blk = (off + len - 1) >> inode->i_blkbits; - unsigned int nr_blks = last_blk - first_blk + 1; + unsigned int first_blk = DIV_ROUND_UP(off, i_blocksize(inode)); + unsigned int last_blk = (off + len) >> inode->i_blkbits; + unsigned int nr_blks = last_blk - first_blk; unsigned long flags; + if (!nr_blks) + return; + spin_lock_irqsave(&ifs->state_lock, flags); bitmap_clear(ifs->state, first_blk + blks_per_folio, nr_blks); spin_unlock_irqrestore(&ifs->state_lock, flags); -- Gitee From 9e722828baeff92b6be53a8be965403faf1e20be Mon Sep 17 00:00:00 2001 From: Joseph Qi Date: Tue, 13 May 2025 19:32:12 +0800 Subject: [PATCH 6/7] anolis: iomap: support invalidating partial folios ANBZ: #20084 Current iomap_invalidate_folio() could only invalidate an entire folio, if we truncate a partial folio on a filesystem with blocksize < folio size, it will left over the dirty bits of truncated/punched blocks, and the writeback process will try to map the invalid hole range, but fortunately it hasn't trigger any real problems now since ->map() will fix the length. Fix this by supporting invalidating partial folios. 
This is cherry-picked from: https://mailweb.openeuler.org/hyperkitty/list/kernel@openeuler.org/message/I3UAW4Z5SJGRJFSK2MIOMPEGQG5P7SPP/ Signed-off-by: Zhang Yi Signed-off-by: Joseph Qi --- fs/iomap/buffered-io.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index ee5bd42fb927..f41266202776 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -629,6 +629,8 @@ void iomap_invalidate_folio(struct folio *folio, size_t offset, size_t len) WARN_ON_ONCE(folio_test_writeback(folio)); folio_cancel_dirty(folio); ifs_free(folio); + } else { + iomap_clear_range_dirty(folio, offset, len); } } EXPORT_SYMBOL_GPL(iomap_invalidate_folio); -- Gitee From 17411a9b8a164f4cd5a628fa28a14d28a34aca90 Mon Sep 17 00:00:00 2001 From: Joseph Qi Date: Wed, 14 May 2025 16:16:26 +0800 Subject: [PATCH 7/7] anolis: ext4: don't allow remount with buffered iomap ANBZ: #20084 It's not safe to switch between buffer head and buffered iomap mode. So don't allow remount with buffered iomap now. Also remove nobuffered iomap option as of now since it has no usage. 
Fixes: 86f76a4b13aa ("ext4: introduce a mount option for iomap buffered I/O path") Signed-off-by: Joseph Qi --- fs/ext4/super.c | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 00591af033fa..a58f21a9c877 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1712,8 +1712,7 @@ enum { Opt_dioread_nolock, Opt_dioread_lock, Opt_discard, Opt_nodiscard, Opt_init_itable, Opt_noinit_itable, Opt_max_dir_size_kb, Opt_nojournal_checksum, Opt_nombcache, - Opt_no_prefetch_block_bitmaps, Opt_mb_optimize_scan, - Opt_buffered_iomap, Opt_nobuffered_iomap, + Opt_no_prefetch_block_bitmaps, Opt_mb_optimize_scan, Opt_buffered_iomap, Opt_errors, Opt_data, Opt_data_err, Opt_jqfmt, Opt_dax_type, #ifdef CONFIG_EXT4_DEBUG Opt_fc_debug_max_replay, Opt_fc_debug_force @@ -1857,7 +1856,6 @@ static const struct fs_parameter_spec ext4_param_specs[] = { Opt_no_prefetch_block_bitmaps), fsparam_s32 ("mb_optimize_scan", Opt_mb_optimize_scan), fsparam_flag ("buffered_iomap", Opt_buffered_iomap), - fsparam_flag ("nobuffered_iomap", Opt_nobuffered_iomap), fsparam_string ("check", Opt_removed), /* mount option from ext2/3 */ fsparam_flag ("nocheck", Opt_removed), /* mount option from ext2/3 */ fsparam_flag ("reservation", Opt_removed), /* mount option from ext2/3 */ @@ -1954,8 +1952,6 @@ static const struct mount_opts { MOPT_SET}, {Opt_buffered_iomap, EXT4_MOUNT2_BUFFERED_IOMAP, MOPT_SET | MOPT_2 | MOPT_EXT4_ONLY}, - {Opt_nobuffered_iomap, EXT4_MOUNT2_BUFFERED_IOMAP, - MOPT_CLEAR | MOPT_2 | MOPT_EXT4_ONLY}, #ifdef CONFIG_EXT4_DEBUG {Opt_fc_debug_force, EXT4_MOUNT2_JOURNAL_FAST_COMMIT, MOPT_SET | MOPT_2 | MOPT_EXT4_ONLY}, @@ -2442,6 +2438,11 @@ static int ext4_parse_param(struct fs_context *fc, struct fs_parameter *param) return -EINVAL; } return 0; + case Opt_buffered_iomap: + ext4_msg(NULL, KERN_WARNING, + "buffered iomap enabled. 
Warning: EXPERIMENTAL, use at your own risk"); + ctx_set_mount_opt2(ctx, EXT4_MOUNT2_BUFFERED_IOMAP); + return 0; } /* @@ -2829,10 +2830,6 @@ static int ext4_check_opt_consistency(struct fs_context *fc, return -EINVAL; } - if (ctx_test_mount_opt2(ctx, EXT4_MOUNT2_BUFFERED_IOMAP)) - ext4_msg(NULL, KERN_WARNING, - "Warning: mounting with an experimental option 'buffered_iomap'"); - err = ext4_check_test_dummy_encryption(fc, sb); if (err) return err; @@ -2876,6 +2873,12 @@ static int ext4_check_opt_consistency(struct fs_context *fc, !(sbi->s_mount_opt2 & EXT4_MOUNT2_DAX_INODE))) { goto fail_dax_change_remount; } + + if (ctx_test_mount_opt2(ctx, EXT4_MOUNT2_BUFFERED_IOMAP) && + !test_opt2(sb, BUFFERED_IOMAP)) { + ext4_msg(NULL, KERN_ERR, "can't enable buffered iomap while remounting"); + return -EINVAL; + } } return ext4_check_quota_consistency(fc, sb); -- Gitee