diff --git a/block/fops.c b/block/fops.c index 0abaac705dafb08bf26078a7d3048c12916067c6..7e921f999182dc4619cb463e4766b88d317ec69f 100644 --- a/block/fops.c +++ b/block/fops.c @@ -467,7 +467,7 @@ static void blkdev_readahead(struct readahead_control *rac) } static int blkdev_map_blocks(struct iomap_writepage_ctx *wpc, - struct inode *inode, loff_t offset) + struct inode *inode, loff_t offset, unsigned int len) { loff_t isize = i_size_read(inode); diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index a87cdb53198fb7b03e5f0c92767269f72b70c707..ffac6fc24cdd000304849f25e3753eaff5371b9f 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -263,8 +263,10 @@ struct ext4_allocation_request { #define EXT4_MAP_MAPPED BIT(BH_Mapped) #define EXT4_MAP_UNWRITTEN BIT(BH_Unwritten) #define EXT4_MAP_BOUNDARY BIT(BH_Boundary) +#define EXT4_MAP_DELAYED BIT(BH_Delay) #define EXT4_MAP_FLAGS (EXT4_MAP_NEW | EXT4_MAP_MAPPED |\ - EXT4_MAP_UNWRITTEN | EXT4_MAP_BOUNDARY) + EXT4_MAP_UNWRITTEN | EXT4_MAP_BOUNDARY |\ + EXT4_MAP_DELAYED) struct ext4_map_blocks { ext4_fsblk_t m_pblk; @@ -1114,6 +1116,7 @@ struct ext4_inode_info { ext4_lblk_t i_es_shrink_lblk; /* Offset where we start searching for extents to shrink. Protected by i_es_lock */ + unsigned int i_es_seq; /* modify counter for extents */ /* ialloc */ ext4_group_t i_last_alloc_group; @@ -1145,6 +1148,8 @@ struct ext4_inode_info { */ struct list_head i_rsv_conversion_list; struct work_struct i_rsv_conversion_work; + struct list_head i_iomap_ioend_list; + struct work_struct i_iomap_ioend_work; atomic_t i_unwritten; /* Nr. of inflight conversions pending */ spinlock_t i_block_reservation_lock; @@ -1250,6 +1255,7 @@ struct ext4_inode_info { * scanning in mballoc */ #define EXT4_MOUNT2_ABORT 0x00000100 /* Abort filesystem */ +#define EXT4_MOUNT2_BUFFERED_IOMAP 0x00000200 /* Use iomap for buffered IO */ #define clear_opt(sb, opt) EXT4_SB(sb)->s_mount_opt &= \ ~EXT4_MOUNT_##opt @@ -1910,6 +1916,7 @@ enum { EXT4_STATE_VERITY_IN_PROGRESS, /* building fs-verity Merkle tree */ EXT4_STATE_FC_COMMITTING, /* Fast commit ongoing */ EXT4_STATE_ORPHAN_FILE, /* Inode orphaned in orphan file */ + EXT4_STATE_BUFFERED_IOMAP, /* Inode use iomap for buffered IO */ }; #define EXT4_INODE_BIT_FNS(name, field, offset) \ @@ -2966,6 +2973,8 @@ int ext4_walk_page_buffers(handle_t *handle, struct buffer_head *bh)); int do_journal_get_write_access(handle_t *handle, struct inode *inode, struct buffer_head *bh); +bool ext4_should_use_buffered_iomap(struct inode *inode); +int ext4_nonda_switch(struct super_block *sb); #define FALL_BACK_TO_NONDELALLOC 1 #define CONVERT_INLINE_DATA 2 @@ -3750,6 +3759,8 @@ int ext4_bio_write_folio(struct ext4_io_submit *io, struct folio *page, size_t len); extern struct ext4_io_end_vec *ext4_alloc_io_end_vec(ext4_io_end_t *io_end); extern struct ext4_io_end_vec *ext4_last_io_end_vec(ext4_io_end_t *io_end); +extern void ext4_iomap_end_io(struct work_struct *work); +extern void ext4_iomap_end_bio(struct bio *bio); /* mmp.c */ extern int ext4_multi_mount_protect(struct super_block *, ext4_fsblk_t); @@ -3823,6 +3834,8 @@ static inline void ext4_clear_io_unwritten_flag(ext4_io_end_t *io_end) extern const struct iomap_ops ext4_iomap_ops; extern const struct iomap_ops ext4_iomap_overwrite_ops; extern const struct iomap_ops ext4_iomap_report_ops; +extern const struct iomap_ops ext4_iomap_buffered_write_ops; +extern const struct iomap_ops ext4_iomap_buffered_da_write_ops; static inline int ext4_buffer_uptodate(struct buffer_head *bh) { diff --git a/fs/ext4/ext4_jbd2.c 
b/fs/ext4/ext4_jbd2.c index d1a2e662440178e87c8240a1fa70f5cdf4731cef..94c8073b49e75c96de6012635cd381c335c7f45c 100644 --- a/fs/ext4/ext4_jbd2.c +++ b/fs/ext4/ext4_jbd2.c @@ -11,6 +11,12 @@ int ext4_inode_journal_mode(struct inode *inode) { if (EXT4_JOURNAL(inode) == NULL) return EXT4_INODE_WRITEBACK_DATA_MODE; /* writeback */ + /* + * Ordered mode is no longer needed for inodes that use the + * iomap path, so always use writeback mode. + */ + if (ext4_test_inode_state(inode, EXT4_STATE_BUFFERED_IOMAP)) + return EXT4_INODE_WRITEBACK_DATA_MODE; /* writeback */ /* We do not support data journalling with delayed allocation */ if (!S_ISREG(inode->i_mode) || ext4_test_inode_flag(inode, EXT4_INODE_EA_INODE) || diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index eb9e88e5a93bc0e409c84ca423668d792b72be17..9b522d5c7dc685244cbf3baa54d08f09166aae06 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -3719,21 +3719,27 @@ static int ext4_convert_unwritten_extents_endio(handle_t *handle, ext_debug(inode, "logical block %llu, max_blocks %u\n", (unsigned long long)ee_block, ee_len); - /* If extent is larger than requested it is a clear sign that we still - * have some extent state machine issues left. So extent_split is still - * required. - * TODO: Once all related issues will be fixed this situation should be - * illegal. + /* + * Inodes that use the buffered iomap path need to split + * extents in endio; other inodes do not. + * + * TODO: Reserve enough space for splitting extents, always split + * extents here, and remove this warning entirely. */ if (ee_block != map->m_lblk || ee_len > map->m_len) { #ifdef CONFIG_EXT4_DEBUG - ext4_warning(inode->i_sb, "Inode (%ld) finished: extent logical block %llu," - " len %u; IO logical block %llu, len %u", - inode->i_ino, (unsigned long long)ee_block, ee_len, - (unsigned long long)map->m_lblk, map->m_len); + if (!ext4_test_inode_state(inode, EXT4_STATE_BUFFERED_IOMAP)) { + ext4_warning(inode->i_sb, + "Inode (%ld) finished: extent logical block %llu, " + "len %u; IO logical block %llu, len %u", + inode->i_ino, (unsigned long long)ee_block, + ee_len, (unsigned long long)map->m_lblk, + map->m_len); + } #endif err = ext4_split_convert_extents(handle, inode, map, ppath, - EXT4_GET_BLOCKS_CONVERT); + EXT4_GET_BLOCKS_CONVERT | + EXT4_GET_BLOCKS_METADATA_NOFAIL); if (err < 0) return err; path = ext4_find_extent(inode, map->m_lblk, ppath, 0); @@ -4087,8 +4093,11 @@ static ext4_lblk_t ext4_ext_determine_insert_hole(struct inode *inode, /* * The delalloc extent containing lblk, it must have been * added after ext4_map_blocks() checked the extent status - * tree, adjust the length to the delalloc extent's after - * lblk. + * tree, since we are not holding i_rwsem and the delalloc info is + * only stabilized by the i_data_sem we are going to release + * soon. Don't modify the extent status tree; report the + * extent as a hole and just adjust the length to the part of the + * delalloc extent after lblk.
*/ len = es.es_lblk + es.es_len - lblk; return len; @@ -4618,6 +4627,15 @@ static long ext4_zero_range(struct file *file, loff_t offset, if (ret) goto out_mutex; + ret = filemap_write_and_wait_range(mapping, + round_down(offset, 1 << blkbits), offset); + if (ret) + goto out_mutex; + + ret = filemap_write_and_wait_range(mapping, offset + len, + round_up((offset + len), 1 << blkbits)); + if (ret) + goto out_mutex; } /* Zero range excluding the unaligned edges */ diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c index f4b50652f0ccea9fec831a46677958ee855188fc..45f7c46bc1ac4fad7a291da4c1809500eccebbc0 100644 --- a/fs/ext4/extents_status.c +++ b/fs/ext4/extents_status.c @@ -204,6 +204,13 @@ static inline ext4_lblk_t ext4_es_end(struct extent_status *es) return es->es_lblk + es->es_len - 1; } +static inline void ext4_es_inc_seq(struct inode *inode) +{ + struct ext4_inode_info *ei = EXT4_I(inode); + + WRITE_ONCE(ei->i_es_seq, READ_ONCE(ei->i_es_seq) + 1); +} + /* * search through the tree for an delayed extent with a given offset. If * it can't be found, try to find next extent. @@ -876,6 +883,7 @@ void ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk, WARN_ON(1); } + ext4_es_inc_seq(inode); newes.es_lblk = lblk; newes.es_len = len; ext4_es_store_pblock_status(&newes, pblk, status); @@ -1503,13 +1511,15 @@ void ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk, if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY) return; - trace_ext4_es_remove_extent(inode, lblk, len); es_debug("remove [%u/%u) from extent status tree of inode %lu\n", lblk, len, inode->i_ino); if (!len) return; + ext4_es_inc_seq(inode); + trace_ext4_es_remove_extent(inode, lblk, len); + end = lblk + len - 1; BUG_ON(end < lblk); @@ -2049,34 +2059,43 @@ bool ext4_is_pending(struct inode *inode, ext4_lblk_t lblk) } /* - * ext4_es_insert_delayed_block - adds a delayed block to the extents status - * tree, adding a pending reservation where - * needed + * ext4_es_insert_delayed_extent - adds some delayed blocks to the extents + * status tree, adding a pending reservation + * where needed * * @inode - file containing the newly added block - * @lblk - logical block to be added - * @allocated - indicates whether a physical cluster has been allocated for - * the logical cluster that contains the block + * @lblk - start logical block to be added + * @len - length of blocks to be added + * @lclu_allocated/end_allocated - indicates whether a physical cluster has + * been allocated for the logical cluster + * that contains the block */ -void ext4_es_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk, - bool allocated) +void ext4_es_insert_delayed_extent(struct inode *inode, ext4_lblk_t lblk, + ext4_lblk_t len, bool lclu_allocated, + bool end_allocated) { struct extent_status newes; + ext4_lblk_t end = lblk + len - 1; int err1 = 0, err2 = 0, err3 = 0; struct extent_status *es1 = NULL; struct extent_status *es2 = NULL; - struct pending_reservation *pr = NULL; + struct pending_reservation *pr1 = NULL; + struct pending_reservation *pr2 = NULL; if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY) return; - es_debug("add [%u/1) delayed to extent status tree of inode %lu\n", - lblk, inode->i_ino); + es_debug("add [%u/%u) delayed to extent status tree of inode %lu\n", + lblk, len, inode->i_ino); + if (!len) + return; + ext4_es_inc_seq(inode); newes.es_lblk = lblk; - newes.es_len = 1; + newes.es_len = len; ext4_es_store_pblock_status(&newes, ~0, EXTENT_STATUS_DELAYED); - 
trace_ext4_es_insert_delayed_block(inode, &newes, allocated); + trace_ext4_es_insert_delayed_extent(inode, &newes, lclu_allocated, + end_allocated); ext4_es_insert_extent_check(inode, &newes); @@ -2085,11 +2104,15 @@ void ext4_es_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk, es1 = __es_alloc_extent(true); if ((err1 || err2) && !es2) es2 = __es_alloc_extent(true); - if ((err1 || err2 || err3) && allocated && !pr) - pr = __alloc_pending(true); + if (err1 || err2 || err3) { + if (lclu_allocated && !pr1) + pr1 = __alloc_pending(true); + if (end_allocated && !pr2) + pr2 = __alloc_pending(true); + } write_lock(&EXT4_I(inode)->i_es_lock); - err1 = __es_remove_extent(inode, lblk, lblk, NULL, es1); + err1 = __es_remove_extent(inode, lblk, end, NULL, es1); if (err1 != 0) goto error; /* Free preallocated extent if it didn't get used. */ @@ -2109,13 +2132,22 @@ void ext4_es_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk, es2 = NULL; } - if (allocated) { - err3 = __insert_pending(inode, lblk, &pr); + if (lclu_allocated) { + err3 = __insert_pending(inode, lblk, &pr1); if (err3 != 0) goto error; - if (pr) { - __free_pending(pr); - pr = NULL; + if (pr1) { + __free_pending(pr1); + pr1 = NULL; + } + } + if (end_allocated) { + err3 = __insert_pending(inode, end, &pr2); + if (err3 != 0) + goto error; + if (pr2) { + __free_pending(pr2); + pr2 = NULL; } } error: diff --git a/fs/ext4/extents_status.h b/fs/ext4/extents_status.h index d9847a4a25dbaa73a1a6069052fb2fb1563a7ecc..3c8e2edee5d5d1e77af891530e9e5703ec56d0cc 100644 --- a/fs/ext4/extents_status.h +++ b/fs/ext4/extents_status.h @@ -249,8 +249,9 @@ extern void ext4_exit_pending(void); extern void ext4_init_pending_tree(struct ext4_pending_tree *tree); extern void ext4_remove_pending(struct inode *inode, ext4_lblk_t lblk); extern bool ext4_is_pending(struct inode *inode, ext4_lblk_t lblk); -extern void ext4_es_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk, - bool allocated); +extern void ext4_es_insert_delayed_extent(struct inode *inode, ext4_lblk_t lblk, + ext4_lblk_t len, bool lclu_allocated, + bool end_allocated); extern unsigned int ext4_es_delayed_clu(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t len); extern void ext4_clear_inode_es(struct inode *inode); diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 324b45b51d1fbda74d2218cadfedbd7cdd626faf..67b0e2212ca0bf736b8f7b65a7d4efef28e5a18e 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -286,6 +286,20 @@ static ssize_t ext4_write_checks(struct kiocb *iocb, struct iov_iter *from) return count; } +static ssize_t ext4_iomap_buffered_write(struct kiocb *iocb, + struct iov_iter *from) +{ + struct inode *inode = file_inode(iocb->ki_filp); + const struct iomap_ops *iomap_ops; + + if (test_opt(inode->i_sb, DELALLOC) && !ext4_nonda_switch(inode->i_sb)) + iomap_ops = &ext4_iomap_buffered_da_write_ops; + else + iomap_ops = &ext4_iomap_buffered_write_ops; + + return iomap_file_buffered_write(iocb, from, iomap_ops); +} + static ssize_t ext4_buffered_write_iter(struct kiocb *iocb, struct iov_iter *from) { @@ -300,7 +314,10 @@ static ssize_t ext4_buffered_write_iter(struct kiocb *iocb, if (ret <= 0) goto out; - ret = generic_perform_write(iocb, from); + if (ext4_test_inode_state(inode, EXT4_STATE_BUFFERED_IOMAP)) + ret = ext4_iomap_buffered_write(iocb, from); + else + ret = generic_perform_write(iocb, from); out: inode_unlock(inode); diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index b65058d972f95646fa0d629a96a36836ea86076a..a72c7167c33fc87001553dcc4e369b7f8edf709b 100644 
--- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -1336,6 +1336,11 @@ struct inode *__ext4_new_inode(struct mnt_idmap *idmap, } } + if (ext4_should_use_buffered_iomap(inode)) { + ext4_set_inode_state(inode, EXT4_STATE_BUFFERED_IOMAP); + mapping_set_large_folios(inode->i_mapping); + } + if (ext4_handle_valid(handle)) { ei->i_sync_tid = handle->h_transaction->t_tid; ei->i_datasync_tid = handle->h_transaction->t_tid; diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index d7732320431ac780879a86f74ce87e947c26c3a4..93a9dd03cb5c9c9e22e0ea5de944656c85742ef6 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -43,6 +43,7 @@ #include #include "ext4_jbd2.h" +#include "ext4_extents.h" #include "xattr.h" #include "acl.h" #include "truncate.h" @@ -453,6 +454,115 @@ static void ext4_map_blocks_es_recheck(handle_t *handle, } #endif /* ES_AGGRESSIVE_TEST */ +static int ext4_map_query_blocks(handle_t *handle, struct inode *inode, + struct ext4_map_blocks *map) +{ + unsigned int status; + int retval; + + if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) + retval = ext4_ext_map_blocks(handle, inode, map, 0); + else + retval = ext4_ind_map_blocks(handle, inode, map, 0); + + if (retval <= 0) + return retval; + + if (unlikely(retval != map->m_len)) { + ext4_warning(inode->i_sb, + "ES len assertion failed for inode " + "%lu: retval %d != map->m_len %d", + inode->i_ino, retval, map->m_len); + WARN_ON(1); + } + + status = map->m_flags & EXT4_MAP_UNWRITTEN ? + EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN; + ext4_es_insert_extent(inode, map->m_lblk, map->m_len, + map->m_pblk, status); + return retval; +} + +static int ext4_map_create_blocks(handle_t *handle, struct inode *inode, + struct ext4_map_blocks *map, int flags) +{ + struct extent_status es; + unsigned int status; + int err, retval = 0; + + /* + * Here we clear m_flags because after allocating an new extent, + * it will be set again. + */ + map->m_flags &= ~EXT4_MAP_FLAGS; + + /* + * We need to check for EXT4 here because migrate could have + * changed the inode type in between. + */ + if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { + retval = ext4_ext_map_blocks(handle, inode, map, flags); + } else { + retval = ext4_ind_map_blocks(handle, inode, map, flags); + + /* + * We allocated new blocks which will result in i_data's + * format changing. Force the migrate to fail by clearing + * migrate flags. + */ + if (retval > 0 && map->m_flags & EXT4_MAP_NEW) + ext4_clear_inode_state(inode, EXT4_STATE_EXT_MIGRATE); + } + if (retval <= 0) + return retval; + + if (unlikely(retval != map->m_len)) { + ext4_warning(inode->i_sb, + "ES len assertion failed for inode %lu: " + "retval %d != map->m_len %d", + inode->i_ino, retval, map->m_len); + WARN_ON(1); + } + + /* + * We have to zeroout blocks before inserting them into extent + * status tree. Otherwise someone could look them up there and + * use them before they are really zeroed. We also have to + * unmap metadata before zeroing as otherwise writeback can + * overwrite zeros with stale data from block device. + */ + if (flags & EXT4_GET_BLOCKS_ZERO && + map->m_flags & EXT4_MAP_MAPPED && map->m_flags & EXT4_MAP_NEW) { + err = ext4_issue_zeroout(inode, map->m_lblk, map->m_pblk, + map->m_len); + if (err) + return err; + } + + /* + * If the extent has been zeroed out, we don't need to update + * extent status tree. 
+ */ + if (flags & EXT4_GET_BLOCKS_PRE_IO && + ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es)) { + if (ext4_es_is_written(&es)) + return retval; + } + + status = map->m_flags & EXT4_MAP_UNWRITTEN ? + EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN; + if (!(flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) && + !(status & EXTENT_STATUS_WRITTEN) && + ext4_es_scan_range(inode, &ext4_es_is_delayed, map->m_lblk, + map->m_lblk + map->m_len - 1)) + status |= EXTENT_STATUS_DELAYED; + + ext4_es_insert_extent(inode, map->m_lblk, map->m_len, + map->m_pblk, status); + + return retval; +} + /* * The ext4_map_blocks() function tries to look up the requested blocks, * and returns if the blocks are already mapped. @@ -515,6 +625,8 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode, map->m_len = retval; } else if (ext4_es_is_delayed(&es) || ext4_es_is_hole(&es)) { map->m_pblk = 0; + map->m_flags |= ext4_es_is_delayed(&es) ? + EXT4_MAP_DELAYED : 0; retval = es.es_len - (map->m_lblk - es.es_lblk); if (retval > map->m_len) retval = map->m_len; @@ -599,12 +711,6 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode, if (!(flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN)) return retval; - /* - * Here we clear m_flags because after allocating an new extent, - * it will be set again. - */ - map->m_flags &= ~EXT4_MAP_FLAGS; - /* * New blocks allocate and/or writing to unwritten extent * will possibly result in updating i_data, so we take @@ -612,76 +718,7 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode, * with create == 1 flag. */ down_write(&EXT4_I(inode)->i_data_sem); - - /* - * We need to check for EXT4 here because migrate - * could have changed the inode type in between - */ - if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { - retval = ext4_ext_map_blocks(handle, inode, map, flags); - } else { - retval = ext4_ind_map_blocks(handle, inode, map, flags); - - if (retval > 0 && map->m_flags & EXT4_MAP_NEW) { - /* - * We allocated new blocks which will result in - * i_data's format changing. Force the migrate - * to fail by clearing migrate flags - */ - ext4_clear_inode_state(inode, EXT4_STATE_EXT_MIGRATE); - } - } - - if (retval > 0) { - unsigned int status; - - if (unlikely(retval != map->m_len)) { - ext4_warning(inode->i_sb, - "ES len assertion failed for inode " - "%lu: retval %d != map->m_len %d", - inode->i_ino, retval, map->m_len); - WARN_ON(1); - } - - /* - * We have to zeroout blocks before inserting them into extent - * status tree. Otherwise someone could look them up there and - * use them before they are really zeroed. We also have to - * unmap metadata before zeroing as otherwise writeback can - * overwrite zeros with stale data from block device. - */ - if (flags & EXT4_GET_BLOCKS_ZERO && - map->m_flags & EXT4_MAP_MAPPED && - map->m_flags & EXT4_MAP_NEW) { - ret = ext4_issue_zeroout(inode, map->m_lblk, - map->m_pblk, map->m_len); - if (ret) { - retval = ret; - goto out_sem; - } - } - - /* - * If the extent has been zeroed out, we don't need to update - * extent status tree. - */ - if ((flags & EXT4_GET_BLOCKS_PRE_IO) && - ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es)) { - if (ext4_es_is_written(&es)) - goto out_sem; - } - status = map->m_flags & EXT4_MAP_UNWRITTEN ? 
- EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN; - if (!(flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) && - !(status & EXTENT_STATUS_WRITTEN) && - ext4_es_scan_range(inode, &ext4_es_is_delayed, map->m_lblk, - map->m_lblk + map->m_len - 1)) - status |= EXTENT_STATUS_DELAYED; - ext4_es_insert_extent(inode, map->m_lblk, map->m_len, - map->m_pblk, status); - } - -out_sem: + retval = ext4_map_create_blocks(handle, inode, map, flags); up_write((&EXT4_I(inode)->i_data_sem)); if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) { ret = check_block_validity(inode, map); @@ -756,6 +793,8 @@ static int _ext4_get_block(struct inode *inode, sector_t iblock, if (ext4_has_inline_data(inode)) return -ERANGE; + if (WARN_ON(ext4_test_inode_state(inode, EXT4_STATE_BUFFERED_IOMAP))) + return -EINVAL; map.m_lblk = iblock; map.m_len = bh->b_size >> inode->i_blkbits; @@ -1450,9 +1489,9 @@ static int ext4_journalled_write_end(struct file *file, } /* - * Reserve space for a single cluster + * Reserve space for 'nr_resv' clusters */ -static int ext4_da_reserve_space(struct inode *inode) +static int ext4_da_reserve_space(struct inode *inode, int nr_resv) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); struct ext4_inode_info *ei = EXT4_I(inode); @@ -1463,18 +1502,18 @@ static int ext4_da_reserve_space(struct inode *inode) * us from metadata over-estimation, though we may go over by * a small amount in the end. Here we just reserve for data. */ - ret = dquot_reserve_block(inode, EXT4_C2B(sbi, 1)); + ret = dquot_reserve_block(inode, EXT4_C2B(sbi, nr_resv)); if (ret) return ret; spin_lock(&ei->i_block_reservation_lock); - if (ext4_claim_free_clusters(sbi, 1, 0)) { + if (ext4_claim_free_clusters(sbi, nr_resv, 0)) { spin_unlock(&ei->i_block_reservation_lock); - dquot_release_reservation_block(inode, EXT4_C2B(sbi, 1)); + dquot_release_reservation_block(inode, EXT4_C2B(sbi, nr_resv)); return -ENOSPC; } - ei->i_reserved_data_blocks++; - trace_ext4_da_reserve_space(inode); + ei->i_reserved_data_blocks += nr_resv; + trace_ext4_da_reserve_space(inode, nr_resv); spin_unlock(&ei->i_block_reservation_lock); return 0; /* success */ @@ -1621,24 +1660,56 @@ static void ext4_print_free_blocks(struct inode *inode) } /* - * ext4_insert_delayed_block - adds a delayed block to the extents status - * tree, incrementing the reserved cluster/block - * count or making a pending reservation - * where needed + * Check whether the cluster containing lblk has been delayed or allocated; + * if not, it means we should reserve a cluster when adding delalloc, so + * return 1, otherwise return 0 or an error code. + */ +static int ext4_da_check_clu_allocated(struct inode *inode, ext4_lblk_t lblk, + bool *allocated) +{ + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + int ret; + + *allocated = false; + if (ext4_es_scan_clu(inode, &ext4_es_is_delonly, lblk)) + return 0; + + if (ext4_es_scan_clu(inode, &ext4_es_is_mapped, lblk)) + goto allocated; + + ret = ext4_clu_mapped(inode, EXT4_B2C(sbi, lblk)); + if (ret < 0) + return ret; + if (ret == 0) + return 1; +allocated: + *allocated = true; + return 0; +} + +/* + * ext4_insert_delayed_blocks - adds multiple delayed blocks to the extents + * status tree, incrementing the reserved + * cluster/block count or making pending + * reservations where needed * * @inode - file containing the newly added block - * @lblk - logical block to be added + * @lblk - start logical block to be added + * @len - length of blocks to be added * * Returns 0 on success, negative error code on failure.
*/ -static int ext4_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk) +static int ext4_insert_delayed_blocks(struct inode *inode, ext4_lblk_t lblk, + ext4_lblk_t len) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); - int ret; - bool allocated = false; + int resv_clu, ret; + bool lclu_allocated = false; + bool end_allocated = false; + ext4_lblk_t end = lblk + len - 1; /* - * If the cluster containing lblk is shared with a delayed, + * If the cluster containing lblk or end is shared with a delayed, * written, or unwritten extent in a bigalloc file system, it's * already been accounted for and does not need to be reserved. * A pending reservation must be made for the cluster if it's @@ -1649,31 +1720,38 @@ static int ext4_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk) * extents status tree doesn't get a match. */ if (sbi->s_cluster_ratio == 1) { - ret = ext4_da_reserve_space(inode); + ret = ext4_da_reserve_space(inode, len); if (ret != 0) /* ENOSPC */ return ret; } else { /* bigalloc */ - if (!ext4_es_scan_clu(inode, &ext4_es_is_delonly, lblk)) { - if (!ext4_es_scan_clu(inode, - &ext4_es_is_mapped, lblk)) { - ret = ext4_clu_mapped(inode, - EXT4_B2C(sbi, lblk)); - if (ret < 0) - return ret; - if (ret == 0) { - ret = ext4_da_reserve_space(inode); - if (ret != 0) /* ENOSPC */ - return ret; - } else { - allocated = true; - } - } else { - allocated = true; - } + resv_clu = EXT4_B2C(sbi, end) - EXT4_B2C(sbi, lblk) - 1; + if (resv_clu < 0) + resv_clu = 0; + + ret = ext4_da_check_clu_allocated(inode, lblk, &lclu_allocated); + if (ret < 0) + return ret; + if (ret > 0) + resv_clu++; + + if (EXT4_B2C(sbi, lblk) != EXT4_B2C(sbi, end)) { + ret = ext4_da_check_clu_allocated(inode, end, + &end_allocated); + if (ret < 0) + return ret; + if (ret > 0) + resv_clu++; + } + + if (resv_clu) { + ret = ext4_da_reserve_space(inode, resv_clu); + if (ret != 0) /* ENOSPC */ + return ret; } } - ext4_es_insert_delayed_block(inode, lblk, allocated); + ext4_es_insert_delayed_extent(inode, lblk, len, lclu_allocated, + end_allocated); return 0; } @@ -1681,52 +1759,41 @@ static int ext4_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk) * This function is grabs code from the very beginning of * ext4_map_blocks, but assumes that the caller is from delayed write * time. This function looks up the requested blocks and sets the - * buffer delay bit under the protection of i_data_sem. + * delalloc extent map under the protection of i_data_sem. */ -static int ext4_da_map_blocks(struct inode *inode, sector_t iblock, - struct ext4_map_blocks *map, - struct buffer_head *bh) +static int ext4_da_map_blocks(struct inode *inode, struct ext4_map_blocks *map) { struct extent_status es; int retval; - sector_t invalid_block = ~((sector_t) 0xffff); #ifdef ES_AGGRESSIVE_TEST struct ext4_map_blocks orig_map; memcpy(&orig_map, map, sizeof(*map)); #endif - if (invalid_block < ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es)) - invalid_block = ~0; - map->m_flags = 0; ext_debug(inode, "max_blocks %u, logical block %lu\n", map->m_len, (unsigned long) map->m_lblk); /* Lookup extent status tree firstly */ - if (ext4_es_lookup_extent(inode, iblock, NULL, &es)) { - if (ext4_es_is_hole(&es)) { - retval = 0; - down_read(&EXT4_I(inode)->i_data_sem); + if (ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es)) { + map->m_len = min_t(unsigned int, map->m_len, + es.es_len - (map->m_lblk - es.es_lblk)); + + if (ext4_es_is_hole(&es)) goto add_delayed; - } +found: /* * Delayed extent could be allocated by fallocate. 
* So we need to check it. */ - if (ext4_es_is_delayed(&es) && !ext4_es_is_unwritten(&es)) { - map_bh(bh, inode->i_sb, invalid_block); - set_buffer_new(bh); - set_buffer_delay(bh); + if (ext4_es_is_delonly(&es)) { + map->m_flags |= EXT4_MAP_DELAYED; return 0; } - map->m_pblk = ext4_es_pblock(&es) + iblock - es.es_lblk; - retval = es.es_len - (iblock - es.es_lblk); - if (retval > map->m_len) - retval = map->m_len; - map->m_len = retval; + map->m_pblk = ext4_es_pblock(&es) + map->m_lblk - es.es_lblk; if (ext4_es_is_written(&es)) map->m_flags |= EXT4_MAP_MAPPED; else if (ext4_es_is_unwritten(&es)) @@ -1737,7 +1804,7 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock, #ifdef ES_AGGRESSIVE_TEST ext4_map_blocks_es_recheck(NULL, inode, map, &orig_map, 0); #endif - return retval; + return 0; } /* @@ -1747,48 +1814,38 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock, down_read(&EXT4_I(inode)->i_data_sem); if (ext4_has_inline_data(inode)) retval = 0; - else if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) - retval = ext4_ext_map_blocks(NULL, inode, map, 0); else - retval = ext4_ind_map_blocks(NULL, inode, map, 0); + retval = ext4_map_query_blocks(NULL, inode, map); + up_read(&EXT4_I(inode)->i_data_sem); + if (retval) + return retval < 0 ? retval : 0; add_delayed: - if (retval == 0) { - int ret; - - /* - * XXX: __block_prepare_write() unmaps passed block, - * is it OK? - */ + down_write(&EXT4_I(inode)->i_data_sem); + /* + * Lookup the extents tree again under i_data_sem, make sure the + * delalloc range we are inserting hasn't been delayed or allocated + * without holding i_rwsem and the folio lock. + */ + if (ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es)) { + map->m_len = min_t(unsigned int, map->m_len, + es.es_len - (map->m_lblk - es.es_lblk)); - ret = ext4_insert_delayed_block(inode, map->m_lblk); - if (ret != 0) { - retval = ret; - goto out_unlock; + if (!ext4_es_is_hole(&es)) { + up_write(&EXT4_I(inode)->i_data_sem); + goto found; } - - map_bh(bh, inode->i_sb, invalid_block); - set_buffer_new(bh); - set_buffer_delay(bh); - } else if (retval > 0) { - unsigned int status; - - if (unlikely(retval != map->m_len)) { - ext4_warning(inode->i_sb, - "ES len assertion failed for inode " - "%lu: retval %d != map->m_len %d", - inode->i_ino, retval, map->m_len); - WARN_ON(1); + } else if (!ext4_has_inline_data(inode)) { + retval = ext4_map_query_blocks(NULL, inode, map); + if (retval) { + up_write(&EXT4_I(inode)->i_data_sem); + return retval < 0 ? retval : 0; } - - status = map->m_flags & EXT4_MAP_UNWRITTEN ? - EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN; - ext4_es_insert_extent(inode, map->m_lblk, map->m_len, - map->m_pblk, status); } -out_unlock: - up_read((&EXT4_I(inode)->i_data_sem)); + map->m_flags |= EXT4_MAP_DELAYED; + retval = ext4_insert_delayed_blocks(inode, map->m_lblk, map->m_len); + up_write(&EXT4_I(inode)->i_data_sem); return retval; } @@ -1809,11 +1866,15 @@ int ext4_da_get_block_prep(struct inode *inode, sector_t iblock, struct buffer_head *bh, int create) { struct ext4_map_blocks map; + sector_t invalid_block = ~((sector_t) 0xffff); int ret = 0; BUG_ON(create == 0); BUG_ON(bh->b_size != inode->i_sb->s_blocksize); + if (invalid_block < ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es)) + invalid_block = ~0; + map.m_lblk = iblock; map.m_len = 1; @@ -1822,10 +1883,17 @@ int ext4_da_get_block_prep(struct inode *inode, sector_t iblock, * preallocated blocks are unmapped but should treated * the same as allocated blocks.
*/ - ret = ext4_da_map_blocks(inode, iblock, &map, bh); - if (ret <= 0) + ret = ext4_da_map_blocks(inode, &map); + if (ret < 0) return ret; + if (map.m_flags & EXT4_MAP_DELAYED) { + map_bh(bh, inode->i_sb, invalid_block); + set_buffer_new(bh); + set_buffer_delay(bh); + return 0; + } + map_bh(bh, inode->i_sb, map.m_pblk); ext4_update_bh_state(bh, map.m_flags); @@ -2533,6 +2601,9 @@ static int ext4_do_writepages(struct mpage_da_data *mpd) trace_ext4_writepages(inode, wbc); + if (WARN_ON(ext4_test_inode_state(inode, EXT4_STATE_BUFFERED_IOMAP))) + return -EINVAL; + /* * No pages to write? This is mainly a kludge to avoid starting * a transaction for special inodes like journal inode on last iput() @@ -2824,7 +2895,7 @@ static int ext4_dax_writepages(struct address_space *mapping, return ret; } -static int ext4_nonda_switch(struct super_block *sb) +int ext4_nonda_switch(struct super_block *sb) { s64 free_clusters, dirty_clusters; struct ext4_sb_info *sbi = EXT4_SB(sb); @@ -3220,6 +3291,15 @@ static bool ext4_inode_datasync_dirty(struct inode *inode) return inode->i_state & I_DIRTY_DATASYNC; } +static bool ext4_iomap_valid(struct inode *inode, const struct iomap *iomap) +{ + return iomap->validity_cookie == READ_ONCE(EXT4_I(inode)->i_es_seq); +} + +static const struct iomap_folio_ops ext4_iomap_folio_ops = { + .iomap_valid = ext4_iomap_valid, +}; + static void ext4_set_iomap(struct inode *inode, struct iomap *iomap, struct ext4_map_blocks *map, loff_t offset, loff_t length, unsigned int flags) @@ -3232,9 +3312,13 @@ static void ext4_set_iomap(struct inode *inode, struct iomap *iomap, * there is no other metadata changes being made or are pending. */ iomap->flags = 0; - if (ext4_inode_datasync_dirty(inode) || - offset + length > i_size_read(inode)) - iomap->flags |= IOMAP_F_DIRTY; + if ((flags & (IOMAP_DAX | IOMAP_REPORT)) || + ((flags & (IOMAP_WRITE | IOMAP_DIRECT)) == + (IOMAP_WRITE | IOMAP_DIRECT))) { + if (offset + length > i_size_read(inode) || + ext4_inode_datasync_dirty(inode)) + iomap->flags |= IOMAP_F_DIRTY; + } if (map->m_flags & EXT4_MAP_NEW) iomap->flags |= IOMAP_F_NEW; @@ -3250,6 +3334,9 @@ static void ext4_set_iomap(struct inode *inode, struct iomap *iomap, !ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) iomap->flags |= IOMAP_F_MERGED; + iomap->validity_cookie = READ_ONCE(EXT4_I(inode)->i_es_seq); + iomap->folio_ops = &ext4_iomap_folio_ops; + /* * Flags passed to ext4_map_blocks() for direct I/O writes can result * in m_flags having both EXT4_MAP_MAPPED and EXT4_MAP_UNWRITTEN bits @@ -3269,6 +3356,9 @@ static void ext4_set_iomap(struct inode *inode, struct iomap *iomap, iomap->addr = (u64) map->m_pblk << blkbits; if (flags & IOMAP_DAX) iomap->addr += EXT4_SB(inode->i_sb)->s_dax_part_off; + } else if (map->m_flags & EXT4_MAP_DELAYED) { + iomap->type = IOMAP_DELALLOC; + iomap->addr = IOMAP_NULL_ADDR; } else { iomap->type = IOMAP_HOLE; iomap->addr = IOMAP_NULL_ADDR; @@ -3431,35 +3521,11 @@ const struct iomap_ops ext4_iomap_overwrite_ops = { .iomap_end = ext4_iomap_end, }; -static bool ext4_iomap_is_delalloc(struct inode *inode, - struct ext4_map_blocks *map) -{ - struct extent_status es; - ext4_lblk_t offset = 0, end = map->m_lblk + map->m_len - 1; - - ext4_es_find_extent_range(inode, &ext4_es_is_delayed, - map->m_lblk, end, &es); - - if (!es.es_len || es.es_lblk > end) - return false; - - if (es.es_lblk > map->m_lblk) { - map->m_len = es.es_lblk - map->m_lblk; - return false; - } - - offset = map->m_lblk - es.es_lblk; - map->m_len = es.es_len - offset; - - return true; -} - static 
int ext4_iomap_begin_report(struct inode *inode, loff_t offset, loff_t length, unsigned int flags, struct iomap *iomap, struct iomap *srcmap) { int ret; - bool delalloc = false; struct ext4_map_blocks map; u8 blkbits = inode->i_blkbits; @@ -3500,13 +3566,8 @@ static int ext4_iomap_begin_report(struct inode *inode, loff_t offset, ret = ext4_map_blocks(NULL, inode, &map, 0); if (ret < 0) return ret; - if (ret == 0) - delalloc = ext4_iomap_is_delalloc(inode, &map); - set_iomap: ext4_set_iomap(inode, iomap, &map, offset, length, flags); - if (delalloc && iomap->type == IOMAP_HOLE) - iomap->type = IOMAP_DELALLOC; return 0; } @@ -3515,6 +3576,385 @@ const struct iomap_ops ext4_iomap_report_ops = { .iomap_begin = ext4_iomap_begin_report, }; +static int ext4_iomap_get_blocks(struct inode *inode, + struct ext4_map_blocks *map) +{ + handle_t *handle; + int ret, needed_blocks; + + /* + * Reserve one block more for addition to orphan list in case + * we allocate blocks but write fails for some reason. + */ + needed_blocks = ext4_writepage_trans_blocks(inode) + 1; + handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, needed_blocks); + if (IS_ERR(handle)) + return PTR_ERR(handle); + + ret = ext4_map_blocks(handle, inode, map, + EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT); + /* + * Have to stop the journal here since there is a potential deadlock + * caused by the later balance_dirty_pages(), which might wait on + * dirty pages to be written back and in turn start another + * handle and wait for this handle to stop. + */ + ext4_journal_stop(handle); + + return ret; +} + +#define IOMAP_F_EXT4_DELALLOC IOMAP_F_PRIVATE + +static int __ext4_iomap_buffered_io_begin(struct inode *inode, loff_t offset, + loff_t length, unsigned int iomap_flags, + struct iomap *iomap, struct iomap *srcmap, + bool delalloc) +{ + int ret, retries = 0; + struct ext4_map_blocks map; + u8 blkbits = inode->i_blkbits; + + if (unlikely(ext4_forced_shutdown(inode->i_sb))) + return -EIO; + if ((offset >> blkbits) > EXT4_MAX_LOGICAL_BLOCK) + return -EINVAL; + if (WARN_ON_ONCE(ext4_has_inline_data(inode))) + return -ERANGE; +retry: + /* Calculate the first and last logical blocks respectively. */ + map.m_lblk = offset >> blkbits; + map.m_len = min_t(loff_t, (offset + length - 1) >> blkbits, + EXT4_MAX_LOGICAL_BLOCK) - map.m_lblk + 1; + if (iomap_flags & IOMAP_WRITE) { + if (delalloc) + ret = ext4_da_map_blocks(inode, &map); + else + ret = ext4_iomap_get_blocks(inode, &map); + + if (ret == -ENOSPC && + ext4_should_retry_alloc(inode->i_sb, &retries)) + goto retry; + } else { + ret = ext4_map_blocks(NULL, inode, &map, 0); + } + if (ret < 0) + return ret; + + ext4_set_iomap(inode, iomap, &map, offset, length, iomap_flags); + if (delalloc) + iomap->flags |= IOMAP_F_EXT4_DELALLOC; + + return 0; +} + +static inline int ext4_iomap_buffered_io_begin(struct inode *inode, + loff_t offset, loff_t length, unsigned int flags, + struct iomap *iomap, struct iomap *srcmap) +{ + return __ext4_iomap_buffered_io_begin(inode, offset, length, flags, + iomap, srcmap, false); +} + +static inline int ext4_iomap_buffered_da_write_begin(struct inode *inode, + loff_t offset, loff_t length, unsigned int flags, + struct iomap *iomap, struct iomap *srcmap) +{ + return __ext4_iomap_buffered_io_begin(inode, offset, length, flags, + iomap, srcmap, true); +} + +/* + * Drop the stale delayed allocation range from the write failure, + * including both start and end blocks.
If not, we could leave a range + * of delayed extents covered by a clean folio, it could lead to + * inaccurate space reservation. + */ +static int ext4_iomap_punch_delalloc(struct inode *inode, loff_t offset, + loff_t length) +{ + ext4_es_remove_extent(inode, offset >> inode->i_blkbits, + DIV_ROUND_UP_ULL(length, EXT4_BLOCK_SIZE(inode->i_sb))); + return 0; +} + +static int ext4_iomap_buffered_write_end(struct inode *inode, loff_t offset, + loff_t length, ssize_t written, + unsigned int flags, + struct iomap *iomap) +{ + handle_t *handle; + loff_t end; + int ret = 0, ret2; + + /* delalloc */ + if (iomap->flags & IOMAP_F_EXT4_DELALLOC) { + ret = iomap_file_buffered_write_punch_delalloc(inode, iomap, + offset, length, written, ext4_iomap_punch_delalloc); + if (ret) + ext4_warning(inode->i_sb, + "Failed to clean up delalloc for inode %lu, %d", + inode->i_ino, ret); + return ret; + } + + /* nodelalloc */ + end = offset + length; + if (!(iomap->flags & IOMAP_F_SIZE_CHANGED) && end <= inode->i_size) + return 0; + + handle = ext4_journal_start(inode, EXT4_HT_INODE, 2); + if (IS_ERR(handle)) + return PTR_ERR(handle); + + if (iomap->flags & IOMAP_F_SIZE_CHANGED) { + ext4_update_i_disksize(inode, inode->i_size); + ret = ext4_mark_inode_dirty(handle, inode); + } + + /* + * If we have allocated more blocks and copied less. + * We will have blocks allocated outside inode->i_size, + * so truncate them. + */ + if (end > inode->i_size) + ext4_orphan_add(handle, inode); + + ret2 = ext4_journal_stop(handle); + ret = ret ? : ret2; + + if (end > inode->i_size) { + ext4_truncate_failed_write(inode); + /* + * If truncate failed early the inode might still be + * on the orphan list; we need to make sure the inode + * is removed from the orphan list in that case. + */ + if (inode->i_nlink) + ext4_orphan_del(NULL, inode); + } + + return ret; +} + + +const struct iomap_ops ext4_iomap_buffered_write_ops = { + .iomap_begin = ext4_iomap_buffered_io_begin, + .iomap_end = ext4_iomap_buffered_write_end, +}; + +const struct iomap_ops ext4_iomap_buffered_da_write_ops = { + .iomap_begin = ext4_iomap_buffered_da_write_begin, + .iomap_end = ext4_iomap_buffered_write_end, +}; + +const struct iomap_ops ext4_iomap_buffered_read_ops = { + .iomap_begin = ext4_iomap_buffered_io_begin, +}; + +static int ext4_iomap_read_folio(struct file *file, struct folio *folio) +{ + return iomap_read_folio(folio, &ext4_iomap_buffered_read_ops); +} + +static void ext4_iomap_readahead(struct readahead_control *rac) +{ + iomap_readahead(rac, &ext4_iomap_buffered_read_ops); +} + +struct ext4_writeback_ctx { + struct iomap_writepage_ctx ctx; + struct writeback_control *wbc; + unsigned int data_seq; +}; + +static int ext4_iomap_map_one_extent(struct inode *inode, + struct ext4_map_blocks *map) +{ + struct extent_status es; + handle_t *handle = NULL; + int credits, map_flags; + int retval; + + credits = ext4_da_writepages_trans_blocks(inode); + handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, credits); + if (IS_ERR(handle)) + return PTR_ERR(handle); + + map->m_flags = 0; + /* + * In order to protect from the race of truncate, we have to lookup + * extent stats and map blocks under i_data_sem, otherwise the + * delalloc extent could be stale. 
+ */ + down_write(&EXT4_I(inode)->i_data_sem); + if (likely(ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es))) { + retval = es.es_len - (map->m_lblk - es.es_lblk); + map->m_len = min_t(unsigned int, retval, map->m_len); + + if (likely(ext4_es_is_delonly(&es))) { + trace_ext4_da_write_pages_extent(inode, map); + /* + * Call ext4_map_create_blocks() to allocate any delayed + * allocation blocks. It is possible that we're going to + * need more metadata blocks, however we must not fail + * because we're in writeback and there is nothing we + * can do so it might result in data loss. So use + * reserved blocks to allocate metadata if possible. + * + * We pass in the magic EXT4_GET_BLOCKS_DELALLOC_RESERVE + * indicates that the blocks and quotas has already been + * checked when the data was copied into the page cache. + */ + map_flags = EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT | + EXT4_GET_BLOCKS_METADATA_NOFAIL | + EXT4_GET_BLOCKS_DELALLOC_RESERVE; + + retval = ext4_map_create_blocks(handle, inode, map, + map_flags); + } + if (ext4_es_is_written(&es) || ext4_es_is_unwritten(&es)) { + map->m_pblk = ext4_es_pblock(&es) + map->m_lblk - + es.es_lblk; + map->m_flags = ext4_es_is_written(&es) ? + EXT4_MAP_MAPPED : EXT4_MAP_UNWRITTEN; + } + } else { + retval = ext4_map_query_blocks(handle, inode, map); + } + + up_write(&EXT4_I(inode)->i_data_sem); + ext4_journal_stop(handle); + return retval < 0 ? retval : 0; +} + +static int ext4_iomap_map_blocks(struct iomap_writepage_ctx *wpc, + struct inode *inode, loff_t offset, + unsigned int dirty_len) +{ + struct ext4_writeback_ctx *ewpc = + container_of(wpc, struct ext4_writeback_ctx, ctx); + struct super_block *sb = inode->i_sb; + struct journal_s *journal = EXT4_SB(sb)->s_journal; + struct ext4_inode_info *ei = EXT4_I(inode); + struct ext4_map_blocks map; + unsigned int blkbits = inode->i_blkbits; + unsigned int index = offset >> blkbits; + unsigned int end, len; + int ret; + + if (unlikely(ext4_forced_shutdown(inode->i_sb))) + return -EIO; + + /* Check validity of the cached writeback mapping. */ + if (offset >= wpc->iomap.offset && + offset < wpc->iomap.offset + wpc->iomap.length && + ewpc->data_seq == READ_ONCE(ei->i_es_seq)) + return 0; + + end = min_t(unsigned int, + (ewpc->wbc->range_end >> blkbits), (UINT_MAX - 1)); + len = (end > index + dirty_len) ? end - index + 1 : dirty_len; + +retry: + map.m_lblk = index; + map.m_len = min_t(unsigned int, MAX_WRITEPAGES_EXTENT_LEN, len); + ret = ext4_map_blocks(NULL, inode, &map, 0); + if (ret < 0) + return ret; + + /* + * The map isn't a delalloc extent, it must be a hole or have + * already been allocated. + */ + if (!(map.m_flags & EXT4_MAP_DELAYED)) + goto out; + + /* Map one delalloc extent. */ + ret = ext4_iomap_map_one_extent(inode, &map); + if (ret < 0) { + if (ext4_forced_shutdown(sb)) + return ret; + + /* + * Retry transient ENOSPC errors, if + * ext4_count_free_blocks() is non-zero, a commit + * should free up blocks. + */ + if (ret == -ENOSPC && ext4_count_free_clusters(sb)) { + jbd2_journal_force_commit_nested(journal); + goto retry; + } + + ext4_msg(sb, KERN_CRIT, + "Delayed block allocation failed for " + "inode %lu at logical offset %llu with " + "max blocks %u with error %d", + inode->i_ino, (unsigned long long)map.m_lblk, + (unsigned int)map.m_len, -ret); + ext4_msg(sb, KERN_CRIT, + "This should not happen!! 
Data will " + "be lost\n"); + if (ret == -ENOSPC) + ext4_print_free_blocks(inode); + return ret; + } +out: + ewpc->data_seq = READ_ONCE(ei->i_es_seq); + ext4_set_iomap(inode, &wpc->iomap, &map, offset, + map.m_len << blkbits, 0); + return 0; +} + +static int ext4_iomap_prepare_ioend(struct iomap_ioend *ioend, int status) +{ + struct ext4_inode_info *ei = EXT4_I(ioend->io_inode); + + /* Need to convert unwritten extents when I/Os are completed. */ + if (ioend->io_type == IOMAP_UNWRITTEN || + ioend->io_offset + ioend->io_size > READ_ONCE(ei->i_disksize)) + ioend->io_bio.bi_end_io = ext4_iomap_end_bio; + + return status; +} + +static void ext4_iomap_discard_folio(struct folio *folio, loff_t pos) +{ + struct inode *inode = folio->mapping->host; + + ext4_iomap_punch_delalloc(inode, pos, + folio_pos(folio) + folio_size(folio) - pos); +} + +static const struct iomap_writeback_ops ext4_writeback_ops = { + .map_blocks = ext4_iomap_map_blocks, + .prepare_ioend = ext4_iomap_prepare_ioend, + .discard_folio = ext4_iomap_discard_folio, +}; + +static int ext4_iomap_writepages(struct address_space *mapping, + struct writeback_control *wbc) +{ + struct inode *inode = mapping->host; + struct super_block *sb = inode->i_sb; + long nr = wbc->nr_to_write; + int alloc_ctx, ret; + struct ext4_writeback_ctx ewpc = { + .wbc = wbc, + }; + + if (unlikely(ext4_forced_shutdown(sb))) + return -EIO; + + alloc_ctx = ext4_writepages_down_read(sb); + trace_ext4_writepages(inode, wbc); + ret = iomap_writepages(mapping, wbc, &ewpc.ctx, &ext4_writeback_ops); + trace_ext4_writepages_result(inode, wbc, ret, nr - wbc->nr_to_write); + ext4_writepages_up_read(sb, alloc_ctx); + + return ret; +} + /* * For data=journal mode, folio should be marked dirty only when it was * writeably mapped. When that happens, it was already attached to the @@ -3604,6 +4044,21 @@ static const struct address_space_operations ext4_da_aops = { .swap_activate = ext4_iomap_swap_activate, }; +static const struct address_space_operations ext4_iomap_aops = { + .read_folio = ext4_iomap_read_folio, + .readahead = ext4_iomap_readahead, + .writepages = ext4_iomap_writepages, + .dirty_folio = iomap_dirty_folio, + .bmap = ext4_bmap, + .invalidate_folio = iomap_invalidate_folio, + .release_folio = iomap_release_folio, + .direct_IO = noop_direct_IO, + .migrate_folio = filemap_migrate_folio, + .is_partially_uptodate = iomap_is_partially_uptodate, + .error_remove_page = generic_error_remove_page, + .swap_activate = ext4_iomap_swap_activate, +}; + static const struct address_space_operations ext4_dax_aops = { .writepages = ext4_dax_writepages, .direct_IO = noop_direct_IO, @@ -3626,6 +4081,8 @@ void ext4_set_aops(struct inode *inode) } if (IS_DAX(inode)) inode->i_mapping->a_ops = &ext4_dax_aops; + else if (ext4_test_inode_state(inode, EXT4_STATE_BUFFERED_IOMAP)) + inode->i_mapping->a_ops = &ext4_iomap_aops; else if (test_opt(inode->i_sb, DELALLOC)) inode->i_mapping->a_ops = &ext4_da_aops; else @@ -3727,6 +4184,13 @@ static int __ext4_block_zero_page_range(handle_t *handle, return err; } +static int ext4_iomap_zero_range(struct inode *inode, + loff_t from, loff_t length) +{ + return iomap_zero_range(inode, from, length, NULL, + &ext4_iomap_buffered_read_ops); +} + /* * ext4_block_zero_page_range() zeros out a mapping of length 'length' * starting from file offset 'from'. 
The range to be zero'd must @@ -3752,6 +4216,8 @@ static int ext4_block_zero_page_range(handle_t *handle, if (IS_DAX(inode)) { return dax_zero_range(inode, from, length, NULL, &ext4_iomap_ops); + } else if (ext4_test_inode_state(inode, EXT4_STATE_BUFFERED_IOMAP)) { + return ext4_iomap_zero_range(inode, from, length); } return __ext4_block_zero_page_range(handle, mapping, from, length); } @@ -4012,12 +4478,12 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length) /* If there are blocks to remove, do it */ if (stop_block > first_block) { + ext4_lblk_t hole_len = stop_block - first_block; down_write(&EXT4_I(inode)->i_data_sem); ext4_discard_preallocations(inode, 0); - ext4_es_remove_extent(inode, first_block, - stop_block - first_block); + ext4_es_remove_extent(inode, first_block, hole_len); if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) ret = ext4_ext_remove_space(inode, first_block, @@ -4026,6 +4492,8 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length) ret = ext4_ind_remove_space(handle, inode, first_block, stop_block); + ext4_es_insert_extent(inode, first_block, hole_len, ~0, + EXTENT_STATUS_HOLE); up_write(&EXT4_I(inode)->i_data_sem); } ext4_fc_track_range(handle, inode, first_block, stop_block); @@ -4679,6 +5147,32 @@ static const char *check_igot_inode(struct inode *inode, ext4_iget_flags flags) return NULL; } +bool ext4_should_use_buffered_iomap(struct inode *inode) +{ + struct super_block *sb = inode->i_sb; + + if (!test_opt2(sb, BUFFERED_IOMAP)) + return false; + if (ext4_has_feature_inline_data(sb)) + return false; + if (ext4_has_feature_verity(sb)) + return false; + if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) + return false; + if (!S_ISREG(inode->i_mode)) + return false; + if (IS_DAX(inode)) + return false; + if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) + return false; + if (ext4_test_inode_flag(inode, EXT4_INODE_EA_INODE)) + return false; + if (ext4_test_inode_flag(inode, EXT4_INODE_ENCRYPT)) + return false; + + return true; +} + struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, ext4_iget_flags flags, const char *function, unsigned int line) @@ -4943,6 +5437,11 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, if (ret) goto bad_inode; + if (ext4_should_use_buffered_iomap(inode)) { + ext4_set_inode_state(inode, EXT4_STATE_BUFFERED_IOMAP); + mapping_set_large_folios(inode->i_mapping); + } + if (S_ISREG(inode->i_mode)) { inode->i_op = &ext4_file_inode_operations; inode->i_fop = &ext4_file_operations; @@ -6064,6 +6563,26 @@ static int ext4_bh_unmapped(handle_t *handle, struct inode *inode, return !buffer_mapped(bh); } +static vm_fault_t ext4_iomap_page_mkwrite(struct vm_fault *vmf) +{ + struct inode *inode = file_inode(vmf->vma->vm_file); + const struct iomap_ops *iomap_ops; + + /* + * ext4_nonda_switch() could writeback this folio, so have to + * call it before lock folio. + * + * TODO: drop ext4_nonda_switch() after reserving enough sapce + * for metadata and merge delalloc and nodelalloc operations. 
+ */ + if (test_opt(inode->i_sb, DELALLOC) && !ext4_nonda_switch(inode->i_sb)) + iomap_ops = &ext4_iomap_buffered_da_write_ops; + else + iomap_ops = &ext4_iomap_buffered_write_ops; + + return iomap_page_mkwrite(vmf, iomap_ops); +} + vm_fault_t ext4_page_mkwrite(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; @@ -6087,6 +6606,11 @@ vm_fault_t ext4_page_mkwrite(struct vm_fault *vmf) filemap_invalidate_lock_shared(mapping); + if (ext4_test_inode_state(inode, EXT4_STATE_BUFFERED_IOMAP)) { + ret = ext4_iomap_page_mkwrite(vmf); + goto out; + } + err = ext4_convert_inline_data(inode); if (err) goto out_ret; diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c index e6976716e85d4d5e957f1293cdbe5619ca4eebd6..f58c0cb4dd761b05dfd50b2c4137eea9bbe77245 100644 --- a/fs/ext4/move_extent.c +++ b/fs/ext4/move_extent.c @@ -539,6 +539,35 @@ mext_check_arguments(struct inode *orig_inode, return 0; } +/* + * Disable the buffered iomap path for inodes that require moving extents, + * and fall back to the buffer_head path. + */ +static int ext4_disable_buffered_iomap_aops(struct inode *inode) +{ + int err; + + /* + * The buffer_head aops don't know how to handle folios + * dirtied by iomap, so before falling back, flush all dirty + * folios the inode has. + */ + filemap_invalidate_lock(inode->i_mapping); + err = filemap_write_and_wait(inode->i_mapping); + if (err < 0) { + filemap_invalidate_unlock(inode->i_mapping); + return err; + } + truncate_inode_pages(inode->i_mapping, 0); + + ext4_clear_inode_state(inode, EXT4_STATE_BUFFERED_IOMAP); + mapping_clear_large_folios(inode->i_mapping); + ext4_set_aops(inode); + filemap_invalidate_unlock(inode->i_mapping); + + return 0; +} + /** * ext4_move_extents - Exchange the specified range of a file * @@ -610,6 +639,12 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, __u64 orig_blk, inode_dio_wait(orig_inode); inode_dio_wait(donor_inode); + /* Fall back to buffer_head aops for inodes with buffered iomap aops */ + if (ext4_test_inode_state(orig_inode, EXT4_STATE_BUFFERED_IOMAP)) + ext4_disable_buffered_iomap_aops(orig_inode); + if (ext4_test_inode_state(donor_inode, EXT4_STATE_BUFFERED_IOMAP)) + ext4_disable_buffered_iomap_aops(donor_inode); + /* Protect extent tree against block allocations via delalloc */ ext4_double_down_write_data_sem(orig_inode, donor_inode); /* Check the filesystem environment whether move_extent can be done */ diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index dfdd7e5cf0389088a70ed549ce744496dcd83830..a499208500e4ba8a40e4ec4895c84cdff11f0464 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include @@ -565,3 +566,109 @@ int ext4_bio_write_folio(struct ext4_io_submit *io, struct folio *folio, return 0; } + +static void ext4_iomap_finish_ioend(struct iomap_ioend *ioend) +{ + struct inode *inode = ioend->io_inode; + struct ext4_inode_info *ei = EXT4_I(inode); + loff_t pos = ioend->io_offset; + size_t size = ioend->io_size; + loff_t new_disksize; + handle_t *handle; + int credits; + int ret, err; + + ret = blk_status_to_errno(ioend->io_bio.bi_status); + if (unlikely(ret)) + goto out; + + /* + * We may need to convert up to one extent per block in + * the page and we may dirty the inode.
+ */ + credits = ext4_chunk_trans_blocks(inode, + EXT4_MAX_BLOCKS(size, pos, inode->i_blkbits)); + handle = ext4_journal_start(inode, EXT4_HT_EXT_CONVERT, credits); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + goto out_err; + } + + if (ioend->io_type == IOMAP_UNWRITTEN) { + ret = ext4_convert_unwritten_extents(handle, inode, pos, size); + if (ret) + goto out_journal; + } + + /* + * Update on-disk size after IO is completed. Races with + * truncate are avoided by checking i_size under i_data_sem. + */ + new_disksize = pos + size; + if (new_disksize > READ_ONCE(ei->i_disksize)) { + down_write(&ei->i_data_sem); + new_disksize = min(new_disksize, i_size_read(inode)); + if (new_disksize > ei->i_disksize) + ei->i_disksize = new_disksize; + up_write(&ei->i_data_sem); + ret = ext4_mark_inode_dirty(handle, inode); + if (ret) + EXT4_ERROR_INODE_ERR(inode, -ret, + "Failed to mark inode dirty"); + } + +out_journal: + err = ext4_journal_stop(handle); + if (!ret) + ret = err; +out_err: + if (ret < 0 && !ext4_forced_shutdown(inode->i_sb)) { + ext4_msg(inode->i_sb, KERN_EMERG, + "failed to convert unwritten extents to " + "written extents or update inode size -- " + "potential data loss! (inode %lu, error %d)", + inode->i_ino, ret); + } +out: + iomap_finish_ioends(ioend, ret); +} + +/* + * Work on buffered iomap completed IO, to convert unwritten extents to + * mapped extents + */ +void ext4_iomap_end_io(struct work_struct *work) +{ + struct ext4_inode_info *ei = container_of(work, struct ext4_inode_info, + i_iomap_ioend_work); + struct iomap_ioend *ioend; + struct list_head ioend_list; + unsigned long flags; + + spin_lock_irqsave(&ei->i_completed_io_lock, flags); + list_replace_init(&ei->i_iomap_ioend_list, &ioend_list); + spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); + + iomap_sort_ioends(&ioend_list); + while (!list_empty(&ioend_list)) { + ioend = list_entry(ioend_list.next, struct iomap_ioend, io_list); + list_del_init(&ioend->io_list); + iomap_ioend_try_merge(ioend, &ioend_list); + ext4_iomap_finish_ioend(ioend); + } +} + +void ext4_iomap_end_bio(struct bio *bio) +{ + struct iomap_ioend *ioend = iomap_ioend_from_bio(bio); + struct ext4_inode_info *ei = EXT4_I(ioend->io_inode); + struct ext4_sb_info *sbi = EXT4_SB(ioend->io_inode->i_sb); + unsigned long flags; + + /* Only reserved conversions from writeback should enter here */ + spin_lock_irqsave(&ei->i_completed_io_lock, flags); + if (list_empty(&ei->i_iomap_ioend_list)) + queue_work(sbi->rsv_conversion_wq, &ei->i_iomap_ioend_work); + list_add_tail(&ioend->io_list, &ei->i_iomap_ioend_list); + spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); +} diff --git a/fs/ext4/super.c b/fs/ext4/super.c index eb6e68cfe1f52090714d086ff96002e06ec2a90e..e117ff52cf239196b36f3cb4ba799d53969b4c75 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1474,6 +1474,7 @@ static struct inode *ext4_alloc_inode(struct super_block *sb) ei->i_es_all_nr = 0; ei->i_es_shk_nr = 0; ei->i_es_shrink_lblk = 0; + ei->i_es_seq = 0; ei->i_reserved_data_blocks = 0; spin_lock_init(&(ei->i_block_reservation_lock)); ext4_init_pending_tree(&ei->i_pending_tree); @@ -1483,11 +1484,13 @@ static struct inode *ext4_alloc_inode(struct super_block *sb) #endif ei->jinode = NULL; INIT_LIST_HEAD(&ei->i_rsv_conversion_list); + INIT_LIST_HEAD(&ei->i_iomap_ioend_list); spin_lock_init(&ei->i_completed_io_lock); ei->i_sync_tid = 0; ei->i_datasync_tid = 0; atomic_set(&ei->i_unwritten, 0); INIT_WORK(&ei->i_rsv_conversion_work, ext4_end_io_rsv_work); + 
INIT_WORK(&ei->i_iomap_ioend_work, ext4_iomap_end_io); ext4_fc_init_inode(&ei->vfs_inode); mutex_init(&ei->i_fc_lock); return &ei->vfs_inode; @@ -1736,6 +1739,7 @@ enum { Opt_discard, Opt_nodiscard, Opt_init_itable, Opt_noinit_itable, Opt_max_dir_size_kb, Opt_nojournal_checksum, Opt_nombcache, Opt_no_prefetch_block_bitmaps, Opt_mb_optimize_scan, + Opt_buffered_iomap, Opt_nobuffered_iomap, Opt_errors, Opt_data, Opt_data_err, Opt_jqfmt, Opt_dax_type, #ifdef CONFIG_EXT4_DEBUG Opt_fc_debug_max_replay, Opt_fc_debug_force @@ -1878,6 +1882,9 @@ static const struct fs_parameter_spec ext4_param_specs[] = { fsparam_flag ("no_prefetch_block_bitmaps", Opt_no_prefetch_block_bitmaps), fsparam_s32 ("mb_optimize_scan", Opt_mb_optimize_scan), + fsparam_flag ("buffered_iomap", Opt_buffered_iomap), + fsparam_flag ("nobuffered_iomap", Opt_nobuffered_iomap), + fsparam_u32 ("buffered_iomap", Opt_buffered_iomap), fsparam_string ("check", Opt_removed), /* mount option from ext2/3 */ fsparam_flag ("nocheck", Opt_removed), /* mount option from ext2/3 */ fsparam_flag ("reservation", Opt_removed), /* mount option from ext2/3 */ @@ -1972,6 +1979,10 @@ static const struct mount_opts { {Opt_nombcache, EXT4_MOUNT_NO_MBCACHE, MOPT_SET}, {Opt_no_prefetch_block_bitmaps, EXT4_MOUNT_NO_PREFETCH_BLOCK_BITMAPS, MOPT_SET}, + {Opt_buffered_iomap, EXT4_MOUNT2_BUFFERED_IOMAP, + MOPT_SET | MOPT_2 | MOPT_EXT4_ONLY}, + {Opt_nobuffered_iomap, EXT4_MOUNT2_BUFFERED_IOMAP, + MOPT_CLEAR | MOPT_2 | MOPT_EXT4_ONLY}, #ifdef CONFIG_EXT4_DEBUG {Opt_fc_debug_force, EXT4_MOUNT2_JOURNAL_FAST_COMMIT, MOPT_SET | MOPT_2 | MOPT_EXT4_ONLY}, @@ -2897,6 +2908,13 @@ static int ext4_check_opt_consistency(struct fs_context *fc, !(sbi->s_mount_opt2 & EXT4_MOUNT2_DAX_INODE))) { goto fail_dax_change_remount; } + + if ((ctx->mask_s_mount_opt2 & EXT4_MOUNT2_BUFFERED_IOMAP) && + (ctx_test_mount_opt2(ctx, EXT4_MOUNT2_BUFFERED_IOMAP) != + test_opt2(sb, BUFFERED_IOMAP))) { + ext4_msg(NULL, KERN_ERR, "can't change buffered IO mode on remount"); + return -EINVAL; + } } return ext4_check_quota_consistency(fc, sb); @@ -4464,6 +4482,9 @@ static void ext4_set_def_opts(struct super_block *sb, if (sb->s_blocksize == PAGE_SIZE) set_opt(sb, DIOREAD_NOLOCK); + + /* Use iomap for buffered IO path instead of buffer_head */ + set_opt2(sb, BUFFERED_IOMAP); } static int ext4_handle_clustersize(struct super_block *sb) diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index ef7017fb69512c589c91f3e78ae7e758550b0006..8e8af99b3dad8ae41ad9375bd81911e6c1de7bfc 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c @@ -2462,7 +2462,7 @@ int __gfs2_punch_hole(struct file *file, loff_t offset, loff_t length) } static int gfs2_map_blocks(struct iomap_writepage_ctx *wpc, struct inode *inode, - loff_t offset) + loff_t offset, unsigned int len) { int ret; diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index 2bc0aa23fde3b940427b9c32533fc54f6c075e4d..fee0bb9b5d7583c6118986d732a2cac496db1f3a 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2010 Red Hat, Inc. - * Copyright (C) 2016-2019 Christoph Hellwig. + * Copyright (C) 2016-2023 Christoph Hellwig. 
*/ #include #include @@ -93,6 +93,44 @@ static inline bool ifs_block_is_dirty(struct folio *folio, return test_bit(block + blks_per_folio, ifs->state); } +static unsigned ifs_find_dirty_range(struct folio *folio, + struct iomap_folio_state *ifs, u64 *range_start, u64 range_end) +{ + struct inode *inode = folio->mapping->host; + unsigned start_blk = + offset_in_folio(folio, *range_start) >> inode->i_blkbits; + unsigned end_blk = min_not_zero( + offset_in_folio(folio, range_end) >> inode->i_blkbits, + i_blocks_per_folio(inode, folio)); + unsigned nblks = 1; + + while (!ifs_block_is_dirty(folio, ifs, start_blk)) + if (++start_blk == end_blk) + return 0; + + while (start_blk + nblks < end_blk) { + if (!ifs_block_is_dirty(folio, ifs, start_blk + nblks)) + break; + nblks++; + } + + *range_start = folio_pos(folio) + (start_blk << inode->i_blkbits); + return nblks << inode->i_blkbits; +} + +static unsigned iomap_find_dirty_range(struct folio *folio, u64 *range_start, + u64 range_end) +{ + struct iomap_folio_state *ifs = folio->private; + + if (*range_start >= range_end) + return 0; + + if (ifs) + return ifs_find_dirty_range(folio, ifs, range_start, range_end); + return range_end - *range_start; +} + static void ifs_clear_range_dirty(struct folio *folio, struct iomap_folio_state *ifs, size_t off, size_t len) { @@ -783,12 +821,11 @@ static int iomap_write_begin(struct iomap_iter *iter, loff_t pos, out_unlock: __iomap_put_folio(iter, pos, 0, folio); - iomap_write_failed(iter->inode, pos, len); return status; } -static size_t __iomap_write_end(struct inode *inode, loff_t pos, size_t len, +static bool __iomap_write_end(struct inode *inode, loff_t pos, size_t len, size_t copied, struct folio *folio) { flush_dcache_folio(folio); @@ -805,14 +842,14 @@ static size_t __iomap_write_end(struct inode *inode, loff_t pos, size_t len, * redo the whole thing. */ if (unlikely(copied < len && !folio_test_uptodate(folio))) - return 0; + return false; iomap_set_range_uptodate(folio, offset_in_folio(folio, pos), len); iomap_set_range_dirty(folio, offset_in_folio(folio, pos), copied); filemap_dirty_folio(inode->i_mapping, folio); - return copied; + return true; } -static size_t iomap_write_end_inline(const struct iomap_iter *iter, +static void iomap_write_end_inline(const struct iomap_iter *iter, struct folio *folio, loff_t pos, size_t copied) { const struct iomap *iomap = &iter->iomap; @@ -827,42 +864,32 @@ static size_t iomap_write_end_inline(const struct iomap_iter *iter, kunmap_local(addr); mark_inode_dirty(iter->inode); - return copied; } -/* Returns the number of bytes copied. May be 0. Cannot be an errno. */ -static size_t iomap_write_end(struct iomap_iter *iter, loff_t pos, size_t len, +/* + * Returns true if all copied bytes have been written to the pagecache, + * otherwise return false. + */ +static bool iomap_write_end(struct iomap_iter *iter, loff_t pos, size_t len, size_t copied, struct folio *folio) { const struct iomap *srcmap = iomap_iter_srcmap(iter); - loff_t old_size = iter->inode->i_size; - size_t ret; if (srcmap->type == IOMAP_INLINE) { - ret = iomap_write_end_inline(iter, folio, pos, copied); - } else if (srcmap->flags & IOMAP_F_BUFFER_HEAD) { - ret = block_write_end(NULL, iter->inode->i_mapping, pos, len, - copied, &folio->page, NULL); - } else { - ret = __iomap_write_end(iter->inode, pos, len, copied, folio); + iomap_write_end_inline(iter, folio, pos, copied); + return true; } - /* - * Update the in-memory inode size after copying the data into the page - * cache. 
It's up to the file system to write the updated size to disk, - * preferably after I/O completion so that no stale data is exposed. - */ - if (pos + ret > old_size) { - i_size_write(iter->inode, pos + ret); - iter->iomap.flags |= IOMAP_F_SIZE_CHANGED; + if (srcmap->flags & IOMAP_F_BUFFER_HEAD) { + size_t bh_written; + + bh_written = block_write_end(NULL, iter->inode->i_mapping, pos, + len, copied, &folio->page, NULL); + WARN_ON_ONCE(bh_written != copied && bh_written != 0); + return bh_written == copied; } - __iomap_put_folio(iter, pos, ret, folio); - if (old_size < pos) - pagecache_isize_extended(iter->inode, old_size, pos); - if (ret < len) - iomap_write_failed(iter->inode, pos + ret, len - ret); - return ret; + return __iomap_write_end(iter->inode, pos, len, copied, folio); } static loff_t iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i) @@ -870,16 +897,18 @@ static loff_t iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i) loff_t length = iomap_length(iter); size_t chunk = PAGE_SIZE << MAX_PAGECACHE_ORDER; loff_t pos = iter->pos; - ssize_t written = 0; + ssize_t total_written = 0; long status = 0; struct address_space *mapping = iter->inode->i_mapping; unsigned int bdp_flags = (iter->flags & IOMAP_NOWAIT) ? BDP_ASYNC : 0; do { struct folio *folio; + loff_t old_size; size_t offset; /* Offset into folio */ size_t bytes; /* Bytes to write to folio */ size_t copied; /* Bytes copied from user */ + size_t written; /* Bytes have been written */ bytes = iov_iter_count(i); retry: @@ -909,8 +938,10 @@ static loff_t iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i) } status = iomap_write_begin(iter, pos, bytes, &folio); - if (unlikely(status)) + if (unlikely(status)) { + iomap_write_failed(iter->inode, pos, bytes); break; + } if (iter->iomap.flags & IOMAP_F_STALE) break; @@ -922,19 +953,37 @@ static loff_t iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i) flush_dcache_folio(folio); copied = copy_folio_from_iter_atomic(folio, offset, bytes, i); - status = iomap_write_end(iter, pos, bytes, copied, folio); + written = iomap_write_end(iter, pos, bytes, copied, folio) ? + copied : 0; + + /* + * Update the in-memory inode size after copying the data into + * the page cache. It's up to the file system to write the + * updated size to disk, preferably after I/O completion so that + * no stale data is exposed. Only once that's done can we + * unlock and release the folio. + */ + old_size = iter->inode->i_size; + if (pos + written > old_size) { + i_size_write(iter->inode, pos + written); + iter->iomap.flags |= IOMAP_F_SIZE_CHANGED; + } + __iomap_put_folio(iter, pos, written, folio); - if (unlikely(copied != status)) - iov_iter_revert(i, copied - status); + if (old_size < pos) + pagecache_isize_extended(iter->inode, old_size, pos); cond_resched(); - if (unlikely(status == 0)) { + if (unlikely(written == 0)) { /* * A short copy made iomap_write_end() reject the * thing entirely. Might be memory poisoning * halfway through, might be a race with munmap, * might be severe memory pressure. 
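Editorial aside on the iomap_write_iter() rework above: when a short copy makes iomap_write_end() reject the data outright, the loop now reverts the iterator, shrinks the chunk size and retries instead of bailing out. Below is a minimal userspace sketch of that back-off-and-retry pattern; all names and the fake copy routine are hypothetical and only illustrate the control flow, not the real page cache interaction.

#include <stdio.h>
#include <stddef.h>

/* Fake copy routine: fails completely once, then always copies everything. */
static size_t copy_chunk(size_t want, int *fault_once)
{
        if (*fault_once) {
                *fault_once = 0;
                return 0;       /* e.g. the source page faulted, nothing copied */
        }
        return want;
}

int main(void)
{
        size_t total = 64 * 1024, pos = 0, chunk = 16 * 1024;
        int fault_once = 1;

        while (pos < total) {
                size_t bytes = (total - pos < chunk) ? total - pos : chunk;
                size_t copied = copy_chunk(bytes, &fault_once);
                size_t written = copied;        /* pretend write_end accepted it all */

                if (written == 0) {
                        /* nothing reached the cache: shrink the chunk and retry */
                        if (chunk > 4096)
                                chunk /= 2;
                        continue;
                }
                pos += written;
                printf("wrote %zu bytes, position now %zu\n", written, pos);
        }
        return 0;
}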
*/ + iomap_write_failed(iter->inode, pos, bytes); + iov_iter_revert(i, copied); + if (chunk > PAGE_SIZE) chunk /= 2; if (copied) { @@ -942,17 +991,17 @@ static loff_t iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i) goto retry; } } else { - pos += status; - written += status; - length -= status; + pos += written; + total_written += written; + length -= written; } } while (iov_iter_count(i) && length); if (status == -EAGAIN) { - iov_iter_revert(i, written); + iov_iter_revert(i, total_written); return -EAGAIN; } - return written ? written : status; + return total_written ? total_written : status; } ssize_t @@ -1281,6 +1330,7 @@ static loff_t iomap_unshare_iter(struct iomap_iter *iter) int status; size_t offset; size_t bytes = min_t(u64, SIZE_MAX, length); + bool ret; status = iomap_write_begin(iter, pos, bytes, &folio); if (unlikely(status)) @@ -1292,8 +1342,9 @@ static loff_t iomap_unshare_iter(struct iomap_iter *iter) if (bytes > folio_size(folio) - offset) bytes = folio_size(folio) - offset; - bytes = iomap_write_end(iter, pos, bytes, bytes, folio); - if (WARN_ON_ONCE(bytes == 0)) + ret = iomap_write_end(iter, pos, bytes, bytes, folio); + __iomap_put_folio(iter, pos, bytes, folio); + if (WARN_ON_ONCE(!ret)) return -EIO; cond_resched(); @@ -1342,6 +1393,7 @@ static loff_t iomap_zero_iter(struct iomap_iter *iter, bool *did_zero) int status; size_t offset; size_t bytes = min_t(u64, SIZE_MAX, length); + bool ret; status = iomap_write_begin(iter, pos, bytes, &folio); if (status) @@ -1356,8 +1408,9 @@ static loff_t iomap_zero_iter(struct iomap_iter *iter, bool *did_zero) folio_zero_range(folio, offset, bytes); folio_mark_accessed(folio); - bytes = iomap_write_end(iter, pos, bytes, bytes, folio); - if (WARN_ON_ONCE(bytes == 0)) + ret = iomap_write_end(iter, pos, bytes, bytes, folio); + __iomap_put_folio(iter, pos, bytes, folio); + if (WARN_ON_ONCE(!ret)) return -EIO; pos += bytes; @@ -1451,15 +1504,10 @@ vm_fault_t iomap_page_mkwrite(struct vm_fault *vmf, const struct iomap_ops *ops) EXPORT_SYMBOL_GPL(iomap_page_mkwrite); static void iomap_finish_folio_write(struct inode *inode, struct folio *folio, - size_t len, int error) + size_t len) { struct iomap_folio_state *ifs = folio->private; - if (error) { - folio_set_error(folio); - mapping_set_error(inode->i_mapping, error); - } - WARN_ON_ONCE(i_blocks_per_folio(inode, folio) > 1 && !ifs); WARN_ON_ONCE(ifs && atomic_read(&ifs->write_bytes_pending) <= 0); @@ -1476,40 +1524,29 @@ static u32 iomap_finish_ioend(struct iomap_ioend *ioend, int error) { struct inode *inode = ioend->io_inode; - struct bio *bio = &ioend->io_inline_bio; - struct bio *last = ioend->io_bio, *next; - u64 start = bio->bi_iter.bi_sector; - loff_t offset = ioend->io_offset; - bool quiet = bio_flagged(bio, BIO_QUIET); + struct bio *bio = &ioend->io_bio; + struct folio_iter fi; u32 folio_count = 0; - for (bio = &ioend->io_inline_bio; bio; bio = next) { - struct folio_iter fi; - - /* - * For the last bio, bi_private points to the ioend, so we - * need to explicitly end the iteration here. 
- */ - if (bio == last) - next = NULL; - else - next = bio->bi_private; - - /* walk all folios in bio, ending page IO on them */ - bio_for_each_folio_all(fi, bio) { - iomap_finish_folio_write(inode, fi.folio, fi.length, - error); - folio_count++; + if (error) { + mapping_set_error(inode->i_mapping, error); + if (!bio_flagged(bio, BIO_QUIET)) { + pr_err_ratelimited( +"%s: writeback error on inode %lu, offset %lld, sector %llu", + inode->i_sb->s_id, inode->i_ino, + ioend->io_offset, ioend->io_sector); } - bio_put(bio); } - /* The ioend has been freed by bio_put() */ - if (unlikely(error && !quiet)) { - printk_ratelimited(KERN_ERR -"%s: writeback error on inode %lu, offset %lld, sector %llu", - inode->i_sb->s_id, inode->i_ino, offset, start); + /* walk all folios in bio, ending page IO on them */ + bio_for_each_folio_all(fi, bio) { + if (error) + folio_set_error(fi.folio); + iomap_finish_folio_write(inode, fi.folio, fi.length); + folio_count++; } + + bio_put(bio); /* frees the ioend */ return folio_count; } @@ -1550,7 +1587,7 @@ EXPORT_SYMBOL_GPL(iomap_finish_ioends); static bool iomap_ioend_can_merge(struct iomap_ioend *ioend, struct iomap_ioend *next) { - if (ioend->io_bio->bi_status != next->io_bio->bi_status) + if (ioend->io_bio.bi_status != next->io_bio.bi_status) return false; if ((ioend->io_flags & IOMAP_F_SHARED) ^ (next->io_flags & IOMAP_F_SHARED)) @@ -1615,47 +1652,46 @@ EXPORT_SYMBOL_GPL(iomap_sort_ioends); static void iomap_writepage_end_bio(struct bio *bio) { - struct iomap_ioend *ioend = bio->bi_private; - - iomap_finish_ioend(ioend, blk_status_to_errno(bio->bi_status)); + iomap_finish_ioend(iomap_ioend_from_bio(bio), + blk_status_to_errno(bio->bi_status)); } /* * Submit the final bio for an ioend. * * If @error is non-zero, it means that we have a situation where some part of - * the submission process has failed after we've marked pages for writeback - * and unlocked them. In this situation, we need to fail the bio instead of - * submitting it. This typically only happens on a filesystem shutdown. + * the submission process has failed after we've marked pages for writeback. + * We cannot cancel the ioend directly in that case, so call the bio end I/O handler + * with the error status here to run the normal I/O completion handler to clear + * the writeback bit and let the file system process the errors. */ -static int -iomap_submit_ioend(struct iomap_writepage_ctx *wpc, struct iomap_ioend *ioend, - int error) +static int iomap_submit_ioend(struct iomap_writepage_ctx *wpc, int error) { - ioend->io_bio->bi_private = ioend; - ioend->io_bio->bi_end_io = iomap_writepage_end_bio; + if (!wpc->ioend) + return error; + /* + * Let the file systems prepare the I/O submission and hook in an I/O + * completion handler. This also needs to happen in case a + * failure happened so that the file system end I/O handler gets called + * to clean up. + */ if (wpc->ops->prepare_ioend) - error = wpc->ops->prepare_ioend(ioend, error); + error = wpc->ops->prepare_ioend(wpc->ioend, error); + if (error) { - /* - * If we're failing the IO now, just mark the ioend with an - * error and finish it. This will run IO completion immediately - * as there is only one reference to the ioend at this point in - * time.
- */ - ioend->io_bio->bi_status = errno_to_blk_status(error); - bio_endio(ioend->io_bio); - return error; + wpc->ioend->io_bio.bi_status = errno_to_blk_status(error); + bio_endio(&wpc->ioend->io_bio); + } else { + submit_bio(&wpc->ioend->io_bio); } - submit_bio(ioend->io_bio); - return 0; + wpc->ioend = NULL; + return error; } -static struct iomap_ioend * -iomap_alloc_ioend(struct inode *inode, struct iomap_writepage_ctx *wpc, - loff_t offset, sector_t sector, struct writeback_control *wbc) +static struct iomap_ioend *iomap_alloc_ioend(struct iomap_writepage_ctx *wpc, + struct writeback_control *wbc, struct inode *inode, loff_t pos) { struct iomap_ioend *ioend; struct bio *bio; @@ -1663,63 +1699,41 @@ iomap_alloc_ioend(struct inode *inode, struct iomap_writepage_ctx *wpc, bio = bio_alloc_bioset(wpc->iomap.bdev, BIO_MAX_VECS, REQ_OP_WRITE | wbc_to_write_flags(wbc), GFP_NOFS, &iomap_ioend_bioset); - bio->bi_iter.bi_sector = sector; + bio->bi_iter.bi_sector = iomap_sector(&wpc->iomap, pos); + bio->bi_end_io = iomap_writepage_end_bio; wbc_init_bio(wbc, bio); - ioend = container_of(bio, struct iomap_ioend, io_inline_bio); + ioend = iomap_ioend_from_bio(bio); INIT_LIST_HEAD(&ioend->io_list); ioend->io_type = wpc->iomap.type; ioend->io_flags = wpc->iomap.flags; ioend->io_inode = inode; ioend->io_size = 0; - ioend->io_folios = 0; - ioend->io_offset = offset; - ioend->io_bio = bio; - ioend->io_sector = sector; - return ioend; -} - -/* - * Allocate a new bio, and chain the old bio to the new one. - * - * Note that we have to perform the chaining in this unintuitive order - * so that the bi_private linkage is set up in the right direction for the - * traversal in iomap_finish_ioend(). - */ -static struct bio * -iomap_chain_bio(struct bio *prev) -{ - struct bio *new; - - new = bio_alloc(prev->bi_bdev, BIO_MAX_VECS, prev->bi_opf, GFP_NOFS); - bio_clone_blkg_association(new, prev); - new->bi_iter.bi_sector = bio_end_sector(prev); + ioend->io_offset = pos; + ioend->io_sector = bio->bi_iter.bi_sector; - bio_chain(prev, new); - bio_get(prev); /* for iomap_finish_ioend */ - submit_bio(prev); - return new; + wpc->nr_folios = 0; + return ioend; } -static bool -iomap_can_add_to_ioend(struct iomap_writepage_ctx *wpc, loff_t offset, - sector_t sector) +static bool iomap_can_add_to_ioend(struct iomap_writepage_ctx *wpc, loff_t pos) { if ((wpc->iomap.flags & IOMAP_F_SHARED) != (wpc->ioend->io_flags & IOMAP_F_SHARED)) return false; if (wpc->iomap.type != wpc->ioend->io_type) return false; - if (offset != wpc->ioend->io_offset + wpc->ioend->io_size) + if (pos != wpc->ioend->io_offset + wpc->ioend->io_size) return false; - if (sector != bio_end_sector(wpc->ioend->io_bio)) + if (iomap_sector(&wpc->iomap, pos) != + bio_end_sector(&wpc->ioend->io_bio)) return false; /* * Limit ioend bio chain lengths to minimise IO completion latency. This * also prevents long tight loops ending page writeback on all the * folios in the ioend. */ - if (wpc->ioend->io_folios >= IOEND_BATCH_SIZE) + if (wpc->nr_folios >= IOEND_BATCH_SIZE) return false; return true; } @@ -1727,255 +1741,238 @@ iomap_can_add_to_ioend(struct iomap_writepage_ctx *wpc, loff_t offset, /* * Test to see if we have an existing ioend structure that we could append to * first; otherwise finish off the current ioend and start another. + * + * If a new ioend is created and cached, the old ioend is submitted to the block + * layer instantly. Batching optimisations are provided by higher level block + * plugging. 
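Editorial aside on iomap_can_add_to_ioend() above: a block range only joins the cached ioend when it has the same extent state, is contiguous both in the file and on disk, and the ioend has not yet hit its folio batch limit. A small userspace sketch of that decision follows; the struct, the batch constant and the example numbers are hypothetical stand-ins.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define DEMO_BATCH_LIMIT 4096           /* stand-in for IOEND_BATCH_SIZE */

struct demo_ioend {
        unsigned type;                  /* e.g. written vs. unwritten */
        uint64_t offset;                /* file offset the ioend starts at */
        uint64_t size;                  /* bytes already added */
        uint64_t next_sector;           /* disk sector just past the ioend */
        unsigned nr_folios;             /* folios added so far */
};

static bool can_append(const struct demo_ioend *io, unsigned type,
                       uint64_t pos, uint64_t sector)
{
        if (type != io->type)
                return false;           /* different extent state */
        if (pos != io->offset + io->size)
                return false;           /* not contiguous in the file */
        if (sector != io->next_sector)
                return false;           /* not contiguous on disk */
        if (io->nr_folios >= DEMO_BATCH_LIMIT)
                return false;           /* cap completion latency */
        return true;
}

int main(void)
{
        struct demo_ioend io = {
                .type = 1, .offset = 0, .size = 8192,
                .next_sector = 16, .nr_folios = 2,
        };

        printf("append at 8192/sector 16: %d\n", can_append(&io, 1, 8192, 16));
        printf("append at 12288/sector 24: %d\n", can_append(&io, 1, 12288, 24));
        return 0;
}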
+ * + * At the end of a writeback pass, there will be a cached ioend remaining on the + * writepage context that the caller will need to submit. */ -static void -iomap_add_to_ioend(struct inode *inode, loff_t pos, struct folio *folio, - struct iomap_folio_state *ifs, struct iomap_writepage_ctx *wpc, - struct writeback_control *wbc, struct list_head *iolist) +static int iomap_add_to_ioend(struct iomap_writepage_ctx *wpc, + struct writeback_control *wbc, struct folio *folio, + struct inode *inode, loff_t pos, unsigned len) { - sector_t sector = iomap_sector(&wpc->iomap, pos); - unsigned len = i_blocksize(inode); + struct iomap_folio_state *ifs = folio->private; size_t poff = offset_in_folio(folio, pos); + int error; - if (!wpc->ioend || !iomap_can_add_to_ioend(wpc, pos, sector)) { - if (wpc->ioend) - list_add(&wpc->ioend->io_list, iolist); - wpc->ioend = iomap_alloc_ioend(inode, wpc, pos, sector, wbc); + if (!wpc->ioend || !iomap_can_add_to_ioend(wpc, pos)) { +new_ioend: + error = iomap_submit_ioend(wpc, 0); + if (error) + return error; + wpc->ioend = iomap_alloc_ioend(wpc, wbc, inode, pos); } - if (!bio_add_folio(wpc->ioend->io_bio, folio, len, poff)) { - wpc->ioend->io_bio = iomap_chain_bio(wpc->ioend->io_bio); - bio_add_folio_nofail(wpc->ioend->io_bio, folio, len, poff); - } + if (!bio_add_folio(&wpc->ioend->io_bio, folio, len, poff)) + goto new_ioend; if (ifs) atomic_add(len, &ifs->write_bytes_pending); wpc->ioend->io_size += len; wbc_account_cgroup_owner(wbc, &folio->page, len); + return 0; } -/* - * We implement an immediate ioend submission policy here to avoid needing to - * chain multiple ioends and hence nest mempool allocations which can violate - * the forward progress guarantees we need to provide. The current ioend we're - * adding blocks to is cached in the writepage context, and if the new block - * doesn't append to the cached ioend, it will create a new ioend and cache that - * instead. - * - * If a new ioend is created and cached, the old ioend is returned and queued - * locally for submission once the entire page is processed or an error has been - * detected. While ioends are submitted immediately after they are completed, - * batching optimisations are provided by higher level block plugging. - * - * At the end of a writeback pass, there will be a cached ioend remaining on the - * writepage context that the caller will need to submit. - */ -static int -iomap_writepage_map(struct iomap_writepage_ctx *wpc, - struct writeback_control *wbc, struct inode *inode, - struct folio *folio, u64 end_pos) +static int iomap_writepage_map_blocks(struct iomap_writepage_ctx *wpc, + struct writeback_control *wbc, struct folio *folio, + struct inode *inode, u64 pos, unsigned dirty_len, + unsigned *count) { - struct iomap_folio_state *ifs = folio->private; - struct iomap_ioend *ioend, *next; - unsigned len = i_blocksize(inode); - unsigned nblocks = i_blocks_per_folio(inode, folio); - u64 pos = folio_pos(folio); - int error = 0, count = 0, i; - LIST_HEAD(submit_list); - - WARN_ON_ONCE(end_pos <= pos); - - if (!ifs && nblocks > 1) { - ifs = ifs_alloc(inode, folio, 0); - iomap_set_range_dirty(folio, 0, end_pos - pos); - } - - WARN_ON_ONCE(ifs && atomic_read(&ifs->write_bytes_pending) != 0); + int error; - /* - * Walk through the folio to find areas to write back. If we - * run off the end of the current map or find the current map - * invalid, grab a new one. 
- */ - for (i = 0; i < nblocks && pos < end_pos; i++, pos += len) { - if (ifs && !ifs_block_is_dirty(folio, ifs, i)) - continue; + do { + unsigned map_len; - error = wpc->ops->map_blocks(wpc, inode, pos); + error = wpc->ops->map_blocks(wpc, inode, pos, dirty_len); if (error) break; - trace_iomap_writepage_map(inode, &wpc->iomap); - if (WARN_ON_ONCE(wpc->iomap.type == IOMAP_INLINE)) - continue; - if (wpc->iomap.type == IOMAP_HOLE) - continue; - iomap_add_to_ioend(inode, pos, folio, ifs, wpc, wbc, - &submit_list); - count++; - } - if (count) - wpc->ioend->io_folios++; + trace_iomap_writepage_map(inode, pos, dirty_len, &wpc->iomap); - WARN_ON_ONCE(!wpc->ioend && !list_empty(&submit_list)); - WARN_ON_ONCE(!folio_test_locked(folio)); - WARN_ON_ONCE(folio_test_writeback(folio)); - WARN_ON_ONCE(folio_test_dirty(folio)); + map_len = min_t(u64, dirty_len, + wpc->iomap.offset + wpc->iomap.length - pos); + WARN_ON_ONCE(!folio->private && map_len < dirty_len); + + switch (wpc->iomap.type) { + case IOMAP_INLINE: + WARN_ON_ONCE(1); + error = -EIO; + break; + case IOMAP_HOLE: + break; + default: + error = iomap_add_to_ioend(wpc, wbc, folio, inode, pos, + map_len); + if (!error) + (*count)++; + break; + } + dirty_len -= map_len; + pos += map_len; + } while (dirty_len && !error); /* * We cannot cancel the ioend directly here on error. We may have * already set other pages under writeback and hence we have to run I/O * completion to mark the error state of the pages under writeback * appropriately. + * + * Just let the file system know what portion of the folio failed to + * map. */ - if (unlikely(error)) { - /* - * Let the filesystem know what portion of the current page - * failed to map. If the page hasn't been added to ioend, it - * won't be affected by I/O completion and we must unlock it - * now. - */ - if (wpc->ops->discard_folio) - wpc->ops->discard_folio(folio, pos); - if (!count) { - folio_unlock(folio); - goto done; - } - } - - /* - * We can have dirty bits set past end of file in page_mkwrite path - * while mapping the last partial folio. Hence it's better to clear - * all the dirty bits in the folio here. - */ - iomap_clear_range_dirty(folio, 0, folio_size(folio)); - folio_start_writeback(folio); - folio_unlock(folio); - - /* - * Preserve the original error if there was one; catch - * submission errors here and propagate into subsequent ioend - * submissions. - */ - list_for_each_entry_safe(ioend, next, &submit_list, io_list) { - int error2; - - list_del_init(&ioend->io_list); - error2 = iomap_submit_ioend(wpc, ioend, error); - if (error2 && !error) - error = error2; - } - - /* - * We can end up here with no error and nothing to write only if we race - * with a partial page truncate on a sub-page block sized filesystem. - */ - if (!count) - folio_end_writeback(folio); -done: - mapping_set_error(inode->i_mapping, error); + if (error && wpc->ops->discard_folio) + wpc->ops->discard_folio(folio, pos); return error; } /* - * Write out a dirty page. + * Check interaction of the folio with the file end. * - * For delalloc space on the page, we need to allocate space and flush it. - * For unwritten space on the page, we need to start the conversion to - * regular allocated space. + * If the folio is entirely beyond i_size, return false. If it straddles + * i_size, adjust end_pos and zero all data beyond i_size. 
*/ -static int iomap_do_writepage(struct folio *folio, - struct writeback_control *wbc, void *data) +static bool iomap_writepage_handle_eof(struct folio *folio, struct inode *inode, + u64 *end_pos) { - struct iomap_writepage_ctx *wpc = data; - struct inode *inode = folio->mapping->host; - u64 end_pos, isize; - - trace_iomap_writepage(inode, folio_pos(folio), folio_size(folio)); - - /* - * Refuse to write the folio out if we're called from reclaim context. - * - * This avoids stack overflows when called from deeply used stacks in - * random callers for direct reclaim or memcg reclaim. We explicitly - * allow reclaim from kswapd as the stack usage there is relatively low. - * - * This should never happen except in the case of a VM regression so - * warn about it. - */ - if (WARN_ON_ONCE((current->flags & (PF_MEMALLOC|PF_KSWAPD)) == - PF_MEMALLOC)) - goto redirty; + u64 isize = i_size_read(inode); - /* - * Is this folio beyond the end of the file? - * - * The folio index is less than the end_index, adjust the end_pos - * to the highest offset that this folio should represent. - * ----------------------------------------------------- - * | file mapping | | - * ----------------------------------------------------- - * | Page ... | Page N-2 | Page N-1 | Page N | | - * ^--------------------------------^----------|-------- - * | desired writeback range | see else | - * ---------------------------------^------------------| - */ - isize = i_size_read(inode); - end_pos = folio_pos(folio) + folio_size(folio); - if (end_pos > isize) { - /* - * Check whether the page to write out is beyond or straddles - * i_size or not. - * ------------------------------------------------------- - * | file mapping | | - * ------------------------------------------------------- - * | Page ... | Page N-2 | Page N-1 | Page N | Beyond | - * ^--------------------------------^-----------|--------- - * | | Straddles | - * ---------------------------------^-----------|--------| - */ + if (*end_pos > isize) { size_t poff = offset_in_folio(folio, isize); pgoff_t end_index = isize >> PAGE_SHIFT; /* - * Skip the page if it's fully outside i_size, e.g. - * due to a truncate operation that's in progress. We've - * cleaned this page and truncate will finish things off for - * us. + * If the folio is entirely ouside of i_size, skip it. + * + * This can happen due to a truncate operation that is in + * progress and in that case truncate will finish it off once + * we've dropped the folio lock. * - * Note that the end_index is unsigned long. If the given - * offset is greater than 16TB on a 32-bit system then if we - * checked if the page is fully outside i_size with - * "if (page->index >= end_index + 1)", "end_index + 1" would - * overflow and evaluate to 0. Hence this page would be + * Note that the pgoff_t used for end_index is an unsigned long. + * If the given offset is greater than 16TB on a 32-bit system, + * then if we checked if the folio is fully outside i_size with + * "if (folio->index >= end_index + 1)", "end_index + 1" would + * overflow and evaluate to 0. Hence this folio would be * redirtied and written out repeatedly, which would result in * an infinite loop; the user program performing this operation * would hang. Instead, we can detect this situation by - * checking if the page is totally beyond i_size or if its + * checking if the folio is totally beyond i_size or if its * offset is just equal to the EOF. 
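Editorial aside on iomap_writepage_handle_eof() above: for a folio that straddles i_size, writeback now stops at the block containing EOF, i.e. end_pos becomes round_up(i_size, block size), after the tail has been zeroed. A rough userspace model of just that arithmetic, with hypothetical names and fixed example sizes:

#include <stdint.h>
#include <stdio.h>

static uint64_t round_up_to(uint64_t v, uint64_t a)
{
        return (v + a - 1) & ~(a - 1);  /* a must be a power of two */
}

int main(void)
{
        uint64_t blocksize = 4096, folio_size = 16384;
        uint64_t folio_pos = 32768;     /* folio start within the file */
        uint64_t isize = 37000;         /* i_size falls inside this folio */
        uint64_t end_pos = folio_pos + folio_size;

        if (folio_pos >= isize) {
                printf("folio entirely beyond EOF, skip it\n");
                return 0;
        }
        if (end_pos > isize) {
                /* the tail would be zeroed here; stop at the EOF block */
                end_pos = round_up_to(isize, blocksize);
        }
        printf("write back [%llu, %llu)\n",
               (unsigned long long)folio_pos, (unsigned long long)end_pos);
        return 0;
}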
*/ if (folio->index > end_index || (folio->index == end_index && poff == 0)) - goto unlock; + return false; /* - * The page straddles i_size. It must be zeroed out on each - * and every writepage invocation because it may be mmapped. - * "A file is mapped in multiples of the page size. For a file - * that is not a multiple of the page size, the remaining - * memory is zeroed when mapped, and writes to that region are - * not written out to the file." + * The folio straddles i_size. + * + * It must be zeroed out on each and every writepage invocation + * because it may be mmapped: + * + * A file is mapped in multiples of the page size. For a + * file that is not a multiple of the page size, the + * remaining memory is zeroed when mapped, and writes to that + * region are not written out to the file. + * + * Also adjust the writeback range to skip all blocks entirely + * beyond i_size. */ folio_zero_segment(folio, poff, folio_size(folio)); - end_pos = isize; + *end_pos = round_up(isize, i_blocksize(inode)); } - return iomap_writepage_map(wpc, wbc, inode, folio, end_pos); + return true; +} + +static int iomap_writepage_map(struct iomap_writepage_ctx *wpc, + struct writeback_control *wbc, struct folio *folio) +{ + struct iomap_folio_state *ifs = folio->private; + struct inode *inode = folio->mapping->host; + u64 pos = folio_pos(folio); + u64 end_pos = pos + folio_size(folio); + unsigned count = 0; + int error = 0; + u32 rlen; -redirty: - folio_redirty_for_writepage(wbc, folio); -unlock: + WARN_ON_ONCE(!folio_test_locked(folio)); + WARN_ON_ONCE(folio_test_dirty(folio)); + WARN_ON_ONCE(folio_test_writeback(folio)); + + trace_iomap_writepage(inode, pos, folio_size(folio)); + + if (!iomap_writepage_handle_eof(folio, inode, &end_pos)) { + folio_unlock(folio); + return 0; + } + WARN_ON_ONCE(end_pos <= pos); + + if (i_blocks_per_folio(inode, folio) > 1) { + if (!ifs) { + ifs = ifs_alloc(inode, folio, 0); + iomap_set_range_dirty(folio, 0, end_pos - pos); + } + + /* + * Keep the I/O completion handler from clearing the writeback + * bit until we have submitted all blocks by adding a bias to + * ifs->write_bytes_pending, which is dropped after submitting + * all blocks. + */ + WARN_ON_ONCE(atomic_read(&ifs->write_bytes_pending) != 0); + atomic_inc(&ifs->write_bytes_pending); + } + + /* + * Set the writeback bit ASAP, as the I/O completion for the single + * block per folio case can happen as soon as we're submitting the bio. + */ + folio_start_writeback(folio); + + /* + * Walk through the folio to find dirty areas to write back. + */ + while ((rlen = iomap_find_dirty_range(folio, &pos, end_pos))) { + error = iomap_writepage_map_blocks(wpc, wbc, folio, inode, + pos, rlen, &count); + if (error) + break; + pos += rlen; + } + + if (count) + wpc->nr_folios++; + + /* + * We can have dirty bits set past end of file in page_mkwrite path + * while mapping the last partial folio. Hence it's better to clear + * all the dirty bits in the folio here. + */ + iomap_clear_range_dirty(folio, 0, folio_size(folio)); + + /* + * Usually the writeback bit is cleared by the I/O completion handler. + * But we may end up either not actually writing any blocks, or (when + * there are multiple blocks in a folio) all I/O might have finished + * already at this point. In that case we need to clear the writeback + * bit ourselves right after unlocking the page.
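Editorial aside on the write_bytes_pending bias described above: the submitter holds one extra count so that per-range completions cannot end the writeback state before submission of the whole folio has finished. A userspace sketch of the same pattern using C11 atomics, with hypothetical names:

#include <stdatomic.h>
#include <stdio.h>

static atomic_int pending;      /* models ifs->write_bytes_pending */

static void end_writeback(void)
{
        puts("writeback state cleared");
}

/* Called when a submitted range completes (or when the bias is dropped). */
static void put_pending(int bytes)
{
        if (atomic_fetch_sub(&pending, bytes) - bytes == 0)
                end_writeback();
}

int main(void)
{
        atomic_fetch_add(&pending, 1);          /* bias: submission in progress */

        atomic_fetch_add(&pending, 4096);       /* queue the first block */
        put_pending(4096);                      /* it completes early; bias holds */

        atomic_fetch_add(&pending, 4096);       /* queue the second block */

        put_pending(1);                         /* all blocks queued: drop the bias */
        put_pending(4096);                      /* last completion clears writeback */
        return 0;
}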
+ */ folio_unlock(folio); - return 0; + if (ifs) { + if (atomic_dec_and_test(&ifs->write_bytes_pending)) + folio_end_writeback(folio); + } else { + if (!count) + folio_end_writeback(folio); + } + mapping_set_error(inode->i_mapping, error); + return error; +} + +static int iomap_do_writepage(struct folio *folio, + struct writeback_control *wbc, void *data) +{ + return iomap_writepage_map(data, wbc, folio); } int @@ -1985,18 +1982,24 @@ iomap_writepages(struct address_space *mapping, struct writeback_control *wbc, { int ret; + /* + * Writeback from reclaim context should never happen except in the case + * of a VM regression so warn about it and refuse to write the data. + */ + if (WARN_ON_ONCE((current->flags & (PF_MEMALLOC | PF_KSWAPD)) == + PF_MEMALLOC)) + return -EIO; + wpc->ops = ops; ret = write_cache_pages(mapping, wbc, iomap_do_writepage, wpc); - if (!wpc->ioend) - return ret; - return iomap_submit_ioend(wpc, wpc->ioend, ret); + return iomap_submit_ioend(wpc, ret); } EXPORT_SYMBOL_GPL(iomap_writepages); static int __init iomap_init(void) { return bioset_init(&iomap_ioend_bioset, 4 * (PAGE_SIZE / SECTOR_SIZE), - offsetof(struct iomap_ioend, io_inline_bio), + offsetof(struct iomap_ioend, io_bio), BIOSET_NEED_BVECS); } fs_initcall(iomap_init); diff --git a/fs/iomap/trace.h b/fs/iomap/trace.h index c16fd55f5595c2984c24ddf77002eab739eeffc8..3ef694f9489f02080f44e931a838bf0208810059 100644 --- a/fs/iomap/trace.h +++ b/fs/iomap/trace.h @@ -154,7 +154,48 @@ DEFINE_EVENT(iomap_class, name, \ TP_ARGS(inode, iomap)) DEFINE_IOMAP_EVENT(iomap_iter_dstmap); DEFINE_IOMAP_EVENT(iomap_iter_srcmap); -DEFINE_IOMAP_EVENT(iomap_writepage_map); + +TRACE_EVENT(iomap_writepage_map, + TP_PROTO(struct inode *inode, u64 pos, unsigned int dirty_len, + struct iomap *iomap), + TP_ARGS(inode, pos, dirty_len, iomap), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(u64, ino) + __field(u64, pos) + __field(u64, dirty_len) + __field(u64, addr) + __field(loff_t, offset) + __field(u64, length) + __field(u16, type) + __field(u16, flags) + __field(dev_t, bdev) + ), + TP_fast_assign( + __entry->dev = inode->i_sb->s_dev; + __entry->ino = inode->i_ino; + __entry->pos = pos; + __entry->dirty_len = dirty_len; + __entry->addr = iomap->addr; + __entry->offset = iomap->offset; + __entry->length = iomap->length; + __entry->type = iomap->type; + __entry->flags = iomap->flags; + __entry->bdev = iomap->bdev ? iomap->bdev->bd_dev : 0; + ), + TP_printk("dev %d:%d ino 0x%llx bdev %d:%d pos 0x%llx dirty len 0x%llx " + "addr 0x%llx offset 0x%llx length 0x%llx type %s flags %s", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + MAJOR(__entry->bdev), MINOR(__entry->bdev), + __entry->pos, + __entry->dirty_len, + __entry->addr, + __entry->offset, + __entry->length, + __print_symbolic(__entry->type, IOMAP_TYPE_STRINGS), + __print_flags(__entry->flags, "|", IOMAP_F_FLAGS_STRINGS)) +); TRACE_EVENT(iomap_iter, TP_PROTO(struct iomap_iter *iter, const void *ops, diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index fa6bf3e9583ae8972ab3edab010d93c6a7d4ff70..8f2cb0209c3c9003115f53b1db584b871425b6db 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -4502,8 +4502,8 @@ xfs_bmapi_write( * invocations to allocate the target offset if a large enough physical extent * is not available. 
*/ -int -xfs_bmapi_convert_delalloc( +static int +xfs_bmapi_convert_one_delalloc( struct xfs_inode *ip, int whichfork, xfs_off_t offset, @@ -4560,7 +4560,8 @@ xfs_bmapi_convert_delalloc( if (!isnullstartblock(bma.got.br_startblock)) { xfs_bmbt_to_iomap(ip, iomap, &bma.got, 0, flags, xfs_iomap_inode_sequence(ip, flags)); - *seq = READ_ONCE(ifp->if_seq); + if (seq) + *seq = READ_ONCE(ifp->if_seq); goto out_trans_cancel; } @@ -4609,7 +4610,8 @@ xfs_bmapi_convert_delalloc( ASSERT(!isnullstartblock(bma.got.br_startblock)); xfs_bmbt_to_iomap(ip, iomap, &bma.got, 0, flags, xfs_iomap_inode_sequence(ip, flags)); - *seq = READ_ONCE(ifp->if_seq); + if (seq) + *seq = READ_ONCE(ifp->if_seq); if (whichfork == XFS_COW_FORK) xfs_refcount_alloc_cow_extent(tp, bma.blkno, bma.length); @@ -4632,6 +4634,36 @@ xfs_bmapi_convert_delalloc( return error; } +/* + * Pass in a delalloc extent and convert it to real extents, return the real + * extent that maps offset_fsb in iomap. + */ +int +xfs_bmapi_convert_delalloc( + struct xfs_inode *ip, + int whichfork, + loff_t offset, + struct iomap *iomap, + unsigned int *seq) +{ + int error; + + /* + * Attempt to allocate whatever delalloc extent currently backs offset + * and put the result into iomap. Allocate in a loop because it may + * take several attempts to allocate real blocks for a contiguous + * delalloc extent if free space is sufficiently fragmented.
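Editorial aside on the new xfs_bmapi_convert_delalloc() wrapper above: it simply repeats the single-extent conversion until the returned mapping covers the requested offset. A userspace model of that retry-until-covered loop, with a stand-in allocator and entirely hypothetical names:

#include <stdint.h>
#include <stdio.h>

struct demo_map {
        uint64_t offset;        /* start of the converted range */
        uint64_t length;        /* length of the converted range */
};

static uint64_t converted_so_far;       /* how far real blocks extend already */

/* Stand-in for the single-extent helper: converts at most 8 blocks per call. */
static int convert_one(uint64_t offset, struct demo_map *map)
{
        (void)offset;                   /* the helper picks the extent itself */
        map->offset = converted_so_far;
        map->length = 8 * 4096;         /* fragmented free space: small pieces */
        converted_so_far += map->length;
        return 0;
}

int main(void)
{
        uint64_t target = 70000;        /* offset the caller must end up covering */
        struct demo_map map = { 0, 0 };
        int err;

        do {
                err = convert_one(target, &map);
                if (err)
                        return err;
                printf("converted [%llu, %llu)\n",
                       (unsigned long long)map.offset,
                       (unsigned long long)(map.offset + map.length));
        } while (map.offset + map.length <= target);

        return 0;
}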
- */ - do { - error = xfs_bmapi_convert_delalloc(ip, whichfork, offset, - &wpc->iomap, seq); - if (error) - return error; - } while (wpc->iomap.offset + wpc->iomap.length <= offset); - - return 0; -} - static int xfs_map_blocks( struct iomap_writepage_ctx *wpc, struct inode *inode, - loff_t offset) + loff_t offset, + unsigned int len) { struct xfs_inode *ip = XFS_I(inode); struct xfs_mount *mp = ip->i_mount; @@ -289,6 +251,7 @@ xfs_map_blocks( struct xfs_iext_cursor icur; int retries = 0; int error = 0; + unsigned int *seq; if (xfs_is_shutdown(mp)) return -EIO; @@ -386,7 +349,19 @@ xfs_map_blocks( trace_xfs_map_blocks_found(ip, offset, count, whichfork, &imap); return 0; allocate_blocks: - error = xfs_convert_blocks(wpc, ip, whichfork, offset); + /* + * Convert a delalloc extent to a real one. The current page is held + * locked so nothing could have removed the block backing offset_fsb, + * although it could have moved from the COW to the data fork by another + * thread. + */ + if (whichfork == XFS_COW_FORK) + seq = &XFS_WPC(wpc)->cow_seq; + else + seq = &XFS_WPC(wpc)->data_seq; + + error = xfs_bmapi_convert_delalloc(ip, whichfork, offset, + &wpc->iomap, seq); + if (error) { /* * If we failed to find the extent in the COW fork we might have @@ -444,7 +419,7 @@ xfs_prepare_ioend( /* send ioends that might require a transaction to the completion wq */ if (xfs_ioend_is_append(ioend) || ioend->io_type == IOMAP_UNWRITTEN || (ioend->io_flags & IOMAP_F_SHARED)) - ioend->io_bio->bi_end_io = xfs_end_bio; + ioend->io_bio.bi_end_io = xfs_end_bio; return status; } diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 18c8f168b1532d043b76d201fff199a936c0cf0d..334860f780ff332ac8f1c4603a01fe328f709873 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -1035,6 +1035,24 @@ xfs_buffered_write_iomap_begin( } if (imap.br_startoff <= offset_fsb) { + /* + * Trim a delalloc extent that extends beyond the EOF block. + * If it starts beyond the EOF block, convert it to an unwritten + * extent. + */ + if ((flags & IOMAP_ZERO) && + isnullstartblock(imap.br_startblock)) { + xfs_fileoff_t eof_fsb = XFS_B_TO_FSB(mp, XFS_ISIZE(ip)); + + if (offset_fsb >= eof_fsb) + goto convert_delay; + if (end_fsb > eof_fsb) { + end_fsb = eof_fsb; + xfs_trim_extent(&imap, offset_fsb, + end_fsb - offset_fsb); + } + } + /* * For reflink files we may need a delalloc reservation when * overwriting shared extents. This includes zeroing of + * them out if the write happens to fail.
*/ seq = xfs_iomap_inode_sequence(ip, IOMAP_F_NEW); - xfs_iunlock(ip, XFS_ILOCK_EXCL); + xfs_iunlock(ip, lockmode); trace_xfs_iomap_alloc(ip, offset, count, allocfork, &imap); return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, IOMAP_F_NEW, seq); found_imap: seq = xfs_iomap_inode_sequence(ip, 0); - xfs_iunlock(ip, XFS_ILOCK_EXCL); + xfs_iunlock(ip, lockmode); return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, 0, seq); +convert_delay: + xfs_iunlock(ip, lockmode); + truncate_pagecache(inode, offset); + error = xfs_bmapi_convert_delalloc(ip, XFS_DATA_FORK, offset, + iomap, NULL); + if (error) + return error; + + trace_xfs_iomap_alloc(ip, offset, count, XFS_DATA_FORK, &imap); + return 0; + found_cow: seq = xfs_iomap_inode_sequence(ip, 0); if (imap.br_startoff <= offset_fsb) { @@ -1165,17 +1194,17 @@ xfs_buffered_write_iomap_begin( if (error) goto out_unlock; seq = xfs_iomap_inode_sequence(ip, IOMAP_F_SHARED); - xfs_iunlock(ip, XFS_ILOCK_EXCL); + xfs_iunlock(ip, lockmode); return xfs_bmbt_to_iomap(ip, iomap, &cmap, flags, IOMAP_F_SHARED, seq); } xfs_trim_extent(&cmap, offset_fsb, imap.br_startoff - offset_fsb); - xfs_iunlock(ip, XFS_ILOCK_EXCL); + xfs_iunlock(ip, lockmode); return xfs_bmbt_to_iomap(ip, iomap, &cmap, flags, 0, seq); out_unlock: - xfs_iunlock(ip, XFS_ILOCK_EXCL); + xfs_iunlock(ip, lockmode); return error; } diff --git a/fs/zonefs/file.c b/fs/zonefs/file.c index 897b12ec61e29ce418aa8d27e69352474fae7bab..e66de2191245ddfbfd8e13baff24045fdc0b97f5 100644 --- a/fs/zonefs/file.c +++ b/fs/zonefs/file.c @@ -125,7 +125,8 @@ static void zonefs_readahead(struct readahead_control *rac) * which implies that the page range can only be within the fixed inode size. */ static int zonefs_write_map_blocks(struct iomap_writepage_ctx *wpc, - struct inode *inode, loff_t offset) + struct inode *inode, loff_t offset, + unsigned int len) { struct zonefs_zone *z = zonefs_inode_zone(inode); diff --git a/include/linux/iomap.h b/include/linux/iomap.h index 96dd0acbba44aca735ff027ffb8f1c118cb762e8..6fc1c858013d1e4dda4ed38fa4083acf25d16d36 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -293,22 +293,32 @@ struct iomap_ioend { struct list_head io_list; /* next ioend in chain */ u16 io_type; u16 io_flags; /* IOMAP_F_* */ - u32 io_folios; /* folios added to ioend */ struct inode *io_inode; /* file being written to */ size_t io_size; /* size of the extent */ loff_t io_offset; /* offset in the file */ sector_t io_sector; /* start sector of ioend */ - struct bio *io_bio; /* bio being built */ - struct bio io_inline_bio; /* MUST BE LAST! */ + struct bio io_bio; /* MUST BE LAST! */ }; +static inline struct iomap_ioend *iomap_ioend_from_bio(struct bio *bio) +{ + return container_of(bio, struct iomap_ioend, io_bio); +} + struct iomap_writeback_ops { /* * Required, maps the blocks so that writeback can be performed on * the range starting at offset. + * + * Can return arbitrarily large regions, but we need to call into it at + * least once per folio to allow the file systems to synchronize with + * the write path that could be invalidating mappings. + * + * An existing mapping from a previous call to this method can be reused + * by the file system if it is still valid. 
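Editorial aside on the ->map_blocks contract quoted above: a filesystem may keep using a previously returned mapping as long as the position still falls inside it and the extent tree has not changed underneath it (in the spirit of xfs_imap_valid()). A userspace sketch of such a validity check, with hypothetical names and example values:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct demo_wpc {
        uint64_t map_offset;    /* cached mapping: file range start */
        uint64_t map_length;    /* cached mapping: range length */
        unsigned map_seq;       /* extent tree sequence when it was fetched */
};

static bool mapping_still_valid(const struct demo_wpc *wpc, uint64_t pos,
                                unsigned cur_seq)
{
        if (pos < wpc->map_offset ||
            pos >= wpc->map_offset + wpc->map_length)
                return false;                   /* pos not covered by the cache */
        return wpc->map_seq == cur_seq;         /* extent tree unchanged since */
}

int main(void)
{
        struct demo_wpc wpc = {
                .map_offset = 0, .map_length = 65536, .map_seq = 3,
        };

        printf("reuse at 4096, seq 3: %d\n", mapping_still_valid(&wpc, 4096, 3));
        printf("reuse at 4096, seq 4: %d\n", mapping_still_valid(&wpc, 4096, 4));
        return 0;
}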
*/ int (*map_blocks)(struct iomap_writepage_ctx *wpc, struct inode *inode, - loff_t offset); + loff_t offset, unsigned len); /* * Optional, allows the file systems to perform actions just before @@ -329,6 +339,7 @@ struct iomap_writepage_ctx { struct iomap iomap; struct iomap_ioend *ioend; const struct iomap_writeback_ops *ops; + u32 nr_folios; /* folios added to the ioend */ }; void iomap_finish_ioends(struct iomap_ioend *ioend, int error); diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index d418f1c563ee5687572df931502f1a2c5c7ac613..89f6e5fb56d2e0e76c06f3c31e5287b047acfaeb 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -344,6 +344,20 @@ static inline void mapping_set_large_folios(struct address_space *mapping) __set_bit(AS_LARGE_FOLIO_SUPPORT, &mapping->flags); } +/** + * mapping_clear_large_folios() - Disable large folio support for the file. + * @mapping: The file. + * + * The filesystem has to make sure that this switch happens atomically and + * that all cached folios have been cleared under mapping->invalidate_lock + * before calling this function. + */ +static inline void mapping_clear_large_folios(struct address_space *mapping) +{ + WARN_ON_ONCE(!rwsem_is_locked(&mapping->invalidate_lock)); + __clear_bit(AS_LARGE_FOLIO_SUPPORT, &mapping->flags); +} + /* * Large folio support currently depends on THP. These dependencies are * being worked on but are not yet fixed. diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h index 65029dfb92fbc3162c30d0a85a6805afa3ab335e..588991b57c127e804f1fecfba3ea861777eeebc2 100644 --- a/include/trace/events/ext4.h +++ b/include/trace/events/ext4.h @@ -1249,14 +1249,15 @@ TRACE_EVENT(ext4_da_update_reserve_space, ); TRACE_EVENT(ext4_da_reserve_space, - TP_PROTO(struct inode *inode), + TP_PROTO(struct inode *inode, int nr_resv), - TP_ARGS(inode), + TP_ARGS(inode, nr_resv), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) __field( __u64, i_blocks ) + __field( int, reserve_blocks ) __field( int, reserved_data_blocks ) __field( __u16, mode ) ), @@ -1265,16 +1266,17 @@ TRACE_EVENT(ext4_da_reserve_space, __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->i_blocks = inode->i_blocks; + __entry->reserve_blocks = nr_resv; __entry->reserved_data_blocks = EXT4_I(inode)->i_reserved_data_blocks; __entry->mode = inode->i_mode; ), - TP_printk("dev %d,%d ino %lu mode 0%o i_blocks %llu " + TP_printk("dev %d,%d ino %lu mode 0%o i_blocks %llu reserve_blocks %d " "reserved_data_blocks %d", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, __entry->mode, __entry->i_blocks, - __entry->reserved_data_blocks) + __entry->reserve_blocks, __entry->reserved_data_blocks) ); TRACE_EVENT(ext4_da_release_space, @@ -2184,6 +2186,7 @@ DECLARE_EVENT_CLASS(ext4__es_extent, __field( ext4_lblk_t, len ) __field( ext4_fsblk_t, pblk ) __field( char, status ) + __field( unsigned int, seq ) ), TP_fast_assign( @@ -2193,13 +2196,15 @@ DECLARE_EVENT_CLASS(ext4__es_extent, __entry->len = es->es_len; __entry->pblk = ext4_es_show_pblock(es); __entry->status = ext4_es_status(es); + __entry->seq = EXT4_I(inode)->i_es_seq; ), - TP_printk("dev %d,%d ino %lu es [%u/%u) mapped %llu status %s", + TP_printk("dev %d,%d ino %lu es [%u/%u) mapped %llu status %s seq %u", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, __entry->lblk, __entry->len, - __entry->pblk, show_extent_status(__entry->status)) + __entry->pblk, show_extent_status(__entry->status), + __entry->seq) );
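Editorial aside on the i_es_seq counter that the trace events above now report: it is a plain modification counter that writers bump on every extent status change, so a reader can detect that a lookup done without the lock has gone stale. A minimal userspace illustration, with hypothetical names:

#include <stdio.h>

struct demo_inode {
        unsigned es_seq;        /* bumped on every extent status change */
};

static void es_insert_extent(struct demo_inode *di) { di->es_seq++; }
static void es_remove_extent(struct demo_inode *di) { di->es_seq++; }

int main(void)
{
        struct demo_inode di = { .es_seq = 0 };
        unsigned snapshot = di.es_seq;  /* sampled while holding the lock */

        es_insert_extent(&di);          /* another task changes the tree */

        if (snapshot != di.es_seq)
                printf("extent status changed, redo the lookup\n");
        else
                printf("cached lookup is still valid\n");

        es_remove_extent(&di);          /* further changes keep bumping it */
        return 0;
}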
DEFINE_EVENT(ext4__es_extent, ext4_es_insert_extent, @@ -2224,6 +2229,7 @@ TRACE_EVENT(ext4_es_remove_extent, __field( ino_t, ino ) __field( loff_t, lblk ) __field( loff_t, len ) + __field( unsigned int, seq ) ), TP_fast_assign( @@ -2231,12 +2237,13 @@ TRACE_EVENT(ext4_es_remove_extent, __entry->ino = inode->i_ino; __entry->lblk = lblk; __entry->len = len; + __entry->seq = EXT4_I(inode)->i_es_seq; ), - TP_printk("dev %d,%d ino %lu es [%lld/%lld)", + TP_printk("dev %d,%d ino %lu es [%lld/%lld) seq %u", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, - __entry->lblk, __entry->len) + __entry->lblk, __entry->len, __entry->seq) ); TRACE_EVENT(ext4_es_find_extent_range_enter, @@ -2481,11 +2488,11 @@ TRACE_EVENT(ext4_es_shrink, __entry->scan_time, __entry->nr_skipped, __entry->retried) ); -TRACE_EVENT(ext4_es_insert_delayed_block, +TRACE_EVENT(ext4_es_insert_delayed_extent, TP_PROTO(struct inode *inode, struct extent_status *es, - bool allocated), + bool lclu_allocated, bool end_allocated), - TP_ARGS(inode, es, allocated), + TP_ARGS(inode, es, lclu_allocated, end_allocated), TP_STRUCT__entry( __field( dev_t, dev ) @@ -2494,7 +2501,9 @@ TRACE_EVENT(ext4_es_insert_delayed_block, __field( ext4_lblk_t, len ) __field( ext4_fsblk_t, pblk ) __field( char, status ) - __field( bool, allocated ) + __field( bool, lclu_allocated ) + __field( bool, end_allocated ) + __field( unsigned int, seq ) ), TP_fast_assign( @@ -2504,16 +2513,19 @@ TRACE_EVENT(ext4_es_insert_delayed_block, __entry->len = es->es_len; __entry->pblk = ext4_es_show_pblock(es); __entry->status = ext4_es_status(es); - __entry->allocated = allocated; + __entry->lclu_allocated = lclu_allocated; + __entry->end_allocated = end_allocated; + __entry->seq = EXT4_I(inode)->i_es_seq; ), TP_printk("dev %d,%d ino %lu es [%u/%u) mapped %llu status %s " - "allocated %d", + "allocated %d %d seq %u", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, __entry->lblk, __entry->len, __entry->pblk, show_extent_status(__entry->status), - __entry->allocated) + __entry->lclu_allocated, __entry->end_allocated, + __entry->seq) ); /* fsmap traces */ diff --git a/mm/readahead.c b/mm/readahead.c index 4d0dbfd62d2039cc6f04a92dea387078ca26777a..63c7320ba464f50ba5e9571f40e1a650ec2a77cf 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -510,6 +510,12 @@ void page_cache_ra_order(struct readahead_control *ractl, } filemap_invalidate_lock_shared(mapping); + + if (unlikely(!mapping_large_folio_support(mapping))) { + filemap_invalidate_unlock_shared(mapping); + goto fallback; + } + while (index <= limit) { unsigned int order = new_order;