diff --git a/Documentation/filesystems/locking.rst b/Documentation/filesystems/locking.rst index bd12f2f850ad3a863f495543d95de8174bb4a727..d5bf4b6b7509b01c9a2d5225a6bb5b2e1ef327b2 100644 --- a/Documentation/filesystems/locking.rst +++ b/Documentation/filesystems/locking.rst @@ -264,7 +264,7 @@ prototypes:: struct folio *src, enum migrate_mode); int (*launder_folio)(struct folio *); bool (*is_partially_uptodate)(struct folio *, size_t from, size_t count); - int (*error_remove_page)(struct address_space *, struct page *); + int (*error_remove_folio)(struct address_space *, struct folio *); int (*swap_activate)(struct swap_info_struct *sis, struct file *f, sector_t *span) int (*swap_deactivate)(struct file *); int (*swap_rw)(struct kiocb *iocb, struct iov_iter *iter); @@ -290,7 +290,7 @@ direct_IO: migrate_folio: yes (both) launder_folio: yes is_partially_uptodate: yes -error_remove_page: yes +error_remove_folio: yes swap_activate: no swap_deactivate: no swap_rw: yes, unlocks diff --git a/Documentation/filesystems/vfs.rst b/Documentation/filesystems/vfs.rst index 99acc2e9867391a917269bfb4a5fb0a3f940bc7e..dd99ce5912d85967581ca382e35df112af4ed6e1 100644 --- a/Documentation/filesystems/vfs.rst +++ b/Documentation/filesystems/vfs.rst @@ -823,7 +823,7 @@ cache in your filesystem. The following members are defined: bool (*is_partially_uptodate) (struct folio *, size_t from, size_t count); void (*is_dirty_writeback)(struct folio *, bool *, bool *); - int (*error_remove_page) (struct mapping *mapping, struct page *page); + int (*error_remove_folio)(struct mapping *mapping, struct folio *); int (*swap_activate)(struct swap_info_struct *sis, struct file *f, sector_t *span) int (*swap_deactivate)(struct file *); int (*swap_rw)(struct kiocb *iocb, struct iov_iter *iter); @@ -1034,8 +1034,8 @@ cache in your filesystem. The following members are defined: VM if a folio should be treated as dirty or writeback for the purposes of stalling. -``error_remove_page`` - normally set to generic_error_remove_page if truncation is ok +``error_remove_folio`` + normally set to generic_error_remove_folio if truncation is ok for this address space. Used for memory failure handling. Setting this implies you deal with pages going away under you, unless you have them locked or reference counts increased. diff --git a/block/fops.c b/block/fops.c index 6d0ac712846180b5e6b19508e7d0b2bb76259ffa..5c7c486f7e0b3597030a5382cf46f6b454752684 100644 --- a/block/fops.c +++ b/block/fops.c @@ -500,7 +500,7 @@ const struct address_space_operations def_blk_aops = { .readahead = blkdev_readahead, .writepages = blkdev_writepages, .is_partially_uptodate = iomap_is_partially_uptodate, - .error_remove_page = generic_error_remove_page, + .error_remove_folio = generic_error_remove_folio, .migrate_folio = filemap_migrate_folio, }; #endif /* CONFIG_BUFFER_HEAD */ @@ -631,7 +631,7 @@ blkdev_direct_write(struct kiocb *iocb, struct iov_iter *from) static ssize_t blkdev_buffered_write(struct kiocb *iocb, struct iov_iter *from) { - return iomap_file_buffered_write(iocb, from, &blkdev_iomap_ops); + return iomap_file_buffered_write(iocb, from, &blkdev_iomap_ops, NULL); } /* diff --git a/fs/afs/write.c b/fs/afs/write.c index 948db2be26ec339a26839e55db0db67fac30c7bd..f4396bf39c0f28af4892c4cbe0f832089616a16b 100644 --- a/fs/afs/write.c +++ b/fs/afs/write.c @@ -242,7 +242,7 @@ static void afs_kill_pages(struct address_space *mapping, folio_clear_uptodate(folio); folio_end_writeback(folio); folio_lock(folio); - generic_error_remove_page(mapping, &folio->page); + generic_error_remove_folio(mapping, folio); folio_unlock(folio); folio_put(folio); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index eb9319d856f2d8b090d4320168e14be4527975f1..00314fbe531f66b81187dfb1dfdf59b0a7dff4bf 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -10987,7 +10987,7 @@ static const struct address_space_operations btrfs_aops = { .release_folio = btrfs_release_folio, .migrate_folio = btrfs_migrate_folio, .dirty_folio = filemap_dirty_folio, - .error_remove_page = generic_error_remove_page, + .error_remove_folio = generic_error_remove_folio, .swap_activate = btrfs_swap_activate, .swap_deactivate = btrfs_swap_deactivate, }; diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 2c92de964c5a2ee55cc82385aac63e3cb8b34352..c7b403dfa05f649b63ab456625a6f09c4573ed96 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -899,8 +899,8 @@ static void writepages_finish(struct ceph_osd_request *req) dout("unlocking %p\n", page); if (remove_page) - generic_error_remove_page(inode->i_mapping, - page); + generic_error_remove_folio(inode->i_mapping, + page_folio(page)); unlock_page(page); } diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index 314b415ee51860ebca5f8e49507bfcd4e0883029..df6278cee499aaf73eed9d86507088a2ac72ed52 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -969,7 +969,7 @@ const struct address_space_operations ext2_aops = { .writepages = ext2_writepages, .migrate_folio = buffer_migrate_folio, .is_partially_uptodate = block_is_partially_uptodate, - .error_remove_page = generic_error_remove_page, + .error_remove_folio = generic_error_remove_folio, }; static const struct address_space_operations ext2_dax_aops = { diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index e8e323d2daaee60359661b420151d2376452daf0..bc336b74a6381c237e45cd9711ad380244acf96b 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -3619,7 +3619,7 @@ static const struct address_space_operations ext4_aops = { .direct_IO = noop_direct_IO, .migrate_folio = buffer_migrate_folio, .is_partially_uptodate = block_is_partially_uptodate, - .error_remove_page = generic_error_remove_page, + .error_remove_folio = generic_error_remove_folio, .swap_activate = ext4_iomap_swap_activate, }; @@ -3636,7 +3636,7 @@ static const struct address_space_operations ext4_journalled_aops = { .direct_IO = noop_direct_IO, .migrate_folio = buffer_migrate_folio_norefs, .is_partially_uptodate = block_is_partially_uptodate, - .error_remove_page = generic_error_remove_page, + .error_remove_folio = generic_error_remove_folio, .swap_activate = ext4_iomap_swap_activate, }; @@ -3653,7 +3653,7 @@ static const struct address_space_operations ext4_da_aops = { .direct_IO = noop_direct_IO, .migrate_folio = buffer_migrate_folio, .is_partially_uptodate = block_is_partially_uptodate, - .error_remove_page = generic_error_remove_page, + .error_remove_folio = generic_error_remove_folio, .swap_activate = ext4_iomap_swap_activate, }; diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c index f7ef69f44f3d844b3774797e788fa0d6b48b2147..0f969b013bac0cf4bdc12b73714616e7adbafd5d 100644 --- a/fs/f2fs/compress.c +++ b/fs/f2fs/compress.c @@ -1971,7 +1971,7 @@ void f2fs_invalidate_compress_pages(struct f2fs_sb_info *sbi, nid_t ino) continue; } - generic_error_remove_page(mapping, &folio->page); + generic_error_remove_folio(mapping, folio); folio_unlock(folio); } folio_batch_release(&fbatch); diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 7ad4a9241759178a5d31c705e6539074e2f455b0..535a201f6bd17bee71ffdb25ef5268a731140a7a 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -585,7 +585,7 @@ struct inode *f2fs_iget(struct super_block *sb, unsigned long ino) #ifdef CONFIG_F2FS_FS_COMPRESSION inode->i_mapping->a_ops = &f2fs_compress_aops; /* - * generic_error_remove_page only truncates pages of regular + * generic_error_remove_folio only truncates pages of regular * inode */ inode->i_mode |= S_IFREG; diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c index 6097db9a7ebf3da1720f75aa669921ac57a5e86b..0495d14807014b62776e1dda2872ea81d81c63e2 100644 --- a/fs/gfs2/aops.c +++ b/fs/gfs2/aops.c @@ -750,7 +750,7 @@ static const struct address_space_operations gfs2_aops = { .bmap = gfs2_bmap, .migrate_folio = filemap_migrate_folio, .is_partially_uptodate = iomap_is_partially_uptodate, - .error_remove_page = generic_error_remove_page, + .error_remove_folio = generic_error_remove_folio, }; static const struct address_space_operations gfs2_jdata_aops = { @@ -763,7 +763,7 @@ static const struct address_space_operations gfs2_jdata_aops = { .invalidate_folio = gfs2_invalidate_folio, .release_folio = gfs2_release_folio, .is_partially_uptodate = block_is_partially_uptodate, - .error_remove_page = generic_error_remove_page, + .error_remove_folio = generic_error_remove_folio, }; void gfs2_set_aops(struct inode *inode) diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index 9296e0e282bcd8d39d697234e8bcdc3702ed3f4f..3b24e54d9aed8901d12b9a02f0093918155f698c 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -1056,7 +1056,7 @@ static ssize_t gfs2_file_buffered_write(struct kiocb *iocb, } pagefault_disable(); - ret = iomap_file_buffered_write(iocb, from, &gfs2_iomap_ops); + ret = iomap_file_buffered_write(iocb, from, &gfs2_iomap_ops, NULL); pagefault_enable(); if (ret > 0) written += ret; diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 36ac4e536bcb86df0e78efc0e90ded18cbb00fb3..1831ac39eb9e714b3adb3e3190cff1a2e3890853 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -1166,8 +1166,8 @@ static int hugetlbfs_migrate_folio(struct address_space *mapping, #define hugetlbfs_migrate_folio NULL #endif -static int hugetlbfs_error_remove_page(struct address_space *mapping, - struct page *page) +static int hugetlbfs_error_remove_folio(struct address_space *mapping, + struct folio *folio) { return 0; } @@ -1325,7 +1325,7 @@ static const struct address_space_operations hugetlbfs_aops = { .write_end = hugetlbfs_write_end, .dirty_folio = noop_dirty_folio, .migrate_folio = hugetlbfs_migrate_folio, - .error_remove_page = hugetlbfs_error_remove_page, + .error_remove_folio = hugetlbfs_error_remove_folio, }; diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index 509ae770e07a7caf2b75f94975e79f3112170659..990483e22fbd472e23cb5f3e21d6f332057369e1 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -23,7 +23,6 @@ #define IOEND_BATCH_SIZE 4096 -typedef int (*iomap_punch_t)(struct inode *inode, loff_t offset, loff_t length); /* * Structure allocated for each folio to track per-block uptodate, dirty state * and I/O completions. @@ -824,12 +823,11 @@ static int iomap_write_begin(struct iomap_iter *iter, loff_t pos, out_unlock: __iomap_put_folio(iter, pos, 0, folio); - iomap_write_failed(iter->inode, pos, len); return status; } -static size_t __iomap_write_end(struct inode *inode, loff_t pos, size_t len, +static bool __iomap_write_end(struct inode *inode, loff_t pos, size_t len, size_t copied, struct folio *folio) { flush_dcache_folio(folio); @@ -846,14 +844,14 @@ static size_t __iomap_write_end(struct inode *inode, loff_t pos, size_t len, * redo the whole thing. */ if (unlikely(copied < len && !folio_test_uptodate(folio))) - return 0; + return false; iomap_set_range_uptodate(folio, offset_in_folio(folio, pos), len); iomap_set_range_dirty(folio, offset_in_folio(folio, pos), copied); filemap_dirty_folio(inode->i_mapping, folio); - return copied; + return true; } -static size_t iomap_write_end_inline(const struct iomap_iter *iter, +static void iomap_write_end_inline(const struct iomap_iter *iter, struct folio *folio, loff_t pos, size_t copied) { const struct iomap *iomap = &iter->iomap; @@ -868,49 +866,39 @@ static size_t iomap_write_end_inline(const struct iomap_iter *iter, kunmap_local(addr); mark_inode_dirty(iter->inode); - return copied; } -/* Returns the number of bytes copied. May be 0. Cannot be an errno. */ -static size_t iomap_write_end(struct iomap_iter *iter, loff_t pos, size_t len, +/* + * Returns true if all copied bytes have been written to the pagecache, + * otherwise return false. + */ +static bool iomap_write_end(struct iomap_iter *iter, loff_t pos, size_t len, size_t copied, struct folio *folio) { const struct iomap *srcmap = iomap_iter_srcmap(iter); - loff_t old_size = iter->inode->i_size; - size_t ret; if (srcmap->type == IOMAP_INLINE) { - ret = iomap_write_end_inline(iter, folio, pos, copied); - } else if (srcmap->flags & IOMAP_F_BUFFER_HEAD) { - ret = block_write_end(NULL, iter->inode->i_mapping, pos, len, - copied, &folio->page, NULL); - } else { - ret = __iomap_write_end(iter->inode, pos, len, copied, folio); + iomap_write_end_inline(iter, folio, pos, copied); + return true; } - /* - * Update the in-memory inode size after copying the data into the page - * cache. It's up to the file system to write the updated size to disk, - * preferably after I/O completion so that no stale data is exposed. - */ - if (pos + ret > old_size) { - i_size_write(iter->inode, pos + ret); - iter->iomap.flags |= IOMAP_F_SIZE_CHANGED; + if (srcmap->flags & IOMAP_F_BUFFER_HEAD) { + size_t bh_written; + + bh_written = block_write_end(NULL, iter->inode->i_mapping, pos, + len, copied, &folio->page, NULL); + WARN_ON_ONCE(bh_written != copied && bh_written != 0); + return bh_written == copied; } - __iomap_put_folio(iter, pos, ret, folio); - if (old_size < pos) - pagecache_isize_extended(iter->inode, old_size, pos); - if (ret < len) - iomap_write_failed(iter->inode, pos + ret, len - ret); - return ret; + return __iomap_write_end(iter->inode, pos, len, copied, folio); } static loff_t iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i) { loff_t length = iomap_length(iter); loff_t pos = iter->pos; - ssize_t written = 0; + ssize_t total_written = 0; long status = 0; struct address_space *mapping = iter->inode->i_mapping; size_t chunk = mapping_max_folio_size(mapping); @@ -918,9 +906,11 @@ static loff_t iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i) do { struct folio *folio; + loff_t old_size; size_t offset; /* Offset into folio */ size_t bytes; /* Bytes to write to folio */ size_t copied; /* Bytes copied from user */ + size_t written; /* Bytes have been written */ bytes = iov_iter_count(i); retry: @@ -950,8 +940,10 @@ static loff_t iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i) } status = iomap_write_begin(iter, pos, bytes, &folio); - if (unlikely(status)) + if (unlikely(status)) { + iomap_write_failed(iter->inode, pos, bytes); break; + } if (iter->iomap.flags & IOMAP_F_STALE) break; @@ -963,19 +955,37 @@ static loff_t iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i) flush_dcache_folio(folio); copied = copy_folio_from_iter_atomic(folio, offset, bytes, i); - status = iomap_write_end(iter, pos, bytes, copied, folio); + written = iomap_write_end(iter, pos, bytes, copied, folio) ? + copied : 0; - if (unlikely(copied != status)) - iov_iter_revert(i, copied - status); + /* + * Update the in-memory inode size after copying the data into + * the page cache. It's up to the file system to write the + * updated size to disk, preferably after I/O completion so that + * no stale data is exposed. Only once that's done can we + * unlock and release the folio. + */ + old_size = iter->inode->i_size; + if (pos + written > old_size) { + i_size_write(iter->inode, pos + written); + iter->iomap.flags |= IOMAP_F_SIZE_CHANGED; + } + __iomap_put_folio(iter, pos, written, folio); + + if (old_size < pos) + pagecache_isize_extended(iter->inode, old_size, pos); cond_resched(); - if (unlikely(status == 0)) { + if (unlikely(written == 0)) { /* * A short copy made iomap_write_end() reject the * thing entirely. Might be memory poisoning * halfway through, might be a race with munmap, * might be severe memory pressure. */ + iomap_write_failed(iter->inode, pos, bytes); + iov_iter_revert(i, copied); + if (chunk > PAGE_SIZE) chunk /= 2; if (copied) { @@ -983,28 +993,29 @@ static loff_t iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i) goto retry; } } else { - pos += status; - written += status; - length -= status; + pos += written; + total_written += written; + length -= written; } } while (iov_iter_count(i) && length); if (status == -EAGAIN) { - iov_iter_revert(i, written); + iov_iter_revert(i, total_written); return -EAGAIN; } - return written ? written : status; + return total_written ? total_written : status; } ssize_t iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *i, - const struct iomap_ops *ops) + const struct iomap_ops *ops, void *private) { struct iomap_iter iter = { .inode = iocb->ki_filp->f_mapping->host, .pos = iocb->ki_pos, .len = iov_iter_count(i), .flags = IOMAP_WRITE, + .private = private, }; ssize_t ret; @@ -1024,15 +1035,14 @@ iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *i, } EXPORT_SYMBOL_GPL(iomap_file_buffered_write); -static int iomap_write_delalloc_ifs_punch(struct inode *inode, +static void iomap_write_delalloc_ifs_punch(struct inode *inode, struct folio *folio, loff_t start_byte, loff_t end_byte, - iomap_punch_t punch) + struct iomap *iomap, iomap_punch_t punch) { unsigned int first_blk, last_blk, i; loff_t last_byte; u8 blkbits = inode->i_blkbits; struct iomap_folio_state *ifs; - int ret = 0; /* * When we have per-block dirty tracking, there can be @@ -1042,47 +1052,35 @@ static int iomap_write_delalloc_ifs_punch(struct inode *inode, */ ifs = folio->private; if (!ifs) - return ret; + return; last_byte = min_t(loff_t, end_byte - 1, folio_pos(folio) + folio_size(folio) - 1); first_blk = offset_in_folio(folio, start_byte) >> blkbits; last_blk = offset_in_folio(folio, last_byte) >> blkbits; for (i = first_blk; i <= last_blk; i++) { - if (!ifs_block_is_dirty(folio, ifs, i)) { - ret = punch(inode, folio_pos(folio) + (i << blkbits), - 1 << blkbits); - if (ret) - return ret; - } + if (!ifs_block_is_dirty(folio, ifs, i)) + punch(inode, folio_pos(folio) + (i << blkbits), + 1 << blkbits, iomap); } - - return ret; } - -static int iomap_write_delalloc_punch(struct inode *inode, struct folio *folio, +static void iomap_write_delalloc_punch(struct inode *inode, struct folio *folio, loff_t *punch_start_byte, loff_t start_byte, loff_t end_byte, - iomap_punch_t punch) + struct iomap *iomap, iomap_punch_t punch) { - int ret = 0; - if (!folio_test_dirty(folio)) - return ret; + return; /* if dirty, punch up to offset */ if (start_byte > *punch_start_byte) { - ret = punch(inode, *punch_start_byte, - start_byte - *punch_start_byte); - if (ret) - return ret; + punch(inode, *punch_start_byte, start_byte - *punch_start_byte, + iomap); } /* Punch non-dirty blocks within folio */ - ret = iomap_write_delalloc_ifs_punch(inode, folio, start_byte, - end_byte, punch); - if (ret) - return ret; + iomap_write_delalloc_ifs_punch(inode, folio, start_byte, end_byte, + iomap, punch); /* * Make sure the next punch start is correctly bound to @@ -1090,8 +1088,6 @@ static int iomap_write_delalloc_punch(struct inode *inode, struct folio *folio, */ *punch_start_byte = min_t(loff_t, end_byte, folio_pos(folio) + folio_size(folio)); - - return ret; } /* @@ -1111,13 +1107,12 @@ static int iomap_write_delalloc_punch(struct inode *inode, struct folio *folio, * This function uses [start_byte, end_byte) intervals (i.e. open ended) to * simplify range iterations. */ -static int iomap_write_delalloc_scan(struct inode *inode, +static void iomap_write_delalloc_scan(struct inode *inode, loff_t *punch_start_byte, loff_t start_byte, loff_t end_byte, - iomap_punch_t punch) + struct iomap *iomap, iomap_punch_t punch) { while (start_byte < end_byte) { struct folio *folio; - int ret; /* grab locked page */ folio = filemap_lock_folio(inode->i_mapping, @@ -1128,27 +1123,47 @@ static int iomap_write_delalloc_scan(struct inode *inode, continue; } - ret = iomap_write_delalloc_punch(inode, folio, punch_start_byte, - start_byte, end_byte, punch); - if (ret) { - folio_unlock(folio); - folio_put(folio); - return ret; - } + iomap_write_delalloc_punch(inode, folio, punch_start_byte, + start_byte, end_byte, iomap, punch); /* move offset to start of next folio in range */ start_byte = folio_next_index(folio) << PAGE_SHIFT; folio_unlock(folio); folio_put(folio); } - return 0; } /* + * When a short write occurs, the filesystem might need to use ->iomap_end + * to remove space reservations created in ->iomap_begin. + * + * For filesystems that use delayed allocation, there can be dirty pages over + * the delalloc extent outside the range of a short write but still within the + * delalloc extent allocated for this iomap if the write raced with page + * faults. + * * Punch out all the delalloc blocks in the range given except for those that * have dirty data still pending in the page cache - those are going to be * written and so must still retain the delalloc backing for writeback. * + * The punch() callback *must* only punch delalloc extents in the range passed + * to it. It must skip over all other types of extents in the range and leave + * them completely unchanged. It must do this punch atomically with respect to + * other extent modifications. + * + * The punch() callback may be called with a folio locked to prevent writeback + * extent allocation racing at the edge of the range we are currently punching. + * The locked folio may or may not cover the range being punched, so it is not + * safe for the punch() callback to lock folios itself. + * + * Lock order is: + * + * inode->i_rwsem (shared or exclusive) + * inode->i_mapping->invalidate_lock (exclusive) + * folio_lock() + * ->punch + * internal filesystem allocation lock + * * As we are scanning the page cache for data, we don't need to reimplement the * wheel - mapping_seek_hole_data() does exactly what we need to identify the * start and end of data ranges correctly even for sub-folio block sizes. This @@ -1177,20 +1192,21 @@ static int iomap_write_delalloc_scan(struct inode *inode, * require sprinkling this code with magic "+ 1" and "- 1" arithmetic and expose * the code to subtle off-by-one bugs.... */ -static int iomap_write_delalloc_release(struct inode *inode, - loff_t start_byte, loff_t end_byte, iomap_punch_t punch) +void iomap_write_delalloc_release(struct inode *inode, loff_t start_byte, + loff_t end_byte, unsigned flags, struct iomap *iomap, + iomap_punch_t punch) { loff_t punch_start_byte = start_byte; loff_t scan_end_byte = min(i_size_read(inode), end_byte); - int error = 0; /* - * Lock the mapping to avoid races with page faults re-instantiating - * folios and dirtying them via ->page_mkwrite whilst we walk the - * cache and perform delalloc extent removal. Failing to do this can - * leave dirty pages with no space reservation in the cache. + * The caller must hold invalidate_lock to avoid races with page faults + * re-instantiating folios and dirtying them via ->page_mkwrite whilst + * we walk the cache and perform delalloc extent removal. Failing to do + * this can leave dirty pages with no space reservation in the cache. */ - filemap_invalidate_lock(inode->i_mapping); + lockdep_assert_held_write(&inode->i_mapping->invalidate_lock); + while (start_byte < scan_end_byte) { loff_t data_end; @@ -1199,13 +1215,15 @@ static int iomap_write_delalloc_release(struct inode *inode, /* * If there is no more data to scan, all that is left is to * punch out the remaining range. + * + * Note that mapping_seek_hole_data is only supposed to return + * either an offset or -ENXIO, so WARN on any other error as + * that would be an API change without updating the callers. */ if (start_byte == -ENXIO || start_byte == scan_end_byte) break; - if (start_byte < 0) { - error = start_byte; - goto out_unlock; - } + if (WARN_ON_ONCE(start_byte < 0)) + return; WARN_ON_ONCE(start_byte < punch_start_byte); WARN_ON_ONCE(start_byte > scan_end_byte); @@ -1215,10 +1233,8 @@ static int iomap_write_delalloc_release(struct inode *inode, */ data_end = mapping_seek_hole_data(inode->i_mapping, start_byte, scan_end_byte, SEEK_HOLE); - if (data_end < 0) { - error = data_end; - goto out_unlock; - } + if (WARN_ON_ONCE(data_end < 0)) + return; /* * If we race with post-direct I/O invalidation of the page cache, @@ -1230,87 +1246,18 @@ static int iomap_write_delalloc_release(struct inode *inode, WARN_ON_ONCE(data_end < start_byte); WARN_ON_ONCE(data_end > scan_end_byte); - error = iomap_write_delalloc_scan(inode, &punch_start_byte, - start_byte, data_end, punch); - if (error) - goto out_unlock; + iomap_write_delalloc_scan(inode, &punch_start_byte, start_byte, + data_end, iomap, punch); /* The next data search starts at the end of this one. */ start_byte = data_end; } if (punch_start_byte < end_byte) - error = punch(inode, punch_start_byte, - end_byte - punch_start_byte); -out_unlock: - filemap_invalidate_unlock(inode->i_mapping); - return error; -} - -/* - * When a short write occurs, the filesystem may need to remove reserved space - * that was allocated in ->iomap_begin from it's ->iomap_end method. For - * filesystems that use delayed allocation, we need to punch out delalloc - * extents from the range that are not dirty in the page cache. As the write can - * race with page faults, there can be dirty pages over the delalloc extent - * outside the range of a short write but still within the delalloc extent - * allocated for this iomap. - * - * This function uses [start_byte, end_byte) intervals (i.e. open ended) to - * simplify range iterations. - * - * The punch() callback *must* only punch delalloc extents in the range passed - * to it. It must skip over all other types of extents in the range and leave - * them completely unchanged. It must do this punch atomically with respect to - * other extent modifications. - * - * The punch() callback may be called with a folio locked to prevent writeback - * extent allocation racing at the edge of the range we are currently punching. - * The locked folio may or may not cover the range being punched, so it is not - * safe for the punch() callback to lock folios itself. - * - * Lock order is: - * - * inode->i_rwsem (shared or exclusive) - * inode->i_mapping->invalidate_lock (exclusive) - * folio_lock() - * ->punch - * internal filesystem allocation lock - */ -int iomap_file_buffered_write_punch_delalloc(struct inode *inode, - struct iomap *iomap, loff_t pos, loff_t length, - ssize_t written, iomap_punch_t punch) -{ - loff_t start_byte; - loff_t end_byte; - unsigned int blocksize = i_blocksize(inode); - - if (iomap->type != IOMAP_DELALLOC) - return 0; - - /* If we didn't reserve the blocks, we're not allowed to punch them. */ - if (!(iomap->flags & IOMAP_F_NEW)) - return 0; - - /* - * start_byte refers to the first unused block after a short write. If - * nothing was written, round offset down to point at the first block in - * the range. - */ - if (unlikely(!written)) - start_byte = round_down(pos, blocksize); - else - start_byte = round_up(pos + written, blocksize); - end_byte = round_up(pos + length, blocksize); - - /* Nothing to do if we've written the entire delalloc extent */ - if (start_byte >= end_byte) - return 0; - - return iomap_write_delalloc_release(inode, start_byte, end_byte, - punch); + punch(inode, punch_start_byte, end_byte - punch_start_byte, + iomap); } -EXPORT_SYMBOL_GPL(iomap_file_buffered_write_punch_delalloc); +EXPORT_SYMBOL_GPL(iomap_write_delalloc_release); static loff_t iomap_unshare_iter(struct iomap_iter *iter) { @@ -1327,6 +1274,7 @@ static loff_t iomap_unshare_iter(struct iomap_iter *iter) int status; size_t offset; size_t bytes = min_t(u64, SIZE_MAX, length); + bool ret; status = iomap_write_begin(iter, pos, bytes, &folio); if (unlikely(status)) @@ -1338,8 +1286,9 @@ static loff_t iomap_unshare_iter(struct iomap_iter *iter) if (bytes > folio_size(folio) - offset) bytes = folio_size(folio) - offset; - bytes = iomap_write_end(iter, pos, bytes, bytes, folio); - if (WARN_ON_ONCE(bytes == 0)) + ret = iomap_write_end(iter, pos, bytes, bytes, folio); + __iomap_put_folio(iter, pos, bytes, folio); + if (WARN_ON_ONCE(!ret)) return -EIO; cond_resched(); @@ -1392,6 +1341,7 @@ static loff_t iomap_zero_iter(struct iomap_iter *iter, bool *did_zero) int status; size_t offset; size_t bytes = min_t(u64, SIZE_MAX, length); + bool ret; status = iomap_write_begin(iter, pos, bytes, &folio); if (status) @@ -1406,8 +1356,9 @@ static loff_t iomap_zero_iter(struct iomap_iter *iter, bool *did_zero) folio_zero_range(folio, offset, bytes); folio_mark_accessed(folio); - bytes = iomap_write_end(iter, pos, bytes, bytes, folio); - if (WARN_ON_ONCE(bytes == 0)) + ret = iomap_write_end(iter, pos, bytes, bytes, folio); + __iomap_put_folio(iter, pos, bytes, folio); + if (WARN_ON_ONCE(!ret)) return -EIO; pos += bytes; diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 3f9768810427df609c74cf5a077e6748cc5019ce..e8cccb94b9277e90ae057c1f178cf064ad6b7189 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -567,7 +567,7 @@ const struct address_space_operations nfs_file_aops = { .migrate_folio = nfs_migrate_folio, .launder_folio = nfs_launder_folio, .is_dirty_writeback = nfs_check_dirty_writeback, - .error_remove_page = generic_error_remove_page, + .error_remove_folio = generic_error_remove_folio, .swap_activate = nfs_swap_activate, .swap_deactivate = nfs_swap_deactivate, .swap_rw = nfs_swap_rw, diff --git a/fs/ntfs/aops.c b/fs/ntfs/aops.c index 4e158bce419295d84b3d71a64d2c9a3df4949314..fea577007f4f24dec1d641d07d34f44060034f63 100644 --- a/fs/ntfs/aops.c +++ b/fs/ntfs/aops.c @@ -1661,7 +1661,7 @@ const struct address_space_operations ntfs_normal_aops = { .bmap = ntfs_bmap, .migrate_folio = buffer_migrate_folio, .is_partially_uptodate = block_is_partially_uptodate, - .error_remove_page = generic_error_remove_page, + .error_remove_folio = generic_error_remove_folio, }; /* @@ -1675,7 +1675,7 @@ const struct address_space_operations ntfs_compressed_aops = { #endif /* NTFS_RW */ .migrate_folio = buffer_migrate_folio, .is_partially_uptodate = block_is_partially_uptodate, - .error_remove_page = generic_error_remove_page, + .error_remove_folio = generic_error_remove_folio, }; /* @@ -1690,7 +1690,7 @@ const struct address_space_operations ntfs_mst_aops = { #endif /* NTFS_RW */ .migrate_folio = buffer_migrate_folio, .is_partially_uptodate = block_is_partially_uptodate, - .error_remove_page = generic_error_remove_page, + .error_remove_folio = generic_error_remove_folio, }; #ifdef NTFS_RW diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index 315f7c2f6a02c159b66580ab0323108c1845e770..030fa248864928df19dadc8bc466a1d46f32ba35 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -2483,5 +2483,5 @@ const struct address_space_operations ocfs2_aops = { .release_folio = ocfs2_release_folio, .migrate_folio = buffer_migrate_folio, .is_partially_uptodate = block_is_partially_uptodate, - .error_remove_page = generic_error_remove_page, + .error_remove_folio = generic_error_remove_folio, }; diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 82f18f28c1c1a976027f39fe9ba88fa60bdd5ec7..a50f443ad1fd85a41ba3611052c434e27f29c227 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -116,7 +116,7 @@ xfs_end_ioend( if (unlikely(error)) { if (ioend->io_flags & IOMAP_F_SHARED) { xfs_reflink_cancel_cow_range(ip, offset, size, true); - xfs_bmap_punch_delalloc_range(ip, offset, + xfs_bmap_punch_delalloc_range(ip, XFS_DATA_FORK, offset, offset + size); } goto done; @@ -457,7 +457,7 @@ xfs_discard_folio( * byte of the next folio. Hence the end offset is only dependent on the * folio itself and not the start offset that is passed in. */ - error = xfs_bmap_punch_delalloc_range(ip, pos, + error = xfs_bmap_punch_delalloc_range(ip, XFS_DATA_FORK, pos, folio_pos(folio) + folio_size(folio)); if (error && !xfs_is_shutdown(mp)) @@ -552,7 +552,7 @@ const struct address_space_operations xfs_address_space_operations = { .bmap = xfs_vm_bmap, .migrate_folio = filemap_migrate_folio, .is_partially_uptodate = iomap_is_partially_uptodate, - .error_remove_page = generic_error_remove_page, + .error_remove_folio = generic_error_remove_folio, .swap_activate = xfs_iomap_swapfile_activate, }; diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index b7a0ca7ca4e6e96e94fd666aa3e5988fd1b32f90..92ee94397fa67e93b24089d53fac858e0b479c4d 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -590,11 +590,12 @@ xfs_getbmap( int xfs_bmap_punch_delalloc_range( struct xfs_inode *ip, + int whichfork, xfs_off_t start_byte, xfs_off_t end_byte) { struct xfs_mount *mp = ip->i_mount; - struct xfs_ifork *ifp = &ip->i_df; + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork); xfs_fileoff_t start_fsb = XFS_B_TO_FSBT(mp, start_byte); xfs_fileoff_t end_fsb = XFS_B_TO_FSB(mp, end_byte); struct xfs_bmbt_irec got, del; @@ -623,12 +624,14 @@ xfs_bmap_punch_delalloc_range( continue; } - error = xfs_bmap_del_extent_delay(ip, XFS_DATA_FORK, &icur, + error = xfs_bmap_del_extent_delay(ip, whichfork, &icur, &got, &del); if (error || !xfs_iext_get_extent(ifp, &icur, &got)) break; } + if (whichfork == XFS_COW_FORK && !ifp->if_bytes) + xfs_inode_clear_cowblocks_tag(ip); out_unlock: xfs_iunlock(ip, XFS_ILOCK_EXCL); return error; @@ -740,7 +743,7 @@ xfs_free_eofblocks( */ if (ip->i_diflags & (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)) { if (ip->i_delayed_blks) { - xfs_bmap_punch_delalloc_range(ip, + xfs_bmap_punch_delalloc_range(ip, XFS_DATA_FORK, round_up(XFS_ISIZE(ip), mp->m_sb.sb_blocksize), LLONG_MAX); } diff --git a/fs/xfs/xfs_bmap_util.h b/fs/xfs/xfs_bmap_util.h index 1383019ccdb755f6f055ea264fed416b27aa4cd5..06dd0915fc67576c4710e6eb1a1f9a5c626eb697 100644 --- a/fs/xfs/xfs_bmap_util.h +++ b/fs/xfs/xfs_bmap_util.h @@ -30,7 +30,7 @@ xfs_bmap_rtalloc(struct xfs_bmalloca *ap) } #endif /* CONFIG_XFS_RT */ -int xfs_bmap_punch_delalloc_range(struct xfs_inode *ip, +int xfs_bmap_punch_delalloc_range(struct xfs_inode *ip, int whichfork, xfs_off_t start_byte, xfs_off_t end_byte); struct kgetbmap { diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index d04302d85823729610beb5add669e1ef302420ac..79cbdb482f8e9e52a3801b83ee31c945bc3877fd 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -364,10 +364,83 @@ xfs_file_splice_read( return ret; } +/* + * Take care of zeroing post-EOF blocks when they might exist. + * + * Returns 0 if successfully, a negative error for a failure, or 1 if this + * function dropped the iolock and reacquired it exclusively and the caller + * needs to restart the write sanity checks. + */ +static ssize_t +xfs_file_write_zero_eof( + struct kiocb *iocb, + struct iov_iter *from, + unsigned int *iolock, + size_t count, + bool *drained_dio) +{ + struct xfs_inode *ip = XFS_I(iocb->ki_filp->f_mapping->host); + loff_t isize; + int error; + + /* + * We need to serialise against EOF updates that occur in IO completions + * here. We want to make sure that nobody is changing the size while + * we do this check until we have placed an IO barrier (i.e. hold + * XFS_IOLOCK_EXCL) that prevents new IO from being dispatched. The + * spinlock effectively forms a memory barrier once we have + * XFS_IOLOCK_EXCL so we are guaranteed to see the latest EOF value and + * hence be able to correctly determine if we need to run zeroing. + */ + spin_lock(&ip->i_flags_lock); + isize = i_size_read(VFS_I(ip)); + if (iocb->ki_pos <= isize) { + spin_unlock(&ip->i_flags_lock); + return 0; + } + spin_unlock(&ip->i_flags_lock); + + if (iocb->ki_flags & IOCB_NOWAIT) + return -EAGAIN; + + if (!*drained_dio) { + /* + * If zeroing is needed and we are currently holding the iolock + * shared, we need to update it to exclusive which implies + * having to redo all checks before. + */ + if (*iolock == XFS_IOLOCK_SHARED) { + xfs_iunlock(ip, *iolock); + *iolock = XFS_IOLOCK_EXCL; + xfs_ilock(ip, *iolock); + iov_iter_reexpand(from, count); + } + + /* + * We now have an IO submission barrier in place, but AIO can do + * EOF updates during IO completion and hence we now need to + * wait for all of them to drain. Non-AIO DIO will have drained + * before we are given the XFS_IOLOCK_EXCL, and so for most + * cases this wait is a no-op. + */ + inode_dio_wait(VFS_I(ip)); + *drained_dio = true; + return 1; + } + + trace_xfs_zero_eof(ip, isize, iocb->ki_pos - isize); + + xfs_ilock(ip, XFS_MMAPLOCK_EXCL); + error = xfs_zero_range(ip, isize, iocb->ki_pos - isize, NULL); + xfs_iunlock(ip, XFS_MMAPLOCK_EXCL); + + return error; +} + /* * Common pre-write limit and setup checks. * - * Called with the iolocked held either shared and exclusive according to + * Called with the iolock held either shared and exclusive according to * @iolock, and returns with it held. Might upgrade the iolock to exclusive * if called for a direct write beyond i_size. */ @@ -377,13 +450,10 @@ xfs_file_write_checks( struct iov_iter *from, unsigned int *iolock) { - struct file *file = iocb->ki_filp; - struct inode *inode = file->f_mapping->host; - struct xfs_inode *ip = XFS_I(inode); - ssize_t error = 0; + struct inode *inode = iocb->ki_filp->f_mapping->host; size_t count = iov_iter_count(from); bool drained_dio = false; - loff_t isize; + ssize_t error; restart: error = generic_write_checks(iocb, from); @@ -406,7 +476,7 @@ xfs_file_write_checks( * exclusively. */ if (*iolock == XFS_IOLOCK_SHARED && !IS_NOSEC(inode)) { - xfs_iunlock(ip, *iolock); + xfs_iunlock(XFS_I(inode), *iolock); *iolock = XFS_IOLOCK_EXCL; error = xfs_ilock_iocb(iocb, *iolock); if (error) { @@ -417,64 +487,24 @@ xfs_file_write_checks( } /* - * If the offset is beyond the size of the file, we need to zero any + * If the offset is beyond the size of the file, we need to zero all * blocks that fall between the existing EOF and the start of this - * write. If zeroing is needed and we are currently holding the iolock - * shared, we need to update it to exclusive which implies having to - * redo all checks before. - * - * We need to serialise against EOF updates that occur in IO completions - * here. We want to make sure that nobody is changing the size while we - * do this check until we have placed an IO barrier (i.e. hold the - * XFS_IOLOCK_EXCL) that prevents new IO from being dispatched. The - * spinlock effectively forms a memory barrier once we have the - * XFS_IOLOCK_EXCL so we are guaranteed to see the latest EOF value and - * hence be able to correctly determine if we need to run zeroing. + * write. * - * We can do an unlocked check here safely as IO completion can only - * extend EOF. Truncate is locked out at this point, so the EOF can - * not move backwards, only forwards. Hence we only need to take the - * slow path and spin locks when we are at or beyond the current EOF. + * We can do an unlocked check for i_size here safely as I/O completion + * can only extend EOF. Truncate is locked out at this point, so the + * EOF can not move backwards, only forwards. Hence we only need to take + * the slow path when we are at or beyond the current EOF. */ - if (iocb->ki_pos <= i_size_read(inode)) - goto out; - - spin_lock(&ip->i_flags_lock); - isize = i_size_read(inode); - if (iocb->ki_pos > isize) { - spin_unlock(&ip->i_flags_lock); - - if (iocb->ki_flags & IOCB_NOWAIT) - return -EAGAIN; - - if (!drained_dio) { - if (*iolock == XFS_IOLOCK_SHARED) { - xfs_iunlock(ip, *iolock); - *iolock = XFS_IOLOCK_EXCL; - xfs_ilock(ip, *iolock); - iov_iter_reexpand(from, count); - } - /* - * We now have an IO submission barrier in place, but - * AIO can do EOF updates during IO completion and hence - * we now need to wait for all of them to drain. Non-AIO - * DIO will have drained before we are given the - * XFS_IOLOCK_EXCL, and so for most cases this wait is a - * no-op. - */ - inode_dio_wait(inode); - drained_dio = true; + if (iocb->ki_pos > i_size_read(inode)) { + error = xfs_file_write_zero_eof(iocb, from, iolock, count, + &drained_dio); + if (error == 1) goto restart; - } - - trace_xfs_zero_eof(ip, isize, iocb->ki_pos - isize); - error = xfs_zero_range(ip, isize, iocb->ki_pos - isize, NULL); if (error) return error; - } else - spin_unlock(&ip->i_flags_lock); + } -out: return kiocb_modified(iocb); } @@ -777,7 +807,7 @@ xfs_file_buffered_write( trace_xfs_file_buffered_write(iocb, from); ret = iomap_file_buffered_write(iocb, from, - &xfs_buffered_write_iomap_ops); + &xfs_buffered_write_iomap_ops, NULL); /* * If we hit a space limit, try to free up some lingering preallocated diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 9ce2f48b4ebc01005d675e1bf8196a9b616d6bc3..0e36cd0e1247e3262ffcfad0823373730cc8b3b0 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -956,6 +956,7 @@ xfs_buffered_write_iomap_begin( int allocfork = XFS_DATA_FORK; int error = 0; unsigned int lockmode = XFS_ILOCK_EXCL; + unsigned int iomap_flags = 0; u64 seq; if (xfs_is_shutdown(mp)) @@ -1127,6 +1128,11 @@ xfs_buffered_write_iomap_begin( } } + /* + * Flag newly allocated delalloc blocks with IOMAP_F_NEW so we punch + * them out if the write happens to fail. + */ + iomap_flags |= IOMAP_F_NEW; if (allocfork == XFS_COW_FORK) { error = xfs_bmapi_reserve_delalloc(ip, allocfork, offset_fsb, end_fsb - offset_fsb, prealloc_blocks, &cmap, @@ -1144,19 +1150,11 @@ xfs_buffered_write_iomap_begin( if (error) goto out_unlock; - /* - * Flag newly allocated delalloc blocks with IOMAP_F_NEW so we punch - * them out if the write happens to fail. - */ - seq = xfs_iomap_inode_sequence(ip, IOMAP_F_NEW); - xfs_iunlock(ip, lockmode); trace_xfs_iomap_alloc(ip, offset, count, allocfork, &imap); - return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, IOMAP_F_NEW, seq); - found_imap: - seq = xfs_iomap_inode_sequence(ip, 0); + seq = xfs_iomap_inode_sequence(ip, iomap_flags); xfs_iunlock(ip, lockmode); - return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, 0, seq); + return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, iomap_flags, seq); convert_delay: xfs_iunlock(ip, lockmode); @@ -1170,34 +1168,37 @@ xfs_buffered_write_iomap_begin( return 0; found_cow: - seq = xfs_iomap_inode_sequence(ip, 0); if (imap.br_startoff <= offset_fsb) { - error = xfs_bmbt_to_iomap(ip, srcmap, &imap, flags, 0, seq); + error = xfs_bmbt_to_iomap(ip, srcmap, &imap, flags, 0, + xfs_iomap_inode_sequence(ip, 0)); if (error) goto out_unlock; - seq = xfs_iomap_inode_sequence(ip, IOMAP_F_SHARED); - xfs_iunlock(ip, lockmode); - return xfs_bmbt_to_iomap(ip, iomap, &cmap, flags, - IOMAP_F_SHARED, seq); + } else { + xfs_trim_extent(&cmap, offset_fsb, + imap.br_startoff - offset_fsb); } - xfs_trim_extent(&cmap, offset_fsb, imap.br_startoff - offset_fsb); + iomap_flags |= IOMAP_F_SHARED; + seq = xfs_iomap_inode_sequence(ip, iomap_flags); xfs_iunlock(ip, lockmode); - return xfs_bmbt_to_iomap(ip, iomap, &cmap, flags, 0, seq); + return xfs_bmbt_to_iomap(ip, iomap, &cmap, flags, iomap_flags, seq); out_unlock: xfs_iunlock(ip, lockmode); return error; } -static int +static void xfs_buffered_write_delalloc_punch( struct inode *inode, loff_t offset, - loff_t length) + loff_t length, + struct iomap *iomap) { - return xfs_bmap_punch_delalloc_range(XFS_I(inode), offset, - offset + length); + xfs_bmap_punch_delalloc_range(XFS_I(inode), + (iomap->flags & IOMAP_F_SHARED) ? + XFS_COW_FORK : XFS_DATA_FORK, + offset, offset + length); } static int @@ -1209,17 +1210,30 @@ xfs_buffered_write_iomap_end( unsigned flags, struct iomap *iomap) { + loff_t start_byte, end_byte; - struct xfs_mount *mp = XFS_M(inode->i_sb); - int error; + /* If we didn't reserve the blocks, we're not allowed to punch them. */ + if (iomap->type != IOMAP_DELALLOC || !(iomap->flags & IOMAP_F_NEW)) + return 0; - error = iomap_file_buffered_write_punch_delalloc(inode, iomap, offset, - length, written, &xfs_buffered_write_delalloc_punch); - if (error && !xfs_is_shutdown(mp)) { - xfs_alert(mp, "%s: unable to clean up ino 0x%llx", - __func__, XFS_I(inode)->i_ino); - return error; + /* Nothing to do if we've written the entire delalloc extent */ + start_byte = iomap_last_written_block(inode, offset, written); + end_byte = round_up(offset + length, i_blocksize(inode)); + if (start_byte >= end_byte) + return 0; + + /* For zeroing operations the callers already hold invalidate_lock. */ + if (flags & (IOMAP_UNSHARE | IOMAP_ZERO)) { + rwsem_assert_held_write(&inode->i_mapping->invalidate_lock); + iomap_write_delalloc_release(inode, start_byte, end_byte, flags, + iomap, xfs_buffered_write_delalloc_punch); + } else { + filemap_invalidate_lock(inode->i_mapping); + iomap_write_delalloc_release(inode, start_byte, end_byte, flags, + iomap, xfs_buffered_write_delalloc_punch); + filemap_invalidate_unlock(inode->i_mapping); } + return 0; } @@ -1426,6 +1440,8 @@ xfs_zero_range( { struct inode *inode = VFS_I(ip); + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL | XFS_MMAPLOCK_EXCL)); + if (IS_DAX(inode)) return dax_zero_range(inode, pos, len, did_zero, &xfs_dax_write_iomap_ops); diff --git a/fs/zonefs/file.c b/fs/zonefs/file.c index e66de2191245ddfbfd8e13baff24045fdc0b97f5..35166c92420c481737f08f6ec27b8d6e6d9fc71a 100644 --- a/fs/zonefs/file.c +++ b/fs/zonefs/file.c @@ -181,7 +181,7 @@ const struct address_space_operations zonefs_file_aops = { .invalidate_folio = iomap_invalidate_folio, .migrate_folio = filemap_migrate_folio, .is_partially_uptodate = iomap_is_partially_uptodate, - .error_remove_page = generic_error_remove_page, + .error_remove_folio = generic_error_remove_folio, .swap_activate = zonefs_swap_activate, }; @@ -563,7 +563,7 @@ static ssize_t zonefs_file_buffered_write(struct kiocb *iocb, if (ret <= 0) goto inode_unlock; - ret = iomap_file_buffered_write(iocb, from, &zonefs_write_iomap_ops); + ret = iomap_file_buffered_write(iocb, from, &zonefs_write_iomap_ops, NULL); if (ret == -EIO) zonefs_io_error(inode, true); diff --git a/include/linux/fs.h b/include/linux/fs.h index 3ab9e7bfb808453d97313437e2b82204d33c9d6d..9344646389497b82c3968d38ad5f4132fef99118 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -429,7 +429,7 @@ struct address_space_operations { bool (*is_partially_uptodate) (struct folio *, size_t from, size_t count); void (*is_dirty_writeback) (struct folio *, bool *dirty, bool *wb); - int (*error_remove_page)(struct address_space *, struct page *); + int (*error_remove_folio)(struct address_space *, struct folio *); /* swapfile support */ int (*swap_activate)(struct swap_info_struct *sis, struct file *file, diff --git a/include/linux/iomap.h b/include/linux/iomap.h index 958e177f53020a83509a0a4f387bdd162112f9a4..524fd1d5c5e9144161f83992dbd46c25ae828cce 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -276,12 +276,22 @@ static inline bool iomap_want_unshare_iter(const struct iomap_iter *iter) iter->srcmap.type == IOMAP_MAPPED; } -ssize_t iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *from, - const struct iomap_ops *ops); -int iomap_file_buffered_write_punch_delalloc(struct inode *inode, - struct iomap *iomap, loff_t pos, loff_t length, ssize_t written, - int (*punch)(struct inode *inode, loff_t pos, loff_t length)); +/* + * Return the file offset for the first unchanged block after a short write. + * + * If nothing was written, round @pos down to point at the first block in + * the range, else round up to include the partially written block. + */ +static inline loff_t iomap_last_written_block(struct inode *inode, loff_t pos, + ssize_t written) +{ + if (unlikely(!written)) + return round_down(pos, i_blocksize(inode)); + return round_up(pos + written, i_blocksize(inode)); +} +ssize_t iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *from, + const struct iomap_ops *ops, void *private); int iomap_read_folio(struct folio *folio, const struct iomap_ops *ops); void iomap_readahead(struct readahead_control *, const struct iomap_ops *ops); bool iomap_is_partially_uptodate(struct folio *, size_t from, size_t count); @@ -297,6 +307,13 @@ int iomap_truncate_page(struct inode *inode, loff_t pos, bool *did_zero, const struct iomap_ops *ops); vm_fault_t iomap_page_mkwrite(struct vm_fault *vmf, const struct iomap_ops *ops); + +typedef void (*iomap_punch_t)(struct inode *inode, loff_t offset, loff_t length, + struct iomap *iomap); +void iomap_write_delalloc_release(struct inode *inode, loff_t start_byte, + loff_t end_byte, unsigned flags, struct iomap *iomap, + iomap_punch_t punch); + int iomap_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, u64 start, u64 len, const struct iomap_ops *ops); loff_t iomap_seek_hole(struct inode *inode, loff_t offset, diff --git a/include/linux/mm.h b/include/linux/mm.h index bf62c528a09a771f83c09dfe3ad35319e54153cc..577fbdae99882ae39f48d39bc37ffb7133c8fc63 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2445,7 +2445,8 @@ extern void truncate_pagecache(struct inode *inode, loff_t new); extern void truncate_setsize(struct inode *inode, loff_t newsize); void pagecache_isize_extended(struct inode *inode, loff_t from, loff_t to); void truncate_pagecache_range(struct inode *inode, loff_t offset, loff_t end); -int generic_error_remove_page(struct address_space *mapping, struct page *page); +int generic_error_remove_folio(struct address_space *mapping, + struct folio *folio); struct vm_area_struct *lock_mm_and_find_vma(struct mm_struct *mm, unsigned long address, struct pt_regs *regs); diff --git a/include/linux/rwbase_rt.h b/include/linux/rwbase_rt.h index 1d264dd086250e74b43df78a1266d3eb5219eba0..29c4e4f243e47d580945626da4a172e2ecff0c5b 100644 --- a/include/linux/rwbase_rt.h +++ b/include/linux/rwbase_rt.h @@ -26,12 +26,17 @@ struct rwbase_rt { } while (0) -static __always_inline bool rw_base_is_locked(struct rwbase_rt *rwb) +static __always_inline bool rw_base_is_locked(const struct rwbase_rt *rwb) { return atomic_read(&rwb->readers) != READER_BIAS; } -static __always_inline bool rw_base_is_contended(struct rwbase_rt *rwb) +static inline void rw_base_assert_held_write(const struct rwbase_rt *rwb) +{ + WARN_ON(atomic_read(&rwb->readers) != WRITER_BIAS); +} + +static __always_inline bool rw_base_is_contended(const struct rwbase_rt *rwb) { return atomic_read(&rwb->readers) > 0; } diff --git a/include/linux/rwsem.h b/include/linux/rwsem.h index 9c29689ff505e09854c93eb449665a31145e3df2..4f1c18992f768fe67faffa139f259e8213a93f9e 100644 --- a/include/linux/rwsem.h +++ b/include/linux/rwsem.h @@ -66,14 +66,24 @@ struct rw_semaphore { #endif }; -/* In all implementations count != 0 means locked */ +#define RWSEM_UNLOCKED_VALUE 0UL +#define RWSEM_WRITER_LOCKED (1UL << 0) +#define __RWSEM_COUNT_INIT(name) .count = ATOMIC_LONG_INIT(RWSEM_UNLOCKED_VALUE) + static inline int rwsem_is_locked(struct rw_semaphore *sem) { - return atomic_long_read(&sem->count) != 0; + return atomic_long_read(&sem->count) != RWSEM_UNLOCKED_VALUE; } -#define RWSEM_UNLOCKED_VALUE 0L -#define __RWSEM_COUNT_INIT(name) .count = ATOMIC_LONG_INIT(RWSEM_UNLOCKED_VALUE) +static inline void rwsem_assert_held_nolockdep(const struct rw_semaphore *sem) +{ + WARN_ON(atomic_long_read(&sem->count) == RWSEM_UNLOCKED_VALUE); +} + +static inline void rwsem_assert_held_write_nolockdep(const struct rw_semaphore *sem) +{ + WARN_ON(!(atomic_long_read(&sem->count) & RWSEM_WRITER_LOCKED)); +} /* Common initializer macros and functions */ @@ -152,11 +162,21 @@ do { \ __init_rwsem((sem), #sem, &__key); \ } while (0) -static __always_inline int rwsem_is_locked(struct rw_semaphore *sem) +static __always_inline int rwsem_is_locked(const struct rw_semaphore *sem) { return rw_base_is_locked(&sem->rwbase); } +static inline void rwsem_assert_held_nolockdep(const struct rw_semaphore *sem) +{ + WARN_ON(!rwsem_is_locked(sem)); +} + +static inline void rwsem_assert_held_write_nolockdep(const struct rw_semaphore *sem) +{ + rw_base_assert_held_write(sem); +} + static __always_inline int rwsem_is_contended(struct rw_semaphore *sem) { return rw_base_is_contended(&sem->rwbase); @@ -169,6 +189,22 @@ static __always_inline int rwsem_is_contended(struct rw_semaphore *sem) * the RT specific variant. */ +static inline void rwsem_assert_held(const struct rw_semaphore *sem) +{ + if (IS_ENABLED(CONFIG_LOCKDEP)) + lockdep_assert_held(sem); + else + rwsem_assert_held_nolockdep(sem); +} + +static inline void rwsem_assert_held_write(const struct rw_semaphore *sem) +{ + if (IS_ENABLED(CONFIG_LOCKDEP)) + lockdep_assert_held_write(sem); + else + rwsem_assert_held_write_nolockdep(sem); +} + /* * lock for reading */ diff --git a/mm/internal.h b/mm/internal.h index 4d116823a6f31c8a76d2dccd6f755bef042e173f..6cd124cd175a8c6764e63a8b7a503a8a1cfc5dac 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -368,6 +368,7 @@ void filemap_free_folio(struct address_space *mapping, struct folio *folio); int truncate_inode_folio(struct address_space *mapping, struct folio *folio); bool truncate_inode_partial_folio(struct folio *folio, loff_t start, loff_t end); +long mapping_evict_folio(struct address_space *mapping, struct folio *folio); long invalidate_inode_page(struct page *page); unsigned long mapping_try_invalidate(struct address_space *mapping, pgoff_t start, pgoff_t end, unsigned long *nr_failed); diff --git a/mm/memory-failure.c b/mm/memory-failure.c index c2acb4f9bac3f344408c3c8aac8ed76cf9bbd940..718fdc612971a32784c50ddaa880a6af865acbf0 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -919,39 +919,38 @@ static const char * const action_page_types[] = { * The page count will stop it from being freed by unpoison. * Stress tests should be aware of this memory leak problem. */ -static int delete_from_lru_cache(struct page *p) +static int delete_from_lru_cache(struct folio *folio) { - if (isolate_lru_page(p)) { + if (folio_isolate_lru(folio)) { /* * Clear sensible page flags, so that the buddy system won't - * complain when the page is unpoison-and-freed. + * complain when the folio is unpoison-and-freed. */ - ClearPageActive(p); - ClearPageUnevictable(p); + folio_clear_active(folio); + folio_clear_unevictable(folio); /* * Poisoned page might never drop its ref count to 0 so we have * to uncharge it manually from its memcg. */ - mem_cgroup_uncharge(page_folio(p)); + mem_cgroup_uncharge(folio); /* - * drop the page count elevated by isolate_lru_page() + * drop the refcount elevated by folio_isolate_lru() */ - put_page(p); + folio_put(folio); return 0; } return -EIO; } -static int truncate_error_page(struct page *p, unsigned long pfn, +static int truncate_error_folio(struct folio *folio, unsigned long pfn, struct address_space *mapping) { int ret = MF_FAILED; - if (mapping->a_ops->error_remove_page) { - struct folio *folio = page_folio(p); - int err = mapping->a_ops->error_remove_page(mapping, p); + if (mapping->a_ops->error_remove_folio) { + int err = mapping->a_ops->error_remove_folio(mapping, folio); if (err != 0) pr_info("%#lx: Failed to punch page: %d\n", pfn, err); @@ -964,7 +963,7 @@ static int truncate_error_page(struct page *p, unsigned long pfn, * If the file system doesn't support it just invalidate * This fails on dirty or anything with private pages */ - if (invalidate_inode_page(p)) + if (mapping_evict_folio(mapping, folio)) ret = MF_RECOVERED; else pr_info("%#lx: Failed to invalidate\n", pfn); @@ -1031,17 +1030,18 @@ static int me_unknown(struct page_state *ps, struct page *p) */ static int me_pagecache_clean(struct page_state *ps, struct page *p) { + struct folio *folio = page_folio(p); int ret; struct address_space *mapping; bool extra_pins; - delete_from_lru_cache(p); + delete_from_lru_cache(folio); /* - * For anonymous pages we're done the only reference left + * For anonymous folios the only reference left * should be the one m_f() holds. */ - if (PageAnon(p)) { + if (folio_test_anon(folio)) { ret = MF_RECOVERED; goto out; } @@ -1053,11 +1053,9 @@ static int me_pagecache_clean(struct page_state *ps, struct page *p) * has a reference, because it could be file system metadata * and that's not safe to truncate. */ - mapping = page_mapping(p); + mapping = folio_mapping(folio); if (!mapping) { - /* - * Page has been teared down in the meanwhile - */ + /* Folio has been torn down in the meantime */ ret = MF_FAILED; goto out; } @@ -1073,12 +1071,12 @@ static int me_pagecache_clean(struct page_state *ps, struct page *p) * * Open: to take i_rwsem or not for this? Right now we don't. */ - ret = truncate_error_page(p, page_to_pfn(p), mapping); + ret = truncate_error_folio(folio, page_to_pfn(p), mapping); if (has_extra_refcount(ps, p, extra_pins)) ret = MF_FAILED; out: - unlock_page(p); + folio_unlock(folio); return ret; } @@ -1156,15 +1154,16 @@ static int me_pagecache_dirty(struct page_state *ps, struct page *p) */ static int me_swapcache_dirty(struct page_state *ps, struct page *p) { + struct folio *folio = page_folio(p); int ret; bool extra_pins = false; - ClearPageDirty(p); + folio_clear_dirty(folio); /* Trigger EIO in shmem: */ - ClearPageUptodate(p); + folio_clear_uptodate(folio); - ret = delete_from_lru_cache(p) ? MF_FAILED : MF_DELAYED; - unlock_page(p); + ret = delete_from_lru_cache(folio) ? MF_FAILED : MF_DELAYED; + folio_unlock(folio); if (ret == MF_DELAYED) extra_pins = true; @@ -1182,7 +1181,7 @@ static int me_swapcache_clean(struct page_state *ps, struct page *p) delete_from_swap_cache(folio); - ret = delete_from_lru_cache(p) ? MF_FAILED : MF_RECOVERED; + ret = delete_from_lru_cache(folio) ? MF_FAILED : MF_RECOVERED; folio_unlock(folio); if (has_extra_refcount(ps, p, false)) @@ -1206,7 +1205,7 @@ static int me_huge_page(struct page_state *ps, struct page *p) mapping = folio_mapping(folio); if (mapping) { - res = truncate_error_page(&folio->page, page_to_pfn(p), mapping); + res = truncate_error_folio(folio, page_to_pfn(p), mapping); /* The page is kept in page cache. */ extra_pins = true; folio_unlock(folio); diff --git a/mm/shmem.c b/mm/shmem.c index 247b20ae91af62c7e8fde246946a5b926cc5b6ff..8d11b93b1ce7a2f9ba09c0b0405612c03eb7637c 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -4976,8 +4976,8 @@ static void shmem_destroy_inodecache(void) } /* Keep the page in page cache instead of truncating it */ -static int shmem_error_remove_page(struct address_space *mapping, - struct page *page) +static int shmem_error_remove_folio(struct address_space *mapping, + struct folio *folio) { return 0; } @@ -4992,7 +4992,7 @@ const struct address_space_operations shmem_aops = { #ifdef CONFIG_MIGRATION .migrate_folio = migrate_folio, #endif - .error_remove_page = shmem_error_remove_page, + .error_remove_folio = shmem_error_remove_folio, }; EXPORT_SYMBOL(shmem_aops); diff --git a/mm/truncate.c b/mm/truncate.c index d0e4716e65389d06d61ab89e026393d65c7c2ad1..cda5f6623a8211ffa841bf0c25c1fc17b0289daf 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -251,10 +251,9 @@ bool truncate_inode_partial_folio(struct folio *folio, loff_t start, loff_t end) /* * Used to get rid of pages on hardware memory corruption. */ -int generic_error_remove_page(struct address_space *mapping, struct page *page) +int generic_error_remove_folio(struct address_space *mapping, + struct folio *folio) { - VM_BUG_ON_PAGE(PageTail(page), page); - if (!mapping) return -EINVAL; /* @@ -263,13 +262,26 @@ int generic_error_remove_page(struct address_space *mapping, struct page *page) */ if (!S_ISREG(mapping->host->i_mode)) return -EIO; - return truncate_inode_folio(mapping, page_folio(page)); + return truncate_inode_folio(mapping, folio); } -EXPORT_SYMBOL(generic_error_remove_page); +EXPORT_SYMBOL(generic_error_remove_folio); -static long mapping_evict_folio(struct address_space *mapping, - struct folio *folio) +/** + * mapping_evict_folio() - Remove an unused folio from the page-cache. + * @mapping: The mapping this folio belongs to. + * @folio: The folio to remove. + * + * Safely remove one folio from the page cache. + * It only drops clean, unused folios. + * + * Context: Folio must be locked. + * Return: The number of pages successfully removed. + */ +long mapping_evict_folio(struct address_space *mapping, struct folio *folio) { + /* The page may have been truncated before it was locked */ + if (!mapping) + return 0; if (folio_test_dirty(folio) || folio_test_writeback(folio)) return 0; /* The refcount will be elevated if any page in the folio is mapped */ @@ -285,25 +297,11 @@ static long mapping_evict_folio(struct address_space *mapping, return remove_mapping(mapping, folio); } -/** - * invalidate_inode_page() - Remove an unused page from the pagecache. - * @page: The page to remove. - * - * Safely invalidate one page from its pagecache mapping. - * It only drops clean, unused pages. - * - * Context: Page must be locked. - * Return: The number of pages successfully removed. - */ long invalidate_inode_page(struct page *page) { struct folio *folio = page_folio(page); - struct address_space *mapping = folio_mapping(folio); - /* The page may have been truncated before it was locked */ - if (!mapping) - return 0; - return mapping_evict_folio(mapping, folio); + return mapping_evict_folio(folio_mapping(folio), folio); } /** diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c index e65f4170425c4cd7460185dccf288d4b794d4d68..eccf0e9011310919099cc75d981b75ab453fe007 100644 --- a/virt/kvm/guest_memfd.c +++ b/virt/kvm/guest_memfd.c @@ -271,7 +271,7 @@ static int kvm_gmem_migrate_folio(struct address_space *mapping, return -EINVAL; } -static int kvm_gmem_error_page(struct address_space *mapping, struct page *page) +static int kvm_gmem_error_folio(struct address_space *mapping, struct folio *folio) { struct list_head *gmem_list = &mapping->private_list; struct kvm_gmem *gmem; @@ -279,8 +279,8 @@ static int kvm_gmem_error_page(struct address_space *mapping, struct page *page) filemap_invalidate_lock_shared(mapping); - start = page->index; - end = start + thp_nr_pages(page); + start = folio->index; + end = start + folio_nr_pages(folio); list_for_each_entry(gmem, gmem_list, entry) kvm_gmem_invalidate_begin(gmem, start, end); @@ -307,7 +307,7 @@ static const struct address_space_operations kvm_gmem_aops = { #ifdef CONFIG_MIGRATION .migrate_folio = kvm_gmem_migrate_folio, #endif - .error_remove_page = kvm_gmem_error_page, + .error_remove_folio = kvm_gmem_error_folio, }; static int kvm_gmem_getattr(struct mnt_idmap *idmap, const struct path *path,