diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 56be85edd62f40a324d87d1071a2ac29597858f3..41dac77e91e7b1b24b7c823283bda1e917674fe1 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -910,6 +910,7 @@ const struct file_operations ext4_file_operations = { .splice_read = generic_file_splice_read, .splice_write = iter_file_splice_write, .fallocate = ext4_fallocate, + .fop_flags = FOP_DONTCACHE, }; const struct inode_operations ext4_file_inode_operations = { diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index ba9e2fa4f1c0f3e9d973da72d0445b0e1b99097a..e2449295baaa6b175148b77f621ba8bfaa9c8ac7 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -1473,6 +1473,7 @@ const struct file_operations xfs_file_operations = { .fallocate = xfs_file_fallocate, .fadvise = xfs_file_fadvise, .remap_file_range = xfs_file_remap_range, + .fop_flags = FOP_DONTCACHE, }; const struct file_operations xfs_dir_file_operations = { diff --git a/include/linux/fs.h b/include/linux/fs.h index 8e9d1c945dc22d9d37cf8433967ee54bcbc94c5d..bfaaa84652166b359726032c1376cbbe5d1f2852 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -312,6 +312,7 @@ enum rw_hint { #define IOCB_SYNC (__force int) RWF_SYNC #define IOCB_NOWAIT (__force int) RWF_NOWAIT #define IOCB_APPEND (__force int) RWF_APPEND +#define IOCB_DONTCACHE (__force int) RWF_DONTCACHE /* non-RWF related bits - start at 16 */ #define IOCB_EVENTFD (1 << 16) @@ -1964,8 +1965,11 @@ struct dir_context { struct iov_iter; struct io_uring_cmd; +typedef unsigned int __bitwise fop_flags_t; + struct file_operations { struct module *owner; + fop_flags_t fop_flags; loff_t (*llseek) (struct file *, loff_t, int); ssize_t (*read) (struct file *, char __user *, size_t, loff_t *); ssize_t (*write) (struct file *, const char __user *, size_t, loff_t *); @@ -2015,6 +2019,8 @@ struct file_operations { CK_KABI_RESERVE(4) } __randomize_layout; +#define FOP_DONTCACHE ((__force fop_flags_t)(1 << 7)) + struct inode_operations { struct dentry * (*lookup) 
(struct inode *,struct dentry *, unsigned int); const char * (*get_link) (struct dentry *, struct inode *, struct delayed_call *); @@ -3462,6 +3468,14 @@ static inline int kiocb_set_rw_flags(struct kiocb *ki, rwf_t flags) return -EOPNOTSUPP; kiocb_flags |= IOCB_NOIO; } + if (flags & RWF_DONTCACHE) { + /* file system must support it */ + if (!(ki->ki_filp->f_op->fop_flags & FOP_DONTCACHE)) + return -EOPNOTSUPP; + /* DAX mappings not supported */ + if (IS_DAX(ki->ki_filp->f_mapping->host)) + return -EOPNOTSUPP; + } kiocb_flags |= (__force int) (flags & RWF_SUPPORTED); if (flags & RWF_SYNC) kiocb_flags |= IOCB_DSYNC; diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index d2022ec30f9917a41016396d26a088d03af9c1ff..7e9a5c5be7f250e137b695a34a90df0aff37b887 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -124,6 +124,7 @@ enum pageflags { PG_reclaim, /* To be reclaimed asap */ PG_swapbacked, /* Page is backed by RAM/swap */ PG_unevictable, /* Page is "unevictable" */ + PG_dropbehind, /* drop pages on IO completion */ #ifdef CONFIG_MMU PG_mlocked, /* Page is vma mlocked */ #endif @@ -483,6 +484,10 @@ PAGEFLAG(Reclaim, reclaim, PF_NO_TAIL) PAGEFLAG(Readahead, reclaim, PF_NO_COMPOUND) TESTCLEARFLAG(Readahead, reclaim, PF_NO_COMPOUND) +PAGEFLAG(Dropbehind, dropbehind, PF_HEAD) + TESTCLEARFLAG(Dropbehind, dropbehind, PF_HEAD) + __SETPAGEFLAG(Dropbehind, dropbehind, PF_HEAD) + #ifdef CONFIG_HIGHMEM /* * Must use a macro here due to header dependency issues.
page_zone() is not diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 46ea8cb4110b0fdac0faa48be952f57d8ba28491..d9fa8811e9ec284ab39b53f8db0f1c85d153180a 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -841,6 +841,7 @@ struct readahead_control { pgoff_t _index; unsigned int _nr_pages; unsigned int _batch_count; + bool dropbehind; CK_KABI_RESERVE(1) CK_KABI_RESERVE(2) diff --git a/include/trace/events/mmflags.h b/include/trace/events/mmflags.h index 8b93b136e10122c219f226c24a291a12b5d21c1d..b6c0043bcee995c08ce422c189d4a1e4c6f10f4a 100644 --- a/include/trace/events/mmflags.h +++ b/include/trace/events/mmflags.h @@ -120,7 +120,8 @@ {1UL << PG_mappedtodisk, "mappedtodisk" }, \ {1UL << PG_reclaim, "reclaim" }, \ {1UL << PG_swapbacked, "swapbacked" }, \ - {1UL << PG_unevictable, "unevictable" } \ + {1UL << PG_unevictable, "unevictable" }, \ + {1UL << PG_dropbehind, "dropbehind" } \ IF_HAVE_PG_MLOCK(PG_mlocked, "mlocked" ) \ IF_HAVE_PG_UNCACHED(PG_uncached, "uncached" ) \ IF_HAVE_PG_HWPOISON(PG_hwpoison, "hwpoison" ) \ diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h index f44eb0a04afdd8cea369af1395c3637a5f69122d..8a505280508caef7f69d88f68256f9a96fe6e5a4 100644 --- a/include/uapi/linux/fs.h +++ b/include/uapi/linux/fs.h @@ -300,8 +300,11 @@ typedef int __bitwise __kernel_rwf_t; /* per-IO O_APPEND */ #define RWF_APPEND ((__force __kernel_rwf_t)0x00000010) +/* buffered IO that drops the cache after reading or writing data */ +#define RWF_DONTCACHE ((__force __kernel_rwf_t)0x00000080) + /* mask of flags supported by the kernel */ #define RWF_SUPPORTED (RWF_HIPRI | RWF_DSYNC | RWF_SYNC | RWF_NOWAIT |\ - RWF_APPEND) + RWF_APPEND | RWF_DONTCACHE) #endif /* _UAPI_LINUX_FS_H */ diff --git a/mm/filemap.c b/mm/filemap.c index fb6f25462df6d9666638ffa0a668f0c3ef0d1f23..0916eb2f09995ca20ad6d902703b70a614ad60e9 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1589,6 +1589,16 @@ void page_endio(struct page *page, bool is_write, int 
err) } EXPORT_SYMBOL_GPL(page_endio); +static void filemap_end_dropbehind(struct page *page) +{ + struct address_space *mapping = page->mapping; + + VM_BUG_ON_PAGE(!PageLocked(page), page); + + if (mapping && !PageWriteback(page) && !PageDirty(page)) + page_unmap_invalidate(mapping, page, 0); +} + /** * __lock_page - get a lock on the page, assuming we need to sleep to get it * @__page: the page to lock @@ -2383,6 +2393,8 @@ generic_file_buffered_read_no_cached_page(struct kiocb *iocb, page = page_cache_alloc(mapping); if (!page) return ERR_PTR(-ENOMEM); + if (iocb->ki_flags & IOCB_DONTCACHE) + __SetPageDropbehind(page); error = add_to_page_cache_lru(page, mapping, index, mapping_gfp_constraint(mapping, GFP_KERNEL)); @@ -2401,7 +2413,6 @@ static int generic_file_buffered_read_get_pages(struct kiocb *iocb, { struct file *filp = iocb->ki_filp; struct address_space *mapping = filp->f_mapping; - struct file_ra_state *ra = &filp->f_ra; pgoff_t index = iocb->ki_pos >> PAGE_SHIFT; pgoff_t last_index = (iocb->ki_pos + iter->count + PAGE_SIZE-1) >> PAGE_SHIFT; int i, j, nr_got, err = 0; @@ -2418,7 +2429,11 @@ static int generic_file_buffered_read_get_pages(struct kiocb *iocb, if (iocb->ki_flags & IOCB_NOIO) return -EAGAIN; - page_cache_sync_readahead(mapping, ra, filp, index, last_index - index); + DEFINE_READAHEAD(ractl, filp, &filp->f_ra, mapping, index); + if (iocb->ki_flags & IOCB_DONTCACHE) + ractl.dropbehind = 1; + + page_cache_sync_ra(&ractl, last_index - index); nr_got = find_get_pages_contig(mapping, index, nr, pages); if (nr_got) @@ -2444,8 +2459,11 @@ static int generic_file_buffered_read_get_pages(struct kiocb *iocb, err = -EAGAIN; break; } - page_cache_async_readahead(mapping, ra, filp, page, - pg_index, last_index - pg_index); + DEFINE_READAHEAD(ractl_async, filp, &filp->f_ra, mapping, pg_index); + if (iocb->ki_flags & IOCB_DONTCACHE) + ractl_async.dropbehind = 1; + + page_cache_async_ra(&ractl_async, page, last_index - pg_index); } if (!PageUptodate(page)) { 
@@ -2487,6 +2505,20 @@ static inline bool pos_same_page(loff_t pos1, loff_t pos2, struct page *page) return (pos1 >> shift == pos2 >> shift); } + +static void filemap_end_dropbehind_read(struct page *page) +{ + if (!PageDropbehind(page)) + return; + if (PageWriteback(page) || PageDirty(page)) + return; + if (trylock_page(page)) { + if (TestClearPageDropbehind(page)) + filemap_end_dropbehind(page); + unlock_page(page); + } +} + /** * generic_file_buffered_read - generic file read routine * @iocb: the iocb to read @@ -2610,8 +2642,12 @@ ssize_t generic_file_buffered_read(struct kiocb *iocb, } } put_pages: - for (i = 0; i < pg_nr; i++) - put_page(pages[i]); + for (i = 0; i < pg_nr; i++) { + struct page *page = pages[i]; + + filemap_end_dropbehind_read(page); + put_page(page); + } } while (iov_iter_count(iter) && iocb->ki_pos < isize && !error); file_accessed(filp); diff --git a/mm/internal.h b/mm/internal.h index 85a392d6b1c3d110f2359e586ab09211ed29bfa1..2acb3e1d742bebd5d3fbe7aa36a548434f06f293 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -50,6 +50,8 @@ void unmap_page_range(struct mmu_gather *tlb, struct vm_area_struct *vma, unsigned long addr, unsigned long end, struct zap_details *details); +int page_unmap_invalidate(struct address_space *mapping, struct page *page, + gfp_t gfp); void do_page_cache_ra(struct readahead_control *, unsigned long nr_to_read, unsigned long lookahead_size); diff --git a/mm/readahead.c b/mm/readahead.c index f5fa99d71a12dd9f9b0d29cc424f349974b86d25..ebf2f4fe0e2233c43781e913b7d61cf0901fa420 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -161,6 +161,18 @@ static void read_pages(struct readahead_control *rac, struct list_head *pages, rac->_index++; } +static struct page *ractl_alloc_page(struct readahead_control *ractl, + gfp_t gfp_mask) +{ + struct page *page; + + page = __page_cache_alloc(gfp_mask); + if (page && ractl->dropbehind) + __SetPageDropbehind(page); + + return page; +} + /** * page_cache_ra_unbounded - Start unchecked 
readahead. * @ractl: Readahead control. @@ -217,7 +229,7 @@ void page_cache_ra_unbounded(struct readahead_control *ractl, continue; } - page = __page_cache_alloc(gfp_mask); + page = ractl_alloc_page(ractl, gfp_mask); if (!page) break; if (mapping->a_ops->readpages) { @@ -694,7 +706,7 @@ void readahead_expand(struct readahead_control *ractl, if (page && !xa_is_value(page)) return; /* Page apparently present */ - page = __page_cache_alloc(gfp_mask); + page = ractl_alloc_page(ractl, gfp_mask); if (!page) return; if (add_to_page_cache_lru(page, mapping, index, gfp_mask) < 0) { @@ -717,7 +729,7 @@ void readahead_expand(struct readahead_control *ractl, if (page && !xa_is_value(page)) return; /* Page apparently present */ - page = __page_cache_alloc(gfp_mask); + page = ractl_alloc_page(ractl, gfp_mask); if (!page) return; if (add_to_page_cache_lru(page, mapping, index, gfp_mask) < 0) { diff --git a/mm/swap.c b/mm/swap.c index 299d09b2d2857e64d233cca5fb7b0fa704fa42e2..86506c16d0b8eb01120fa66cdfc77a0e6df7b540 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -458,6 +458,9 @@ void mark_page_accessed(struct page *page) { page = compound_head(page); + if (PageDropbehind(page)) + return; + if (lru_gen_enabled()) { page_inc_refs(page); return; diff --git a/mm/truncate.c b/mm/truncate.c index 4b85a4d5d0607a8978d2989a4128d01428ac9027..149aa122a04d67dc9dd7e2961d5916a033753771 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -668,6 +668,15 @@ void invalidate_mapping_pagevec(struct address_space *mapping, __invalidate_mapping_pages(mapping, start, end, nr_pagevec); } +static int do_launder_page(struct address_space *mapping, struct page *page) +{ + if (!PageDirty(page)) + return 0; + if (page->mapping != mapping || mapping->a_ops->launder_page == NULL) + return 0; + return mapping->a_ops->launder_page(page); +} + /* * This is like invalidate_complete_page(), except it ignores the page's * refcount. 
We do this because invalidate_inode_pages2() needs stronger @@ -675,16 +684,26 @@ void invalidate_mapping_pagevec(struct address_space *mapping, * shrink_page_list() has a temp ref on them, or because they're transiently * sitting in the lru_cache_add() pagevecs. */ -static int -invalidate_complete_page2(struct address_space *mapping, struct page *page) +int page_unmap_invalidate(struct address_space *mapping, struct page *page, + gfp_t gfp) { unsigned long flags; + int ret; + + VM_BUG_ON_PAGE(!PageLocked(page), page); + + if (page_mapped(page)) + unmap_mapping_page(page); + BUG_ON(page_mapped(page)); + ret = do_launder_page(mapping, page); + if (ret) + return ret; if (page->mapping != mapping) - return 0; + return -EBUSY; - if (page_has_private(page) && !try_to_release_page(page, GFP_KERNEL)) - return 0; + if (page_has_private(page) && !try_to_release_page(page, gfp)) + return -EBUSY; xa_lock_irqsave(&mapping->i_pages, flags); if (PageDirty(page)) @@ -712,15 +731,6 @@ invalidate_complete_page2(struct address_space *mapping, struct page *page) - return 0; + return -EBUSY; } -static int do_launder_page(struct address_space *mapping, struct page *page) -{ - if (!PageDirty(page)) - return 0; - if (page->mapping != mapping || mapping->a_ops->launder_page == NULL) - return 0; - return mapping->a_ops->launder_page(page); -} - /** * invalidate_inode_pages2_range - remove range of pages from an address_space * @mapping: the address_space @@ -784,15 +794,7 @@ int invalidate_inode_pages2_range(struct address_space *mapping, } wait_on_page_writeback(page); - if (page_mapped(page)) - unmap_mapping_page(page); - BUG_ON(page_mapped(page)); - - ret2 = do_launder_page(mapping, page); - if (ret2 == 0) { - if (!invalidate_complete_page2(mapping, page)) - ret2 = -EBUSY; - } + ret2 = page_unmap_invalidate(mapping, page, GFP_KERNEL); if (ret2 < 0) ret = ret2; unlock_page(page);