From 4ff7549520bfba48c310ec6cc17acf746c40a122 Mon Sep 17 00:00:00 2001 From: Jingbo Xu Date: Thu, 1 Sep 2022 15:44:37 +0800 Subject: [PATCH 1/3] anolis: cachefiles: maintain a file descriptor to the backing file ANBZ: #2056 Allocate and initialize a file descriptor for the backing file at the lookup phase, in preparation for the following support for readahead in on-demand mode. This file descriptor is only maintained in on-demand mode. One thing worth noting is that the file descriptor is opened with FMODE_RANDOM, so that the following page_cache_sync_readahead() will fall back to force_page_cache_readahead(). The readahead routine in on-demand mode will then trigger forced readahead on the backing files to read data from them. We'd better make the implementation self-contained, so that the related kernel modules can later be distributed and deployed directly without upgrading the kernel. However, force_page_cache_readahead() is not exported. To work around this, set FMODE_RANDOM on the file descriptor and call page_cache_sync_readahead() instead. Besides, implement the write routine of the anonymous fd with buffered IO instead, which is also facilitated by the pre-allocated file descriptor. 
Signed-off-by: Jingbo Xu Link: https://gitee.com/anolis/cloud-kernel/pulls/692 Reviewed-by: Joseph Qi --- fs/cachefiles/interface.c | 6 ++++++ fs/cachefiles/internal.h | 1 + fs/cachefiles/namei.c | 18 ++++++++++++++++++ fs/cachefiles/ondemand.c | 20 ++------------------ 4 files changed, 27 insertions(+), 18 deletions(-) diff --git a/fs/cachefiles/interface.c b/fs/cachefiles/interface.c index 80a241638452..60b6ca443e8e 100644 --- a/fs/cachefiles/interface.c +++ b/fs/cachefiles/interface.c @@ -308,6 +308,12 @@ static void cachefiles_drop_object(struct fscache_object *_object) object->backer = NULL; } + /* clean up file descriptor for non-index object */ + if (object->file) { + fput(object->file); + object->file = NULL; + } + /* note that the object is now inactive */ if (test_bit(CACHEFILES_OBJECT_ACTIVE, &object->flags)) cachefiles_mark_object_inactive(cache, object, i_blocks); diff --git a/fs/cachefiles/internal.h b/fs/cachefiles/internal.h index 7440c3e4fd14..cc13c460f4b2 100644 --- a/fs/cachefiles/internal.h +++ b/fs/cachefiles/internal.h @@ -39,6 +39,7 @@ struct cachefiles_object { struct cachefiles_lookup_data *lookup_data; /* cached lookup data */ struct dentry *dentry; /* the file/dir representing this object */ struct dentry *backer; /* backing file */ + struct file *file; /* backing file in on-demand mode */ loff_t i_size; /* object size */ unsigned long flags; #define CACHEFILES_OBJECT_ACTIVE 0 /* T if marked active */ diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c index 22a409669fd0..3c7168d0beec 100644 --- a/fs/cachefiles/namei.c +++ b/fs/cachefiles/namei.c @@ -705,6 +705,24 @@ int cachefiles_walk_to_object(struct cachefiles_object *parent, if (object->dentry->d_sb->s_blocksize > PAGE_SIZE) goto check_error; + if (cachefiles_in_ondemand_mode(cache)) { + struct path path; + struct file *file; + + path.mnt = cache->mnt; + path.dentry = object->dentry; + file = dentry_open(&path, O_RDWR | O_LARGEFILE, + cache->cache_cred); + if (IS_ERR(file)) + 
goto check_error; + /* + * so that page_cache_sync_readahead() will fallback + * to force_page_cache_readahead() + */ + file->f_mode |= FMODE_RANDOM; + object->file = file; + } + object->backer = object->dentry; } else { BUG(); // TODO: open file in data-class subdir diff --git a/fs/cachefiles/ondemand.c b/fs/cachefiles/ondemand.c index ad3330f9e3d1..a7c3e5a6e0b9 100644 --- a/fs/cachefiles/ondemand.c +++ b/fs/cachefiles/ondemand.c @@ -51,30 +51,14 @@ static ssize_t cachefiles_ondemand_fd_write_iter(struct kiocb *kiocb, struct iov_iter *iter) { struct cachefiles_object *object = kiocb->ki_filp->private_data; - struct cachefiles_cache *cache; size_t len = iter->count; loff_t pos = kiocb->ki_pos; - struct path path; - struct file *file; int ret; - if (!object->backer) - return -ENOBUFS; - - cache = container_of(object->fscache.cache, - struct cachefiles_cache, cache); - - /* write data to the backing filesystem and let it store it in its - * own time */ - path.mnt = cache->mnt; - path.dentry = object->backer; - file = dentry_open(&path, O_RDWR | O_LARGEFILE | O_DIRECT, - cache->cache_cred); - if (IS_ERR(file)) + if (!object->file) return -ENOBUFS; - ret = vfs_iter_write(file, iter, &pos, 0); - fput(file); + ret = vfs_iter_write(object->file, iter, &pos, 0); if (ret != len) return -EIO; return len; -- Gitee From ee8211a46fb77c917a7f771060351d5fc1fd7c9a Mon Sep 17 00:00:00 2001 From: Jingbo Xu Date: Thu, 25 Aug 2022 17:23:34 +0800 Subject: [PATCH 2/3] anolis: fscache,cachefiles: add fscache_prepare_read() helper ANBZ: #2056 Fscache/CacheFiles offer fscache_read_or_alloc_pages() to implement the readahead routine of filesystems using fscache. The implementation of fscache_read_or_alloc_pages() will call .readpage() on each backpage, in which case each backpage will generate an IO request. The performance bottleneck is not an issue when fscache is used as the local cache for network filesystems. However it is not the case for filesystems using fscache in on-demand mode. 
This patch introduces a new helper fscache_prepare_read() for this use. It first checks if there's any hole inside the requested range, and triggers on-demand read if there's any. This step ensures that all the data is ready there for the requested range. Then it triggers an asynchronous readahead for the backing file. Since FMODE_RANDOM, the following page_cache_sync_readahead() will fallback to force_page_cache_readahead(). At last it will start a synchronous buffer read on the backing file. Since the asynchronous readahead, the following buffer read will find the page cache up to date most times. The buffer read is handled in the context of workers, so that the readahead routine will not be blocked in the synchronous buffer read. Signed-off-by: Jingbo Xu Link: https://gitee.com/anolis/cloud-kernel/pulls/692 Reviewed-by: Joseph Qi --- fs/cachefiles/interface.c | 1 + fs/cachefiles/internal.h | 1 + fs/cachefiles/rdwr.c | 162 ++++++++++++++++++++++++++++++++++ fs/fscache/page.c | 66 ++++++++++++++ include/linux/fscache-cache.h | 5 ++ include/linux/fscache.h | 20 +++++ 6 files changed, 255 insertions(+) diff --git a/fs/cachefiles/interface.c b/fs/cachefiles/interface.c index 60b6ca443e8e..634e7041c0f3 100644 --- a/fs/cachefiles/interface.c +++ b/fs/cachefiles/interface.c @@ -573,6 +573,7 @@ const struct fscache_cache_ops cachefiles_cache_ops = { .attr_changed = cachefiles_attr_changed, .read_or_alloc_page = cachefiles_read_or_alloc_page, .read_or_alloc_pages = cachefiles_read_or_alloc_pages, + .prepare_read = cachefiles_prepare_read, .allocate_page = cachefiles_allocate_page, .allocate_pages = cachefiles_allocate_pages, .write_page = cachefiles_write_page, diff --git a/fs/cachefiles/internal.h b/fs/cachefiles/internal.h index cc13c460f4b2..a787fe8ef8cc 100644 --- a/fs/cachefiles/internal.h +++ b/fs/cachefiles/internal.h @@ -240,6 +240,7 @@ extern int cachefiles_read_or_alloc_page(struct fscache_retrieval *, extern int cachefiles_read_or_alloc_pages(struct 
fscache_retrieval *, struct list_head *, unsigned *, gfp_t); +extern int cachefiles_prepare_read(struct fscache_retrieval *op, pgoff_t index); extern int cachefiles_allocate_page(struct fscache_retrieval *, struct page *, gfp_t); extern int cachefiles_allocate_pages(struct fscache_retrieval *, diff --git a/fs/cachefiles/rdwr.c b/fs/cachefiles/rdwr.c index 7cfbbeee9e87..0e1992bedf71 100644 --- a/fs/cachefiles/rdwr.c +++ b/fs/cachefiles/rdwr.c @@ -9,6 +9,8 @@ #include #include #include +#include +#include #include "internal.h" /* @@ -793,6 +795,166 @@ int cachefiles_read_or_alloc_pages(struct fscache_retrieval *op, return -ENOBUFS; } +static int cachefiles_ondemand_check(struct cachefiles_object *object, + loff_t start_pos, size_t len) +{ + struct file *file = object->file; + size_t remained; + loff_t pos; + int ret; + + /* make sure there's no hole in the requested range */ + pos = start_pos; + remained = len; + + while (remained) { + bool again = true; + size_t count = remained; + loff_t off, off2, new_pos; +retry: + off = vfs_llseek(file, pos, SEEK_DATA); + if (off < 0) { + if (off == (loff_t)-ENXIO) + goto ondemand_read; + return -ENODATA; + } + + if (off >= pos + remained) + goto ondemand_read; + + if (off > pos) { + count = off - pos; + goto ondemand_read; + } + + off2 = vfs_llseek(file, pos, SEEK_HOLE); + if (off2 < 0) + return -ENODATA; + + new_pos = min_t(loff_t, off2, pos + remained); + remained -= new_pos - pos; + pos = new_pos; + continue; +ondemand_read: + if (again) { + ret = cachefiles_ondemand_read(object, pos, count); + if (!ret) { + /* recheck if the hole has been filled or not */ + again = false; + goto retry; + } + } + return -ENODATA; + } + return 0; +} + +struct cachefiles_kiocb { + struct kiocb iocb; + struct fscache_retrieval *op; + struct iov_iter iter; + struct work_struct work; + struct bio_vec bvs[]; +}; + +void cachefiles_readpages_work_func(struct work_struct *work) +{ + struct cachefiles_kiocb *ki = container_of(work, struct 
cachefiles_kiocb, work); + int ret; + + ret = vfs_iocb_iter_read(ki->iocb.ki_filp, &ki->iocb, &ki->iter); + /* complete the request if there's any progress or error occurred */ + if (ret != -EIOCBQUEUED) { + struct fscache_retrieval *op = ki->op; + unsigned int nr_pages = atomic_read(&op->n_pages); + unsigned int done_pages = 0; + int i, error; + + if (ret > 0) + done_pages = ret / PAGE_SIZE; + + for (i = 0; i < nr_pages; i++) { + error = i < done_pages ? 0 : -EIO; + fscache_end_io(op, ki->bvs[i].bv_page, error); + } + + fscache_retrieval_complete(op, nr_pages); + fscache_put_retrieval(op); + kfree(ki); + } +} + +int cachefiles_prepare_read(struct fscache_retrieval *op, pgoff_t index) +{ + struct cachefiles_object *object; + struct cachefiles_kiocb *ki; + loff_t start_pos = op->offset; + unsigned int n, nr_pages = atomic_read(&op->n_pages); + size_t len = nr_pages << PAGE_SHIFT; + struct page **pages; + size_t size; + int i, ret; + + object = container_of(op->op.object, struct cachefiles_object, fscache); + if (!object->backer) + goto all_enobufs; + + /* + * 1. Check if there's hole in the requested range, and trigger an + * on-demand read request if there's any. + */ + ASSERT(start_pos % PAGE_SIZE == 0); + ret = cachefiles_ondemand_check(object, start_pos, len); + if (ret) + goto all_enobufs; + + /* + * 2. Trigger readahead on the backing file in advance. Since + * FMODE_RANDOM, the following page_cache_sync_readahead() will fallback + * to force_page_cache_readahead(). 
+ */ + page_cache_sync_readahead(d_inode(object->backer)->i_mapping, + &object->file->f_ra, object->file, + start_pos / PAGE_SIZE, nr_pages); + + size = sizeof(struct cachefiles_kiocb) + nr_pages * sizeof(struct bio_vec); + ki = kzalloc(size, GFP_KERNEL); + if (!ki) + goto all_enobufs; + + /* reuse the tailing part of ki as pages[] */ + pages = (void *)ki + size - nr_pages * sizeof(struct page *); + n = find_get_pages_contig(op->mapping, index, nr_pages, pages); + if (WARN_ON(n != nr_pages)) { + for (i = 0; i < n; i++) + put_page(pages[i]); + kfree(ki); + goto all_enobufs; + } + + for (i = 0; i < n; i++) { + put_page(pages[i]); + ki->bvs[i].bv_page = pages[i]; + ki->bvs[i].bv_offset = 0; + ki->bvs[i].bv_len = PAGE_SIZE; + } + iov_iter_bvec(&ki->iter, READ, ki->bvs, n, n * PAGE_SIZE); + + ki->iocb.ki_filp = object->file; + ki->iocb.ki_pos = start_pos; + ki->iocb.ki_ioprio = get_current_ioprio(); + ki->op = fscache_get_retrieval(op); + + /* 3. Start a buffer read in worker context */ + INIT_WORK(&ki->work, cachefiles_readpages_work_func); + queue_work(system_unbound_wq, &ki->work); + return 0; + +all_enobufs: + fscache_retrieval_complete(op, nr_pages); + return -ENOBUFS; +} + /* * allocate a block in the cache in which to store a page * - cache withdrawal is prevented by the caller diff --git a/fs/fscache/page.c b/fs/fscache/page.c index 888ace2cc6e1..39a05a43284d 100644 --- a/fs/fscache/page.c +++ b/fs/fscache/page.c @@ -666,6 +666,72 @@ int __fscache_read_or_alloc_pages(struct fscache_cookie *cookie, } EXPORT_SYMBOL(__fscache_read_or_alloc_pages); +int __fscache_prepare_read(struct fscache_cookie *cookie, + struct address_space *mapping, pgoff_t index, + unsigned int nr_pages, loff_t start_pos, + fscache_rw_complete_t term_func, void *context) +{ + struct fscache_retrieval *op; + struct fscache_object *object; + bool wake_cookie = false; + int ret; + + if (hlist_empty(&cookie->backing_objects)) + return -ENOBUFS; + + if (test_bit(FSCACHE_COOKIE_INVALIDATING, 
&cookie->flags)) { + _leave(" = -ENOBUFS [invalidating]"); + return -ENOBUFS; + } + + ASSERTCMP(cookie->def->type, !=, FSCACHE_COOKIE_TYPE_INDEX); + + if (fscache_wait_for_deferred_lookup(cookie) < 0) + return -ERESTARTSYS; + + op = fscache_alloc_retrieval(cookie, mapping, term_func, context); + if (!op) + return -ENOMEM; + atomic_set(&op->n_pages, nr_pages); + op->offset = start_pos; + + spin_lock(&cookie->lock); + + if (!fscache_cookie_enabled(cookie) || + hlist_empty(&cookie->backing_objects)) + goto nobufs_unlock; + + object = hlist_entry(cookie->backing_objects.first, + struct fscache_object, cookie_link); + + __fscache_use_cookie(cookie); + if (fscache_submit_op(object, &op->op) < 0) + goto nobufs_unlock_dec; + spin_unlock(&cookie->lock); + + ret = fscache_wait_for_operation_activation( + object, &op->op, + __fscache_stat(&fscache_n_retrieval_op_waits), + __fscache_stat(&fscache_n_retrievals_object_dead)); + if (ret < 0) + goto out; + + ret = object->cache->ops->prepare_read(op, index); +out: + fscache_put_retrieval(op); + return ret; + +nobufs_unlock_dec: + wake_cookie = __fscache_unuse_cookie(cookie); +nobufs_unlock: + spin_unlock(&cookie->lock); + fscache_put_retrieval(op); + if (wake_cookie) + __fscache_wake_unused_cookie(cookie); + return -ENOBUFS; +} +EXPORT_SYMBOL(__fscache_prepare_read); + /* * allocate a block in the cache on which to store a page * - we return: diff --git a/include/linux/fscache-cache.h b/include/linux/fscache-cache.h index 71ee23f78f1d..31f2f13e2924 100644 --- a/include/linux/fscache-cache.h +++ b/include/linux/fscache-cache.h @@ -161,6 +161,9 @@ typedef int (*fscache_pages_retrieval_func_t)(struct fscache_retrieval *op, unsigned *nr_pages, gfp_t gfp); +typedef int (*fscache_prepare_read_func_t)(struct fscache_retrieval *op, + pgoff_t index); + /** * fscache_get_retrieval - Get an extra reference on a retrieval operation * @op: The retrieval operation to get a reference on @@ -285,6 +288,8 @@ struct fscache_cache_ops { * the cache 
*/ fscache_pages_retrieval_func_t read_or_alloc_pages; + fscache_prepare_read_func_t prepare_read; + /* request a backing block for a page be allocated in the cache so that * it can be written directly */ fscache_page_retrieval_func_t allocate_page; diff --git a/include/linux/fscache.h b/include/linux/fscache.h index ce51b915ad43..f262446f3a49 100644 --- a/include/linux/fscache.h +++ b/include/linux/fscache.h @@ -212,6 +212,13 @@ extern int __fscache_read_or_alloc_pages(struct fscache_cookie *, fscache_rw_complete_t, void *, gfp_t); +extern int __fscache_prepare_read(struct fscache_cookie *cookie, + struct address_space *mapping, + pgoff_t index, + unsigned int nr_pages, + loff_t start_pos, + fscache_rw_complete_t term_func, + void *context); extern int __fscache_alloc_page(struct fscache_cookie *, struct page *, gfp_t); extern int __fscache_write_page(struct fscache_cookie *, struct page *, loff_t, gfp_t); extern void __fscache_uncache_page(struct fscache_cookie *, struct page *); @@ -616,6 +623,19 @@ int fscache_read_or_alloc_pages(struct fscache_cookie *cookie, return -ENOBUFS; } +static inline +int fscache_prepare_read(struct fscache_cookie *cookie, + struct address_space *mapping, pgoff_t index, + unsigned int nr_pages, loff_t start_pos, + fscache_rw_complete_t term_func, void *context) +{ + if (fscache_cookie_valid(cookie) && fscache_cookie_enabled(cookie)) + return __fscache_prepare_read(cookie, mapping, index, + nr_pages, start_pos, term_func, context); + else + return -ENOBUFS; +} + /** * fscache_alloc_page - Allocate a block in which to store a page * @cookie: The cookie representing the cache object -- Gitee From 958e0b6cc6b1d3430885004a835086f89f00367f Mon Sep 17 00:00:00 2001 From: Jingbo Xu Date: Thu, 18 Aug 2022 14:55:18 +0800 Subject: [PATCH 3/3] anolis: erofs: implement fscache-based data readahead ANBZ: #2056 Implement the readahead routine with fscache_prepare_read(). 
Besides, register an individual bdi for each erofs instance to enable readahead in on-demand mode. Signed-off-by: Jingbo Xu Link: https://gitee.com/anolis/cloud-kernel/pulls/692 Reviewed-by: Joseph Qi --- fs/erofs/fscache.c | 81 ++++++++++++++++++++++++++++++++++++++++++++++ fs/erofs/super.c | 4 +++ 2 files changed, 85 insertions(+) diff --git a/fs/erofs/fscache.c b/fs/erofs/fscache.c index 38f3be721799..089fd90e45f5 100644 --- a/fs/erofs/fscache.c +++ b/fs/erofs/fscache.c @@ -38,6 +38,13 @@ static void erofs_readpage_from_fscache_complete(struct page *page, void *ctx, unlock_page(page); } +static void erofs_readahead_from_fscache_complete(struct page *page, void *ctx, + int error) +{ + erofs_readpage_from_fscache_complete(page, ctx, error); + put_page(page); +} + static int erofs_fscache_meta_readpage(struct file *data, struct page *page) { int ret; @@ -175,6 +182,79 @@ static int erofs_fscache_readpage(struct file *file, struct page *page) return ret; } +static void erofs_fscache_readahead(struct readahead_control *rac) +{ + struct inode *inode = rac->mapping->host; + struct super_block *sb = inode->i_sb; + struct page *page; + size_t len, count, done = 0; + erofs_off_t pos; + loff_t start, start_pos; + int ret; + + if (!readahead_count(rac)) + return; + + start = readahead_pos(rac); + len = readahead_length(rac); + + do { + struct erofs_map_blocks map; + struct erofs_map_dev mdev; + + pos = start + done; + + map.m_la = pos; + ret = erofs_map_blocks(inode, &map, EROFS_GET_BLOCKS_RAW); + if (ret) + return; + + if (!(map.m_flags & EROFS_MAP_MAPPED)) { + page = readahead_page(rac); + zero_user_segment(page, 0, PAGE_SIZE); + SetPageUptodate(page); + unlock_page(page); + put_page(page); + done += PAGE_SIZE; + continue; + } + + if (map.m_flags & EROFS_MAP_META) { + page = readahead_page(rac); + ret = erofs_fscache_readpage_inline(page, &map); + unlock_page(page); + put_page(page); + done += PAGE_SIZE; + continue; + } + + mdev = (struct erofs_map_dev) { + .m_deviceid = 
map.m_deviceid, + .m_pa = map.m_pa, + }; + + ret = erofs_map_dev(sb, &mdev); + if (ret) + return; + + start_pos = mdev.m_pa + (pos - map.m_la); + count = min_t(size_t, map.m_llen - (pos - map.m_la), len - done); + ret = fscache_prepare_read(mdev.m_fscache->cookie, rac->mapping, + pos / PAGE_SIZE, count / PAGE_SIZE, start_pos, + erofs_readahead_from_fscache_complete, NULL); + if (ret) { + erofs_err(sb, "%s: prepare_read %d", __func__, ret); + return; + } + + done += count; + while (count) { + page = readahead_page(rac); + count -= PAGE_SIZE; + } + } while (done < len); +} + static const struct address_space_operations erofs_fscache_meta_aops = { .readpage = erofs_fscache_meta_readpage, .releasepage = erofs_fscache_release_page, @@ -183,6 +263,7 @@ static const struct address_space_operations erofs_fscache_meta_aops = { const struct address_space_operations erofs_fscache_access_aops = { .readpage = erofs_fscache_readpage, + .readahead = erofs_fscache_readahead, .releasepage = erofs_fscache_release_page, .invalidatepage = erofs_fscache_invalidate_page, }; diff --git a/fs/erofs/super.c b/fs/erofs/super.c index 264809c5770d..2694b26c7ad0 100644 --- a/fs/erofs/super.c +++ b/fs/erofs/super.c @@ -506,6 +506,10 @@ static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc) sbi->opt.fsid, true); if (err) return err; + + err = super_setup_bdi(sb); + if (err) + return err; } else { if (!sb_set_blocksize(sb, EROFS_BLKSIZ)) { erofs_err(sb, "failed to set erofs blksize"); -- Gitee