diff --git a/fs/ext4/file.c b/fs/ext4/file.c index d101b3b0c7dad8ade55a08328a09ec4e1a4f7496..296004b243c6215924873ec17e8b3a783250f814 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -30,6 +30,7 @@ #include #include #include +#include #include "ext4.h" #include "ext4_jbd2.h" #include "xattr.h" @@ -144,6 +145,7 @@ static ssize_t ext4_file_read_iter(struct kiocb *iocb, struct iov_iter *to) if (iocb->ki_flags & IOCB_DIRECT) return ext4_dio_read_iter(iocb, to); + fs_file_read_do_trace(iocb); return generic_file_read_iter(iocb, to); } @@ -154,6 +156,8 @@ static ssize_t ext4_file_read_iter(struct kiocb *iocb, struct iov_iter *to) */ static int ext4_release_file(struct inode *inode, struct file *filp) { + trace_fs_file_release(inode, filp); + if (ext4_test_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE)) { ext4_alloc_da_blocks(inode); ext4_clear_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE); diff --git a/fs/read_write.c b/fs/read_write.c index a21ba3be7dbe73024125c2a4d619d65185456717..c050dffe6a4e7b1989d615676750188785dde7a9 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -24,6 +24,8 @@ #include #include +#define CREATE_TRACE_POINTS +#include const struct file_operations generic_ro_fops = { .llseek = generic_file_llseek, @@ -1718,3 +1720,39 @@ int generic_file_rw_checks(struct file *file_in, struct file *file_out) return 0; } + +#ifdef CONFIG_TRACEPOINTS +static void fs_file_read_ctx_init(struct fs_file_read_ctx *ctx, + struct file *filp, loff_t pos) +{ + memset(ctx, 0, sizeof(*ctx)); + ctx->name = file_dentry(filp)->d_name.name; + ctx->f_mode = filp->f_mode; + ctx->key = (unsigned long)filp; + ctx->i_size = file_inode(filp)->i_size; + ctx->prev_index = filp->f_ra.prev_pos >> PAGE_SHIFT; + ctx->index = pos >> PAGE_SHIFT; +} + +#define FS_FILE_READ_VERSION 1 +#define FS_FILE_READ_MODE_MASK (FMODE_CTL_RANDOM | FMODE_CTL_WILLNEED) + +void fs_file_read_update_args_by_trace(struct kiocb *iocb) +{ + struct file *filp = iocb->ki_filp; + struct fs_file_read_ctx ctx; + + fs_file_read_ctx_init(&ctx, filp, iocb->ki_pos); + trace_fs_file_read(&ctx, FS_FILE_READ_VERSION); + + if (!ctx.set_f_mode && !ctx.clr_f_mode) + return; + + filp->f_ctl_mode |= ctx.set_f_mode & FS_FILE_READ_MODE_MASK; + filp->f_ctl_mode &= ~(ctx.clr_f_mode & FS_FILE_READ_MODE_MASK); +} +EXPORT_SYMBOL_GPL(fs_file_read_update_args_by_trace); +#endif + +EXPORT_TRACEPOINT_SYMBOL_GPL(fs_file_read); +EXPORT_TRACEPOINT_SYMBOL_GPL(fs_file_release); diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index aede746541f8aecc554b8694d88b183aef0c7578..66ab9108fefd89e93b145064da935154b31b153f 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -31,6 +31,7 @@ #include #include #include +#include static const struct vm_operations_struct xfs_file_vm_ops; @@ -270,6 +271,7 @@ xfs_file_buffered_read( ssize_t ret; trace_xfs_file_buffered_read(iocb, to); + fs_file_read_do_trace(iocb); ret = xfs_ilock_iocb(iocb, XFS_IOLOCK_SHARED); if (ret) @@ -1205,6 +1207,7 @@ xfs_file_release( struct inode *inode, struct file *filp) { + trace_fs_file_release(inode, filp); return xfs_release(XFS_I(inode)); } diff --git a/include/linux/fs.h b/include/linux/fs.h index 133f0640fb2411478a2dbaacbbee0fec9336c27b..21b5c6a59083c169baf2d4ff9fe907165484a255 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -43,6 +43,7 @@ #include #include #include +#include #include #include @@ -189,6 +190,12 @@ typedef int (dio_iodone_t)(struct kiocb *iocb, loff_t offset, /* File supports async nowait buffered writes */ #define FMODE_BUF_WASYNC ((__force fmode_t)0x80000000) +/* File mode control flag, expect random access pattern */ +#define FMODE_CTL_RANDOM ((__force fmode_t)0x1000) + +/* File mode control flag, will try to read head of the file into pagecache */ +#define FMODE_CTL_WILLNEED ((__force fmode_t)0x400000) + /* * Attribute flags. These should be or-ed together to figure out what * has been changed! @@ -974,6 +981,7 @@ struct file { atomic_long_t f_count; unsigned int f_flags; fmode_t f_mode; + fmode_t f_ctl_mode; struct mutex f_pos_lock; loff_t f_pos; struct fown_struct f_owner; @@ -3205,4 +3213,33 @@ extern int vfs_fadvise(struct file *file, loff_t offset, loff_t len, extern int generic_fadvise(struct file *file, loff_t offset, loff_t len, int advice); +struct fs_file_read_ctx { + const unsigned char *name; + unsigned int f_mode; + unsigned int rsvd; + /* clear from f_ctl_mode */ + unsigned int clr_f_mode; + /* set into f_ctl_mode */ + unsigned int set_f_mode; + unsigned long key; + /* file size */ + long long i_size; + /* previous page index */ + long long prev_index; + /* current page index */ + long long index; +}; + +#ifdef CONFIG_TRACEPOINTS +DECLARE_TRACEPOINT(fs_file_read); +extern void fs_file_read_update_args_by_trace(struct kiocb *iocb); +#else +static inline void fs_file_read_update_args_by_trace(struct kiocb *iocb) {} +#endif + +static inline void fs_file_read_do_trace(struct kiocb *iocb) +{ + if (tracepoint_enabled(fs_file_read)) + fs_file_read_update_args_by_trace(iocb); +} #endif /* _LINUX_FS_H */ diff --git a/include/trace/events/fs.h b/include/trace/events/fs.h new file mode 100644 index 0000000000000000000000000000000000000000..ee82dad9d9dadc1f7d9bfe715a1e489ed5a2bd9c --- /dev/null +++ b/include/trace/events/fs.h @@ -0,0 +1,33 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM fs + +#if !defined(_TRACE_FS_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_FS_H + +#include +#include +#include + +#undef FS_DECLARE_TRACE +#ifdef DECLARE_TRACE_WRITABLE +#define FS_DECLARE_TRACE(call, proto, args, size) \ + DECLARE_TRACE_WRITABLE(call, PARAMS(proto), PARAMS(args), size) +#else +#define FS_DECLARE_TRACE(call, proto, args, size) \ + DECLARE_TRACE(call, PARAMS(proto), PARAMS(args)) +#endif + +FS_DECLARE_TRACE(fs_file_read, + TP_PROTO(struct fs_file_read_ctx *ctx, int version), + TP_ARGS(ctx, version), + sizeof(struct fs_file_read_ctx)); + +DECLARE_TRACE(fs_file_release, + TP_PROTO(struct inode *inode, struct file *filp), + TP_ARGS(inode, filp)); + +#endif /* _TRACE_FS_H */ + +/* This part must be outside protection */ +#include diff --git a/mm/readahead.c b/mm/readahead.c index 47afbca1d122e44617bad4afec283ec7b780ec49..7148f6f42ed437f5a6cb4f6d728362471e836d0f 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -132,6 +132,7 @@ #include "internal.h" +#define READAHEAD_FIRST_SIZE (2 * 1024 * 1024) /* * Initialise a struct file's readahead state. Assumes that the caller has * memset *ra to zero. @@ -682,10 +683,41 @@ static void ondemand_readahead(struct readahead_control *ractl, page_cache_ra_order(ractl, ra, order); } +/* + * Try to read first @ra_size from head of the file. + */ +static bool page_cache_readahead_from_head(struct address_space *mapping, + struct file *filp, pgoff_t offset, + unsigned long req_size, + unsigned long ra_size) +{ + struct backing_dev_info *bdi = inode_to_bdi(mapping->host); + struct file_ra_state *ra = &filp->f_ra; + unsigned long size = min_t(unsigned long, ra_size, + file_inode(filp)->i_size); + unsigned long nrpages = (size + PAGE_SIZE - 1) / PAGE_SIZE; + unsigned long max_pages; + unsigned int offs = 0; + + /* Cannot read date over target size, back to normal way */ + if (offset + req_size > nrpages) + return false; + + max_pages = max_t(unsigned long, bdi->io_pages, ra->ra_pages); + max_pages = min(max_pages, nrpages); + while (offs < nrpages) { + force_page_cache_readahead(mapping, filp, offs, max_pages); + offs += max_pages; + } + return true; +} + void page_cache_sync_ra(struct readahead_control *ractl, unsigned long req_count) { - bool do_forced_ra = ractl->file && (ractl->file->f_mode & FMODE_RANDOM); + bool do_forced_ra = ractl->file && + ((ractl->file->f_mode & FMODE_RANDOM) || + (ractl->file->f_ctl_mode & FMODE_CTL_RANDOM)); /* * Even if readahead is disabled, issue this request as readahead @@ -700,6 +732,12 @@ void page_cache_sync_ra(struct readahead_control *ractl, do_forced_ra = true; } + /* try to read first READAHEAD_FIRST_SIZE into pagecache */ + if (ractl->file && (ractl->file->f_ctl_mode & FMODE_CTL_WILLNEED) && + page_cache_readahead_from_head(ractl->mapping, ractl->file, + ractl->_index, req_count, READAHEAD_FIRST_SIZE)) + return; + /* be dumb */ if (do_forced_ra) { force_page_cache_ra(ractl, req_count); diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index 28d2c77262bed75b33baa51f06a4c413d7bf58fb..ab25af503d60df5066fcab1e3ad33e47985f37da 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -41,6 +41,7 @@ TEST_GEN_PROGS = test_verifier test_tag test_maps test_lru_map test_lpm_map test test_cgroup_storage \ test_tcpnotify_user test_sysctl \ test_progs-no_alu32 +TEST_GEN_PROGS += file_read_pattern # Also test bpf-gcc, if present ifneq ($(BPF_GCC),) diff --git a/tools/testing/selftests/bpf/file_read_pattern.c b/tools/testing/selftests/bpf/file_read_pattern.c new file mode 100644 index 0000000000000000000000000000000000000000..54d1f869bf04e8a6070d8caae7157a42ec1df4c4 --- /dev/null +++ b/tools/testing/selftests/bpf/file_read_pattern.c @@ -0,0 +1,73 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (C) 2021. Huawei Technologies Co., Ltd */ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define READ_TP_NAME "fs_file_read" +#define RELEASE_TP_NAME "fs_file_release" + +int main(int argc, char *argv[]) +{ + const char *name = "./file_read_pattern_prog.bpf.o"; + struct bpf_object *obj; + struct bpf_program *prog; + int err = 0; + int read_fd; + int release_fd; + + obj = bpf_object__open_file(name, NULL); + if (!obj) { + printf("Failed to open program: %s\n", name); + return err; + } + + err = bpf_object__load(obj); + if (err) { + printf("failed to load program: %s\n", name); + goto out; + } + + prog = bpf_object__find_program_by_name(obj, READ_TP_NAME); + if (!prog) { + printf("no prog %s\n", READ_TP_NAME); + err = -EINVAL; + goto out; + } + + read_fd = bpf_raw_tracepoint_open(READ_TP_NAME, bpf_program__fd(prog)); + if (read_fd < 0) { + err = -errno; + printf("Failed to attach raw tracepoint %s\n", READ_TP_NAME); + goto out; + } + + prog = bpf_object__find_program_by_name(obj, RELEASE_TP_NAME); + if (!prog) { + printf("no prog %s\n", RELEASE_TP_NAME); + err = -EINVAL; + goto out; + } + + release_fd = bpf_raw_tracepoint_open(RELEASE_TP_NAME, + bpf_program__fd(prog)); + if (release_fd < 0) { + err = -errno; + printf("Failed to attach raw tracepoint %s\n", RELEASE_TP_NAME); + goto out; + } + + pause(); + + close(release_fd); + close(read_fd); +out: + bpf_object__close(obj); + return err; +} diff --git a/tools/testing/selftests/bpf/progs/file_read_pattern_prog.c b/tools/testing/selftests/bpf/progs/file_read_pattern_prog.c new file mode 100644 index 0000000000000000000000000000000000000000..4fd25c242805b703c0d92cec63dbdcec9c612ffb --- /dev/null +++ b/tools/testing/selftests/bpf/progs/file_read_pattern_prog.c @@ -0,0 +1,142 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (C) 2021. Huawei Technologies Co., Ltd */ +#include +#include +#include + +#include + +#ifndef __always_inline +#define __always_inline inline __attribute__((always_inline)) +#endif + +/* Need to keep consistent with definitions in include/linux/fs.h */ +#define FMODE_CTL_RANDOM 0x1000 +#define FMODE_CTL_WILLNEED 0x400000 + +struct fs_file_read_ctx { + const unsigned char *name; + unsigned int f_mode; + unsigned int rsvd; + /* clear from f_ctl_mode */ + unsigned int clr_f_mode; + /* set into f_ctl_mode */ + unsigned int set_f_mode; + unsigned long key; + /* file size */ + long long i_size; + /* previous page index */ + long long prev_index; + /* current page index */ + long long index; +}; + +struct fs_file_read_args { + struct fs_file_read_ctx *ctx; + int version; +}; + +struct fs_file_release_args { + void *inode; + void *filp; +}; + +struct file_rd_hist { + __u64 last_nsec; + __u32 seq_nr; + __u32 tot_nr; +}; + +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __type(key, long); + __type(value, struct file_rd_hist); + __uint(max_entries, 10000); +} htab SEC(".maps"); + +static __always_inline bool is_expected_file(void *name) +{ + char prefix[5]; + int err; + + err = bpf_probe_read_str(&prefix, sizeof(prefix), name); + if (err <= 0) + return false; + return !strncmp(prefix, "blk_", 4); +} + +SEC("raw_tracepoint.w/fs_file_read") +int fs_file_read(struct fs_file_read_args *args) +{ + const char fmt[] = "elapsed %llu, seq %u, tot %u\n"; + struct fs_file_read_ctx *rd_ctx = args->ctx; + struct file_rd_hist *hist; + struct file_rd_hist new_hist; + __u64 key; + __u64 now; + bool first; + + if (!is_expected_file((void *)rd_ctx->name)) + return 0; + + if (rd_ctx->i_size <= (4 << 20)) { + rd_ctx->set_f_mode = FMODE_CTL_WILLNEED; + return 0; + } + + first = false; + now = bpf_ktime_get_ns(); + key = rd_ctx->key; + hist = bpf_map_lookup_elem(&htab, &key); + if (!hist) { + __builtin_memset(&new_hist, 0, sizeof(new_hist)); + new_hist.last_nsec = now; + first = true; + hist = &new_hist; + } + + if (rd_ctx->index >= rd_ctx->prev_index && + rd_ctx->index - rd_ctx->prev_index <= 1) + hist->seq_nr += 1; + hist->tot_nr += 1; + + bpf_trace_printk(fmt, sizeof(fmt), now - hist->last_nsec, + hist->seq_nr, hist->tot_nr); + + if (first) { + bpf_map_update_elem(&htab, &key, hist, 0); + return 0; + } + + /* 500ms or 10 read */ + if (now - hist->last_nsec >= 500000000ULL || hist->tot_nr >= 10) { + if (hist->tot_nr >= 10) { + if (hist->seq_nr <= hist->tot_nr * 3 / 10) + rd_ctx->set_f_mode = FMODE_CTL_RANDOM; + else if (hist->seq_nr >= hist->tot_nr * 7 / 10) + rd_ctx->clr_f_mode = FMODE_CTL_RANDOM; + } + + hist->last_nsec = now; + hist->tot_nr = 0; + hist->seq_nr = 0; + } + + return 0; +} + +SEC("raw_tracepoint/fs_file_release") +int fs_file_release(struct fs_file_release_args *args) +{ + __u64 key = (unsigned long)args->filp; + void *value; + + value = bpf_map_lookup_elem(&htab, &key); + if (value) + bpf_map_delete_elem(&htab, &key); + + return 0; +} + +char _license[] SEC("license") = "GPL"; +__u32 _version SEC("version") = 1;