diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 0166bb9ca160bdb5196aa9731b2c2daebd3c7116..33f8eb715b413750ee899a2a2478cdc67f375ebf 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -30,6 +30,7 @@ #include #include #include +#include #include "ext4.h" #include "ext4_jbd2.h" #include "xattr.h" @@ -144,6 +145,7 @@ static ssize_t ext4_file_read_iter(struct kiocb *iocb, struct iov_iter *to) if (iocb->ki_flags & IOCB_DIRECT) return ext4_dio_read_iter(iocb, to); + fs_file_read_do_trace(iocb); return generic_file_read_iter(iocb, to); } @@ -165,6 +167,8 @@ static ssize_t ext4_file_splice_read(struct file *in, loff_t *ppos, */ static int ext4_release_file(struct inode *inode, struct file *filp) { + trace_fs_file_release(inode, filp); + if (ext4_test_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE)) { ext4_alloc_da_blocks(inode); ext4_clear_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE); diff --git a/fs/read_write.c b/fs/read_write.c index 4771701c896badcbc7dc2e3f1f528dc9026864ad..3d69fb284d10ec8ab7efb294b82c326afe2e188d 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -24,6 +24,8 @@ #include #include +#define CREATE_TRACE_POINTS +#include const struct file_operations generic_ro_fops = { .llseek = generic_file_llseek, @@ -1718,3 +1720,39 @@ int generic_file_rw_checks(struct file *file_in, struct file *file_out) return 0; } + +#ifdef CONFIG_TRACEPOINTS +static void fs_file_read_ctx_init(struct fs_file_read_ctx *ctx, + struct file *filp, loff_t pos) +{ + memset(ctx, 0, sizeof(*ctx)); + ctx->name = file_dentry(filp)->d_name.name; + ctx->f_mode = filp->f_mode; + ctx->key = (unsigned long)filp; + ctx->i_size = i_size_read(file_inode(filp)); + ctx->prev_index = filp->f_ra.prev_pos >> PAGE_SHIFT; + ctx->index = pos >> PAGE_SHIFT; +} + +#define FS_FILE_READ_VERSION 1 +#define FS_FILE_READ_MODE_MASK (FMODE_CTL_RANDOM | FMODE_CTL_WILLNEED) + +void fs_file_read_update_args_by_trace(struct kiocb *iocb) +{ + struct file *filp = iocb->ki_filp; + struct fs_file_read_ctx ctx; + + fs_file_read_ctx_init(&ctx, filp, iocb->ki_pos); + trace_fs_file_read(&ctx, FS_FILE_READ_VERSION); + + if (!ctx.set_f_mode && !ctx.clr_f_mode) + return; + + filp->f_ctl_mode |= ctx.set_f_mode & FS_FILE_READ_MODE_MASK; + filp->f_ctl_mode &= ~(ctx.clr_f_mode & FS_FILE_READ_MODE_MASK); +} +EXPORT_SYMBOL_GPL(fs_file_read_update_args_by_trace); +#endif + +EXPORT_TRACEPOINT_SYMBOL_GPL(fs_file_read); +EXPORT_TRACEPOINT_SYMBOL_GPL(fs_file_release); diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 203700278ddbb620a6e9d7c95b68a86f4ea52c63..5dd59655532588c5c24cd49774e62a4af4c3cde6 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -31,6 +31,7 @@ #include #include #include +#include static const struct vm_operations_struct xfs_file_vm_ops; @@ -270,6 +271,7 @@ xfs_file_buffered_read( ssize_t ret; trace_xfs_file_buffered_read(iocb, to); + fs_file_read_do_trace(iocb); ret = xfs_ilock_iocb(iocb, XFS_IOLOCK_SHARED); if (ret) @@ -1227,6 +1229,7 @@ xfs_file_release( struct inode *inode, struct file *filp) { + trace_fs_file_release(inode, filp); return xfs_release(XFS_I(inode)); } diff --git a/include/linux/fs.h b/include/linux/fs.h index fb5accebdcdf50c33234888a87adccbd610555c5..d47063a79bcd15a705c904784a5962e44a634b39 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -43,6 +43,7 @@ #include #include #include +#include #include #include @@ -185,6 +186,12 @@ typedef int (dio_iodone_t)(struct kiocb *iocb, loff_t offset, /* File supports async nowait buffered writes */ #define FMODE_BUF_WASYNC ((__force fmode_t)0x80000000) +/* File mode control flag, expect random access pattern */ +#define FMODE_CTL_RANDOM ((__force fmode_t)0x1000) + +/* File mode control flag, will try to read head of the file into pagecache */ +#define FMODE_CTL_WILLNEED ((__force fmode_t)0x400000) + /* * Attribute flags. These should be or-ed together to figure out what * has been changed! @@ -1027,6 +1034,7 @@ struct file { struct address_space *f_mapping; errseq_t f_wb_err; errseq_t f_sb_err; /* for syncfs */ + fmode_t f_ctl_mode; } __randomize_layout __attribute__((aligned(4))); /* lest something weird decides that 2 is OK */ @@ -3379,4 +3387,33 @@ extern int vfs_fadvise(struct file *file, loff_t offset, loff_t len, extern int generic_fadvise(struct file *file, loff_t offset, loff_t len, int advice); +struct fs_file_read_ctx { + const unsigned char *name; + unsigned int f_mode; + unsigned int rsvd; + /* clear from f_ctl_mode */ + unsigned int clr_f_mode; + /* set into f_ctl_mode */ + unsigned int set_f_mode; + unsigned long key; + /* file size */ + long long i_size; + /* previous page index */ + long long prev_index; + /* current page index */ + long long index; +}; + +#ifdef CONFIG_TRACEPOINTS +DECLARE_TRACEPOINT(fs_file_read); +extern void fs_file_read_update_args_by_trace(struct kiocb *iocb); +#else +static inline void fs_file_read_update_args_by_trace(struct kiocb *iocb) {} +#endif + +static inline void fs_file_read_do_trace(struct kiocb *iocb) +{ + if (tracepoint_enabled(fs_file_read)) + fs_file_read_update_args_by_trace(iocb); +} #endif /* _LINUX_FS_H */ diff --git a/include/trace/events/fs.h b/include/trace/events/fs.h new file mode 100644 index 0000000000000000000000000000000000000000..ee82dad9d9dadc1f7d9bfe715a1e489ed5a2bd9c --- /dev/null +++ b/include/trace/events/fs.h @@ -0,0 +1,33 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM fs + +#if !defined(_TRACE_FS_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_FS_H + +#include +#include +#include + +#undef FS_DECLARE_TRACE +#ifdef DECLARE_TRACE_WRITABLE +#define FS_DECLARE_TRACE(call, proto, args, size) \ + DECLARE_TRACE_WRITABLE(call, PARAMS(proto), PARAMS(args), size) +#else +#define FS_DECLARE_TRACE(call, proto, args, size) \ + DECLARE_TRACE(call, PARAMS(proto), PARAMS(args)) +#endif + +FS_DECLARE_TRACE(fs_file_read, + TP_PROTO(struct fs_file_read_ctx *ctx, int version), + TP_ARGS(ctx, version), + sizeof(struct fs_file_read_ctx)); + +DECLARE_TRACE(fs_file_release, + TP_PROTO(struct inode *inode, struct file *filp), + TP_ARGS(inode, filp)); + +#endif /* _TRACE_FS_H */ + +/* This part must be outside protection */ +#include diff --git a/mm/readahead.c b/mm/readahead.c index 6925e6959fd3fff76cdca6b01c348f3daade9c02..a1add9a4f77b9ff3224a3a322648a57aafdeb020 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -131,6 +131,7 @@ #include "internal.h" +#define READAHEAD_FIRST_SIZE (2 * 1024 * 1024) /* * Initialise a struct file's readahead state. Assumes that the caller has * memset *ra to zero. @@ -668,10 +669,41 @@ static void ondemand_readahead(struct readahead_control *ractl, page_cache_ra_order(ractl, ra, order); } +/* + * Try to read first @ra_size from head of the file. + */ +static bool page_cache_readahead_from_head(struct address_space *mapping, + struct file *filp, pgoff_t offset, + unsigned long req_size, + unsigned long ra_size) +{ + struct backing_dev_info *bdi = inode_to_bdi(mapping->host); + struct file_ra_state *ra = &filp->f_ra; + unsigned long size = min_t(unsigned long, ra_size, + i_size_read(file_inode(filp))); + unsigned long nrpages = round_up(size, PAGE_SIZE); + unsigned long max_pages; + unsigned int offs = 0; + + /* Cannot read date over target size, back to normal way */ + if (offset + req_size > nrpages) + return false; + + max_pages = max_t(unsigned long, bdi->io_pages, ra->ra_pages); + max_pages = min(max_pages, nrpages); + while (offs < nrpages) { + force_page_cache_readahead(mapping, filp, offs, max_pages); + offs += max_pages; + } + return true; +} + void page_cache_sync_ra(struct readahead_control *ractl, unsigned long req_count) { - bool do_forced_ra = ractl->file && (ractl->file->f_mode & FMODE_RANDOM); + bool do_forced_ra = ractl->file && + ((ractl->file->f_mode & FMODE_RANDOM) || + (ractl->file->f_ctl_mode & FMODE_CTL_RANDOM)); /* * Even if readahead is disabled, issue this request as readahead @@ -686,6 +718,12 @@ void page_cache_sync_ra(struct readahead_control *ractl, do_forced_ra = true; } + /* try to read first READAHEAD_FIRST_SIZE into pagecache */ + if (ractl->file && (ractl->file->f_ctl_mode & FMODE_CTL_WILLNEED) && + page_cache_readahead_from_head(ractl->mapping, ractl->file, + readahead_index(ractl), req_count, READAHEAD_FIRST_SIZE)) + return; + /* be dumb */ if (do_forced_ra) { force_page_cache_ra(ractl, req_count); diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index caede9b574cb1632fdcb00061a662f6151d61208..c521fb5928c46c4801db29d2c8639fd279c9b3d1 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -50,6 +50,7 @@ TEST_GEN_PROGS = test_verifier test_tag test_maps test_lru_map test_lpm_map test test_cgroup_storage \ test_tcpnotify_user test_sysctl \ test_progs-no_alu32 +TEST_GEN_PROGS += file_read_pattern TEST_INST_SUBDIRS := no_alu32 # Also test bpf-gcc, if present diff --git a/tools/testing/selftests/bpf/file_read_pattern.c b/tools/testing/selftests/bpf/file_read_pattern.c new file mode 100644 index 0000000000000000000000000000000000000000..5ee9e1a18ae52d5d14d787a3cf1984e6607b815b --- /dev/null +++ b/tools/testing/selftests/bpf/file_read_pattern.c @@ -0,0 +1,76 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (C) 2021. Huawei Technologies Co., Ltd */ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define READ_TP_NAME "fs_file_read" +#define RELEASE_TP_NAME "fs_file_release" + +int main(int argc, char *argv[]) +{ + const char *name = "./file_read_pattern_prog.bpf.o"; + struct bpf_object *obj; + struct bpf_program *prog; + int err = 0; + int read_fd; + int release_fd; + + obj = bpf_object__open_file(name, NULL); + if (!obj) { + printf("Failed to open program: %s\n", name); + err = -errno; + return err; + } + + err = bpf_object__load(obj); + if (err) { + printf("failed to load program: %s\n", name); + goto out; + } + + prog = bpf_object__find_program_by_name(obj, READ_TP_NAME); + if (!prog) { + printf("no prog %s\n", READ_TP_NAME); + err = -errno; + goto out; + } + + read_fd = bpf_raw_tracepoint_open(READ_TP_NAME, bpf_program__fd(prog)); + if (read_fd < 0) { + printf("Failed to attach raw tracepoint %s\n", READ_TP_NAME); + err = read_fd; + goto out; + } + + prog = bpf_object__find_program_by_name(obj, RELEASE_TP_NAME); + if (!prog) { + printf("no prog %s\n", RELEASE_TP_NAME); + err = -errno; + close(read_fd); + goto out; + } + + release_fd = bpf_raw_tracepoint_open(RELEASE_TP_NAME, + bpf_program__fd(prog)); + if (release_fd < 0) { + printf("Failed to attach raw tracepoint %s\n", RELEASE_TP_NAME); + err = release_fd; + close(read_fd); + goto out; + } + + pause(); + + close(release_fd); + close(read_fd); +out: + bpf_object__close(obj); + return err; +} diff --git a/tools/testing/selftests/bpf/progs/file_read_pattern_prog.c b/tools/testing/selftests/bpf/progs/file_read_pattern_prog.c new file mode 100644 index 0000000000000000000000000000000000000000..bd9c43a5a06bce90ed138896c5982805581267b1 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/file_read_pattern_prog.c @@ -0,0 +1,137 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (C) 2021. Huawei Technologies Co., Ltd */ +#include +#include +#include + +#include + +/* Need to keep consistent with definitions in include/linux/fs.h */ +#define FMODE_CTL_RANDOM 0x1000 +#define FMODE_CTL_WILLNEED 0x400000 + +struct fs_file_read_ctx { + const unsigned char *name; + unsigned int f_mode; + unsigned int rsvd; + /* clear from f_ctl_mode */ + unsigned int clr_f_mode; + /* set into f_ctl_mode */ + unsigned int set_f_mode; + unsigned long key; + /* file size */ + long long i_size; + /* previous page index */ + long long prev_index; + /* current page index */ + long long index; +}; + +struct fs_file_read_args { + struct fs_file_read_ctx *ctx; + int version; +}; + +struct fs_file_release_args { + void *inode; + void *filp; +}; + +struct file_rd_hist { + __u64 last_nsec; + __u32 seq_nr; + __u32 tot_nr; +}; + +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __type(key, __u64); + __type(value, struct file_rd_hist); + __uint(max_entries, 10000); +} htab SEC(".maps"); + +static bool is_expected_file(void *name) +{ + char prefix[5]; + int err; + + err = bpf_probe_read_str(&prefix, sizeof(prefix), name); + if (err <= 0) + return false; + return !strncmp(prefix, "blk_", 4); +} + +SEC("raw_tracepoint.w/fs_file_read") +int fs_file_read(struct fs_file_read_args *args) +{ + const char fmt[] = "elapsed %llu, seq %u, tot %u\n"; + struct fs_file_read_ctx *rd_ctx = args->ctx; + struct file_rd_hist *hist; + struct file_rd_hist new_hist; + __u64 key; + __u64 now; + bool first; + + if (!is_expected_file((void *)rd_ctx->name)) + return 0; + + if (rd_ctx->i_size <= (4 << 20)) { + rd_ctx->set_f_mode = FMODE_CTL_WILLNEED; + return 0; + } + + first = false; + now = bpf_ktime_get_ns(); + key = rd_ctx->key; + hist = bpf_map_lookup_elem(&htab, &key); + if (!hist) { + __builtin_memset(&new_hist, 0, sizeof(new_hist)); + new_hist.last_nsec = now; + first = true; + hist = &new_hist; + } + + if (rd_ctx->index >= rd_ctx->prev_index && + rd_ctx->index - rd_ctx->prev_index <= 1) + hist->seq_nr += 1; + hist->tot_nr += 1; + + bpf_trace_printk(fmt, sizeof(fmt), now - hist->last_nsec, + hist->seq_nr, hist->tot_nr); + + if (first) { + bpf_map_update_elem(&htab, &key, hist, 0); + return 0; + } + + /* 500ms or 10 read */ + if (now - hist->last_nsec >= 500000000ULL || hist->tot_nr >= 10) { + if (hist->tot_nr >= 10) { + if (hist->seq_nr <= hist->tot_nr * 3 / 10) + rd_ctx->set_f_mode = FMODE_CTL_RANDOM; + else if (hist->seq_nr >= hist->tot_nr * 7 / 10) + rd_ctx->clr_f_mode = FMODE_CTL_RANDOM; + } + + hist->last_nsec = now; + hist->tot_nr = 0; + hist->seq_nr = 0; + } + + return 0; +} + +SEC("raw_tracepoint/fs_file_release") +int fs_file_release(struct fs_file_release_args *args) +{ + __u64 key = (unsigned long)args->filp; + void *value; + + value = bpf_map_lookup_elem(&htab, &key); + if (value) + bpf_map_delete_elem(&htab, &key); + + return 0; +} + +char _license[] SEC("license") = "GPL";