diff --git a/Documentation/filesystems/index.rst b/Documentation/filesystems/index.rst
index 09cade7eaefc8c68e4e733ea011b345a4ce9fc9b..6ca507138d8ebaf7d6c228e8f4db642b84262669 100644
--- a/Documentation/filesystems/index.rst
+++ b/Documentation/filesystems/index.rst
@@ -82,6 +82,7 @@ Documentation for filesystem implementations.
    ecryptfs
    efivarfs
    erofs
+   mfs
    ext2
    ext3
    ext4/index
diff --git a/Documentation/filesystems/mfs.rst b/Documentation/filesystems/mfs.rst
new file mode 100644
index 0000000000000000000000000000000000000000..a291fbf48cf1fd1e2ed5078ce1fe9d0c7e8b62c3
--- /dev/null
+++ b/Documentation/filesystems/mfs.rst
@@ -0,0 +1,223 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+==============
+MFS Filesystem
+==============
+
+Overview
+========
+
+MFS is a stackable file system that combines a lower layer and a cache
+layer. It provides users with programmable caching capabilities. MFS only
+supports read-only operations for regular files, directories and symbolic
+links. When MFS is stacked on top of the lower and cache layers (which are
+themselves mounted on other file systems, such as ext4 or xfs), the
+underlying file systems must also be kept read-only to prevent data
+inconsistency.
+
+MFS supports three running modes: none, local and remote. These modes are
+explained in the Running mode section. In short, MFS requires the `mtree`
+and `cachedir` mount options. The `mtree` option specifies the metadata
+source for MFS, while the `cachedir` option specifies the data source. In
+local or remote mode, `cachedir` points to a local cache (in memory or on
+disk) for the backend file systems.
+
+
+Mount options
+=============
+
+================ ==========================================================
+mode=%s          The running mode, one of:
+
+                 ====== ==================================================
+                 none   Act as a plain stackable file system on the lower
+                        file system and just pass operations through to
+                        the backend file system.
+                 local  Both the lower and the cachedir layer are local
+                        file systems. A miss event (not hit in the page
+                        cache) posts an asynchronous event to userspace.
+                 remote The target data lives in remote storage, such as
+                        OBS or another private distributed file system
+                        without a POSIX-like interface. A miss event (not
+                        hit in the local cache) posts a synchronous event
+                        to userspace and waits for the reply.
+                 ====== ==================================================
+mtree=%s         Lower layer path.
+cachedir=%s      Cache layer path.
+================ ==========================================================
+
+**NOTE**: The paths given in the `mtree` and `cachedir` options must not be
+the same as the mount point, nor may either be a subdirectory of the other.
+
+Communication Framework
+=======================
+
+Each MFS instance has a unique communication device named `/dev/mfs${minor}`.
+MFS sends MISS events to the user daemon as needed. The user daemon can
+obtain these events by polling and reading from the device. To obtain the
+minor number for an MFS instance, the user must call `statfs()` on its mount
+point and parse the value from `f_spare[0]` in the `struct statfs`.
+
+Each request starts with a message header of the form::
+
+    struct mfs_msg {
+        __u8 version;
+        __u8 opcode;
+        __u16 len;
+        __u32 fd;
+        __u32 id;
+        __u8 data[];
+    };
+
+where:
+
+ * ``version`` indicates the version number, reserved for future
+   extension.
+
+ * ``opcode`` indicates the type of the event.
+
+ * ``len`` indicates the whole length of the event, including the
+   header and the following type-specific payload.
+
+ * ``fd`` indicates the file handle of the internal file object.
+
+ * ``id`` is a unique ID identifying the event.
+
+ * ``data`` holds the payload of the event.
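+
+A minimal sketch of the daemon side of this flow is shown below. It is
+illustrative only: it assumes the structures and ioctls described in this
+document are available from the UAPI header (``<linux/mfs.h>``) and that
+the mount point is ``/mnt/mfs`` (an arbitrary choice). It resolves the
+minor number via ``statfs()``, queries the running mode with the
+``MFS_IOC_FSINFO`` ioctl (see the Running mode section) and then reads raw
+events in a loop; error handling is mostly omitted::
+
+    #include <poll.h>
+    #include <stdio.h>
+    #include <fcntl.h>
+    #include <unistd.h>
+    #include <sys/ioctl.h>
+    #include <sys/statfs.h>
+    #include <linux/mfs.h>
+
+    int main(void)
+    {
+        struct statfs st;
+        struct mfs_ioc_fsinfo fsinfo;
+        char dev[32], buf[4096];
+        struct pollfd pfd = { .events = POLLIN };
+
+        /* the minor number is exported via the reserved statfs slot */
+        if (statfs("/mnt/mfs", &st) < 0)
+            return 1;
+        snprintf(dev, sizeof(dev), "/dev/mfs%ld", (long)st.f_spare[0]);
+        pfd.fd = open(dev, O_RDWR);
+        if (pfd.fd < 0)
+            return 1;
+
+        /* none/local/remote, see the Running mode section */
+        if (ioctl(pfd.fd, MFS_IOC_FSINFO, &fsinfo) < 0)
+            return 1;
+
+        while (poll(&pfd, 1, -1) > 0) {
+            struct mfs_msg *msg = (struct mfs_msg *)buf;
+
+            if (read(pfd.fd, buf, sizeof(buf)) <= 0)
+                continue;
+            /* dispatch on msg->opcode and fsinfo.mode here */
+        }
+        return 0;
+    }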
+
+MFS will only post read events when the data is missing in the local
+cache (memory or disk). The payload format is defined as follows::
+
+    struct mfs_read {
+        __u64 off;
+        __u64 len;
+        __s32 pid;
+    };
+
+where:
+
+ * ``off`` indicates the offset of the read request which triggered
+   this event.
+
+ * ``len`` indicates the length of the read request which triggered
+   this event.
+
+ * ``pid`` indicates the pid of the reading process which triggered
+   this event.
+
+Currently the opcode is defined as follows::
+
+    enum mfs_opcode {
+        MFS_OP_READ = 0,
+        MFS_OP_FAULT,
+        MFS_OP_FAROUND,
+    };
+
+These denote a normal read event, a page fault event, and a fault-around
+event issued ahead of the ongoing fault, respectively.
+
+Running mode
+============
+
+There are three running modes in MFS: none, local and remote. The user can
+issue the `MFS_IOC_FSINFO` ioctl on the device fd to obtain this
+information.
+
+The parameter for this request is as follows::
+
+    struct mfs_ioc_fsinfo {
+        __u8 mode; /* 0: none, 1: local, 2: remote */
+    };
+
+where ``mode`` is assigned one of the values of the following enum::
+
+    enum {
+        MFS_MODE_NONE = 0,
+        MFS_MODE_LOCAL,
+        MFS_MODE_REMOTE,
+    };
+
+In none mode, MFS does not report any events. It just passes operations
+through to the underlying file system.
+
+In local mode, MFS uses the page cache as its local cache. If a read
+request results in a cache miss, a MISS event is reported for each
+non-contiguous missing range. These events are asynchronous: the kernel
+does not block waiting for them. The user daemon can prefetch subsequent
+data based on these events to avoid future cache misses.
+
+In remote mode, MFS uses a local disk (as specified by the `cachedir`
+mount option) as its cache. If a read request misses in the local disk
+cache (checked using `SEEK_HOLE` and `SEEK_DATA`), a MISS event is
+reported. These events are synchronous: the kernel blocks on the event and
+waits for the user daemon to respond with the corresponding message id.
+
+In local mode, the user daemon responds to the target events with the
+following structure, passed to the `MFS_IOC_RA` ioctl issued on the `fd`
+from the message header::
+
+    struct mfs_ioc_ra {
+        __u64 off;
+        __u64 len;
+    };
+
+where:
+
+ * ``off`` indicates the offset at which to prefetch.
+
+ * ``len`` indicates the length to prefetch.
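+
+For example, a local-mode handler might simply turn each MISS event into a
+kernel readahead over the missing range. This is a minimal sketch under
+the same assumptions as the earlier example (UAPI definitions from
+``<linux/mfs.h>``)::
+
+    /* Handle one MFS_OP_READ event in local mode: ask the kernel to
+     * read ahead over the missing range so later reads hit the page
+     * cache. A real daemon could enlarge ra.len to prefetch further. */
+    static int handle_local_miss(struct mfs_msg *msg)
+    {
+        struct mfs_read *rd = (struct mfs_read *)msg->data;
+        struct mfs_ioc_ra ra = {
+            .off = rd->off,
+            .len = rd->len,
+        };
+
+        /* msg->fd refers to the internal file object of this event */
+        return ioctl(msg->fd, MFS_IOC_RA, &ra);
+    }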
+
+When running in remote mode, the user daemon should 1) fetch the target
+data from remote storage, 2) write the data to MFS using the `write()`
+syscall on the `fd` provided in the message header, and 3) reply by
+calling the `MFS_IOC_DONE` ioctl with the parameter::
+
+    struct mfs_ioc_done {
+        __u32 id;
+        __u32 ret;
+    };
+
+where:
+
+ * ``id`` indicates the message id in the message header.
+
+ * ``ret`` indicates the return code for the event; 0 means success.
+
+In some cases, the user daemon may need to obtain the full path of the
+file object associated with the event to implement a more complex
+strategy, such as one based on tracing. To do this, it uses the
+`MFS_IOC_RPATH` ioctl on the `fd` provided in the message header. The
+parameter for this is::
+
+    struct mfs_ioc_rpath {
+        __u16 max;
+        __u16 len;
+        __u8 d[];
+    };
+
+where:
+
+ * ``max`` indicates the maximum length of the input data area that may
+   be filled with the full path.
+
+ * ``len`` indicates the real length of the full path.
+
+ * ``d[]`` indicates the input data area allocated by the user daemon.
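+
+Putting the remote-mode pieces together, a handler for one read event
+could look like the sketch below. ``fetch_from_remote()`` is a placeholder
+for the daemon's own transport and is not part of MFS; the rest follows
+the ioctl descriptions above::
+
+    /* Handle one MFS_OP_READ event in remote mode: fill the cache file
+     * behind msg->fd with the missing bytes, then unblock the reader. */
+    static int handle_remote_miss(struct mfs_msg *msg)
+    {
+        struct mfs_read *rd = (struct mfs_read *)msg->data;
+        struct mfs_ioc_done done = { .id = msg->id, .ret = 0 };
+        static char buf[1 << 16]; /* arbitrary transfer chunk */
+        __u64 pos = rd->off, end = rd->off + rd->len;
+
+        while (pos < end) {
+            size_t n = end - pos > sizeof(buf) ? sizeof(buf) : end - pos;
+
+            /* 1) placeholder: fill buf[0..n) from remote storage */
+            if (fetch_from_remote(buf, pos, n) < 0 ||
+                /* 2) write the data back through the event fd */
+                pwrite(msg->fd, buf, n, pos) != (ssize_t)n) {
+                done.ret = 1; /* non-zero reports failure */
+                break;
+            }
+            pos += n;
+        }
+        /* 3) reply so the kernel stops blocking the original read */
+        return ioctl(msg->fd, MFS_IOC_DONE, &done);
+    }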
+
+The user daemon is expected to implement its own, flexible prefetch
+strategy, so the default VFS readahead is disabled in MFS.
+
+
+Use cases
+=========
+
+- Boost model weight loading.
+
+In this case, the user daemon can employ several strategies to improve
+performance, such as concurrent loading, larger read-ahead I/O sizes,
+NUMA-aware allocation and trace-based prefetching triggered by MISS
+events.
+
+- Tracing read I/O.
+
+In this case, the user daemon can log the MISS I/O while the process runs
+by parsing ``off`` and ``len`` in the message.
diff --git a/MAINTAINERS b/MAINTAINERS
index 33eeabab5088760084dd54e1e562ef8594a5d0aa..927e9f6aa9ce8d7ad2e5edc41acc98c4ca88b8d1 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -7762,6 +7762,16 @@ F: Documentation/filesystems/erofs.rst
 F: fs/erofs/
 F: include/trace/events/erofs.h
+MFS FILE SYSTEM
+M: Hongbo Li
+M: Xiaojia Huang
+L: linux-fsdevel@vger.kernel.org
+S: Maintained
+F: Documentation/filesystems/mfs.rst
+F: fs/mfs/
+F: include/trace/events/mfs.h
+F: include/uapi/linux/mfs.h
+
 ERRSEQ ERROR TRACKING INFRASTRUCTURE
 M: Jeff Layton
 S: Maintained
diff --git a/arch/arm64/configs/openeuler_defconfig b/arch/arm64/configs/openeuler_defconfig
index 9902feacdb006812126ae048946c44b6e34d906a..0f119402a78e1bf943b65b582365b050b9d42551 100644
--- a/arch/arm64/configs/openeuler_defconfig
+++ b/arch/arm64/configs/openeuler_defconfig
@@ -7245,6 +7245,7 @@ CONFIG_EROFS_FS_POSIX_ACL=y
 CONFIG_EROFS_FS_SECURITY=y
 # CONFIG_EROFS_FS_ZIP is not set
 CONFIG_EROFS_FS_ONDEMAND=y
+CONFIG_MFS_FS=y
 CONFIG_NETWORK_FILESYSTEMS=y
 CONFIG_NFS_FS=m
 CONFIG_NFS_V2=m
diff --git a/arch/x86/configs/openeuler_defconfig b/arch/x86/configs/openeuler_defconfig
index b5beb81300d26cbb27777cd7f23a34851cfcbe06..38d79af54ec124e843b74d2a919ad6bfe9cb5329 100644
--- a/arch/x86/configs/openeuler_defconfig
+++ b/arch/x86/configs/openeuler_defconfig
@@ -8425,6 +8425,7 @@ CONFIG_EROFS_FS_POSIX_ACL=y
 CONFIG_EROFS_FS_SECURITY=y
 # CONFIG_EROFS_FS_ZIP is not set
 CONFIG_EROFS_FS_ONDEMAND=y
+CONFIG_MFS_FS=y
 CONFIG_NETWORK_FILESYSTEMS=y
 CONFIG_NFS_FS=m
 # CONFIG_NFS_V2 is not set
diff --git a/fs/Kconfig b/fs/Kconfig
index 92bcdc29e6a88eec55ac8734f75776385690f1ba..6657ada03d066659d4ffd266cf55b17948304e1b 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -370,6 +370,7 @@ source "fs/sysv/Kconfig"
 source "fs/ufs/Kconfig"
 source "fs/erofs/Kconfig"
 source "fs/vboxsf/Kconfig"
+source "fs/mfs/Kconfig"
 
 endif # MISC_FILESYSTEMS
 
diff --git a/fs/Makefile b/fs/Makefile
index 81428bad22f072162bff406eeae4acca1ec2316d..fa1e3d0678398064138d7c8a0e13f408435676c4 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -136,3 +136,4 @@ obj-$(CONFIG_EROFS_FS) += erofs/
 obj-$(CONFIG_VBOXSF_FS) += vboxsf/
 obj-$(CONFIG_ZONEFS_FS) += zonefs/
 obj-$(CONFIG_RESCTRL_FS) += resctrl/
+obj-$(CONFIG_MFS_FS) += mfs/
\ No newline at end of file
diff --git a/fs/mfs/Kconfig b/fs/mfs/Kconfig
new file mode 100644
index 0000000000000000000000000000000000000000..26c336a3056e7e2cbfb332a474c9fba1eef41b9a
--- /dev/null
+++ b/fs/mfs/Kconfig
@@ -0,0 +1,12 @@
+# SPDX-License-Identifier: GPL-2.0-only
+
+config MFS_FS
+	tristate "MFS filesystem support"
+	help
+	  MFS provides programmable caching for users.
+	  It is a stackable file system which posts miss events when the
+	  data being read is not in the cache (memory or disk). Based on
+	  these events, users can trigger tailored I/O for each
+	  application.
+
+	  If unsure, say N.
diff --git a/fs/mfs/Makefile b/fs/mfs/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..68c090fb4bc3b26146db362cc15575690ad05b85
--- /dev/null
+++ b/fs/mfs/Makefile
@@ -0,0 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
+
+obj-$(CONFIG_MFS_FS) += mfs.o
+mfs-objs := super.o inode.o data.o dev.o cache.o
diff --git a/fs/mfs/cache.c b/fs/mfs/cache.c
new file mode 100644
index 0000000000000000000000000000000000000000..2675fe58b781f1a46b20afb778582bc2afcd0261
--- /dev/null
+++ b/fs/mfs/cache.c
@@ -0,0 +1,507 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/* Copyright (C) 2025. Huawei Technologies Co., Ltd */
+
+#include "internal.h"
+
+#include
+#include
+#include
+#include
+#include
+
+#include
+
+/*
+ * Used for cache object
+ */
+static struct kmem_cache *mfs_cobject_cachep;
+
+static int fd_release(struct inode *inode, struct file *file)
+{
+	struct mfs_cache_object *object = file->private_data;
+
+	down_write(&object->rwsem);
+	if (object->fd > 0) {
+		object->fd = -1;
+		object->anon_file = NULL;
+		iput(object->mfs_inode);
+	}
+	up_write(&object->rwsem);
+	return 0;
+}
+
+static ssize_t fd_write_iter(struct kiocb *iocb, struct iov_iter *iter)
+{
+	struct file *ori_file = iocb->ki_filp;
+	struct mfs_cache_object *object = ori_file->private_data;
+	struct mfs_sb_info *sbi = MFS_SB(object->mfs_inode->i_sb);
+	ssize_t ret;
+
+	if (!test_bit(MFS_CACHE_READY, &sbi->caches.flags))
+		return -EINVAL;
+	if (sbi->mode != MFS_MODE_REMOTE)
+		return -EOPNOTSUPP;
+
+	iocb->ki_filp = object->cache_file;
+	ret = vfs_iocb_iter_write(object->cache_file, iocb, iter);
+	iocb->ki_filp = ori_file;
+	return ret;
+}
+
+static loff_t fd_llseek(struct file *filp, loff_t pos, int whence)
+{
+	struct mfs_cache_object *object = filp->private_data;
+	struct mfs_sb_info *sbi = MFS_SB(object->mfs_inode->i_sb);
+
+	if (!test_bit(MFS_CACHE_READY, &sbi->caches.flags))
+		return -EINVAL;
+	if (sbi->mode != MFS_MODE_REMOTE)
+		return -EOPNOTSUPP;
+
+	return vfs_llseek(object->cache_file, pos, whence);
+}
+
+/* Used for sync events */
+static long _ioc_done(struct mfs_cache_object *object,
+		      struct mfs_ioc_done *done)
+{
+	struct mfs_sb_info *sbi = MFS_SB(object->mfs_inode->i_sb);
+	struct mfs_caches *caches = &sbi->caches;
+	XA_STATE(xas, &caches->events, done->id);
+	struct mfs_syncer *syncer;
+	struct mfs_event *event;
+
+	xas_lock(&xas);
+	event = xas_load(&xas);
+	if (!event || event->object != object) {
+		xas_unlock(&xas);
+		return -EINVAL;
+	}
+	xas_store(&xas, NULL);
+	syncer = event->syncer;
+	if (done->ret)
+		atomic_cmpxchg(&syncer->res, 0, -EIO);
+	spin_lock(&syncer->list_lock);
+	list_del(&event->link);
+	spin_unlock(&syncer->list_lock);
+	if (atomic_dec_return(&syncer->notback) == 0)
+		complete(&syncer->done);
+	xas_unlock(&xas);
+
+	put_mfs_event(event);
+	return 0;
+}
+
+static void force_ra(struct address_space *mapping, struct file *file,
+		     pgoff_t start, pgoff_t end)
+{
+	unsigned long default_pages = (4 * 1024 * 1024) / PAGE_SIZE;
+	DEFINE_READAHEAD(ractl, file, NULL, mapping, start);
+	pgoff_t index = start;
+	unsigned long nr_to_read;
+
+	nr_to_read = end - start + 1;
+	while (nr_to_read) {
+		if (default_pages > nr_to_read)
+			default_pages = nr_to_read;
+		if (index > end)
+			return;
+		ractl._index = index;
+		page_cache_ra_unbounded(&ractl,
default_pages, 0); + index += default_pages; + nr_to_read -= default_pages; + } +} + +/* Used for async events */ +static long _ioc_ra(struct mfs_cache_object *object, + struct mfs_ioc_ra *ra) +{ + struct file *file = object->cache_file; + struct address_space *mapping = file->f_mapping; + struct inode *inode = file_inode(file); + loff_t endbyte, isize; + pgoff_t start, end; + + isize = i_size_read(inode); + if (!isize) + return 0; + if (ra->off >= isize) + return -EINVAL; + endbyte = (u64)ra->off + (u64)ra->len; + if (!ra->len || endbyte < ra->len) + endbyte = LLONG_MAX; + else + endbyte--; + endbyte = min_t(loff_t, endbyte, isize); + + start = ra->off >> PAGE_SHIFT; + end = endbyte >> PAGE_SHIFT; + + force_ra(mapping, file, start, end); + return 0; +} + +static long fd_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) +{ + struct mfs_cache_object *object = filp->private_data; + struct mfs_sb_info *sbi = MFS_SB(object->mfs_inode->i_sb); + int ret = 0; + + if (!test_bit(MFS_CACHE_READY, &sbi->caches.flags)) + return -EINVAL; + + switch (cmd) { + case MFS_IOC_DONE: + { + struct mfs_ioc_done done; + + if (sbi->mode != MFS_MODE_REMOTE) + return -EOPNOTSUPP; + if (copy_from_user(&done, (void __user *)arg, sizeof(done))) + return -EFAULT; + ret = _ioc_done(object, &done); + break; + } + case MFS_IOC_RA: + { + struct mfs_ioc_ra ra; + + if (sbi->mode != MFS_MODE_LOCAL) + return -EOPNOTSUPP; + if (copy_from_user(&ra, (void __user *)arg, sizeof(ra))) + return -EFAULT; + ret = _ioc_ra(object, &ra); + break; + } + case MFS_IOC_RPATH: + { + struct mfs_ioc_rpath __user *ua = (struct mfs_ioc_rpath __user *)arg; + struct mfs_ioc_rpath *rpath; + int plen, clen; + u32 bytes; + char *p; + + if (get_user(bytes, &ua->max)) + return -EFAULT; + rpath = kzalloc(bytes + sizeof(struct mfs_ioc_rpath), GFP_KERNEL); + if (!rpath) + return -ENOMEM; + + rpath->max = bytes; + p = file_path(object->cache_file, rpath->d, rpath->max); + if (IS_ERR(p)) { + kfree(rpath); + return PTR_ERR(p); + } + plen = strlen(p), clen = strlen(sbi->cachedir); + if (plen <= clen) { + kfree(rpath); + return -EFAULT; + } + rpath->len = plen - clen; + /* include the tailing nil */ + memmove(rpath->d, p + clen, rpath->len + 1); + if (copy_to_user((void __user *)arg, rpath, + rpath->len + 1 + sizeof(struct mfs_ioc_rpath))) + ret = -EFAULT; + kfree(rpath); + break; + } + default: + return -EINVAL; + } + return ret; +} + +static const struct file_operations mfs_fd_fops = { + .owner = THIS_MODULE, + .release = fd_release, + .write_iter = fd_write_iter, + .llseek = fd_llseek, + .unlocked_ioctl = fd_ioctl, +}; + +static int mfs_setup_object(struct mfs_cache_object *object, + struct inode *inode, + struct path *cache_path) +{ + struct inode *cache_inode = d_inode(cache_path->dentry); + struct file *cache_file; + int flags = O_RDONLY; + + if (need_sync_event(inode->i_sb)) + flags = O_RDWR; + cache_file = kernel_file_open(cache_path, flags | O_LARGEFILE, + cache_inode, current_cred()); + if (IS_ERR(cache_file)) + return PTR_ERR(cache_file); + /* + * object belongs to a mfs inode, + * this is a reverse pointer, no refcount needed. 
+	 */
+	object->mfs_inode = inode;
+	object->cache_file = cache_file;
+	init_rwsem(&object->rwsem);
+	object->fd = -1;
+	object->anon_file = NULL;
+	return 0;
+}
+
+struct mfs_event *mfs_pick_event(struct xa_state *xas,
+				 unsigned long xa_max)
+{
+	struct mfs_event *event;
+
+	xas_for_each_marked(xas, event, xa_max, MFS_EVENT_NEW) {
+		return event;
+	}
+
+	return NULL;
+}
+
+void mfs_post_event_read(struct mfs_cache_object *object,
+			 loff_t off, uint64_t len,
+			 struct mfs_syncer *syncer, int op)
+{
+	struct mfs_sb_info *sbi = MFS_SB(object->mfs_inode->i_sb);
+	struct mfs_caches *caches = &sbi->caches;
+	XA_STATE(xas, &caches->events, 0);
+	struct mfs_event *event;
+	struct mfs_read *msg;
+	int ret;
+
+	/* 1. init the event struct */
+	event = kzalloc(sizeof(*event) + sizeof(*msg), GFP_KERNEL);
+	if (!event) {
+		pr_warn("post read event failed, off:%lld, len:%llu\n", off, len);
+		return;
+	}
+
+	/* 2. hold the object's owner mfs_inode */
+	ihold(object->mfs_inode);
+	trace_mfs_post_event_read(object->mfs_inode, off, len, op);
+	refcount_set(&event->ref, 1);
+	event->object = object;
+	event->msg.version = 0;
+	event->msg.opcode = op;
+	event->msg.len = sizeof(struct mfs_msg) + sizeof(struct mfs_read);
+	event->msg.fd = object->fd;
+	msg = (void *)event->msg.data;
+	msg->off = off;
+	msg->len = len;
+	msg->pid = current->pid;
+	INIT_LIST_HEAD(&event->link);
+	event->syncer = syncer;
+	if (event->syncer) {
+		atomic_inc(&syncer->notback);
+		spin_lock(&syncer->list_lock);
+		list_add_tail(&event->link, &syncer->head);
+		spin_unlock(&syncer->list_lock);
+	}
+
+	/* 3. insert the event into the events xarray */
+	do {
+		xas_lock(&xas);
+
+		if (!test_bit(MFS_CACHE_READY, &caches->flags)) {
+			xas_unlock(&xas);
+			goto out;
+		}
+
+		/* Ensure the cache-ready check is done before posting events */
+		smp_mb__after_atomic();
+
+		xas.xa_index = caches->next_msg;
+		xas_find_marked(&xas, UINT_MAX, XA_FREE_MARK);
+		if (xas.xa_node == XAS_RESTART) {
+			xas.xa_index = 0;
+			xas_find_marked(&xas, caches->next_msg - 1, XA_FREE_MARK);
+		}
+		if (xas.xa_node == XAS_RESTART)
+			xas_set_err(&xas, -EBUSY);
+		xas_store(&xas, event);
+		if (xas_valid(&xas)) {
+			caches->next_msg = xas.xa_index + 1;
+			event->msg.id = xas.xa_index;
+			xas_clear_mark(&xas, XA_FREE_MARK);
+			xas_set_mark(&xas, MFS_EVENT_NEW);
+		}
+		xas_unlock(&xas);
+	} while (xas_nomem(&xas, GFP_KERNEL));
+
+	ret = xas_error(&xas);
+	if (ret) {
+		pr_warn("post read event failed to insert events, off:%lld, len:%llu, ret:%d\n",
+			off, len, ret);
+		goto out;
+	}
+
+	/* 4. wake up the polling wait list */
+	wake_up_all(&caches->pollwq);
+	return;
+out:
+	if (event->syncer) {
+		spin_lock(&syncer->list_lock);
+		list_del_init(&event->link);
+		spin_unlock(&syncer->list_lock);
+		atomic_dec(&syncer->notback);
+	}
+	kfree(event);
+	iput(object->mfs_inode);
+}
+
+void mfs_destroy_events(struct super_block *sb)
+{
+	struct mfs_sb_info *sbi = MFS_SB(sb);
+	struct mfs_caches *caches = &sbi->caches;
+	unsigned long index;
+	struct mfs_event *event;
+
+	xa_lock(&caches->events);
+	xa_for_each(&caches->events, index, event) {
+		/*
+		 * Inodes are evicted before the events are destroyed,
+		 * so no events should remain at this point.
+		 */
+		pr_warn("Event remains:%lu\n", index);
+		__xa_erase(&caches->events, index);
+		put_mfs_event(event);
+	}
+	xa_unlock(&caches->events);
+	xa_destroy(&caches->events);
+}
+
+void mfs_cancel_syncer_events(struct mfs_cache_object *object,
+			      struct mfs_syncer *syncer)
+{
+	struct mfs_sb_info *sbi = MFS_SB(object->mfs_inode->i_sb);
+	struct mfs_caches *caches = &sbi->caches;
+	struct xarray *xa = &caches->events;
+	struct mfs_event *event, *nevent;
+
+	xa_lock(xa);
+	spin_lock(&syncer->list_lock);
+	list_for_each_entry_safe(event, nevent, &syncer->head, link) {
+		__xa_erase(&caches->events, event->msg.id);
+		list_del(&event->link);
+		put_mfs_event(event);
+	}
+	spin_unlock(&syncer->list_lock);
+	xa_unlock(xa);
+}
+
+void mfs_cancel_all_events(struct mfs_sb_info *sbi)
+{
+	struct mfs_caches *caches = &sbi->caches;
+	struct xarray *xa = &caches->events;
+	struct mfs_syncer *syncer;
+	struct mfs_event *event;
+	unsigned long index;
+
+	while (!xa_empty(xa)) {
+		xa_lock(xa);
+		xa_for_each(xa, index, event) {
+			__xa_erase(xa, index);
+			syncer = event->syncer;
+			/*
+			 * The syncer may live on a caller's stack, so wake
+			 * it up while still holding the xa lock.
+			 */
+			if (syncer) {
+				spin_lock(&syncer->list_lock);
+				list_del(&event->link);
+				spin_unlock(&syncer->list_lock);
+				if (atomic_dec_return(&syncer->notback) == 0) {
+					atomic_cmpxchg(&syncer->res, 0, -EIO);
+					complete(&syncer->done);
+				}
+			}
+			put_mfs_event(event);
+			if (need_resched())
+				break;
+		}
+		xa_unlock(xa);
+		cond_resched();
+	}
+	caches->next_ev = 0;
+	caches->next_msg = 0;
+}
+
+int try_hook_fd(struct mfs_event *event)
+{
+	struct mfs_cache_object *object = event->object;
+	struct file *anon_file;
+	int fd;
+
+	down_read(&object->rwsem);
+	if (object->fd > 0) {
+		up_read(&object->rwsem);
+		return object->fd;
+	}
+	up_read(&object->rwsem);
+	down_write(&object->rwsem);
+	fd = get_unused_fd_flags(O_WRONLY);
+	if (fd < 0) {
+		up_write(&object->rwsem);
+		return fd;
+	}
+
+	anon_file = anon_inode_getfile("[mfs]", &mfs_fd_fops, object, O_WRONLY);
+	if (IS_ERR(anon_file)) {
+		put_unused_fd(fd);
+		up_write(&object->rwsem);
+		return PTR_ERR(anon_file);
+	}
+	anon_file->f_mode |= FMODE_PWRITE | FMODE_LSEEK;
+	object->fd = fd;
+	object->anon_file = anon_file;
+	/* the mfs_inode must outlive the fd/anon_file, hence the hold */
+	ihold(object->mfs_inode);
+	fd_install(fd, anon_file);
+	up_write(&object->rwsem);
+	return fd;
+}
+
+struct mfs_cache_object *mfs_alloc_object(struct inode *inode,
+					  struct path *cache_path)
+{
+	struct mfs_cache_object *object;
+	int err;
+
+	object = kmem_cache_alloc(mfs_cobject_cachep, GFP_KERNEL);
+	if (!object)
+		return ERR_PTR(-ENOMEM);
+
+	err = mfs_setup_object(object, inode, cache_path);
+	if (err) {
+		kmem_cache_free(mfs_cobject_cachep, object);
+		return ERR_PTR(err);
+	}
+
+	return object;
+}
+
+void mfs_free_object(void *data)
+{
+	struct mfs_cache_object *object = (struct mfs_cache_object *)data;
+
+	fput(object->cache_file);
+	kmem_cache_free(mfs_cobject_cachep, object);
+}
+
+int mfs_cache_init(void)
+{
+	mfs_cobject_cachep =
+		kmem_cache_create("mfs_object",
+				  sizeof(struct mfs_cache_object), 0,
+				  SLAB_RECLAIM_ACCOUNT, NULL);
+	if (!mfs_cobject_cachep)
+		return -ENOMEM;
+	return 0;
+}
+
+void mfs_cache_exit(void)
+{
+	kmem_cache_destroy(mfs_cobject_cachep);
+}
diff --git a/fs/mfs/data.c b/fs/mfs/data.c
new file mode 100644
index 0000000000000000000000000000000000000000..32d0b6947aa87336f121163024b5c5fb002c5be4
--- /dev/null
+++ b/fs/mfs/data.c
@@ -0,0 +1,523 @@
+// 
SPDX-License-Identifier: GPL-2.0-or-later +/* Copyright (C) 2025. Huawei Technologies Co., Ltd */ + +#include "internal.h" + +#include +#include +#include + +#include + +static struct mfs_file_info *mfs_file_info_alloc(struct file *lower, struct file *cache) +{ + struct mfs_file_info *info = kzalloc(sizeof(struct mfs_file_info), GFP_KERNEL); + + if (unlikely(!info)) + return NULL; + + info->lower = lower; + info->cache = cache; + return info; +} + +static void mfs_file_info_free(struct mfs_file_info *info) +{ + fput(info->cache); + fput(info->lower); + kfree(info); +} + +static int mfs_open(struct inode *inode, struct file *file) +{ + struct dentry *dentry = file_dentry(file); + struct mfs_sb_info *sbi = MFS_SB(inode->i_sb); + struct path lpath, cpath; + struct file *lfile, *cfile; + int flags = file->f_flags | MFS_OPEN_FLAGS; + struct mfs_file_info *file_info; + int err = 0; + + trace_mfs_open(inode, file); + mfs_get_path(dentry, &lpath, &cpath); + lfile = dentry_open(&lpath, flags, current_cred()); + if (IS_ERR(lfile)) { + err = PTR_ERR(lfile); + goto put_path; + } + + cfile = dentry_open(&cpath, flags, current_cred()); + if (IS_ERR(cfile)) { + err = PTR_ERR(cfile); + goto lfput; + } + + if (support_event(sbi)) + /* close the default readahead */ + cfile->f_mode |= FMODE_RANDOM; + file_info = mfs_file_info_alloc(lfile, cfile); + if (!file_info) { + err = -ENOMEM; + goto cfput; + } + + file->private_data = file_info; + goto put_path; +cfput: + fput(cfile); +lfput: + fput(lfile); +put_path: + mfs_put_path(&lpath, &cpath); + return err; +} + +static int mfs_release(struct inode *inode, struct file *file) +{ + trace_mfs_release(inode, file); + mfs_file_info_free(file->private_data); + return 0; +} + +static loff_t mfs_llseek(struct file *file, loff_t offset, int whence) +{ + struct mfs_file_info *file_info = file->private_data; + struct inode *inode = file_inode(file); + struct file *lfile, *cfile; + loff_t ret; + + if (offset == 0) { + if (whence == SEEK_CUR) + return file->f_pos; + + if (whence == SEEK_SET) + return vfs_setpos(file, 0, 0); + } + + lfile = file_info->lower; + cfile = file_info->cache; + + mfs_inode_lock(inode); + lfile->f_pos = file->f_pos; + ret = vfs_llseek(lfile, offset, whence); + if (ret < 0) + goto out; + + cfile->f_pos = file->f_pos; + ret = vfs_llseek(cfile, offset, whence); + if (ret < 0) + goto out; + + file->f_pos = lfile->f_pos; +out: + mfs_inode_unlock(inode); + return ret; +} + +static int mfs_flush(struct file *file, fl_owner_t id) +{ + struct mfs_file_info *file_info = file->private_data; + struct file *cfile; + int err = 0; + + cfile = file_info->cache; + if (cfile->f_op->flush) + err = cfile->f_op->flush(cfile, id); + + return err; +} + +static int mfs_readdir(struct file *file, struct dir_context *ctx) +{ + struct file *lfile; + struct mfs_file_info *file_info = file->private_data; + + lfile = file_info->lower; + return iterate_dir(lfile, ctx); +} + +enum range_status { + RANGE_DATA, + RANGE_HOLE, + RANGE_INVAL, +}; + +/* Continuous range with same status */ +struct range_t { + struct file *file; + loff_t off; + size_t max; + size_t len; + int status; +}; + +typedef int (*range_check) (struct range_t *r); + +struct range_ctx { + bool sync; /* handle the miss case in sync/async way */ + int op; + loff_t off; + size_t len; + struct file *file; + struct mfs_cache_object *object; + range_check checker; /* check method for range */ +}; + +static int range_check_disk(struct range_t *r) +{ + loff_t off, to, start = r->off, end = r->off + r->max; + struct file 
*file = r->file; + int err = 0; + + off = vfs_llseek(file, start, SEEK_DATA); + if (off < 0) { + if (off == (loff_t)-ENXIO) { + r->len = end - start; + r->status = RANGE_HOLE; + goto out; + } + err = (int)off; + goto out; + } + if (off >= end) { + r->len = end - start; + r->status = RANGE_HOLE; + goto out; + } + if (off > start) { + r->len = end - off; + r->status = RANGE_HOLE; + goto out; + } + to = vfs_llseek(file, start, SEEK_HOLE); + if (to < 0) { + err = (int)to; + goto out; + } + if (to < end) { + r->len = to - start; + r->status = RANGE_DATA; + goto out; + } + r->len = end - start; + r->status = RANGE_DATA; +out: + return err; +} + +static int range_check_mem(struct range_t *r) +{ + struct inode *inode = file_inode(r->file); + struct address_space *mapping = inode->i_mapping; + loff_t cur_off = r->off, end = r->off + r->max; + struct folio *folio; + + /* check from the first folio */ + folio = filemap_get_folio(mapping, cur_off >> PAGE_SHIFT); + if (IS_ERR(folio)) { + r->status = RANGE_HOLE; + cur_off += PAGE_SIZE; + } else { + r->status = RANGE_DATA; + cur_off += folio_size(folio); + folio_put(folio); + } + + while (cur_off < end) { + folio = filemap_get_folio(mapping, cur_off >> PAGE_SHIFT); + if (IS_ERR(folio)) { + if (r->status == RANGE_DATA) + break; + /* continuous hole */ + cur_off += PAGE_SIZE; + continue; + } + if (r->status == RANGE_HOLE) { + folio_put(folio); + break; + } + cur_off += folio_size(folio); + folio_put(folio); + } + + r->len = cur_off - r->off; + return 0; +} + +static int mfs_check_range(struct range_ctx *ctx) +{ + struct mfs_sb_info *sbi = MFS_SB(ctx->object->mfs_inode->i_sb); + loff_t start = ctx->off, end = ctx->off + ctx->len; + struct file *file = ctx->file; + struct range_t r = { .file = file }; + size_t len = ctx->len; + struct mfs_syncer syncer; + int err = 0, err2 = 0; + + if (!support_event(sbi)) + return 0; + if (!cache_is_ready(sbi)) + return ctx->sync ? 
-EIO : 0; + if (!ctx->len) + return 0; + + atomic_set(&syncer.notback, 1); + init_completion(&syncer.done); + INIT_LIST_HEAD(&syncer.head); + spin_lock_init(&syncer.list_lock); + atomic_set(&syncer.res, 0); + while (start < end) { + r.off = round_down(start, PAGE_SIZE); + r.max = len + (start - r.off); + r.len = 0; + r.status = RANGE_INVAL; + err = ctx->checker(&r); + if (err) + goto err; + switch (r.status) { + case RANGE_DATA: + start += r.len; + len -= r.len; + break; + case RANGE_HOLE: + start += r.len; + len -= r.len; + if (ctx->sync) + mfs_post_event_read(ctx->object, r.off, r.len, &syncer, ctx->op); + else + mfs_post_event_read(ctx->object, r.off, r.len, NULL, ctx->op); + break; + default: + pr_warn("invalid range status:%d\n", r.status); + WARN_ON_ONCE(1); + err = -EINVAL; + goto err; + } + } + +err: + if (atomic_dec_return(&syncer.notback) > 0) { + err2 = wait_for_completion_interruptible(&syncer.done); + if (err2) + mfs_cancel_syncer_events(ctx->object, &syncer); + else + err = atomic_read(&syncer.res); + } + return err ?: err2; +} + +static ssize_t mfs_read_iter(struct kiocb *iocb, struct iov_iter *to) +{ + struct file *cfile, *file = iocb->ki_filp; + struct mfs_file_info *fi = file->private_data; + size_t isize = i_size_read(file_inode(file)); + struct range_ctx ctx; + ssize_t rsize; + int err; + + if (!iov_iter_count(to)) + return 0; + + cfile = fi->cache; + if (!cfile->f_op->read_iter) + return -EINVAL; + + (void)get_file(cfile); + ctx.file = cfile; + ctx.object = file_inode(file)->i_private; + ctx.off = iocb->ki_pos; + ctx.op = MFS_OP_READ; + ctx.len = min_t(size_t, isize - ctx.off, iov_iter_count(to)); + ctx.sync = false; + ctx.checker = range_check_mem; + if (need_sync_event(file_inode(file)->i_sb)) { + ctx.sync = true; + ctx.checker = range_check_disk; + } + err = mfs_check_range(&ctx); + if (err) { + fput(cfile); + return err; + } + + iocb->ki_filp = cfile; + rsize = cfile->f_op->read_iter(iocb, to); + iocb->ki_filp = file; + fput(cfile); + return rsize; +} + +static int mfs_file_mmap(struct file *file, struct vm_area_struct *vma) +{ + struct mfs_file_info *fi = file->private_data; + struct file *cfile = fi->cache; + int err; + + if (!cfile->f_op->mmap) + return -ENODEV; + + (void)get_file(cfile); + vma->vm_file = cfile; + err = call_mmap(vma->vm_file, vma); + vma->vm_file = file; + fput(cfile); + if (err) + return err; + + fi->cache_vm_ops = vma->vm_ops; + vma->vm_ops = &mfs_file_vm_ops; + + return 0; +} + +static vm_fault_t mfs_filemap_fault(struct vm_fault *vmf) +{ + struct file *cfile, *file = vmf->vma->vm_file; + struct mfs_file_info *fi = file->private_data; + size_t isize = i_size_read(file_inode(file)); + const struct vm_operations_struct *cvm_ops; + struct vm_area_struct cvma, *vma, **vma_; + struct range_ctx ctx; + vm_fault_t ret; + int err; + + vma = vmf->vma; + memcpy(&cvma, vma, sizeof(struct vm_area_struct)); + cfile = fi->cache; + cvm_ops = fi->cache_vm_ops; + cvma.vm_file = cfile; + + if (unlikely(!cvm_ops->fault)) + return VM_FAULT_SIGBUS; + if ((vmf->pgoff << PAGE_SHIFT) >= isize) + return VM_FAULT_SIGBUS; + + (void)get_file(cfile); + ctx.file = cfile; + ctx.object = file_inode(file)->i_private; + ctx.off = vmf->pgoff << PAGE_SHIFT; + ctx.len = min_t(size_t, isize - ctx.off, PAGE_SIZE); + ctx.op = MFS_OP_FAULT; + ctx.sync = false; + ctx.checker = range_check_mem; + if (need_sync_event(file_inode(file)->i_sb)) { + ctx.sync = true; + ctx.checker = range_check_disk; + } + err = mfs_check_range(&ctx); + if (err) { + fput(cfile); + return VM_FAULT_SIGBUS; + 
} + + /* + * Dealing fault in mfs will call cachefile's fault eventually, + * hence we will change vmf->vma->vm_file to cachefile. + * When faulting concurrently, changing vmf->vma->vm_file is + * visible to other threads. Hence we use cvma to narrow the + * visibility. vmf->vma is const, so we use **vma_ to change. + */ + vma_ = (struct vm_area_struct **)&vmf->vma; + *vma_ = &cvma; + ret = cvm_ops->fault(vmf); + *vma_ = vma; + fput(cfile); + return ret; +} + +static vm_fault_t mfs_filemap_map_pages(struct vm_fault *vmf, + pgoff_t start_pgoff, pgoff_t end_pgoff) +{ + struct file *cfile, *file = vmf->vma->vm_file; + struct mfs_file_info *fi = file->private_data; + size_t isize = i_size_read(file_inode(file)); + const struct vm_operations_struct *cvm_ops; + struct vm_area_struct cvma, *vma, **vma_; + struct range_ctx ctx; + vm_fault_t ret; + int err; + + vma = vmf->vma; + memcpy(&cvma, vma, sizeof(struct vm_area_struct)); + cfile = fi->cache; + cvm_ops = fi->cache_vm_ops; + cvma.vm_file = cfile; + + if (unlikely(!cvm_ops->map_pages)) + return 0; + if ((start_pgoff << PAGE_SHIFT) >= isize) + return 0; + + (void)get_file(cfile); + ctx.file = cfile; + ctx.object = file_inode(file)->i_private; + ctx.off = start_pgoff << PAGE_SHIFT; + ctx.len = min_t(size_t, isize - ctx.off, (end_pgoff - start_pgoff) << PAGE_SHIFT); + ctx.op = MFS_OP_FAROUND; + ctx.sync = false; + ctx.checker = range_check_mem; + if (need_sync_event(file_inode(file)->i_sb)) { + ctx.sync = true; + ctx.checker = range_check_disk; + } + err = mfs_check_range(&ctx); + if (err) { + fput(cfile); + return 0; + } + + vma_ = (struct vm_area_struct **)&vmf->vma; + *vma_ = &cvma; + ret = cvm_ops->map_pages(vmf, start_pgoff, end_pgoff); + *vma_ = vma; + fput(cfile); + + return ret; +} + +static int mfs_fadvise(struct file *file, loff_t offset, loff_t len, int advice) +{ + struct inode *inode = file_inode(file); + struct mfs_sb_info *sbi = MFS_SB(inode->i_sb); + struct mfs_file_info *fi; + struct file *cfile; + int ret; + + /* avoid trigger readahead in event mode */ + if (support_event(sbi)) + return generic_fadvise(file, offset, len, advice); + + fi = file->private_data; + cfile = fi->cache; + (void)get_file(cfile); + + ret = vfs_fadvise(cfile, offset, len, advice); + fput(cfile); + + return ret; +} + +const struct file_operations mfs_dir_fops = { + .open = mfs_open, + .iterate_shared = mfs_readdir, + .release = mfs_release, +}; + +const struct file_operations mfs_file_fops = { + .open = mfs_open, + .release = mfs_release, + .llseek = mfs_llseek, + .read_iter = mfs_read_iter, + .flush = mfs_flush, + .mmap = mfs_file_mmap, + .fadvise = mfs_fadvise, +}; + +const struct vm_operations_struct mfs_file_vm_ops = { + .fault = mfs_filemap_fault, + .map_pages = mfs_filemap_map_pages, +}; + +const struct address_space_operations mfs_aops = { + .direct_IO = noop_direct_IO, +}; diff --git a/fs/mfs/dev.c b/fs/mfs/dev.c new file mode 100644 index 0000000000000000000000000000000000000000..902f73b1c25c5c57ad968f9dae8381c50d3048b5 --- /dev/null +++ b/fs/mfs/dev.c @@ -0,0 +1,242 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* Copyright (C) 2025. 
Huawei Technologies Co., Ltd */ + +#include "internal.h" + +#include +#include +#include +#include + +#include + +static DEFINE_MUTEX(mfs_dev_lock); +static DEFINE_IDR(mfs_dev_minor); + +static int mfs_dev_major; +static const struct class mfs_dev_class = { + .name = "mfs", +}; +static struct device *mfs_dev; + +static int mfs_dev_open(struct inode *inode, struct file *file) +{ + struct mfs_caches *caches; + struct mfs_sb_info *sbi; + unsigned minor = iminor(inode); + + sbi = minor < U8_MAX ? idr_find(&mfs_dev_minor, minor) : NULL; + if (!sbi) + return -EOPNOTSUPP; + caches = &sbi->caches; + if (test_and_set_bit(MFS_CACHE_OPENED, &caches->flags)) + return -EBUSY; + smp_mb__after_atomic(); + + /* not mounted or cleaned by umounting */ + if (!test_bit(MFS_MOUNTED, &sbi->flags)) { + clear_bit(MFS_CACHE_OPENED, &caches->flags); + return -EBUSY; + } + + file->private_data = sbi; + set_bit(MFS_CACHE_READY, &caches->flags); + return 0; +} + +static int mfs_dev_release(struct inode *inode, struct file *file) +{ + struct mfs_sb_info *sbi = file->private_data; + struct mfs_caches *caches = &sbi->caches; + + clear_bit(MFS_CACHE_READY, &caches->flags); + smp_mb__after_atomic(); + mfs_cancel_all_events(sbi); + smp_mb__before_atomic(); + clear_bit(MFS_CACHE_OPENED, &caches->flags); + return 0; +} + +static ssize_t mfs_dev_read(struct file *file, char __user *buf, + size_t blen, loff_t *off) +{ + struct mfs_sb_info *sbi = file->private_data; + struct mfs_caches *caches = &sbi->caches; + XA_STATE(xas, &caches->events, caches->next_ev); + struct mfs_event *event; + struct mfs_msg *msg; + size_t n; + int ret = 0; + + xas_lock(&xas); + event = mfs_pick_event(&xas, ULONG_MAX); + if (!event && caches->next_ev > 0) { + xas_set(&xas, 0); + event = mfs_pick_event(&xas, caches->next_ev - 1); + } + if (!event) { + xas_unlock(&xas); + return 0; + } + if (event->syncer) + get_mfs_event(event); + xas_unlock(&xas); + + msg = &event->msg; + n = msg->len; + if (n > blen) { + ret = -EMSGSIZE; + goto out; + } + + ret = try_hook_fd(event); + if (ret < 0) + goto out; + + msg->fd = ret; + ret = 0; + if (copy_to_user(buf, msg, n)) { + ret = -EFAULT; + goto out; + } + xas_lock(&xas); + xas_clear_mark(&xas, MFS_EVENT_NEW); + caches->next_ev = xas.xa_index + 1; + if (!event->syncer) + xas_store(&xas, NULL); + xas_unlock(&xas); +out: + put_mfs_event(event); + trace_mfs_dev_read(file, msg->opcode, msg->id, msg->fd); + return ret ? ret : n; +} + +static __poll_t mfs_dev_poll(struct file *file, + struct poll_table_struct *poll) +{ + struct mfs_sb_info *sbi = file->private_data; + struct mfs_caches *caches = &sbi->caches; + struct mfs_event *event; + XA_STATE(xas, &caches->events, 0); + __poll_t mask; + + poll_wait(file, &caches->pollwq, poll); + mask = 0; + + if (!xa_empty(&caches->events)) { + xas_lock(&xas); + xas_for_each_marked(&xas, event, ULONG_MAX, MFS_EVENT_NEW) { + mask |= EPOLLIN; + break; + } + xas_unlock(&xas); + } + + return mask; +} + +static long mfs_dev_ioctl(struct file *filp, + unsigned int cmd, unsigned long arg) +{ + struct mfs_ioc_fsinfo fsinfo; + unsigned minor = iminor(file_inode(filp)); + struct mfs_sb_info *sbi = minor < U8_MAX ? 
+ idr_find(&mfs_dev_minor, minor) : NULL; + if (!sbi) + return -EOPNOTSUPP; + + if (cmd != MFS_IOC_FSINFO) + return -EINVAL; + if (!test_bit(MFS_MOUNTED, &sbi->flags)) + return -EBUSY; + + fsinfo.mode = sbi->mode; + if (copy_to_user((void __user *)arg, &fsinfo, + sizeof(struct mfs_ioc_fsinfo))) + return -EFAULT; + return 0; +} + +static const struct file_operations mfs_dev_fops = { + .owner = THIS_MODULE, + .open = mfs_dev_open, + .release = mfs_dev_release, + .read = mfs_dev_read, + .poll = mfs_dev_poll, + .unlocked_ioctl = mfs_dev_ioctl, +}; + +int mfs_fs_dev_init(struct super_block *sb) +{ + struct mfs_sb_info *sbi = MFS_SB(sb); + struct device *dev; + + mutex_lock(&mfs_dev_lock); + sbi->minor = idr_alloc(&mfs_dev_minor, sbi, 0, U8_MAX, GFP_KERNEL); + if (sbi->minor < 0) { + mutex_unlock(&mfs_dev_lock); + return sbi->minor; + } + + dev = device_create(&mfs_dev_class, NULL, + MKDEV(mfs_dev_major, sbi->minor), sbi, + "mfs%u", sbi->minor); + if (IS_ERR(dev)) { + idr_remove(&mfs_dev_minor, sbi->minor); + sbi->minor = -1; + mutex_unlock(&mfs_dev_lock); + return PTR_ERR(dev); + } + mutex_unlock(&mfs_dev_lock); + return 0; +} + +void mfs_fs_dev_exit(struct super_block *sb) +{ + struct mfs_sb_info *sbi = MFS_SB(sb); + + if (sbi->minor < 0) + return; + mutex_lock(&mfs_dev_lock); + device_destroy(&mfs_dev_class, MKDEV(mfs_dev_major, sbi->minor)); + idr_remove(&mfs_dev_minor, sbi->minor); + mutex_unlock(&mfs_dev_lock); + sbi->minor = -1; +} + +int mfs_dev_init(void) +{ + int ret; + + mfs_dev_major = register_chrdev(0, "mfs-ctl", &mfs_dev_fops); + if (mfs_dev_major < 0) + return mfs_dev_major; + + ret = class_register(&mfs_dev_class); + if (ret) + goto major_out; + + mfs_dev = device_create(&mfs_dev_class, NULL, + MKDEV(mfs_dev_major, U8_MAX), + NULL, "mfs-ctl"); + if (IS_ERR(mfs_dev)) { + ret = PTR_ERR(mfs_dev); + goto class_out; + } + return 0; + +class_out: + class_unregister(&mfs_dev_class); +major_out: + unregister_chrdev(mfs_dev_major, "mfs-ctl"); + return ret; +} + +void mfs_dev_exit(void) +{ + if (!IS_ERR_OR_NULL(mfs_dev)) + device_destroy(&mfs_dev_class, MKDEV(mfs_dev_major, U8_MAX)); + class_unregister(&mfs_dev_class); + if (mfs_dev_major > 0) + unregister_chrdev(mfs_dev_major, "mfs-ctl"); +} diff --git a/fs/mfs/inode.c b/fs/mfs/inode.c new file mode 100644 index 0000000000000000000000000000000000000000..e45a5da7a67cb5ab4eedd1f51310083405a953f7 --- /dev/null +++ b/fs/mfs/inode.c @@ -0,0 +1,303 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* Copyright (C) 2025. 
Huawei Technologies Co., Ltd */ + +#include "internal.h" + +#include +#include +#include + +#include + +static int mfs_inode_eq(struct inode *inode, void *lower_target) +{ + return mfs_lower_inode(inode) == (struct inode *)lower_target; +} + +static int mfs_inode_set(struct inode *inode, void *lower_target) +{ + return 0; +} + +static struct inode *_mfs_get_inode(struct super_block *sb, + struct path *lower_path, + struct path *cache_path) +{ + struct mfs_sb_info *sbi = MFS_SB(sb); + struct inode *ret, *lower_inode, *cache_inode; + + lower_inode = d_inode(lower_path->dentry); + cache_inode = d_inode(cache_path->dentry); + + /* lower file system cannot change */ + if (lower_inode->i_sb != sbi->lower.dentry->d_sb) { + ret = ERR_PTR(-EXDEV); + goto out; + } + + /* check consistency: mode and size */ + if ((lower_inode->i_mode & S_IFMT) != (cache_inode->i_mode & S_IFMT)) { + ret = ERR_PTR(-EUCLEAN); + goto out; + } + if (S_ISREG(lower_inode->i_mode) + && lower_inode->i_size != cache_inode->i_size) { + ret = ERR_PTR(-EUCLEAN); + goto out; + } + + /* allocate new inode for mfs */ + ret = mfs_iget(sb, lower_inode, cache_path); +out: + return ret; +} + +static int _lookup_create(struct path *lpath, struct path *parent_cpath, + const char *name, struct path *cpath) +{ + struct dentry *ldentry, *parent_cdentry, *dentry; + struct inode *linode, *cdir; + int ret = 0, _ret; + + ldentry = lpath->dentry; + parent_cdentry = parent_cpath->dentry; + linode = d_inode(ldentry); + cdir = d_inode(parent_cpath->dentry); + + inode_lock_nested(cdir, I_MUTEX_PARENT); +retry: + dentry = lookup_one_len(name, parent_cdentry, strlen(name)); + if (IS_ERR(dentry)) { + ret = PTR_ERR(dentry); + goto out; + } + + cpath->mnt = mntget(parent_cpath->mnt); + cpath->dentry = dentry; + if (d_is_positive(dentry)) + goto out; + + if (d_is_dir(ldentry)) { + ret = vfs_mkdir(&nop_mnt_idmap, cdir, dentry, linode->i_mode); + if (ret) + goto new_err; + /* + * In the event that the filesystem does not use the @dentry + * but leaves it negative or unhashes it. 
+ */ + if (unlikely(d_unhashed(dentry))) { + mntput(parent_cpath->mnt); + dput(dentry); + goto retry; + } + } else { + /* dir or file, symlink will be considerred the regular file */ + ret = vfs_create(&nop_mnt_idmap, cdir, dentry, linode->i_mode, true); + if (ret) + goto new_err; + ret = vfs_truncate(cpath, linode->i_size); + if (ret) + goto truncate_err; + } + goto out; + +truncate_err: + _ret = vfs_unlink(&nop_mnt_idmap, cdir, dentry, NULL); + if (_ret) + pr_err("cleanup failed for file:%s, err:%d\n", name, _ret); +new_err: + mntput(parent_cpath->mnt); + dput(dentry); +out: + inode_unlock(cdir); + return ret; +} + +static struct dentry *mfs_lookup(struct inode *dir, struct dentry *dentry, + unsigned int flag) +{ + struct path parent_lpath, parent_cpath, lpath, cpath; + struct dentry *ret, *parent; + struct inode *inode; + const char *name; + int err; + + trace_mfs_lookup(dir, dentry, flag); + parent = dget_parent(dentry); + mfs_get_path(parent, &parent_lpath, &parent_cpath); + err = mfs_alloc_dentry_info(dentry); + if (err) { + ret = ERR_PTR(err); + goto out; + } + /* lookup from lower layer */ + name = dentry->d_name.name; + err = vfs_path_lookup(parent_lpath.dentry, + parent_lpath.mnt, + name, 0, &lpath); + if (err) { + ret = ERR_PTR(err); + mfs_free_dentry_info(dentry); + goto out; + } + /* check from cache layer */ + err = vfs_path_lookup(parent_cpath.dentry, + parent_cpath.mnt, + name, 0, &cpath); + if (err) { + if (err != -ENOENT) { +cdentry_fail: + ret = ERR_PTR(err); + path_put(&lpath); + mfs_free_dentry_info(dentry); + goto out; + } + err = _lookup_create(&lpath, &parent_cpath, name, &cpath); + if (err) + goto cdentry_fail; + } + /* build the inode from lower layer */ + inode = _mfs_get_inode(dir->i_sb, &lpath, &cpath); + if (IS_ERR(inode)) { + path_put(&lpath); + path_put(&cpath); + mfs_free_dentry_info(dentry); + ret = ERR_PTR(PTR_ERR(inode)); + goto out; + } + mfs_install_path(dentry, &lpath, &cpath); + ret = d_splice_alias(inode, dentry); + if (IS_ERR(ret)) { + path_put(&lpath); + path_put(&cpath); + mfs_free_dentry_info(dentry); + } +out: + mfs_put_path(&parent_lpath, &parent_cpath); + dput(parent); + return ret; +} + +static int mfs_getattr(struct mnt_idmap *idmap, const struct path *path, + struct kstat *stat, u32 request_mask, + unsigned int query_flags) +{ + struct mfs_inode *vi = MFS_I(d_inode(path->dentry)); + + generic_fillattr(idmap, request_mask, vi->lower, stat); + return 0; +} + +static const char *mfs_get_link(struct dentry *dentry, + struct inode *inode, + struct delayed_call *done) +{ + struct mfs_sb_info *sbi = MFS_SB(inode->i_sb); + struct path lpath, cpath; + struct dentry *ldentry; + const char *p; + + if (!dentry) + return ERR_PTR(-ECHILD); + + mfs_get_path(dentry, &lpath, &cpath); + ldentry = lpath.dentry; + p = vfs_get_link(ldentry, done); + mfs_put_path(&lpath, &cpath); + + if (IS_ERR(p) || p[0] != '/') + return p; + if (strlen(p) <= strlen(sbi->mtree)) + return ERR_PTR(-EXDEV); + if (strncmp(sbi->mtree, p, strlen(sbi->mtree)) != 0) + return ERR_PTR(-EXDEV); + p += strlen(sbi->mtree); + if (p[0] != '/') + return ERR_PTR(-EXDEV); + p += 1; + return p; +} + +static const struct inode_operations mfs_dir_iops = { + .lookup = mfs_lookup, + .getattr = mfs_getattr, +}; + +static const struct inode_operations mfs_symlink_iops = { + .getattr = mfs_getattr, + .get_link = mfs_get_link, +}; + +static const struct inode_operations mfs_file_iops = { + .getattr = mfs_getattr, +}; + +struct inode *mfs_iget(struct super_block *sb, struct inode *lower_inode, + struct 
path *cache_path) +{ + struct inode *inode, *cache_inode = d_inode(cache_path->dentry); + struct mfs_inode *vi; + int err; + + if (!igrab(lower_inode)) + return ERR_PTR(-ESTALE); + if (!igrab(cache_inode)) { + err = -ESTALE; + goto err_put_lower; + } + inode = iget5_locked(sb, lower_inode->i_ino, + mfs_inode_eq, + mfs_inode_set, + lower_inode); + if (!inode) { + err = -ENOMEM; + goto err_put_cache; + } + /* found in cache */ + if (!(inode->i_state & I_NEW)) { + iput(cache_inode); + iput(lower_inode); + return inode; + } + /* new inode */ + vi = MFS_I(inode); + inode->i_ino = lower_inode->i_ino; + switch (lower_inode->i_mode & S_IFMT) { + case S_IFREG: + inode->i_op = &mfs_file_iops; + inode->i_fop = &mfs_file_fops; + break; + case S_IFDIR: + inode->i_op = &mfs_dir_iops; + inode->i_fop = &mfs_dir_fops; + break; + case S_IFLNK: + inode->i_op = &mfs_symlink_iops; + break; + default: + err = -EOPNOTSUPP; + goto err_inode; + } + inode->i_mapping->a_ops = &mfs_aops; + if (S_ISREG(cache_inode->i_mode)) { + vi->vfs_inode.i_private = mfs_alloc_object(inode, cache_path); + if (IS_ERR(vi->vfs_inode.i_private)) { + err = PTR_ERR(vi->vfs_inode.i_private); + vi->vfs_inode.i_private = NULL; + goto err_inode; + } + } + vi->lower = lower_inode; + vi->cache = cache_inode; + fsstack_copy_attr_all(inode, lower_inode); + fsstack_copy_inode_size(inode, lower_inode); + unlock_new_inode(inode); + return inode; +err_inode: + iget_failed(inode); +err_put_cache: + iput(cache_inode); +err_put_lower: + iput(lower_inode); + return ERR_PTR(err); +} diff --git a/fs/mfs/internal.h b/fs/mfs/internal.h new file mode 100644 index 0000000000000000000000000000000000000000..66c6c0a38c68354583903d49949fab52de6d075f --- /dev/null +++ b/fs/mfs/internal.h @@ -0,0 +1,232 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* Copyright (C) 2025. 
Huawei Technologies Co., Ltd */ + +#ifndef _MFS_INTERNAL_H +#define _MFS_INTERNAL_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define MFS_NAME "mfs" + +#define MFS_OPEN_FLAGS (O_NOATIME) +#define MFS_EVENT_NEW XA_MARK_1 + +/* mfs_sb_info flags */ +#define MFS_MOUNTED 0 + +/* mfs_caches flags */ +#define MFS_CACHE_READY 0 +#define MFS_CACHE_OPENED 1 + +struct mfs_cache_object { + struct file *cache_file; + struct inode *mfs_inode; + + struct rw_semaphore rwsem; + int fd; /* file handle */ + struct file *anon_file; /* related with fd */ +}; + +struct mfs_syncer { + atomic_t notback; + struct list_head head; + spinlock_t list_lock; + struct completion done; + atomic_t res; +}; + +struct mfs_event { + refcount_t ref; + struct mfs_cache_object *object; + struct mfs_syncer *syncer; + struct list_head link; + struct mfs_msg msg; +}; + +struct mfs_caches { + struct xarray events; + wait_queue_head_t pollwq; + unsigned long next_msg; + unsigned long next_ev; + unsigned long flags; +}; + +struct mfs_sb_info { + int mode; + char *mtree; + char *cachedir; + struct path lower; + struct path cache; + + int minor; + + unsigned long flags; + struct super_block *sb; + + struct mfs_caches caches; +}; + +struct mfs_inode { + struct inode *lower; + struct inode *cache; + struct mutex lock; + struct inode vfs_inode; +}; + +struct mfs_file_info { + struct file *lower; + struct file *cache; + const struct vm_operations_struct *cache_vm_ops; +}; + +struct mfs_dentry_info { + spinlock_t lock; + struct path lower; + struct path cache; +}; + +#define MFS_SB(sb) ((struct mfs_sb_info *)(sb)->s_fs_info) +#define MFS_I(ptr) container_of(ptr, struct mfs_inode, vfs_inode) +#define MFS_D(dent) ((struct mfs_dentry_info *)(dent)->d_fsdata) + +extern const struct file_operations mfs_dir_fops; +extern const struct file_operations mfs_file_fops; +extern const struct address_space_operations mfs_aops; +extern const struct vm_operations_struct mfs_file_vm_ops; + +static inline struct inode *mfs_lower_inode(const struct inode *i) +{ + return MFS_I(i)->lower; +} + +static inline void pathcpy(struct path *dst, const struct path *src) +{ + dst->dentry = src->dentry; + dst->mnt = src->mnt; +} + +/* + * dent: mfs vfs dentry + */ +static inline void mfs_get_path(const struct dentry *dent, + struct path *lpath, + struct path *cpath) +{ + spin_lock(&MFS_D(dent)->lock); + pathcpy(lpath, &MFS_D(dent)->lower); + path_get(lpath); + pathcpy(cpath, &MFS_D(dent)->cache); + path_get(cpath); + spin_unlock(&MFS_D(dent)->lock); +} + +static inline void mfs_put_path(struct path *lpath, struct path *cpath) +{ + path_put(lpath); + path_put(cpath); +} + +static inline void mfs_install_path(const struct dentry *dent, + struct path *lpath, + struct path *cpath) +{ + spin_lock(&MFS_D(dent)->lock); + pathcpy(&MFS_D(dent)->lower, lpath); + pathcpy(&MFS_D(dent)->cache, cpath); + spin_unlock(&MFS_D(dent)->lock); +} + +static inline void mfs_release_path(const struct dentry *dent) +{ + struct path lpath, cpath; + + if (!dent || !dent->d_fsdata) + return; + spin_lock(&MFS_D(dent)->lock); + pathcpy(&lpath, &MFS_D(dent)->lower); + pathcpy(&cpath, &MFS_D(dent)->cache); + MFS_D(dent)->lower.dentry = NULL; + MFS_D(dent)->lower.mnt = NULL; + MFS_D(dent)->cache.dentry = NULL; + MFS_D(dent)->cache.mnt = NULL; + path_put(&lpath); + path_put(&cpath); + spin_unlock(&MFS_D(dent)->lock); +} + +static inline void mfs_inode_lock(struct inode *inode) +{ + mutex_lock(&MFS_I(inode)->lock); +} + +static inline void 
mfs_inode_unlock(struct inode *inode) +{ + mutex_unlock(&MFS_I(inode)->lock); +} + +static inline bool support_event(struct mfs_sb_info *sbi) +{ + return sbi->mode != MFS_MODE_NONE; +} + +static inline bool need_sync_event(struct super_block *sb) +{ + struct mfs_sb_info *sbi = MFS_SB(sb); + + return sbi->mode == MFS_MODE_REMOTE; +} + +static inline bool cache_is_ready(struct mfs_sb_info *sbi) +{ + return test_bit(MFS_CACHE_READY, &sbi->caches.flags); +} + +static inline void get_mfs_event(struct mfs_event *event) +{ + refcount_inc(&event->ref); +} + +static inline void put_mfs_event(struct mfs_event *event) +{ + if (refcount_dec_and_test(&event->ref)) { + iput(event->object->mfs_inode); + kfree(event); + } +} + +struct inode *mfs_iget(struct super_block *sb, struct inode *lower_inode, + struct path *cache_path); +int mfs_alloc_dentry_info(struct dentry *dentry); +void mfs_free_dentry_info(struct dentry *dentry); + +int mfs_fs_dev_init(struct super_block *sb); +void mfs_fs_dev_exit(struct super_block *sb); +int mfs_dev_init(void); +void mfs_dev_exit(void); + +struct mfs_event *mfs_pick_event(struct xa_state *xas, + unsigned long xa_max); +void mfs_post_event_read(struct mfs_cache_object *object, + loff_t off, uint64_t len, + struct mfs_syncer *syncer, int op); +void mfs_destroy_events(struct super_block *sb); +void mfs_cancel_syncer_events(struct mfs_cache_object *object, + struct mfs_syncer *syncer); +void mfs_cancel_all_events(struct mfs_sb_info *sbi); +int try_hook_fd(struct mfs_event *event); +struct mfs_cache_object *mfs_alloc_object(struct inode *inode, + struct path *cache_path); +void mfs_free_object(void *data); +int mfs_cache_init(void); +void mfs_cache_exit(void); + +#endif diff --git a/fs/mfs/super.c b/fs/mfs/super.c new file mode 100644 index 0000000000000000000000000000000000000000..91b1b13b0657290062bdf1f5733ce9085f8559a3 --- /dev/null +++ b/fs/mfs/super.c @@ -0,0 +1,499 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* Copyright (C) 2025. 
Huawei Technologies Co., Ltd */ + +#include "internal.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define CREATE_TRACE_POINTS +#include + +/* + * Used for alloc_inode + */ +static struct kmem_cache *mfs_inode_cachep; + +/* + * Used for dentry info + */ +static struct kmem_cache *mfs_dentry_cachep; + +static void mfs_init_once(void *obj) +{ + struct mfs_inode *i = obj; + + inode_init_once(&i->vfs_inode); +} + +static struct inode *mfs_alloc_inode(struct super_block *sb) +{ + struct mfs_inode *vi = alloc_inode_sb(sb, mfs_inode_cachep, GFP_KERNEL); + + if (!vi) + return NULL; + memset(vi, 0, offsetof(struct mfs_inode, vfs_inode)); + mutex_init(&vi->lock); + return &vi->vfs_inode; +} + +static void mfs_free_inode(struct inode *inode) +{ + struct mfs_inode *vi = MFS_I(inode); + + kmem_cache_free(mfs_inode_cachep, vi); +} + +static void mfs_evict_inode(struct inode *inode) +{ + struct mfs_inode *vi = MFS_I(inode); + struct inode *lower_inode = vi->lower; + struct inode *cache_inode = vi->cache; + + truncate_inode_pages_final(&inode->i_data); + if (inode->i_private) + mfs_free_object(inode->i_private); + clear_inode(inode); + if (lower_inode) { + vi->lower = NULL; + iput(lower_inode); + } + if (cache_inode) { + vi->cache = NULL; + iput(cache_inode); + } +} + +int mfs_alloc_dentry_info(struct dentry *dentry) +{ + struct mfs_dentry_info *info = + kmem_cache_zalloc(mfs_dentry_cachep, GFP_ATOMIC); + + if (!info) + return -ENOMEM; + spin_lock_init(&info->lock); + dentry->d_fsdata = info; + return 0; +} + +void mfs_free_dentry_info(struct dentry *dentry) +{ + if (!dentry || !dentry->d_fsdata) + return; + + kmem_cache_free(mfs_dentry_cachep, dentry->d_fsdata); + dentry->d_fsdata = NULL; +} + +static void mfs_d_release(struct dentry *dentry) +{ + /* for root, the path will release with super block */ + if (!IS_ROOT(dentry)) + mfs_release_path(dentry); + + mfs_free_dentry_info(dentry); +} + +static const struct dentry_operations mfs_dops = { + .d_release = mfs_d_release, +}; + +static int mfs_statfs(struct dentry *dentry, struct kstatfs *buf) +{ + struct mfs_sb_info *sbi = MFS_SB(dentry->d_sb); + int err = vfs_statfs(&sbi->cache, buf); + + buf->f_type = MFS_SUPER_MAGIC; + /* Use the reserved slot to keep the device id */ + buf->f_spare[0] = sbi->minor; + return err; +} + +static int mfs_show_options(struct seq_file *seq, struct dentry *root) +{ + struct mfs_sb_info *sbi = MFS_SB(root->d_sb); + + if (sbi->mtree) + seq_show_option(seq, "mtree", sbi->mtree); + if (sbi->cachedir) + seq_show_option(seq, "cachedir", sbi->cachedir); + switch (sbi->mode) { + case MFS_MODE_NONE: + seq_puts(seq, ",mode=none"); + break; + case MFS_MODE_LOCAL: + seq_puts(seq, ",mode=local"); + break; + case MFS_MODE_REMOTE: + seq_puts(seq, ",mode=remote"); + break; + } + return 0; +} + +static const struct super_operations mfs_sops = { + .alloc_inode = mfs_alloc_inode, + .free_inode = mfs_free_inode, + .drop_inode = generic_delete_inode, + .evict_inode = mfs_evict_inode, + .statfs = mfs_statfs, + .show_options = mfs_show_options, +}; + +enum { + Opt_mtree, + Opt_cachedir, + Opt_mode, +}; + +static const struct constant_table mfs_param_mode[] = { + {"none", MFS_MODE_NONE}, + {"local", MFS_MODE_LOCAL}, + {"remote", MFS_MODE_REMOTE}, + {} +}; + +static const struct fs_parameter_spec mfs_fs_parameters[] = { + fsparam_string("mtree", Opt_mtree), + fsparam_string("cachedir", Opt_cachedir), + fsparam_enum("mode", Opt_mode, mfs_param_mode), + {} +}; + +static char *remove_trailing(char 
+
+static int mfs_show_options(struct seq_file *seq, struct dentry *root)
+{
+        struct mfs_sb_info *sbi = MFS_SB(root->d_sb);
+
+        if (sbi->mtree)
+                seq_show_option(seq, "mtree", sbi->mtree);
+        if (sbi->cachedir)
+                seq_show_option(seq, "cachedir", sbi->cachedir);
+        switch (sbi->mode) {
+        case MFS_MODE_NONE:
+                seq_puts(seq, ",mode=none");
+                break;
+        case MFS_MODE_LOCAL:
+                seq_puts(seq, ",mode=local");
+                break;
+        case MFS_MODE_REMOTE:
+                seq_puts(seq, ",mode=remote");
+                break;
+        }
+        return 0;
+}
+
+static const struct super_operations mfs_sops = {
+        .alloc_inode    = mfs_alloc_inode,
+        .free_inode     = mfs_free_inode,
+        .drop_inode     = generic_delete_inode,
+        .evict_inode    = mfs_evict_inode,
+        .statfs         = mfs_statfs,
+        .show_options   = mfs_show_options,
+};
+
+enum {
+        Opt_mtree,
+        Opt_cachedir,
+        Opt_mode,
+};
+
+static const struct constant_table mfs_param_mode[] = {
+        {"none",        MFS_MODE_NONE},
+        {"local",       MFS_MODE_LOCAL},
+        {"remote",      MFS_MODE_REMOTE},
+        {}
+};
+
+static const struct fs_parameter_spec mfs_fs_parameters[] = {
+        fsparam_string("mtree",         Opt_mtree),
+        fsparam_string("cachedir",      Opt_cachedir),
+        fsparam_enum("mode",            Opt_mode, mfs_param_mode),
+        {}
+};
+
+static char *remove_trailing(char *s, char c)
+{
+        size_t size;
+        char *end;
+
+        size = strlen(s);
+        if (!size)
+                return s;
+
+        end = s + size - 1;
+        while (end >= s && c == *end)
+                end--;
+        *(end + 1) = '\0';
+        return s;
+}
+
+static char *_acquire_set_path(char *inputpath, struct path *target)
+{
+        char *p, *realp, *path;
+        char *res;
+        int ret = 0;
+
+        p = kstrdup(inputpath, GFP_KERNEL);
+        if (!p)
+                return ERR_PTR(-ENOMEM);
+        realp = remove_trailing(p, '/');
+        if (strlen(realp) == 0) {
+                kfree(p);
+                return ERR_PTR(-EINVAL);
+        }
+        ret = kern_path(realp, LOOKUP_FOLLOW | LOOKUP_DIRECTORY, target);
+        kfree(p);
+        if (ret)
+                return ERR_PTR(ret);
+
+        path = kzalloc(PATH_MAX, GFP_KERNEL);
+        if (!path) {
+                path_put(target);
+                return ERR_PTR(-ENOMEM);
+        }
+
+        realp = d_path(target, path, PATH_MAX);
+        if (IS_ERR(realp)) {
+                path_put(target);
+                res = realp;
+                goto free;
+        }
+
+        res = kstrdup(realp, GFP_KERNEL);
+        if (!res) {
+                path_put(target);
+                res = ERR_PTR(-ENOMEM);
+        }
+free:
+        kfree(path);
+        return res;
+}
+
+static int mfs_fc_parse_param(struct fs_context *fc,
+                              struct fs_parameter *param)
+{
+        struct mfs_sb_info *sbi = fc->s_fs_info;
+        struct fs_parse_result result;
+        struct path target;
+        char *p;
+        int opt;
+
+        opt = fs_parse(fc, mfs_fs_parameters, param, &result);
+        if (opt < 0)
+                return opt;
+
+        switch (opt) {
+        case Opt_mtree:
+                p = _acquire_set_path(param->string, &target);
+                if (IS_ERR(p))
+                        return PTR_ERR(p);
+                sbi->mtree = p;
+                pathcpy(&sbi->lower, &target);
+                break;
+        case Opt_cachedir:
+                p = _acquire_set_path(param->string, &target);
+                if (IS_ERR(p))
+                        return PTR_ERR(p);
+                sbi->cachedir = p;
+                pathcpy(&sbi->cache, &target);
+                break;
+        case Opt_mode:
+                sbi->mode = result.int_32;
+                break;
+        default:
+                return -ENOPARAM;
+        }
+        return 0;
+}
+
+static int mfs_fc_fill_super(struct super_block *sb, struct fs_context *fc)
+{
+        struct mfs_sb_info *sbi = MFS_SB(sb);
+        struct inode *inode;
+        int err = 0;
+
+        if (!sbi->cachedir || !sbi->mtree) {
+                pr_err("Missing mtree or cachedir option.\n");
+                return -EINVAL;
+        }
+
+        if (sbi->mode != MFS_MODE_REMOTE) {
+                if (strcmp(sbi->cachedir, sbi->mtree)) {
+                        pr_err("local/none mode requires mtree and cachedir to be the same.\n");
+                        return -EINVAL;
+                }
+        } else {
+                if (!strcmp(sbi->cachedir, sbi->mtree)) {
+                        pr_err("remote mode requires different mtree and cachedir.\n");
+                        return -EINVAL;
+                }
+                if (strlen(sbi->cachedir) > strlen(sbi->mtree) &&
+                    strncmp(sbi->mtree, sbi->cachedir, strlen(sbi->mtree)) == 0) {
+                        pr_err("remote mode: mtree must not be a parent of cachedir.\n");
+                        return -EINVAL;
+                }
+        }
+
+        sb->s_stack_depth = max(sbi->lower.mnt->mnt_sb->s_stack_depth,
+                                sbi->cache.mnt->mnt_sb->s_stack_depth) + 1;
+        if (sb->s_stack_depth > 1) {
+                pr_err("cannot be stacked on another stackable file system.\n");
+                return -EINVAL;
+        }
+
+        sb->s_magic = MFS_SUPER_MAGIC;
+        sb->s_flags |= SB_RDONLY | SB_NOATIME;
+        sb->s_maxbytes = MAX_LFS_FILESIZE;
+        sb->s_op = &mfs_sops;
+        sb->s_d_op = &mfs_dops;
+        err = super_setup_bdi(sb);
+        if (err)
+                return err;
+
+        if (support_event(sbi)) {
+                err = mfs_fs_dev_init(sb);
+                if (err)
+                        return err;
+        }
+
+        inode = mfs_iget(sb, d_inode(sbi->lower.dentry), &sbi->cache);
+        if (IS_ERR(inode)) {
+                err = PTR_ERR(inode);
+                goto out_exit;
+        }
+
+        sb->s_root = d_make_root(inode);
+        if (!sb->s_root) {
+                /* d_make_root() already dropped the inode on failure */
+                err = -ENOMEM;
+                goto out_exit;
+        }
+
+        err = mfs_alloc_dentry_info(sb->s_root);
+        if (err)
+                goto out_dput;
+        mfs_install_path(sb->s_root, &sbi->lower, &sbi->cache);
+        sbi->sb = sb;
+        set_bit(MFS_MOUNTED, &sbi->flags);
+        return 0;
+out_dput:
+        dput(sb->s_root);
+out_exit:
+        if (support_event(sbi))
+                mfs_fs_dev_exit(sb);
+        return err;
+}
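+
+/*
+ * Example invocations accepted by the checks above (illustrative paths;
+ * MFS is nodev, so the source string is arbitrary):
+ *
+ *        mount -t mfs -o mtree=/lower,cachedir=/lower,mode=local none /mnt/mfs
+ *        mount -t mfs -o mtree=/meta,cachedir=/cache,mode=remote none /mnt/mfs
+ */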
+
+static int mfs_fc_get_tree(struct fs_context *fc)
+{
+        return get_tree_nodev(fc, mfs_fc_fill_super);
+}
+
+static int mfs_reconfigure(struct fs_context *fc)
+{
+        return -EOPNOTSUPP;
+}
+
+static void mfs_fc_free(struct fs_context *fc)
+{
+        struct mfs_sb_info *sbi = fc->s_fs_info;
+
+        if (!sbi)
+                return;
+
+        if (sbi->mtree) {
+                path_put(&sbi->lower);
+                kfree(sbi->mtree);
+        }
+        if (sbi->cachedir) {
+                path_put(&sbi->cache);
+                kfree(sbi->cachedir);
+        }
+        kfree(sbi);
+}
+
+static const struct fs_context_operations mfs_context_ops = {
+        .parse_param    = mfs_fc_parse_param,
+        .get_tree       = mfs_fc_get_tree,
+        .reconfigure    = mfs_reconfigure,
+        .free           = mfs_fc_free,
+};
+
+static int mfs_init_fs_context(struct fs_context *fc)
+{
+        struct mfs_sb_info *sbi;
+
+        sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
+        if (!sbi)
+                return -ENOMEM;
+
+        init_waitqueue_head(&sbi->caches.pollwq);
+        xa_init_flags(&sbi->caches.events, XA_FLAGS_ALLOC);
+        sbi->minor = -1;
+        fc->s_fs_info = sbi;
+        fc->ops = &mfs_context_ops;
+        return 0;
+}
+
+static void mfs_kill_sb(struct super_block *sb)
+{
+        struct mfs_sb_info *sbi = MFS_SB(sb);
+        struct mfs_caches *caches = &sbi->caches;
+
+        clear_bit(MFS_MOUNTED, &sbi->flags);
+        if (support_event(sbi)) {
+                while (test_bit(MFS_CACHE_OPENED, &caches->flags)) {
+                        static DEFINE_RATELIMIT_STATE(busy_open, 30 * HZ, 1);
+
+                        msleep(100);
+                        if (!__ratelimit(&busy_open))
+                                continue;
+                        pr_warn("Pending until /dev/mfs%u is closed...\n", sbi->minor);
+                }
+                mfs_fs_dev_exit(sb);
+        }
+        kill_anon_super(sb);
+        mfs_destroy_events(sb);
+        if (sbi->mtree) {
+                path_put(&sbi->lower);
+                kfree(sbi->mtree);
+        }
+        if (sbi->cachedir) {
+                path_put(&sbi->cache);
+                kfree(sbi->cachedir);
+        }
+        kfree(sbi);
+        sb->s_fs_info = NULL;
+}
+
+static struct file_system_type mfs_fs_type = {
+        .owner          = THIS_MODULE,
+        .name           = MFS_NAME,
+        .init_fs_context = mfs_init_fs_context,
+        .kill_sb        = mfs_kill_sb,
+        .fs_flags       = 0,
+};
+MODULE_ALIAS_FS(MFS_NAME);
+
+static int __init init_mfs_fs(void)
+{
+        int err;
+
+        mfs_inode_cachep =
+                kmem_cache_create("mfs_inode",
+                                  sizeof(struct mfs_inode), 0,
+                                  SLAB_RECLAIM_ACCOUNT, mfs_init_once);
+        if (!mfs_inode_cachep)
+                return -ENOMEM;
+
+        mfs_dentry_cachep =
+                kmem_cache_create("mfs_dentry",
+                                  sizeof(struct mfs_dentry_info), 0,
+                                  SLAB_RECLAIM_ACCOUNT, NULL);
+        if (!mfs_dentry_cachep) {
+                err = -ENOMEM;
+                goto err_dentryp;
+        }
+
+        err = mfs_cache_init();
+        if (err)
+                goto err_cache;
+
+        err = register_filesystem(&mfs_fs_type);
+        if (err)
+                goto err_register;
+
+        err = mfs_dev_init();
+        if (err)
+                goto err_dev;
+
+        pr_info("MFS module loaded\n");
+        return 0;
+err_dev:
+        unregister_filesystem(&mfs_fs_type);
+err_register:
+        mfs_cache_exit();
+err_cache:
+        kmem_cache_destroy(mfs_dentry_cachep);
+err_dentryp:
+        kmem_cache_destroy(mfs_inode_cachep);
+        return err;
+}
+
+static void __exit exit_mfs_fs(void)
+{
+        mfs_dev_exit();
+        unregister_filesystem(&mfs_fs_type);
+
+        /* Make sure all delayed rcu free inodes are safe to be destroyed. */
+        rcu_barrier();
+        mfs_cache_exit();
+        kmem_cache_destroy(mfs_dentry_cachep);
+        kmem_cache_destroy(mfs_inode_cachep);
+        pr_info("MFS module unloaded\n");
+}
+
+module_init(init_mfs_fs);
+module_exit(exit_mfs_fs);
+
+MODULE_AUTHOR("Hongbo Li ");
+MODULE_AUTHOR("Xiaojia Huang ");
+MODULE_DESCRIPTION("MFS filesystem for Linux");
+MODULE_LICENSE("GPL");
diff --git a/include/trace/events/mfs.h b/include/trace/events/mfs.h
new file mode 100644
index 0000000000000000000000000000000000000000..5963888d0993e833b079bdf8337f45d63491803e
--- /dev/null
+++ b/include/trace/events/mfs.h
@@ -0,0 +1,105 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM mfs
+
+#if !defined(_TRACE_MFS_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_MFS_H
+
+#include <linux/tracepoint.h>
+#include <linux/fs.h>
+
+TRACE_EVENT(mfs_lookup,
+        TP_PROTO(struct inode *dir, struct dentry *dentry, unsigned int flag),
+        TP_ARGS(dir, dentry, flag),
+        TP_STRUCT__entry(
+                __field(dev_t, dev)
+                __field(ino_t, ino)
+                __string(name, dentry->d_name.name)
+                __field(unsigned int, flag)
+        ),
+        TP_fast_assign(
+                __entry->dev = dir->i_sb->s_dev;
+                __entry->ino = dir->i_ino;
+                __assign_str(name, dentry->d_name.name);
+                __entry->flag = flag;
+        ),
+
+        TP_printk("dev=%d ino=%lu name=%s flag=%x",
+                  MINOR(__entry->dev), __entry->ino, __get_str(name), __entry->flag)
+);
+
+DECLARE_EVENT_CLASS(mfs_file_normal,
+        TP_PROTO(struct inode *inode, struct file *file),
+        TP_ARGS(inode, file),
+        TP_STRUCT__entry(
+                __field(dev_t, dev)
+                __field(ino_t, ino)
+                __field(mode_t, mode)
+        ),
+        TP_fast_assign(
+                __entry->dev = inode->i_sb->s_dev;
+                __entry->ino = inode->i_ino;
+                __entry->mode = file->f_mode;
+        ),
+
+        TP_printk("dev=%d ino=%lu mode=%o",
+                  MINOR(__entry->dev), __entry->ino, __entry->mode)
+);
+
+DEFINE_EVENT(mfs_file_normal, mfs_open,
+        TP_PROTO(struct inode *inode, struct file *file),
+        TP_ARGS(inode, file)
+);
+
+DEFINE_EVENT(mfs_file_normal, mfs_release,
+        TP_PROTO(struct inode *inode, struct file *file),
+        TP_ARGS(inode, file)
+);
+
+TRACE_EVENT(mfs_post_event_read,
+        TP_PROTO(struct inode *inode, loff_t off, uint64_t len, int op),
+        TP_ARGS(inode, off, len, op),
+        TP_STRUCT__entry(
+                __field(dev_t, dev)
+                __field(ino_t, ino)
+                __field(loff_t, off)
+                __field(uint64_t, len)
+                __field(int, op)
+        ),
+        TP_fast_assign(
+                __entry->dev = inode->i_sb->s_dev;
+                __entry->ino = inode->i_ino;
+                __entry->off = off;
+                __entry->len = len;
+                __entry->op = op;
+        ),
+
+        TP_printk("(miss) dev=%d ino=%lu off=%lld len=%llu op=%d",
+                  MINOR(__entry->dev), __entry->ino, __entry->off, __entry->len, __entry->op)
+);
+
+TRACE_EVENT(mfs_dev_read,
+        TP_PROTO(struct file *file, int op, uint32_t msgid, uint32_t fd),
+        TP_ARGS(file, op, msgid, fd),
+        TP_STRUCT__entry(
+                __field(dev_t, dev)
+                __field(ino_t, ino)
+                __field(int, op)
+                __field(uint32_t, msgid)
+                __field(uint32_t, fd)
+        ),
+        TP_fast_assign(
+                __entry->dev = file->f_inode->i_sb->s_dev;
+                __entry->ino = file->f_inode->i_ino;
+                __entry->op = op;
+                __entry->msgid = msgid;
+                __entry->fd = fd;
+        ),
+
+        TP_printk("dev=%d ino=%lu op=%d msgid=%u fd=%u",
+                  MINOR(__entry->dev), __entry->ino, __entry->op, __entry->msgid, __entry->fd)
+);
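+
+/*
+ * The events above can be inspected at runtime through tracefs, e.g.:
+ *
+ *        echo 1 > /sys/kernel/tracing/events/mfs/enable
+ *        cat /sys/kernel/tracing/trace_pipe
+ */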
+
+#endif /* _TRACE_MFS_H */
+
+#include <trace/define_trace.h>
diff --git a/include/uapi/linux/magic.h b/include/uapi/linux/magic.h
index 6325d1d0e90f5dcdc7bdc91d612f8fc4c7b40135..4ca73708ed9edc69252985742f6de6d54908dabd 100644
--- a/include/uapi/linux/magic.h
+++ b/include/uapi/linux/magic.h
@@ -37,6 +37,7 @@
 #define HOSTFS_SUPER_MAGIC	0x00c0ffee
 #define OVERLAYFS_SUPER_MAGIC	0x794c7630
 #define FUSE_SUPER_MAGIC	0x65735546
+#define MFS_SUPER_MAGIC		0x85428370
 #define MINIX_SUPER_MAGIC	0x137F /* minix v1 fs, 14 char names */
 #define MINIX_SUPER_MAGIC2	0x138F /* minix v1 fs, 30 char names */
diff --git a/include/uapi/linux/mfs.h b/include/uapi/linux/mfs.h
new file mode 100644
index 0000000000000000000000000000000000000000..a7d5882b5e500082920cc4208aaaf10ebf615a64
--- /dev/null
+++ b/include/uapi/linux/mfs.h
@@ -0,0 +1,62 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+
+#ifndef _UAPI_LINUX_MFS_H
+#define _UAPI_LINUX_MFS_H
+
+#include <linux/types.h>
+#include <linux/ioctl.h>
+
+enum mfs_opcode {
+        MFS_OP_READ = 0,
+        MFS_OP_FAULT,
+        MFS_OP_FAROUND,
+};
+
+enum {
+        MFS_MODE_NONE = 0,
+        MFS_MODE_LOCAL,
+        MFS_MODE_REMOTE,
+};
+
+struct mfs_ioc_ra {
+        __u64 off;
+        __u64 len;
+};
+
+struct mfs_ioc_done {
+        __u32 id;
+        __u32 ret;
+};
+
+struct mfs_ioc_rpath {
+        __u16 max;
+        __u16 len;
+        __u8 d[];
+};
+
+#define MFS_IOC_RA	_IOW(0xbc, 1, struct mfs_ioc_ra)
+#define MFS_IOC_DONE	_IOW(0xbc, 2, struct mfs_ioc_done)
+#define MFS_IOC_RPATH	_IOWR(0xbc, 3, struct mfs_ioc_rpath)
+
+struct mfs_ioc_fsinfo {
+        __u8 mode;	/* 0: none, 1: local, 2: remote */
+};
+
+#define MFS_IOC_FSINFO	_IOR(0xbd, 1, struct mfs_ioc_fsinfo)
+
+struct mfs_msg {
+        __u8 version;
+        __u8 opcode;
+        __u16 len;
+        __u32 fd;
+        __u32 id;
+        __u8 data[];
+};
+
+struct mfs_read {
+        __u64 off;
+        __u64 len;
+        __s32 pid;
+};
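+
+/*
+ * Typical daemon-side flow (sketch; see tools/mfs/mfsd.c for a full demo):
+ *
+ *        struct mfs_msg *msg = (struct mfs_msg *)buf;  // one message from read(devfd)
+ *
+ *        // local mode: asynchronous readahead hint
+ *        struct mfs_ioc_ra ra = { .off = off, .len = len };
+ *        ioctl(msg->fd, MFS_IOC_RA, &ra);
+ *
+ *        // remote mode: fetch and write() the data, then complete the event
+ *        struct mfs_ioc_done done = { .id = msg->id, .ret = 0 };
+ *        ioctl(msg->fd, MFS_IOC_DONE, &done);
+ */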
+
+#endif /* _UAPI_LINUX_MFS_H */
diff --git a/tools/mfs/.gitignore b/tools/mfs/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..55428a223113f0f0ff58d49fba34b6afd86d5cce
--- /dev/null
+++ b/tools/mfs/.gitignore
@@ -0,0 +1,2 @@
+# SPDX-License-Identifier: GPL-2.0-only
+mfsd
diff --git a/tools/mfs/Makefile b/tools/mfs/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..a6e3e970b63e0bf6e43337fbb727e354e8f6df4d
--- /dev/null
+++ b/tools/mfs/Makefile
@@ -0,0 +1,13 @@
+# SPDX-License-Identifier: GPL-2.0
+# Makefile for mfs demo
+
+CFLAGS = -Wall -Wextra
+
+PROGS := mfsd
+
+all: $(PROGS)
+%: %.c
+	$(CC) $(CFLAGS) -o $@ $^
+
+clean:
+	$(RM) $(PROGS)
diff --git a/tools/mfs/mfsd.c b/tools/mfs/mfsd.c
new file mode 100644
index 0000000000000000000000000000000000000000..865b765c31ed45f6009ee291b0a72e4cff74b1fe
--- /dev/null
+++ b/tools/mfs/mfsd.c
@@ -0,0 +1,186 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * User-space demo of mfs
+ *
+ * Example use:
+ *        ./mfsd [mfs_mountpoint]
+ * mfsd.c demonstrates how to poll the mfs device, read the events,
+ * parse the events, process the events according to the running mode
+ * and trigger the ioctls mfs supports.
+ *
+ * See Documentation/filesystems/mfs.rst
+ */
+
+#define _GNU_SOURCE
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <poll.h>
+#include <sys/ioctl.h>
+#include <sys/statfs.h>
+
+#include "../../include/uapi/linux/mfs.h"
+#include "../../include/uapi/linux/magic.h"
+
+#define pr_err(fmt, ...) fprintf(stderr, fmt, ##__VA_ARGS__)
+
+static int mfs_mode = -1;
+
+static int process_local_read(struct mfs_msg *msg)
+{
+        struct mfs_read *read = (struct mfs_read *)msg->data;
+        struct mfs_ioc_ra ra;
+        int fd = msg->fd;
+        int ret;
+
+        ra.off = read->off;
+        ra.len = read->len;
+
+        ret = ioctl(fd, MFS_IOC_RA, &ra);
+        if (ret)
+                perror("ioctl MFS_IOC_RA failed");
+
+        return ret;
+}
+
+static int process_remote_read(struct mfs_msg *msg)
+{
+        struct mfs_ioc_rpath *rpath;
+        struct mfs_ioc_done done;
+        int fd = msg->fd;
+        int ret;
+
+        rpath = malloc(sizeof(struct mfs_ioc_rpath) + 1024);
+        if (!rpath) {
+                pr_err("malloc for path failed\n");
+                return -1;
+        }
+        rpath->max = 1024;
+        ret = ioctl(fd, MFS_IOC_RPATH, (unsigned long)rpath);
+        if (ret) {
+                free(rpath);
+                perror("ioctl failed");
+                return -1;
+        }
+        free(rpath);
+
+        /*
+         * A real daemon would fetch the missing range from remote storage
+         * and write() it to fd here before completing the request.
+         */
+        done.id = msg->id;
+        done.ret = 0;
+        ret = ioctl(fd, MFS_IOC_DONE, (unsigned long)&done);
+        if (ret)
+                perror("failed to ioctl MFS_IOC_DONE");
+
+        return ret;
+}
+
+static int process_read(struct mfs_msg *msg)
+{
+        int ret;
+
+        if (mfs_mode == MFS_MODE_REMOTE)
+                ret = process_remote_read(msg);
+        else if (mfs_mode == MFS_MODE_LOCAL)
+                ret = process_local_read(msg);
+        else
+                ret = -EINVAL;
+        return ret;
+}
+
+static int process_req(int fd)
+{
+        char buf[1024];
+        struct mfs_msg *msg;
+        int ret;
+
+        memset(buf, 0, sizeof(buf));
+        ret = read(fd, buf, sizeof(buf));
+        if (ret <= 0) {
+                if (ret < 0)
+                        pr_err("read failed, ret:%d\n", ret);
+                return -1;
+        }
+
+        msg = (void *)buf;
+        if (ret != msg->len) {
+                pr_err("invalid message length, read:%d, need:%d\n", ret, msg->len);
+                return -1;
+        }
+        if (msg->opcode == MFS_OP_READ || msg->opcode == MFS_OP_FAULT ||
+            msg->opcode == MFS_OP_FAROUND) {
+                return process_read(msg);
+        }
+        pr_err("invalid opcode:%d\n", msg->opcode);
+        return -1;
+}
+
+static void ioctl_mfs_mode(int fd)
+{
+        struct mfs_ioc_fsinfo fsinfo = {0};
+        int ret;
+
+        ret = ioctl(fd, MFS_IOC_FSINFO, (unsigned long)&fsinfo);
+        if (ret < 0) {
+                perror("failed to ioctl mfs_ioc_fsinfo");
+                close(fd);
+                exit(-1);
+        }
+
+        mfs_mode = fsinfo.mode;
+}
+
+int main(int argc, char *argv[])
+{
+        struct pollfd pfd;
+        struct statfs buf;
+        char *mountpoint;
+        char devname[32];	/* large enough for "/dev/mfs" plus any minor */
+        int fd, ret;
+
+        if (argc != 2) {
+                printf("Usage: ./mfsd <mfs_mountpoint>\n");
+                return -1;
+        }
+        mountpoint = argv[1];
+
+        ret = statfs(mountpoint, &buf);
+        if (ret) {
+                pr_err("statfs %s failed\n", mountpoint);
+                return -1;
+        }
+        if (buf.f_type != MFS_SUPER_MAGIC) {
+                pr_err("fstype(%lx) is invalid, please check the mountpoint\n", buf.f_type);
+                return -1;
+        }
+
+        sprintf(devname, "/dev/mfs%ld", buf.f_spare[0]);
+        fd = open(devname, O_RDWR);
+        if (fd < 0) {
+                pr_err("open %s failed\n", devname);
+                return -1;
+        }
+
+        ioctl_mfs_mode(fd);
+        pfd.fd = fd;
+        pfd.events = POLLIN;
+
+        while (1) {
+                ret = poll(&pfd, 1, -1);
+                if (ret < 0) {
+                        pr_err("poll failed\n");
+                        return -1;
+                }
+
+                if (ret == 0 || !(pfd.revents & POLLIN)) {
+                        pr_err("poll event error, ret:%d, revents:%x\n", ret, pfd.revents);
+                        continue;
+                }
+
+                if (process_req(fd) == -1)
+                        pr_err("process req failed, errcode:%d\n", errno);
+        }
+        return 0;
+}
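+
+/*
+ * Build and run (mount point illustrative):
+ *
+ *        make -C tools/mfs
+ *        ./mfsd /mnt/mfs        # /mnt/mfs is an MFS mount point
+ */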