diff --git a/Documentation/filesystems/index.rst b/Documentation/filesystems/index.rst
index 09cade7eaefc8c68e4e733ea011b345a4ce9fc9b..6ca507138d8ebaf7d6c228e8f4db642b84262669 100644
--- a/Documentation/filesystems/index.rst
+++ b/Documentation/filesystems/index.rst
@@ -82,6 +82,7 @@ Documentation for filesystem implementations.
    ecryptfs
    efivarfs
    erofs
+   mfs
    ext2
    ext3
    ext4/index
diff --git a/Documentation/filesystems/mfs.rst b/Documentation/filesystems/mfs.rst
new file mode 100644
index 0000000000000000000000000000000000000000..a291fbf48cf1fd1e2ed5078ce1fe9d0c7e8b62c3
--- /dev/null
+++ b/Documentation/filesystems/mfs.rst
@@ -0,0 +1,223 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+==============
+MFS Filesystem
+==============
+
+Overview
+========
+
+MFS is a stackable file system that combines a lower layer and a cache
+layer. It provides users with programmable caching capabilities. MFS only
+supports read-only operations for regular files, directories and symbolic
+links. When MFS is stacked on top of the lower and cache layers (which are
+themselves mounted on other file systems, such as ext4 or xfs), the
+underlying file systems must also be kept read-only to prevent data
+inconsistency.
+
+MFS supports three running modes: none, local and remote. These modes are
+explained in the Running mode section. In short, MFS requires the `mtree`
+and `cachedir` mount options. The `mtree` option specifies the metadata
+source for MFS, while the `cachedir` option specifies the data source. In
+local or remote mode, `cachedir` points to a local cache (in memory or on
+disk) for the backend file systems.
+
+
+Mount options
+=============
+
+================ ==========================================================
+mode=%s          The running mode, one of:
+
+                 ====== ==================================================
+                 none   Act as a plain stackable file system on the lower
+                        file system and just pass operations through to
+                        the backend file system.
+                 local  Both the lower and the cachedir layer are local
+                        file systems. A miss event (not hit in the page
+                        cache) posts an asynchronous event to userspace.
+                 remote The target data lives in remote storage, such as
+                        OBS or another private distributed file system
+                        without a POSIX-like interface. A miss event (not
+                        hit in the local cache) posts a synchronous event
+                        to userspace and waits for the reply.
+                 ====== ==================================================
+mtree=%s         Lower layer path.
+cachedir=%s      Cache layer path.
+================ ==========================================================
+
+**NOTE**: The paths given in the `mtree` and `cachedir` options must not be
+the same as the mount point, nor may either be a subdirectory of the other.
+
+Communication Framework
+=======================
+
+Each MFS instance has a unique communication device named `/dev/mfs${minor}`.
+MFS sends MISS events to the user daemon as needed. The user daemon can
+obtain these events by polling and reading from the device. To obtain the
+minor number for an MFS instance, the user must call `statfs()` on its mount
+point and parse the value from `f_spare[0]` in the `struct statfs`.
+
+Each request starts with a message header of the form::
+
+    struct mfs_msg {
+        __u8 version;
+        __u8 opcode;
+        __u16 len;
+        __u32 fd;
+        __u32 id;
+        __u8 data[];
+    };
+
+where:
+
+ * ``version`` indicates the version number, reserved for future
+   extension.
+
+ * ``opcode`` indicates the type of the event.
+
+ * ``len`` indicates the whole length of the event, including the
+   header and the following type-specific payload.
+
+ * ``fd`` indicates the file handle of the internal file object.
+
+ * ``id`` is a unique ID identifying the event.
+
+ * ``data`` holds the payload of the event.
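+
+A minimal sketch of the daemon side of this flow is shown below. It is
+illustrative only: it assumes the structures and ioctls described in this
+document are available from the UAPI header (``<linux/mfs.h>``) and that
+the mount point is ``/mnt/mfs`` (an arbitrary choice). It resolves the
+minor number via ``statfs()``, queries the running mode with the
+``MFS_IOC_FSINFO`` ioctl (see the Running mode section) and then reads raw
+events in a loop; error handling is mostly omitted::
+
+    #include <poll.h>
+    #include <stdio.h>
+    #include <fcntl.h>
+    #include <unistd.h>
+    #include <sys/ioctl.h>
+    #include <sys/statfs.h>
+    #include <linux/mfs.h>
+
+    int main(void)
+    {
+        struct statfs st;
+        struct mfs_ioc_fsinfo fsinfo;
+        char dev[32], buf[4096];
+        struct pollfd pfd = { .events = POLLIN };
+
+        /* the minor number is exported via the reserved statfs slot */
+        if (statfs("/mnt/mfs", &st) < 0)
+            return 1;
+        snprintf(dev, sizeof(dev), "/dev/mfs%ld", (long)st.f_spare[0]);
+        pfd.fd = open(dev, O_RDWR);
+        if (pfd.fd < 0)
+            return 1;
+
+        /* none/local/remote, see the Running mode section */
+        if (ioctl(pfd.fd, MFS_IOC_FSINFO, &fsinfo) < 0)
+            return 1;
+
+        while (poll(&pfd, 1, -1) > 0) {
+            struct mfs_msg *msg = (struct mfs_msg *)buf;
+
+            if (read(pfd.fd, buf, sizeof(buf)) <= 0)
+                continue;
+            /* dispatch on msg->opcode and fsinfo.mode here */
+        }
+        return 0;
+    }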
+
+MFS will only post read events when the data is missing in the local
+cache (memory or disk). The payload format is defined as follows::
+
+    struct mfs_read {
+        __u64 off;
+        __u64 len;
+        __s32 pid;
+    };
+
+where:
+
+ * ``off`` indicates the offset of the read request which triggered
+   this event.
+
+ * ``len`` indicates the length of the read request which triggered
+   this event.
+
+ * ``pid`` indicates the pid of the reading process which triggered
+   this event.
+
+Currently the opcode is defined as follows::
+
+    enum mfs_opcode {
+        MFS_OP_READ = 0,
+        MFS_OP_FAULT,
+        MFS_OP_FAROUND,
+    };
+
+These denote a normal read event, a page fault event, and a fault-around
+event issued ahead of the ongoing fault, respectively.
+
+Running mode
+============
+
+There are three running modes in MFS: none, local and remote. The user can
+issue the `MFS_IOC_FSINFO` ioctl on the device fd to obtain this
+information.
+
+The parameter for this request is as follows::
+
+    struct mfs_ioc_fsinfo {
+        __u8 mode; /* 0: none, 1: local, 2: remote */
+    };
+
+where ``mode`` is assigned one of the values of the following enum::
+
+    enum {
+        MFS_MODE_NONE = 0,
+        MFS_MODE_LOCAL,
+        MFS_MODE_REMOTE,
+    };
+
+In none mode, MFS does not report any events. It just passes operations
+through to the underlying file system.
+
+In local mode, MFS uses the page cache as its local cache. If a read
+request results in a cache miss, a MISS event is reported for each
+non-contiguous missing range. These events are asynchronous: the kernel
+does not block waiting for them. The user daemon can prefetch subsequent
+data based on these events to avoid future cache misses.
+
+In remote mode, MFS uses a local disk (as specified by the `cachedir`
+mount option) as its cache. If a read request misses in the local disk
+cache (checked using `SEEK_HOLE` and `SEEK_DATA`), a MISS event is
+reported. These events are synchronous: the kernel blocks on the event and
+waits for the user daemon to respond with the corresponding message id.
+
+In local mode, the user daemon responds to the target events with the
+following structure, passed to the `MFS_IOC_RA` ioctl issued on the `fd`
+from the message header::
+
+    struct mfs_ioc_ra {
+        __u64 off;
+        __u64 len;
+    };
+
+where:
+
+ * ``off`` indicates the offset at which to prefetch.
+
+ * ``len`` indicates the length to prefetch.
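+
+For example, a local-mode handler might simply turn each MISS event into a
+kernel readahead over the missing range. This is a minimal sketch under
+the same assumptions as the earlier example (UAPI definitions from
+``<linux/mfs.h>``)::
+
+    /* Handle one MFS_OP_READ event in local mode: ask the kernel to
+     * read ahead over the missing range so later reads hit the page
+     * cache. A real daemon could enlarge ra.len to prefetch further. */
+    static int handle_local_miss(struct mfs_msg *msg)
+    {
+        struct mfs_read *rd = (struct mfs_read *)msg->data;
+        struct mfs_ioc_ra ra = {
+            .off = rd->off,
+            .len = rd->len,
+        };
+
+        /* msg->fd refers to the internal file object of this event */
+        return ioctl(msg->fd, MFS_IOC_RA, &ra);
+    }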
+
+When running in remote mode, the user daemon should 1) fetch the target
+data from remote storage, 2) write the data to MFS using the `write()`
+syscall on the `fd` provided in the message header, and 3) reply by
+calling the `MFS_IOC_DONE` ioctl with the parameter::
+
+    struct mfs_ioc_done {
+        __u32 id;
+        __u32 ret;
+    };
+
+where:
+
+ * ``id`` indicates the message id in the message header.
+
+ * ``ret`` indicates the return code for the event; 0 means success.
+
+In some cases, the user daemon may need to obtain the full path of the
+file object associated with the event to implement a more complex
+strategy, such as one based on tracing. To do this, it uses the
+`MFS_IOC_RPATH` ioctl on the `fd` provided in the message header. The
+parameter for this is::
+
+    struct mfs_ioc_rpath {
+        __u16 max;
+        __u16 len;
+        __u8 d[];
+    };
+
+where:
+
+ * ``max`` indicates the maximum length of the input data area that may
+   be filled with the full path.
+
+ * ``len`` indicates the real length of the full path.
+
+ * ``d[]`` indicates the input data area allocated by the user daemon.
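+
+Putting the remote-mode pieces together, a handler for one read event
+could look like the sketch below. ``fetch_from_remote()`` is a placeholder
+for the daemon's own transport and is not part of MFS; the rest follows
+the ioctl descriptions above::
+
+    /* Handle one MFS_OP_READ event in remote mode: fill the cache file
+     * behind msg->fd with the missing bytes, then unblock the reader. */
+    static int handle_remote_miss(struct mfs_msg *msg)
+    {
+        struct mfs_read *rd = (struct mfs_read *)msg->data;
+        struct mfs_ioc_done done = { .id = msg->id, .ret = 0 };
+        static char buf[1 << 16]; /* arbitrary transfer chunk */
+        __u64 pos = rd->off, end = rd->off + rd->len;
+
+        while (pos < end) {
+            size_t n = end - pos > sizeof(buf) ? sizeof(buf) : end - pos;
+
+            /* 1) placeholder: fill buf[0..n) from remote storage */
+            if (fetch_from_remote(buf, pos, n) < 0 ||
+                /* 2) write the data back through the event fd */
+                pwrite(msg->fd, buf, n, pos) != (ssize_t)n) {
+                done.ret = 1; /* non-zero reports failure */
+                break;
+            }
+            pos += n;
+        }
+        /* 3) reply so the kernel stops blocking the original read */
+        return ioctl(msg->fd, MFS_IOC_DONE, &done);
+    }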
+
+The user daemon is expected to implement its own, flexible prefetch
+strategy, so the default VFS readahead is disabled in MFS.
+
+
+Use cases
+=========
+
+- Boost model weight loading.
+
+In this case, the user daemon can employ several strategies to improve
+performance, such as concurrent loading, larger read-ahead I/O sizes,
+NUMA-aware allocation and trace-based prefetching triggered by MISS
+events.
+
+- Tracing read I/O.
+
+In this case, the user daemon can log the MISS I/O while the process runs
+by parsing ``off`` and ``len`` in the message.
diff --git a/MAINTAINERS b/MAINTAINERS
index 33eeabab5088760084dd54e1e562ef8594a5d0aa..927e9f6aa9ce8d7ad2e5edc41acc98c4ca88b8d1 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -7762,6 +7762,16 @@ F: Documentation/filesystems/erofs.rst
 F: fs/erofs/
 F: include/trace/events/erofs.h
+MFS FILE SYSTEM
+M: Hongbo Li
+M: Xiaojia Huang
+L: linux-fsdevel@vger.kernel.org
+S: Maintained
+F: Documentation/filesystems/mfs.rst
+F: fs/mfs/
+F: include/trace/events/mfs.h
+F: include/uapi/linux/mfs.h
+
 ERRSEQ ERROR TRACKING INFRASTRUCTURE
 M: Jeff Layton
 S: Maintained
diff --git a/arch/arm64/configs/openeuler_defconfig b/arch/arm64/configs/openeuler_defconfig
index 9902feacdb006812126ae048946c44b6e34d906a..0f119402a78e1bf943b65b582365b050b9d42551 100644
--- a/arch/arm64/configs/openeuler_defconfig
+++ b/arch/arm64/configs/openeuler_defconfig
@@ -7245,6 +7245,7 @@ CONFIG_EROFS_FS_POSIX_ACL=y
 CONFIG_EROFS_FS_SECURITY=y
 # CONFIG_EROFS_FS_ZIP is not set
 CONFIG_EROFS_FS_ONDEMAND=y
+CONFIG_MFS_FS=y
 CONFIG_NETWORK_FILESYSTEMS=y
 CONFIG_NFS_FS=m
 CONFIG_NFS_V2=m
diff --git a/arch/x86/configs/openeuler_defconfig b/arch/x86/configs/openeuler_defconfig
index b5beb81300d26cbb27777cd7f23a34851cfcbe06..38d79af54ec124e843b74d2a919ad6bfe9cb5329 100644
--- a/arch/x86/configs/openeuler_defconfig
+++ b/arch/x86/configs/openeuler_defconfig
@@ -8425,6 +8425,7 @@ CONFIG_EROFS_FS_POSIX_ACL=y
 CONFIG_EROFS_FS_SECURITY=y
 # CONFIG_EROFS_FS_ZIP is not set
 CONFIG_EROFS_FS_ONDEMAND=y
+CONFIG_MFS_FS=y
 CONFIG_NETWORK_FILESYSTEMS=y
 CONFIG_NFS_FS=m
 # CONFIG_NFS_V2 is not set
diff --git a/fs/Kconfig b/fs/Kconfig
index 92bcdc29e6a88eec55ac8734f75776385690f1ba..6657ada03d066659d4ffd266cf55b17948304e1b 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -370,6 +370,7 @@ source "fs/sysv/Kconfig"
 source "fs/ufs/Kconfig"
 source "fs/erofs/Kconfig"
 source "fs/vboxsf/Kconfig"
+source "fs/mfs/Kconfig"
 
 endif # MISC_FILESYSTEMS
 
diff --git a/fs/Makefile b/fs/Makefile
index 81428bad22f072162bff406eeae4acca1ec2316d..fa1e3d0678398064138d7c8a0e13f408435676c4 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -136,3 +136,4 @@ obj-$(CONFIG_EROFS_FS) += erofs/
 obj-$(CONFIG_VBOXSF_FS) += vboxsf/
 obj-$(CONFIG_ZONEFS_FS) += zonefs/
 obj-$(CONFIG_RESCTRL_FS) += resctrl/
+obj-$(CONFIG_MFS_FS) += mfs/
\ No newline at end of file
diff --git a/fs/mfs/Kconfig b/fs/mfs/Kconfig
new file mode 100644
index 0000000000000000000000000000000000000000..26c336a3056e7e2cbfb332a474c9fba1eef41b9a
--- /dev/null
+++ b/fs/mfs/Kconfig
@@ -0,0 +1,12 @@
+# SPDX-License-Identifier: GPL-2.0-only
+
+config MFS_FS
+	tristate "MFS filesystem support"
+	help
+	  MFS provides programmable caching for users.
+	  It is a stackable file system which posts miss events when the
+	  data being read is not in the cache (memory or disk). Based on
+	  these events, users can trigger tailored I/O for each
+	  application.
+
+	  If unsure, say N.
diff --git a/fs/mfs/Makefile b/fs/mfs/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..68c090fb4bc3b26146db362cc15575690ad05b85
--- /dev/null
+++ b/fs/mfs/Makefile
@@ -0,0 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
+
+obj-$(CONFIG_MFS_FS) += mfs.o
+mfs-objs := super.o inode.o data.o dev.o cache.o
diff --git a/fs/mfs/cache.c b/fs/mfs/cache.c
new file mode 100644
index 0000000000000000000000000000000000000000..2675fe58b781f1a46b20afb778582bc2afcd0261
--- /dev/null
+++ b/fs/mfs/cache.c
@@ -0,0 +1,507 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/* Copyright (C) 2025. Huawei Technologies Co., Ltd */
+
+#include "internal.h"
+
+#include
+#include
+#include
+#include
+#include
+
+#include
+
+/*
+ * Used for cache object
+ */
+static struct kmem_cache *mfs_cobject_cachep;
+
+static int fd_release(struct inode *inode, struct file *file)
+{
+	struct mfs_cache_object *object = file->private_data;
+
+	down_write(&object->rwsem);
+	if (object->fd > 0) {
+		object->fd = -1;
+		object->anon_file = NULL;
+		iput(object->mfs_inode);
+	}
+	up_write(&object->rwsem);
+	return 0;
+}
+
+static ssize_t fd_write_iter(struct kiocb *iocb, struct iov_iter *iter)
+{
+	struct file *ori_file = iocb->ki_filp;
+	struct mfs_cache_object *object = ori_file->private_data;
+	struct mfs_sb_info *sbi = MFS_SB(object->mfs_inode->i_sb);
+	ssize_t ret;
+
+	if (!test_bit(MFS_CACHE_READY, &sbi->caches.flags))
+		return -EINVAL;
+	if (sbi->mode != MFS_MODE_REMOTE)
+		return -EOPNOTSUPP;
+
+	iocb->ki_filp = object->cache_file;
+	ret = vfs_iocb_iter_write(object->cache_file, iocb, iter);
+	iocb->ki_filp = ori_file;
+	return ret;
+}
+
+static loff_t fd_llseek(struct file *filp, loff_t pos, int whence)
+{
+	struct mfs_cache_object *object = filp->private_data;
+	struct mfs_sb_info *sbi = MFS_SB(object->mfs_inode->i_sb);
+
+	if (!test_bit(MFS_CACHE_READY, &sbi->caches.flags))
+		return -EINVAL;
+	if (sbi->mode != MFS_MODE_REMOTE)
+		return -EOPNOTSUPP;
+
+	return vfs_llseek(object->cache_file, pos, whence);
+}
+
+/* Used for sync events */
+static long _ioc_done(struct mfs_cache_object *object,
+		      struct mfs_ioc_done *done)
+{
+	struct mfs_sb_info *sbi = MFS_SB(object->mfs_inode->i_sb);
+	struct mfs_caches *caches = &sbi->caches;
+	XA_STATE(xas, &caches->events, done->id);
+	struct mfs_syncer *syncer;
+	struct mfs_event *event;
+
+	xas_lock(&xas);
+	event = xas_load(&xas);
+	if (!event || event->object != object) {
+		xas_unlock(&xas);
+		return -EINVAL;
+	}
+	xas_store(&xas, NULL);
+	syncer = event->syncer;
+	if (done->ret)
+		atomic_cmpxchg(&syncer->res, 0, -EIO);
+	spin_lock(&syncer->list_lock);
+	list_del(&event->link);
+	spin_unlock(&syncer->list_lock);
+	if (atomic_dec_return(&syncer->notback) == 0)
+		complete(&syncer->done);
+	xas_unlock(&xas);
+
+	put_mfs_event(event);
+	return 0;
+}
+
+static void force_ra(struct address_space *mapping, struct file *file,
+		     pgoff_t start, pgoff_t end)
+{
+	unsigned long default_pages = (4 * 1024 * 1024) / PAGE_SIZE;
+	DEFINE_READAHEAD(ractl, file, NULL, mapping, start);
+	pgoff_t index = start;
+	unsigned long nr_to_read;
+
+	nr_to_read = end - start + 1;
+	while (nr_to_read) {
+		if (default_pages > nr_to_read)
+			default_pages = nr_to_read;
+		if (index > end)
+			return;
+		ractl._index = index;
+		page_cache_ra_unbounded(&ractl,
default_pages, 0); + index += default_pages; + nr_to_read -= default_pages; + } +} + +/* Used for async events */ +static long _ioc_ra(struct mfs_cache_object *object, + struct mfs_ioc_ra *ra) +{ + struct file *file = object->cache_file; + struct address_space *mapping = file->f_mapping; + struct inode *inode = file_inode(file); + loff_t endbyte, isize; + pgoff_t start, end; + + isize = i_size_read(inode); + if (!isize) + return 0; + if (ra->off >= isize) + return -EINVAL; + endbyte = (u64)ra->off + (u64)ra->len; + if (!ra->len || endbyte < ra->len) + endbyte = LLONG_MAX; + else + endbyte--; + endbyte = min_t(loff_t, endbyte, isize); + + start = ra->off >> PAGE_SHIFT; + end = endbyte >> PAGE_SHIFT; + + force_ra(mapping, file, start, end); + return 0; +} + +static long fd_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) +{ + struct mfs_cache_object *object = filp->private_data; + struct mfs_sb_info *sbi = MFS_SB(object->mfs_inode->i_sb); + int ret = 0; + + if (!test_bit(MFS_CACHE_READY, &sbi->caches.flags)) + return -EINVAL; + + switch (cmd) { + case MFS_IOC_DONE: + { + struct mfs_ioc_done done; + + if (sbi->mode != MFS_MODE_REMOTE) + return -EOPNOTSUPP; + if (copy_from_user(&done, (void __user *)arg, sizeof(done))) + return -EFAULT; + ret = _ioc_done(object, &done); + break; + } + case MFS_IOC_RA: + { + struct mfs_ioc_ra ra; + + if (sbi->mode != MFS_MODE_LOCAL) + return -EOPNOTSUPP; + if (copy_from_user(&ra, (void __user *)arg, sizeof(ra))) + return -EFAULT; + ret = _ioc_ra(object, &ra); + break; + } + case MFS_IOC_RPATH: + { + struct mfs_ioc_rpath __user *ua = (struct mfs_ioc_rpath __user *)arg; + struct mfs_ioc_rpath *rpath; + int plen, clen; + u32 bytes; + char *p; + + if (get_user(bytes, &ua->max)) + return -EFAULT; + rpath = kzalloc(bytes + sizeof(struct mfs_ioc_rpath), GFP_KERNEL); + if (!rpath) + return -ENOMEM; + + rpath->max = bytes; + p = file_path(object->cache_file, rpath->d, rpath->max); + if (IS_ERR(p)) { + kfree(rpath); + return PTR_ERR(p); + } + plen = strlen(p), clen = strlen(sbi->cachedir); + if (plen <= clen) { + kfree(rpath); + return -EFAULT; + } + rpath->len = plen - clen; + /* include the tailing nil */ + memmove(rpath->d, p + clen, rpath->len + 1); + if (copy_to_user((void __user *)arg, rpath, + rpath->len + 1 + sizeof(struct mfs_ioc_rpath))) + ret = -EFAULT; + kfree(rpath); + break; + } + default: + return -EINVAL; + } + return ret; +} + +static const struct file_operations mfs_fd_fops = { + .owner = THIS_MODULE, + .release = fd_release, + .write_iter = fd_write_iter, + .llseek = fd_llseek, + .unlocked_ioctl = fd_ioctl, +}; + +static int mfs_setup_object(struct mfs_cache_object *object, + struct inode *inode, + struct path *cache_path) +{ + struct inode *cache_inode = d_inode(cache_path->dentry); + struct file *cache_file; + int flags = O_RDONLY; + + if (need_sync_event(inode->i_sb)) + flags = O_RDWR; + cache_file = kernel_file_open(cache_path, flags | O_LARGEFILE, + cache_inode, current_cred()); + if (IS_ERR(cache_file)) + return PTR_ERR(cache_file); + /* + * object belongs to a mfs inode, + * this is a reverse pointer, no refcount needed. 
+	 */
+	object->mfs_inode = inode;
+	object->cache_file = cache_file;
+	init_rwsem(&object->rwsem);
+	object->fd = -1;
+	object->anon_file = NULL;
+	return 0;
+}
+
+struct mfs_event *mfs_pick_event(struct xa_state *xas,
+				 unsigned long xa_max)
+{
+	struct mfs_event *event;
+
+	xas_for_each_marked(xas, event, xa_max, MFS_EVENT_NEW) {
+		return event;
+	}
+
+	return NULL;
+}
+
+void mfs_post_event_read(struct mfs_cache_object *object,
+			 loff_t off, uint64_t len,
+			 struct mfs_syncer *syncer, int op)
+{
+	struct mfs_sb_info *sbi = MFS_SB(object->mfs_inode->i_sb);
+	struct mfs_caches *caches = &sbi->caches;
+	XA_STATE(xas, &caches->events, 0);
+	struct mfs_event *event;
+	struct mfs_read *msg;
+	int ret;
+
+	/* 1. init the event struct */
+	event = kzalloc(sizeof(*event) + sizeof(*msg), GFP_KERNEL);
+	if (!event) {
+		pr_warn("post read event failed, off:%lld, len:%llu\n", off, len);
+		return;
+	}
+
+	/* 2. hold the object's owner mfs_inode */
+	ihold(object->mfs_inode);
+	trace_mfs_post_event_read(object->mfs_inode, off, len, op);
+	refcount_set(&event->ref, 1);
+	event->object = object;
+	event->msg.version = 0;
+	event->msg.opcode = op;
+	event->msg.len = sizeof(struct mfs_msg) + sizeof(struct mfs_read);
+	event->msg.fd = object->fd;
+	msg = (void *)event->msg.data;
+	msg->off = off;
+	msg->len = len;
+	msg->pid = current->pid;
+	INIT_LIST_HEAD(&event->link);
+	event->syncer = syncer;
+	if (event->syncer) {
+		atomic_inc(&syncer->notback);
+		spin_lock(&syncer->list_lock);
+		list_add_tail(&event->link, &syncer->head);
+		spin_unlock(&syncer->list_lock);
+	}
+
+	/* 3. insert the event into the events xarray */
+	do {
+		xas_lock(&xas);
+
+		if (!test_bit(MFS_CACHE_READY, &caches->flags)) {
+			xas_unlock(&xas);
+			goto out;
+		}
+
+		/* Ensure the cache-ready check is done before posting events */
+		smp_mb__after_atomic();
+
+		xas.xa_index = caches->next_msg;
+		xas_find_marked(&xas, UINT_MAX, XA_FREE_MARK);
+		if (xas.xa_node == XAS_RESTART) {
+			xas.xa_index = 0;
+			xas_find_marked(&xas, caches->next_msg - 1, XA_FREE_MARK);
+		}
+		if (xas.xa_node == XAS_RESTART)
+			xas_set_err(&xas, -EBUSY);
+		xas_store(&xas, event);
+		if (xas_valid(&xas)) {
+			caches->next_msg = xas.xa_index + 1;
+			event->msg.id = xas.xa_index;
+			xas_clear_mark(&xas, XA_FREE_MARK);
+			xas_set_mark(&xas, MFS_EVENT_NEW);
+		}
+		xas_unlock(&xas);
+	} while (xas_nomem(&xas, GFP_KERNEL));
+
+	ret = xas_error(&xas);
+	if (ret) {
+		pr_warn("post read event failed to insert events, off:%lld, len:%llu, ret:%d\n",
+			off, len, ret);
+		goto out;
+	}
+
+	/* 4. wake up the polling wait list */
+	wake_up_all(&caches->pollwq);
+	return;
+out:
+	if (event->syncer) {
+		spin_lock(&syncer->list_lock);
+		list_del_init(&event->link);
+		spin_unlock(&syncer->list_lock);
+		atomic_dec(&syncer->notback);
+	}
+	kfree(event);
+	iput(object->mfs_inode);
+}
+
+void mfs_destroy_events(struct super_block *sb)
+{
+	struct mfs_sb_info *sbi = MFS_SB(sb);
+	struct mfs_caches *caches = &sbi->caches;
+	unsigned long index;
+	struct mfs_event *event;
+
+	xa_lock(&caches->events);
+	xa_for_each(&caches->events, index, event) {
+		/*
+		 * Inodes are evicted before the events are destroyed,
+		 * so no events should remain at this point.
+		 */
+		pr_warn("Event remains:%lu\n", index);
+		__xa_erase(&caches->events, index);
+		put_mfs_event(event);
+	}
+	xa_unlock(&caches->events);
+	xa_destroy(&caches->events);
+}
+
+void mfs_cancel_syncer_events(struct mfs_cache_object *object,
+			      struct mfs_syncer *syncer)
+{
+	struct mfs_sb_info *sbi = MFS_SB(object->mfs_inode->i_sb);
+	struct mfs_caches *caches = &sbi->caches;
+	struct xarray *xa = &caches->events;
+	struct mfs_event *event, *nevent;
+
+	xa_lock(xa);
+	spin_lock(&syncer->list_lock);
+	list_for_each_entry_safe(event, nevent, &syncer->head, link) {
+		__xa_erase(&caches->events, event->msg.id);
+		list_del(&event->link);
+		put_mfs_event(event);
+	}
+	spin_unlock(&syncer->list_lock);
+	xa_unlock(xa);
+}
+
+void mfs_cancel_all_events(struct mfs_sb_info *sbi)
+{
+	struct mfs_caches *caches = &sbi->caches;
+	struct xarray *xa = &caches->events;
+	struct mfs_syncer *syncer;
+	struct mfs_event *event;
+	unsigned long index;
+
+	while (!xa_empty(xa)) {
+		xa_lock(xa);
+		xa_for_each(xa, index, event) {
+			__xa_erase(xa, index);
+			syncer = event->syncer;
+			/*
+			 * The syncer may live on a caller's stack, so wake
+			 * it up while still holding the xa lock.
+			 */
+			if (syncer) {
+				spin_lock(&syncer->list_lock);
+				list_del(&event->link);
+				spin_unlock(&syncer->list_lock);
+				if (atomic_dec_return(&syncer->notback) == 0) {
+					atomic_cmpxchg(&syncer->res, 0, -EIO);
+					complete(&syncer->done);
+				}
+			}
+			put_mfs_event(event);
+			if (need_resched())
+				break;
+		}
+		xa_unlock(xa);
+		cond_resched();
+	}
+	caches->next_ev = 0;
+	caches->next_msg = 0;
+}
+
+int try_hook_fd(struct mfs_event *event)
+{
+	struct mfs_cache_object *object = event->object;
+	struct file *anon_file;
+	int fd;
+
+	down_read(&object->rwsem);
+	if (object->fd > 0) {
+		up_read(&object->rwsem);
+		return object->fd;
+	}
+	up_read(&object->rwsem);
+	down_write(&object->rwsem);
+	fd = get_unused_fd_flags(O_WRONLY);
+	if (fd < 0) {
+		up_write(&object->rwsem);
+		return fd;
+	}
+
+	anon_file = anon_inode_getfile("[mfs]", &mfs_fd_fops, object, O_WRONLY);
+	if (IS_ERR(anon_file)) {
+		put_unused_fd(fd);
+		up_write(&object->rwsem);
+		return PTR_ERR(anon_file);
+	}
+	anon_file->f_mode |= FMODE_PWRITE | FMODE_LSEEK;
+	object->fd = fd;
+	object->anon_file = anon_file;
+	/* the mfs_inode must outlive the fd/anon_file, hence the hold */
+	ihold(object->mfs_inode);
+	fd_install(fd, anon_file);
+	up_write(&object->rwsem);
+	return fd;
+}
+
+struct mfs_cache_object *mfs_alloc_object(struct inode *inode,
+					  struct path *cache_path)
+{
+	struct mfs_cache_object *object;
+	int err;
+
+	object = kmem_cache_alloc(mfs_cobject_cachep, GFP_KERNEL);
+	if (!object)
+		return ERR_PTR(-ENOMEM);
+
+	err = mfs_setup_object(object, inode, cache_path);
+	if (err) {
+		kmem_cache_free(mfs_cobject_cachep, object);
+		return ERR_PTR(err);
+	}
+
+	return object;
+}
+
+void mfs_free_object(void *data)
+{
+	struct mfs_cache_object *object = (struct mfs_cache_object *)data;
+
+	fput(object->cache_file);
+	kmem_cache_free(mfs_cobject_cachep, object);
+}
+
+int mfs_cache_init(void)
+{
+	mfs_cobject_cachep =
+		kmem_cache_create("mfs_object",
+				  sizeof(struct mfs_cache_object), 0,
+				  SLAB_RECLAIM_ACCOUNT, NULL);
+	if (!mfs_cobject_cachep)
+		return -ENOMEM;
+	return 0;
+}
+
+void mfs_cache_exit(void)
+{
+	kmem_cache_destroy(mfs_cobject_cachep);
+}
diff --git a/fs/mfs/data.c b/fs/mfs/data.c
new file mode 100644
index 0000000000000000000000000000000000000000..32d0b6947aa87336f121163024b5c5fb002c5be4
--- /dev/null
+++ b/fs/mfs/data.c
@@ -0,0 +1,523 @@
+// 
SPDX-License-Identifier: GPL-2.0-or-later +/* Copyright (C) 2025. Huawei Technologies Co., Ltd */ + +#include "internal.h" + +#include +#include +#include + +#include + +static struct mfs_file_info *mfs_file_info_alloc(struct file *lower, struct file *cache) +{ + struct mfs_file_info *info = kzalloc(sizeof(struct mfs_file_info), GFP_KERNEL); + + if (unlikely(!info)) + return NULL; + + info->lower = lower; + info->cache = cache; + return info; +} + +static void mfs_file_info_free(struct mfs_file_info *info) +{ + fput(info->cache); + fput(info->lower); + kfree(info); +} + +static int mfs_open(struct inode *inode, struct file *file) +{ + struct dentry *dentry = file_dentry(file); + struct mfs_sb_info *sbi = MFS_SB(inode->i_sb); + struct path lpath, cpath; + struct file *lfile, *cfile; + int flags = file->f_flags | MFS_OPEN_FLAGS; + struct mfs_file_info *file_info; + int err = 0; + + trace_mfs_open(inode, file); + mfs_get_path(dentry, &lpath, &cpath); + lfile = dentry_open(&lpath, flags, current_cred()); + if (IS_ERR(lfile)) { + err = PTR_ERR(lfile); + goto put_path; + } + + cfile = dentry_open(&cpath, flags, current_cred()); + if (IS_ERR(cfile)) { + err = PTR_ERR(cfile); + goto lfput; + } + + if (support_event(sbi)) + /* close the default readahead */ + cfile->f_mode |= FMODE_RANDOM; + file_info = mfs_file_info_alloc(lfile, cfile); + if (!file_info) { + err = -ENOMEM; + goto cfput; + } + + file->private_data = file_info; + goto put_path; +cfput: + fput(cfile); +lfput: + fput(lfile); +put_path: + mfs_put_path(&lpath, &cpath); + return err; +} + +static int mfs_release(struct inode *inode, struct file *file) +{ + trace_mfs_release(inode, file); + mfs_file_info_free(file->private_data); + return 0; +} + +static loff_t mfs_llseek(struct file *file, loff_t offset, int whence) +{ + struct mfs_file_info *file_info = file->private_data; + struct inode *inode = file_inode(file); + struct file *lfile, *cfile; + loff_t ret; + + if (offset == 0) { + if (whence == SEEK_CUR) + return file->f_pos; + + if (whence == SEEK_SET) + return vfs_setpos(file, 0, 0); + } + + lfile = file_info->lower; + cfile = file_info->cache; + + mfs_inode_lock(inode); + lfile->f_pos = file->f_pos; + ret = vfs_llseek(lfile, offset, whence); + if (ret < 0) + goto out; + + cfile->f_pos = file->f_pos; + ret = vfs_llseek(cfile, offset, whence); + if (ret < 0) + goto out; + + file->f_pos = lfile->f_pos; +out: + mfs_inode_unlock(inode); + return ret; +} + +static int mfs_flush(struct file *file, fl_owner_t id) +{ + struct mfs_file_info *file_info = file->private_data; + struct file *cfile; + int err = 0; + + cfile = file_info->cache; + if (cfile->f_op->flush) + err = cfile->f_op->flush(cfile, id); + + return err; +} + +static int mfs_readdir(struct file *file, struct dir_context *ctx) +{ + struct file *lfile; + struct mfs_file_info *file_info = file->private_data; + + lfile = file_info->lower; + return iterate_dir(lfile, ctx); +} + +enum range_status { + RANGE_DATA, + RANGE_HOLE, + RANGE_INVAL, +}; + +/* Continuous range with same status */ +struct range_t { + struct file *file; + loff_t off; + size_t max; + size_t len; + int status; +}; + +typedef int (*range_check) (struct range_t *r); + +struct range_ctx { + bool sync; /* handle the miss case in sync/async way */ + int op; + loff_t off; + size_t len; + struct file *file; + struct mfs_cache_object *object; + range_check checker; /* check method for range */ +}; + +static int range_check_disk(struct range_t *r) +{ + loff_t off, to, start = r->off, end = r->off + r->max; + struct file 
*file = r->file; + int err = 0; + + off = vfs_llseek(file, start, SEEK_DATA); + if (off < 0) { + if (off == (loff_t)-ENXIO) { + r->len = end - start; + r->status = RANGE_HOLE; + goto out; + } + err = (int)off; + goto out; + } + if (off >= end) { + r->len = end - start; + r->status = RANGE_HOLE; + goto out; + } + if (off > start) { + r->len = end - off; + r->status = RANGE_HOLE; + goto out; + } + to = vfs_llseek(file, start, SEEK_HOLE); + if (to < 0) { + err = (int)to; + goto out; + } + if (to < end) { + r->len = to - start; + r->status = RANGE_DATA; + goto out; + } + r->len = end - start; + r->status = RANGE_DATA; +out: + return err; +} + +static int range_check_mem(struct range_t *r) +{ + struct inode *inode = file_inode(r->file); + struct address_space *mapping = inode->i_mapping; + loff_t cur_off = r->off, end = r->off + r->max; + struct folio *folio; + + /* check from the first folio */ + folio = filemap_get_folio(mapping, cur_off >> PAGE_SHIFT); + if (IS_ERR(folio)) { + r->status = RANGE_HOLE; + cur_off += PAGE_SIZE; + } else { + r->status = RANGE_DATA; + cur_off += folio_size(folio); + folio_put(folio); + } + + while (cur_off < end) { + folio = filemap_get_folio(mapping, cur_off >> PAGE_SHIFT); + if (IS_ERR(folio)) { + if (r->status == RANGE_DATA) + break; + /* continuous hole */ + cur_off += PAGE_SIZE; + continue; + } + if (r->status == RANGE_HOLE) { + folio_put(folio); + break; + } + cur_off += folio_size(folio); + folio_put(folio); + } + + r->len = cur_off - r->off; + return 0; +} + +static int mfs_check_range(struct range_ctx *ctx) +{ + struct mfs_sb_info *sbi = MFS_SB(ctx->object->mfs_inode->i_sb); + loff_t start = ctx->off, end = ctx->off + ctx->len; + struct file *file = ctx->file; + struct range_t r = { .file = file }; + size_t len = ctx->len; + struct mfs_syncer syncer; + int err = 0, err2 = 0; + + if (!support_event(sbi)) + return 0; + if (!cache_is_ready(sbi)) + return ctx->sync ? 
-EIO : 0; + if (!ctx->len) + return 0; + + atomic_set(&syncer.notback, 1); + init_completion(&syncer.done); + INIT_LIST_HEAD(&syncer.head); + spin_lock_init(&syncer.list_lock); + atomic_set(&syncer.res, 0); + while (start < end) { + r.off = round_down(start, PAGE_SIZE); + r.max = len + (start - r.off); + r.len = 0; + r.status = RANGE_INVAL; + err = ctx->checker(&r); + if (err) + goto err; + switch (r.status) { + case RANGE_DATA: + start += r.len; + len -= r.len; + break; + case RANGE_HOLE: + start += r.len; + len -= r.len; + if (ctx->sync) + mfs_post_event_read(ctx->object, r.off, r.len, &syncer, ctx->op); + else + mfs_post_event_read(ctx->object, r.off, r.len, NULL, ctx->op); + break; + default: + pr_warn("invalid range status:%d\n", r.status); + WARN_ON_ONCE(1); + err = -EINVAL; + goto err; + } + } + +err: + if (atomic_dec_return(&syncer.notback) > 0) { + err2 = wait_for_completion_interruptible(&syncer.done); + if (err2) + mfs_cancel_syncer_events(ctx->object, &syncer); + else + err = atomic_read(&syncer.res); + } + return err ?: err2; +} + +static ssize_t mfs_read_iter(struct kiocb *iocb, struct iov_iter *to) +{ + struct file *cfile, *file = iocb->ki_filp; + struct mfs_file_info *fi = file->private_data; + size_t isize = i_size_read(file_inode(file)); + struct range_ctx ctx; + ssize_t rsize; + int err; + + if (!iov_iter_count(to)) + return 0; + + cfile = fi->cache; + if (!cfile->f_op->read_iter) + return -EINVAL; + + (void)get_file(cfile); + ctx.file = cfile; + ctx.object = file_inode(file)->i_private; + ctx.off = iocb->ki_pos; + ctx.op = MFS_OP_READ; + ctx.len = min_t(size_t, isize - ctx.off, iov_iter_count(to)); + ctx.sync = false; + ctx.checker = range_check_mem; + if (need_sync_event(file_inode(file)->i_sb)) { + ctx.sync = true; + ctx.checker = range_check_disk; + } + err = mfs_check_range(&ctx); + if (err) { + fput(cfile); + return err; + } + + iocb->ki_filp = cfile; + rsize = cfile->f_op->read_iter(iocb, to); + iocb->ki_filp = file; + fput(cfile); + return rsize; +} + +static int mfs_file_mmap(struct file *file, struct vm_area_struct *vma) +{ + struct mfs_file_info *fi = file->private_data; + struct file *cfile = fi->cache; + int err; + + if (!cfile->f_op->mmap) + return -ENODEV; + + (void)get_file(cfile); + vma->vm_file = cfile; + err = call_mmap(vma->vm_file, vma); + vma->vm_file = file; + fput(cfile); + if (err) + return err; + + fi->cache_vm_ops = vma->vm_ops; + vma->vm_ops = &mfs_file_vm_ops; + + return 0; +} + +static vm_fault_t mfs_filemap_fault(struct vm_fault *vmf) +{ + struct file *cfile, *file = vmf->vma->vm_file; + struct mfs_file_info *fi = file->private_data; + size_t isize = i_size_read(file_inode(file)); + const struct vm_operations_struct *cvm_ops; + struct vm_area_struct cvma, *vma, **vma_; + struct range_ctx ctx; + vm_fault_t ret; + int err; + + vma = vmf->vma; + memcpy(&cvma, vma, sizeof(struct vm_area_struct)); + cfile = fi->cache; + cvm_ops = fi->cache_vm_ops; + cvma.vm_file = cfile; + + if (unlikely(!cvm_ops->fault)) + return VM_FAULT_SIGBUS; + if ((vmf->pgoff << PAGE_SHIFT) >= isize) + return VM_FAULT_SIGBUS; + + (void)get_file(cfile); + ctx.file = cfile; + ctx.object = file_inode(file)->i_private; + ctx.off = vmf->pgoff << PAGE_SHIFT; + ctx.len = min_t(size_t, isize - ctx.off, PAGE_SIZE); + ctx.op = MFS_OP_FAULT; + ctx.sync = false; + ctx.checker = range_check_mem; + if (need_sync_event(file_inode(file)->i_sb)) { + ctx.sync = true; + ctx.checker = range_check_disk; + } + err = mfs_check_range(&ctx); + if (err) { + fput(cfile); + return VM_FAULT_SIGBUS; + 
} + + /* + * Dealing fault in mfs will call cachefile's fault eventually, + * hence we will change vmf->vma->vm_file to cachefile. + * When faulting concurrently, changing vmf->vma->vm_file is + * visible to other threads. Hence we use cvma to narrow the + * visibility. vmf->vma is const, so we use **vma_ to change. + */ + vma_ = (struct vm_area_struct **)&vmf->vma; + *vma_ = &cvma; + ret = cvm_ops->fault(vmf); + *vma_ = vma; + fput(cfile); + return ret; +} + +static vm_fault_t mfs_filemap_map_pages(struct vm_fault *vmf, + pgoff_t start_pgoff, pgoff_t end_pgoff) +{ + struct file *cfile, *file = vmf->vma->vm_file; + struct mfs_file_info *fi = file->private_data; + size_t isize = i_size_read(file_inode(file)); + const struct vm_operations_struct *cvm_ops; + struct vm_area_struct cvma, *vma, **vma_; + struct range_ctx ctx; + vm_fault_t ret; + int err; + + vma = vmf->vma; + memcpy(&cvma, vma, sizeof(struct vm_area_struct)); + cfile = fi->cache; + cvm_ops = fi->cache_vm_ops; + cvma.vm_file = cfile; + + if (unlikely(!cvm_ops->map_pages)) + return 0; + if ((start_pgoff << PAGE_SHIFT) >= isize) + return 0; + + (void)get_file(cfile); + ctx.file = cfile; + ctx.object = file_inode(file)->i_private; + ctx.off = start_pgoff << PAGE_SHIFT; + ctx.len = min_t(size_t, isize - ctx.off, (end_pgoff - start_pgoff) << PAGE_SHIFT); + ctx.op = MFS_OP_FAROUND; + ctx.sync = false; + ctx.checker = range_check_mem; + if (need_sync_event(file_inode(file)->i_sb)) { + ctx.sync = true; + ctx.checker = range_check_disk; + } + err = mfs_check_range(&ctx); + if (err) { + fput(cfile); + return 0; + } + + vma_ = (struct vm_area_struct **)&vmf->vma; + *vma_ = &cvma; + ret = cvm_ops->map_pages(vmf, start_pgoff, end_pgoff); + *vma_ = vma; + fput(cfile); + + return ret; +} + +static int mfs_fadvise(struct file *file, loff_t offset, loff_t len, int advice) +{ + struct inode *inode = file_inode(file); + struct mfs_sb_info *sbi = MFS_SB(inode->i_sb); + struct mfs_file_info *fi; + struct file *cfile; + int ret; + + /* avoid trigger readahead in event mode */ + if (support_event(sbi)) + return generic_fadvise(file, offset, len, advice); + + fi = file->private_data; + cfile = fi->cache; + (void)get_file(cfile); + + ret = vfs_fadvise(cfile, offset, len, advice); + fput(cfile); + + return ret; +} + +const struct file_operations mfs_dir_fops = { + .open = mfs_open, + .iterate_shared = mfs_readdir, + .release = mfs_release, +}; + +const struct file_operations mfs_file_fops = { + .open = mfs_open, + .release = mfs_release, + .llseek = mfs_llseek, + .read_iter = mfs_read_iter, + .flush = mfs_flush, + .mmap = mfs_file_mmap, + .fadvise = mfs_fadvise, +}; + +const struct vm_operations_struct mfs_file_vm_ops = { + .fault = mfs_filemap_fault, + .map_pages = mfs_filemap_map_pages, +}; + +const struct address_space_operations mfs_aops = { + .direct_IO = noop_direct_IO, +}; diff --git a/fs/mfs/dev.c b/fs/mfs/dev.c new file mode 100644 index 0000000000000000000000000000000000000000..902f73b1c25c5c57ad968f9dae8381c50d3048b5 --- /dev/null +++ b/fs/mfs/dev.c @@ -0,0 +1,242 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* Copyright (C) 2025. 
Huawei Technologies Co., Ltd */ + +#include "internal.h" + +#include +#include +#include +#include + +#include + +static DEFINE_MUTEX(mfs_dev_lock); +static DEFINE_IDR(mfs_dev_minor); + +static int mfs_dev_major; +static const struct class mfs_dev_class = { + .name = "mfs", +}; +static struct device *mfs_dev; + +static int mfs_dev_open(struct inode *inode, struct file *file) +{ + struct mfs_caches *caches; + struct mfs_sb_info *sbi; + unsigned minor = iminor(inode); + + sbi = minor < U8_MAX ? idr_find(&mfs_dev_minor, minor) : NULL; + if (!sbi) + return -EOPNOTSUPP; + caches = &sbi->caches; + if (test_and_set_bit(MFS_CACHE_OPENED, &caches->flags)) + return -EBUSY; + smp_mb__after_atomic(); + + /* not mounted or cleaned by umounting */ + if (!test_bit(MFS_MOUNTED, &sbi->flags)) { + clear_bit(MFS_CACHE_OPENED, &caches->flags); + return -EBUSY; + } + + file->private_data = sbi; + set_bit(MFS_CACHE_READY, &caches->flags); + return 0; +} + +static int mfs_dev_release(struct inode *inode, struct file *file) +{ + struct mfs_sb_info *sbi = file->private_data; + struct mfs_caches *caches = &sbi->caches; + + clear_bit(MFS_CACHE_READY, &caches->flags); + smp_mb__after_atomic(); + mfs_cancel_all_events(sbi); + smp_mb__before_atomic(); + clear_bit(MFS_CACHE_OPENED, &caches->flags); + return 0; +} + +static ssize_t mfs_dev_read(struct file *file, char __user *buf, + size_t blen, loff_t *off) +{ + struct mfs_sb_info *sbi = file->private_data; + struct mfs_caches *caches = &sbi->caches; + XA_STATE(xas, &caches->events, caches->next_ev); + struct mfs_event *event; + struct mfs_msg *msg; + size_t n; + int ret = 0; + + xas_lock(&xas); + event = mfs_pick_event(&xas, ULONG_MAX); + if (!event && caches->next_ev > 0) { + xas_set(&xas, 0); + event = mfs_pick_event(&xas, caches->next_ev - 1); + } + if (!event) { + xas_unlock(&xas); + return 0; + } + if (event->syncer) + get_mfs_event(event); + xas_unlock(&xas); + + msg = &event->msg; + n = msg->len; + if (n > blen) { + ret = -EMSGSIZE; + goto out; + } + + ret = try_hook_fd(event); + if (ret < 0) + goto out; + + msg->fd = ret; + ret = 0; + if (copy_to_user(buf, msg, n)) { + ret = -EFAULT; + goto out; + } + xas_lock(&xas); + xas_clear_mark(&xas, MFS_EVENT_NEW); + caches->next_ev = xas.xa_index + 1; + if (!event->syncer) + xas_store(&xas, NULL); + xas_unlock(&xas); +out: + put_mfs_event(event); + trace_mfs_dev_read(file, msg->opcode, msg->id, msg->fd); + return ret ? ret : n; +} + +static __poll_t mfs_dev_poll(struct file *file, + struct poll_table_struct *poll) +{ + struct mfs_sb_info *sbi = file->private_data; + struct mfs_caches *caches = &sbi->caches; + struct mfs_event *event; + XA_STATE(xas, &caches->events, 0); + __poll_t mask; + + poll_wait(file, &caches->pollwq, poll); + mask = 0; + + if (!xa_empty(&caches->events)) { + xas_lock(&xas); + xas_for_each_marked(&xas, event, ULONG_MAX, MFS_EVENT_NEW) { + mask |= EPOLLIN; + break; + } + xas_unlock(&xas); + } + + return mask; +} + +static long mfs_dev_ioctl(struct file *filp, + unsigned int cmd, unsigned long arg) +{ + struct mfs_ioc_fsinfo fsinfo; + unsigned minor = iminor(file_inode(filp)); + struct mfs_sb_info *sbi = minor < U8_MAX ? 
+ idr_find(&mfs_dev_minor, minor) : NULL; + if (!sbi) + return -EOPNOTSUPP; + + if (cmd != MFS_IOC_FSINFO) + return -EINVAL; + if (!test_bit(MFS_MOUNTED, &sbi->flags)) + return -EBUSY; + + fsinfo.mode = sbi->mode; + if (copy_to_user((void __user *)arg, &fsinfo, + sizeof(struct mfs_ioc_fsinfo))) + return -EFAULT; + return 0; +} + +static const struct file_operations mfs_dev_fops = { + .owner = THIS_MODULE, + .open = mfs_dev_open, + .release = mfs_dev_release, + .read = mfs_dev_read, + .poll = mfs_dev_poll, + .unlocked_ioctl = mfs_dev_ioctl, +}; + +int mfs_fs_dev_init(struct super_block *sb) +{ + struct mfs_sb_info *sbi = MFS_SB(sb); + struct device *dev; + + mutex_lock(&mfs_dev_lock); + sbi->minor = idr_alloc(&mfs_dev_minor, sbi, 0, U8_MAX, GFP_KERNEL); + if (sbi->minor < 0) { + mutex_unlock(&mfs_dev_lock); + return sbi->minor; + } + + dev = device_create(&mfs_dev_class, NULL, + MKDEV(mfs_dev_major, sbi->minor), sbi, + "mfs%u", sbi->minor); + if (IS_ERR(dev)) { + idr_remove(&mfs_dev_minor, sbi->minor); + sbi->minor = -1; + mutex_unlock(&mfs_dev_lock); + return PTR_ERR(dev); + } + mutex_unlock(&mfs_dev_lock); + return 0; +} + +void mfs_fs_dev_exit(struct super_block *sb) +{ + struct mfs_sb_info *sbi = MFS_SB(sb); + + if (sbi->minor < 0) + return; + mutex_lock(&mfs_dev_lock); + device_destroy(&mfs_dev_class, MKDEV(mfs_dev_major, sbi->minor)); + idr_remove(&mfs_dev_minor, sbi->minor); + mutex_unlock(&mfs_dev_lock); + sbi->minor = -1; +} + +int mfs_dev_init(void) +{ + int ret; + + mfs_dev_major = register_chrdev(0, "mfs-ctl", &mfs_dev_fops); + if (mfs_dev_major < 0) + return mfs_dev_major; + + ret = class_register(&mfs_dev_class); + if (ret) + goto major_out; + + mfs_dev = device_create(&mfs_dev_class, NULL, + MKDEV(mfs_dev_major, U8_MAX), + NULL, "mfs-ctl"); + if (IS_ERR(mfs_dev)) { + ret = PTR_ERR(mfs_dev); + goto class_out; + } + return 0; + +class_out: + class_unregister(&mfs_dev_class); +major_out: + unregister_chrdev(mfs_dev_major, "mfs-ctl"); + return ret; +} + +void mfs_dev_exit(void) +{ + if (!IS_ERR_OR_NULL(mfs_dev)) + device_destroy(&mfs_dev_class, MKDEV(mfs_dev_major, U8_MAX)); + class_unregister(&mfs_dev_class); + if (mfs_dev_major > 0) + unregister_chrdev(mfs_dev_major, "mfs-ctl"); +} diff --git a/fs/mfs/inode.c b/fs/mfs/inode.c new file mode 100644 index 0000000000000000000000000000000000000000..e45a5da7a67cb5ab4eedd1f51310083405a953f7 --- /dev/null +++ b/fs/mfs/inode.c @@ -0,0 +1,303 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* Copyright (C) 2025. 
Huawei Technologies Co., Ltd */ + +#include "internal.h" + +#include +#include +#include + +#include + +static int mfs_inode_eq(struct inode *inode, void *lower_target) +{ + return mfs_lower_inode(inode) == (struct inode *)lower_target; +} + +static int mfs_inode_set(struct inode *inode, void *lower_target) +{ + return 0; +} + +static struct inode *_mfs_get_inode(struct super_block *sb, + struct path *lower_path, + struct path *cache_path) +{ + struct mfs_sb_info *sbi = MFS_SB(sb); + struct inode *ret, *lower_inode, *cache_inode; + + lower_inode = d_inode(lower_path->dentry); + cache_inode = d_inode(cache_path->dentry); + + /* lower file system cannot change */ + if (lower_inode->i_sb != sbi->lower.dentry->d_sb) { + ret = ERR_PTR(-EXDEV); + goto out; + } + + /* check consistency: mode and size */ + if ((lower_inode->i_mode & S_IFMT) != (cache_inode->i_mode & S_IFMT)) { + ret = ERR_PTR(-EUCLEAN); + goto out; + } + if (S_ISREG(lower_inode->i_mode) + && lower_inode->i_size != cache_inode->i_size) { + ret = ERR_PTR(-EUCLEAN); + goto out; + } + + /* allocate new inode for mfs */ + ret = mfs_iget(sb, lower_inode, cache_path); +out: + return ret; +} + +static int _lookup_create(struct path *lpath, struct path *parent_cpath, + const char *name, struct path *cpath) +{ + struct dentry *ldentry, *parent_cdentry, *dentry; + struct inode *linode, *cdir; + int ret = 0, _ret; + + ldentry = lpath->dentry; + parent_cdentry = parent_cpath->dentry; + linode = d_inode(ldentry); + cdir = d_inode(parent_cpath->dentry); + + inode_lock_nested(cdir, I_MUTEX_PARENT); +retry: + dentry = lookup_one_len(name, parent_cdentry, strlen(name)); + if (IS_ERR(dentry)) { + ret = PTR_ERR(dentry); + goto out; + } + + cpath->mnt = mntget(parent_cpath->mnt); + cpath->dentry = dentry; + if (d_is_positive(dentry)) + goto out; + + if (d_is_dir(ldentry)) { + ret = vfs_mkdir(&nop_mnt_idmap, cdir, dentry, linode->i_mode); + if (ret) + goto new_err; + /* + * In the event that the filesystem does not use the @dentry + * but leaves it negative or unhashes it. 
+ */ + if (unlikely(d_unhashed(dentry))) { + mntput(parent_cpath->mnt); + dput(dentry); + goto retry; + } + } else { + /* dir or file, symlink will be considerred the regular file */ + ret = vfs_create(&nop_mnt_idmap, cdir, dentry, linode->i_mode, true); + if (ret) + goto new_err; + ret = vfs_truncate(cpath, linode->i_size); + if (ret) + goto truncate_err; + } + goto out; + +truncate_err: + _ret = vfs_unlink(&nop_mnt_idmap, cdir, dentry, NULL); + if (_ret) + pr_err("cleanup failed for file:%s, err:%d\n", name, _ret); +new_err: + mntput(parent_cpath->mnt); + dput(dentry); +out: + inode_unlock(cdir); + return ret; +} + +static struct dentry *mfs_lookup(struct inode *dir, struct dentry *dentry, + unsigned int flag) +{ + struct path parent_lpath, parent_cpath, lpath, cpath; + struct dentry *ret, *parent; + struct inode *inode; + const char *name; + int err; + + trace_mfs_lookup(dir, dentry, flag); + parent = dget_parent(dentry); + mfs_get_path(parent, &parent_lpath, &parent_cpath); + err = mfs_alloc_dentry_info(dentry); + if (err) { + ret = ERR_PTR(err); + goto out; + } + /* lookup from lower layer */ + name = dentry->d_name.name; + err = vfs_path_lookup(parent_lpath.dentry, + parent_lpath.mnt, + name, 0, &lpath); + if (err) { + ret = ERR_PTR(err); + mfs_free_dentry_info(dentry); + goto out; + } + /* check from cache layer */ + err = vfs_path_lookup(parent_cpath.dentry, + parent_cpath.mnt, + name, 0, &cpath); + if (err) { + if (err != -ENOENT) { +cdentry_fail: + ret = ERR_PTR(err); + path_put(&lpath); + mfs_free_dentry_info(dentry); + goto out; + } + err = _lookup_create(&lpath, &parent_cpath, name, &cpath); + if (err) + goto cdentry_fail; + } + /* build the inode from lower layer */ + inode = _mfs_get_inode(dir->i_sb, &lpath, &cpath); + if (IS_ERR(inode)) { + path_put(&lpath); + path_put(&cpath); + mfs_free_dentry_info(dentry); + ret = ERR_PTR(PTR_ERR(inode)); + goto out; + } + mfs_install_path(dentry, &lpath, &cpath); + ret = d_splice_alias(inode, dentry); + if (IS_ERR(ret)) { + path_put(&lpath); + path_put(&cpath); + mfs_free_dentry_info(dentry); + } +out: + mfs_put_path(&parent_lpath, &parent_cpath); + dput(parent); + return ret; +} + +static int mfs_getattr(struct mnt_idmap *idmap, const struct path *path, + struct kstat *stat, u32 request_mask, + unsigned int query_flags) +{ + struct mfs_inode *vi = MFS_I(d_inode(path->dentry)); + + generic_fillattr(idmap, request_mask, vi->lower, stat); + return 0; +} + +static const char *mfs_get_link(struct dentry *dentry, + struct inode *inode, + struct delayed_call *done) +{ + struct mfs_sb_info *sbi = MFS_SB(inode->i_sb); + struct path lpath, cpath; + struct dentry *ldentry; + const char *p; + + if (!dentry) + return ERR_PTR(-ECHILD); + + mfs_get_path(dentry, &lpath, &cpath); + ldentry = lpath.dentry; + p = vfs_get_link(ldentry, done); + mfs_put_path(&lpath, &cpath); + + if (IS_ERR(p) || p[0] != '/') + return p; + if (strlen(p) <= strlen(sbi->mtree)) + return ERR_PTR(-EXDEV); + if (strncmp(sbi->mtree, p, strlen(sbi->mtree)) != 0) + return ERR_PTR(-EXDEV); + p += strlen(sbi->mtree); + if (p[0] != '/') + return ERR_PTR(-EXDEV); + p += 1; + return p; +} + +static const struct inode_operations mfs_dir_iops = { + .lookup = mfs_lookup, + .getattr = mfs_getattr, +}; + +static const struct inode_operations mfs_symlink_iops = { + .getattr = mfs_getattr, + .get_link = mfs_get_link, +}; + +static const struct inode_operations mfs_file_iops = { + .getattr = mfs_getattr, +}; + +struct inode *mfs_iget(struct super_block *sb, struct inode *lower_inode, + struct 
path *cache_path) +{ + struct inode *inode, *cache_inode = d_inode(cache_path->dentry); + struct mfs_inode *vi; + int err; + + if (!igrab(lower_inode)) + return ERR_PTR(-ESTALE); + if (!igrab(cache_inode)) { + err = -ESTALE; + goto err_put_lower; + } + inode = iget5_locked(sb, lower_inode->i_ino, + mfs_inode_eq, + mfs_inode_set, + lower_inode); + if (!inode) { + err = -ENOMEM; + goto err_put_cache; + } + /* found in cache */ + if (!(inode->i_state & I_NEW)) { + iput(cache_inode); + iput(lower_inode); + return inode; + } + /* new inode */ + vi = MFS_I(inode); + inode->i_ino = lower_inode->i_ino; + switch (lower_inode->i_mode & S_IFMT) { + case S_IFREG: + inode->i_op = &mfs_file_iops; + inode->i_fop = &mfs_file_fops; + break; + case S_IFDIR: + inode->i_op = &mfs_dir_iops; + inode->i_fop = &mfs_dir_fops; + break; + case S_IFLNK: + inode->i_op = &mfs_symlink_iops; + break; + default: + err = -EOPNOTSUPP; + goto err_inode; + } + inode->i_mapping->a_ops = &mfs_aops; + if (S_ISREG(cache_inode->i_mode)) { + vi->vfs_inode.i_private = mfs_alloc_object(inode, cache_path); + if (IS_ERR(vi->vfs_inode.i_private)) { + err = PTR_ERR(vi->vfs_inode.i_private); + vi->vfs_inode.i_private = NULL; + goto err_inode; + } + } + vi->lower = lower_inode; + vi->cache = cache_inode; + fsstack_copy_attr_all(inode, lower_inode); + fsstack_copy_inode_size(inode, lower_inode); + unlock_new_inode(inode); + return inode; +err_inode: + iget_failed(inode); +err_put_cache: + iput(cache_inode); +err_put_lower: + iput(lower_inode); + return ERR_PTR(err); +} diff --git a/fs/mfs/internal.h b/fs/mfs/internal.h new file mode 100644 index 0000000000000000000000000000000000000000..66c6c0a38c68354583903d49949fab52de6d075f --- /dev/null +++ b/fs/mfs/internal.h @@ -0,0 +1,232 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* Copyright (C) 2025. 
Huawei Technologies Co., Ltd */ + +#ifndef _MFS_INTERNAL_H +#define _MFS_INTERNAL_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define MFS_NAME "mfs" + +#define MFS_OPEN_FLAGS (O_NOATIME) +#define MFS_EVENT_NEW XA_MARK_1 + +/* mfs_sb_info flags */ +#define MFS_MOUNTED 0 + +/* mfs_caches flags */ +#define MFS_CACHE_READY 0 +#define MFS_CACHE_OPENED 1 + +struct mfs_cache_object { + struct file *cache_file; + struct inode *mfs_inode; + + struct rw_semaphore rwsem; + int fd; /* file handle */ + struct file *anon_file; /* related with fd */ +}; + +struct mfs_syncer { + atomic_t notback; + struct list_head head; + spinlock_t list_lock; + struct completion done; + atomic_t res; +}; + +struct mfs_event { + refcount_t ref; + struct mfs_cache_object *object; + struct mfs_syncer *syncer; + struct list_head link; + struct mfs_msg msg; +}; + +struct mfs_caches { + struct xarray events; + wait_queue_head_t pollwq; + unsigned long next_msg; + unsigned long next_ev; + unsigned long flags; +}; + +struct mfs_sb_info { + int mode; + char *mtree; + char *cachedir; + struct path lower; + struct path cache; + + int minor; + + unsigned long flags; + struct super_block *sb; + + struct mfs_caches caches; +}; + +struct mfs_inode { + struct inode *lower; + struct inode *cache; + struct mutex lock; + struct inode vfs_inode; +}; + +struct mfs_file_info { + struct file *lower; + struct file *cache; + const struct vm_operations_struct *cache_vm_ops; +}; + +struct mfs_dentry_info { + spinlock_t lock; + struct path lower; + struct path cache; +}; + +#define MFS_SB(sb) ((struct mfs_sb_info *)(sb)->s_fs_info) +#define MFS_I(ptr) container_of(ptr, struct mfs_inode, vfs_inode) +#define MFS_D(dent) ((struct mfs_dentry_info *)(dent)->d_fsdata) + +extern const struct file_operations mfs_dir_fops; +extern const struct file_operations mfs_file_fops; +extern const struct address_space_operations mfs_aops; +extern const struct vm_operations_struct mfs_file_vm_ops; + +static inline struct inode *mfs_lower_inode(const struct inode *i) +{ + return MFS_I(i)->lower; +} + +static inline void pathcpy(struct path *dst, const struct path *src) +{ + dst->dentry = src->dentry; + dst->mnt = src->mnt; +} + +/* + * dent: mfs vfs dentry + */ +static inline void mfs_get_path(const struct dentry *dent, + struct path *lpath, + struct path *cpath) +{ + spin_lock(&MFS_D(dent)->lock); + pathcpy(lpath, &MFS_D(dent)->lower); + path_get(lpath); + pathcpy(cpath, &MFS_D(dent)->cache); + path_get(cpath); + spin_unlock(&MFS_D(dent)->lock); +} + +static inline void mfs_put_path(struct path *lpath, struct path *cpath) +{ + path_put(lpath); + path_put(cpath); +} + +static inline void mfs_install_path(const struct dentry *dent, + struct path *lpath, + struct path *cpath) +{ + spin_lock(&MFS_D(dent)->lock); + pathcpy(&MFS_D(dent)->lower, lpath); + pathcpy(&MFS_D(dent)->cache, cpath); + spin_unlock(&MFS_D(dent)->lock); +} + +static inline void mfs_release_path(const struct dentry *dent) +{ + struct path lpath, cpath; + + if (!dent || !dent->d_fsdata) + return; + spin_lock(&MFS_D(dent)->lock); + pathcpy(&lpath, &MFS_D(dent)->lower); + pathcpy(&cpath, &MFS_D(dent)->cache); + MFS_D(dent)->lower.dentry = NULL; + MFS_D(dent)->lower.mnt = NULL; + MFS_D(dent)->cache.dentry = NULL; + MFS_D(dent)->cache.mnt = NULL; + path_put(&lpath); + path_put(&cpath); + spin_unlock(&MFS_D(dent)->lock); +} + +static inline void mfs_inode_lock(struct inode *inode) +{ + mutex_lock(&MFS_I(inode)->lock); +} + +static inline void 
mfs_inode_unlock(struct inode *inode) +{ + mutex_unlock(&MFS_I(inode)->lock); +} + +static inline bool support_event(struct mfs_sb_info *sbi) +{ + return sbi->mode != MFS_MODE_NONE; +} + +static inline bool need_sync_event(struct super_block *sb) +{ + struct mfs_sb_info *sbi = MFS_SB(sb); + + return sbi->mode == MFS_MODE_REMOTE; +} + +static inline bool cache_is_ready(struct mfs_sb_info *sbi) +{ + return test_bit(MFS_CACHE_READY, &sbi->caches.flags); +} + +static inline void get_mfs_event(struct mfs_event *event) +{ + refcount_inc(&event->ref); +} + +static inline void put_mfs_event(struct mfs_event *event) +{ + if (refcount_dec_and_test(&event->ref)) { + iput(event->object->mfs_inode); + kfree(event); + } +} + +struct inode *mfs_iget(struct super_block *sb, struct inode *lower_inode, + struct path *cache_path); +int mfs_alloc_dentry_info(struct dentry *dentry); +void mfs_free_dentry_info(struct dentry *dentry); + +int mfs_fs_dev_init(struct super_block *sb); +void mfs_fs_dev_exit(struct super_block *sb); +int mfs_dev_init(void); +void mfs_dev_exit(void); + +struct mfs_event *mfs_pick_event(struct xa_state *xas, + unsigned long xa_max); +void mfs_post_event_read(struct mfs_cache_object *object, + loff_t off, uint64_t len, + struct mfs_syncer *syncer, int op); +void mfs_destroy_events(struct super_block *sb); +void mfs_cancel_syncer_events(struct mfs_cache_object *object, + struct mfs_syncer *syncer); +void mfs_cancel_all_events(struct mfs_sb_info *sbi); +int try_hook_fd(struct mfs_event *event); +struct mfs_cache_object *mfs_alloc_object(struct inode *inode, + struct path *cache_path); +void mfs_free_object(void *data); +int mfs_cache_init(void); +void mfs_cache_exit(void); + +#endif diff --git a/fs/mfs/super.c b/fs/mfs/super.c new file mode 100644 index 0000000000000000000000000000000000000000..91b1b13b0657290062bdf1f5733ce9085f8559a3 --- /dev/null +++ b/fs/mfs/super.c @@ -0,0 +1,499 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* Copyright (C) 2025. 
Huawei Technologies Co., Ltd */ + +#include "internal.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define CREATE_TRACE_POINTS +#include + +/* + * Used for alloc_inode + */ +static struct kmem_cache *mfs_inode_cachep; + +/* + * Used for dentry info + */ +static struct kmem_cache *mfs_dentry_cachep; + +static void mfs_init_once(void *obj) +{ + struct mfs_inode *i = obj; + + inode_init_once(&i->vfs_inode); +} + +static struct inode *mfs_alloc_inode(struct super_block *sb) +{ + struct mfs_inode *vi = alloc_inode_sb(sb, mfs_inode_cachep, GFP_KERNEL); + + if (!vi) + return NULL; + memset(vi, 0, offsetof(struct mfs_inode, vfs_inode)); + mutex_init(&vi->lock); + return &vi->vfs_inode; +} + +static void mfs_free_inode(struct inode *inode) +{ + struct mfs_inode *vi = MFS_I(inode); + + kmem_cache_free(mfs_inode_cachep, vi); +} + +static void mfs_evict_inode(struct inode *inode) +{ + struct mfs_inode *vi = MFS_I(inode); + struct inode *lower_inode = vi->lower; + struct inode *cache_inode = vi->cache; + + truncate_inode_pages_final(&inode->i_data); + if (inode->i_private) + mfs_free_object(inode->i_private); + clear_inode(inode); + if (lower_inode) { + vi->lower = NULL; + iput(lower_inode); + } + if (cache_inode) { + vi->cache = NULL; + iput(cache_inode); + } +} + +int mfs_alloc_dentry_info(struct dentry *dentry) +{ + struct mfs_dentry_info *info = + kmem_cache_zalloc(mfs_dentry_cachep, GFP_ATOMIC); + + if (!info) + return -ENOMEM; + spin_lock_init(&info->lock); + dentry->d_fsdata = info; + return 0; +} + +void mfs_free_dentry_info(struct dentry *dentry) +{ + if (!dentry || !dentry->d_fsdata) + return; + + kmem_cache_free(mfs_dentry_cachep, dentry->d_fsdata); + dentry->d_fsdata = NULL; +} + +static void mfs_d_release(struct dentry *dentry) +{ + /* for root, the path will release with super block */ + if (!IS_ROOT(dentry)) + mfs_release_path(dentry); + + mfs_free_dentry_info(dentry); +} + +static const struct dentry_operations mfs_dops = { + .d_release = mfs_d_release, +}; + +static int mfs_statfs(struct dentry *dentry, struct kstatfs *buf) +{ + struct mfs_sb_info *sbi = MFS_SB(dentry->d_sb); + int err = vfs_statfs(&sbi->cache, buf); + + buf->f_type = MFS_SUPER_MAGIC; + /* Use the reserved slot to keep the device id */ + buf->f_spare[0] = sbi->minor; + return err; +} + +static int mfs_show_options(struct seq_file *seq, struct dentry *root) +{ + struct mfs_sb_info *sbi = MFS_SB(root->d_sb); + + if (sbi->mtree) + seq_show_option(seq, "mtree", sbi->mtree); + if (sbi->cachedir) + seq_show_option(seq, "cachedir", sbi->cachedir); + switch (sbi->mode) { + case MFS_MODE_NONE: + seq_puts(seq, ",mode=none"); + break; + case MFS_MODE_LOCAL: + seq_puts(seq, ",mode=local"); + break; + case MFS_MODE_REMOTE: + seq_puts(seq, ",mode=remote"); + break; + } + return 0; +} + +static const struct super_operations mfs_sops = { + .alloc_inode = mfs_alloc_inode, + .free_inode = mfs_free_inode, + .drop_inode = generic_delete_inode, + .evict_inode = mfs_evict_inode, + .statfs = mfs_statfs, + .show_options = mfs_show_options, +}; + +enum { + Opt_mtree, + Opt_cachedir, + Opt_mode, +}; + +static const struct constant_table mfs_param_mode[] = { + {"none", MFS_MODE_NONE}, + {"local", MFS_MODE_LOCAL}, + {"remote", MFS_MODE_REMOTE}, + {} +}; + +static const struct fs_parameter_spec mfs_fs_parameters[] = { + fsparam_string("mtree", Opt_mtree), + fsparam_string("cachedir", Opt_cachedir), + fsparam_enum("mode", Opt_mode, mfs_param_mode), + {} +}; + +static char *remove_trailing(char 
+
+static int mfs_show_options(struct seq_file *seq, struct dentry *root)
+{
+        struct mfs_sb_info *sbi = MFS_SB(root->d_sb);
+
+        if (sbi->mtree)
+                seq_show_option(seq, "mtree", sbi->mtree);
+        if (sbi->cachedir)
+                seq_show_option(seq, "cachedir", sbi->cachedir);
+        switch (sbi->mode) {
+        case MFS_MODE_NONE:
+                seq_puts(seq, ",mode=none");
+                break;
+        case MFS_MODE_LOCAL:
+                seq_puts(seq, ",mode=local");
+                break;
+        case MFS_MODE_REMOTE:
+                seq_puts(seq, ",mode=remote");
+                break;
+        }
+        return 0;
+}
+
+static const struct super_operations mfs_sops = {
+        .alloc_inode    = mfs_alloc_inode,
+        .free_inode     = mfs_free_inode,
+        .drop_inode     = generic_delete_inode,
+        .evict_inode    = mfs_evict_inode,
+        .statfs         = mfs_statfs,
+        .show_options   = mfs_show_options,
+};
+
+enum {
+        Opt_mtree,
+        Opt_cachedir,
+        Opt_mode,
+};
+
+static const struct constant_table mfs_param_mode[] = {
+        {"none",        MFS_MODE_NONE},
+        {"local",       MFS_MODE_LOCAL},
+        {"remote",      MFS_MODE_REMOTE},
+        {}
+};
+
+static const struct fs_parameter_spec mfs_fs_parameters[] = {
+        fsparam_string("mtree",         Opt_mtree),
+        fsparam_string("cachedir",      Opt_cachedir),
+        fsparam_enum("mode",            Opt_mode, mfs_param_mode),
+        {}
+};
+
+static char *remove_trailing(char *s, char c)
+{
+        size_t size;
+        char *end;
+
+        size = strlen(s);
+        if (!size)
+                return s;
+
+        end = s + size - 1;
+        while (end >= s && c == *end)
+                end--;
+        *(end + 1) = '\0';
+        return s;
+}
+
+static char *_acquire_set_path(char *inputpath, struct path *target)
+{
+        char *p, *realp, *path;
+        char *res;
+        int ret = 0;
+
+        p = kstrdup(inputpath, GFP_KERNEL);
+        if (!p)
+                return ERR_PTR(-ENOMEM);
+        realp = remove_trailing(p, '/');
+        if (strlen(realp) == 0) {
+                kfree(p);
+                return ERR_PTR(-EINVAL);
+        }
+        ret = kern_path(realp, LOOKUP_FOLLOW | LOOKUP_DIRECTORY, target);
+        kfree(p);
+        if (ret)
+                return ERR_PTR(ret);
+
+        path = kzalloc(PATH_MAX, GFP_KERNEL);
+        if (!path) {
+                path_put(target);
+                return ERR_PTR(-ENOMEM);
+        }
+
+        realp = d_path(target, path, PATH_MAX);
+        if (IS_ERR(realp)) {
+                path_put(target);
+                res = realp;
+                goto free;
+        }
+
+        res = kstrdup(realp, GFP_KERNEL);
+        if (!res) {
+                path_put(target);
+                res = ERR_PTR(-ENOMEM);
+        }
+free:
+        kfree(path);
+        return res;
+}
+
+static int mfs_fc_parse_param(struct fs_context *fc,
+                              struct fs_parameter *param)
+{
+        struct mfs_sb_info *sbi = fc->s_fs_info;
+        struct fs_parse_result result;
+        struct path target;
+        char *p;
+        int opt;
+
+        opt = fs_parse(fc, mfs_fs_parameters, param, &result);
+        if (opt < 0)
+                return opt;
+
+        switch (opt) {
+        case Opt_mtree:
+                p = _acquire_set_path(param->string, &target);
+                if (IS_ERR(p))
+                        return PTR_ERR(p);
+                sbi->mtree = p;
+                pathcpy(&sbi->lower, &target);
+                break;
+        case Opt_cachedir:
+                p = _acquire_set_path(param->string, &target);
+                if (IS_ERR(p))
+                        return PTR_ERR(p);
+                sbi->cachedir = p;
+                pathcpy(&sbi->cache, &target);
+                break;
+        case Opt_mode:
+                sbi->mode = result.int_32;
+                break;
+        default:
+                return -ENOPARAM;
+        }
+        return 0;
+}
+
+static int mfs_fc_fill_super(struct super_block *sb, struct fs_context *fc)
+{
+        struct mfs_sb_info *sbi = MFS_SB(sb);
+        struct inode *inode;
+        int err = 0;
+
+        if (!sbi->cachedir || !sbi->mtree) {
+                pr_err("Missing mtree or cachedir option.\n");
+                return -EINVAL;
+        }
+
+        if (sbi->mode != MFS_MODE_REMOTE) {
+                if (strcmp(sbi->cachedir, sbi->mtree)) {
+                        pr_err("local/none mode requires mtree and cachedir to be the same.\n");
+                        return -EINVAL;
+                }
+        } else {
+                if (!strcmp(sbi->cachedir, sbi->mtree)) {
+                        pr_err("remote mode requires different mtree and cachedir.\n");
+                        return -EINVAL;
+                }
+                if (strlen(sbi->cachedir) > strlen(sbi->mtree) &&
+                    strncmp(sbi->mtree, sbi->cachedir, strlen(sbi->mtree)) == 0) {
+                        pr_err("remote mode: mtree must not be a parent of cachedir.\n");
+                        return -EINVAL;
+                }
+        }
+
+        sb->s_stack_depth = max(sbi->lower.mnt->mnt_sb->s_stack_depth,
+                                sbi->cache.mnt->mnt_sb->s_stack_depth) + 1;
+        if (sb->s_stack_depth > 1) {
+                pr_err("cannot be stacked on another stackable file system.\n");
+                return -EINVAL;
+        }
+
+        sb->s_magic = MFS_SUPER_MAGIC;
+        sb->s_flags |= SB_RDONLY | SB_NOATIME;
+        sb->s_maxbytes = MAX_LFS_FILESIZE;
+        sb->s_op = &mfs_sops;
+        sb->s_d_op = &mfs_dops;
+        err = super_setup_bdi(sb);
+        if (err)
+                return err;
+
+        if (support_event(sbi)) {
+                err = mfs_fs_dev_init(sb);
+                if (err)
+                        return err;
+        }
+
+        inode = mfs_iget(sb, d_inode(sbi->lower.dentry), &sbi->cache);
+        if (IS_ERR(inode)) {
+                err = PTR_ERR(inode);
+                goto out_exit;
+        }
+
+        sb->s_root = d_make_root(inode);
+        if (!sb->s_root) {
+                /* d_make_root() already dropped the inode on failure */
+                err = -ENOMEM;
+                goto out_exit;
+        }
+
+        err = mfs_alloc_dentry_info(sb->s_root);
+        if (err)
+                goto out_dput;
+        mfs_install_path(sb->s_root, &sbi->lower, &sbi->cache);
+        sbi->sb = sb;
+        set_bit(MFS_MOUNTED, &sbi->flags);
+        return 0;
+out_dput:
+        dput(sb->s_root);
+out_exit:
+        if (support_event(sbi))
+                mfs_fs_dev_exit(sb);
+        return err;
+}
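+
+/*
+ * Example invocations accepted by the checks above (illustrative paths;
+ * MFS is nodev, so the source string is arbitrary):
+ *
+ *        mount -t mfs -o mtree=/lower,cachedir=/lower,mode=local none /mnt/mfs
+ *        mount -t mfs -o mtree=/meta,cachedir=/cache,mode=remote none /mnt/mfs
+ */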
+
+static int mfs_fc_get_tree(struct fs_context *fc)
+{
+        return get_tree_nodev(fc, mfs_fc_fill_super);
+}
+
+static int mfs_reconfigure(struct fs_context *fc)
+{
+        return -EOPNOTSUPP;
+}
+
+static void mfs_fc_free(struct fs_context *fc)
+{
+        struct mfs_sb_info *sbi = fc->s_fs_info;
+
+        if (!sbi)
+                return;
+
+        if (sbi->mtree) {
+                path_put(&sbi->lower);
+                kfree(sbi->mtree);
+        }
+        if (sbi->cachedir) {
+                path_put(&sbi->cache);
+                kfree(sbi->cachedir);
+        }
+        kfree(sbi);
+}
+
+static const struct fs_context_operations mfs_context_ops = {
+        .parse_param    = mfs_fc_parse_param,
+        .get_tree       = mfs_fc_get_tree,
+        .reconfigure    = mfs_reconfigure,
+        .free           = mfs_fc_free,
+};
+
+static int mfs_init_fs_context(struct fs_context *fc)
+{
+        struct mfs_sb_info *sbi;
+
+        sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
+        if (!sbi)
+                return -ENOMEM;
+
+        init_waitqueue_head(&sbi->caches.pollwq);
+        xa_init_flags(&sbi->caches.events, XA_FLAGS_ALLOC);
+        sbi->minor = -1;
+        fc->s_fs_info = sbi;
+        fc->ops = &mfs_context_ops;
+        return 0;
+}
+
+static void mfs_kill_sb(struct super_block *sb)
+{
+        struct mfs_sb_info *sbi = MFS_SB(sb);
+        struct mfs_caches *caches = &sbi->caches;
+
+        clear_bit(MFS_MOUNTED, &sbi->flags);
+        if (support_event(sbi)) {
+                while (test_bit(MFS_CACHE_OPENED, &caches->flags)) {
+                        static DEFINE_RATELIMIT_STATE(busy_open, 30 * HZ, 1);
+
+                        msleep(100);
+                        if (!__ratelimit(&busy_open))
+                                continue;
+                        pr_warn("Pending until /dev/mfs%u is closed...\n", sbi->minor);
+                }
+                mfs_fs_dev_exit(sb);
+        }
+        kill_anon_super(sb);
+        mfs_destroy_events(sb);
+        if (sbi->mtree) {
+                path_put(&sbi->lower);
+                kfree(sbi->mtree);
+        }
+        if (sbi->cachedir) {
+                path_put(&sbi->cache);
+                kfree(sbi->cachedir);
+        }
+        kfree(sbi);
+        sb->s_fs_info = NULL;
+}
+
+static struct file_system_type mfs_fs_type = {
+        .owner          = THIS_MODULE,
+        .name           = MFS_NAME,
+        .init_fs_context = mfs_init_fs_context,
+        .kill_sb        = mfs_kill_sb,
+        .fs_flags       = 0,
+};
+MODULE_ALIAS_FS(MFS_NAME);
+
+static int __init init_mfs_fs(void)
+{
+        int err;
+
+        mfs_inode_cachep =
+                kmem_cache_create("mfs_inode",
+                                  sizeof(struct mfs_inode), 0,
+                                  SLAB_RECLAIM_ACCOUNT, mfs_init_once);
+        if (!mfs_inode_cachep)
+                return -ENOMEM;
+
+        mfs_dentry_cachep =
+                kmem_cache_create("mfs_dentry",
+                                  sizeof(struct mfs_dentry_info), 0,
+                                  SLAB_RECLAIM_ACCOUNT, NULL);
+        if (!mfs_dentry_cachep) {
+                err = -ENOMEM;
+                goto err_dentryp;
+        }
+
+        err = mfs_cache_init();
+        if (err)
+                goto err_cache;
+
+        err = register_filesystem(&mfs_fs_type);
+        if (err)
+                goto err_register;
+
+        err = mfs_dev_init();
+        if (err)
+                goto err_dev;
+
+        pr_info("MFS module loaded\n");
+        return 0;
+err_dev:
+        unregister_filesystem(&mfs_fs_type);
+err_register:
+        mfs_cache_exit();
+err_cache:
+        kmem_cache_destroy(mfs_dentry_cachep);
+err_dentryp:
+        kmem_cache_destroy(mfs_inode_cachep);
+        return err;
+}
+
+static void __exit exit_mfs_fs(void)
+{
+        mfs_dev_exit();
+        unregister_filesystem(&mfs_fs_type);
+
+        /* Make sure all delayed rcu free inodes are safe to be destroyed. */
+        rcu_barrier();
+        mfs_cache_exit();
+        kmem_cache_destroy(mfs_dentry_cachep);
+        kmem_cache_destroy(mfs_inode_cachep);
+        pr_info("MFS module unloaded\n");
+}
+
+module_init(init_mfs_fs);
+module_exit(exit_mfs_fs);
+
+MODULE_AUTHOR("Hongbo Li ");
+MODULE_AUTHOR("Xiaojia Huang ");
+MODULE_DESCRIPTION("MFS filesystem for Linux");
+MODULE_LICENSE("GPL");
diff --git a/include/trace/events/mfs.h b/include/trace/events/mfs.h
new file mode 100644
index 0000000000000000000000000000000000000000..5963888d0993e833b079bdf8337f45d63491803e
--- /dev/null
+++ b/include/trace/events/mfs.h
@@ -0,0 +1,105 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM mfs
+
+#if !defined(_TRACE_MFS_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_MFS_H
+
+#include <linux/tracepoint.h>
+#include <linux/fs.h>
+
+TRACE_EVENT(mfs_lookup,
+        TP_PROTO(struct inode *dir, struct dentry *dentry, unsigned int flag),
+        TP_ARGS(dir, dentry, flag),
+        TP_STRUCT__entry(
+                __field(dev_t, dev)
+                __field(ino_t, ino)
+                __string(name, dentry->d_name.name)
+                __field(unsigned int, flag)
+        ),
+        TP_fast_assign(
+                __entry->dev = dir->i_sb->s_dev;
+                __entry->ino = dir->i_ino;
+                __assign_str(name, dentry->d_name.name);
+                __entry->flag = flag;
+        ),
+
+        TP_printk("dev=%d ino=%lu name=%s flag=%x",
+                  MINOR(__entry->dev), __entry->ino, __get_str(name), __entry->flag)
+);
+
+DECLARE_EVENT_CLASS(mfs_file_normal,
+        TP_PROTO(struct inode *inode, struct file *file),
+        TP_ARGS(inode, file),
+        TP_STRUCT__entry(
+                __field(dev_t, dev)
+                __field(ino_t, ino)
+                __field(mode_t, mode)
+        ),
+        TP_fast_assign(
+                __entry->dev = inode->i_sb->s_dev;
+                __entry->ino = inode->i_ino;
+                __entry->mode = file->f_mode;
+        ),
+
+        TP_printk("dev=%d ino=%lu mode=%o",
+                  MINOR(__entry->dev), __entry->ino, __entry->mode)
+);
+
+DEFINE_EVENT(mfs_file_normal, mfs_open,
+        TP_PROTO(struct inode *inode, struct file *file),
+        TP_ARGS(inode, file)
+);
+
+DEFINE_EVENT(mfs_file_normal, mfs_release,
+        TP_PROTO(struct inode *inode, struct file *file),
+        TP_ARGS(inode, file)
+);
+
+TRACE_EVENT(mfs_post_event_read,
+        TP_PROTO(struct inode *inode, loff_t off, uint64_t len, int op),
+        TP_ARGS(inode, off, len, op),
+        TP_STRUCT__entry(
+                __field(dev_t, dev)
+                __field(ino_t, ino)
+                __field(loff_t, off)
+                __field(uint64_t, len)
+                __field(int, op)
+        ),
+        TP_fast_assign(
+                __entry->dev = inode->i_sb->s_dev;
+                __entry->ino = inode->i_ino;
+                __entry->off = off;
+                __entry->len = len;
+                __entry->op = op;
+        ),
+
+        TP_printk("(miss) dev=%d ino=%lu off=%lld len=%llu op=%d",
+                  MINOR(__entry->dev), __entry->ino, __entry->off, __entry->len, __entry->op)
+);
+
+TRACE_EVENT(mfs_dev_read,
+        TP_PROTO(struct file *file, int op, uint32_t msgid, uint32_t fd),
+        TP_ARGS(file, op, msgid, fd),
+        TP_STRUCT__entry(
+                __field(dev_t, dev)
+                __field(ino_t, ino)
+                __field(int, op)
+                __field(uint32_t, msgid)
+                __field(uint32_t, fd)
+        ),
+        TP_fast_assign(
+                __entry->dev = file->f_inode->i_sb->s_dev;
+                __entry->ino = file->f_inode->i_ino;
+                __entry->op = op;
+                __entry->msgid = msgid;
+                __entry->fd = fd;
+        ),
+
+        TP_printk("dev=%d ino=%lu op=%d msgid=%u fd=%u",
+                  MINOR(__entry->dev), __entry->ino, __entry->op, __entry->msgid, __entry->fd)
+);
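+
+/*
+ * The events above can be inspected at runtime through tracefs, e.g.:
+ *
+ *        echo 1 > /sys/kernel/tracing/events/mfs/enable
+ *        cat /sys/kernel/tracing/trace_pipe
+ */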
+
+#endif /* _TRACE_MFS_H */
+
+#include <trace/define_trace.h>
diff --git a/include/uapi/linux/magic.h b/include/uapi/linux/magic.h
index 6325d1d0e90f5dcdc7bdc91d612f8fc4c7b40135..4ca73708ed9edc69252985742f6de6d54908dabd 100644
--- a/include/uapi/linux/magic.h
+++ b/include/uapi/linux/magic.h
@@ -37,6 +37,7 @@
 #define HOSTFS_SUPER_MAGIC	0x00c0ffee
 #define OVERLAYFS_SUPER_MAGIC	0x794c7630
 #define FUSE_SUPER_MAGIC	0x65735546
+#define MFS_SUPER_MAGIC		0x85428370
 #define MINIX_SUPER_MAGIC	0x137F /* minix v1 fs, 14 char names */
 #define MINIX_SUPER_MAGIC2	0x138F /* minix v1 fs, 30 char names */
diff --git a/include/uapi/linux/mfs.h b/include/uapi/linux/mfs.h
new file mode 100644
index 0000000000000000000000000000000000000000..a7d5882b5e500082920cc4208aaaf10ebf615a64
--- /dev/null
+++ b/include/uapi/linux/mfs.h
@@ -0,0 +1,62 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+
+#ifndef _UAPI_LINUX_MFS_H
+#define _UAPI_LINUX_MFS_H
+
+#include <linux/types.h>
+#include <linux/ioctl.h>
+
+enum mfs_opcode {
+        MFS_OP_READ = 0,
+        MFS_OP_FAULT,
+        MFS_OP_FAROUND,
+};
+
+enum {
+        MFS_MODE_NONE = 0,
+        MFS_MODE_LOCAL,
+        MFS_MODE_REMOTE,
+};
+
+struct mfs_ioc_ra {
+        __u64 off;
+        __u64 len;
+};
+
+struct mfs_ioc_done {
+        __u32 id;
+        __u32 ret;
+};
+
+struct mfs_ioc_rpath {
+        __u16 max;
+        __u16 len;
+        __u8 d[];
+};
+
+#define MFS_IOC_RA	_IOW(0xbc, 1, struct mfs_ioc_ra)
+#define MFS_IOC_DONE	_IOW(0xbc, 2, struct mfs_ioc_done)
+#define MFS_IOC_RPATH	_IOWR(0xbc, 3, struct mfs_ioc_rpath)
+
+struct mfs_ioc_fsinfo {
+        __u8 mode;	/* 0: none, 1: local, 2: remote */
+};
+
+#define MFS_IOC_FSINFO	_IOR(0xbd, 1, struct mfs_ioc_fsinfo)
+
+struct mfs_msg {
+        __u8 version;
+        __u8 opcode;
+        __u16 len;
+        __u32 fd;
+        __u32 id;
+        __u8 data[];
+};
+
+struct mfs_read {
+        __u64 off;
+        __u64 len;
+        __s32 pid;
+};
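+
+/*
+ * Typical daemon-side flow (sketch; see tools/mfs/mfsd.c for a full demo):
+ *
+ *        struct mfs_msg *msg = (struct mfs_msg *)buf;  // one message from read(devfd)
+ *
+ *        // local mode: asynchronous readahead hint
+ *        struct mfs_ioc_ra ra = { .off = off, .len = len };
+ *        ioctl(msg->fd, MFS_IOC_RA, &ra);
+ *
+ *        // remote mode: fetch and write() the data, then complete the event
+ *        struct mfs_ioc_done done = { .id = msg->id, .ret = 0 };
+ *        ioctl(msg->fd, MFS_IOC_DONE, &done);
+ */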
+
+#endif /* _UAPI_LINUX_MFS_H */
diff --git a/tools/mfs/.gitignore b/tools/mfs/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..55428a223113f0f0ff58d49fba34b6afd86d5cce
--- /dev/null
+++ b/tools/mfs/.gitignore
@@ -0,0 +1,2 @@
+# SPDX-License-Identifier: GPL-2.0-only
+mfsd
diff --git a/tools/mfs/Makefile b/tools/mfs/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..a6e3e970b63e0bf6e43337fbb727e354e8f6df4d
--- /dev/null
+++ b/tools/mfs/Makefile
@@ -0,0 +1,13 @@
+# SPDX-License-Identifier: GPL-2.0
+# Makefile for mfs demo
+
+CFLAGS = -Wall -Wextra
+
+PROGS := mfsd
+
+all: $(PROGS)
+%: %.c
+	$(CC) $(CFLAGS) -o $@ $^
+
+clean:
+	$(RM) $(PROGS)
diff --git a/tools/mfs/mfsd.c b/tools/mfs/mfsd.c
new file mode 100644
index 0000000000000000000000000000000000000000..865b765c31ed45f6009ee291b0a72e4cff74b1fe
--- /dev/null
+++ b/tools/mfs/mfsd.c
@@ -0,0 +1,186 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * User-space demo of mfs
+ *
+ * Example use:
+ *        ./mfsd [mfs_mountpoint]
+ * mfsd.c demonstrates how to poll the mfs device, read the events,
+ * parse the events, process the events according to the running mode
+ * and trigger the ioctls mfs supports.
+ *
+ * See Documentation/filesystems/mfs.rst
+ */
+
+#define _GNU_SOURCE
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <poll.h>
+#include <sys/ioctl.h>
+#include <sys/statfs.h>
+
+#include "../../include/uapi/linux/mfs.h"
+#include "../../include/uapi/linux/magic.h"
+
+#define pr_err(fmt, ...) fprintf(stderr, fmt, ##__VA_ARGS__)
+
+static int mfs_mode = -1;
+
+static int process_local_read(struct mfs_msg *msg)
+{
+        struct mfs_read *read = (struct mfs_read *)msg->data;
+        struct mfs_ioc_ra ra;
+        int fd = msg->fd;
+        int ret;
+
+        ra.off = read->off;
+        ra.len = read->len;
+
+        ret = ioctl(fd, MFS_IOC_RA, &ra);
+        if (ret)
+                perror("ioctl MFS_IOC_RA failed");
+
+        return ret;
+}
+
+static int process_remote_read(struct mfs_msg *msg)
+{
+        struct mfs_ioc_rpath *rpath;
+        struct mfs_ioc_done done;
+        int fd = msg->fd;
+        int ret;
+
+        rpath = malloc(sizeof(struct mfs_ioc_rpath) + 1024);
+        if (!rpath) {
+                pr_err("malloc for path failed\n");
+                return -1;
+        }
+        rpath->max = 1024;
+        ret = ioctl(fd, MFS_IOC_RPATH, (unsigned long)rpath);
+        if (ret) {
+                free(rpath);
+                perror("ioctl failed");
+                return -1;
+        }
+        free(rpath);
+
+        /*
+         * A real daemon would fetch the missing range from remote storage
+         * and write() it to fd here before completing the request.
+         */
+        done.id = msg->id;
+        done.ret = 0;
+        ret = ioctl(fd, MFS_IOC_DONE, (unsigned long)&done);
+        if (ret)
+                perror("failed to ioctl MFS_IOC_DONE");
+
+        return ret;
+}
+
+static int process_read(struct mfs_msg *msg)
+{
+        int ret;
+
+        if (mfs_mode == MFS_MODE_REMOTE)
+                ret = process_remote_read(msg);
+        else if (mfs_mode == MFS_MODE_LOCAL)
+                ret = process_local_read(msg);
+        else
+                ret = -EINVAL;
+        return ret;
+}
+
+static int process_req(int fd)
+{
+        char buf[1024];
+        struct mfs_msg *msg;
+        int ret;
+
+        memset(buf, 0, sizeof(buf));
+        ret = read(fd, buf, sizeof(buf));
+        if (ret <= 0) {
+                if (ret < 0)
+                        pr_err("read failed, ret:%d\n", ret);
+                return -1;
+        }
+
+        msg = (void *)buf;
+        if (ret != msg->len) {
+                pr_err("invalid message length, read:%d, need:%d\n", ret, msg->len);
+                return -1;
+        }
+        if (msg->opcode == MFS_OP_READ || msg->opcode == MFS_OP_FAULT ||
+            msg->opcode == MFS_OP_FAROUND) {
+                return process_read(msg);
+        }
+        pr_err("invalid opcode:%d\n", msg->opcode);
+        return -1;
+}
+
+static void ioctl_mfs_mode(int fd)
+{
+        struct mfs_ioc_fsinfo fsinfo = {0};
+        int ret;
+
+        ret = ioctl(fd, MFS_IOC_FSINFO, (unsigned long)&fsinfo);
+        if (ret < 0) {
+                perror("failed to ioctl mfs_ioc_fsinfo");
+                close(fd);
+                exit(-1);
+        }
+
+        mfs_mode = fsinfo.mode;
+}
+
+int main(int argc, char *argv[])
+{
+        struct pollfd pfd;
+        struct statfs buf;
+        char *mountpoint;
+        char devname[32];	/* large enough for "/dev/mfs" plus any minor */
+        int fd, ret;
+
+        if (argc != 2) {
+                printf("Usage: ./mfsd <mfs_mountpoint>\n");
+                return -1;
+        }
+        mountpoint = argv[1];
+
+        ret = statfs(mountpoint, &buf);
+        if (ret) {
+                pr_err("statfs %s failed\n", mountpoint);
+                return -1;
+        }
+        if (buf.f_type != MFS_SUPER_MAGIC) {
+                pr_err("fstype(%lx) is invalid, please check the mountpoint\n", buf.f_type);
+                return -1;
+        }
+
+        sprintf(devname, "/dev/mfs%ld", buf.f_spare[0]);
+        fd = open(devname, O_RDWR);
+        if (fd < 0) {
+                pr_err("open %s failed\n", devname);
+                return -1;
+        }
+
+        ioctl_mfs_mode(fd);
+        pfd.fd = fd;
+        pfd.events = POLLIN;
+
+        while (1) {
+                ret = poll(&pfd, 1, -1);
+                if (ret < 0) {
+                        pr_err("poll failed\n");
+                        return -1;
+                }
+
+                if (ret == 0 || !(pfd.revents & POLLIN)) {
+                        pr_err("poll event error, ret:%d, revents:%x\n", ret, pfd.revents);
+                        continue;
+                }
+
+                if (process_req(fd) == -1)
+                        pr_err("process req failed, errcode:%d\n", errno);
+        }
+        return 0;
+}
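+
+/*
+ * Build and run (mount point illustrative):
+ *
+ *        make -C tools/mfs
+ *        ./mfsd /mnt/mfs        # /mnt/mfs is an MFS mount point
+ */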