diff --git a/Documentation/atomic_bitops.txt b/Documentation/atomic_bitops.txt index 093cdaefdb3733ecd076b5ffbfe3ad880b2c3afd..edea4656c5c05f3da3f6ef46a8e8faf2b2a3f1a1 100644 --- a/Documentation/atomic_bitops.txt +++ b/Documentation/atomic_bitops.txt @@ -58,13 +58,11 @@ Like with atomic_t, the rule of thumb is: - RMW operations that have a return value are fully ordered. - - RMW operations that are conditional are unordered on FAILURE, - otherwise the above rules apply. In the case of test_and_{}_bit() operations, - if the bit in memory is unchanged by the operation then it is deemed to have - failed. + - RMW operations that are conditional are fully ordered. -Except for a successful test_and_set_bit_lock() which has ACQUIRE semantics and -clear_bit_unlock() which has RELEASE semantics. +Except for a successful test_and_set_bit_lock() which has ACQUIRE semantics, +clear_bit_unlock() which has RELEASE semantics and test_bit_acquire which has +ACQUIRE semantics. Since a platform only has a single means of achieving atomic operations the same barriers as for atomic_t are used, see atomic_t.txt. diff --git a/Documentation/filesystems/erofs.rst b/Documentation/filesystems/erofs.rst index bf145171c2bf8f89095dcf8d316ad7d14a6a16e2..693d7a32845a68057d591668f61ec44c3cccf18e 100644 --- a/Documentation/filesystems/erofs.rst +++ b/Documentation/filesystems/erofs.rst @@ -19,9 +19,10 @@ It is designed as a better filesystem solution for the following scenarios: immutable and bit-for-bit identical to the official golden image for their releases due to security and other considerations and - - hope to save some extra storage space with guaranteed end-to-end performance - by using reduced metadata and transparent file compression, especially - for those embedded devices with limited memory (ex, smartphone); + - hope to minimize extra storage space with guaranteed end-to-end performance + by using compact layout, transparent file compression and direct access, + especially for those embedded devices with limited memory and high-density + hosts with numerous containers; Here is the main features of EROFS: @@ -51,7 +52,9 @@ Here is the main features of EROFS: - Support POSIX.1e ACLs by using xattrs; - Support transparent file compression as an option: - LZ4 algorithm with 4 KB fixed-sized output compression for high performance. + LZ4 algorithm with 4 KB fixed-sized output compression for high performance; + + - Multiple device support for multi-layer container images. The following git tree provides the file system user-space tools under development (ex, formatting tool mkfs.erofs): @@ -84,6 +87,7 @@ cache_strategy=%s Select a strategy for cached decompression from now on: It still does in-place I/O decompression for the rest compressed physical clusters. ========== ============================================= +device=%s Specify a path to an extra device to be used together. =================== ========================================================= On-disk details @@ -153,13 +157,14 @@ may not. All metadatas can be now observed in two different spaces (views): Xattrs, extents, data inline are followed by the corresponding inode with proper alignment, and they could be optional for different data mappings. 
- _currently_ total 4 valid data mappings are supported: + _currently_ total 5 data layouts are supported: == ==================================================================== 0 flat file data without data inline (no extent); 1 fixed-sized output data compression (with non-compacted indexes); 2 flat file data with tail packing data inline (no extent); - 3 fixed-sized output data compression (with compacted indexes, v5.3+). + 3 fixed-sized output data compression (with compacted indexes, v5.3+); + 4 chunk-based file (v5.15+). == ==================================================================== The size of the optional xattrs is indicated by i_xattr_count in inode @@ -211,6 +216,17 @@ Note that apart from the offset of the first filename, nameoff0 also indicates the total number of directory entries in this block since it is no need to introduce another on-disk field at all. +Chunk-based file +---------------- +In order to support chunk-based data deduplication, a new inode data layout has +been supported since Linux v5.15: Files are split in equal-sized data chunks +with ``extents`` area of the inode metadata indicating how to get the chunk +data: these can be simply as a 4-byte block address array or in the 8-byte +chunk index form (see struct erofs_inode_chunk_index in erofs_fs.h for more +details.) + +By the way, chunk-based files are all uncompressed for now. + Compression ----------- Currently, EROFS supports 4KB fixed-sized output transparent file compression, diff --git a/arch/arm64/configs/openeuler_defconfig b/arch/arm64/configs/openeuler_defconfig index 4c117f2f55f116f41b8ad793d8adc1631a2768e2..9996b88ba58c5f29fc90639a0c174e69d3709e51 100644 --- a/arch/arm64/configs/openeuler_defconfig +++ b/arch/arm64/configs/openeuler_defconfig @@ -6407,6 +6407,7 @@ CONFIG_FSCACHE_STATS=y CONFIG_CACHEFILES=m # CONFIG_CACHEFILES_DEBUG is not set # CONFIG_CACHEFILES_HISTOGRAM is not set +CONFIG_CACHEFILES_ONDEMAND=y # end of Caches # @@ -6520,7 +6521,13 @@ CONFIG_PSTORE_COMPRESS_DEFAULT="deflate" CONFIG_PSTORE_RAM=m # CONFIG_SYSV_FS is not set # CONFIG_UFS_FS is not set -# CONFIG_EROFS_FS is not set +CONFIG_EROFS_FS=m +# CONFIG_EROFS_FS_DEBUG is not set +CONFIG_EROFS_FS_XATTR=y +CONFIG_EROFS_FS_POSIX_ACL=y +CONFIG_EROFS_FS_SECURITY=y +# CONFIG_EROFS_FS_ZIP is not set +CONFIG_EROFS_FS_ONDEMAND=y CONFIG_NETWORK_FILESYSTEMS=y CONFIG_NFS_FS=m CONFIG_NFS_V2=m diff --git a/arch/s390/include/asm/bitops.h b/arch/s390/include/asm/bitops.h index 431e208a5ea4c6e5fa9f680650f54cfaf6520b00..caf8099bd95548ea1281f1e9ef8966d05b20423c 100644 --- a/arch/s390/include/asm/bitops.h +++ b/arch/s390/include/asm/bitops.h @@ -241,6 +241,13 @@ static inline void arch___clear_bit_unlock(unsigned long nr, arch___clear_bit(nr, ptr); } +static __always_inline bool +arch_test_bit_acquire(unsigned long nr, const volatile unsigned long *addr) +{ + unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr); + return 1UL & (smp_load_acquire(p) >> (nr & (BITS_PER_LONG-1))); +} + #include #include #include diff --git a/arch/x86/configs/openeuler_defconfig b/arch/x86/configs/openeuler_defconfig index bfaadb4b298f7e23dd84b9f31b44f47e25d5ce92..d5282a39d8cce54d28d93877a6f8b3296fa6f7c9 100644 --- a/arch/x86/configs/openeuler_defconfig +++ b/arch/x86/configs/openeuler_defconfig @@ -7462,6 +7462,7 @@ CONFIG_FSCACHE_STATS=y CONFIG_CACHEFILES=m # CONFIG_CACHEFILES_DEBUG is not set # CONFIG_CACHEFILES_HISTOGRAM is not set +CONFIG_CACHEFILES_ONDEMAND=y # end of Caches # @@ -7577,7 +7578,13 @@ CONFIG_PSTORE_COMPRESS_DEFAULT="deflate" 
CONFIG_PSTORE_RAM=m # CONFIG_SYSV_FS is not set # CONFIG_UFS_FS is not set -# CONFIG_EROFS_FS is not set +CONFIG_EROFS_FS=m +# CONFIG_EROFS_FS_DEBUG is not set +CONFIG_EROFS_FS_XATTR=y +CONFIG_EROFS_FS_POSIX_ACL=y +CONFIG_EROFS_FS_SECURITY=y +# CONFIG_EROFS_FS_ZIP is not set +CONFIG_EROFS_FS_ONDEMAND=y CONFIG_NETWORK_FILESYSTEMS=y CONFIG_NFS_FS=m # CONFIG_NFS_V2 is not set diff --git a/arch/x86/include/asm/bitops.h b/arch/x86/include/asm/bitops.h index 0367efdc5b7a8eaf724949c840c6ec3f8ceaf643..c14e861f9956f6ddad0ed9f689ab1f617d79ab1b 100644 --- a/arch/x86/include/asm/bitops.h +++ b/arch/x86/include/asm/bitops.h @@ -207,6 +207,20 @@ static __always_inline bool constant_test_bit(long nr, const volatile unsigned l (addr[nr >> _BITOPS_LONG_SHIFT])) != 0; } +static __always_inline bool constant_test_bit_acquire(long nr, const volatile unsigned long *addr) +{ + bool oldbit; + + asm volatile("testb %2,%1" + CC_SET(nz) + : CC_OUT(nz) (oldbit) + : "m" (((unsigned char *)addr)[nr >> 3]), + "i" (1 << (nr & 7)) + : "memory"); + + return oldbit; +} + static __always_inline bool variable_test_bit(long nr, volatile const unsigned long *addr) { bool oldbit; @@ -224,6 +238,13 @@ static __always_inline bool variable_test_bit(long nr, volatile const unsigned l ? constant_test_bit((nr), (addr)) \ : variable_test_bit((nr), (addr))) +static __always_inline bool +arch_test_bit_acquire(unsigned long nr, const volatile unsigned long *addr) +{ + return __builtin_constant_p(nr) ? constant_test_bit_acquire(nr, addr) : + variable_test_bit(nr, addr); +} + /** * __ffs - find first set bit in word * @word: The word to search diff --git a/fs/Makefile b/fs/Makefile index 29cc13ba2c088e6f60c7d486311cb8f4d6d3e89b..06f000e22e06d432dda331d48abff43210513b17 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -15,6 +15,8 @@ obj-y := open.o read_write.o file_table.o super.o \ stack.o fs_struct.o statfs.o fs_pin.o nsfs.o \ fs_types.o fs_context.o fs_parser.o fsopen.o init.o \ kernel_read_file.o remap_range.o +obj-y += fs_ctl.o + ifdef CONFIG_CC_IS_CLANG CFLAGS_namei.o := $(call cc-disable-warning, bitwise-instead-of-logical) endif diff --git a/fs/cachefiles/Kconfig b/fs/cachefiles/Kconfig index ff9ca55a9ae9c83c6c5503e33fbff1d468409fb9..12174e2616f8f51822fc2a3d0365859fd5147402 100644 --- a/fs/cachefiles/Kconfig +++ b/fs/cachefiles/Kconfig @@ -38,3 +38,15 @@ config CACHEFILES_HISTOGRAM See Documentation/filesystems/caching/cachefiles.rst for more information. + +config CACHEFILES_ONDEMAND + bool "Support for on-demand read" + depends on CACHEFILES + default n + help + This permits userspace to enable the cachefiles on-demand read mode. + In this mode, when a cache miss occurs, responsibility for fetching + the data lies with the cachefiles backend instead of with the netfs + and is delegated to userspace. + + If unsure, say N. 
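The CACHEFILES_ONDEMAND help text above only summarises the protocol: once the daemon binds the cache with "bind ondemand", every cache miss becomes a request that the daemon reads from /dev/cachefiles and answers either with the "copen" command (for OPEN requests, as implemented later in this series in cachefiles_ondemand_copen()) or by writing the missing data into the anonymous fd and issuing CACHEFILES_IOC_READ_COMPLETE (for READ requests). The sketch below is a minimal user-space consumer of that interface, assuming the UAPI definitions this series relies on (struct cachefiles_msg, struct cachefiles_open, struct cachefiles_read, the CACHEFILES_OP_* opcodes and CACHEFILES_IOC_READ_COMPLETE from <linux/cachefiles.h>); the cache path, tag, 1 MiB object size, objfd[] table and fetch_range() back end are illustrative placeholders, not part of the patch.

/*
 * Minimal on-demand cachefiles daemon sketch (not part of this patch).
 */
#include <fcntl.h>
#include <poll.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/cachefiles.h>		/* assumed UAPI header for this series */

static int cachefd;			/* /dev/cachefiles control fd */
static int objfd[65536];		/* object_id -> anonymous fd (toy table) */

static void command(const char *cmd)
{
	/* daemon commands and the "copen" reply are plain writes to the device */
	if (write(cachefd, cmd, strlen(cmd)) < 0)
		perror(cmd);
}

/*
 * Placeholder: fetch [off, off+len) of the object from wherever the real
 * data lives (registry, network, local blob, ...) and write it into the
 * cache file through the anonymous fd.  Zero-fill stands in for real data.
 */
static void fetch_range(int anon_fd, unsigned int object_id,
			unsigned long long off, unsigned long long len)
{
	static char zeros[4096];

	while (len) {
		size_t n = len < sizeof(zeros) ? len : sizeof(zeros);

		pwrite(anon_fd, zeros, n, off);
		off += n;
		len -= n;
	}
}

int main(void)
{
	char buf[1024], reply[64];	/* buffer large enough for one request */

	cachefd = open("/dev/cachefiles", O_RDWR);
	command("dir /var/cache/fscache");	/* placeholder cache root */
	command("tag mycache");			/* placeholder tag */
	command("bind ondemand");		/* mode added by this series */

	for (;;) {
		struct pollfd pfd = { .fd = cachefd, .events = POLLIN };
		struct cachefiles_msg *msg = (void *)buf;
		ssize_t n;

		poll(&pfd, 1, -1);
		n = read(cachefd, buf, sizeof(buf));	/* one request per read() */
		if (n <= 0)
			continue;

		if (msg->opcode == CACHEFILES_OP_OPEN) {
			struct cachefiles_open *load = (void *)msg->data;

			/* The anonymous fd was installed during read(). */
			objfd[msg->object_id] = load->fd;
			/* Reply with the object size (placeholder 1 MiB),
			 * or a negative error code to fail the lookup. */
			snprintf(reply, sizeof(reply), "copen %u,%d",
				 msg->msg_id, 1024 * 1024);
			command(reply);
		} else if (msg->opcode == CACHEFILES_OP_READ) {
			struct cachefiles_read *rd = (void *)msg->data;
			int fd = objfd[msg->object_id];

			fetch_range(fd, msg->object_id, rd->off, rd->len);
			ioctl(fd, CACHEFILES_IOC_READ_COMPLETE, msg->msg_id);
		} else if (msg->opcode == CACHEFILES_OP_CLOSE) {
			close(objfd[msg->object_id]);
		}
	}
}

A restarted daemon would additionally issue the "restore" command (also added by this series) after re-binding, so that requests left half-processed by a crashed daemon are re-marked CACHEFILES_REQ_NEW and show up again in the read loop above.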
diff --git a/fs/cachefiles/Makefile b/fs/cachefiles/Makefile index 891dedda59054ef42f64ee622a04e20c8f5715f2..c247d8b5e4f9085edc5d5c414510b80e2f58e4eb 100644 --- a/fs/cachefiles/Makefile +++ b/fs/cachefiles/Makefile @@ -15,5 +15,6 @@ cachefiles-y := \ xattr.o cachefiles-$(CONFIG_CACHEFILES_HISTOGRAM) += proc.o +cachefiles-$(CONFIG_CACHEFILES_ONDEMAND) += ondemand.o obj-$(CONFIG_CACHEFILES) := cachefiles.o diff --git a/fs/cachefiles/bind.c b/fs/cachefiles/bind.c index 3b39552c23651979275ad624e8aebd5caab740c8..3a88bef9ed4b4f77a8d1f6e1ad91d9caf9a55f78 100644 --- a/fs/cachefiles/bind.c +++ b/fs/cachefiles/bind.c @@ -46,11 +46,6 @@ int cachefiles_daemon_bind(struct cachefiles_cache *cache, char *args) cache->bcull_percent < cache->brun_percent && cache->brun_percent < 100); - if (*args) { - pr_err("'bind' command doesn't take an argument\n"); - return -EINVAL; - } - if (!cache->rootdirname) { pr_err("No cache directory specified\n"); return -EINVAL; @@ -62,6 +57,22 @@ int cachefiles_daemon_bind(struct cachefiles_cache *cache, char *args) return -EBUSY; } + if (IS_ENABLED(CONFIG_CACHEFILES_ONDEMAND)) { + if (!strcmp(args, "ondemand")) { + if (!cachefiles_ondemand_is_enabled()) { + pr_err("ondemand mode is disabled\n"); + return -EINVAL; + } + set_bit(CACHEFILES_ONDEMAND_MODE, &cache->flags); + } else if (*args) { + pr_err("Invalid argument to the 'bind' command\n"); + return -EINVAL; + } + } else if (*args) { + pr_err("'bind' command doesn't take an argument\n"); + return -EINVAL; + } + /* make sure we have copies of the tag and dirname strings */ if (!cache->tag) { /* the tag string is released by the fops->release() @@ -221,6 +232,12 @@ static int cachefiles_daemon_add_cache(struct cachefiles_cache *cache) if (ret < 0) goto error_add_cache; + /* + * As the cache->daemon_mutex lock hold and the cache is set to + * CACHEFILES_READY, this function must not return an error. 
+ */ + cachefiles_mark_object_active(cache, fsdef); + /* done */ set_bit(CACHEFILES_READY, &cache->flags); dput(root); @@ -235,6 +252,7 @@ static int cachefiles_daemon_add_cache(struct cachefiles_cache *cache) error_add_cache: dput(cache->graveyard); cache->graveyard = NULL; + fscache_object_destroy(&fsdef->fscache); error_unsupported: mntput(cache->mnt); cache->mnt = NULL; diff --git a/fs/cachefiles/daemon.c b/fs/cachefiles/daemon.c index 752c1e43416f507aa7fcb4969fa025762186dfbc..3128cbc733eaf6520f3a21a4d1bbb440a32d3bf3 100644 --- a/fs/cachefiles/daemon.c +++ b/fs/cachefiles/daemon.c @@ -73,6 +73,10 @@ static const struct cachefiles_daemon_cmd cachefiles_daemon_cmds[] = { { "inuse", cachefiles_daemon_inuse }, { "secctx", cachefiles_daemon_secctx }, { "tag", cachefiles_daemon_tag }, +#ifdef CONFIG_CACHEFILES_ONDEMAND + { "copen", cachefiles_ondemand_copen }, + { "restore", cachefiles_ondemand_restore }, +#endif { "", NULL } }; @@ -105,6 +109,10 @@ static int cachefiles_daemon_open(struct inode *inode, struct file *file) cache->active_nodes = RB_ROOT; rwlock_init(&cache->active_lock); init_waitqueue_head(&cache->daemon_pollwq); + refcount_set(&cache->unbind_pincount, 1); + + INIT_RADIX_TREE(&cache->reqs, GFP_ATOMIC); + idr_init(&cache->ondemand_ids); /* set default caching limits * - limit at 1% free space and/or free files @@ -123,6 +131,60 @@ static int cachefiles_daemon_open(struct inode *inode, struct file *file) return 0; } +void cachefiles_flush_reqs(struct cachefiles_cache *cache) +{ + void **slot; + struct radix_tree_iter iter; + struct cachefiles_req *req; + + /* + * Make sure the following two operations won't be reordered. + * 1) set CACHEFILES_DEAD bit + * 2) flush requests in the xarray + * Otherwise the request may be enqueued after xarray has been + * flushed, leaving the orphan request never being completed. 
+ * + * CPU 1 CPU 2 + * ===== ===== + * flush requests in the xarray + * test CACHEFILES_DEAD bit + * enqueue the request + * set CACHEFILES_DEAD bit + */ + smp_mb(); + + xa_lock(&cache->reqs); + radix_tree_for_each_slot(slot, &cache->reqs, &iter, 0) { + req = radix_tree_deref_slot_protected(slot, + &cache->reqs.xa_lock); + if (WARN_ON(!req)) + continue; + radix_tree_delete(&cache->reqs, iter.index); + req->error = -EIO; + complete(&req->done); + } + xa_unlock(&cache->reqs); + + xa_lock(&cache->ondemand_ids.idr_rt); + idr_destroy(&cache->ondemand_ids); + xa_unlock(&cache->ondemand_ids.idr_rt); +} + +void cachefiles_put_unbind_pincount(struct cachefiles_cache *cache) +{ + if (refcount_dec_and_test(&cache->unbind_pincount)) { + cachefiles_daemon_unbind(cache); + ASSERT(!cache->active_nodes.rb_node); + cachefiles_open = 0; + kfree(cache); + } +} + +void cachefiles_get_unbind_pincount(struct cachefiles_cache *cache) +{ + refcount_inc(&cache->unbind_pincount); +} + /* * release a cache */ @@ -136,38 +198,26 @@ static int cachefiles_daemon_release(struct inode *inode, struct file *file) set_bit(CACHEFILES_DEAD, &cache->flags); - cachefiles_daemon_unbind(cache); - - ASSERT(!cache->active_nodes.rb_node); + if (cachefiles_in_ondemand_mode(cache)) + cachefiles_flush_reqs(cache); /* clean up the control file interface */ cache->cachefilesd = NULL; file->private_data = NULL; - cachefiles_open = 0; - - kfree(cache); + cachefiles_put_unbind_pincount(cache); _leave(""); return 0; } -/* - * read the cache state - */ -static ssize_t cachefiles_daemon_read(struct file *file, char __user *_buffer, - size_t buflen, loff_t *pos) +static ssize_t cachefiles_do_daemon_read(struct cachefiles_cache *cache, + char __user *_buffer, size_t buflen, loff_t *pos) { - struct cachefiles_cache *cache = file->private_data; unsigned long long b_released; unsigned f_released; char buffer[256]; int n; - //_enter(",,%zu,", buflen); - - if (!test_bit(CACHEFILES_READY, &cache->flags)) - return 0; - /* check how much space the cache has */ cachefiles_has_space(cache, 0, 0); @@ -205,6 +255,25 @@ static ssize_t cachefiles_daemon_read(struct file *file, char __user *_buffer, return n; } +/* + * read the cache state + */ +static ssize_t cachefiles_daemon_read(struct file *file, + char __user *_buffer, size_t buflen, loff_t *pos) +{ + struct cachefiles_cache *cache = file->private_data; + + //_enter(",,%zu,", buflen); + + if (!test_bit(CACHEFILES_READY, &cache->flags)) + return 0; + + if (cachefiles_in_ondemand_mode(cache)) + return cachefiles_ondemand_daemon_read(cache, _buffer, buflen, pos); + else + return cachefiles_do_daemon_read(cache, _buffer, buflen, pos); +} + /* * command the cache */ @@ -291,13 +360,32 @@ static __poll_t cachefiles_daemon_poll(struct file *file, struct poll_table_struct *poll) { struct cachefiles_cache *cache = file->private_data; + struct cachefiles_req *req; + struct radix_tree_iter iter; __poll_t mask; + void **slot; poll_wait(file, &cache->daemon_pollwq, poll); mask = 0; - if (test_bit(CACHEFILES_STATE_CHANGED, &cache->flags)) - mask |= EPOLLIN; + if (cachefiles_in_ondemand_mode(cache)) { + if (!radix_tree_empty(&cache->reqs)) { + xa_lock(&cache->reqs); + radix_tree_for_each_tagged(slot, &cache->reqs, &iter, 0, + CACHEFILES_REQ_NEW) { + req = radix_tree_deref_slot_protected(slot, + &cache->reqs.xa_lock); + if (!cachefiles_ondemand_is_reopening_read(req)) { + mask |= EPOLLIN; + break; + } + } + xa_unlock(&cache->reqs); + } + } else { + if (test_bit(CACHEFILES_STATE_CHANGED, &cache->flags)) + mask |= 
EPOLLIN; + } if (test_bit(CACHEFILES_CULLING, &cache->flags)) mask |= EPOLLOUT; @@ -506,7 +594,7 @@ static int cachefiles_daemon_secctx(struct cachefiles_cache *cache, char *args) if (cache->secctx) { pr_err("Second security context specified\n"); - return -EINVAL; + return -EEXIST; } secctx = kstrdup(args, GFP_KERNEL); @@ -574,6 +662,12 @@ static int cachefiles_daemon_cull(struct cachefiles_cache *cache, char *args) if (!d_can_lookup(path.dentry)) goto notdir; + /* limit the scope of cull */ + if (cache->mnt != path.mnt) { + path_put(&path); + return -EOPNOTSUPP; + } + cachefiles_begin_secure(cache, &saved_cred); ret = cachefiles_cull(cache, path.dentry, args); cachefiles_end_secure(cache, saved_cred); diff --git a/fs/cachefiles/interface.c b/fs/cachefiles/interface.c index 4cea5fbf695ea349e9c16da21afeedcd27f32914..fa131db764c48db5cfa604ffeaad35fd8855246d 100644 --- a/fs/cachefiles/interface.c +++ b/fs/cachefiles/interface.c @@ -51,6 +51,9 @@ static struct fscache_object *cachefiles_alloc_object( fscache_object_init(&object->fscache, cookie, &cache->cache); + if (cachefiles_ondemand_init_obj_info(object)) + goto nomem_obj_info; + object->type = cookie->def->type; /* get hold of the raw key @@ -74,7 +77,7 @@ static struct fscache_object *cachefiles_alloc_object( ((char *)buffer)[keylen + 4] = 0; /* turn the raw key into something that can work with as a filename */ - key = cachefiles_cook_key(buffer, keylen + 2, object->type); + key = cachefiles_cook_key(object, buffer, keylen + 2); if (!key) goto nomem_key; @@ -102,7 +105,11 @@ static struct fscache_object *cachefiles_alloc_object( nomem_key: kfree(buffer); nomem_buffer: + kfree(object->private); + object->private = NULL; +nomem_obj_info: BUG_ON(test_bit(CACHEFILES_OBJECT_ACTIVE, &object->flags)); + fscache_object_destroy(&object->fscache); kmem_cache_free(cachefiles_object_jar, object); fscache_object_destroyed(&cache->cache); nomem_object: @@ -264,6 +271,7 @@ static void cachefiles_drop_object(struct fscache_object *_object) struct cachefiles_cache *cache; const struct cred *saved_cred; struct inode *inode; + struct file *file; blkcnt_t i_blocks = 0; ASSERT(_object); @@ -280,6 +288,13 @@ static void cachefiles_drop_object(struct fscache_object *_object) ASSERT((atomic_read(&object->usage) & 0xffff0000) != 0x6b6b0000); #endif + if (test_bit(CACHEFILES_OBJECT_ACTIVE, &object->flags) && + (volume_new_version(object->fscache.cookie) || + data_new_version(object->fscache.cookie))) + cachefiles_mark_object_inactive(cache, object, 0); + + cachefiles_ondemand_clean_object(object); + /* We need to tidy the object up if we did in fact manage to open it. 
* It's possible for us to get here before the object is fully * initialised if the parent goes away or the object gets retired @@ -306,6 +321,13 @@ static void cachefiles_drop_object(struct fscache_object *_object) object->backer = NULL; } + /* clean up file descriptor for non-index object */ + file = rcu_dereference_protected(object->file, true); + if (file) { + fput(file); + rcu_assign_pointer(object->file, NULL); + } + /* note that the object is now inactive */ if (test_bit(CACHEFILES_OBJECT_ACTIVE, &object->flags)) cachefiles_mark_object_inactive(cache, object, i_blocks); @@ -337,8 +359,11 @@ static void cachefiles_put_object(struct fscache_object *_object, ASSERT((atomic_read(&object->usage) & 0xffff0000) != 0x6b6b0000); #endif - ASSERTIFCMP(object->fscache.parent, - object->fscache.parent->n_children, >, 0); + if (!cachefiles_in_ondemand_mode(container_of(object->fscache.cache, + struct cachefiles_cache, cache))) { + ASSERTIFCMP(object->fscache.parent, + object->fscache.parent->n_children, >, 0); + } u = atomic_dec_return(&object->usage); trace_cachefiles_ref(object, _object->cookie, @@ -362,6 +387,8 @@ static void cachefiles_put_object(struct fscache_object *_object, } cache = object->fscache.cache; + kfree(object->private); + object->private = NULL; fscache_object_destroy(&object->fscache); kmem_cache_free(cachefiles_object_jar, object); fscache_object_destroyed(cache); @@ -562,6 +589,7 @@ const struct fscache_cache_ops cachefiles_cache_ops = { .attr_changed = cachefiles_attr_changed, .read_or_alloc_page = cachefiles_read_or_alloc_page, .read_or_alloc_pages = cachefiles_read_or_alloc_pages, + .prepare_read = cachefiles_prepare_read, .allocate_page = cachefiles_allocate_page, .allocate_pages = cachefiles_allocate_pages, .write_page = cachefiles_write_page, diff --git a/fs/cachefiles/internal.h b/fs/cachefiles/internal.h index cf9bd6401c2d0c5b71c247c87da40bd41fe825fe..97c4b4c639b4a43ed594feaeb8b4b5ed712ec45e 100644 --- a/fs/cachefiles/internal.h +++ b/fs/cachefiles/internal.h @@ -18,6 +18,8 @@ #include #include #include +#include +#include struct cachefiles_cache; struct cachefiles_object; @@ -29,6 +31,21 @@ extern unsigned cachefiles_debug; #define cachefiles_gfp (__GFP_RECLAIM | __GFP_NORETRY | __GFP_NOMEMALLOC) +enum cachefiles_object_state { + CACHEFILES_ONDEMAND_OBJSTATE_close, /* Anonymous fd closed by daemon or initial state */ + CACHEFILES_ONDEMAND_OBJSTATE_open, /* Anonymous fd associated with object is available */ + CACHEFILES_ONDEMAND_OBJSTATE_reopening, /* Object that was closed and is being reopened. */ + CACHEFILES_ONDEMAND_OBJSTATE_dropping, /* Object is being dropped. 
*/ +}; + +struct cachefiles_ondemand_info { + struct work_struct work; + int ondemand_id; + enum cachefiles_object_state state; + struct cachefiles_object *object; + spinlock_t lock; +}; + /* * node records */ @@ -37,6 +54,7 @@ struct cachefiles_object { struct cachefiles_lookup_data *lookup_data; /* cached lookup data */ struct dentry *dentry; /* the file/dir representing this object */ struct dentry *backer; /* backing file */ + struct file __rcu *file; /* backing file in on-demand mode */ loff_t i_size; /* object size */ unsigned long flags; #define CACHEFILES_OBJECT_ACTIVE 0 /* T if marked active */ @@ -45,10 +63,13 @@ struct cachefiles_object { uint8_t new; /* T if object new */ spinlock_t work_lock; struct rb_node active_node; /* link in active tree (dentry is key) */ + struct cachefiles_ondemand_info *private; }; extern struct kmem_cache *cachefiles_object_jar; +#define CACHEFILES_ONDEMAND_ID_CLOSED -1 + /* * Cache files cache definition */ @@ -84,11 +105,33 @@ struct cachefiles_cache { #define CACHEFILES_DEAD 1 /* T if cache dead */ #define CACHEFILES_CULLING 2 /* T if cull engaged */ #define CACHEFILES_STATE_CHANGED 3 /* T if state changed (poll trigger) */ +#define CACHEFILES_ONDEMAND_MODE 4 /* T if in on-demand read mode */ char *rootdirname; /* name of cache root directory */ char *secctx; /* LSM security context */ char *tag; /* cache binding tag */ + refcount_t unbind_pincount;/* refcount to do daemon unbind */ + struct radix_tree_root reqs; /* xarray of pending on-demand requests */ + unsigned long req_id_next; + struct idr ondemand_ids; /* xarray for ondemand_id allocation */ + u32 ondemand_id_next; +}; + +static inline bool cachefiles_in_ondemand_mode(struct cachefiles_cache *cache) +{ + return IS_ENABLED(CONFIG_CACHEFILES_ONDEMAND) && + test_bit(CACHEFILES_ONDEMAND_MODE, &cache->flags); +} + +struct cachefiles_req { + struct cachefiles_object *object; + struct completion done; + refcount_t ref; + int error; + struct cachefiles_msg msg; }; +#define CACHEFILES_REQ_NEW 0 + /* * backing file read tracking */ @@ -98,6 +141,8 @@ struct cachefiles_one_read { struct page *netfs_page; /* netfs page we're going to fill */ struct fscache_retrieval *op; /* retrieval op covering this */ struct list_head op_link; /* link in op's todo list */ + unsigned long flags; +#define CACHEFILES_MONITOR_ENTER_READ 0 /* restrict calls to read_page */ }; /* @@ -141,6 +186,9 @@ extern void cachefiles_daemon_unbind(struct cachefiles_cache *cache); * daemon.c */ extern const struct file_operations cachefiles_daemon_fops; +extern void cachefiles_flush_reqs(struct cachefiles_cache *cache); +extern void cachefiles_get_unbind_pincount(struct cachefiles_cache *cache); +extern void cachefiles_put_unbind_pincount(struct cachefiles_cache *cache); extern int cachefiles_has_space(struct cachefiles_cache *cache, unsigned fnr, unsigned bnr); @@ -153,7 +201,8 @@ extern const struct fscache_cache_ops cachefiles_cache_ops; /* * key.c */ -extern char *cachefiles_cook_key(const u8 *raw, int keylen, uint8_t type); +extern char *cachefiles_cook_key(struct cachefiles_object *object, + const u8 *raw, int keylen); /* * namei.c @@ -161,6 +210,8 @@ extern char *cachefiles_cook_key(const u8 *raw, int keylen, uint8_t type); extern void cachefiles_mark_object_inactive(struct cachefiles_cache *cache, struct cachefiles_object *object, blkcnt_t i_blocks); +extern int cachefiles_mark_object_active(struct cachefiles_cache *cache, + struct cachefiles_object *object); extern int cachefiles_delete_object(struct cachefiles_cache *cache, 
struct cachefiles_object *object); extern int cachefiles_walk_to_object(struct cachefiles_object *parent, @@ -210,6 +261,7 @@ extern int cachefiles_read_or_alloc_page(struct fscache_retrieval *, extern int cachefiles_read_or_alloc_pages(struct fscache_retrieval *, struct list_head *, unsigned *, gfp_t); +extern int cachefiles_prepare_read(struct fscache_retrieval *op, pgoff_t index); extern int cachefiles_allocate_page(struct fscache_retrieval *, struct page *, gfp_t); extern int cachefiles_allocate_pages(struct fscache_retrieval *, @@ -217,6 +269,82 @@ extern int cachefiles_allocate_pages(struct fscache_retrieval *, extern int cachefiles_write_page(struct fscache_storage *, struct page *); extern void cachefiles_uncache_page(struct fscache_object *, struct page *); +/* + * ondemand.c + */ +#ifdef CONFIG_CACHEFILES_ONDEMAND +extern ssize_t cachefiles_ondemand_daemon_read(struct cachefiles_cache *cache, + char __user *_buffer, size_t buflen, loff_t *pos); + +extern int cachefiles_ondemand_copen(struct cachefiles_cache *cache, + char *args); + +extern int cachefiles_ondemand_restore(struct cachefiles_cache *cache, + char *args); + +extern int cachefiles_ondemand_init_object(struct cachefiles_object *object); +extern void cachefiles_ondemand_clean_object(struct cachefiles_object *object); +extern int cachefiles_ondemand_read(struct cachefiles_object *object, + loff_t pos, size_t len); + +extern int cachefiles_ondemand_init_obj_info(struct cachefiles_object *object); + +#define CACHEFILES_OBJECT_STATE_FUNCS(_state) \ +static inline bool \ +cachefiles_ondemand_object_is_##_state(const struct cachefiles_object *object) \ +{ \ + return object->private->state == CACHEFILES_ONDEMAND_OBJSTATE_##_state; \ +} \ + \ +static inline void \ +cachefiles_ondemand_set_object_##_state(struct cachefiles_object *object) \ +{ \ + object->private->state = CACHEFILES_ONDEMAND_OBJSTATE_##_state; \ +} + +CACHEFILES_OBJECT_STATE_FUNCS(open); +CACHEFILES_OBJECT_STATE_FUNCS(close); +CACHEFILES_OBJECT_STATE_FUNCS(reopening); +CACHEFILES_OBJECT_STATE_FUNCS(dropping); + +static inline bool cachefiles_ondemand_is_reopening_read(struct cachefiles_req *req) +{ + return cachefiles_ondemand_object_is_reopening(req->object) && + req->msg.opcode == CACHEFILES_OP_READ; +} + +#else +static inline ssize_t cachefiles_ondemand_daemon_read(struct cachefiles_cache *cache, + char __user *_buffer, size_t buflen, loff_t *pos) +{ + return -EOPNOTSUPP; +} + +static inline int cachefiles_ondemand_init_object(struct cachefiles_object *object) +{ + return 0; +} + +static inline void cachefiles_ondemand_clean_object(struct cachefiles_object *object) +{ +} +static inline int cachefiles_ondemand_read(struct cachefiles_object *object, + loff_t pos, size_t len) +{ + return -EOPNOTSUPP; +} + +static inline int cachefiles_ondemand_init_obj_info(struct cachefiles_object *object) +{ + return 0; +} + +static inline bool cachefiles_ondemand_is_reopening_read(struct cachefiles_req *req) +{ + return false; +} +#endif + /* * security.c */ @@ -261,6 +389,8 @@ do { \ pr_err("I/O Error: " FMT"\n", ##__VA_ARGS__); \ fscache_io_error(&(___cache)->cache); \ set_bit(CACHEFILES_DEAD, &(___cache)->flags); \ + if (cachefiles_in_ondemand_mode(___cache)) \ + cachefiles_flush_reqs(___cache); \ } while (0) #define cachefiles_io_error_obj(object, FMT, ...) 
\ diff --git a/fs/cachefiles/key.c b/fs/cachefiles/key.c index be96f5fc5cacaf2589fa304a33fc8f55e957082b..d1adeb58f35c61625dfded41e5e391cde8af0146 100644 --- a/fs/cachefiles/key.c +++ b/fs/cachefiles/key.c @@ -22,6 +22,118 @@ static const char cachefiles_filecharmap[256] = { [48 ... 127] = 1, /* '0' -> '~' */ }; +static inline unsigned int how_many_hex_digits(unsigned int x) +{ + return x ? round_up(ilog2(x) + 1, 4) / 4 : 0; +} + +static void cachefiles_cook_acc(char *key, unsigned int acc, int *len) +{ + key[*len + 1] = cachefiles_charmap[acc & 63]; + acc >>= 6; + key[*len] = cachefiles_charmap[acc & 63]; + *len += 2; +} + +static int cachefiles_cook_csum(struct fscache_cookie *cookie, const u8 *raw, + int keylen, char *key) +{ + unsigned char csum = 0; + int loop; + + if (volume_new_version(cookie)) + return 1; + + if (data_new_version(cookie)) { + csum = (u8)cookie->key_hash; + } else { + for (loop = 0; loop < keylen; loop++) + csum += raw[loop]; + } + sprintf(key, "@%02x%c+", (unsigned int) csum, 0); + + return 5; +} + +static char *cachefiles_cook_data_key(const u8 *key, int keylen) +{ + const u8 *kend; + unsigned int acc, i, n, nle, nbe; + unsigned int b64len, len, pad; + char *name, sep; + + /* See if it makes sense to encode it as "hex,hex,hex" for each 32-bit + * chunk. We rely on the key having been padded out to a whole number + * of 32-bit words. + */ + n = round_up(keylen, 4); + nbe = nle = 0; + for (i = 0; i < n; i += 4) { + u32 be = be32_to_cpu(*(__be32 *)(key + i)); + u32 le = le32_to_cpu(*(__le32 *)(key + i)); + + nbe += 1 + how_many_hex_digits(be); + nle += 1 + how_many_hex_digits(le); + } + + b64len = DIV_ROUND_UP(keylen, 3); + pad = b64len * 3 - keylen; + b64len = 2 + b64len * 4; /* Length if we base64-encode it */ + _debug("len=%u nbe=%u nle=%u b64=%u", keylen, nbe, nle, b64len); + if (nbe < b64len || nle < b64len) { + unsigned int nlen = min(nbe, nle) + 1; + + name = kmalloc(nlen, GFP_KERNEL); + if (!name) + return NULL; + sep = (nbe <= nle) ? 
'S' : 'T'; /* Encoding indicator */ + len = 0; + for (i = 0; i < n; i += 4) { + u32 x; + + if (nbe <= nle) + x = be32_to_cpu(*(__be32 *)(key + i)); + else + x = le32_to_cpu(*(__le32 *)(key + i)); + name[len++] = sep; + if (x != 0) + len += snprintf(name + len, nlen - len, "%x", x); + sep = ','; + } + name[len] = 0; + return name; + } + + /* We need to base64-encode it */ + name = kmalloc(b64len + 1, GFP_KERNEL); + if (!name) + return NULL; + + name[0] = 'E'; + name[1] = '0' + pad; + len = 2; + kend = key + keylen; + do { + acc = *key++; + if (key < kend) { + acc |= *key++ << 8; + if (key < kend) + acc |= *key++ << 16; + } + + name[len++] = cachefiles_charmap[acc & 63]; + acc >>= 6; + name[len++] = cachefiles_charmap[acc & 63]; + acc >>= 6; + name[len++] = cachefiles_charmap[acc & 63]; + acc >>= 6; + name[len++] = cachefiles_charmap[acc & 63]; + } while (key < kend); + + name[len] = 0; + return name; +} + /* * turn the raw key into something cooked * - the raw key should include the length in the two bytes at the front @@ -30,23 +142,23 @@ static const char cachefiles_filecharmap[256] = { * cooked * - need to cut the cooked key into 252 char lengths (189 raw bytes) */ -char *cachefiles_cook_key(const u8 *raw, int keylen, uint8_t type) +char *cachefiles_cook_key(struct cachefiles_object *object, + const u8 *raw, int keylen) { - unsigned char csum, ch; unsigned int acc; char *key; int loop, len, max, seg, mark, print; + uint8_t type = object->type; + struct fscache_cookie *cookie = object->fscache.cookie; _enter(",%d", keylen); BUG_ON(keylen < 2 || keylen > 514); - csum = raw[0] + raw[1]; print = 1; - for (loop = 2; loop < keylen; loop++) { - ch = raw[loop]; - csum += ch; - print &= cachefiles_filecharmap[ch]; + if (!volume_new_version(cookie)) { + for (loop = 2; loop < keylen; loop++) + print &= cachefiles_filecharmap[raw[loop]]; } if (print) { @@ -58,6 +170,9 @@ char *cachefiles_cook_key(const u8 *raw, int keylen, uint8_t type) * is ((514 + 251) / 252) = 3 */ max += 1; /* NUL on end */ + } else if (data_new_version(cookie)) { + max = 5; /* @checksum/M */ + max += 1; /* NUL on end */ } else { /* calculate the maximum length of the cooked key */ keylen = (keylen + 2) / 3; @@ -78,22 +193,13 @@ char *cachefiles_cook_key(const u8 *raw, int keylen, uint8_t type) if (!key) return NULL; - len = 0; - - /* build the cooked key */ - sprintf(key, "@%02x%c+", (unsigned) csum, 0); - len = 5; + len = cachefiles_cook_csum(cookie, raw, keylen, key); mark = len - 1; if (print) { - acc = *(uint16_t *) raw; + if (!volume_new_version(cookie) && !data_new_version(cookie)) + cachefiles_cook_acc(key, *(uint16_t *) raw, &len); raw += 2; - - key[len + 1] = cachefiles_charmap[acc & 63]; - acc >>= 6; - key[len] = cachefiles_charmap[acc & 63]; - len += 2; - seg = 250; for (loop = keylen; loop > 0; loop--) { if (seg <= 0) { @@ -112,6 +218,29 @@ char *cachefiles_cook_key(const u8 *raw, int keylen, uint8_t type) case FSCACHE_COOKIE_TYPE_DATAFILE: type = 'D'; break; default: type = 'S'; break; } + } else if (data_new_version(cookie)) { + int nlen; + char *name = cachefiles_cook_data_key(raw + 2, keylen - 2); + char *new_key; + + if (!name) { + kfree(key); + return NULL; + } + + nlen = max + strlen(name) - 1; + new_key = krealloc(key, nlen, GFP_KERNEL); + if (!new_key) { + kfree(key); + kfree(name); + return NULL; + } + + key = new_key; + type = name[0]; + for (loop = 1; loop < strlen(name); loop++) + key[len++] = name[loop]; + kfree(name); } else { seg = 252; for (loop = keylen; loop > 0; loop--) { diff --git 
a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c index ecc8ecbbfa5ac5fa66b1389cf3209b2ec2cf80a1..6eeef666c60998fc708726cacd54a0c49ad5cc43 100644 --- a/fs/cachefiles/namei.c +++ b/fs/cachefiles/namei.c @@ -133,8 +133,8 @@ static void cachefiles_mark_object_buried(struct cachefiles_cache *cache, /* * record the fact that an object is now active */ -static int cachefiles_mark_object_active(struct cachefiles_cache *cache, - struct cachefiles_object *object) +int cachefiles_mark_object_active(struct cachefiles_cache *cache, + struct cachefiles_object *object) { struct cachefiles_object *xobject; struct rb_node **_p, *_parent = NULL; @@ -264,11 +264,9 @@ void cachefiles_mark_object_inactive(struct cachefiles_cache *cache, write_lock(&cache->active_lock); rb_erase(&object->active_node, &cache->active_nodes); - clear_bit(CACHEFILES_OBJECT_ACTIVE, &object->flags); + clear_and_wake_up_bit(CACHEFILES_OBJECT_ACTIVE, &object->flags); write_unlock(&cache->active_lock); - wake_up_bit(&object->flags, CACHEFILES_OBJECT_ACTIVE); - /* This object can now be culled, so we need to let the daemon know * that there is something it can remove if it needs to. */ @@ -522,6 +520,20 @@ int cachefiles_walk_to_object(struct cachefiles_object *parent, key = NULL; lookup_again: + + /* + * Process the open request before acquiring the dir inode lock to + * avoid AA deadlocks caused by the daemon acquiring the dir inode + * lock while processing the open request. Although the daemon gets + * an anonymous fd, it can't be used until object->file has been + * assigned a value. + */ + if (!key) { + ret = cachefiles_ondemand_init_object(object); + if (ret < 0) + goto error_out2; + } + /* search the current directory for the element name */ _debug("lookup '%s'", name); @@ -695,6 +707,29 @@ int cachefiles_walk_to_object(struct cachefiles_object *parent, if (object->dentry->d_sb->s_blocksize > PAGE_SIZE) goto check_error; + if (cachefiles_in_ondemand_mode(cache)) { + struct path path; + struct file *file; + + path.mnt = cache->mnt; + path.dentry = object->dentry; + file = dentry_open(&path, O_RDWR | O_LARGEFILE, + cache->cache_cred); + if (IS_ERR(file)) + goto check_error; + /* + * so that page_cache_sync_readahead() will fallback + * to force_page_cache_readahead() + */ + file->f_mode |= FMODE_RANDOM; + rcu_assign_pointer(object->file, file); + + /* Now the pages can be read. 
*/ + if (object->new && object->fscache.store_limit_l) + clear_bit_unlock(FSCACHE_COOKIE_NO_DATA_YET, + &object->fscache.cookie->flags); + } + object->backer = object->dentry; } else { BUG(); // TODO: open file in data-class subdir diff --git a/fs/cachefiles/ondemand.c b/fs/cachefiles/ondemand.c new file mode 100644 index 0000000000000000000000000000000000000000..ed3b49a4fd4edd54ed708d95262130b1369f430a --- /dev/null +++ b/fs/cachefiles/ondemand.c @@ -0,0 +1,752 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +#include +#include +#include +#include +#include +#include "internal.h" + +struct anon_file { + struct file *file; + int fd; +}; + +static bool cachefiles_buffered_ondemand = true; +module_param_named(buffered_ondemand, cachefiles_buffered_ondemand, bool, 0644); + +static inline void cachefiles_req_put(struct cachefiles_req *req) +{ + if (refcount_dec_and_test(&req->ref)) + kfree(req); +} + +static int cachefiles_ondemand_fd_release(struct inode *inode, + struct file *file) +{ + struct cachefiles_object *object = file->private_data; + struct cachefiles_cache *cache; + void **slot; + struct radix_tree_iter iter; + struct cachefiles_ondemand_info *info; + int object_id; + struct cachefiles_req *req; + + if (!object) + return 0; + + info = object->private; + object_id = info->ondemand_id; + cache = container_of(object->fscache.cache, + struct cachefiles_cache, cache); + + xa_lock(&cache->reqs); + spin_lock(&info->lock); + info->ondemand_id = CACHEFILES_ONDEMAND_ID_CLOSED; + cachefiles_ondemand_set_object_close(object); + spin_unlock(&info->lock); + + /* Only flush CACHEFILES_REQ_NEW marked req to avoid race with daemon_read */ + radix_tree_for_each_tagged(slot, &cache->reqs, &iter, 0, CACHEFILES_REQ_NEW) { + req = radix_tree_deref_slot_protected(slot, + &cache->reqs.xa_lock); + if (WARN_ON(!req)) + continue; + if (req->msg.object_id == object_id && + req->msg.opcode == CACHEFILES_OP_CLOSE) { + complete(&req->done); + radix_tree_iter_delete(&cache->reqs, &iter, slot); + } + } + xa_unlock(&cache->reqs); + + xa_lock(&cache->ondemand_ids.idr_rt); + idr_remove(&cache->ondemand_ids, object_id); + xa_unlock(&cache->ondemand_ids.idr_rt); + object->fscache.cache->ops->put_object(&object->fscache, + cachefiles_obj_put_ondemand_fd); + cachefiles_put_unbind_pincount(cache); + return 0; +} + +static ssize_t cachefiles_ondemand_fd_write_iter(struct kiocb *kiocb, + struct iov_iter *iter) +{ + struct cachefiles_object *object = kiocb->ki_filp->private_data; + struct kiocb iocb; + struct file *file; + ssize_t ret = 0; + ssize_t written = 0; + size_t bytes; + + rcu_read_lock(); + file = rcu_dereference(object->file); + if (!file || !get_file_rcu(file)) + file = NULL; + rcu_read_unlock(); + + if (!file) + return -ENOBUFS; + + iocb = (struct kiocb) { + .ki_filp = file, + .ki_pos = kiocb->ki_pos, + .ki_flags = IOCB_WRITE, + .ki_ioprio = get_current_ioprio(), + }; + + if (!cachefiles_buffered_ondemand) + iocb.ki_flags |= IOCB_DIRECT; +retry: + bytes = iov_iter_count(iter); + if (unlikely(!bytes)) + goto out; + + ret = iov_iter_fault_in_readable(iter, bytes); + if (unlikely(ret)) + goto out; + + pagefault_disable(); + ret = vfs_iocb_iter_write(file, &iocb, iter); + pagefault_enable(); + if (ret > 0) { + written += ret; + goto retry; + } else if (ret == -EFAULT) { + goto retry; + } +out: + fput(file); + if (!ret && iov_iter_count(iter)) + return -EIO; + return ret < 0 ? 
ret : written; +} + +static long cachefiles_ondemand_fd_ioctl(struct file *filp, unsigned int ioctl, + unsigned long arg) +{ + struct cachefiles_object *object = filp->private_data; + struct cachefiles_cache *cache; + struct cachefiles_req *req; + unsigned long id; + + if (ioctl != CACHEFILES_IOC_READ_COMPLETE) + return -EINVAL; + + cache = container_of(object->fscache.cache, + struct cachefiles_cache, cache); + + if (!test_bit(CACHEFILES_ONDEMAND_MODE, &cache->flags)) + return -EOPNOTSUPP; + + id = arg; + xa_lock(&cache->reqs); + req = radix_tree_lookup(&cache->reqs, id); + if (!req || req->msg.opcode != CACHEFILES_OP_READ || + req->object != object) { + xa_unlock(&cache->reqs); + return -EINVAL; + } + radix_tree_delete(&cache->reqs, id); + xa_unlock(&cache->reqs); + + complete(&req->done); + return 0; +} + +static const struct file_operations cachefiles_ondemand_fd_fops = { + .owner = THIS_MODULE, + .release = cachefiles_ondemand_fd_release, + .write_iter = cachefiles_ondemand_fd_write_iter, + .unlocked_ioctl = cachefiles_ondemand_fd_ioctl, +}; + +/* + * OPEN request Completion (copen) + * - command: "copen ," + * indicates the object size if >=0, error code if negative + */ +int cachefiles_ondemand_copen(struct cachefiles_cache *cache, char *args) +{ + struct cachefiles_req *req; + struct fscache_cookie *cookie; + struct cachefiles_ondemand_info *info; + char *pid, *psize; + unsigned long id; + long size; + int ret; + + if (!test_bit(CACHEFILES_ONDEMAND_MODE, &cache->flags)) + return -EOPNOTSUPP; + + if (!*args) { + pr_err("Empty id specified\n"); + return -EINVAL; + } + + pid = args; + psize = strchr(args, ','); + if (!psize) { + pr_err("Cache size is not specified\n"); + return -EINVAL; + } + + *psize = 0; + psize++; + + ret = kstrtoul(pid, 0, &id); + if (ret) + return ret; + + xa_lock(&cache->reqs); + req = radix_tree_lookup(&cache->reqs, id); + if (!req || req->msg.opcode != CACHEFILES_OP_OPEN || + !req->object->private->ondemand_id) { + xa_unlock(&cache->reqs); + return -EINVAL; + } + radix_tree_delete(&cache->reqs, id); + xa_unlock(&cache->reqs); + + info = req->object->private; + /* fail OPEN request if copen format is invalid */ + ret = kstrtol(psize, 0, &size); + if (ret) { + req->error = ret; + goto out; + } + + /* fail OPEN request if daemon reports an error */ + if (size < 0) { + if (!IS_ERR_VALUE(size)) { + req->error = -EINVAL; + ret = -EINVAL; + } else { + req->error = size; + ret = 0; + } + goto out; + } + + spin_lock(&info->lock); + /* The anonymous fd was closed before copen. 
*/ + if (info->ondemand_id == CACHEFILES_ONDEMAND_ID_CLOSED) { + spin_unlock(&info->lock); + req->error = -EBADFD; + goto out; + } + cookie = req->object->fscache.cookie; + fscache_set_store_limit(&req->object->fscache, size); + if (size) + clear_bit(FSCACHE_COOKIE_NO_DATA_YET, &cookie->flags); + else + set_bit(FSCACHE_COOKIE_NO_DATA_YET, &cookie->flags); + + cachefiles_ondemand_set_object_open(req->object); + spin_unlock(&info->lock); + wake_up_all(&cache->daemon_pollwq); + +out: + spin_lock(&info->lock); + /* Need to set object close to avoid reopen status continuing */ + if (info->ondemand_id == CACHEFILES_ONDEMAND_ID_CLOSED) + cachefiles_ondemand_set_object_close(req->object); + spin_unlock(&info->lock); + complete(&req->done); + return ret; +} + +int cachefiles_ondemand_restore(struct cachefiles_cache *cache, char *args) +{ + struct cachefiles_req *req; + + XA_STATE(xas, &cache->reqs, 0); + + if (!test_bit(CACHEFILES_ONDEMAND_MODE, &cache->flags)) + return -EOPNOTSUPP; + + /* + * Reset the requests to CACHEFILES_REQ_NEW state, so that the + * requests have been processed halfway before the crash of the + * user daemon could be reprocessed after the recovery. + */ + xas_lock(&xas); + xas_for_each(&xas, req, ULONG_MAX) + xas_set_mark(&xas, CACHEFILES_REQ_NEW); + xas_unlock(&xas); + + wake_up_all(&cache->daemon_pollwq); + return 0; +} + +static int cachefiles_ondemand_get_fd(struct cachefiles_req *req, + struct anon_file *anon_file) +{ + struct cachefiles_object *object = req->object; + struct cachefiles_cache *cache; + struct cachefiles_open *load; + u32 object_id; + int ret; + + object->fscache.cache->ops->grab_object(&object->fscache, + cachefiles_obj_get_ondemand_fd); + + cache = container_of(object->fscache.cache, + struct cachefiles_cache, cache); + idr_preload(GFP_KERNEL); + xa_lock(&cache->ondemand_ids.idr_rt); + ret = idr_alloc_cyclic(&cache->ondemand_ids, NULL, + 1, INT_MAX, GFP_ATOMIC); + xa_unlock(&cache->ondemand_ids.idr_rt); + idr_preload_end(); + if (ret < 0) + goto err; + object_id = ret; + + anon_file->fd = get_unused_fd_flags(O_WRONLY); + if (anon_file->fd < 0) { + ret = anon_file->fd; + goto err_free_id; + } + + anon_file->file = anon_inode_getfile("[cachefiles]", + &cachefiles_ondemand_fd_fops, object, O_WRONLY); + if (IS_ERR(anon_file->file)) { + ret = PTR_ERR(anon_file->file); + goto err_put_fd; + } + + spin_lock(&object->private->lock); + if (object->private->ondemand_id > 0) { + spin_unlock(&object->private->lock); + ret = -EEXIST; + anon_file->file->private_data = NULL; + goto err_put_file; + } + + anon_file->file->f_mode |= FMODE_PWRITE | FMODE_LSEEK; + + load = (void *)req->msg.data; + load->fd = anon_file->fd; + object->private->ondemand_id = object_id; + spin_unlock(&object->private->lock); + + cachefiles_get_unbind_pincount(cache); + return 0; + +err_put_file: + fput(anon_file->file); + anon_file->file = NULL; +err_put_fd: + put_unused_fd(anon_file->fd); + anon_file->fd = ret; +err_free_id: + xa_lock(&cache->ondemand_ids.idr_rt); + idr_remove(&cache->ondemand_ids, object_id); + xa_unlock(&cache->ondemand_ids.idr_rt); +err: + spin_lock(&req->object->private->lock); + /* Avoid marking an opened object as closed. 
*/ + if (object->private->ondemand_id <= 0) + cachefiles_ondemand_set_object_close(req->object); + spin_unlock(&req->object->private->lock); + + object->fscache.cache->ops->put_object(&object->fscache, + cachefiles_obj_put_ondemand_fd); + return ret; +} + +static void ondemand_object_worker(struct work_struct *work) +{ + struct cachefiles_object *object; + + object = ((struct cachefiles_ondemand_info *)work)->object; + cachefiles_ondemand_init_object(object); +} + +/* + * Find a request to be handled in the range of [start, end]. If there are any + * inflight or subsequent READ requests on the closed object, reopen it. Skip + * read requests whose related object is reopening. + */ +static struct cachefiles_req *cachefiles_ondemand_select_req(struct cachefiles_cache *cache, + struct radix_tree_iter *iter, + unsigned long start, + unsigned long end) +{ + void **slot; + struct cachefiles_req *req; + struct cachefiles_ondemand_info *info; + + radix_tree_for_each_tagged(slot, &cache->reqs, iter, start, CACHEFILES_REQ_NEW) { + req = radix_tree_deref_slot_protected(slot, &cache->reqs.xa_lock); + if (WARN_ON(!req)) + return NULL; + if (iter->index > end) + return NULL; + if (req->msg.opcode != CACHEFILES_OP_READ) + return req; + info = req->object->private; + if (cachefiles_ondemand_object_is_close(req->object)) { + cachefiles_ondemand_set_object_reopening(req->object); + queue_work(fscache_object_wq, &info->work); + continue; + } else if (cachefiles_ondemand_object_is_reopening(req->object)) { + continue; + } + return req; + } + return NULL; +} + +ssize_t cachefiles_ondemand_daemon_read(struct cachefiles_cache *cache, + char __user *_buffer, size_t buflen, loff_t *pos) +{ + struct cachefiles_req *req; + struct cachefiles_msg *msg; + unsigned long id = 0; + size_t n; + int ret = 0; + struct radix_tree_iter iter; + struct anon_file anon_file; + + /* + * Cyclically search for a request that has not ever been processed, + * to prevent requests from being processed repeatedly, and make + * request distribution fair. 
+ */ + xa_lock(&cache->reqs); + req = cachefiles_ondemand_select_req(cache, &iter, cache->req_id_next, ULONG_MAX); + if (!req && cache->req_id_next > 0) + req = cachefiles_ondemand_select_req(cache, &iter, 0, cache->req_id_next - 1); + if (!req) { + xa_unlock(&cache->reqs); + return 0; + } + + msg = &req->msg; + n = msg->len; + + if (n > buflen) { + xa_unlock(&cache->reqs); + return -EMSGSIZE; + } + + radix_tree_iter_tag_clear(&cache->reqs, &iter, CACHEFILES_REQ_NEW); + cache->req_id_next = iter.index + 1; + refcount_inc(&req->ref); + req->object->fscache.cache->ops->grab_object(&req->object->fscache, + cachefiles_obj_get_read_req); + xa_unlock(&cache->reqs); + + id = iter.index; + + if (msg->opcode == CACHEFILES_OP_OPEN) { + ret = cachefiles_ondemand_get_fd(req, &anon_file); + if (ret) + goto out; + } + + msg->msg_id = id; + msg->object_id = req->object->private->ondemand_id; + + if (copy_to_user(_buffer, msg, n) != 0) + ret = -EFAULT; + + if (msg->opcode == CACHEFILES_OP_OPEN) { + if (ret < 0) { + fput(anon_file.file); + put_unused_fd(anon_file.fd); + goto out; + } + fd_install(anon_file.fd, anon_file.file); + } + +out: + req->object->fscache.cache->ops->put_object(&req->object->fscache, + cachefiles_obj_put_read_req); + /* Remove error request and CLOSE request has no reply */ + if (ret || msg->opcode == CACHEFILES_OP_CLOSE) { + xa_lock(&cache->reqs); + if (radix_tree_lookup(&cache->reqs, id) == req) { + req->error = ret; + complete(&req->done); + radix_tree_delete(&cache->reqs, id); + } + xa_unlock(&cache->reqs); + } + cachefiles_req_put(req); + return ret ? ret : n; +} + +typedef int (*init_req_fn)(struct cachefiles_req *req, void *private); + +static int cachefiles_ondemand_send_req(struct cachefiles_object *object, + enum cachefiles_opcode opcode, + size_t data_len, + init_req_fn init_req, + void *private) +{ + static atomic64_t global_index = ATOMIC64_INIT(0); + struct cachefiles_cache *cache; + struct cachefiles_req *req = NULL; + long id; + int ret; + + cache = container_of(object->fscache.cache, + struct cachefiles_cache, cache); + + if (!test_bit(CACHEFILES_ONDEMAND_MODE, &cache->flags)) + return 0; + + if (test_bit(CACHEFILES_DEAD, &cache->flags)) { + ret = -EIO; + goto out; + } + + req = kzalloc(sizeof(*req) + data_len, GFP_KERNEL); + if (!req) { + ret = -ENOMEM; + goto out; + } + + refcount_set(&req->ref, 1); + req->object = object; + init_completion(&req->done); + req->msg.opcode = opcode; + req->msg.len = sizeof(struct cachefiles_msg) + data_len; + + ret = init_req(req, private); + if (ret) + goto out; + + /* + * Stop enqueuing the request when daemon is dying. The + * following two operations need to be atomic as a whole. + * 1) check cache state, and + * 2) enqueue request if cache is alive. + * Otherwise the request may be enqueued after xarray has been + * flushed, leaving the orphan request never being completed. 
+ * + * CPU 1 CPU 2 + * ===== ===== + * test CACHEFILES_DEAD bit + * set CACHEFILES_DEAD bit + * flush requests in the xarray + * enqueue the request + */ + xa_lock(&cache->reqs); + + if (test_bit(CACHEFILES_DEAD, &cache->flags) || + cachefiles_ondemand_object_is_dropping(object)) { + xa_unlock(&cache->reqs); + ret = -EIO; + goto out; + } + + /* coupled with the barrier in cachefiles_flush_reqs() */ + smp_mb(); + if (opcode == CACHEFILES_OP_CLOSE && + !cachefiles_ondemand_object_is_open(object)) { + WARN_ON_ONCE(object->private->ondemand_id == 0); + xa_unlock(&cache->reqs); + ret = -EIO; + goto out; + } + + do { + id = atomic64_inc_return(&global_index); + if (unlikely(id == UINT_MAX)) + atomic64_set(&global_index, 0); + } while (radix_tree_insert(&cache->reqs, id, req)); + + radix_tree_tag_set(&cache->reqs, id, CACHEFILES_REQ_NEW); + xa_unlock(&cache->reqs); + + wake_up_all(&cache->daemon_pollwq); +wait: + ret = wait_for_completion_killable(&req->done); + if (!ret) { + ret = req->error; + } else { + xa_lock(&cache->reqs); + if (radix_tree_lookup(&cache->reqs, id) == req) { + radix_tree_delete(&cache->reqs, id); + ret = -EINTR; + } + xa_unlock(&cache->reqs); + + /* Someone will complete it soon. */ + if (ret != -EINTR) { + cpu_relax(); + goto wait; + } + } + cachefiles_req_put(req); + return ret; +out: + /* Reset the object to close state in error handling path. + * If error occurs after creating the anonymous fd, + * cachefiles_ondemand_fd_release() will set object to close. + */ + if (opcode == CACHEFILES_OP_OPEN && + !cachefiles_ondemand_object_is_dropping(object)) + cachefiles_ondemand_set_object_close(object); + kfree(req); + return ret; +} + +static int cachefiles_ondemand_init_open_req(struct cachefiles_req *req, + void *private) +{ + struct cachefiles_object *object = req->object; + struct fscache_cookie *cookie = object->fscache.cookie; + struct fscache_cookie *volume = object->fscache.parent->cookie; + struct cachefiles_open *load = (void *)req->msg.data; + size_t volume_key_size, cookie_key_size; + char *cookie_key, *volume_key; + + /* + * cookie_key is a string without trailing '\0', while cachefiles_open + * expects cookie key a string without trailing '\0'. + */ + cookie_key_size = cookie->key_len; + if (cookie->key_len <= sizeof(cookie->inline_key)) + cookie_key = cookie->inline_key; + else + cookie_key = cookie->key; + + /* + * volume_key is a string without trailing '\0', while cachefiles_open + * expects volume key a string with trailing '\0'. 
+ */ + volume_key_size = volume->key_len + 1; + if (volume->key_len <= sizeof(volume->inline_key)) + volume_key = volume->inline_key; + else + volume_key = volume->key; + + load->volume_key_size = volume_key_size; + load->cookie_key_size = cookie_key_size; + memcpy(load->data, volume_key, volume->key_len); + load->data[volume_key_size - 1] = '\0'; + memcpy(load->data + volume_key_size, cookie_key, cookie_key_size); + return 0; +} + +static int cachefiles_ondemand_init_close_req(struct cachefiles_req *req, + void *private) +{ + struct cachefiles_object *object = req->object; + + if (!cachefiles_ondemand_object_is_open(object)) + return -ENOENT; + return 0; +} + +struct cachefiles_read_ctx { + loff_t off; + size_t len; +}; + +static int cachefiles_ondemand_init_read_req(struct cachefiles_req *req, + void *private) +{ + struct cachefiles_read *load = (void *)req->msg.data; + struct cachefiles_read_ctx *read_ctx = private; + + load->off = read_ctx->off; + load->len = read_ctx->len; + return 0; +} + +int cachefiles_ondemand_init_object(struct cachefiles_object *object) +{ + struct fscache_cookie *cookie = object->fscache.cookie; + size_t volume_key_size, cookie_key_size, data_len; + + if (!object->private) + return 0; + + /* + * CacheFiles will firstly check the cache file under the root cache + * directory. If the coherency check failed, it will fallback to + * creating a new tmpfile as the cache file. Reuse the previously + * allocated object ID if any. + */ + if (cachefiles_ondemand_object_is_open(object) || + object->type == FSCACHE_COOKIE_TYPE_INDEX) + return 0; + + volume_key_size = object->fscache.parent->cookie->key_len + 1; + cookie_key_size = cookie->key_len; + data_len = sizeof(struct cachefiles_open) + volume_key_size + cookie_key_size; + + return cachefiles_ondemand_send_req(object, CACHEFILES_OP_OPEN, + data_len, cachefiles_ondemand_init_open_req, NULL); +} + +void cachefiles_ondemand_clean_object(struct cachefiles_object *object) +{ + void **slot; + struct cachefiles_req *req; + struct radix_tree_iter iter; + struct cachefiles_cache *cache; + + if (!object->private) + return; + + cachefiles_ondemand_send_req(object, CACHEFILES_OP_CLOSE, 0, + cachefiles_ondemand_init_close_req, NULL); + + if (!object->private->ondemand_id) + return; + + /* Flush all requests for the object that is being dropped. */ + cache = container_of(object->fscache.cache, + struct cachefiles_cache, cache); + xa_lock(&cache->reqs); + cachefiles_ondemand_set_object_dropping(object); + radix_tree_for_each_slot(slot, &cache->reqs, &iter, 0) { + req = radix_tree_deref_slot_protected(slot, + &cache->reqs.xa_lock); + if (WARN_ON(!req)) + continue; + if (req->object == object) { + req->error = -EIO; + complete(&req->done); + radix_tree_delete(&cache->reqs, iter.index); + } + } + xa_unlock(&cache->reqs); + + /* Wait for ondemand_object_worker() to finish. 
*/ + cancel_work_sync(&object->private->work); +} + +int cachefiles_ondemand_read(struct cachefiles_object *object, + loff_t pos, size_t len) +{ + struct cachefiles_read_ctx read_ctx = {pos, len}; + + return cachefiles_ondemand_send_req(object, CACHEFILES_OP_READ, + sizeof(struct cachefiles_read), + cachefiles_ondemand_init_read_req, &read_ctx); +} + +int cachefiles_ondemand_init_obj_info(struct cachefiles_object *object) +{ + struct cachefiles_cache *cache; + + cache = container_of(object->fscache.cache, struct cachefiles_cache, cache); + if (!cachefiles_in_ondemand_mode(cache)) + return 0; + + object->private = kzalloc(sizeof(struct cachefiles_ondemand_info), GFP_KERNEL); + if (!object->private) + return -ENOMEM; + + object->private->object = object; + spin_lock_init(&object->private->lock); + INIT_WORK(&object->private->work, ondemand_object_worker); + return 0; +} diff --git a/fs/cachefiles/rdwr.c b/fs/cachefiles/rdwr.c index 8ffc40e84a59442312295536a3ef44733b356f66..d95435ad5542805086c0afd5cebfbeff81db08a3 100644 --- a/fs/cachefiles/rdwr.c +++ b/fs/cachefiles/rdwr.c @@ -9,6 +9,8 @@ #include #include #include +#include +#include #include "internal.h" /* @@ -106,6 +108,13 @@ static int cachefiles_read_reissue(struct cachefiles_object *object, * need a second */ put_page(backpage2); + /* + * end the process if the page was not truncated + * and we have already read it before + */ + if (test_bit(CACHEFILES_MONITOR_ENTER_READ, &monitor->flags)) + return -EIO; + INIT_LIST_HEAD(&monitor->op_link); add_page_wait_queue(backpage, &monitor->monitor); @@ -118,6 +127,8 @@ static int cachefiles_read_reissue(struct cachefiles_object *object, goto unlock_discard; _debug("reissue read"); + if (data_new_version(object->fscache.cookie)) + set_bit(CACHEFILES_MONITOR_ENTER_READ, &monitor->flags); ret = bmapping->a_ops->readpage(NULL, backpage); if (ret < 0) goto discard; @@ -188,7 +199,11 @@ static void cachefiles_read_copier(struct fscache_operation *_op) error = cachefiles_read_reissue(object, monitor); if (error == -EINPROGRESS) goto next; - goto recheck; + if (!data_new_version(object->fscache.cookie) || !error) + goto recheck; + pr_warn("%s, read error: %d, at page %lu, flags: %lx\n", + __func__, error, monitor->back_page->index, + (unsigned long) monitor->back_page->flags); } else { cachefiles_io_error_obj( object, @@ -233,12 +248,13 @@ static int cachefiles_read_backing_file_one(struct cachefiles_object *object, struct cachefiles_one_read *monitor; struct address_space *bmapping; struct page *newpage, *backpage; + pgoff_t index = op->offset >> PAGE_SHIFT; int ret; _enter(""); _debug("read back %p{%lu,%d}", - netpage, netpage->index, page_count(netpage)); + netpage, index, page_count(netpage)); monitor = kzalloc(sizeof(*monitor), cachefiles_gfp); if (!monitor) @@ -254,7 +270,7 @@ static int cachefiles_read_backing_file_one(struct cachefiles_object *object, newpage = NULL; for (;;) { - backpage = find_get_page(bmapping, netpage->index); + backpage = find_get_page(bmapping, index); if (backpage) goto backing_page_already_present; @@ -265,7 +281,7 @@ static int cachefiles_read_backing_file_one(struct cachefiles_object *object, } ret = add_to_page_cache_lru(newpage, bmapping, - netpage->index, cachefiles_gfp); + index, cachefiles_gfp); if (ret == 0) goto installed_new_backing_page; if (ret != -EEXIST) @@ -281,6 +297,8 @@ static int cachefiles_read_backing_file_one(struct cachefiles_object *object, newpage = NULL; read_backing_page: + if (data_new_version(object->fscache.cookie)) + 
set_bit(CACHEFILES_MONITOR_ENTER_READ, &monitor->flags); ret = bmapping->a_ops->readpage(NULL, backpage); if (ret < 0) goto read_error; @@ -399,6 +417,8 @@ int cachefiles_read_or_alloc_page(struct fscache_retrieval *op, sector_t block; unsigned shift; int ret, ret2; + bool again = true; + loff_t pos = op->offset; object = container_of(op->op.object, struct cachefiles_object, fscache); @@ -426,14 +446,15 @@ int cachefiles_read_or_alloc_page(struct fscache_retrieval *op, * enough for this as it doesn't indicate errors, but it's all we've * got for the moment */ - block = page->index; +retry: + block = pos >> PAGE_SHIFT; block <<= shift; ret2 = bmap(inode, &block); ASSERT(ret2 == 0); _debug("%llx -> %llx", - (unsigned long long) (page->index << shift), + (unsigned long long) (pos >> PAGE_SHIFT << shift), (unsigned long long) block); if (block) { @@ -441,6 +462,13 @@ int cachefiles_read_or_alloc_page(struct fscache_retrieval *op, * read from disk */ ret = cachefiles_read_backing_file_one(object, op, page); } else if (cachefiles_has_space(cache, 0, 1) == 0) { + if (cachefiles_in_ondemand_mode(cache) && again) { + ret = cachefiles_ondemand_read(object, pos, PAGE_SIZE); + if (!ret) { + again = false; + goto retry; + } + } /* there's space in the cache we can use */ fscache_mark_page_cached(op, page); fscache_retrieval_complete(op, 1); @@ -782,6 +810,167 @@ int cachefiles_read_or_alloc_pages(struct fscache_retrieval *op, return -ENOBUFS; } +static int cachefiles_ondemand_check(struct cachefiles_object *object, + loff_t start_pos, size_t len) +{ + struct file *file = rcu_dereference_raw(object->file); + size_t remained; + loff_t pos; + int ret; + + /* make sure there's no hole in the requested range */ + pos = start_pos; + remained = len; + + while (remained) { + bool again = true; + size_t count = remained; + loff_t off, off2, new_pos; +retry: + off = vfs_llseek(file, pos, SEEK_DATA); + if (off < 0) { + if (off == (loff_t)-ENXIO) + goto ondemand_read; + return -ENODATA; + } + + if (off >= pos + remained) + goto ondemand_read; + + if (off > pos) { + count = off - pos; + goto ondemand_read; + } + + off2 = vfs_llseek(file, pos, SEEK_HOLE); + if (off2 < 0) + return -ENODATA; + + new_pos = min_t(loff_t, off2, pos + remained); + remained -= new_pos - pos; + pos = new_pos; + continue; +ondemand_read: + if (again) { + ret = cachefiles_ondemand_read(object, pos, count); + if (!ret) { + /* recheck if the hole has been filled or not */ + again = false; + goto retry; + } + } + return -ENODATA; + } + return 0; +} + +struct cachefiles_kiocb { + struct kiocb iocb; + struct fscache_retrieval *op; + struct iov_iter iter; + struct work_struct work; + struct bio_vec bvs[]; +}; + +void cachefiles_readpages_work_func(struct work_struct *work) +{ + struct cachefiles_kiocb *ki = container_of(work, struct cachefiles_kiocb, work); + int ret; + + ret = vfs_iocb_iter_read(ki->iocb.ki_filp, &ki->iocb, &ki->iter); + /* complete the request if there's any progress or error occurred */ + if (ret != -EIOCBQUEUED) { + struct fscache_retrieval *op = ki->op; + unsigned int nr_pages = atomic_read(&op->n_pages); + unsigned int done_pages = 0; + int i, error; + + if (ret > 0) + done_pages = ret / PAGE_SIZE; + + for (i = 0; i < nr_pages; i++) { + error = i < done_pages ? 
0 : -EIO; + fscache_end_io(op, ki->bvs[i].bv_page, error); + } + + fscache_retrieval_complete(op, nr_pages); + fscache_put_retrieval(op); + kfree(ki); + } +} + +int cachefiles_prepare_read(struct fscache_retrieval *op, pgoff_t index) +{ + struct cachefiles_object *object; + struct cachefiles_kiocb *ki; + loff_t start_pos = op->offset; + unsigned int n, nr_pages = atomic_read(&op->n_pages); + size_t len = nr_pages << PAGE_SHIFT; + struct page **pages; + struct file *file; + size_t size; + int i, ret; + + object = container_of(op->op.object, struct cachefiles_object, fscache); + if (!object->backer) + goto all_enobufs; + file = rcu_dereference_raw(object->file); + + /* + * 1. Check if there's hole in the requested range, and trigger an + * on-demand read request if there's any. + */ + ASSERT(start_pos % PAGE_SIZE == 0); + ret = cachefiles_ondemand_check(object, start_pos, len); + if (ret) + goto all_enobufs; + + /* + * 2. Trigger readahead on the backing file in advance. Since + * FMODE_RANDOM, the following page_cache_sync_readahead() will fallback + * to force_page_cache_readahead(). + */ + page_cache_sync_readahead(d_inode(object->backer)->i_mapping, + &file->f_ra, file, start_pos / PAGE_SIZE, nr_pages); + + size = sizeof(struct cachefiles_kiocb) + nr_pages * sizeof(struct bio_vec); + ki = kzalloc(size, GFP_KERNEL); + if (!ki) + goto all_enobufs; + + /* reuse the tailing part of ki as pages[] */ + pages = (void *)ki + size - nr_pages * sizeof(struct page *); + n = find_get_pages_contig(op->mapping, index, nr_pages, pages); + if (WARN_ON(n != nr_pages)) { + for (i = 0; i < n; i++) + put_page(pages[i]); + kfree(ki); + goto all_enobufs; + } + + for (i = 0; i < n; i++) { + put_page(pages[i]); + ki->bvs[i].bv_page = pages[i]; + ki->bvs[i].bv_offset = 0; + ki->bvs[i].bv_len = PAGE_SIZE; + } + iov_iter_bvec(&ki->iter, READ, ki->bvs, n, n * PAGE_SIZE); + + ki->iocb.ki_filp = file; + ki->iocb.ki_pos = start_pos; + ki->iocb.ki_ioprio = get_current_ioprio(); + ki->op = fscache_get_retrieval(op); + + /* 3. 
Start a buffer read in worker context */ + INIT_WORK(&ki->work, cachefiles_readpages_work_func); + queue_work(system_unbound_wq, &ki->work); + return 0; + +all_enobufs: + fscache_retrieval_complete(op, nr_pages); + return -ENOBUFS; +} + /* * allocate a block in the cache in which to store a page * - cache withdrawal is prevented by the caller diff --git a/fs/cachefiles/xattr.c b/fs/cachefiles/xattr.c index 72e42438f3d7380f8b52e3055ca6f15da54634e2..4aadace026d27ddadd99e1ffb2e7b8c7c4ad3078 100644 --- a/fs/cachefiles/xattr.c +++ b/fs/cachefiles/xattr.c @@ -18,6 +18,29 @@ static const char cachefiles_xattr_cache[] = XATTR_USER_PREFIX "CacheFiles.cache"; +#define CACHEFILES_COOKIE_TYPE_DATA 1 +#define CACHEFILES_CONTENT_NO_DATA 0 /* No content stored */ + +struct cachefiles_obj_xattr { + __be64 object_size; /* Actual size of the object */ + __be64 zero_point; /* always zero */ + __u8 type; /* Type of object */ + __u8 content; /* always zero */ + __u8 data[]; /* netfs coherency data, always NULL */ +} __packed; + +struct cachefiles_vol_xattr { + __be32 reserved; /* Reserved, should be 0 */ + __u8 data[]; /* netfs volume coherency data, NULL */ +} __packed; + +struct cachefiles_vol_xattr new_vol_xattr; + +static int cachefiles_set_new_vol_xattr(struct cachefiles_object *object); +static int cachefiles_check_new_vol_xattr(struct cachefiles_object *object); +static int cachefiles_set_new_obj_xattr(struct cachefiles_object *object); +static int cachefiles_check_new_obj_xattr(struct cachefiles_object *object); + /* * check the type label on an object * - done using xattrs @@ -110,9 +133,14 @@ int cachefiles_set_object_xattr(struct cachefiles_object *object, _debug("SET #%u", auxdata->len); clear_bit(FSCACHE_COOKIE_AUX_UPDATED, &object->fscache.cookie->flags); - ret = vfs_setxattr(dentry, cachefiles_xattr_cache, - &auxdata->type, auxdata->len, - XATTR_CREATE); + if (data_new_version(object->fscache.cookie)) + ret = cachefiles_set_new_obj_xattr(object); + else if (volume_new_version(object->fscache.cookie)) + ret = cachefiles_set_new_vol_xattr(object); + else + ret = vfs_setxattr(dentry, cachefiles_xattr_cache, + &auxdata->type, auxdata->len, + XATTR_CREATE); if (ret < 0 && ret != -ENOMEM) cachefiles_io_error_obj( object, @@ -190,48 +218,30 @@ int cachefiles_check_auxdata(struct cachefiles_object *object) return ret; } -/* - * check the state xattr on a cache file - * - return -ESTALE if the object should be deleted - */ -int cachefiles_check_object_xattr(struct cachefiles_object *object, - struct cachefiles_xattr *auxdata) +int cachefiles_check_old_object_xattr(struct cachefiles_object *object, + struct cachefiles_xattr *auxdata) { struct cachefiles_xattr *auxbuf; + unsigned int len = sizeof(struct cachefiles_xattr) + 512; struct dentry *dentry = object->dentry; int ret; - _enter("%p,#%d", object, auxdata->len); - - ASSERT(dentry); - ASSERT(d_backing_inode(dentry)); - - auxbuf = kmalloc(sizeof(struct cachefiles_xattr) + 512, cachefiles_gfp); - if (!auxbuf) { - _leave(" = -ENOMEM"); + auxbuf = kmalloc(len, cachefiles_gfp); + if (!auxbuf) return -ENOMEM; - } /* read the current type label */ ret = vfs_getxattr(dentry, cachefiles_xattr_cache, &auxbuf->type, 512 + 1); - if (ret < 0) { - if (ret == -ENODATA) - goto stale; /* no attribute - power went off - * mid-cull? 
*/ - - if (ret == -ERANGE) - goto bad_type_length; - - cachefiles_io_error_obj(object, - "Can't read xattr on %lu (err %d)", - d_backing_inode(dentry)->i_ino, -ret); + if (ret < 0) goto error; - } /* check the on-disk object */ - if (ret < 1) - goto bad_type_length; + if (ret < 1) { + pr_err("Cache object %lu xattr length incorrect\n", + d_backing_inode(dentry)->i_ino); + goto stale; + } if (auxbuf->type != auxdata->type) goto stale; @@ -287,6 +297,51 @@ int cachefiles_check_object_xattr(struct cachefiles_object *object, error: kfree(auxbuf); + return ret; + +stale: + ret = -ESTALE; + goto error; +} + +/* + * check the state xattr on a cache file + * - return -ESTALE if the object should be deleted + */ +int cachefiles_check_object_xattr(struct cachefiles_object *object, + struct cachefiles_xattr *auxdata) +{ + int ret; + struct dentry *dentry = object->dentry; + + _enter("%p,#%d", object, auxdata->len); + + ASSERT(dentry); + ASSERT(d_backing_inode(dentry)); + + if (data_new_version(object->fscache.cookie)) + ret = cachefiles_check_new_obj_xattr(object); + else if (volume_new_version(object->fscache.cookie)) + ret = cachefiles_check_new_vol_xattr(object); + else + ret = cachefiles_check_old_object_xattr(object, auxdata); + + if (ret < 0) { + if (ret == -ENOMEM || ret == -ESTALE) + goto error; + /* no attribute - power went off mid-cull? */ + if (ret == -ENODATA) + goto stale; + if (ret == -ERANGE) + goto bad_type_length; + + cachefiles_io_error_obj(object, + "Can't read xattr on %lu (err %d)", + d_backing_inode(dentry)->i_ino, -ret); + goto error; + } + ret = 0; +error: _leave(" = %d", ret); return ret; @@ -323,3 +378,63 @@ int cachefiles_remove_object_xattr(struct cachefiles_cache *cache, _leave(" = %d", ret); return ret; } + +static int cachefiles_set_new_vol_xattr(struct cachefiles_object *object) +{ + unsigned int len = sizeof(struct cachefiles_vol_xattr); + struct dentry *dentry = object->dentry; + + return vfs_setxattr(dentry, cachefiles_xattr_cache, &new_vol_xattr, + len, XATTR_CREATE); +} + +static int cachefiles_check_new_vol_xattr(struct cachefiles_object *object) +{ + int ret; + struct cachefiles_vol_xattr buf; + unsigned int len = sizeof(struct cachefiles_vol_xattr); + struct dentry *dentry = object->dentry; + + ret = vfs_getxattr(dentry, cachefiles_xattr_cache, &buf, len); + if (ret < 0) + return ret; + + if (ret != len || memcmp(&buf, &new_vol_xattr, len) != 0) + ret = -ESTALE; + + return ret > 0 ? 0 : ret; +} + +static int cachefiles_set_new_obj_xattr(struct cachefiles_object *object) +{ + unsigned int len = sizeof(struct cachefiles_obj_xattr); + struct dentry *dentry = object->dentry; + struct cachefiles_obj_xattr buf = { + .object_size = cpu_to_be64(object->fscache.store_limit_l), + .type = CACHEFILES_COOKIE_TYPE_DATA, + .content = CACHEFILES_CONTENT_NO_DATA, + }; + + return vfs_setxattr(dentry, cachefiles_xattr_cache, &buf, len, + XATTR_CREATE); +} + +static int cachefiles_check_new_obj_xattr(struct cachefiles_object *object) +{ + int ret; + struct cachefiles_obj_xattr buf; + unsigned int len = sizeof(struct cachefiles_obj_xattr); + struct dentry *dentry = object->dentry; + + ret = vfs_getxattr(dentry, cachefiles_xattr_cache, &buf, len); + if (ret < 0) + return ret; + + if (ret != len || + buf.type != CACHEFILES_COOKIE_TYPE_DATA || + buf.content != CACHEFILES_CONTENT_NO_DATA || + buf.object_size != cpu_to_be64(object->fscache.store_limit_l)) + ret = -ESTALE; + + return ret > 0 ? 
0 : ret; +} diff --git a/fs/erofs/Kconfig b/fs/erofs/Kconfig index 74b0aaa7114c238788d9e78ad9a0c9439b368f79..f7222cba2a678591515289b9d1c702e3487160a8 100644 --- a/fs/erofs/Kconfig +++ b/fs/erofs/Kconfig @@ -5,16 +5,22 @@ config EROFS_FS depends on BLOCK select LIBCRC32C help - EROFS (Enhanced Read-Only File System) is a lightweight - read-only file system with modern designs (eg. page-sized - blocks, inline xattrs/data, etc.) for scenarios which need - high-performance read-only requirements, e.g. Android OS - for mobile phones and LIVECDs. - - It also provides fixed-sized output compression support, - which improves storage density, keeps relatively higher - compression ratios, which is more useful to achieve high - performance for embedded devices with limited memory. + EROFS (Enhanced Read-Only File System) is a lightweight read-only + file system with modern designs (e.g. no buffer heads, inline + xattrs/data, chunk-based deduplication, multiple devices, etc.) for + scenarios which need high-performance read-only solutions, e.g. + smartphones with Android OS, LiveCDs and high-density hosts with + numerous containers; + + It also provides fixed-sized output compression support in order to + improve storage density as well as keep relatively higher compression + ratios and implements in-place decompression to reuse the file page + for compressed data temporarily with proper strategies, which is + quite useful to ensure guaranteed end-to-end runtime decompression + performance under extremely memory pressure without extra cost. + + See the documentation at + for more details. If unsure, say N. @@ -90,3 +96,12 @@ config EROFS_FS_CLUSTER_PAGE_LIMIT less than 2. Otherwise, the image will be refused to mount on this kernel. +config EROFS_FS_ONDEMAND + bool "EROFS fscache-based on-demand read support" + depends on CACHEFILES_ONDEMAND && (EROFS_FS=m && FSCACHE || EROFS_FS=y && FSCACHE=y) + default n + help + This permits EROFS to use fscache-backed data blobs with on-demand + read support. + + If unsure, say N. \ No newline at end of file diff --git a/fs/erofs/Makefile b/fs/erofs/Makefile index 46f2aa4ba46c226e2b85aa2f82aefa508b586a1a..b75be64ffc0be706780df8b45c1e1e1f7a4f2ff0 100644 --- a/fs/erofs/Makefile +++ b/fs/erofs/Makefile @@ -8,4 +8,4 @@ obj-$(CONFIG_EROFS_FS) += erofs.o erofs-objs := super.o inode.o data.o namei.o dir.o utils.o erofs-$(CONFIG_EROFS_FS_XATTR) += xattr.o erofs-$(CONFIG_EROFS_FS_ZIP) += decompressor.o zmap.o zdata.o - +erofs-$(CONFIG_EROFS_FS_ONDEMAND) += fscache.o \ No newline at end of file diff --git a/fs/erofs/compress.h b/fs/erofs/compress.h index 3d452443c545bb928d857eb2f933cb2fdf5dae73..e9b1aa0afe0ec4dd235135f05ce7bd5d5ea960ad 100644 --- a/fs/erofs/compress.h +++ b/fs/erofs/compress.h @@ -2,7 +2,6 @@ /* * Copyright (C) 2019 HUAWEI, Inc. * https://www.huawei.com/ - * Created by Gao Xiang */ #ifndef __EROFS_FS_COMPRESS_H #define __EROFS_FS_COMPRESS_H @@ -57,4 +56,3 @@ int z_erofs_decompress(struct z_erofs_decompress_req *rq, struct list_head *pagepool); #endif - diff --git a/fs/erofs/data.c b/fs/erofs/data.c index ea4f693bee2247327f870af0e6ba3da77b487e58..e228a2d52aaf87e4f0f0e4e9268c58b5ae372c94 100644 --- a/fs/erofs/data.c +++ b/fs/erofs/data.c @@ -2,7 +2,7 @@ /* * Copyright (C) 2017-2018 HUAWEI, Inc. 
* https://www.huawei.com/ - * Created by Gao Xiang + * Copyright (C) 2021, Alibaba Cloud */ #include "internal.h" #include @@ -32,44 +32,82 @@ static void erofs_readendio(struct bio *bio) bio_put(bio); } -struct page *erofs_get_meta_page(struct super_block *sb, erofs_blk_t blkaddr) +static struct page *erofs_read_meta_page(struct super_block *sb, pgoff_t index) { - struct address_space *const mapping = sb->s_bdev->bd_inode->i_mapping; + struct address_space *mapping; struct page *page; - page = read_cache_page_gfp(mapping, blkaddr, + if (erofs_is_fscache_mode(sb)) + mapping = EROFS_SB(sb)->s_fscache->inode->i_mapping; + else + mapping = sb->s_bdev->bd_inode->i_mapping; + page = read_cache_page_gfp(mapping, index, mapping_gfp_constraint(mapping, ~__GFP_FS)); - /* should already be PageUptodate */ - if (!IS_ERR(page)) - lock_page(page); return page; } +void erofs_unmap_metabuf(struct erofs_buf *buf) +{ + if (buf->kmap_type == EROFS_KMAP) + kunmap(buf->page); + else if (buf->kmap_type == EROFS_KMAP_ATOMIC) + kunmap_atomic(buf->base); + buf->base = NULL; + buf->kmap_type = EROFS_NO_KMAP; +} + +void erofs_put_metabuf(struct erofs_buf *buf) +{ + if (!buf->page) + return; + erofs_unmap_metabuf(buf); + put_page(buf->page); + buf->page = NULL; +} + +void *erofs_read_metabuf(struct erofs_buf *buf, struct super_block *sb, + erofs_blk_t blkaddr, enum erofs_kmap_type type) +{ + erofs_off_t offset = blknr_to_addr(blkaddr); + pgoff_t index = offset >> PAGE_SHIFT; + struct page *page = buf->page; + + if (!page || page->index != index) { + erofs_put_metabuf(buf); + page = erofs_read_meta_page(sb, index); + if (IS_ERR(page)) + return page; + /* should already be PageUptodate, no need to lock page */ + buf->page = page; + } + if (buf->kmap_type == EROFS_NO_KMAP) { + if (type == EROFS_KMAP) + buf->base = kmap(page); + else if (type == EROFS_KMAP_ATOMIC) + buf->base = kmap_atomic(page); + buf->kmap_type = type; + } else if (buf->kmap_type != type) { + DBG_BUGON(1); + return ERR_PTR(-EFAULT); + } + if (type == EROFS_NO_KMAP) + return NULL; + return buf->base + (offset & ~PAGE_MASK); +} + static int erofs_map_blocks_flatmode(struct inode *inode, - struct erofs_map_blocks *map, - int flags) + struct erofs_map_blocks *map) { - int err = 0; erofs_blk_t nblocks, lastblk; u64 offset = map->m_la; struct erofs_inode *vi = EROFS_I(inode); bool tailendpacking = (vi->datalayout == EROFS_INODE_FLAT_INLINE); - trace_erofs_map_blocks_flatmode_enter(inode, map, flags); - - nblocks = DIV_ROUND_UP(inode->i_size, PAGE_SIZE); + nblocks = DIV_ROUND_UP(inode->i_size, EROFS_BLKSIZ); lastblk = nblocks - tailendpacking; - if (offset >= inode->i_size) { - /* leave out-of-bound access unmapped */ - map->m_flags = 0; - map->m_plen = 0; - goto out; - } - /* there is no hole in flatmode */ map->m_flags = EROFS_MAP_MAPPED; - if (offset < blknr_to_addr(lastblk)) { map->m_pa = blknr_to_addr(vi->raw_blkaddr) + map->m_la; map->m_plen = blknr_to_addr(lastblk) - offset; @@ -81,47 +119,144 @@ static int erofs_map_blocks_flatmode(struct inode *inode, vi->xattr_isize + erofs_blkoff(map->m_la); map->m_plen = inode->i_size - offset; - /* inline data should be located in one meta block */ - if (erofs_blkoff(map->m_pa) + map->m_plen > PAGE_SIZE) { + /* inline data should be located in the same meta block */ + if (erofs_blkoff(map->m_pa) + map->m_plen > EROFS_BLKSIZ) { erofs_err(inode->i_sb, "inline data cross block boundary @ nid %llu", vi->nid); DBG_BUGON(1); - err = -EFSCORRUPTED; - goto err_out; + return -EFSCORRUPTED; } - map->m_flags |= 
EROFS_MAP_META; } else { erofs_err(inode->i_sb, "internal error @ nid: %llu (size %llu), m_la 0x%llx", vi->nid, inode->i_size, map->m_la); DBG_BUGON(1); - err = -EIO; - goto err_out; + return -EIO; } + return 0; +} -out: - map->m_llen = map->m_plen; +int erofs_map_blocks(struct inode *inode, struct erofs_map_blocks *map) +{ + struct super_block *sb = inode->i_sb; + struct erofs_inode *vi = EROFS_I(inode); + struct erofs_inode_chunk_index *idx; + struct erofs_buf buf = __EROFS_BUF_INITIALIZER; + u64 chunknr; + unsigned int unit; + erofs_off_t pos; + void *kaddr; + int err = 0; -err_out: - trace_erofs_map_blocks_flatmode_exit(inode, map, flags, 0); + trace_erofs_map_blocks_enter(inode, map, 0); + map->m_deviceid = 0; + if (map->m_la >= inode->i_size) { + /* leave out-of-bound access unmapped */ + map->m_flags = 0; + map->m_plen = 0; + goto out; + } + + if (vi->datalayout != EROFS_INODE_CHUNK_BASED) { + err = erofs_map_blocks_flatmode(inode, map); + goto out; + } + + if (vi->chunkformat & EROFS_CHUNK_FORMAT_INDEXES) + unit = sizeof(*idx); /* chunk index */ + else + unit = EROFS_BLOCK_MAP_ENTRY_SIZE; /* block map */ + + chunknr = map->m_la >> vi->chunkbits; + pos = ALIGN(iloc(EROFS_SB(sb), vi->nid) + vi->inode_isize + + vi->xattr_isize, unit) + unit * chunknr; + + kaddr = erofs_read_metabuf(&buf, sb, erofs_blknr(pos), EROFS_KMAP); + if (IS_ERR(kaddr)) { + err = PTR_ERR(kaddr); + goto out; + } + map->m_la = chunknr << vi->chunkbits; + map->m_plen = min_t(erofs_off_t, 1UL << vi->chunkbits, + roundup(inode->i_size - map->m_la, EROFS_BLKSIZ)); + + /* handle block map */ + if (!(vi->chunkformat & EROFS_CHUNK_FORMAT_INDEXES)) { + __le32 *blkaddr = kaddr + erofs_blkoff(pos); + + if (le32_to_cpu(*blkaddr) == EROFS_NULL_ADDR) { + map->m_flags = 0; + } else { + map->m_pa = blknr_to_addr(le32_to_cpu(*blkaddr)); + map->m_flags = EROFS_MAP_MAPPED; + } + goto out_unlock; + } + /* parse chunk indexes */ + idx = kaddr + erofs_blkoff(pos); + switch (le32_to_cpu(idx->blkaddr)) { + case EROFS_NULL_ADDR: + map->m_flags = 0; + break; + default: + map->m_deviceid = le16_to_cpu(idx->device_id) & + EROFS_SB(sb)->device_id_mask; + map->m_pa = blknr_to_addr(le32_to_cpu(idx->blkaddr)); + map->m_flags = EROFS_MAP_MAPPED; + break; + } +out_unlock: + erofs_put_metabuf(&buf); +out: + if (!err) + map->m_llen = map->m_plen; + trace_erofs_map_blocks_exit(inode, map, 0, err); return err; } -int erofs_map_blocks(struct inode *inode, - struct erofs_map_blocks *map, int flags) +int erofs_map_dev(struct super_block *sb, struct erofs_map_dev *map) { - if (erofs_inode_is_data_compressed(EROFS_I(inode)->datalayout)) { - int err = z_erofs_map_blocks_iter(inode, map, flags); - - if (map->mpage) { - put_page(map->mpage); - map->mpage = NULL; + struct erofs_dev_context *devs = EROFS_SB(sb)->devs; + struct erofs_device_info *dif; + int id; + + /* primary device by default */ + map->m_bdev = sb->s_bdev; + map->m_fscache = EROFS_SB(sb)->s_fscache; + + if (map->m_deviceid) { + down_read(&devs->rwsem); + dif = idr_find(&devs->tree, map->m_deviceid - 1); + if (!dif) { + up_read(&devs->rwsem); + return -ENODEV; } - return err; + map->m_bdev = dif->bdev; + map->m_fscache = dif->fscache; + up_read(&devs->rwsem); + } else if (devs->extra_devices) { + down_read(&devs->rwsem); + idr_for_each_entry(&devs->tree, dif, id) { + erofs_off_t startoff, length; + + if (!dif->mapped_blkaddr) + continue; + startoff = blknr_to_addr(dif->mapped_blkaddr); + length = blknr_to_addr(dif->blocks); + + if (map->m_pa >= startoff && + map->m_pa < startoff + length) { + 
map->m_pa -= startoff; + map->m_bdev = dif->bdev; + map->m_fscache = dif->fscache; + break; + } + } + up_read(&devs->rwsem); } - return erofs_map_blocks_flatmode(inode, map, flags); + return 0; } static inline struct bio *erofs_read_raw_page(struct bio *bio, @@ -134,6 +269,7 @@ static inline struct bio *erofs_read_raw_page(struct bio *bio, struct inode *const inode = mapping->host; struct super_block *const sb = inode->i_sb; erofs_off_t current_block = (erofs_off_t)page->index; + erofs_off_t pos = blknr_to_addr(current_block); int err; DBG_BUGON(!nblocks); @@ -154,12 +290,21 @@ static inline struct bio *erofs_read_raw_page(struct bio *bio, if (!bio) { struct erofs_map_blocks map = { - .m_la = blknr_to_addr(current_block), + .m_la = pos, }; + struct erofs_map_dev mdev; erofs_blk_t blknr; unsigned int blkoff; - err = erofs_map_blocks(inode, &map, EROFS_GET_BLOCKS_RAW); + err = erofs_map_blocks(inode, &map); + if (err) + goto err_out; + + mdev = (struct erofs_map_dev) { + .m_deviceid = map.m_deviceid, + .m_pa = map.m_pa, + }; + err = erofs_map_dev(sb, &mdev); if (err) goto err_out; @@ -175,36 +320,31 @@ static inline struct bio *erofs_read_raw_page(struct bio *bio, /* for RAW access mode, m_plen must be equal to m_llen */ DBG_BUGON(map.m_plen != map.m_llen); - blknr = erofs_blknr(map.m_pa); - blkoff = erofs_blkoff(map.m_pa); + blknr = erofs_blknr(mdev.m_pa + (pos - map.m_la)); + blkoff = erofs_blkoff(mdev.m_pa + (pos - map.m_la)); /* deal with inline page */ if (map.m_flags & EROFS_MAP_META) { void *vsrc, *vto; - struct page *ipage; + struct erofs_buf buf = __EROFS_BUF_INITIALIZER; DBG_BUGON(map.m_plen > PAGE_SIZE); - ipage = erofs_get_meta_page(inode->i_sb, blknr); - - if (IS_ERR(ipage)) { - err = PTR_ERR(ipage); + vsrc = erofs_read_metabuf(&buf, inode->i_sb, + blknr, EROFS_KMAP_ATOMIC); + if (IS_ERR(vsrc)) { + err = PTR_ERR(vsrc); goto err_out; } - - vsrc = kmap_atomic(ipage); vto = kmap_atomic(page); memcpy(vto, vsrc + blkoff, map.m_plen); memset(vto + map.m_plen, 0, PAGE_SIZE - map.m_plen); kunmap_atomic(vto); - kunmap_atomic(vsrc); flush_dcache_page(page); SetPageUptodate(page); - /* TODO: could we unlock the page earlier? */ - unlock_page(ipage); - put_page(ipage); + erofs_put_metabuf(&buf); /* imply err = 0, see erofs_map_blocks */ goto has_updated; } @@ -221,7 +361,7 @@ static inline struct bio *erofs_read_raw_page(struct bio *bio, bio = bio_alloc(GFP_NOIO, nblocks); bio->bi_end_io = erofs_readendio; - bio_set_dev(bio, sb->s_bdev); + bio_set_dev(bio, mdev.m_bdev); bio->bi_iter.bi_sector = (sector_t)blknr << LOG_SECTORS_PER_BLOCK; bio->bi_opf = REQ_OP_READ | (ra ? REQ_RAHEAD : 0); @@ -326,7 +466,7 @@ static sector_t erofs_bmap(struct address_space *mapping, sector_t block) return 0; } - if (!erofs_map_blocks(inode, &map, EROFS_GET_BLOCKS_RAW)) + if (!erofs_map_blocks(inode, &map)) return erofs_blknr(map.m_pa); return 0; @@ -338,4 +478,3 @@ const struct address_space_operations erofs_raw_access_aops = { .readahead = erofs_raw_access_readahead, .bmap = erofs_bmap, }; - diff --git a/fs/erofs/decompressor.c b/fs/erofs/decompressor.c index 36693924db182bdd3819846712adb60b17c9a42d..45be046c322fe960d5e9f429e25502b861c479c3 100644 --- a/fs/erofs/decompressor.c +++ b/fs/erofs/decompressor.c @@ -2,7 +2,6 @@ /* * Copyright (C) 2019 HUAWEI, Inc. 
* https://www.huawei.com/ - * Created by Gao Xiang */ #include "compress.h" #include @@ -134,8 +133,7 @@ static int z_erofs_lz4_decompress(struct z_erofs_decompress_req *rq, u8 *out, support_0padding = false; /* decompression inplace is only safe when 0padding is enabled */ - if (EROFS_SB(rq->sb)->feature_incompat & - EROFS_FEATURE_INCOMPAT_LZ4_0PADDING) { + if (erofs_sb_has_lz4_0padding(EROFS_SB(rq->sb))) { support_0padding = true; while (!src[inputmargin & ~PAGE_MASK]) @@ -350,4 +348,3 @@ int z_erofs_decompress(struct z_erofs_decompress_req *rq, return z_erofs_shifted_transform(rq, pagepool); return z_erofs_decompress_generic(rq, pagepool); } - diff --git a/fs/erofs/dir.c b/fs/erofs/dir.c index 2776bb832127d79d55c91da1b0a1a6fe4705a27e..eee9b0b31b639b17f35e63158abfa5b7cd72dcf8 100644 --- a/fs/erofs/dir.c +++ b/fs/erofs/dir.c @@ -2,7 +2,6 @@ /* * Copyright (C) 2017-2018 HUAWEI, Inc. * https://www.huawei.com/ - * Created by Gao Xiang */ #include "internal.h" @@ -139,4 +138,3 @@ const struct file_operations erofs_dir_fops = { .read = generic_read_dir, .iterate_shared = erofs_readdir, }; - diff --git a/fs/erofs/erofs_fs.h b/fs/erofs/erofs_fs.h index e8d04d808fa628d6206bbc0339f0471953faebae..9e271297642994fa1dcc9623f047701679ed16ba 100644 --- a/fs/erofs/erofs_fs.h +++ b/fs/erofs/erofs_fs.h @@ -4,7 +4,7 @@ * * Copyright (C) 2017-2018 HUAWEI, Inc. * https://www.huawei.com/ - * Created by Gao Xiang + * Copyright (C) 2021, Alibaba Cloud */ #ifndef __EROFS_FS_H #define __EROFS_FS_H @@ -18,7 +18,22 @@ * be incompatible with this kernel version. */ #define EROFS_FEATURE_INCOMPAT_LZ4_0PADDING 0x00000001 -#define EROFS_ALL_FEATURE_INCOMPAT EROFS_FEATURE_INCOMPAT_LZ4_0PADDING +#define EROFS_FEATURE_INCOMPAT_CHUNKED_FILE 0x00000004 +#define EROFS_FEATURE_INCOMPAT_DEVICE_TABLE 0x00000008 +#define EROFS_ALL_FEATURE_INCOMPAT \ + (EROFS_FEATURE_INCOMPAT_LZ4_0PADDING | \ + EROFS_FEATURE_INCOMPAT_CHUNKED_FILE | \ + EROFS_FEATURE_INCOMPAT_DEVICE_TABLE) + +#define EROFS_SB_EXTSLOT_SIZE 16 + +struct erofs_deviceslot { + u8 tag[64]; /* digest(sha256), etc. */ + __le32 blocks; /* total fs blocks of this device */ + __le32 mapped_blkaddr; /* map starting at mapped_blkaddr */ + u8 reserved[56]; +}; +#define EROFS_DEVT_SLOT_SIZE sizeof(struct erofs_deviceslot) /* 128-byte erofs on-disk super block */ struct erofs_super_block { @@ -39,7 +54,10 @@ struct erofs_super_block { __u8 uuid[16]; /* 128-bit uuid for volume */ __u8 volume_name[16]; /* volume name */ __le32 feature_incompat; - __u8 reserved2[44]; + __le16 reserved2; + __le16 extra_devices; /* # of devices besides the primary device */ + __le16 devt_slotoff; /* startoff = devt_slotoff * devt_slotsize */ + __u8 reserved3[38]; }; /* @@ -52,13 +70,16 @@ struct erofs_super_block { * inode, [xattrs], last_inline_data, ... | ... | no-holed data * 3 - inode compression D: * inode, [xattrs], map_header, extents ... | ... - * 4~7 - reserved + * 4 - inode chunk-based E: + * inode, [xattrs], chunk indexes ... | ... 
+ * 5~7 - reserved */ enum { EROFS_INODE_FLAT_PLAIN = 0, EROFS_INODE_FLAT_COMPRESSION_LEGACY = 1, EROFS_INODE_FLAT_INLINE = 2, EROFS_INODE_FLAT_COMPRESSION = 3, + EROFS_INODE_CHUNK_BASED = 4, EROFS_INODE_DATALAYOUT_MAX }; @@ -78,6 +99,19 @@ static inline bool erofs_inode_is_data_compressed(unsigned int datamode) #define EROFS_I_ALL \ ((1 << (EROFS_I_DATALAYOUT_BIT + EROFS_I_DATALAYOUT_BITS)) - 1) +/* indicate chunk blkbits, thus 'chunksize = blocksize << chunk blkbits' */ +#define EROFS_CHUNK_FORMAT_BLKBITS_MASK 0x001F +/* with chunk indexes or just a 4-byte blkaddr array */ +#define EROFS_CHUNK_FORMAT_INDEXES 0x0020 + +#define EROFS_CHUNK_FORMAT_ALL \ + (EROFS_CHUNK_FORMAT_BLKBITS_MASK | EROFS_CHUNK_FORMAT_INDEXES) + +struct erofs_inode_chunk_info { + __le16 format; /* chunk blkbits, etc. */ + __le16 reserved; +}; + /* 32-byte reduced form of an ondisk inode */ struct erofs_inode_compact { __le16 i_format; /* inode format hints */ @@ -95,6 +129,9 @@ struct erofs_inode_compact { /* for device files, used to indicate old/new device # */ __le32 rdev; + + /* for chunk-based files, it contains the summary info */ + struct erofs_inode_chunk_info c; } i_u; __le32 i_ino; /* only used for 32-bit stat compatibility */ __le16 i_uid; @@ -123,6 +160,9 @@ struct erofs_inode_extended { /* for device files, used to indicate old/new device # */ __le32 rdev; + + /* for chunk-based files, it contains the summary info */ + struct erofs_inode_chunk_info c; } i_u; /* only used for 32-bit stat compatibility */ @@ -192,6 +232,19 @@ static inline unsigned int erofs_xattr_entry_size(struct erofs_xattr_entry *e) e->e_name_len + le16_to_cpu(e->e_value_size)); } +/* represent a zeroed chunk (hole) */ +#define EROFS_NULL_ADDR -1 + +/* 4-byte block address array */ +#define EROFS_BLOCK_MAP_ENTRY_SIZE sizeof(__le32) + +/* 8-byte inode chunk indexes */ +struct erofs_inode_chunk_index { + __le16 advise; /* always 0, don't care for now */ + __le16 device_id; /* back-end storage id (with bits masked) */ + __le32 blkaddr; /* start block address of this inode chunk */ +}; + /* available compression algorithm types (for h_algorithmtype) */ enum { Z_EROFS_COMPRESSION_LZ4 = 0, @@ -308,13 +361,18 @@ static inline void erofs_check_ondisk_layout_definitions(void) BUILD_BUG_ON(sizeof(struct erofs_inode_extended) != 64); BUILD_BUG_ON(sizeof(struct erofs_xattr_ibody_header) != 12); BUILD_BUG_ON(sizeof(struct erofs_xattr_entry) != 4); + BUILD_BUG_ON(sizeof(struct erofs_inode_chunk_info) != 4); + BUILD_BUG_ON(sizeof(struct erofs_inode_chunk_index) != 8); BUILD_BUG_ON(sizeof(struct z_erofs_map_header) != 8); BUILD_BUG_ON(sizeof(struct z_erofs_vle_decompressed_index) != 8); BUILD_BUG_ON(sizeof(struct erofs_dirent) != 12); + /* keep in sync between 2 index structures for better extendibility */ + BUILD_BUG_ON(sizeof(struct erofs_inode_chunk_index) != + sizeof(struct z_erofs_vle_decompressed_index)); + BUILD_BUG_ON(sizeof(struct erofs_deviceslot) != 128); BUILD_BUG_ON(BIT(Z_EROFS_VLE_DI_CLUSTER_TYPE_BITS) < Z_EROFS_VLE_CLUSTER_TYPE_MAX - 1); } #endif - diff --git a/fs/erofs/fscache.c b/fs/erofs/fscache.c new file mode 100644 index 0000000000000000000000000000000000000000..cfa000c6175ba6bc642f6ac80523bce614524bad --- /dev/null +++ b/fs/erofs/fscache.c @@ -0,0 +1,568 @@ +/* + * Copyright (C) 2022, Alibaba Cloud + * Copyright (C) 2022, Bytedance Inc. All rights reserved. 
+ */ +#include +#include +#include +#include +#include "internal.h" + +static DEFINE_MUTEX(erofs_domain_list_lock); +static DEFINE_MUTEX(erofs_domain_cookies_lock); +static LIST_HEAD(erofs_domain_list); +static LIST_HEAD(erofs_domain_cookies_list); +static struct vfsmount *erofs_pseudo_mnt; + +static int erofs_anon_init_fs_context(struct fs_context *fc) +{ + return init_pseudo(fc, EROFS_SUPER_MAGIC) ? 0 : -ENOMEM; +} + +static struct file_system_type erofs_anon_fs_type = { + .owner = THIS_MODULE, + .name = "pseudo_erofs", + .init_fs_context = erofs_anon_init_fs_context, + .kill_sb = kill_anon_super, +}; + +const struct fscache_cookie_def erofs_fscache_super_index_def = { + .name = "EROFS.super", + .type = FSCACHE_COOKIE_TYPE_INDEX, + .check_aux = NULL, + .new_version = true, +}; + +const struct fscache_cookie_def erofs_fscache_inode_object_def = { + .name = "EROFS.uniqueid", + .type = FSCACHE_COOKIE_TYPE_DATAFILE, +}; + +static void erofs_readpage_from_fscache_complete(struct page *page, void *ctx, + int error) +{ + if (!error) + SetPageUptodate(page); + unlock_page(page); +} + +static void erofs_readahead_from_fscache_complete(struct page *page, void *ctx, + int error) +{ + erofs_readpage_from_fscache_complete(page, ctx, error); + put_page(page); +} + +static int erofs_fscache_meta_readpage(struct file *data, struct page *page) +{ + int ret; + struct super_block *sb = page->mapping->host->i_sb; + struct erofs_fscache *ctx = page->mapping->host->i_private; + + ret = fscache_read_or_alloc_page(ctx->cookie, page, + erofs_readpage_from_fscache_complete, + NULL, + GFP_KERNEL); + switch (ret) { + case 0: /* page found in fscache, read submitted */ + erofs_dbg("%s: submitted", __func__); + return ret; + case -ENOBUFS: /* page won't be cached */ + case -ENODATA: /* page not in cache */ + erofs_err(sb, "%s: %d", __func__, ret); + ret = -EIO; + goto out; + default: + erofs_err(sb, "unknown error ret = %d", ret); + } + +out: + unlock_page(page); + return ret; +} + +static int erofs_fscache_release_page(struct page *page, gfp_t gfp) +{ + if (WARN_ON(PagePrivate(page))) + return 0; + + ClearPageFsCache(page); + return 1; +} + +static void erofs_fscache_invalidate_page(struct page *page, unsigned int offset, + unsigned int length) +{ + if (offset == 0 && length == PAGE_SIZE) + ClearPageFsCache(page); +} + +static int erofs_fscache_readpage_inline(struct page *page, + struct erofs_map_blocks *map) +{ + struct super_block *sb = page->mapping->host->i_sb; + struct erofs_buf buf = __EROFS_BUF_INITIALIZER; + erofs_blk_t blknr; + size_t offset, len; + void *src, *dst; + + /* For tail packing layout, the offset may be non-zero. 
*/ + offset = erofs_blkoff(map->m_pa); + blknr = erofs_blknr(map->m_pa); + len = map->m_llen; + + src = erofs_read_metabuf(&buf, sb, blknr, EROFS_KMAP_ATOMIC); + if (IS_ERR(src)) + return PTR_ERR(src); + + dst = kmap_atomic(page); + memcpy(dst, src + offset, len); + memset(dst + len, 0, PAGE_SIZE - len); + kunmap_atomic(dst); + + erofs_put_metabuf(&buf); + SetPageUptodate(page); + return 0; +} + +static int erofs_fscache_readpage(struct file *file, struct page *page) +{ + struct inode *inode = page->mapping->host; + struct super_block *sb = inode->i_sb; + struct erofs_map_blocks map; + struct erofs_map_dev mdev; + erofs_off_t pos = page_offset(page); + loff_t pstart; + int ret; + + map.m_la = pos; + ret = erofs_map_blocks(inode, &map); + if (ret) + goto out_unlock; + + if (!(map.m_flags & EROFS_MAP_MAPPED)) { + zero_user_segment(page, 0, PAGE_SIZE); + SetPageUptodate(page); + goto out_unlock; + } + + if (map.m_flags & EROFS_MAP_META) { + ret = erofs_fscache_readpage_inline(page, &map); + goto out_unlock; + } + + mdev = (struct erofs_map_dev) { + .m_deviceid = map.m_deviceid, + .m_pa = map.m_pa, + }; + + ret = erofs_map_dev(sb, &mdev); + if (ret) + goto out_unlock; + + pstart = mdev.m_pa + (pos - map.m_la); + ret = fscache_read_or_alloc_page2(mdev.m_fscache->cookie, page, + erofs_readpage_from_fscache_complete, + NULL, + GFP_KERNEL, pstart); + switch (ret) { + case 0: /* page found in fscache, read submitted */ + erofs_dbg("%s: submitted", __func__); + return ret; + case -ENOBUFS: /* page won't be cached */ + case -ENODATA: /* page not in cache */ + erofs_err(sb, "%s: %d", __func__, ret); + ret = -EIO; + goto out_unlock; + default: + erofs_err(sb, "unknown error ret = %d", ret); + } + +out_unlock: + unlock_page(page); + return ret; +} + +static void erofs_fscache_readahead(struct readahead_control *rac) +{ + struct inode *inode = rac->mapping->host; + struct super_block *sb = inode->i_sb; + struct page *page; + size_t len, count, done = 0; + erofs_off_t pos; + loff_t start, start_pos; + int ret; + + if (!readahead_count(rac)) + return; + + start = readahead_pos(rac); + len = readahead_length(rac); + + do { + struct erofs_map_blocks map; + struct erofs_map_dev mdev; + + pos = start + done; + + map.m_la = pos; + ret = erofs_map_blocks(inode, &map); + if (ret) + return; + + if (!(map.m_flags & EROFS_MAP_MAPPED)) { + page = readahead_page(rac); + zero_user_segment(page, 0, PAGE_SIZE); + SetPageUptodate(page); + unlock_page(page); + put_page(page); + done += PAGE_SIZE; + continue; + } + + if (map.m_flags & EROFS_MAP_META) { + page = readahead_page(rac); + ret = erofs_fscache_readpage_inline(page, &map); + unlock_page(page); + put_page(page); + done += PAGE_SIZE; + continue; + } + + mdev = (struct erofs_map_dev) { + .m_deviceid = map.m_deviceid, + .m_pa = map.m_pa, + }; + + ret = erofs_map_dev(sb, &mdev); + if (ret) + return; + + start_pos = mdev.m_pa + (pos - map.m_la); + count = min_t(size_t, map.m_llen - (pos - map.m_la), len - done); + ret = fscache_prepare_read(mdev.m_fscache->cookie, rac->mapping, + pos / PAGE_SIZE, count / PAGE_SIZE, start_pos, + erofs_readahead_from_fscache_complete, NULL); + if (ret) { + erofs_err(sb, "%s: prepare_read %d", __func__, ret); + return; + } + + done += count; + count /= PAGE_SIZE; + rac->_nr_pages -= count; + rac->_index += count; + } while (done < len); +} + +static const struct address_space_operations erofs_fscache_meta_aops = { + .readpage = erofs_fscache_meta_readpage, + .releasepage = erofs_fscache_release_page, + .invalidatepage = 
erofs_fscache_invalidate_page, +}; + +const struct address_space_operations erofs_fscache_access_aops = { + .readpage = erofs_fscache_readpage, + .readahead = erofs_fscache_readahead, + .releasepage = erofs_fscache_release_page, + .invalidatepage = erofs_fscache_invalidate_page, +}; + +static void erofs_fscache_domain_put(struct erofs_domain *domain) +{ + mutex_lock(&erofs_domain_list_lock); + if (refcount_dec_and_test(&domain->ref)) { + list_del(&domain->list); + if (list_empty(&erofs_domain_list)) { + kern_unmount(erofs_pseudo_mnt); + erofs_pseudo_mnt = NULL; + } + fscache_relinquish_cookie(domain->volume, NULL, false); + mutex_unlock(&erofs_domain_list_lock); + kfree(domain->domain_id); + kfree(domain); + return; + } + mutex_unlock(&erofs_domain_list_lock); +} + +static int erofs_fscache_register_volume(struct super_block *sb) +{ + struct erofs_sb_info *sbi = EROFS_SB(sb); + char *domain_id = sbi->domain_id; + struct fscache_cookie *volume; + char *name; + int ret = 0; + + name = kasprintf(GFP_KERNEL, "erofs,%s", + domain_id ? domain_id : sbi->fsid); + if (!name) + return -ENOMEM; + + volume = fscache_acquire_cookie(&fscache_fsdef_index, + &erofs_fscache_super_index_def, name, strlen(name), + NULL, 0, NULL, 0, true); + if (IS_ERR_OR_NULL(volume)) { + erofs_err(sb, "failed to register volume for %s", name); + ret = volume ? PTR_ERR(volume) : -EOPNOTSUPP; + volume = NULL; + } + + sbi->volume = volume; + kfree(name); + return ret; +} + +static int erofs_fscache_init_domain(struct super_block *sb) +{ + int err; + struct erofs_domain *domain; + struct erofs_sb_info *sbi = EROFS_SB(sb); + + domain = kzalloc(sizeof(struct erofs_domain), GFP_KERNEL); + if (!domain) + return -ENOMEM; + + domain->domain_id = kstrdup(sbi->domain_id, GFP_KERNEL); + if (!domain->domain_id) { + kfree(domain); + return -ENOMEM; + } + + err = erofs_fscache_register_volume(sb); + if (err) + goto out; + + if (!erofs_pseudo_mnt) { + struct vfsmount *mnt = kern_mount(&erofs_anon_fs_type); + + if (IS_ERR(mnt)) { + err = PTR_ERR(mnt); + goto out; + } + erofs_pseudo_mnt = mnt; + } + + domain->volume = sbi->volume; + refcount_set(&domain->ref, 1); + list_add(&domain->list, &erofs_domain_list); + sbi->domain = domain; + return 0; +out: + kfree(domain->domain_id); + kfree(domain); + return err; +} + +static int erofs_fscache_register_domain(struct super_block *sb) +{ + int err; + struct erofs_domain *domain; + struct erofs_sb_info *sbi = EROFS_SB(sb); + + mutex_lock(&erofs_domain_list_lock); + list_for_each_entry(domain, &erofs_domain_list, list) { + if (!strcmp(domain->domain_id, sbi->domain_id)) { + sbi->domain = domain; + sbi->volume = domain->volume; + refcount_inc(&domain->ref); + mutex_unlock(&erofs_domain_list_lock); + return 0; + } + } + err = erofs_fscache_init_domain(sb); + mutex_unlock(&erofs_domain_list_lock); + return err; +} + +static struct erofs_fscache *erofs_fscache_acquire_cookie(struct super_block *sb, + char *name, unsigned int flags) +{ + struct erofs_fscache *ctx; + struct fscache_cookie *cookie; + struct super_block *isb; + struct inode *inode; + int ret; + + ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); + if (!ctx) + return ERR_PTR(-ENOMEM); + INIT_LIST_HEAD(&ctx->node); + refcount_set(&ctx->ref, 1); + + cookie = fscache_acquire_cookie(EROFS_SB(sb)->volume, + &erofs_fscache_inode_object_def, + name, strlen(name), + NULL, 0, NULL, 0, true); + if (!cookie) { + erofs_err(sb, "failed to get cookie for %s", name); + ret = -EINVAL; + goto err; + } + + //fscache_use_cookie(cookie, false); + + /* + * Allocate 
anonymous inode in global pseudo mount for shareable blobs, + * so that they are accessible among erofs fs instances. + */ + isb = flags & EROFS_REG_COOKIE_SHARE ? erofs_pseudo_mnt->mnt_sb : sb; + inode = new_inode(isb); + if (!inode) { + erofs_err(sb, "failed to get anon inode for %s", name); + ret = -ENOMEM; + goto err_cookie; + } + + inode->i_size = OFFSET_MAX; + inode->i_mapping->a_ops = &erofs_fscache_meta_aops; + mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS); + inode->i_private = ctx; + + ctx->cookie = cookie; + ctx->inode = inode; + return ctx; + +err_cookie: +// fscache_unuse_cookie(cookie, NULL, NULL); + fscache_relinquish_cookie(cookie, NULL, false); +err: + kfree(ctx); + return ERR_PTR(ret); +} + +static void erofs_fscache_relinquish_cookie(struct erofs_fscache *ctx) +{ + //fscache_unuse_cookie(ctx->cookie, NULL, NULL); + fscache_relinquish_cookie(ctx->cookie, NULL, false); + iput(ctx->inode); + kfree(ctx->name); + kfree(ctx); +} + +static struct erofs_fscache *erofs_domain_init_cookie(struct super_block *sb, + char *name, unsigned int flags) +{ + struct erofs_fscache *ctx; + struct erofs_domain *domain = EROFS_SB(sb)->domain; + + ctx = erofs_fscache_acquire_cookie(sb, name, flags); + if (IS_ERR(ctx)) + return ctx; + + ctx->name = kstrdup(name, GFP_KERNEL); + if (!ctx->name) { + erofs_fscache_relinquish_cookie(ctx); + return ERR_PTR(-ENOMEM); + } + + refcount_inc(&domain->ref); + ctx->domain = domain; + list_add(&ctx->node, &erofs_domain_cookies_list); + return ctx; +} + +static struct erofs_fscache *erofs_domain_register_cookie(struct super_block *sb, + char *name, unsigned int flags) +{ + struct erofs_fscache *ctx; + struct erofs_domain *domain = EROFS_SB(sb)->domain; + + flags |= EROFS_REG_COOKIE_SHARE; + mutex_lock(&erofs_domain_cookies_lock); + list_for_each_entry(ctx, &erofs_domain_cookies_list, node) { + if (ctx->domain != domain || strcmp(ctx->name, name)) + continue; + if (!(flags & EROFS_REG_COOKIE_NEED_NOEXIST)) { + refcount_inc(&ctx->ref); + } else { + erofs_err(sb, "%s already exists in domain %s", name, + domain->domain_id); + ctx = ERR_PTR(-EEXIST); + } + mutex_unlock(&erofs_domain_cookies_lock); + return ctx; + } + ctx = erofs_domain_init_cookie(sb, name, flags); + mutex_unlock(&erofs_domain_cookies_lock); + return ctx; +} + +struct erofs_fscache *erofs_fscache_register_cookie(struct super_block *sb, + char *name, + unsigned int flags) +{ + if (EROFS_SB(sb)->domain_id) + return erofs_domain_register_cookie(sb, name, flags); + return erofs_fscache_acquire_cookie(sb, name, flags); +} + +void erofs_fscache_unregister_cookie(struct erofs_fscache *ctx) +{ + struct erofs_domain *domain = NULL; + + if (!ctx) + return; + if (!ctx->domain) + return erofs_fscache_relinquish_cookie(ctx); + + mutex_lock(&erofs_domain_cookies_lock); + if (refcount_dec_and_test(&ctx->ref)) { + domain = ctx->domain; + list_del(&ctx->node); + erofs_fscache_relinquish_cookie(ctx); + } + mutex_unlock(&erofs_domain_cookies_lock); + if (domain) + erofs_fscache_domain_put(domain); +} + +int erofs_fscache_register_fs(struct super_block *sb) +{ + int ret; + struct erofs_sb_info *sbi = EROFS_SB(sb); + struct erofs_fscache *fscache; + unsigned int flags = 0; + + if (sbi->domain_id) + ret = erofs_fscache_register_domain(sb); + else + ret = erofs_fscache_register_volume(sb); + if (ret) + return ret; + + /* + * When shared domain is enabled, using NEED_NOEXIST to guarantee + * the primary data blob (aka fsid) is unique in the shared domain. 
+ * + * For non-shared-domain case, fscache_acquire_volume() invoked by + * erofs_fscache_register_volume() has already guaranteed + * the uniqueness of primary data blob. + * + * Acquired domain/volume will be relinquished in kill_sb() on error. + */ + if (sbi->domain_id) + flags |= EROFS_REG_COOKIE_NEED_NOEXIST; + fscache = erofs_fscache_register_cookie(sb, sbi->fsid, flags); + if (IS_ERR(fscache)) + return PTR_ERR(fscache); + + sbi->s_fscache = fscache; + return 0; +} + +void erofs_fscache_unregister_fs(struct super_block *sb) +{ + struct erofs_sb_info *sbi = EROFS_SB(sb); + + erofs_fscache_unregister_cookie(sbi->s_fscache); + + if (sbi->domain) + erofs_fscache_domain_put(sbi->domain); + else + fscache_relinquish_cookie(sbi->volume, NULL, false); + + sbi->s_fscache = NULL; + sbi->volume = NULL; + sbi->domain = NULL; +} diff --git a/fs/erofs/inode.c b/fs/erofs/inode.c index 0a94a52a119fbcb89f2d8a1e8662ee362dc2a0a9..dbdee1964388b223238b9cea2e6104dd8be9f8d6 100644 --- a/fs/erofs/inode.c +++ b/fs/erofs/inode.c @@ -2,7 +2,7 @@ /* * Copyright (C) 2017-2018 HUAWEI, Inc. * https://www.huawei.com/ - * Created by Gao Xiang + * Copyright (C) 2021, Alibaba Cloud */ #include "xattr.h" @@ -13,8 +13,8 @@ * the inode payload page if it's an extended inode) in order to fill * inline data if possible. */ -static struct page *erofs_read_inode(struct inode *inode, - unsigned int *ofs) +static void *erofs_read_inode(struct erofs_buf *buf, + struct inode *inode, unsigned int *ofs) { struct super_block *sb = inode->i_sb; struct erofs_sb_info *sbi = EROFS_SB(sb); @@ -22,7 +22,7 @@ static struct page *erofs_read_inode(struct inode *inode, const erofs_off_t inode_loc = iloc(sbi, vi->nid); erofs_blk_t blkaddr, nblks = 0; - struct page *page; + void *kaddr; struct erofs_inode_compact *dic; struct erofs_inode_extended *die, *copied = NULL; unsigned int ifmt; @@ -34,14 +34,14 @@ static struct page *erofs_read_inode(struct inode *inode, erofs_dbg("%s, reading inode nid %llu at %u of blkaddr %u", __func__, vi->nid, *ofs, blkaddr); - page = erofs_get_meta_page(sb, blkaddr); - if (IS_ERR(page)) { + kaddr = erofs_read_metabuf(buf, sb, blkaddr, EROFS_KMAP); + if (IS_ERR(kaddr)) { erofs_err(sb, "failed to get inode (nid: %llu) page, err %ld", - vi->nid, PTR_ERR(page)); - return page; + vi->nid, PTR_ERR(kaddr)); + return kaddr; } - dic = page_address(page) + *ofs; + dic = kaddr + *ofs; ifmt = le16_to_cpu(dic->i_format); if (ifmt & ~EROFS_I_ALL) { @@ -62,12 +62,12 @@ static struct page *erofs_read_inode(struct inode *inode, switch (erofs_inode_version(ifmt)) { case EROFS_INODE_LAYOUT_EXTENDED: vi->inode_isize = sizeof(struct erofs_inode_extended); - /* check if the inode acrosses page boundary */ - if (*ofs + vi->inode_isize <= PAGE_SIZE) { + /* check if the extended inode acrosses block boundary */ + if (*ofs + vi->inode_isize <= EROFS_BLKSIZ) { *ofs += vi->inode_isize; die = (struct erofs_inode_extended *)dic; } else { - const unsigned int gotten = PAGE_SIZE - *ofs; + const unsigned int gotten = EROFS_BLKSIZ - *ofs; copied = kmalloc(vi->inode_isize, GFP_NOFS); if (!copied) { @@ -75,18 +75,16 @@ static struct page *erofs_read_inode(struct inode *inode, goto err_out; } memcpy(copied, dic, gotten); - unlock_page(page); - put_page(page); - - page = erofs_get_meta_page(sb, blkaddr + 1); - if (IS_ERR(page)) { - erofs_err(sb, "failed to get inode payload page (nid: %llu), err %ld", - vi->nid, PTR_ERR(page)); + kaddr = erofs_read_metabuf(buf, sb, blkaddr + 1, + EROFS_KMAP); + if (IS_ERR(kaddr)) { + erofs_err(sb, "failed to get 
inode payload block (nid: %llu), err %ld", + vi->nid, PTR_ERR(kaddr)); kfree(copied); - return page; + return kaddr; } *ofs = vi->inode_isize - gotten; - memcpy((u8 *)copied + gotten, page_address(page), *ofs); + memcpy((u8 *)copied + gotten, kaddr, *ofs); die = copied; } vi->xattr_isize = erofs_xattr_ibody_size(die->i_xattr_icount); @@ -123,8 +121,11 @@ static struct page *erofs_read_inode(struct inode *inode, /* total blocks for compressed files */ if (erofs_inode_is_data_compressed(vi->datalayout)) nblks = le32_to_cpu(die->i_u.compressed_blocks); - + else if (vi->datalayout == EROFS_INODE_CHUNK_BASED) + /* fill chunked inode summary info */ + vi->chunkformat = le16_to_cpu(die->i_u.c.format); kfree(copied); + copied = NULL; break; case EROFS_INODE_LAYOUT_COMPACT: vi->inode_isize = sizeof(struct erofs_inode_compact); @@ -161,6 +162,8 @@ static struct page *erofs_read_inode(struct inode *inode, inode->i_size = le32_to_cpu(dic->i_size); if (erofs_inode_is_data_compressed(vi->datalayout)) nblks = le32_to_cpu(dic->i_u.compressed_blocks); + else if (vi->datalayout == EROFS_INODE_CHUNK_BASED) + vi->chunkformat = le16_to_cpu(dic->i_u.c.format); break; default: erofs_err(inode->i_sb, @@ -170,6 +173,17 @@ static struct page *erofs_read_inode(struct inode *inode, goto err_out; } + if (vi->datalayout == EROFS_INODE_CHUNK_BASED) { + if (vi->chunkformat & ~EROFS_CHUNK_FORMAT_ALL) { + erofs_err(inode->i_sb, + "unsupported chunk format %x of nid %llu", + vi->chunkformat, vi->nid); + err = -EOPNOTSUPP; + goto err_out; + } + vi->chunkbits = LOG_BLOCK_SIZE + + (vi->chunkformat & EROFS_CHUNK_FORMAT_BLKBITS_MASK); + } inode->i_mtime.tv_sec = inode->i_ctime.tv_sec; inode->i_atime.tv_sec = inode->i_ctime.tv_sec; inode->i_mtime.tv_nsec = inode->i_ctime.tv_nsec; @@ -180,7 +194,7 @@ static struct page *erofs_read_inode(struct inode *inode, inode->i_blocks = roundup(inode->i_size, EROFS_BLKSIZ) >> 9; else inode->i_blocks = nblks << LOG_SECTORS_PER_BLOCK; - return page; + return kaddr; bogusimode: erofs_err(inode->i_sb, "bogus i_mode (%o) @ nid %llu", @@ -189,12 +203,11 @@ static struct page *erofs_read_inode(struct inode *inode, err_out: DBG_BUGON(1); kfree(copied); - unlock_page(page); - put_page(page); + erofs_put_metabuf(buf); return ERR_PTR(err); } -static int erofs_fill_symlink(struct inode *inode, void *data, +static int erofs_fill_symlink(struct inode *inode, void *kaddr, unsigned int m_pofs) { struct erofs_inode *vi = EROFS_I(inode); @@ -202,7 +215,7 @@ static int erofs_fill_symlink(struct inode *inode, void *data, /* if it cannot be handled with fast symlink scheme */ if (vi->datalayout != EROFS_INODE_FLAT_INLINE || - inode->i_size >= PAGE_SIZE) { + inode->i_size >= EROFS_BLKSIZ || inode->i_size < 0) { inode->i_op = &erofs_symlink_iops; return 0; } @@ -212,8 +225,8 @@ static int erofs_fill_symlink(struct inode *inode, void *data, return -ENOMEM; m_pofs += vi->xattr_isize; - /* inline symlink data shouldn't cross page boundary as well */ - if (m_pofs + inode->i_size > PAGE_SIZE) { + /* inline symlink data shouldn't cross block boundary */ + if (m_pofs + inode->i_size > EROFS_BLKSIZ) { kfree(lnk); erofs_err(inode->i_sb, "inline data cross block boundary @ nid %llu", @@ -221,8 +234,7 @@ static int erofs_fill_symlink(struct inode *inode, void *data, DBG_BUGON(1); return -EFSCORRUPTED; } - - memcpy(lnk, data + m_pofs, inode->i_size); + memcpy(lnk, kaddr + m_pofs, inode->i_size); lnk[inode->i_size] = '\0'; inode->i_link = lnk; @@ -233,16 +245,17 @@ static int erofs_fill_symlink(struct inode *inode, void *data, 
static int erofs_fill_inode(struct inode *inode, int isdir) { struct erofs_inode *vi = EROFS_I(inode); - struct page *page; + struct erofs_buf buf = __EROFS_BUF_INITIALIZER; + void *kaddr; unsigned int ofs; int err = 0; trace_erofs_fill_inode(inode, isdir); /* read inode base data from disk */ - page = erofs_read_inode(inode, &ofs); - if (IS_ERR(page)) - return PTR_ERR(page); + kaddr = erofs_read_inode(&buf, inode, &ofs); + if (IS_ERR(kaddr)) + return PTR_ERR(kaddr); /* setup the new inode */ switch (inode->i_mode & S_IFMT) { @@ -255,7 +268,7 @@ static int erofs_fill_inode(struct inode *inode, int isdir) inode->i_fop = &erofs_dir_fops; break; case S_IFLNK: - err = erofs_fill_symlink(inode, page_address(page), ofs); + err = erofs_fill_symlink(inode, kaddr, ofs); if (err) goto out_unlock; inode_nohighmem(inode); @@ -273,14 +286,20 @@ static int erofs_fill_inode(struct inode *inode, int isdir) } if (erofs_inode_is_data_compressed(vi->datalayout)) { - err = z_erofs_fill_inode(inode); + if (!erofs_is_fscache_mode(inode->i_sb)) + err = z_erofs_fill_inode(inode); + else + err = -EOPNOTSUPP; goto out_unlock; } inode->i_mapping->a_ops = &erofs_raw_access_aops; +#ifdef CONFIG_EROFS_FS_ONDEMAND + if (erofs_is_fscache_mode(inode->i_sb)) + inode->i_mapping->a_ops = &erofs_fscache_access_aops; +#endif out_unlock: - unlock_page(page); - put_page(page); + erofs_put_metabuf(&buf); return err; } @@ -373,4 +392,3 @@ const struct inode_operations erofs_fast_symlink_iops = { .listxattr = erofs_listxattr, .get_acl = erofs_get_acl, }; - diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h index 5b3a9f5c282db496a1e9c346f41db52fbb7cb286..283ae94abbf4db0887d3ae25bc0292c5310b06cd 100644 --- a/fs/erofs/internal.h +++ b/fs/erofs/internal.h @@ -2,7 +2,7 @@ /* * Copyright (C) 2017-2018 HUAWEI, Inc. 
* https://www.huawei.com/ - * Created by Gao Xiang + * Copyright (C) 2021, Alibaba Cloud */ #ifndef __EROFS_INTERNAL_H #define __EROFS_INTERNAL_H @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include @@ -46,7 +47,16 @@ typedef u64 erofs_off_t; /* data type for filesystem-wide blocks number */ typedef u32 erofs_blk_t; -struct erofs_fs_context { +struct erofs_device_info { + char *path; + struct erofs_fscache *fscache; + struct block_device *bdev; + + u32 blocks; + u32 mapped_blkaddr; +}; + +struct erofs_mount_opts { #ifdef CONFIG_EROFS_FS_ZIP /* current strategy of how to use managed cache */ unsigned char cache_strategy; @@ -57,7 +67,41 @@ struct erofs_fs_context { unsigned int mount_opt; }; +struct erofs_dev_context { + struct idr tree; + struct rw_semaphore rwsem; + + unsigned int extra_devices; +}; + +struct erofs_fs_context { + struct erofs_mount_opts opt; + struct erofs_dev_context *devs; + char *fsid; + char *domain_id; + bool ondemand_enabled; +}; + +struct erofs_domain { + refcount_t ref; + struct list_head list; + struct fscache_cookie *volume; + char *domain_id; +}; + +struct erofs_fscache { + struct fscache_cookie *cookie; + struct inode *inode; /* anonymous inode for the blob */ + + /* used for share domain mode */ + struct erofs_domain *domain; + struct list_head node; + refcount_t ref; + char *name; +}; + struct erofs_sb_info { + struct erofs_mount_opts opt; /* options */ #ifdef CONFIG_EROFS_FS_ZIP /* list for all registered superblocks, mainly for shrinker */ struct list_head list; @@ -71,11 +115,15 @@ struct erofs_sb_info { /* pseudo inode to manage cached pages */ struct inode *managed_cache; #endif /* CONFIG_EROFS_FS_ZIP */ - u32 blocks; + struct erofs_dev_context *devs; + u64 total_blocks; + u32 primarydevice_blocks; + u32 meta_blkaddr; #ifdef CONFIG_EROFS_FS_XATTR u32 xattr_blkaddr; #endif + u16 device_id_mask; /* valid bits of device id to be used */ /* inode slot unit size in bit shift */ unsigned char islotbits; @@ -93,7 +141,12 @@ struct erofs_sb_info { u32 feature_compat; u32 feature_incompat; - struct erofs_fs_context ctx; /* options */ + /* fscache support */ + struct fscache_cookie *volume; + struct erofs_fscache *s_fscache; + struct erofs_domain *domain; + char *fsid; + char *domain_id; }; #define EROFS_SB(sb) ((struct erofs_sb_info *)(sb)->s_fs_info) @@ -103,9 +156,14 @@ struct erofs_sb_info { #define EROFS_MOUNT_XATTR_USER 0x00000010 #define EROFS_MOUNT_POSIX_ACL 0x00000020 -#define clear_opt(ctx, option) ((ctx)->mount_opt &= ~EROFS_MOUNT_##option) -#define set_opt(ctx, option) ((ctx)->mount_opt |= EROFS_MOUNT_##option) -#define test_opt(ctx, option) ((ctx)->mount_opt & EROFS_MOUNT_##option) +#define clear_opt(opt, option) ((opt)->mount_opt &= ~EROFS_MOUNT_##option) +#define set_opt(opt, option) ((opt)->mount_opt |= EROFS_MOUNT_##option) +#define test_opt(opt, option) ((opt)->mount_opt & EROFS_MOUNT_##option) + +static inline bool erofs_is_fscache_mode(struct super_block *sb) +{ + return IS_ENABLED(CONFIG_EROFS_FS_ONDEMAND) && !sb->s_bdev; +} enum { EROFS_ZIP_CACHE_DISABLED, @@ -176,6 +234,19 @@ static inline int erofs_wait_on_workgroup_freezed(struct erofs_workgroup *grp) #error erofs cannot be used in this platform #endif +enum erofs_kmap_type { + EROFS_NO_KMAP, /* don't map the buffer */ + EROFS_KMAP, /* use kmap() to map the buffer */ + EROFS_KMAP_ATOMIC, /* use kmap_atomic() to map the buffer */ +}; + +struct erofs_buf { + struct page *page; + void *base; + enum erofs_kmap_type kmap_type; +}; +#define __EROFS_BUF_INITIALIZER 
((struct erofs_buf){ .page = NULL }) + #define ROOT_NID(sb) ((sb)->root_nid) #define erofs_blknr(addr) ((addr) / EROFS_BLKSIZ) @@ -187,6 +258,16 @@ static inline erofs_off_t iloc(struct erofs_sb_info *sbi, erofs_nid_t nid) return blknr_to_addr(sbi->meta_blkaddr) + (nid << sbi->islotbits); } +#define EROFS_FEATURE_FUNCS(name, compat, feature) \ +static inline bool erofs_sb_has_##name(struct erofs_sb_info *sbi) \ +{ \ + return sbi->feature_##compat & EROFS_FEATURE_##feature; \ +} + +EROFS_FEATURE_FUNCS(lz4_0padding, incompat, INCOMPAT_LZ4_0PADDING) +EROFS_FEATURE_FUNCS(device_table, incompat, INCOMPAT_DEVICE_TABLE) +EROFS_FEATURE_FUNCS(sb_chksum, compat, COMPAT_SB_CHKSUM) + /* atomic flag definitions */ #define EROFS_I_EA_INITED_BIT 0 #define EROFS_I_Z_INITED_BIT 1 @@ -210,6 +291,10 @@ struct erofs_inode { union { erofs_blk_t raw_blkaddr; + struct { + unsigned short chunkformat; + unsigned char chunkbits; + }; #ifdef CONFIG_EROFS_FS_ZIP struct { unsigned short z_advise; @@ -258,7 +343,7 @@ extern const struct address_space_operations erofs_raw_access_aops; extern const struct address_space_operations z_erofs_aops; /* - * Logical to physical block mapping, used by erofs_map_blocks() + * Logical to physical block mapping * * Different with other file systems, it is used for 2 access modes: * @@ -297,17 +382,15 @@ enum { #define EROFS_MAP_FULL_MAPPED (1 << BH_FullMapped) struct erofs_map_blocks { + struct erofs_buf buf; + erofs_off_t m_pa, m_la; u64 m_plen, m_llen; + unsigned short m_deviceid; unsigned int m_flags; - - struct page *mpage; }; -/* Flags used by erofs_map_blocks() */ -#define EROFS_GET_BLOCKS_RAW 0x0001 - /* zmap.c */ #ifdef CONFIG_EROFS_FS_ZIP int z_erofs_fill_inode(struct inode *inode); @@ -324,10 +407,21 @@ static inline int z_erofs_map_blocks_iter(struct inode *inode, } #endif /* !CONFIG_EROFS_FS_ZIP */ -/* data.c */ -struct page *erofs_get_meta_page(struct super_block *sb, erofs_blk_t blkaddr); +struct erofs_map_dev { + struct erofs_fscache *m_fscache; + struct block_device *m_bdev; + + erofs_off_t m_pa; + unsigned int m_deviceid; +}; -int erofs_map_blocks(struct inode *, struct erofs_map_blocks *, int); +/* data.c */ +void erofs_unmap_metabuf(struct erofs_buf *buf); +void erofs_put_metabuf(struct erofs_buf *buf); +void *erofs_read_metabuf(struct erofs_buf *buf, struct super_block *sb, + erofs_blk_t blkaddr, enum erofs_kmap_type type); +int erofs_map_dev(struct super_block *sb, struct erofs_map_dev *dev); +int erofs_map_blocks(struct inode *inode, struct erofs_map_blocks *map); /* inode.c */ static inline unsigned long erofs_inode_hash(erofs_nid_t nid) @@ -400,7 +494,40 @@ static inline int z_erofs_init_zip_subsystem(void) { return 0; } static inline void z_erofs_exit_zip_subsystem(void) {} #endif /* !CONFIG_EROFS_FS_ZIP */ +/* flags for erofs_fscache_register_cookie() */ +#define EROFS_REG_COOKIE_SHARE 0x0001 +#define EROFS_REG_COOKIE_NEED_NOEXIST 0x0002 + +/* fscache.c */ +#ifdef CONFIG_EROFS_FS_ONDEMAND +int erofs_fscache_register_fs(struct super_block *sb); +void erofs_fscache_unregister_fs(struct super_block *sb); + +struct erofs_fscache *erofs_fscache_register_cookie(struct super_block *sb, + char *name, + unsigned int flags); +void erofs_fscache_unregister_cookie(struct erofs_fscache *fscache); +extern const struct address_space_operations erofs_fscache_access_aops; +#else +static inline int erofs_fscache_register_fs(struct super_block *sb) +{ + return -EOPNOTSUPP; +} +static inline void erofs_fscache_unregister_fs(struct super_block *sb) {} + +static inline +struct 
erofs_fscache *erofs_fscache_register_cookie(struct super_block *sb, + char *name, + unsigned int flags) +{ + return ERR_PTR(-EOPNOTSUPP); +} + +static inline void erofs_fscache_unregister_cookie(struct erofs_fscache *fscache) +{ +} +#endif + #define EFSCORRUPTED EUCLEAN /* Filesystem is corrupted */ #endif /* __EROFS_INTERNAL_H */ - diff --git a/fs/erofs/namei.c b/fs/erofs/namei.c index 5f8cc7346c69415409e554c34d22f25f2879737f..624b1b89867a0771439caee4fb5489a19e661a26 100644 --- a/fs/erofs/namei.c +++ b/fs/erofs/namei.c @@ -2,7 +2,6 @@ /* * Copyright (C) 2017-2018 HUAWEI, Inc. * https://www.huawei.com/ - * Created by Gao Xiang */ #include "xattr.h" @@ -247,4 +246,3 @@ const struct inode_operations erofs_dir_iops = { .listxattr = erofs_listxattr, .get_acl = erofs_get_acl, }; - diff --git a/fs/erofs/super.c b/fs/erofs/super.c index f31a08d86be893403a6283885fdf73bd2a0ffdd9..8ed69d0fcce0ff6dad3c41a69aa21e3b31b6adc7 100644 --- a/fs/erofs/super.c +++ b/fs/erofs/super.c @@ -2,7 +2,7 @@ /* * Copyright (C) 2017-2018 HUAWEI, Inc. * https://www.huawei.com/ - * Created by Gao Xiang + * Copyright (C) 2021, Alibaba Cloud */ #include #include @@ -12,6 +12,7 @@ #include #include #include +#include #include "xattr.h" #define CREATE_TRACE_POINTS @@ -122,24 +123,124 @@ static bool check_layout_compatibility(struct super_block *sb, return true; } +static int erofs_init_device(struct erofs_buf *buf, struct super_block *sb, + struct erofs_device_info *dif, erofs_off_t *pos) +{ + struct erofs_sb_info *sbi = EROFS_SB(sb); + struct erofs_fscache *fscache; + struct erofs_deviceslot *dis; + struct block_device *bdev; + void *ptr; + + ptr = erofs_read_metabuf(buf, sb, erofs_blknr(*pos), EROFS_KMAP); + if (IS_ERR(ptr)) + return PTR_ERR(ptr); + dis = ptr + erofs_blkoff(*pos); + + if (!dif->path) { + if (!dis->tag[0]) { + erofs_err(sb, "empty device tag @ pos %llu", *pos); + return -EINVAL; + } + dif->path = kmemdup_nul(dis->tag, sizeof(dis->tag), GFP_KERNEL); + if (!dif->path) + return -ENOMEM; + } + + if (erofs_is_fscache_mode(sb)) { + fscache = erofs_fscache_register_cookie(sb, dif->path, 0); + if (IS_ERR(fscache)) + return PTR_ERR(fscache); + dif->fscache = fscache; + } else { + bdev = blkdev_get_by_path(dif->path, FMODE_READ | FMODE_EXCL, + sb->s_type); + if (IS_ERR(bdev)) + return PTR_ERR(bdev); + dif->bdev = bdev; + } + + dif->blocks = le32_to_cpu(dis->blocks); + dif->mapped_blkaddr = le32_to_cpu(dis->mapped_blkaddr); + sbi->total_blocks += dif->blocks; + *pos += EROFS_DEVT_SLOT_SIZE; + return 0; +} + +static int erofs_scan_devices(struct super_block *sb, + struct erofs_super_block *dsb) +{ + struct erofs_sb_info *sbi = EROFS_SB(sb); + unsigned int ondisk_extradevs; + erofs_off_t pos; + struct erofs_buf buf = __EROFS_BUF_INITIALIZER; + struct erofs_device_info *dif; + int id, err = 0; + + sbi->total_blocks = sbi->primarydevice_blocks; + if (!erofs_sb_has_device_table(sbi)) + ondisk_extradevs = 0; + else + ondisk_extradevs = le16_to_cpu(dsb->extra_devices); + + if (sbi->devs->extra_devices && + ondisk_extradevs != sbi->devs->extra_devices) { + erofs_err(sb, "extra devices don't match (ondisk %u, given %u)", + ondisk_extradevs, sbi->devs->extra_devices); + return -EINVAL; + } + if (!ondisk_extradevs) + return 0; + + sbi->device_id_mask = roundup_pow_of_two(ondisk_extradevs + 1) - 1; + pos = le16_to_cpu(dsb->devt_slotoff) * EROFS_DEVT_SLOT_SIZE; + down_read(&sbi->devs->rwsem); + if (sbi->devs->extra_devices) { + idr_for_each_entry(&sbi->devs->tree, dif, id) { + err = erofs_init_device(&buf, sb, dif, &pos); + if 
(err) + break; + } + } else { + for (id = 0; id < ondisk_extradevs; id++) { + dif = kzalloc(sizeof(*dif), GFP_KERNEL); + if (!dif) { + err = -ENOMEM; + break; + } + err = idr_alloc(&sbi->devs->tree, dif, 0, 0, GFP_KERNEL); + if (err < 0) { + kfree(dif); + break; + } + ++sbi->devs->extra_devices; + + err = erofs_init_device(&buf, sb, dif, &pos); + if (err) + break; + } + } + up_read(&sbi->devs->rwsem); + erofs_put_metabuf(&buf); + return err; +} + static int erofs_read_superblock(struct super_block *sb) { struct erofs_sb_info *sbi; - struct page *page; + struct erofs_buf buf = __EROFS_BUF_INITIALIZER; struct erofs_super_block *dsb; unsigned int blkszbits; void *data; int ret; - page = read_mapping_page(sb->s_bdev->bd_inode->i_mapping, 0, NULL); - if (IS_ERR(page)) { + data = erofs_read_metabuf(&buf, sb, 0, EROFS_KMAP); + if (IS_ERR(data)) { erofs_err(sb, "cannot read erofs superblock"); - return PTR_ERR(page); + return PTR_ERR(data); } sbi = EROFS_SB(sb); - - data = kmap(page); dsb = (struct erofs_super_block *)(data + EROFS_SUPER_OFFSET); ret = -EINVAL; @@ -149,7 +250,7 @@ static int erofs_read_superblock(struct super_block *sb) } sbi->feature_compat = le32_to_cpu(dsb->feature_compat); - if (sbi->feature_compat & EROFS_FEATURE_COMPAT_SB_CHKSUM) { + if (erofs_sb_has_sb_chksum(sbi)) { ret = erofs_superblock_csum_verify(sb, data); if (ret) goto out; @@ -167,7 +268,7 @@ static int erofs_read_superblock(struct super_block *sb) if (!check_layout_compatibility(sb, dsb)) goto out; - sbi->blocks = le32_to_cpu(dsb->blocks); + sbi->primarydevice_blocks = le32_to_cpu(dsb->blocks); sbi->meta_blkaddr = le32_to_cpu(dsb->meta_blkaddr); #ifdef CONFIG_EROFS_FS_XATTR sbi->xattr_blkaddr = le32_to_cpu(dsb->xattr_blkaddr); @@ -188,10 +289,11 @@ static int erofs_read_superblock(struct super_block *sb) ret = -EFSCORRUPTED; goto out; } - ret = 0; + + /* handle multiple devices */ + ret = erofs_scan_devices(sb, dsb); out: - kunmap(page); - put_page(page); + erofs_put_metabuf(&buf); return ret; } @@ -199,14 +301,14 @@ static int erofs_read_superblock(struct super_block *sb) static void erofs_default_options(struct erofs_fs_context *ctx) { #ifdef CONFIG_EROFS_FS_ZIP - ctx->cache_strategy = EROFS_ZIP_CACHE_READAROUND; - ctx->max_sync_decompress_pages = 3; + ctx->opt.cache_strategy = EROFS_ZIP_CACHE_READAROUND; + ctx->opt.max_sync_decompress_pages = 3; #endif #ifdef CONFIG_EROFS_FS_XATTR - set_opt(ctx, XATTR_USER); + set_opt(&ctx->opt, XATTR_USER); #endif #ifdef CONFIG_EROFS_FS_POSIX_ACL - set_opt(ctx, POSIX_ACL); + set_opt(&ctx->opt, POSIX_ACL); #endif } @@ -214,6 +316,9 @@ enum { Opt_user_xattr, Opt_acl, Opt_cache_strategy, + Opt_device, + Opt_fsid, + Opt_domain_id, Opt_err }; @@ -229,15 +334,19 @@ static const struct fs_parameter_spec erofs_fs_parameters[] = { fsparam_flag_no("acl", Opt_acl), fsparam_enum("cache_strategy", Opt_cache_strategy, erofs_param_cache_strategy), + fsparam_string("device", Opt_device), + fsparam_string("fsid", Opt_fsid), + fsparam_string("domain_id", Opt_domain_id), {} }; static int erofs_fc_parse_param(struct fs_context *fc, struct fs_parameter *param) { - struct erofs_fs_context *ctx __maybe_unused = fc->fs_private; + struct erofs_fs_context *ctx = fc->fs_private; struct fs_parse_result result; - int opt; + struct erofs_device_info *dif; + int opt, ret; opt = fs_parse(fc, erofs_fs_parameters, param, &result); if (opt < 0) @@ -247,9 +356,9 @@ static int erofs_fc_parse_param(struct fs_context *fc, case Opt_user_xattr: #ifdef CONFIG_EROFS_FS_XATTR if (result.boolean) - set_opt(ctx, 
XATTR_USER); + set_opt(&ctx->opt, XATTR_USER); else - clear_opt(ctx, XATTR_USER); + clear_opt(&ctx->opt, XATTR_USER); #else errorfc(fc, "{,no}user_xattr options not supported"); #endif @@ -257,18 +366,66 @@ static int erofs_fc_parse_param(struct fs_context *fc, case Opt_acl: #ifdef CONFIG_EROFS_FS_POSIX_ACL if (result.boolean) - set_opt(ctx, POSIX_ACL); + set_opt(&ctx->opt, POSIX_ACL); else - clear_opt(ctx, POSIX_ACL); + clear_opt(&ctx->opt, POSIX_ACL); #else errorfc(fc, "{,no}acl options not supported"); #endif break; case Opt_cache_strategy: #ifdef CONFIG_EROFS_FS_ZIP - ctx->cache_strategy = result.uint_32; + ctx->opt.cache_strategy = result.uint_32; #else errorfc(fc, "compression not supported, cache_strategy ignored"); +#endif + break; + case Opt_device: + dif = kzalloc(sizeof(*dif), GFP_KERNEL); + if (!dif) + return -ENOMEM; + dif->path = kstrdup(param->string, GFP_KERNEL); + if (!dif->path) { + kfree(dif); + return -ENOMEM; + } + down_write(&ctx->devs->rwsem); + ret = idr_alloc(&ctx->devs->tree, dif, 0, 0, GFP_KERNEL); + up_write(&ctx->devs->rwsem); + if (ret < 0) { + kfree(dif->path); + kfree(dif); + return ret; + } + ++ctx->devs->extra_devices; + break; + case Opt_fsid: +#ifdef CONFIG_EROFS_FS_ONDEMAND + if (!ctx->ondemand_enabled) { + errorfc(fc, "fsid option not supported"); + return -EINVAL; + } + kfree(ctx->fsid); + ctx->fsid = kstrdup(param->string, GFP_KERNEL); + if (!ctx->fsid) + return -ENOMEM; +#else + errorfc(fc, "fsid option not supported"); + return -EINVAL; +#endif + break; + case Opt_domain_id: +#ifdef CONFIG_EROFS_FS_ONDEMAND + if (!ctx->ondemand_enabled) { + errorfc(fc, "domain_id option not supported"); + break; + } + kfree(ctx->domain_id); + ctx->domain_id = kstrdup(param->string, GFP_KERNEL); + if (!ctx->domain_id) + return -ENOMEM; +#else + errorfc(fc, "domain_id option not supported"); #endif break; default: @@ -344,35 +501,53 @@ static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc) int err; sb->s_magic = EROFS_SUPER_MAGIC; - - if (!sb_set_blocksize(sb, EROFS_BLKSIZ)) { - erofs_err(sb, "failed to set erofs blksize"); - return -EINVAL; - } + sb->s_flags |= SB_RDONLY | SB_NOATIME; + sb->s_maxbytes = MAX_LFS_FILESIZE; + sb->s_op = &erofs_sops; sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); if (!sbi) return -ENOMEM; sb->s_fs_info = sbi; + sbi->opt = ctx->opt; + sbi->devs = ctx->devs; + ctx->devs = NULL; + sbi->fsid = ctx->fsid; + ctx->fsid = NULL; + sbi->domain_id = ctx->domain_id; + ctx->domain_id = NULL; + + if (erofs_is_fscache_mode(sb)) { + sb->s_blocksize = EROFS_BLKSIZ; + sb->s_blocksize_bits = LOG_BLOCK_SIZE; + + err = erofs_fscache_register_fs(sb); + if (err) + return err; + + err = super_setup_bdi(sb); + if (err) + return err; + } else { + if (!sb_set_blocksize(sb, EROFS_BLKSIZ)) { + erofs_err(sb, "failed to set erofs blksize"); + return -EINVAL; + } + } + err = erofs_read_superblock(sb); if (err) return err; - sb->s_flags |= SB_RDONLY | SB_NOATIME; - sb->s_maxbytes = MAX_LFS_FILESIZE; sb->s_time_gran = 1; - - sb->s_op = &erofs_sops; sb->s_xattr = erofs_xattr_handlers; - if (test_opt(ctx, POSIX_ACL)) + if (test_opt(&sbi->opt, POSIX_ACL)) sb->s_flags |= SB_POSIXACL; else sb->s_flags &= ~SB_POSIXACL; - sbi->ctx = *ctx; - #ifdef CONFIG_EROFS_FS_ZIP xa_init(&sbi->managed_pslots); #endif @@ -405,6 +580,11 @@ static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc) static int erofs_fc_get_tree(struct fs_context *fc) { + struct erofs_fs_context *ctx = fc->fs_private; + + if (IS_ENABLED(CONFIG_EROFS_FS_ONDEMAND) && ctx->fsid) 
+ return get_tree_nodev(fc, erofs_fc_fill_super); + return get_tree_bdev(fc, erofs_fc_fill_super); } @@ -416,20 +596,50 @@ static int erofs_fc_reconfigure(struct fs_context *fc) DBG_BUGON(!sb_rdonly(sb)); - if (test_opt(ctx, POSIX_ACL)) + if (ctx->fsid || ctx->domain_id) + erofs_info(sb, "ignoring reconfiguration for fsid|domain_id."); + + if (test_opt(&ctx->opt, POSIX_ACL)) fc->sb_flags |= SB_POSIXACL; else fc->sb_flags &= ~SB_POSIXACL; - sbi->ctx = *ctx; + sbi->opt = ctx->opt; fc->sb_flags |= SB_RDONLY; return 0; } +static int erofs_release_device_info(int id, void *ptr, void *data) +{ + struct erofs_device_info *dif = ptr; + + if (dif->bdev) + blkdev_put(dif->bdev, FMODE_READ | FMODE_EXCL); + erofs_fscache_unregister_cookie(dif->fscache); + dif->fscache = NULL; + kfree(dif->path); + kfree(dif); + return 0; +} + +static void erofs_free_dev_context(struct erofs_dev_context *devs) +{ + if (!devs) + return; + idr_for_each(&devs->tree, &erofs_release_device_info, NULL); + idr_destroy(&devs->tree); + kfree(devs); +} + static void erofs_fc_free(struct fs_context *fc) { - kfree(fc->fs_private); + struct erofs_fs_context *ctx = fc->fs_private; + + erofs_free_dev_context(ctx->devs); + kfree(ctx->fsid); + kfree(ctx->domain_id); + kfree(ctx); } static const struct fs_context_operations erofs_context_ops = { @@ -439,17 +649,33 @@ static const struct fs_context_operations erofs_context_ops = { .free = erofs_fc_free, }; +static inline bool erofs_can_init(void) +{ + return READ_ONCE(erofs_enabled) || cachefiles_ondemand_is_enabled(); +} + static int erofs_init_fs_context(struct fs_context *fc) { - fc->fs_private = kzalloc(sizeof(struct erofs_fs_context), GFP_KERNEL); - if (!fc->fs_private) - return -ENOMEM; + struct erofs_fs_context *ctx; - /* set default mount options */ - erofs_default_options(fc->fs_private); + if (!erofs_can_init()) + return -EOPNOTSUPP; - fc->ops = &erofs_context_ops; + ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); + if (!ctx) + return -ENOMEM; + ctx->devs = kzalloc(sizeof(struct erofs_dev_context), GFP_KERNEL); + if (!ctx->devs) { + kfree(ctx); + return -ENOMEM; + } + ctx->ondemand_enabled = cachefiles_ondemand_is_enabled(); + fc->fs_private = ctx; + idr_init(&ctx->devs->tree); + init_rwsem(&ctx->devs->rwsem); + erofs_default_options(ctx); + fc->ops = &erofs_context_ops; return 0; } @@ -463,11 +689,18 @@ static void erofs_kill_sb(struct super_block *sb) WARN_ON(sb->s_magic != EROFS_SUPER_MAGIC); - kill_block_super(sb); + if (erofs_is_fscache_mode(sb)) + kill_anon_super(sb); + else + kill_block_super(sb); sbi = EROFS_SB(sb); if (!sbi) return; + erofs_free_dev_context(sbi->devs); + erofs_fscache_unregister_fs(sb); + kfree(sbi->fsid); + kfree(sbi->domain_id); kfree(sbi); sb->s_fs_info = NULL; } @@ -484,6 +717,10 @@ static void erofs_put_super(struct super_block *sb) iput(sbi->managed_cache); sbi->managed_cache = NULL; #endif + erofs_free_dev_context(sbi->devs); + sbi->devs = NULL; + erofs_fscache_unregister_cookie(sbi->s_fscache); + sbi->s_fscache = NULL; } static struct file_system_type erofs_fs_type = { @@ -550,11 +787,14 @@ static int erofs_statfs(struct dentry *dentry, struct kstatfs *buf) { struct super_block *sb = dentry->d_sb; struct erofs_sb_info *sbi = EROFS_SB(sb); - u64 id = huge_encode_dev(sb->s_bdev->bd_dev); + u64 id = 0; + + if (!erofs_is_fscache_mode(sb)) + id = huge_encode_dev(sb->s_bdev->bd_dev); buf->f_type = sb->s_magic; buf->f_bsize = EROFS_BLKSIZ; - buf->f_blocks = sbi->blocks; + buf->f_blocks = sbi->total_blocks; buf->f_bfree = buf->f_bavail = 0; buf->f_files = 
ULLONG_MAX; @@ -569,27 +809,33 @@ static int erofs_statfs(struct dentry *dentry, struct kstatfs *buf) static int erofs_show_options(struct seq_file *seq, struct dentry *root) { struct erofs_sb_info *sbi __maybe_unused = EROFS_SB(root->d_sb); - struct erofs_fs_context *ctx __maybe_unused = &sbi->ctx; + struct erofs_mount_opts *opt __maybe_unused = &sbi->opt; #ifdef CONFIG_EROFS_FS_XATTR - if (test_opt(ctx, XATTR_USER)) + if (test_opt(opt, XATTR_USER)) seq_puts(seq, ",user_xattr"); else seq_puts(seq, ",nouser_xattr"); #endif #ifdef CONFIG_EROFS_FS_POSIX_ACL - if (test_opt(ctx, POSIX_ACL)) + if (test_opt(opt, POSIX_ACL)) seq_puts(seq, ",acl"); else seq_puts(seq, ",noacl"); #endif #ifdef CONFIG_EROFS_FS_ZIP - if (ctx->cache_strategy == EROFS_ZIP_CACHE_DISABLED) + if (opt->cache_strategy == EROFS_ZIP_CACHE_DISABLED) seq_puts(seq, ",cache_strategy=disabled"); - else if (ctx->cache_strategy == EROFS_ZIP_CACHE_READAHEAD) + else if (opt->cache_strategy == EROFS_ZIP_CACHE_READAHEAD) seq_puts(seq, ",cache_strategy=readahead"); - else if (ctx->cache_strategy == EROFS_ZIP_CACHE_READAROUND) + else if (opt->cache_strategy == EROFS_ZIP_CACHE_READAROUND) seq_puts(seq, ",cache_strategy=readaround"); +#endif +#ifdef CONFIG_EROFS_FS_ONDEMAND + if (sbi->fsid) + seq_printf(seq, ",fsid=%s", sbi->fsid); + if (sbi->domain_id) + seq_printf(seq, ",domain_id=%s", sbi->domain_id); #endif return 0; } @@ -608,4 +854,3 @@ module_exit(erofs_module_exit); MODULE_DESCRIPTION("Enhanced ROM File System"); MODULE_AUTHOR("Gao Xiang, Chao Yu, Miao Xie, CONSUMER BG, HUAWEI Inc."); MODULE_LICENSE("GPL"); - diff --git a/fs/erofs/tagptr.h b/fs/erofs/tagptr.h index a72897c86744c057240a2ee6fc6461c5cbe19924..64ceb7270b5c1845ef2a85d18ccdc56755710277 100644 --- a/fs/erofs/tagptr.h +++ b/fs/erofs/tagptr.h @@ -1,8 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0-only */ /* * A tagged pointer implementation - * - * Copyright (C) 2018 Gao Xiang */ #ifndef __EROFS_FS_TAGPTR_H #define __EROFS_FS_TAGPTR_H @@ -107,4 +105,3 @@ tagptr_init(o, cmpxchg(&ptptr->v, o.v, n.v)); }) *ptptr; }) #endif /* __EROFS_FS_TAGPTR_H */ - diff --git a/fs/erofs/utils.c b/fs/erofs/utils.c index 5c11199d753a6fc665dbee5cb97a2a43158bef98..f22cfa31a3c3b1a1b06beadac2740ed8187f5a62 100644 --- a/fs/erofs/utils.c +++ b/fs/erofs/utils.c @@ -2,7 +2,6 @@ /* * Copyright (C) 2018 HUAWEI, Inc. * https://www.huawei.com/ - * Created by Gao Xiang */ #include "internal.h" #include @@ -294,4 +293,3 @@ void erofs_exit_shrinker(void) unregister_shrinker(&erofs_shrinker_info); } #endif /* !CONFIG_EROFS_FS_ZIP */ - diff --git a/fs/erofs/xattr.c b/fs/erofs/xattr.c index 47314a26767a844fb016b1b8ebe435c014f4a1ea..7c55336959cb49eae1caaecd5e5c40644f569d05 100644 --- a/fs/erofs/xattr.c +++ b/fs/erofs/xattr.c @@ -2,40 +2,20 @@ /* * Copyright (C) 2017-2018 HUAWEI, Inc. 
* https://www.huawei.com/ - * Created by Gao Xiang + * Copyright (C) 2021-2022, Alibaba Cloud */ #include #include "xattr.h" struct xattr_iter { struct super_block *sb; - struct page *page; + struct erofs_buf buf; void *kaddr; erofs_blk_t blkaddr; unsigned int ofs; }; -static inline void xattr_iter_end(struct xattr_iter *it, bool atomic) -{ - /* the only user of kunmap() is 'init_inode_xattrs' */ - if (!atomic) - kunmap(it->page); - else - kunmap_atomic(it->kaddr); - - unlock_page(it->page); - put_page(it->page); -} - -static inline void xattr_iter_end_final(struct xattr_iter *it) -{ - if (!it->page) - return; - - xattr_iter_end(it, true); -} - static int init_inode_xattrs(struct inode *inode) { struct erofs_inode *const vi = EROFS_I(inode); @@ -44,7 +24,6 @@ static int init_inode_xattrs(struct inode *inode) struct erofs_xattr_ibody_header *ih; struct super_block *sb; struct erofs_sb_info *sbi; - bool atomic_map; int ret = 0; /* the most case is that xattrs of this inode are initialized. */ @@ -92,26 +71,23 @@ static int init_inode_xattrs(struct inode *inode) sb = inode->i_sb; sbi = EROFS_SB(sb); + it.buf = __EROFS_BUF_INITIALIZER; it.blkaddr = erofs_blknr(iloc(sbi, vi->nid) + vi->inode_isize); it.ofs = erofs_blkoff(iloc(sbi, vi->nid) + vi->inode_isize); - it.page = erofs_get_meta_page(sb, it.blkaddr); - if (IS_ERR(it.page)) { - ret = PTR_ERR(it.page); + /* read in shared xattr array (non-atomic, see kmalloc below) */ + it.kaddr = erofs_read_metabuf(&it.buf, sb, it.blkaddr, EROFS_KMAP); + if (IS_ERR(it.kaddr)) { + ret = PTR_ERR(it.kaddr); goto out_unlock; } - /* read in shared xattr array (non-atomic, see kmalloc below) */ - it.kaddr = kmap(it.page); - atomic_map = false; - ih = (struct erofs_xattr_ibody_header *)(it.kaddr + it.ofs); - vi->xattr_shared_count = ih->h_shared_count; vi->xattr_shared_xattrs = kmalloc_array(vi->xattr_shared_count, sizeof(uint), GFP_KERNEL); if (!vi->xattr_shared_xattrs) { - xattr_iter_end(&it, atomic_map); + erofs_put_metabuf(&it.buf); ret = -ENOMEM; goto out_unlock; } @@ -123,25 +99,22 @@ static int init_inode_xattrs(struct inode *inode) if (it.ofs >= EROFS_BLKSIZ) { /* cannot be unaligned */ DBG_BUGON(it.ofs != EROFS_BLKSIZ); - xattr_iter_end(&it, atomic_map); - it.page = erofs_get_meta_page(sb, ++it.blkaddr); - if (IS_ERR(it.page)) { + it.kaddr = erofs_read_metabuf(&it.buf, sb, ++it.blkaddr, + EROFS_KMAP); + if (IS_ERR(it.kaddr)) { kfree(vi->xattr_shared_xattrs); vi->xattr_shared_xattrs = NULL; - ret = PTR_ERR(it.page); + ret = PTR_ERR(it.kaddr); goto out_unlock; } - - it.kaddr = kmap_atomic(it.page); - atomic_map = true; it.ofs = 0; } vi->xattr_shared_xattrs[i] = le32_to_cpu(*(__le32 *)(it.kaddr + it.ofs)); it.ofs += sizeof(__le32); } - xattr_iter_end(&it, atomic_map); + erofs_put_metabuf(&it.buf); /* paired with smp_mb() at the beginning of the function. 
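 * (the shared xattr array and counts set up above must be visible before
 * EROFS_I_EA_INITED_BIT is set, so a reader that observes the bit also
 * observes fully initialised xattr state)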
*/ smp_mb(); @@ -173,19 +146,11 @@ static inline int xattr_iter_fixup(struct xattr_iter *it) if (it->ofs < EROFS_BLKSIZ) return 0; - xattr_iter_end(it, true); - it->blkaddr += erofs_blknr(it->ofs); - - it->page = erofs_get_meta_page(it->sb, it->blkaddr); - if (IS_ERR(it->page)) { - int err = PTR_ERR(it->page); - - it->page = NULL; - return err; - } - - it->kaddr = kmap_atomic(it->page); + it->kaddr = erofs_read_metabuf(&it->buf, it->sb, it->blkaddr, + EROFS_KMAP_ATOMIC); + if (IS_ERR(it->kaddr)) + return PTR_ERR(it->kaddr); it->ofs = erofs_blkoff(it->ofs); return 0; } @@ -208,11 +173,10 @@ static int inline_xattr_iter_begin(struct xattr_iter *it, it->blkaddr = erofs_blknr(iloc(sbi, vi->nid) + inline_xattr_ofs); it->ofs = erofs_blkoff(iloc(sbi, vi->nid) + inline_xattr_ofs); - it->page = erofs_get_meta_page(inode->i_sb, it->blkaddr); - if (IS_ERR(it->page)) - return PTR_ERR(it->page); - - it->kaddr = kmap_atomic(it->page); + it->kaddr = erofs_read_metabuf(&it->buf, inode->i_sb, it->blkaddr, + EROFS_KMAP_ATOMIC); + if (IS_ERR(it->kaddr)) + return PTR_ERR(it->kaddr); return vi->xattr_isize - xattr_header_sz; } @@ -273,7 +237,7 @@ static int xattr_foreach(struct xattr_iter *it, it->ofs = 0; } - slice = min_t(unsigned int, PAGE_SIZE - it->ofs, + slice = min_t(unsigned int, EROFS_BLKSIZ - it->ofs, entry.e_name_len - processed); /* handle name */ @@ -308,7 +272,7 @@ static int xattr_foreach(struct xattr_iter *it, it->ofs = 0; } - slice = min_t(unsigned int, PAGE_SIZE - it->ofs, + slice = min_t(unsigned int, EROFS_BLKSIZ - it->ofs, value_sz - processed); op->value(it, processed, it->kaddr + it->ofs, slice); it->ofs += slice; @@ -387,8 +351,6 @@ static int inline_getxattr(struct inode *inode, struct getxattr_iter *it) if (ret != -ENOATTR) break; } - xattr_iter_end_final(&it->it); - return ret ? ret : it->buffer_size; } @@ -405,32 +367,22 @@ static int shared_getxattr(struct inode *inode, struct getxattr_iter *it) xattrblock_addr(sbi, vi->xattr_shared_xattrs[i]); it->it.ofs = xattrblock_offset(sbi, vi->xattr_shared_xattrs[i]); - - if (!i || blkaddr != it->it.blkaddr) { - if (i) - xattr_iter_end(&it->it, true); - - it->it.page = erofs_get_meta_page(sb, blkaddr); - if (IS_ERR(it->it.page)) - return PTR_ERR(it->it.page); - - it->it.kaddr = kmap_atomic(it->it.page); - it->it.blkaddr = blkaddr; - } + it->it.kaddr = erofs_read_metabuf(&it->it.buf, sb, blkaddr, + EROFS_KMAP_ATOMIC); + if (IS_ERR(it->it.kaddr)) + return PTR_ERR(it->it.kaddr); + it->it.blkaddr = blkaddr; ret = xattr_foreach(&it->it, &find_xattr_handlers, NULL); if (ret != -ENOATTR) break; } - if (vi->xattr_shared_count) - xattr_iter_end_final(&it->it); - return ret ? 
ret : it->buffer_size; } static bool erofs_xattr_user_list(struct dentry *dentry) { - return test_opt(&EROFS_SB(dentry->d_sb)->ctx, XATTR_USER); + return test_opt(&EROFS_SB(dentry->d_sb)->opt, XATTR_USER); } static bool erofs_xattr_trusted_list(struct dentry *dentry) @@ -453,10 +405,11 @@ int erofs_getxattr(struct inode *inode, int index, return ret; it.index = index; - it.name.len = strlen(name); if (it.name.len > EROFS_NAME_LEN) return -ERANGE; + + it.it.buf = __EROFS_BUF_INITIALIZER; it.name.name = name; it.buffer = buffer; @@ -466,6 +419,7 @@ int erofs_getxattr(struct inode *inode, int index, ret = inline_getxattr(inode, &it); if (ret == -ENOATTR) ret = shared_getxattr(inode, &it); + erofs_put_metabuf(&it.it.buf); return ret; } @@ -477,7 +431,7 @@ static int erofs_xattr_generic_get(const struct xattr_handler *handler, switch (handler->flags) { case EROFS_XATTR_INDEX_USER: - if (!test_opt(&sbi->ctx, XATTR_USER)) + if (!test_opt(&sbi->opt, XATTR_USER)) return -EOPNOTSUPP; break; case EROFS_XATTR_INDEX_TRUSTED: @@ -608,7 +562,6 @@ static int inline_listxattr(struct listxattr_iter *it) if (ret) break; } - xattr_iter_end_final(&it->it); return ret ? ret : it->buffer_ofs; } @@ -626,25 +579,16 @@ static int shared_listxattr(struct listxattr_iter *it) xattrblock_addr(sbi, vi->xattr_shared_xattrs[i]); it->it.ofs = xattrblock_offset(sbi, vi->xattr_shared_xattrs[i]); - if (!i || blkaddr != it->it.blkaddr) { - if (i) - xattr_iter_end(&it->it, true); - - it->it.page = erofs_get_meta_page(sb, blkaddr); - if (IS_ERR(it->it.page)) - return PTR_ERR(it->it.page); - - it->it.kaddr = kmap_atomic(it->it.page); - it->it.blkaddr = blkaddr; - } + it->it.kaddr = erofs_read_metabuf(&it->it.buf, sb, blkaddr, + EROFS_KMAP_ATOMIC); + if (IS_ERR(it->it.kaddr)) + return PTR_ERR(it->it.kaddr); + it->it.blkaddr = blkaddr; ret = xattr_foreach(&it->it, &list_xattr_handlers, NULL); if (ret) break; } - if (vi->xattr_shared_count) - xattr_iter_end_final(&it->it); - return ret ? ret : it->buffer_ofs; } @@ -660,6 +604,7 @@ ssize_t erofs_listxattr(struct dentry *dentry, if (ret) return ret; + it.it.buf = __EROFS_BUF_INITIALIZER; it.dentry = dentry; it.buffer = buffer; it.buffer_size = buffer_size; @@ -668,9 +613,10 @@ ssize_t erofs_listxattr(struct dentry *dentry, it.it.sb = dentry->d_sb; ret = inline_listxattr(&it); - if (ret < 0 && ret != -ENOATTR) - return ret; - return shared_listxattr(&it); + if (ret >= 0 || ret == -ENOATTR) + ret = shared_listxattr(&it); + erofs_put_metabuf(&it.it.buf); + return ret; } #ifdef CONFIG_EROFS_FS_POSIX_ACL @@ -709,4 +655,3 @@ struct posix_acl *erofs_get_acl(struct inode *inode, int type) return acl; } #endif - diff --git a/fs/erofs/xattr.h b/fs/erofs/xattr.h index 815304bd335f53286e4b1a826e7373538b270324..50e283d0526be1beb395f2e28863590c7a5759b2 100644 --- a/fs/erofs/xattr.h +++ b/fs/erofs/xattr.h @@ -2,7 +2,6 @@ /* * Copyright (C) 2017-2018 HUAWEI, Inc. * https://www.huawei.com/ - * Created by Gao Xiang */ #ifndef __EROFS_XATTR_H #define __EROFS_XATTR_H @@ -87,4 +86,3 @@ struct posix_acl *erofs_get_acl(struct inode *inode, int type); #endif #endif - diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index da133950d5144396bf72bf48cf384879fd228aee..c93a511e21fbf135dde1f893325c5cce6b925a41 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -2,7 +2,6 @@ /* * Copyright (C) 2018 HUAWEI, Inc. 
* https://www.huawei.com/ - * Created by Gao Xiang */ #include "zdata.h" #include "compress.h" @@ -612,7 +611,7 @@ static int z_erofs_do_read_page(struct z_erofs_decompress_frontend *fe, goto err_out; /* preload all compressed pages (maybe downgrade role if necessary) */ - if (should_alloc_managed_pages(fe, sbi->ctx.cache_strategy, map->m_la)) + if (should_alloc_managed_pages(fe, sbi->opt.cache_strategy, map->m_la)) cache_strategy = DELAYEDALLOC; else cache_strategy = DONTALLOC; @@ -1163,8 +1162,9 @@ static void z_erofs_submit_queue(struct super_block *sb, struct z_erofs_decompressqueue *q[NR_JOBQUEUES]; void *bi_private; z_erofs_next_pcluster_t owned_head = f->clt.owned_head; - /* since bio will be NULL, no need to initialize last_index */ + /* bio is NULL initially, so no need to initialize last_{index,bdev} */ pgoff_t last_index; + struct block_device *last_bdev; unsigned int nr_bios = 0; struct bio *bio = NULL; @@ -1176,6 +1176,7 @@ static void z_erofs_submit_queue(struct super_block *sb, q[JQ_SUBMIT]->head = owned_head; do { + struct erofs_map_dev mdev; struct z_erofs_pcluster *pcl; pgoff_t cur, end; unsigned int i = 0; @@ -1187,7 +1188,13 @@ static void z_erofs_submit_queue(struct super_block *sb, pcl = container_of(owned_head, struct z_erofs_pcluster, next); - cur = pcl->obj.index; + /* no device id here, thus it will always succeed */ + mdev = (struct erofs_map_dev) { + .m_pa = blknr_to_addr(pcl->obj.index), + }; + (void)erofs_map_dev(sb, &mdev); + + cur = erofs_blknr(mdev.m_pa); end = cur + BIT(pcl->clusterbits); /* close the main owned chain at first */ @@ -1203,7 +1210,8 @@ static void z_erofs_submit_queue(struct super_block *sb, if (!page) continue; - if (bio && cur != last_index + 1) { + if (bio && (cur != last_index + 1 || + last_bdev != mdev.m_bdev)) { submit_bio_retry: submit_bio(bio); bio = NULL; @@ -1211,9 +1219,10 @@ static void z_erofs_submit_queue(struct super_block *sb, if (!bio) { bio = bio_alloc(GFP_NOIO, BIO_MAX_PAGES); - bio->bi_end_io = z_erofs_decompressqueue_endio; - bio_set_dev(bio, sb->s_bdev); + + bio_set_dev(bio, mdev.m_bdev); + last_bdev = mdev.m_bdev; bio->bi_iter.bi_sector = (sector_t)cur << LOG_SECTORS_PER_BLOCK; bio->bi_private = bi_private; @@ -1294,9 +1303,7 @@ static int z_erofs_readpage(struct file *file, struct page *page) if (err) erofs_err(inode->i_sb, "failed to read, err [%d]", err); - if (f.map.mpage) - put_page(f.map.mpage); - + erofs_put_metabuf(&f.map.buf); /* clean up the remaining free pages */ put_pages_list(&pagepool); return err; @@ -1308,7 +1315,7 @@ static void z_erofs_readahead(struct readahead_control *rac) struct erofs_sb_info *const sbi = EROFS_I_SB(inode); unsigned int nr_pages = readahead_count(rac); - bool sync = (nr_pages <= sbi->ctx.max_sync_decompress_pages); + bool sync = (nr_pages <= sbi->opt.max_sync_decompress_pages); struct z_erofs_decompress_frontend f = DECOMPRESS_FRONTEND_INIT(inode); struct page *page, *head = NULL; LIST_HEAD(pagepool); @@ -1350,10 +1357,7 @@ static void z_erofs_readahead(struct readahead_control *rac) (void)z_erofs_collector_end(&f.clt); z_erofs_runqueue(inode->i_sb, &f, &pagepool, sync); - - if (f.map.mpage) - put_page(f.map.mpage); - + erofs_put_metabuf(&f.map.buf); /* clean up the remaining free pages */ put_pages_list(&pagepool); } @@ -1362,4 +1366,3 @@ const struct address_space_operations z_erofs_aops = { .readpage = z_erofs_readpage, .readahead = z_erofs_readahead, }; - diff --git a/fs/erofs/zdata.h b/fs/erofs/zdata.h index 
68c9b29fc0ca51cfd822d352e5d1bf8cd2ccf4d2..68da309d5ad7d452a484d143c1a6ace28f44182a 100644 --- a/fs/erofs/zdata.h +++ b/fs/erofs/zdata.h @@ -2,7 +2,6 @@ /* * Copyright (C) 2018 HUAWEI, Inc. * https://www.huawei.com/ - * Created by Gao Xiang */ #ifndef __EROFS_FS_ZDATA_H #define __EROFS_FS_ZDATA_H diff --git a/fs/erofs/zmap.c b/fs/erofs/zmap.c index a5537a9f8f36789d1d799ff50841009967bdd98b..840e795a7c70a015daccec18203df0296019a718 100644 --- a/fs/erofs/zmap.c +++ b/fs/erofs/zmap.c @@ -2,7 +2,6 @@ /* * Copyright (C) 2018-2019 HUAWEI, Inc. * https://www.huawei.com/ - * Created by Gao Xiang */ #include "internal.h" #include @@ -32,7 +31,7 @@ static int z_erofs_fill_inode_lazy(struct inode *inode) struct super_block *const sb = inode->i_sb; int err; erofs_off_t pos; - struct page *page; + struct erofs_buf buf = __EROFS_BUF_INITIALIZER; void *kaddr; struct z_erofs_map_header *h; @@ -56,14 +55,13 @@ static int z_erofs_fill_inode_lazy(struct inode *inode) pos = ALIGN(iloc(EROFS_SB(sb), vi->nid) + vi->inode_isize + vi->xattr_isize, 8); - page = erofs_get_meta_page(sb, erofs_blknr(pos)); - if (IS_ERR(page)) { - err = PTR_ERR(page); + kaddr = erofs_read_metabuf(&buf, sb, erofs_blknr(pos), + EROFS_KMAP_ATOMIC); + if (IS_ERR(kaddr)) { + err = PTR_ERR(kaddr); goto out_unlock; } - kaddr = kmap_atomic(page); - h = kaddr + erofs_blkoff(pos); vi->z_advise = le16_to_cpu(h->h_advise); vi->z_algorithmtype[0] = h->h_algorithmtype & 15; @@ -93,9 +91,7 @@ static int z_erofs_fill_inode_lazy(struct inode *inode) smp_mb(); set_bit(EROFS_I_Z_INITED_BIT, &vi->flags); unmap_done: - kunmap_atomic(kaddr); - unlock_page(page); - put_page(page); + erofs_put_metabuf(&buf); out_unlock: clear_and_wake_up_bit(EROFS_I_BL_Z_BIT, &vi->flags); return err; @@ -118,31 +114,11 @@ static int z_erofs_reload_indexes(struct z_erofs_maprecorder *m, erofs_blk_t eblk) { struct super_block *const sb = m->inode->i_sb; - struct erofs_map_blocks *const map = m->map; - struct page *mpage = map->mpage; - - if (mpage) { - if (mpage->index == eblk) { - if (!m->kaddr) - m->kaddr = kmap_atomic(mpage); - return 0; - } - if (m->kaddr) { - kunmap_atomic(m->kaddr); - m->kaddr = NULL; - } - put_page(mpage); - } - - mpage = erofs_get_meta_page(sb, eblk); - if (IS_ERR(mpage)) { - map->mpage = NULL; - return PTR_ERR(mpage); - } - m->kaddr = kmap_atomic(mpage); - unlock_page(mpage); - map->mpage = mpage; + m->kaddr = erofs_read_metabuf(&m->map->buf, sb, eblk, + EROFS_KMAP_ATOMIC); + if (IS_ERR(m->kaddr)) + return PTR_ERR(m->kaddr); return 0; } @@ -462,8 +438,7 @@ int z_erofs_map_blocks_iter(struct inode *inode, map->m_flags |= EROFS_MAP_MAPPED; unmap_out: - if (m.kaddr) - kunmap_atomic(m.kaddr); + erofs_unmap_metabuf(&m.map->buf); out: erofs_dbg("%s, m_la %llu m_pa %llu m_llen %llu m_plen %llu m_flags 0%o", @@ -476,4 +451,3 @@ int z_erofs_map_blocks_iter(struct inode *inode, DBG_BUGON(err < 0 && err != -ENOMEM); return err; } - diff --git a/fs/erofs/zpvec.h b/fs/erofs/zpvec.h index 52898176ef31da86ecc24edb1327400d5936cd40..b05464f4a80831a9a4f5bdeaed21bd6d5eae4208 100644 --- a/fs/erofs/zpvec.h +++ b/fs/erofs/zpvec.h @@ -2,7 +2,6 @@ /* * Copyright (C) 2018 HUAWEI, Inc. 
* https://www.huawei.com/ - * Created by Gao Xiang */ #ifndef __EROFS_FS_ZPVEC_H #define __EROFS_FS_ZPVEC_H @@ -158,4 +157,3 @@ z_erofs_pagevec_dequeue(struct z_erofs_pagevec_ctor *ctor, return tagptr_unfold_ptr(t); } #endif - diff --git a/fs/fs_ctl.c b/fs/fs_ctl.c new file mode 100644 index 0000000000000000000000000000000000000000..6464c9ba5e18f19ec2cf05f66d06eeef9e7fa93f --- /dev/null +++ b/fs/fs_ctl.c @@ -0,0 +1,43 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (C) 2024. Huawei Technologies Co., Ltd */ + +#include +#include +#include + +#if IS_ENABLED(CONFIG_EROFS_FS) || IS_ENABLED(CONFIG_CACHEFILES_ONDEMAND) +static int param_set_bool_on_only_once(const char *s, const struct kernel_param *kp) +{ + int ret; + bool value, *res = kp->arg; + + if (!s) + s = "1"; + + ret = strtobool(s, &value); + if (ret) + return ret; + + if (!value && *res) + return -EBUSY; + + if (value && !*res) + WRITE_ONCE(*res, true); + + return 0; +} +#endif + +#if IS_ENABLED(CONFIG_EROFS_FS) +bool erofs_enabled; +EXPORT_SYMBOL(erofs_enabled); +module_param_call(erofs_enabled, param_set_bool_on_only_once, param_get_bool, + &erofs_enabled, 0644); +#endif + +#if IS_ENABLED(CONFIG_CACHEFILES_ONDEMAND) +bool cachefiles_ondemand_enabled; +EXPORT_SYMBOL(cachefiles_ondemand_enabled); +module_param_call(cachefiles_ondemand_enabled, param_set_bool_on_only_once, param_get_bool, + &cachefiles_ondemand_enabled, 0644); +#endif diff --git a/fs/fscache/cookie.c b/fs/fscache/cookie.c index 6104f627cc71242f88845ec4aae945a66c90c072..66a4db1eee3629aace6b5d939551321a98b8c9fa 100644 --- a/fs/fscache/cookie.c +++ b/fs/fscache/cookie.c @@ -27,32 +27,6 @@ static int fscache_alloc_object(struct fscache_cache *cache, static int fscache_attach_object(struct fscache_cookie *cookie, struct fscache_object *object); -static void fscache_print_cookie(struct fscache_cookie *cookie, char prefix) -{ - struct hlist_node *object; - const u8 *k; - unsigned loop; - - pr_err("%c-cookie c=%p [p=%p fl=%lx nc=%u na=%u]\n", - prefix, cookie, cookie->parent, cookie->flags, - atomic_read(&cookie->n_children), - atomic_read(&cookie->n_active)); - pr_err("%c-cookie d=%p n=%p\n", - prefix, cookie->def, cookie->netfs_data); - - object = READ_ONCE(cookie->backing_objects.first); - if (object) - pr_err("%c-cookie o=%p\n", - prefix, hlist_entry(object, struct fscache_object, cookie_link)); - - pr_err("%c-key=[%u] '", prefix, cookie->key_len); - k = (cookie->key_len <= sizeof(cookie->inline_key)) ? - cookie->inline_key : cookie->key; - for (loop = 0; loop < cookie->key_len; loop++) - pr_cont("%02x", k[loop]); - pr_cont("'\n"); -} - void fscache_free_cookie(struct fscache_cookie *cookie) { if (cookie) { @@ -65,6 +39,38 @@ void fscache_free_cookie(struct fscache_cookie *cookie) } } +static int fscache_set_volume_key_hash(struct fscache_cookie *cookie, u32 *buf) +{ + u8 *key; + size_t hlen = round_up(1 + cookie->key_len + 1, sizeof(__le32)); + + key = kzalloc(hlen, GFP_KERNEL); + if (!key) + return -ENOMEM; + + key[0] = cookie->key_len; + memcpy(key + 1, buf, cookie->key_len); + cookie->key_hash = fscache_hash(0, (u32 *)key, hlen / sizeof(__le32)); + kfree(key); + + return 0; +} + +static int fscache_set_key_hash(struct fscache_cookie *cookie, u32 *buf, + int bufs) +{ + unsigned int salt = 0; + + if (volume_new_version(cookie)) + return fscache_set_volume_key_hash(cookie, buf); + + if (data_new_version(cookie)) + salt = cookie->parent->key_hash; + + cookie->key_hash = fscache_hash(salt, buf, bufs); + return 0; +} + /* * Set the index key in a cookie. 
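 * Key hashing is delegated to fscache_set_key_hash() above: a new-version
 * volume (index) cookie hashes a length-prefixed copy of the key padded to a
 * __le32 multiple, and a new-version data cookie salts the hash with its
 * parent's key_hash (see volume_new_version()/data_new_version()).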
The cookie struct has space for a 16-byte * key plus length and hash, but if that's not big enough, it's instead a @@ -76,6 +82,7 @@ static int fscache_set_key(struct fscache_cookie *cookie, { u32 *buf; int bufs; + int ret; bufs = DIV_ROUND_UP(index_key_len, sizeof(*buf)); @@ -89,8 +96,12 @@ static int fscache_set_key(struct fscache_cookie *cookie, } memcpy(buf, index_key, index_key_len); - cookie->key_hash = fscache_hash(0, buf, bufs); - return 0; + ret = fscache_set_key_hash(cookie, buf, bufs); + if (ret && index_key_len > sizeof(cookie->inline_key)) { + kfree(cookie->key); + cookie->key = NULL; + } + return ret; } static long fscache_compare_cookie(const struct fscache_cookie *a, @@ -137,6 +148,9 @@ struct fscache_cookie *fscache_alloc_cookie( cookie->key_len = index_key_len; cookie->aux_len = aux_data_len; + cookie->def = def; + cookie->parent = parent; + cookie->type = def->type; if (fscache_set_key(cookie, index_key, index_key_len) < 0) goto nomem; @@ -157,11 +171,9 @@ struct fscache_cookie *fscache_alloc_cookie( */ atomic_set(&cookie->n_active, 1); - cookie->def = def; - cookie->parent = parent; + cookie->collision = NULL; cookie->netfs_data = netfs_data; cookie->flags = (1 << FSCACHE_COOKIE_NO_DATA_YET); - cookie->type = def->type; spin_lock_init(&cookie->lock); spin_lock_init(&cookie->stores_lock); INIT_HLIST_HEAD(&cookie->backing_objects); @@ -176,6 +188,27 @@ struct fscache_cookie *fscache_alloc_cookie( return NULL; } +static bool fscache_is_acquire_pending(struct fscache_cookie *cookie) +{ + return test_bit(FSCACHE_COOKIE_ACQUIRE_PENDING, &cookie->flags); +} + +static int fscache_wait_on_cookie_collision(struct fscache_cookie *candidate) +{ + int ret; + + ret = wait_on_bit_timeout_acquire(&candidate->flags, FSCACHE_COOKIE_ACQUIRE_PENDING, + TASK_INTERRUPTIBLE, 20 * HZ); + if (ret == -EINTR) + return ret; + if (fscache_is_acquire_pending(candidate)) { + pr_notice("Potential cookie collision!"); + return wait_on_bit_acquire(&candidate->flags, FSCACHE_COOKIE_ACQUIRE_PENDING, + TASK_INTERRUPTIBLE); + } + return 0; +} + /* * Attempt to insert the new cookie into the hash. If there's a collision, we * return the old cookie if it's not in use and an error otherwise. 
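For review purposes, the duplicate-cookie handling introduced around here boils down to a small waiter/waker handshake between the acquire path and the unhash path. A minimal sketch follows; the two wrapper functions and their names are illustrative only, while the ACQUIRE_PENDING flag, the ->collision field and the acquire-flavoured wait/wake helpers are the ones this patch adds. Hash-bucket locking and the second, untimed wait are omitted here:

static int queue_behind_relinquished_cookie(struct fscache_cookie *candidate,
					    struct fscache_cookie *cursor)
{
	/* Acquire side: the colliding cookie is already RELINQUISHED, so park
	 * the new one behind it instead of failing with "Duplicate cookie". */
	cursor->collision = candidate;
	set_bit(FSCACHE_COOKIE_ACQUIRE_PENDING, &candidate->flags);

	/* Wait up to 20s with acquire semantics; -EINTR aborts the acquire. */
	return wait_on_bit_timeout_acquire(&candidate->flags,
					   FSCACHE_COOKIE_ACQUIRE_PENDING,
					   TASK_INTERRUPTIBLE, 20 * HZ);
}

static void wake_queued_cookie(struct fscache_cookie *cookie)
{
	/* Relinquish side, reached via fscache_unhash_cookie(): let the parked
	 * candidate proceed once this cookie leaves the hash. */
	if (cookie->collision) {
		clear_and_wake_up_bit(FSCACHE_COOKIE_ACQUIRE_PENDING,
				      &cookie->collision->flags);
		cookie->collision = NULL;
	}
}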
@@ -192,8 +225,13 @@ struct fscache_cookie *fscache_hash_cookie(struct fscache_cookie *candidate) hlist_bl_lock(h); hlist_bl_for_each_entry(cursor, p, h, hash_link) { - if (fscache_compare_cookie(candidate, cursor) == 0) - goto collision; + if (fscache_compare_cookie(candidate, cursor) == 0) { + if (!test_bit(FSCACHE_COOKIE_RELINQUISHED, &cursor->flags)) + goto collision; + cursor->collision = candidate; + set_bit(FSCACHE_COOKIE_ACQUIRE_PENDING, &candidate->flags); + break; + } } __set_bit(FSCACHE_COOKIE_ACQUIRED, &candidate->flags); @@ -201,16 +239,27 @@ struct fscache_cookie *fscache_hash_cookie(struct fscache_cookie *candidate) atomic_inc(&candidate->parent->n_children); hlist_bl_add_head(&candidate->hash_link, h); hlist_bl_unlock(h); + + if (fscache_is_acquire_pending(candidate) && + fscache_wait_on_cookie_collision(candidate)) { + fscache_cookie_put(candidate->parent, fscache_cookie_put_acquire_nobufs); + atomic_dec(&candidate->parent->n_children); + hlist_bl_lock(h); + hlist_bl_del(&candidate->hash_link); + if (fscache_is_acquire_pending(candidate)) + cursor->collision = NULL; + hlist_bl_unlock(h); + pr_err("Wait duplicate cookie unhashed interrupted\n"); + return NULL; + } return candidate; collision: if (test_and_set_bit(FSCACHE_COOKIE_ACQUIRED, &cursor->flags)) { trace_fscache_cookie(cursor, fscache_cookie_collision, atomic_read(&cursor->usage)); - pr_err("Duplicate cookie detected\n"); - fscache_print_cookie(cursor, 'O'); - fscache_print_cookie(candidate, 'N'); hlist_bl_unlock(h); + pr_err_ratelimited("Duplicate cookie detected\n"); return NULL; } @@ -368,8 +417,7 @@ void __fscache_enable_cookie(struct fscache_cookie *cookie, } out_unlock: - clear_bit_unlock(FSCACHE_COOKIE_ENABLEMENT_LOCK, &cookie->flags); - wake_up_bit(&cookie->flags, FSCACHE_COOKIE_ENABLEMENT_LOCK); + clear_and_wake_up_bit(FSCACHE_COOKIE_ENABLEMENT_LOCK, &cookie->flags); } EXPORT_SYMBOL(__fscache_enable_cookie); @@ -441,8 +489,8 @@ static int fscache_acquire_non_index_cookie(struct fscache_cookie *cookie, /* we may be required to wait for lookup to complete at this point */ if (!fscache_defer_lookup) { _debug("non-deferred lookup %p", &cookie->flags); - wait_on_bit(&cookie->flags, FSCACHE_COOKIE_LOOKING_UP, - TASK_UNINTERRUPTIBLE); + wait_on_bit_acquire(&cookie->flags, FSCACHE_COOKIE_LOOKING_UP, + TASK_UNINTERRUPTIBLE); _debug("complete"); if (test_bit(FSCACHE_COOKIE_UNAVAILABLE, &cookie->flags)) goto unavailable; @@ -648,7 +696,7 @@ void __fscache_wait_on_invalidate(struct fscache_cookie *cookie) { _enter("%p", cookie); - wait_on_bit(&cookie->flags, FSCACHE_COOKIE_INVALIDATING, + wait_on_bit_acquire(&cookie->flags, FSCACHE_COOKIE_INVALIDATING, TASK_UNINTERRUPTIBLE); _leave(""); @@ -765,8 +813,7 @@ void __fscache_disable_cookie(struct fscache_cookie *cookie, } out_unlock_enable: - clear_bit_unlock(FSCACHE_COOKIE_ENABLEMENT_LOCK, &cookie->flags); - wake_up_bit(&cookie->flags, FSCACHE_COOKIE_ENABLEMENT_LOCK); + clear_and_wake_up_bit(FSCACHE_COOKIE_ENABLEMENT_LOCK, &cookie->flags); _leave(""); } EXPORT_SYMBOL(__fscache_disable_cookie); @@ -805,7 +852,6 @@ void __fscache_relinquish_cookie(struct fscache_cookie *cookie, /* Clear pointers back to the netfs */ cookie->netfs_data = NULL; - cookie->def = NULL; BUG_ON(!radix_tree_empty(&cookie->stores)); if (cookie->parent) { @@ -825,16 +871,24 @@ EXPORT_SYMBOL(__fscache_relinquish_cookie); /* * Remove a cookie from the hash table. 
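 * The helper is no longer static (declared in internal.h) so the object
 * state machine can unhash new-version cookies at drop time, it tolerates
 * repeated calls via hlist_bl_unhashed(), and it wakes any candidate parked
 * on FSCACHE_COOKIE_ACQUIRE_PENDING through ->collision.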
*/ -static void fscache_unhash_cookie(struct fscache_cookie *cookie) +void fscache_unhash_cookie(struct fscache_cookie *cookie) { struct hlist_bl_head *h; unsigned int bucket; + if (hlist_bl_unhashed(&cookie->hash_link)) + return; + bucket = cookie->key_hash & (ARRAY_SIZE(fscache_cookie_hash) - 1); h = &fscache_cookie_hash[bucket]; hlist_bl_lock(h); - hlist_bl_del(&cookie->hash_link); + hlist_bl_del_init(&cookie->hash_link); + if (cookie->collision) { + clear_and_wake_up_bit(FSCACHE_COOKIE_ACQUIRE_PENDING, + &cookie->collision->flags); + cookie->collision = NULL; + } hlist_bl_unlock(h); } @@ -850,9 +904,8 @@ void fscache_cookie_put(struct fscache_cookie *cookie, _enter("%p", cookie); do { + trace_fscache_cookie(cookie, where, atomic_read(&cookie->usage)); usage = atomic_dec_return(&cookie->usage); - trace_fscache_cookie(cookie, where, usage); - if (usage > 0) return; BUG_ON(usage < 0); diff --git a/fs/fscache/internal.h b/fs/fscache/internal.h index 64aa552b296d772688ad92c9a5954b65a371f60a..533c4b4586d8be08c7f2cd8e2328482ac43e510f 100644 --- a/fs/fscache/internal.h +++ b/fs/fscache/internal.h @@ -55,6 +55,7 @@ extern struct fscache_cookie *fscache_alloc_cookie(struct fscache_cookie *, extern struct fscache_cookie *fscache_hash_cookie(struct fscache_cookie *); extern void fscache_cookie_put(struct fscache_cookie *, enum fscache_cookie_trace); +extern void fscache_unhash_cookie(struct fscache_cookie *cookie); /* * fsdef.c diff --git a/fs/fscache/main.c b/fs/fscache/main.c index 4207f98e405fd15cb485b31d51fa0cb260e0e2e5..e8c6c70092e076558fcb06165ab913ef36003642 100644 --- a/fs/fscache/main.c +++ b/fs/fscache/main.c @@ -39,13 +39,18 @@ MODULE_PARM_DESC(fscache_debug, struct kobject *fscache_root; struct workqueue_struct *fscache_object_wq; +EXPORT_SYMBOL(fscache_object_wq); struct workqueue_struct *fscache_op_wq; DEFINE_PER_CPU(wait_queue_head_t, fscache_object_cong_wait); /* these values serve as lower bounds, will be adjusted in fscache_init() */ -static unsigned fscache_object_max_active = 4; -static unsigned fscache_op_max_active = 2; +#define FSCACHE_MIN_OBJECT_MAX_ACTIVE 4 +#define FSCACHE_DEF_OBJECT_MAX_ACTIVE 256 +static unsigned int fscache_object_max_active = FSCACHE_DEF_OBJECT_MAX_ACTIVE; +static unsigned int fscache_op_max_active = FSCACHE_DEF_OBJECT_MAX_ACTIVE / 2; +static unsigned int fscache_min_object_max_active = FSCACHE_MIN_OBJECT_MAX_ACTIVE; +static unsigned int fscache_min_op_max_active = FSCACHE_MIN_OBJECT_MAX_ACTIVE / 2; #ifdef CONFIG_SYSCTL static struct ctl_table_header *fscache_sysctl_header; @@ -54,12 +59,16 @@ static int fscache_max_active_sysctl(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct workqueue_struct **wqp = table->extra1; + unsigned int *min_val = table->extra2; unsigned int *datap = table->data; int ret; ret = proc_dointvec(table, write, buffer, lenp, ppos); - if (ret == 0) + if (ret == 0) { + if (cachefiles_ondemand_is_enabled() && *datap < *min_val) + return -EINVAL; workqueue_set_max_active(*wqp, *datap); + } return ret; } @@ -71,6 +80,7 @@ static struct ctl_table fscache_sysctls[] = { .mode = 0644, .proc_handler = fscache_max_active_sysctl, .extra1 = &fscache_object_wq, + .extra2 = &fscache_min_object_max_active, }, { .procname = "operation_max_active", @@ -79,6 +89,7 @@ static struct ctl_table fscache_sysctls[] = { .mode = 0644, .proc_handler = fscache_max_active_sysctl, .extra1 = &fscache_op_wq, + .extra2 = &fscache_min_op_max_active, }, {} }; diff --git a/fs/fscache/object.c b/fs/fscache/object.c index 
cb2146e02cd5442637ed50ccacc6071aab30a9dd..f05003bb743c4d6f1a862e0b3f10afa911db8e65 100644 --- a/fs/fscache/object.c +++ b/fs/fscache/object.c @@ -382,14 +382,14 @@ static const struct fscache_state *fscache_initialise_object(struct fscache_obje parent = object->parent; if (!parent) { _leave(" [no parent]"); - return transit_to(DROP_OBJECT); + return transit_to(KILL_OBJECT); } _debug("parent: %s of:%lx", parent->state->name, parent->flags); if (fscache_object_is_dying(parent)) { _leave(" [bad parent]"); - return transit_to(DROP_OBJECT); + return transit_to(KILL_OBJECT); } if (fscache_object_is_available(parent)) { @@ -411,7 +411,7 @@ static const struct fscache_state *fscache_initialise_object(struct fscache_obje spin_unlock(&parent->lock); if (!success) { _leave(" [grab failed]"); - return transit_to(DROP_OBJECT); + return transit_to(KILL_OBJECT); } /* fscache_acquire_non_index_cookie() uses this @@ -427,17 +427,9 @@ static const struct fscache_state *fscache_initialise_object(struct fscache_obje static const struct fscache_state *fscache_parent_ready(struct fscache_object *object, int event) { - struct fscache_object *parent = object->parent; - _enter("{OBJ%x},%d", object->debug_id, event); - ASSERT(parent != NULL); - - spin_lock(&parent->lock); - parent->n_ops++; - parent->n_obj_ops++; object->lookup_jif = jiffies; - spin_unlock(&parent->lock); _leave(""); return transit_to(LOOK_UP_OBJECT); @@ -460,6 +452,12 @@ static const struct fscache_state *fscache_look_up_object(struct fscache_object object->oob_table = fscache_osm_lookup_oob; ASSERT(parent != NULL); + + spin_lock(&parent->lock); + parent->n_ops++; + parent->n_obj_ops++; + spin_unlock(&parent->lock); + ASSERTCMP(parent->n_ops, >, 0); ASSERTCMP(parent->n_obj_ops, >, 0); @@ -523,8 +521,7 @@ void fscache_object_lookup_negative(struct fscache_object *object) clear_bit(FSCACHE_COOKIE_UNAVAILABLE, &cookie->flags); _debug("wake up lookup %p", &cookie->flags); - clear_bit_unlock(FSCACHE_COOKIE_LOOKING_UP, &cookie->flags); - wake_up_bit(&cookie->flags, FSCACHE_COOKIE_LOOKING_UP); + clear_and_wake_up_bit(FSCACHE_COOKIE_LOOKING_UP, &cookie->flags); } _leave(""); } @@ -558,8 +555,7 @@ void fscache_obtained_object(struct fscache_object *object) /* Allow write requests to begin stacking up and read requests * to begin shovelling data. 
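 * (clear_and_wake_up_bit() below replaces the open-coded clear_bit_unlock()
 * plus wake_up_bit() pair, keeping the release ordering on the clear.)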
*/ - clear_bit_unlock(FSCACHE_COOKIE_LOOKING_UP, &cookie->flags); - wake_up_bit(&cookie->flags, FSCACHE_COOKIE_LOOKING_UP); + clear_and_wake_up_bit(FSCACHE_COOKIE_LOOKING_UP, &cookie->flags); } else { fscache_stat(&fscache_n_object_created); } @@ -745,6 +741,9 @@ static const struct fscache_state *fscache_drop_object(struct fscache_object *ob cache->ops->drop_object(object); fscache_stat_d(&fscache_n_cop_drop_object); + if (volume_new_version(cookie) || data_new_version(cookie)) + fscache_unhash_cookie(cookie); + /* The parent object wants to know when all it dependents have gone */ if (parent) { _debug("release parent OBJ%x {%d}", @@ -901,12 +900,16 @@ static void fscache_dequeue_object(struct fscache_object *object) { _enter("{OBJ%x}", object->debug_id); + if (list_empty(&object->dep_link)) + goto out; + + spin_lock(&object->parent->lock); if (!list_empty(&object->dep_link)) { - spin_lock(&object->parent->lock); list_del_init(&object->dep_link); - spin_unlock(&object->parent->lock); + fscache_put_object(object, fscache_obj_put_dequeue); } - + spin_unlock(&object->parent->lock); +out: _leave(""); } diff --git a/fs/fscache/page.c b/fs/fscache/page.c index 26af6fdf15387fa3e39442b7df2c62706a1afff0..0b5e477cf62b82668c189141ff73296f33b6e4e5 100644 --- a/fs/fscache/page.c +++ b/fs/fscache/page.c @@ -113,6 +113,8 @@ bool __fscache_maybe_release_page(struct fscache_cookie *cookie, fscache_stat(&fscache_n_store_vmscan_gone); } + /* Make sure the delete operation is performed before waking. */ + smp_mb(); wake_up_bit(&cookie->flags, 0); trace_fscache_wake_cookie(cookie); if (xpage) @@ -171,6 +173,8 @@ static void fscache_end_page_write(struct fscache_object *object, trace_fscache_page(cookie, page, fscache_page_write_end_pend); } spin_unlock(&cookie->stores_lock); + /* Make sure the delete operation is performed before waking. 
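+	 * wake_up_bit() checks waitqueue_active() locklessly, so the waker must
+	 * order its stores before that check; hence the explicit smp_mb() below.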
*/ + smp_mb(); wake_up_bit(&cookie->flags, 0); trace_fscache_wake_cookie(cookie); } else { @@ -352,8 +356,8 @@ int fscache_wait_for_deferred_lookup(struct fscache_cookie *cookie) fscache_stat(&fscache_n_retrievals_wait); jif = jiffies; - if (wait_on_bit(&cookie->flags, FSCACHE_COOKIE_LOOKING_UP, - TASK_INTERRUPTIBLE) != 0) { + if (wait_on_bit_acquire(&cookie->flags, FSCACHE_COOKIE_LOOKING_UP, + TASK_INTERRUPTIBLE) != 0) { fscache_stat(&fscache_n_retrievals_intr); _leave(" = -ERESTARTSYS"); return -ERESTARTSYS; @@ -383,8 +387,8 @@ int fscache_wait_for_operation_activation(struct fscache_object *object, _debug(">>> WT"); if (stat_op_waits) fscache_stat(stat_op_waits); - if (wait_on_bit(&op->flags, FSCACHE_OP_WAITING, - TASK_INTERRUPTIBLE) != 0) { + if (wait_on_bit_acquire(&op->flags, FSCACHE_OP_WAITING, + TASK_INTERRUPTIBLE) != 0) { trace_fscache_op(object->cookie, op, fscache_op_signal); ret = fscache_cancel_op(op, false); if (ret == 0) @@ -392,8 +396,8 @@ int fscache_wait_for_operation_activation(struct fscache_object *object, /* it's been removed from the pending queue by another party, * so we should get to run shortly */ - wait_on_bit(&op->flags, FSCACHE_OP_WAITING, - TASK_UNINTERRUPTIBLE); + wait_on_bit_acquire(&op->flags, FSCACHE_OP_WAITING, + TASK_UNINTERRUPTIBLE); } _debug("<<< GO"); @@ -430,7 +434,7 @@ int __fscache_read_or_alloc_page(struct fscache_cookie *cookie, struct page *page, fscache_rw_complete_t end_io_func, void *context, - gfp_t gfp) + gfp_t gfp, loff_t pos) { struct fscache_retrieval *op; struct fscache_object *object; @@ -472,8 +476,6 @@ int __fscache_read_or_alloc_page(struct fscache_cookie *cookie, object = hlist_entry(cookie->backing_objects.first, struct fscache_object, cookie_link); - ASSERT(test_bit(FSCACHE_OBJECT_IS_LOOKED_UP, &object->flags)); - __fscache_use_cookie(cookie); atomic_inc(&object->n_reads); __set_bit(FSCACHE_OP_DEC_READ_CNT, &op->op.flags); @@ -493,6 +495,8 @@ int __fscache_read_or_alloc_page(struct fscache_cookie *cookie, if (ret < 0) goto error; + op->offset = pos; + /* ask the cache to honour the operation */ if (test_bit(FSCACHE_COOKIE_NO_DATA_YET, &object->cookie->flags)) { fscache_stat(&fscache_n_cop_allocate_page); @@ -664,6 +668,72 @@ int __fscache_read_or_alloc_pages(struct fscache_cookie *cookie, } EXPORT_SYMBOL(__fscache_read_or_alloc_pages); +int __fscache_prepare_read(struct fscache_cookie *cookie, + struct address_space *mapping, pgoff_t index, + unsigned int nr_pages, loff_t start_pos, + fscache_rw_complete_t term_func, void *context) +{ + struct fscache_retrieval *op; + struct fscache_object *object; + bool wake_cookie = false; + int ret; + + if (hlist_empty(&cookie->backing_objects)) + return -ENOBUFS; + + if (test_bit(FSCACHE_COOKIE_INVALIDATING, &cookie->flags)) { + _leave(" = -ENOBUFS [invalidating]"); + return -ENOBUFS; + } + + ASSERTCMP(cookie->def->type, !=, FSCACHE_COOKIE_TYPE_INDEX); + + if (fscache_wait_for_deferred_lookup(cookie) < 0) + return -ERESTARTSYS; + + op = fscache_alloc_retrieval(cookie, mapping, term_func, context); + if (!op) + return -ENOMEM; + atomic_set(&op->n_pages, nr_pages); + op->offset = start_pos; + + spin_lock(&cookie->lock); + + if (!fscache_cookie_enabled(cookie) || + hlist_empty(&cookie->backing_objects)) + goto nobufs_unlock; + + object = hlist_entry(cookie->backing_objects.first, + struct fscache_object, cookie_link); + + __fscache_use_cookie(cookie); + if (fscache_submit_op(object, &op->op) < 0) + goto nobufs_unlock_dec; + spin_unlock(&cookie->lock); + + ret = 
fscache_wait_for_operation_activation( + object, &op->op, + __fscache_stat(&fscache_n_retrieval_op_waits), + __fscache_stat(&fscache_n_retrievals_object_dead)); + if (ret < 0) + goto out; + + ret = object->cache->ops->prepare_read(op, index); +out: + fscache_put_retrieval(op); + return ret; + +nobufs_unlock_dec: + wake_cookie = __fscache_unuse_cookie(cookie); +nobufs_unlock: + spin_unlock(&cookie->lock); + fscache_put_retrieval(op); + if (wake_cookie) + __fscache_wake_unused_cookie(cookie); + return -ENOBUFS; +} +EXPORT_SYMBOL(__fscache_prepare_read); + /* * allocate a block in the cache on which to store a page * - we return: @@ -922,6 +992,8 @@ void fscache_invalidate_writes(struct fscache_cookie *cookie) put_page(results[i]); } + /* Make sure the delete operation is performed before waking. */ + smp_mb(); wake_up_bit(&cookie->flags, 0); trace_fscache_wake_cookie(cookie); diff --git a/include/asm-generic/bitops/instrumented-non-atomic.h b/include/asm-generic/bitops/instrumented-non-atomic.h index 37363d570b9b27e87b29f6f46a59da54bafb9a8c..da7f0d0a707ce44187f56640a083dd68c5320f34 100644 --- a/include/asm-generic/bitops/instrumented-non-atomic.h +++ b/include/asm-generic/bitops/instrumented-non-atomic.h @@ -135,4 +135,16 @@ static inline bool test_bit(long nr, const volatile unsigned long *addr) return arch_test_bit(nr, addr); } +/** + * test_bit_acquire - Determine, with acquire semantics, whether a bit is set + * @nr: bit number to test + * @addr: Address to start counting from + */ +static __always_inline bool +test_bit_acquire(unsigned long nr, const volatile unsigned long *addr) +{ + instrument_atomic_read(addr + BIT_WORD(nr), sizeof(long)); + return arch_test_bit_acquire(nr, addr); +} + #endif /* _ASM_GENERIC_BITOPS_INSTRUMENTED_NON_ATOMIC_H */ diff --git a/include/asm-generic/bitops/non-atomic.h b/include/asm-generic/bitops/non-atomic.h index 7e10c4b50c5db3b4f7fe1a1472a03db0bb2af653..46437282d6d06cca1552fef87a4c16ce02cf6850 100644 --- a/include/asm-generic/bitops/non-atomic.h +++ b/include/asm-generic/bitops/non-atomic.h @@ -3,6 +3,7 @@ #define _ASM_GENERIC_BITOPS_NON_ATOMIC_H_ #include +#include /** * __set_bit - Set a bit in memory @@ -106,4 +107,17 @@ static inline int test_bit(int nr, const volatile unsigned long *addr) return 1UL & (addr[BIT_WORD(nr)] >> (nr & (BITS_PER_LONG-1))); } +/** + * arch_test_bit_acquire - Determine, with acquire semantics, whether a bit is set + * @nr: bit number to test + * @addr: Address to start counting from + */ +static __always_inline bool +arch_test_bit_acquire(unsigned long nr, const volatile unsigned long *addr) +{ + unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr); + return 1UL & (smp_load_acquire(p) >> (nr & (BITS_PER_LONG-1))); +} +#define test_bit_acquire arch_test_bit_acquire + #endif /* _ASM_GENERIC_BITOPS_NON_ATOMIC_H_ */ diff --git a/include/linux/fs.h b/include/linux/fs.h index 29222113bcea0575e9c09a08b165acdcb753c6e3..a0ea6b64c45d081297340c0be79453ad10a59df8 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3733,4 +3733,21 @@ bool generic_atomic_write_valid(loff_t pos, size_t len, return true; } +#if IS_ENABLED(CONFIG_EROFS_FS) +extern bool erofs_enabled; +#endif + +#if IS_ENABLED(CONFIG_CACHEFILES_ONDEMAND) +extern bool cachefiles_ondemand_enabled; +static inline bool cachefiles_ondemand_is_enabled(void) +{ + return READ_ONCE(cachefiles_ondemand_enabled); +} +#else +static inline bool cachefiles_ondemand_is_enabled(void) +{ + return false; +} +#endif + #endif /* _LINUX_FS_H */ diff --git 
diff --git a/include/linux/fscache-cache.h b/include/linux/fscache-cache.h
index 3f0b19dcfae79bd0818ffbc527ae2e4ce7ea4204..cb8be2d3143c49b984b9d02084a36e483b5fb369 100644
--- a/include/linux/fscache-cache.h
+++ b/include/linux/fscache-cache.h
@@ -34,6 +34,7 @@ enum fscache_obj_ref_trace {
 	fscache_obj_put_enq_dep,
 	fscache_obj_put_queue,
 	fscache_obj_put_work,
+	fscache_obj_put_dequeue,
 	fscache_obj_ref__nr_traces
 };

@@ -74,6 +75,7 @@ struct fscache_cache {
 };

 extern wait_queue_head_t fscache_cache_cleared_wq;
+extern struct workqueue_struct *fscache_object_wq;

 /*
  * operation to be applied to a cache object
@@ -149,6 +151,7 @@ struct fscache_retrieval {
 	struct list_head	to_do;		/* list of things to be done by the backend */
 	unsigned long		start_time;	/* time at which retrieval started */
 	atomic_t		n_pages;	/* number of pages to be retrieved */
+	loff_t			offset;
 };

 typedef int (*fscache_page_retrieval_func_t)(struct fscache_retrieval *op,
@@ -160,6 +163,9 @@ typedef int (*fscache_pages_retrieval_func_t)(struct fscache_retrieval *op,
 					      unsigned *nr_pages,
 					      gfp_t gfp);

+typedef int (*fscache_prepare_read_func_t)(struct fscache_retrieval *op,
+					   pgoff_t index);
+
 /**
  * fscache_get_retrieval - Get an extra reference on a retrieval operation
  * @op: The retrieval operation to get a reference on
@@ -284,6 +290,8 @@ struct fscache_cache_ops {
 	 * the cache */
 	fscache_pages_retrieval_func_t read_or_alloc_pages;

+	fscache_prepare_read_func_t prepare_read;
+
 	/* request a backing block for a page be allocated in the cache so that
 	 * it can be written directly */
 	fscache_page_retrieval_func_t allocate_page;
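fscache-cache.h gives cache backends a new ->prepare_read() slot of type fscache_prepare_read_func_t. The sketch below shows how a hypothetical backend might wire it up; everything named "mycache" is invented, and the -ENOBUFS placeholder simply propagates back through __fscache_prepare_read().

/* Hypothetical backend wiring for the new ->prepare_read() operation. */
#include <linux/fscache-cache.h>
#include <linux/errno.h>

static int mycache_prepare_read(struct fscache_retrieval *op, pgoff_t index)
{
	/*
	 * __fscache_prepare_read() has already filled in op->offset and
	 * op->n_pages; a real backend would use them (plus @index) to set
	 * up I/O against its backing file and complete the pages later.
	 */
	return -ENOBUFS;	/* placeholder: no backing data prepared */
}

static const struct fscache_cache_ops mycache_ops = {
	.name		= "mycache",
	/* ... the other mandatory cache operations ... */
	.prepare_read	= mycache_prepare_read,
};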
diff --git a/include/linux/fscache.h b/include/linux/fscache.h
index a1c928fe98e7cb9f6b0eef78bdbbb134dcdacf4e..d4af91675c3cc2040c52002aa379894c3d4fbfb1 100644
--- a/include/linux/fscache.h
+++ b/include/linux/fscache.h
@@ -70,6 +70,12 @@ struct fscache_cookie_def {
 #define FSCACHE_COOKIE_TYPE_INDEX	0
 #define FSCACHE_COOKIE_TYPE_DATAFILE	1

+	/*
+	 * Used for index cookies. If set, the location/xattr layout used by
+	 * cachefiles is the same as in mainline kernels v5.18+.
+	 */
+	bool new_version;
+
 	/* select the cache into which to insert an entry in this index
 	 * - optional
 	 * - should return a cache identifier or NULL to cause the cache to be
@@ -139,6 +145,7 @@ struct fscache_cookie {
 	struct hlist_head		backing_objects; /* object(s) backing this file/index */
 	const struct fscache_cookie_def	*def;		/* definition */
 	struct fscache_cookie		*parent;	/* parent of this entry */
+	struct fscache_cookie		*collision;	/* collision cookie */
 	struct hlist_bl_node		hash_link;	/* Link in hash table */
 	void				*netfs_data;	/* back pointer to netfs */
 	struct radix_tree_root		stores;		/* pages to be stored on this cookie */
@@ -156,6 +163,7 @@ struct fscache_cookie {
 #define FSCACHE_COOKIE_AUX_UPDATED	8	/* T if the auxiliary data was updated */
 #define FSCACHE_COOKIE_ACQUIRED		9	/* T if cookie is in use */
 #define FSCACHE_COOKIE_RELINQUISHING	10	/* T if cookie is being relinquished */
+#define FSCACHE_COOKIE_ACQUIRE_PENDING	11	/* T if cookie is waiting to complete acquisition */

 	u8				type;		/* Type of object */
 	u8				key_len;	/* Length of index key */
@@ -204,7 +212,7 @@ extern int __fscache_read_or_alloc_page(struct fscache_cookie *,
 					struct page *,
 					fscache_rw_complete_t,
 					void *,
-					gfp_t);
+					gfp_t, loff_t);
 extern int __fscache_read_or_alloc_pages(struct fscache_cookie *,
 					 struct address_space *,
 					 struct list_head *,
@@ -212,6 +220,13 @@ extern int __fscache_read_or_alloc_pages(struct fscache_cookie *,
 					 fscache_rw_complete_t,
 					 void *,
 					 gfp_t);
+extern int __fscache_prepare_read(struct fscache_cookie *cookie,
+				  struct address_space *mapping,
+				  pgoff_t index,
+				  unsigned int nr_pages,
+				  loff_t start_pos,
+				  fscache_rw_complete_t term_func,
+				  void *context);
 extern int __fscache_alloc_page(struct fscache_cookie *, struct page *, gfp_t);
 extern int __fscache_write_page(struct fscache_cookie *, struct page *, loff_t, gfp_t);
 extern void __fscache_uncache_page(struct fscache_cookie *, struct page *);
@@ -545,7 +560,21 @@ int fscache_read_or_alloc_page(struct fscache_cookie *cookie,
 {
 	if (fscache_cookie_valid(cookie) && fscache_cookie_enabled(cookie))
 		return __fscache_read_or_alloc_page(cookie, page, end_io_func,
-						    context, gfp);
+						    context, gfp, page_offset(page));
+	else
+		return -ENOBUFS;
+}
+
+static inline
+int fscache_read_or_alloc_page2(struct fscache_cookie *cookie,
+				struct page *page,
+				fscache_rw_complete_t end_io_func,
+				void *context,
+				gfp_t gfp, loff_t pos)
+{
+	if (fscache_cookie_valid(cookie) && fscache_cookie_enabled(cookie))
+		return __fscache_read_or_alloc_page(cookie, page, end_io_func,
+						    context, gfp, pos);
 	else
 		return -ENOBUFS;
 }
@@ -602,6 +631,19 @@ int fscache_read_or_alloc_pages(struct fscache_cookie *cookie,
 	return -ENOBUFS;
 }

+static inline
+int fscache_prepare_read(struct fscache_cookie *cookie,
+			 struct address_space *mapping, pgoff_t index,
+			 unsigned int nr_pages, loff_t start_pos,
+			 fscache_rw_complete_t term_func, void *context)
+{
+	if (fscache_cookie_valid(cookie) && fscache_cookie_enabled(cookie))
+		return __fscache_prepare_read(cookie, mapping, index,
+				nr_pages, start_pos, term_func, context);
+	else
+		return -ENOBUFS;
+}
+
 /**
  * fscache_alloc_page - Allocate a block in which to store a page
  * @cookie: The cookie representing the cache object
@@ -840,4 +882,18 @@ void fscache_enable_cookie(struct fscache_cookie *cookie,
 					can_enable, data);
 }

+static inline bool volume_new_version(struct fscache_cookie *cookie)
+{
+	return cookie->def && cookie->type == FSCACHE_COOKIE_TYPE_INDEX &&
+	       cookie->def->new_version;
+}
+
+static inline bool data_new_version(struct fscache_cookie *cookie)
+{
+	if (cookie->type != FSCACHE_COOKIE_TYPE_DATAFILE)
+		return false;
+
+	return cookie->parent && volume_new_version(cookie->parent);
+}
+
 #endif /* _LINUX_FSCACHE_H */
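The new fscache_prepare_read() wrapper above is the entry point a network filesystem would call. The sketch below shows one plausible call site in a readahead-style path; the callback body and the fallback comments are assumptions, since how the backend invokes @term_func is left to the cache implementation.

/* Hypothetical netfs-side caller of fscache_prepare_read(). */
#include <linux/fscache.h>
#include <linux/mm.h>
#include <linux/pagemap.h>

static void example_read_done(struct page *page, void *context, int error)
{
	/* fscache_rw_complete_t callback passed as @term_func. */
	if (!error)
		SetPageUptodate(page);
	unlock_page(page);
}

static int example_begin_cache_readahead(struct fscache_cookie *cookie,
					 struct address_space *mapping,
					 pgoff_t index, unsigned int nr_pages)
{
	loff_t start_pos = (loff_t)index << PAGE_SHIFT;

	/*
	 * -ENOBUFS means no usable cache object (read from the server
	 * instead); -ERESTARTSYS means the wait for the pending cookie
	 * lookup was interrupted.
	 */
	return fscache_prepare_read(cookie, mapping, index, nr_pages,
				    start_pos, example_read_done, NULL);
}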
diff --git a/include/linux/wait_bit.h b/include/linux/wait_bit.h
index 7dec36aecbd9fe239ba1b94f81c729e6665d6bd4..d31d256ce88597da67563f1217f5ac52ebe96679 100644
--- a/include/linux/wait_bit.h
+++ b/include/linux/wait_bit.h
@@ -28,7 +28,9 @@ int __wait_on_bit(struct wait_queue_head *wq_head, struct wait_bit_queue_entry *
 int __wait_on_bit_lock(struct wait_queue_head *wq_head, struct wait_bit_queue_entry *wbq_entry, wait_bit_action_f *action, unsigned int mode);
 void wake_up_bit(void *word, int bit);
 int out_of_line_wait_on_bit(void *word, int, wait_bit_action_f *action, unsigned int mode);
+int out_of_line_wait_on_bit_acquire(void *word, int, wait_bit_action_f *action, unsigned int mode);
 int out_of_line_wait_on_bit_timeout(void *word, int, wait_bit_action_f *action, unsigned int mode, unsigned long timeout);
+int out_of_line_wait_on_bit_timeout_acquire(void *word, int, wait_bit_action_f *action, unsigned int mode, unsigned long timeout);
 int out_of_line_wait_on_bit_lock(void *word, int, wait_bit_action_f *action, unsigned int mode);
 struct wait_queue_head *bit_waitqueue(void *word, int bit);
 extern void __init wait_bit_init(void);
@@ -235,6 +237,29 @@ wait_on_bit_lock_action(unsigned long *word, int bit, wait_bit_action_f *action,
 	return out_of_line_wait_on_bit_lock(word, bit, action, mode);
 }

+static inline int
+wait_on_bit_acquire(unsigned long *word, int bit, unsigned mode)
+{
+	might_sleep();
+	if (!test_bit_acquire(bit, word))
+		return 0;
+	return out_of_line_wait_on_bit_acquire(word, bit,
+					       bit_wait,
+					       mode);
+}
+
+static inline int
+wait_on_bit_timeout_acquire(unsigned long *word, int bit, unsigned mode,
+			    unsigned long timeout)
+{
+	might_sleep();
+	if (!test_bit_acquire(bit, word))
+		return 0;
+	return out_of_line_wait_on_bit_timeout_acquire(word, bit,
+						       bit_wait_timeout,
+						       mode, timeout);
+}
+
 extern void init_wait_var_entry(struct wait_bit_queue_entry *wbq_entry, void *var, int flags);
 extern void wake_up_var(void *var);
 extern wait_queue_head_t *__var_waitqueue(void *p);
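wait_bit.h also gains a timed acquire variant. The fragment below is an illustrative waiter (the flag name and the five-second budget are made up); it assumes the other side clears the bit with a releasing operation such as clear_bit_unlock(), which is what the acquire load pairs with.

/* Illustrative waiter built on the new acquire wait-bit helpers. */
#include <linux/wait_bit.h>
#include <linux/sched.h>
#include <linux/jiffies.h>

#define EXAMPLE_FLAG_BUSY	0	/* hypothetical flag bit */

static int example_wait_until_idle(unsigned long *flags)
{
	/*
	 * Sleep until EXAMPLE_FLAG_BUSY is cleared, giving up after 5s.
	 * Returns 0 if the bit was observed clear in time (with acquire
	 * ordering against the clearing side), non-zero otherwise.
	 */
	return wait_on_bit_timeout_acquire(flags, EXAMPLE_FLAG_BUSY,
					   TASK_UNINTERRUPTIBLE, 5 * HZ);
}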
diff --git a/include/trace/events/cachefiles.h b/include/trace/events/cachefiles.h
index 5d9de24cb9c016a137d98394d9fdaaa3738bfe28..05ecaf2245b22f219ab63a96bca2c4304c76e19b 100644
--- a/include/trace/events/cachefiles.h
+++ b/include/trace/events/cachefiles.h
@@ -21,6 +21,10 @@ enum cachefiles_obj_ref_trace {
 	cachefiles_obj_put_wait_retry = fscache_obj_ref__nr_traces,
 	cachefiles_obj_put_wait_timeo,
+	cachefiles_obj_get_ondemand_fd,
+	cachefiles_obj_put_ondemand_fd,
+	cachefiles_obj_get_read_req,
+	cachefiles_obj_put_read_req,
 	cachefiles_obj_ref__nr_traces
 };

@@ -45,7 +49,11 @@ enum cachefiles_obj_ref_trace {
 	EM(fscache_obj_put_queue,		"PUT queue")		\
 	EM(fscache_obj_put_work,		"PUT work")		\
 	EM(cachefiles_obj_put_wait_retry,	"PUT wait_retry")	\
-	E_(cachefiles_obj_put_wait_timeo,	"PUT wait_timeo")
+	EM(cachefiles_obj_put_wait_timeo,	"PUT wait_timeo")	\
+	EM(cachefiles_obj_get_ondemand_fd,	"GET ondemand_fd")	\
+	EM(cachefiles_obj_put_ondemand_fd,	"PUT ondemand_fd")	\
+	EM(cachefiles_obj_get_read_req,		"GET read_req")		\
+	E_(cachefiles_obj_put_read_req,		"PUT read_req")

 /*
  * Export enum symbols via userspace.
diff --git a/include/trace/events/erofs.h b/include/trace/events/erofs.h
index db4f2cec8360629e5d7d19b6208b13a86e4c271c..f02427cb664ca68ea02157007aa250ac76c28868 100644
--- a/include/trace/events/erofs.h
+++ b/include/trace/events/erofs.h
@@ -18,8 +18,7 @@ struct erofs_map_blocks;
 	{ 0,		"FILE" },	\
 	{ 1,		"DIR" })

-#define show_map_flags(flags) __print_flags(flags, "|",	\
-	{ EROFS_GET_BLOCKS_RAW,	"RAW" })
+#define show_map_flags(flags) __print_flags(flags, "|", {} )

 #define show_mflags(flags) __print_flags(flags, "",	\
 	{ EROFS_MAP_MAPPED,	"M" },	\
@@ -169,7 +168,7 @@ DECLARE_EVENT_CLASS(erofs__map_blocks_enter,
 		  __entry->flags ? show_map_flags(__entry->flags) : "NULL")
 );

-DEFINE_EVENT(erofs__map_blocks_enter, erofs_map_blocks_flatmode_enter,
+DEFINE_EVENT(erofs__map_blocks_enter, erofs_map_blocks_enter,
 	TP_PROTO(struct inode *inode, struct erofs_map_blocks *map,
 		 unsigned flags),

@@ -221,7 +220,7 @@ DECLARE_EVENT_CLASS(erofs__map_blocks_exit,
 		  show_mflags(__entry->mflags), __entry->ret)
 );

-DEFINE_EVENT(erofs__map_blocks_exit, erofs_map_blocks_flatmode_exit,
+DEFINE_EVENT(erofs__map_blocks_exit, erofs_map_blocks_exit,
 	TP_PROTO(struct inode *inode, struct erofs_map_blocks *map,
 		 unsigned flags, int ret),

diff --git a/include/uapi/linux/cachefiles.h b/include/uapi/linux/cachefiles.h
new file mode 100644
index 0000000000000000000000000000000000000000..b6746a2fe57c474493c01d7e1ddb25e97d9c9b61
--- /dev/null
+++ b/include/uapi/linux/cachefiles.h
@@ -0,0 +1,69 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+#ifndef _LINUX_CACHEFILES_H
+#define _LINUX_CACHEFILES_H
+
+#include <linux/types.h>
+#include <linux/ioctl.h>
+
+/*
+ * Fscache ensures that the maximum length of a cookie key is 255. The volume
+ * key is controlled by the netfs, and is generally no bigger than 255.
+ */
+#define CACHEFILES_MSG_MAX_SIZE	1024
+
+enum cachefiles_opcode {
+	CACHEFILES_OP_OPEN,
+	CACHEFILES_OP_CLOSE,
+	CACHEFILES_OP_READ,
+};
+
+/*
+ * Message Header
+ *
+ * @msg_id	a unique ID identifying this message
+ * @opcode	message type, CACHEFILES_OP_*
+ * @len		message length, including message header and following data
+ * @object_id	a unique ID identifying a cache file
+ * @data	message type specific payload
+ */
+struct cachefiles_msg {
+	__u32 msg_id;
+	__u32 opcode;
+	__u32 len;
+	__u32 object_id;
+	__u8  data[];
+};
+
+/*
+ * @data contains the volume_key followed directly by the cookie_key. volume_key
+ * is a NUL-terminated string; @volume_key_size indicates the size of the volume
+ * key in bytes (with trailing NUL). cookie_key is a string without trailing
+ * NUL; @cookie_key_size indicates the size of the cookie key in bytes (without
+ * trailing NUL).
+ *
+ * @fd identifies an anon_fd referring to the cache file.
+ */
+struct cachefiles_open {
+	__u32 volume_key_size;
+	__u32 cookie_key_size;
+	__u32 fd;
+	__u32 flags;
+	__u8  data[];
+};
+
+/*
+ * @off		indicates the starting offset of the requested file range
+ * @len		indicates the length of the requested file range
+ */
+struct cachefiles_read {
+	__u64 off;
+	__u64 len;
+};
+
+/*
+ * Reply for READ request
+ * @arg for this ioctl is the @msg_id field of READ request.
+ */
+#define CACHEFILES_IOC_READ_COMPLETE	_IOW(0x98, 1, int)
+
+#endif
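The new UAPI header above defines the messages a cache daemon exchanges with the kernel. The fragment below sketches the user-space side under several assumptions not spelled out in this hunk: that requests are read() from the cachefiles device after it has been bound in on-demand mode, that the fd returned in the OPEN request is the right target for CACHEFILES_IOC_READ_COMPLETE, and that the daemon fills the requested range into the cache file before completing. Only the structures, opcodes and the ioctl come from the header itself.

/* Hypothetical user-space daemon loop for the on-demand protocol. */
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/cachefiles.h>

static int object_fd = -1;	/* toy daemon: tracks a single cache object */

static void handle_one_request(int devfd)
{
	char buf[CACHEFILES_MSG_MAX_SIZE] __attribute__((aligned(8)));
	struct cachefiles_msg *msg = (struct cachefiles_msg *)buf;
	ssize_t n = read(devfd, buf, sizeof(buf));

	if (n < (ssize_t)sizeof(*msg))
		return;

	switch (msg->opcode) {
	case CACHEFILES_OP_OPEN: {
		struct cachefiles_open *req = (void *)msg->data;

		/* data[]: NUL-terminated volume key, then the cookie key. */
		printf("open object %u, volume %s, cache fd %u\n",
		       msg->object_id, (const char *)req->data, req->fd);
		object_fd = req->fd;
		break;
	}
	case CACHEFILES_OP_READ: {
		struct cachefiles_read *req = (void *)msg->data;

		/* ...fetch [req->off, req->off + req->len) and write it into
		 * the cache file at the same offset (assumed step)... */
		ioctl(object_fd, CACHEFILES_IOC_READ_COMPLETE, msg->msg_id);
		break;
	}
	case CACHEFILES_OP_CLOSE:
		object_fd = -1;	/* the kernel is done with this object */
		break;
	}
}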
diff --git a/kernel/sched/wait_bit.c b/kernel/sched/wait_bit.c
index 02ce292b9bc09592552c630dd1cfb845c6dce737..b795085b0b84a821b579adf2ada1469121097cfe 100644
--- a/kernel/sched/wait_bit.c
+++ b/kernel/sched/wait_bit.c
@@ -55,6 +55,23 @@ __wait_on_bit(struct wait_queue_head *wq_head, struct wait_bit_queue_entry *wbq_
 }
 EXPORT_SYMBOL(__wait_on_bit);

+static int __sched
+__wait_on_bit_acquire(struct wait_queue_head *wq_head, struct wait_bit_queue_entry *wbq_entry,
+		      wait_bit_action_f *action, unsigned mode)
+{
+	int ret = 0;
+
+	do {
+		prepare_to_wait(wq_head, &wbq_entry->wq_entry, mode);
+		if (test_bit(wbq_entry->key.bit_nr, wbq_entry->key.flags))
+			ret = (*action)(&wbq_entry->key, mode);
+	} while (test_bit_acquire(wbq_entry->key.bit_nr, wbq_entry->key.flags) && !ret);
+
+	finish_wait(wq_head, &wbq_entry->wq_entry);
+
+	return ret;
+}
+
 int __sched out_of_line_wait_on_bit(void *word, int bit,
 				    wait_bit_action_f *action, unsigned mode)
 {
@@ -65,6 +82,29 @@ int __sched out_of_line_wait_on_bit(void *word, int bit,
 }
 EXPORT_SYMBOL(out_of_line_wait_on_bit);

+int __sched out_of_line_wait_on_bit_acquire(void *word, int bit,
+					    wait_bit_action_f *action, unsigned mode)
+{
+	struct wait_queue_head *wq_head = bit_waitqueue(word, bit);
+	DEFINE_WAIT_BIT(wq_entry, word, bit);
+
+	return __wait_on_bit_acquire(wq_head, &wq_entry, action, mode);
+}
+EXPORT_SYMBOL(out_of_line_wait_on_bit_acquire);
+
+int __sched out_of_line_wait_on_bit_timeout_acquire(
+	void *word, int bit, wait_bit_action_f *action,
+	unsigned mode, unsigned long timeout)
+{
+	struct wait_queue_head *wq_head = bit_waitqueue(word, bit);
+	DEFINE_WAIT_BIT(wq_entry, word, bit);
+
+	wq_entry.key.timeout = jiffies + timeout;
+
+	return __wait_on_bit_acquire(wq_head, &wq_entry, action, mode);
+}
+EXPORT_SYMBOL_GPL(out_of_line_wait_on_bit_timeout_acquire);
+
 int __sched out_of_line_wait_on_bit_timeout(
 	void *word, int bit, wait_bit_action_f *action,
 	unsigned mode, unsigned long timeout)
diff --git a/lib/radix-tree.c b/lib/radix-tree.c
index cbc69152523636015d6110379167c0784de7eb9b..28145bdf6f3fd06f87155292d2cf5e610b4eadd6 100644
--- a/lib/radix-tree.c
+++ b/lib/radix-tree.c
@@ -1059,6 +1059,7 @@ void radix_tree_iter_tag_clear(struct radix_tree_root *root,
 {
 	node_tag_clear(root, iter->node, tag, iter_offset(iter));
 }
+EXPORT_SYMBOL(radix_tree_iter_tag_clear);

 /**
  * radix_tree_tag_get - get a tag on a radix tree node