diff --git a/drivers/dax/super.c b/drivers/dax/super.c
index 0da9232ea1754b862455fe6d2d1c0e57398cfde1..6f2bc78f6184c84d09b52d1520c86cfbf3bc956e 100644
--- a/drivers/dax/super.c
+++ b/drivers/dax/super.c
@@ -71,6 +71,20 @@ void dax_remove_host(struct gendisk *disk)
 }
 EXPORT_SYMBOL_GPL(dax_remove_host);
 
+int bdev_dax_pgoff(struct block_device *bdev, sector_t sector, size_t size,
+		pgoff_t *pgoff)
+{
+	sector_t start_sect = bdev ? get_start_sect(bdev) : 0;
+	phys_addr_t phys_off = (start_sect + sector) * 512;
+
+	if (pgoff)
+		*pgoff = PHYS_PFN(phys_off);
+	if (phys_off % PAGE_SIZE || size % PAGE_SIZE)
+		return -EINVAL;
+	return 0;
+}
+EXPORT_SYMBOL(bdev_dax_pgoff);
+
 /**
  * fs_dax_get_by_bdev() - temporary lookup mechanism for filesystem-dax
  * @bdev: block device to find a dax_device for
diff --git a/fs/dax.c b/fs/dax.c
index 3380b43cb6bbbd1289901a3e29edc903abe6156e..544a4c3c1d6277e0a9c06c3085ae378bb1c06918 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -322,23 +322,34 @@ static unsigned long dax_end_pfn(void *entry)
 
 static inline bool dax_page_is_shared(struct page *page)
 {
-	return page->mapping == PAGE_MAPPING_DAX_SHARED;
+	return (unsigned long)READ_ONCE(page->mapping) & PAGE_MAPPING_DAX_SHARED;
 }
 
 /*
  * Set the page->mapping with PAGE_MAPPING_DAX_SHARED flag, increase the
  * refcount.
  */
-static inline void dax_page_share_get(struct page *page)
+static inline void dax_page_share_get(struct page *page,
+		struct address_space *mapping, pgoff_t index)
 {
-	if (page->mapping != PAGE_MAPPING_DAX_SHARED) {
+	struct address_space *oldmapping = READ_ONCE(page->mapping);
+
+	if (!((unsigned long)oldmapping & PAGE_MAPPING_DAX_SHARED)) {
 		/*
 		 * Reset the index if the page was already mapped
 		 * regularly before.
 		 */
-		if (page->mapping)
+		if (oldmapping)
 			page->share = 1;
-		page->mapping = PAGE_MAPPING_DAX_SHARED;
+
+		if (test_bit(AS_FSDAX_NORMAP, &mapping->flags)) {
+			/* Note that we (ab)use page->private to keep index for now */
+			WRITE_ONCE(page->private, index);
+			/* paired with smp_mb() in xfs_dax_notify_ddev_failure2() */
+			smp_mb();
+		}
+		WRITE_ONCE(page->mapping,
+			   (void *)((unsigned long)mapping | PAGE_MAPPING_DAX_SHARED));
 	}
 	page->share++;
 }
@@ -367,7 +378,7 @@ static void dax_associate_entry(void *entry, struct address_space *mapping,
 		struct page *page = pfn_to_page(pfn);
 
 		if (shared) {
-			dax_page_share_get(page);
+			dax_page_share_get(page, mapping, index);
 		} else {
 			WARN_ON_ONCE(page->mapping);
 			page->mapping = mapping;
@@ -1063,6 +1074,46 @@ int dax_writeback_mapping_range(struct address_space *mapping,
 }
 EXPORT_SYMBOL_GPL(dax_writeback_mapping_range);
 
+int dax_copy_range(struct block_device *bdev, struct dax_device *dax_dev,
+		u64 src_addr, u64 dst_addr, size_t size)
+{
+	const sector_t src_sector = src_addr >> SECTOR_SHIFT;
+	const sector_t dst_sector = dst_addr >> SECTOR_SHIFT;
+	pgoff_t spgoff, dpgoff;
+	int id, rc;
+	long length;
+	void *saddr, *daddr;
+
+	rc = bdev_dax_pgoff(bdev, src_sector, size, &spgoff);
+	if (rc)
+		return rc;
+
+	rc = bdev_dax_pgoff(bdev, dst_sector, size, &dpgoff);
+	if (rc)
+		return rc;
+
+	id = dax_read_lock();
+	length = dax_direct_access(dax_dev, spgoff, PHYS_PFN(size), DAX_ACCESS,
+			&saddr, NULL);
+	if (length < 0) {
+		rc = length;
+		goto out;
+	}
+
+	length = dax_direct_access(dax_dev, dpgoff, PHYS_PFN(size), DAX_ACCESS,
+			&daddr, NULL);
+	if (length < 0) {
+		rc = length;
+		goto out;
+	}
+
+	rc = copy_mc_to_kernel(daddr, saddr, size);
+out:
+	dax_read_unlock(id);
+	return rc;
+}
+EXPORT_SYMBOL_GPL(dax_copy_range);
+
 static int dax_iomap_direct_access(const struct iomap *iomap, loff_t pos,
 		size_t size, void **kaddr, pfn_t *pfnp)
 {
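
For context, a minimal sketch of how a caller might drive the two new helpers: both addresses are byte offsets into the same DAX block device, and bdev_dax_pgoff() rejects anything that is not page-aligned with -EINVAL, so dax_copy_range() only ever copies whole pages. The wrapper name below is hypothetical and not part of this patch:

	/*
	 * Sketch only: copy one page-aligned range to another on the same
	 * DAX device. dax_copy_range() uses copy_mc_to_kernel(), so a
	 * machine check on a poisoned source page is survivable.
	 */
	static int example_unshare_copy(struct block_device *bdev,
					struct dax_device *dax_dev,
					u64 src_off, u64 dst_off, size_t len)
	{
		/* dax_copy_range() fails with -EINVAL on unaligned input */
		if (!IS_ALIGNED(src_off | dst_off | len, PAGE_SIZE))
			return -EINVAL;

		return dax_copy_range(bdev, dax_dev, src_off, dst_off, len);
	}
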
diff --git a/fs/remap_range.c b/fs/remap_range.c
index 87ae4f0dc3aa01c6099ef2fa7a66b5d84bdb9703..2fdc13c901ef30cc0186ffa7cf875c307075846d 100644
--- a/fs/remap_range.c
+++ b/fs/remap_range.c
@@ -315,10 +315,18 @@ __generic_remap_file_range_prep(struct file *file_in, loff_t pos_in,
 	if (!same_inode)
 		inode_dio_wait(inode_out);
 
-	ret = filemap_write_and_wait_range(inode_in->i_mapping,
-			pos_in, pos_in + *len - 1);
-	if (ret)
-		return ret;
+	if (remap_flags & REMAP_FILE_FAST_REFLINK) {
+		ret = fast_reflink_apply(inode_in->i_mapping,
+				pos_in >> PAGE_SHIFT,
+				(pos_in + *len - 1) >> PAGE_SHIFT);
+		if (ret)
+			return ret;
+	} else {
+		ret = filemap_write_and_wait_range(inode_in->i_mapping,
+				pos_in, pos_in + *len - 1);
+		if (ret)
+			return ret;
+	}
 
 	ret = filemap_write_and_wait_range(inode_out->i_mapping,
 			pos_out, pos_out + *len - 1);
diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h
index 6360073865dbc3136e3655e1b493a53ca4310dd3..7a38c22787bcd5f548db439244345f34e0cf1565 100644
--- a/fs/xfs/libxfs/xfs_fs.h
+++ b/fs/xfs/libxfs/xfs_fs.h
@@ -766,6 +766,11 @@ struct xfs_scrub_metadata {
 # define XFS_XATTR_LIST_MAX 65536
 #endif
 
+enum {
+	XFS_REFLINK_NORMAL	= 0,
+	XFS_REFLINK_PRIMARY	= (1 << 0),
+	XFS_REFLINK_SECONDARY	= (1 << 1),
+};
 
 /*
  * ioctl commands that are used by Linux filesystems
@@ -840,6 +845,10 @@ struct xfs_scrub_metadata {
 
 /*	XFS_IOC_GETFSUUID ---------- deprecated 140	 */
 
+#define XFS_IOC_SET_REFLINK_FLAGS	_IOW('X', 200, uint32_t)
+#define XFS_IOC_GET_REFLINK_FLAGS	_IOR('X', 201, uint32_t)
+#define XFS_IOC_WAIT_REFLINK_SECONDARY	_IOW('X', 202, uint32_t)
+
 #ifndef HAVE_BBMACROS
 /*
  * Block I/O parameterization.  A basic block (BB) is the lowest size of
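
A plausible userspace sequence for the new interface, inferred from the ioctl semantics implemented in fs/xfs/xfs_ioctl.c below (illustrative only; error handling is elided and the helper name is hypothetical):

	#include <fcntl.h>
	#include <stdint.h>
	#include <sys/ioctl.h>
	#include <unistd.h>
	#include <linux/fs.h>		/* FICLONE */

	/* mirrors fs/xfs/libxfs/xfs_fs.h above */
	#define XFS_REFLINK_PRIMARY		(1 << 0)
	#define XFS_REFLINK_SECONDARY		(1 << 1)
	#define XFS_IOC_SET_REFLINK_FLAGS	_IOW('X', 200, uint32_t)
	#define XFS_IOC_WAIT_REFLINK_SECONDARY	_IOW('X', 202, uint32_t)

	int clone_with_fast_reflink(const char *src_path, const char *snap_path)
	{
		uint32_t primary = XFS_REFLINK_PRIMARY;
		uint32_t secondary = XFS_REFLINK_SECONDARY;
		uint32_t timeout_sec = 10;	/* 0 waits without a timeout */
		int src = open(src_path, O_RDWR);
		int snap = open(snap_path, O_RDWR | O_CREAT, 0600);
		int ret;

		/* Tag both inodes first; the clone must use pos_in == pos_out. */
		ioctl(src, XFS_IOC_SET_REFLINK_FLAGS, &primary);
		ioctl(snap, XFS_IOC_SET_REFLINK_FLAGS, &secondary);
		ioctl(snap, FICLONE, src);

		/* ... the secondary is consumed and unlinked by its user ... */

		/* Block until the secondary's inactivation has finished. */
		ret = ioctl(src, XFS_IOC_WAIT_REFLINK_SECONDARY, &timeout_sec);
		close(snap);
		close(src);
		return ret;
	}
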
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index e33e5e13b95f462ffe33c232739af6c4762758a0..8b73002067830056bf57a40c90ab2a6e39de89f6 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -877,9 +877,18 @@ xfs_break_dax_layouts(
 	struct inode		*inode,
 	bool			*retry)
 {
+	struct xfs_inode	*ip = XFS_I(inode);
 	struct page		*page;
 
-	ASSERT(xfs_isilocked(XFS_I(inode), XFS_MMAPLOCK_EXCL));
+	ASSERT(xfs_isilocked(ip, XFS_MMAPLOCK_EXCL));
+
+	/*
+	 * For inodes flagged with XFS_REFLINK_{PRIMARY,SECONDARY}, users
+	 * must ensure there are no in-flight DIO operations on these
+	 * inodes, so we can bypass xfs_break_dax_layouts(BREAK_UNMAP)
+	 * safely.
+	 */
+	if (ip->i_reflink_flags & (XFS_REFLINK_PRIMARY | XFS_REFLINK_SECONDARY))
+		return 0;
 
 	page = dax_layout_busy_page(inode->i_mapping);
 	if (!page)
@@ -1216,6 +1225,19 @@ xfs_file_remap_range(
 	if (xfs_file_sync_writes(file_in) || xfs_file_sync_writes(file_out))
 		xfs_log_force_inode(dest);
+
+	if (remapped && (src->i_reflink_flags & XFS_REFLINK_PRIMARY)) {
+		mutex_lock(&mp->m_reflink_opt_lock);
+		src->i_reflink_opt_ip = dest;
+		dest->i_reflink_opt_ip = src;
+		mutex_unlock(&mp->m_reflink_opt_lock);
+
+		if (!xfs_has_rmapbt(mp)) {
+			set_bit(AS_FSDAX_NORMAP, &VFS_I(src)->i_mapping->flags);
+			set_bit(AS_FSDAX_NORMAP, &VFS_I(dest)->i_mapping->flags);
+		}
+	}
+
 out_unlock:
 	xfs_iunlock2_remapping(src, dest);
 	if (ret)
diff --git a/fs/xfs/xfs_globals.c b/fs/xfs/xfs_globals.c
index 9edc1f2bc9399eca73414ffa2f6da89db7f59964..afeeef53a7b026c1aa1436cebbc2cf191884658c 100644
--- a/fs/xfs/xfs_globals.c
+++ b/fs/xfs/xfs_globals.c
@@ -30,6 +30,7 @@ xfs_param_t xfs_params = {
 	.inherit_nodfrg	= {	0,		1,		1	},
 	.fstrm_timer	= {	1,		30*100,		3600*100},
 	.blockgc_timer	= {	1,		300,		3600*24},
+	.reflink_inactive_force_log_period = { 0, 5, 1000 },
 };
 
 struct xfs_globals xfs_globals = {
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
index 3c210ac8371368bc00a77fb9df52315b710d17ce..090c8a1a68c71a814d0049fae70f2ba6bcfabe55 100644
--- a/fs/xfs/xfs_icache.c
+++ b/fs/xfs/xfs_icache.c
@@ -107,6 +107,8 @@ xfs_inode_alloc(
 	ip->i_diflags2 = mp->m_ino_geo.new_diflags2;
 	ip->i_nblocks = 0;
 	ip->i_forkoff = 0;
+	ip->i_reflink_flags = 0;
+	ip->i_reflink_opt_ip = NULL;
 	ip->i_sick = 0;
 	ip->i_checked = 0;
 	INIT_WORK(&ip->i_ioend_work, xfs_end_io);
@@ -114,7 +116,7 @@ xfs_inode_alloc(
 	spin_lock_init(&ip->i_ioend_lock);
 	ip->i_next_unlinked = NULLAGINO;
 	ip->i_prev_unlinked = 0;
-
+	INIT_LIST_HEAD(&ip->i_reflink_opt_gclist);
 	return ip;
 }
 
@@ -385,6 +387,8 @@ xfs_iget_recycle(
 	 */
 	ip->i_flags &= ~XFS_IRECLAIM_RESET_FLAGS;
 	ip->i_flags |= XFS_INEW;
+	ip->i_reflink_flags = 0;
+	ip->i_reflink_opt_ip = NULL;
 	xfs_perag_clear_inode_tag(pag, XFS_INO_TO_AGINO(mp, ip->i_ino),
 			XFS_ICI_RECLAIM_TAG);
 	inode->i_state = I_NEW;
@@ -1824,7 +1828,7 @@ xfs_inodegc_set_reclaimable(
  * This is the last chance to make changes to an otherwise unreferenced file
  * before incore reclamation happens.
  */
-static int
+int
xfs_inodegc_inactivate(
 	struct xfs_inode	*ip)
 {
@@ -1837,6 +1841,40 @@ xfs_inodegc_inactivate(
 }
 
+void
+xfs_inodegc_reflink_opt_worker(
+	struct work_struct	*work)
+{
+	struct xfs_mount	*mp = container_of(work, struct xfs_mount,
+						   m_reflink_opt_gcwork);
+	struct xfs_inode	*ip;
+
+	while (1) {
+		spin_lock(&mp->m_reflink_opt_gclock);
+		/*
+		 * The foreground ioctl can handle a specific inode too; in
+		 * that case we will no longer see that inode on the list.
+		 */
+		if (list_empty(&mp->m_reflink_opt_gclist)) {
+			spin_unlock(&mp->m_reflink_opt_gclock);
+			break;
+		}
+		ip = list_first_entry(&mp->m_reflink_opt_gclist,
+				struct xfs_inode, i_reflink_opt_gclist);
+		/*
+		 * Otherwise we detach the inode ourselves under the gclock;
+		 * in that case the foreground ioctl will hit list_empty()
+		 * (it also checks list_empty() under the gclock).
+		 */
+		list_del_init(&ip->i_reflink_opt_gclist);
+		spin_unlock(&mp->m_reflink_opt_gclock);
+
+		ASSERT(ip->i_flags & XFS_NEED_INACTIVE);
+		xfs_iflags_set(ip, XFS_INACTIVATING);
+		xfs_inodegc_inactivate(ip);
+	}
+}
+
 void
 xfs_inodegc_worker(
 	struct work_struct	*work)
@@ -2069,6 +2107,22 @@ xfs_inodegc_queue(
 	unsigned long		queue_delay = 1;
 
 	trace_xfs_inode_set_need_inactive(ip);
+
+	if ((ip->i_reflink_flags & XFS_REFLINK_SECONDARY) &&
+	    /* ip->i_reflink_opt_ip won't change here since we're the owner */
+	    READ_ONCE(ip->i_reflink_opt_ip)) {
+		/* the gclist is attached before marking XFS_NEED_INACTIVE */
+		spin_lock(&mp->m_reflink_opt_gclock);
+		list_add_tail(&ip->i_reflink_opt_gclist,
+				&mp->m_reflink_opt_gclist);
+		queue_work(mp->m_inodegc_wq,
+				&mp->m_reflink_opt_gcwork);
+		spin_unlock(&mp->m_reflink_opt_gclock);
+		wake_up_all(&mp->m_reflink_opt_wait);
+		xfs_iflags_set(ip, XFS_NEED_INACTIVE);
+		return;
+	}
+
 	spin_lock(&ip->i_flags_lock);
 	ip->i_flags |= XFS_NEED_INACTIVE;
 	spin_unlock(&ip->i_flags_lock);
diff --git a/fs/xfs/xfs_icache.h b/fs/xfs/xfs_icache.h
index 905944dafbe539245bc30fe2df2d8681af0c5a04..6646eb2a7654af8ebbfff7ea83328fa8d5ea6b8f 100644
--- a/fs/xfs/xfs_icache.h
+++ b/fs/xfs/xfs_icache.h
@@ -70,10 +70,12 @@ void xfs_inode_clear_eofblocks_tag(struct xfs_inode *ip);
 void xfs_inode_set_cowblocks_tag(struct xfs_inode *ip);
 void xfs_inode_clear_cowblocks_tag(struct xfs_inode *ip);
 
+int xfs_inodegc_inactivate(struct xfs_inode *ip);
 void xfs_blockgc_worker(struct work_struct *work);
 void xfs_blockgc_stop(struct xfs_mount *mp);
 void xfs_blockgc_start(struct xfs_mount *mp);
 
+void xfs_inodegc_reflink_opt_worker(struct work_struct *work);
 void xfs_inodegc_worker(struct work_struct *work);
 void xfs_inodegc_push(struct xfs_mount *mp);
 int xfs_inodegc_flush(struct xfs_mount *mp);
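
The list_del_init()-based handoff above is the load-bearing part: exactly one of the background worker and the foreground ioctl detaches the inode under the gclock and performs the inactivation. A stand-alone user-space model of the idiom, with hypothetical names (the kernel code uses a spinlock and an intrusive list, modeled here with a pthread mutex and a flag):

	/* Stand-alone model of the gclist handoff; hypothetical names. */
	#include <pthread.h>
	#include <stdbool.h>
	#include <stdio.h>

	static pthread_mutex_t gclock = PTHREAD_MUTEX_INITIALIZER;
	static bool on_list;	/* models !list_empty(&ip->i_reflink_opt_gclist) */

	/* Returns true if this caller won ownership of the inactivation. */
	static bool try_detach(const char *who)
	{
		bool won;

		pthread_mutex_lock(&gclock);
		won = on_list;		/* models list_del_init() under gclock */
		on_list = false;
		pthread_mutex_unlock(&gclock);

		if (won)
			printf("%s performs inactivation\n", who);
		return won;
	}

	static void *worker(void *arg)
	{
		try_detach("background worker");
		return NULL;
	}

	int main(void)
	{
		pthread_t t;

		on_list = true;		/* models xfs_inodegc_queue() */
		pthread_create(&t, NULL, worker, NULL);
		if (!try_detach("foreground ioctl"))
			; /* lost the race: the worker owns it; the waiter retries */
		pthread_join(t, NULL);
		return 0;
	}
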
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index f9d29acd72b9eea15fd108420d5d86a839696cce..c7af345bd58e33b1dcf15e8077814aaed180461d 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -1341,6 +1341,8 @@ xfs_itruncate_extents_flags(
 	xfs_fileoff_t		first_unmap_block;
 	xfs_filblks_t		unmap_len;
 	int			error = 0;
+	bool			secondary_inactive = false;
+	int			force_count = 0;
 
 	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
 	ASSERT(!atomic_read(&VFS_I(ip)->i_count) ||
@@ -1371,9 +1373,13 @@ xfs_itruncate_extents_flags(
 		return 0;
 	}
 
+	if (!new_size && (ip->i_reflink_flags & XFS_REFLINK_SECONDARY))
+		secondary_inactive = true;
+
 	unmap_len = XFS_MAX_FILEOFF - first_unmap_block + 1;
 	while (unmap_len > 0) {
 		ASSERT(tp->t_highest_agno == NULLAGNUMBER);
+
 		error = __xfs_bunmapi(tp, ip, first_unmap_block, &unmap_len,
 				flags, XFS_ITRUNC_MAX_EXTENTS);
 		if (error)
@@ -1383,6 +1389,14 @@ xfs_itruncate_extents_flags(
 		error = xfs_defer_finish(&tp);
 		if (error)
 			goto out;
+
+		if (secondary_inactive) {
+			if (xfs_reflink_inactive_force_log_period &&
+			    ++force_count >= xfs_reflink_inactive_force_log_period) {
+				xfs_log_force(mp, 0);
+				force_count = 0;
+			}
+		}
 	}
 
 	if (whichfork == XFS_DATA_FORK) {
@@ -1685,6 +1699,33 @@ xfs_inode_needs_inactive(
 	return xfs_can_free_eofblocks(ip, true);
 }
 
+STATIC void
+xfs_reflink_opt_disconnect(
+	struct xfs_mount	*mp,
+	struct xfs_inode	*ip,
+	bool			unexpected)
+{
+	bool			valid = false;
+
+	if (!(ip->i_reflink_flags & (XFS_REFLINK_PRIMARY |
+				     XFS_REFLINK_SECONDARY)))
+		return;
+
+	mutex_lock(&mp->m_reflink_opt_lock);
+	if (ip->i_reflink_opt_ip) {
+		ip->i_reflink_opt_ip->i_reflink_opt_ip = NULL;
+		ip->i_reflink_opt_ip = NULL;
+		valid = true;
+	}
+	mutex_unlock(&mp->m_reflink_opt_lock);
+	if (valid) {
+		wake_up_all(&mp->m_reflink_opt_wait);
+		if (unexpected)
+			xfs_warn(mp, "reflink file %llu inactivated unexpectedly early",
+					ip->i_ino);
+	}
+}
+
 /*
  * xfs_inactive
  *
@@ -1741,6 +1782,7 @@ xfs_inactive(
 		if (xfs_can_free_eofblocks(ip, true))
 			error = xfs_free_eofblocks(ip);
 
+		xfs_reflink_opt_disconnect(mp, ip, true);
 		goto out;
 	}
 
@@ -1772,6 +1814,8 @@ xfs_inactive(
 	if (error)
 		goto out;
 
+	xfs_reflink_opt_disconnect(mp, ip, false);
+
 	/*
 	 * If there are attributes associated with the file then blow them away
 	 * now. The code calls a routine that recursively deconstructs the
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index 3beb470f18920d6730b32e5d1edcf5530b93b327..b99e6433b27645d3622c6059906b1af3df45ba54 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -85,6 +85,15 @@ typedef struct xfs_inode {
 	 */
 	xfs_agino_t		i_prev_unlinked;
 
+	/* flags for controlling reflink CoW behavior */
+	uint32_t		i_reflink_flags;
+	/*
+	 * Saved reflink inode pointer for the sake of quick unshare;
+	 * currently we only support one reflink file under the
+	 * XFS_REFLINK_PRIMARY flag.
+	 */
+	struct xfs_inode	*i_reflink_opt_ip;
+	struct list_head	i_reflink_opt_gclist;
+
 	/* VFS inode */
 	struct inode		i_vnode;	/* embedded VFS inode */
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index 535f6d38cdb54032c9e90cdde35ada7094a0e515..0ad3844f4ff22ab4c77e1e4dbcf041d10e303029 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -1880,6 +1880,93 @@ xfs_fs_eofblocks_from_user(
 #define XFS_IOC_ALLOCSP64	_IOW ('X', 36, struct xfs_flock64)
 #define XFS_IOC_FREESP64	_IOW ('X', 37, struct xfs_flock64)
 
+static bool
+xfs_need_wait_reflink_secondary(
+	struct xfs_mount	*mp,
+	struct xfs_inode	*ip)
+{
+	struct xfs_inode	*sip;
+
+	mutex_lock(&mp->m_reflink_opt_lock);
+	sip = ip->i_reflink_opt_ip;
+	if (!sip /* pair no longer valid */ ||
+	    (READ_ONCE(sip->i_flags) & XFS_NEED_INACTIVE) /* retry now */) {
+		mutex_unlock(&mp->m_reflink_opt_lock);
+		return false;
+	}
+	mutex_unlock(&mp->m_reflink_opt_lock);
+	return true;
+}
+
+int
+xfs_wait_reflink_secondary(
+	struct xfs_mount	*mp,
+	struct xfs_inode	*ip,
+	u32			timeout_sec)
+{
+	struct xfs_inode	*sip;
+	unsigned long		expire = 0;
+
+	if (!(ip->i_reflink_flags & XFS_REFLINK_PRIMARY))
+		return -EINVAL;
+	if (timeout_sec)
+		expire = jiffies + HZ * timeout_sec;
+retry:
+	mutex_lock(&mp->m_reflink_opt_lock);
+	sip = ip->i_reflink_opt_ip;
+	if (!sip) {
+		mutex_unlock(&mp->m_reflink_opt_lock);
+		return 0;
+	}
+	spin_lock(&sip->i_flags_lock);
+	/*
+	 * Decide here whether the secondary can be inactivated
+	 * immediately.
+	 */
+	/* already being inactivated by someone else? */
+	if ((sip->i_flags & XFS_INACTIVATING) ||
+	    /* or the inode isn't reclaimable yet (active, or a race)? */
+	    !(sip->i_flags & (XFS_NEED_INACTIVE | XFS_INACTIVATING))) {
+		spin_unlock(&sip->i_flags_lock);
+		mutex_unlock(&mp->m_reflink_opt_lock);
+		if (fatal_signal_pending(current))
+			return -EINTR;
+		if (timeout_sec) {
+			if (time_after(jiffies, expire))
+				return -ETIMEDOUT;
+			wait_event_killable_timeout(mp->m_reflink_opt_wait,
+				!xfs_need_wait_reflink_secondary(mp, ip),
+				HZ * timeout_sec);
+		} else {
+			wait_event_killable(mp->m_reflink_opt_wait,
+				!xfs_need_wait_reflink_secondary(mp, ip));
+		}
+		goto retry;
+	}
+	spin_unlock(&sip->i_flags_lock);
+
+	/*
+	 * The inode is already on the gclist since it is attached before
+	 * XFS_NEED_INACTIVE is set; try to drop it from the gclist here.
+	 */
+	spin_lock(&mp->m_reflink_opt_gclock);
+	/* if the background worker decided to handle it instead, the list is empty */
+	if (list_empty(&sip->i_reflink_opt_gclist)) {
+		spin_unlock(&mp->m_reflink_opt_gclock);
+		mutex_unlock(&mp->m_reflink_opt_lock);
+		goto retry;
+	}
+	list_del_init(&sip->i_reflink_opt_gclist);
+	spin_unlock(&mp->m_reflink_opt_gclock);
+	mutex_unlock(&mp->m_reflink_opt_lock);
+
+	/* XFS_NEED_INACTIVE will be stable here. */
+	ASSERT(sip->i_flags & XFS_NEED_INACTIVE);
+	xfs_iflags_set(sip, XFS_INACTIVATING);
+	xfs_inodegc_inactivate(sip);
+	return 0;
+}
+
 /*
  * Note: some of the ioctl's return positive numbers as a
  * byte count indicating success, such as readlink_by_handle.
@@ -2159,6 +2246,48 @@ xfs_file_ioctl(
 		return error;
 	}
 
+	case XFS_IOC_SET_REFLINK_FLAGS: {
+		uint32_t	in;
+
+		if (get_user(in, (uint32_t __user *)arg))
+			return -EFAULT;
+
+		/* invalid values */
+		if ((in & ~(XFS_REFLINK_PRIMARY | XFS_REFLINK_SECONDARY)) ||
+		    (in & (XFS_REFLINK_PRIMARY | XFS_REFLINK_SECONDARY)) ==
+		    (XFS_REFLINK_PRIMARY | XFS_REFLINK_SECONDARY))
+			return -EINVAL;
+
+		/* clearing all flags is not allowed */
+		if (!in)
+			return -EINVAL;
+
+		xfs_ilock(ip, XFS_ILOCK_EXCL);
+		if (!ip->i_reflink_flags) {
+			ip->i_reflink_flags = in;
+		} else if (ip->i_reflink_flags != in) {
+			xfs_iunlock(ip, XFS_ILOCK_EXCL);
+			return -EINVAL;
+		}
+		xfs_iunlock(ip, XFS_ILOCK_EXCL);
+		return 0;
+	}
+
+	case XFS_IOC_GET_REFLINK_FLAGS: {
+		if (put_user(ip->i_reflink_flags, (uint32_t __user *)arg))
+			return -EFAULT;
+		return 0;
+	}
+
+	case XFS_IOC_WAIT_REFLINK_SECONDARY: {
+		u32		timeout_sec;
+
+		if (get_user(timeout_sec, (uint32_t __user *)arg))
+			return -EFAULT;
+
+		return xfs_wait_reflink_secondary(mp, ip, timeout_sec);
+	}
+
 	default:
 		return -ENOTTY;
 	}
diff --git a/fs/xfs/xfs_ioctl.h b/fs/xfs/xfs_ioctl.h
index 38be600b5e1e8391c52f0fdf0db69ff6f498125a..f74bb55133d944687b8db1d760145d491f522ae6 100644
--- a/fs/xfs/xfs_ioctl.h
+++ b/fs/xfs/xfs_ioctl.h
@@ -69,4 +69,6 @@ int xfs_fsbulkstat_one_fmt(struct xfs_ibulk *breq,
 		const struct xfs_bulkstat *bstat);
 int xfs_fsinumbers_fmt(struct xfs_ibulk *breq, const struct xfs_inumbers *igrp);
 
+int xfs_wait_reflink_secondary(struct xfs_mount *mp, struct xfs_inode *ip, u32 timeout_sec);
+
 #endif
diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h
index e9d317a3dafe4e673c439e504bc7fc24ca503dbe..41828f4e5d7c31ccefcb9297ced6e1a291c87884 100644
--- a/fs/xfs/xfs_linux.h
+++ b/fs/xfs/xfs_linux.h
@@ -103,6 +103,7 @@ typedef __u32 xfs_nlink_t;
 #define xfs_inherit_nodefrag	xfs_params.inherit_nodfrg.val
 #define xfs_fstrm_centisecs	xfs_params.fstrm_timer.val
 #define xfs_blockgc_secs	xfs_params.blockgc_timer.val
+#define xfs_reflink_inactive_force_log_period	xfs_params.reflink_inactive_force_log_period.val
 
 #define current_cpu()		(raw_smp_processor_id())
 #define current_set_flags_nested(sp, f)		\
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index d19cca099bc3a701786c3e5e9871f086ac3f290c..348d2eb9f649826a3e2f888256893b2365e0ac4b 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -250,6 +250,12 @@ typedef struct xfs_mount {
 
 	/* cpus that have inodes queued for inactivation */
 	struct cpumask	m_inodegc_cpumask;
+
+	struct mutex		m_reflink_opt_lock;
+	spinlock_t		m_reflink_opt_gclock;
+	struct list_head	m_reflink_opt_gclist;
+	struct work_struct	m_reflink_opt_gcwork;
+	struct wait_queue_head	m_reflink_opt_wait;
 } xfs_mount_t;
 
 #define M_IGEO(mp)		(&(mp)->m_ino_geo)
diff --git a/fs/xfs/xfs_notify_failure.c b/fs/xfs/xfs_notify_failure.c
index a7daa522e00fe758039b0811bfc9568b28dd1466..30655551dc756bf6db32a64d00c0d6bb3392146b 100644
--- a/fs/xfs/xfs_notify_failure.c
+++ b/fs/xfs/xfs_notify_failure.c
@@ -22,6 +22,7 @@
 
 #include <linux/mm.h>
 #include <linux/dax.h>
+#include <linux/pfn_t.h>
 
 struct xfs_failure_info {
 	xfs_agblock_t		startblock;
@@ -173,6 +174,128 @@ xfs_dax_notify_ddev_failure(
 	return error;
 }
 
+static int
+xfs_mf_dax_kill_procs(
+	struct xfs_mount	*mp,
+	struct address_space	*mapping,
+	pgoff_t			pgoff,
+	unsigned long		nrpages,
+	int			mf_flags,
+	bool			share)
+{
+	int			rc, rc2 = 0;
+
+	if (share) {
+		struct xfs_inode *ip = XFS_I(mapping->host);
+
+		mutex_lock(&mp->m_reflink_opt_lock);
+		if (ip->i_reflink_opt_ip) {
+			rc2 = mf_dax_kill_procs(VFS_I(ip->i_reflink_opt_ip)->i_mapping,
+					pgoff, nrpages, mf_flags);
+		} else {
+			xfs_warn(mp, "share mode should only be used with REFLINK_PRIMARY|REFLINK_SECONDARY @ ino %llu",
+					ip->i_ino);
+		}
+		mutex_unlock(&mp->m_reflink_opt_lock);
+	}
+	rc = mf_dax_kill_procs(mapping, pgoff, nrpages, mf_flags);
+	iput(mapping->host);
+	return rc ? rc : rc2;
+}
+
+static int
+xfs_dax_notify_ddev_failure2(
+	struct dax_device	*dax_dev,
+	struct xfs_mount	*mp,
+	loff_t			pos,
+	size_t			size,
+	int			mf_flags)
+{
+	struct address_space	*lmapping = NULL;
+	bool			lshare = false;
+	pfn_t			pfn;
+	pgoff_t			pgoff, lpgoff;
+	unsigned long		nrpages;
+	long			length;
+	int			rc, id;
+
+	rc = bdev_dax_pgoff(mp->m_ddev_targp->bt_bdev, pos >> SECTOR_SHIFT,
+			size, &pgoff);
+	if (rc)
+		return rc;
+	id = dax_read_lock();
+	length = dax_direct_access(dax_dev, pgoff, PHYS_PFN(size), DAX_ACCESS,
+			NULL, &pfn);
+	if (length < 0) {
+		rc = length;
+		goto out;
+	}
+
+	if (PFN_PHYS(length) < size) {
+		rc = -EINVAL;
+		goto out;
+	}
+	rc = 0;
+	while (length) {
+		struct page *page;
+		struct address_space *mapping;
+		bool share = false;
+
+		page = pfn_t_to_page(pfn);
+		pfn.val++;
+		--length;
+
+retry:
+		rcu_read_lock();
+		mapping = page ? READ_ONCE(page->mapping) : NULL;
+		if (mapping) {
+			share = (unsigned long)mapping & PAGE_MAPPING_DAX_SHARED;
+			mapping = (void *)((unsigned long)mapping &
+					   ~PAGE_MAPPING_DAX_SHARED);
+			if (!igrab(mapping->host)) {
+				rcu_read_unlock();
+				goto retry;
+			}
+			/* paired with smp_mb() in dax_page_share_get() to ensure a valid index */
+			smp_mb();
+			if (!share) {
+				pgoff = READ_ONCE(page->index);
+			} else {
+				WARN_ON(!test_bit(AS_FSDAX_NORMAP, &mapping->flags));
+				pgoff = READ_ONCE(page->private);
+			}
+		}
+		rcu_read_unlock();
+
+		if (lmapping) {
+			if (mapping != lmapping || share != lshare ||
+			    lpgoff + nrpages != pgoff) {
+				rc = xfs_mf_dax_kill_procs(mp, lmapping, lpgoff,
+						nrpages, mf_flags, lshare);
+				if (rc)
+					break;
+			} else {
+				nrpages++;
+				continue;
+			}
+		}
+		lmapping = mapping;
+		lpgoff = pgoff;
+		lshare = share;
+		nrpages = 1;
+	}
+
+	if (lmapping) {
+		int rc2;
+
+		rc2 = xfs_mf_dax_kill_procs(mp, lmapping, lpgoff, nrpages,
+				mf_flags, lshare);
+		if (!rc)
+			rc = rc2;
+	}
+out:
+	dax_read_unlock(id);
+	return rc;
+}
+
 static int
 xfs_dax_notify_failure(
 	struct dax_device	*dax_dev,
@@ -202,11 +325,6 @@ xfs_dax_notify_failure(
 		return -EFSCORRUPTED;
 	}
 
-	if (!xfs_has_rmapbt(mp)) {
-		xfs_debug(mp, "notify_failure() needs rmapbt enabled!");
-		return -EOPNOTSUPP;
-	}
-
 	ddev_start = mp->m_ddev_targp->bt_dax_part_off;
 	ddev_end = ddev_start + bdev_nr_bytes(mp->m_ddev_targp->bt_bdev) - 1;
 
@@ -226,6 +344,9 @@ xfs_dax_notify_failure(
 	if (offset + len - 1 > ddev_end)
 		len = ddev_end - offset + 1;
 
+	if (!xfs_has_rmapbt(mp))
+		return xfs_dax_notify_ddev_failure2(dax_dev, mp, offset, len,
+				mf_flags);
 	return xfs_dax_notify_ddev_failure(mp, BTOBB(offset), BTOBB(len),
 			mf_flags);
 }
diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
index e5b62dc2846644c142b04423fbec10e57d81c34d..f51d86c4458135fd93f9ef9d2d1d91a083949cec 100644
--- a/fs/xfs/xfs_reflink.c
+++ b/fs/xfs/xfs_reflink.c
@@ -27,6 +27,8 @@
 #include "xfs_quota.h"
 #include "xfs_reflink.h"
 #include "xfs_iomap.h"
+#include "xfs_rmap.h"
+#include "xfs_rmap_btree.h"
 #include "xfs_ag.h"
 #include "xfs_ag_resv.h"
 
@@ -514,6 +516,138 @@ xfs_reflink_fill_delalloc(
 	return error;
 }
 
+#ifdef CONFIG_FS_DAX
+STATIC int
+xfs_reflink_unshare_range(
+	struct xfs_inode	*src,
+	struct xfs_bmbt_irec	*oimap,
+	bool			*secondary_evicting)
+{
+	struct xfs_mount	*mp = src->i_mount;
+	struct xfs_inode	*ip;
+	xfs_fileoff_t		offset_fsb = oimap->br_startoff;
+	xfs_filblks_t		count_fsb = oimap->br_blockcount;
+	struct xfs_trans	*tp;
+	int			nimaps, error = 0;
+	bool			shared, found;
+	xfs_filblks_t		resaligned;
+	xfs_extlen_t		resblks = 0;
+	uint			lockmode = XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL;
+	struct xfs_bmbt_irec	imap = *oimap;
+	struct xfs_bmbt_irec	cmap;
+
+	mutex_lock(&mp->m_reflink_opt_lock);
+	ip = src->i_reflink_opt_ip;
+	if (!ip || !igrab(VFS_I(ip))) {
+		mutex_unlock(&mp->m_reflink_opt_lock);
+		*secondary_evicting = true;
+		return 0;
+	}
+	mutex_unlock(&mp->m_reflink_opt_lock);
+
+	xfs_ilock(ip, lockmode);
+	xfs_flush_unmap_range(ip, XFS_FSB_TO_B(mp, imap.br_startoff),
+			XFS_FSB_TO_B(mp, imap.br_blockcount));
+
+	error = xfs_find_trim_cow_extent(ip, &imap, &cmap, &shared, &found);
+	if (error || !shared)
+		goto error;
+
+	if (found)
+		goto convert;
+
+	resaligned = xfs_aligned_fsb_count(imap.br_startoff,
+			imap.br_blockcount, xfs_get_cowextsz_hint(ip));
+	resblks = XFS_DIOSTRAT_SPACE_RES(mp, resaligned);
+
+	xfs_iunlock(ip, XFS_ILOCK_EXCL);
+	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, 0, &tp);
+	if (error) {
+		lockmode = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL;
+		goto error;
+	}
+
+	xfs_ilock(ip, XFS_ILOCK_EXCL);
+
+	error = xfs_qm_dqattach_locked(ip, false);
+	if (error)
+		goto out_trans_cancel;
+
+	/*
+	 * Check for an overlapping extent again now that we dropped the ilock.
+	 */
+	error = xfs_find_trim_cow_extent(ip, &imap, &cmap, &shared, &found);
+	if (error || !shared)
+		goto out_trans_cancel;
+	if (found) {
+		xfs_trans_cancel(tp);
+		goto convert;
+	}
+
+	error = xfs_trans_reserve_quota_nblks(tp, ip, resblks, 0,
+			XFS_QMOPT_RES_REGBLKS);
+	if (error)
+		goto out_trans_cancel;
+
+	xfs_trans_ijoin(tp, ip, 0);
+
+	/* Allocate the entire reservation as zeroed blocks. */
+	nimaps = 1;
+	error = xfs_bmapi_write(tp, ip, imap.br_startoff, imap.br_blockcount,
+			XFS_BMAPI_COWFORK | XFS_BMAPI_ZERO, resblks, &cmap,
+			&nimaps);
+	if (error)
+		goto out_trans_cancel;
+
+	xfs_inode_set_cowblocks_tag(ip);
+	error = xfs_trans_commit(tp);
+	if (error)
+		goto error;
+
+	/*
+	 * Allocation succeeded but the requested range was not even partially
+	 * satisfied?  Bail out!
+	 */
+	if (nimaps == 0) {
+		error = -ENOSPC;
+		goto error;
+	}
+convert:
+	xfs_trim_extent(&cmap, offset_fsb, count_fsb);
+	trace_xfs_reflink_convert_cow(ip, &cmap);
+	error = xfs_reflink_convert_cow_locked(ip, offset_fsb, count_fsb);
+	if (error)
+		goto error;
+	cmap.br_state = XFS_EXT_NORM;
+	dax_copy_range(xfs_inode_buftarg(ip)->bt_bdev,
+			xfs_inode_buftarg(ip)->bt_daxdev,
+			BBTOB(xfs_fsb_to_db(ip, oimap->br_startblock)),
+			BBTOB(xfs_fsb_to_db(ip, cmap.br_startblock)),
+			XFS_FSB_TO_B(mp, cmap.br_blockcount));
+	xfs_iunlock(ip, XFS_ILOCK_EXCL);
+	xfs_reflink_end_cow(ip, XFS_FSB_TO_B(mp, cmap.br_startoff),
+			XFS_FSB_TO_B(mp, cmap.br_blockcount));
+	xfs_iunlock(ip, XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL);
+	xfs_irele(ip);
+	return error;
+
+out_trans_cancel:
+	xfs_trans_cancel(tp);
+error:
+	xfs_iunlock(ip, lockmode);
+	xfs_irele(ip);
+	return error;
+}
+#else
+STATIC int
+xfs_reflink_unshare_range(
+	struct xfs_inode	*src,
+	struct xfs_bmbt_irec	*oimap,
+	bool			*secondary_evicting)
+{
+	return 0;
+}
+#endif
+
 /* Allocate all CoW reservations covering a range of blocks in a file. */
 int
 xfs_reflink_allocate_cow(
@@ -526,6 +660,7 @@ xfs_reflink_allocate_cow(
 {
 	int			error;
 	bool			found;
+	bool			secondary_evicting = false;
 
 	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
 	if (!ip->i_cowfp) {
@@ -542,6 +677,26 @@ xfs_reflink_allocate_cow(
 		return xfs_reflink_convert_unwritten(ip, imap, cmap,
 				convert_now);
 
+	if (ip->i_reflink_flags & XFS_REFLINK_PRIMARY) {
+		xfs_iunlock(ip, *lockmode);
+		error = xfs_reflink_unshare_range(ip, imap,
+				&secondary_evicting);
+		xfs_ilock(ip, *lockmode);
+		if (error) {
+			xfs_warn(ip->i_mount,
+				"failed to unshare secondary range @ ino %llu",
+				ip->i_ino);
+		} else if (secondary_evicting) {
+			/*
+			 * It's impossible for another reflink to race in here
+			 * (e.g. with FICLONE) since ip holds XFS_MMAPLOCK_SHARED
+			 * and FICLONE needs XFS_MMAPLOCK_EXCL.
+			 */
+			*shared = false;
+			return 0;
+		}
+	}
+
 	/*
 	 * CoW fork does not have an extent and data extent is shared.
 	 * Allocate a real extent in the CoW fork.
@@ -1500,6 +1655,27 @@ xfs_reflink_remap_prep(
 	if (IS_DAX(inode_in) != IS_DAX(inode_out))
 		goto out_unlock;
 
+	if (src->i_reflink_flags & XFS_REFLINK_PRIMARY) {
+		if (!(dest->i_reflink_flags & XFS_REFLINK_SECONDARY))
+			goto out_unlock;
+		if (pos_in != pos_out)
+			goto out_unlock;
+		if (src->i_reflink_opt_ip || dest->i_reflink_opt_ip) {
+			xfs_warn(src->i_mount,
+				"src (XFS_REFLINK_PRIMARY) and/or dest (XFS_REFLINK_SECONDARY) is already paired via FICLONE");
+			goto out_unlock;
+		}
+	}
+
+	/*
+	 * For inodes flagged with XFS_REFLINK_{PRIMARY,SECONDARY},
+	 * users do not need persistence, so we can apply fast reflink,
+	 * i.e., write-protect without flushing dirty pages.
+	 */
+	if (src->i_reflink_flags & (XFS_REFLINK_PRIMARY |
+				    XFS_REFLINK_SECONDARY))
+		remap_flags |= REMAP_FILE_FAST_REFLINK;
+
 	if (!IS_DAX(inode_in))
 		ret = generic_remap_file_range_prep(file_in, pos_in, file_out,
 				pos_out, len, remap_flags);
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 13007b6bc9f3378a9dd24820602a829ea10a896f..b6db17213816502c3d27dfa1ee977b09657a543a 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -1528,6 +1528,11 @@ xfs_fs_fill_super(
 #endif
 	sb->s_op = &xfs_super_operations;
 
+	spin_lock_init(&mp->m_reflink_opt_gclock);
+	INIT_LIST_HEAD(&mp->m_reflink_opt_gclist);
+	INIT_WORK(&mp->m_reflink_opt_gcwork, xfs_inodegc_reflink_opt_worker);
+	init_waitqueue_head(&mp->m_reflink_opt_wait);
+
 	/*
 	 * Delay mount work if the debug hook is set. This is debug
 	 * instrumention to coordinate simulation of xfs mount failures with
@@ -2002,6 +2007,8 @@ static int xfs_init_fs_context(
 	INIT_RADIX_TREE(&mp->m_perag_tree, GFP_ATOMIC);
 	spin_lock_init(&mp->m_perag_lock);
 	mutex_init(&mp->m_growlock);
+	mutex_init(&mp->m_reflink_opt_lock);
+
 	INIT_WORK(&mp->m_flush_inodes_work, xfs_flush_inodes_worker);
 	INIT_DELAYED_WORK(&mp->m_reclaim_work, xfs_reclaim_worker);
 	mp->m_kobj.kobject.kset = xfs_kset;
diff --git a/fs/xfs/xfs_sysctl.c b/fs/xfs/xfs_sysctl.c
index fade337353931663ec17ff72a47d74b9053c901c..6b93b230166cbe08202f19d3b83bfb670d392af9 100644
--- a/fs/xfs/xfs_sysctl.c
+++ b/fs/xfs/xfs_sysctl.c
@@ -113,6 +113,15 @@ static struct ctl_table xfs_table[] = {
 		.extra1		= &xfs_params.syncd_timer.min,
 		.extra2		= &xfs_params.syncd_timer.max
 	},
+	{
+		.procname	= "reflink_inactive_force_log_period",
+		.data		= &xfs_params.reflink_inactive_force_log_period.val,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &xfs_params.reflink_inactive_force_log_period.min,
+		.extra2		= &xfs_params.reflink_inactive_force_log_period.max
+	},
 	{
 		.procname	= "inherit_sync",
 		.data		= &xfs_params.inherit_sync.val,
diff --git a/fs/xfs/xfs_sysctl.h b/fs/xfs/xfs_sysctl.h
index f78ad6b10ea58dedb0f93773abfc5f156de08f85..726eb447bb49ff4221280f5630cab288a933b5d2 100644
--- a/fs/xfs/xfs_sysctl.h
+++ b/fs/xfs/xfs_sysctl.h
@@ -36,6 +36,7 @@ typedef struct xfs_param {
 	xfs_sysctl_val_t inherit_nodfrg;/* Inherit the "nodefrag" inode flag. */
 	xfs_sysctl_val_t fstrm_timer;	/* Filestream dir-AG assoc'n timeout. */
 	xfs_sysctl_val_t blockgc_timer;	/* Interval between blockgc scans */
+	xfs_sysctl_val_t reflink_inactive_force_log_period;
 } xfs_param_t;
 
 /*
diff --git a/include/linux/dax.h b/include/linux/dax.h
index b463502b16e17fbc08ff3a982a9597f5d70cb06b..66e663664accaf5ebb462b74dff39fc3224a73e2 100644
--- a/include/linux/dax.h
+++ b/include/linux/dax.h
@@ -128,6 +128,8 @@ void set_dax_nocache(struct dax_device *dax_dev);
 void set_dax_nomc(struct dax_device *dax_dev);
 
 struct writeback_control;
+int bdev_dax_pgoff(struct block_device *bdev, sector_t sector, size_t size,
+		pgoff_t *pgoff);
 #if defined(CONFIG_BLOCK) && defined(CONFIG_FS_DAX)
 int dax_add_host(struct dax_device *dax_dev, struct gendisk *disk);
 void dax_remove_host(struct gendisk *disk);
@@ -248,6 +250,8 @@ vm_fault_t dax_finish_sync_fault(struct vm_fault *vmf,
 int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index);
 int dax_invalidate_mapping_entry_sync(struct address_space *mapping,
 				      pgoff_t index);
+int dax_copy_range(struct block_device *bdev, struct dax_device *dax_dev,
+		u64 src_addr, u64 dst_addr, size_t size);
 int dax_dedupe_file_range_compare(struct inode *src, loff_t srcoff,
 				  struct inode *dest, loff_t destoff,
 				  loff_t len, bool *is_same,
diff --git a/include/linux/fs.h b/include/linux/fs.h
index dcf1acd8f6d02047891d34b4ffcc4f7fcb184dac..5932a74a710cd59c87c96e08ccbbe263f25e4f68 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -495,6 +495,8 @@ struct address_space {
 	struct rw_semaphore	i_mmap_rwsem;
 	void			*private_data;
 
+	struct fast_reflink_work *fast_reflink_work;
+
 	CK_KABI_RESERVE(1)
 	CK_KABI_RESERVE(2)
 	CK_KABI_RESERVE(3)
@@ -1911,6 +1913,8 @@ struct dir_context {
 */
 #define REMAP_FILE_ADVISORY		(REMAP_FILE_CAN_SHORTEN)
 
+#define REMAP_FILE_FAST_REFLINK		(1 << 2)
+
 /*
  * These flags control the behavior of vfs_copy_file_range().
  * They are not available to the user via syscall.
diff --git a/include/linux/mm.h b/include/linux/mm.h
index a8eb5c530152cf5ac14ed63ef1beb46ded8036eb..e0ceb4a6d25468330930157d311aef325aa9376d 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -4184,8 +4184,22 @@ static inline void async_fork_fixup_vma(struct vm_area_struct *mpnt)
 }
 #endif
 
+struct fast_reflink_work {
+	struct work_struct work;
+	struct address_space *mapping;
+};
+
+int fast_reflink_apply(struct address_space *mapping, pgoff_t start,
+		pgoff_t end);
+bool is_pmd_fast_reflink(pmd_t pmd);
+void fast_reflink_fixup_pmd(struct vm_area_struct *vma, pmd_t *pmd,
+		unsigned long addr);
+void fast_reflink_fixup_vma(struct vm_area_struct *vma);
+
 static inline bool is_pmd_transient(pmd_t pmd)
 {
+	if (is_pmd_fast_reflink(pmd))
+		return true;
 	if (is_pmd_async_fork(pmd))
 		return true;
 	return false;
@@ -4193,10 +4207,12 @@ static inline bool is_pmd_transient(pmd_t pmd)
 static inline void fixup_pmd(struct vm_area_struct *vma, pmd_t *pmd,
 			     unsigned long addr)
 {
+	fast_reflink_fixup_pmd(vma, pmd, addr);
 	async_fork_fixup_pmd(vma, pmd, addr);
 }
 
 static inline void fixup_vma(struct vm_area_struct *vma)
 {
+	fast_reflink_fixup_vma(vma);
 	async_fork_fixup_vma(vma);
 }
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 7383b2a530ba75aaa626bc9bfad8cdfe68e9435d..968ad624e596a06a7b3303682331ce9a78c74511 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -704,6 +704,8 @@ struct vm_area_struct {
 	struct vm_area_struct *async_fork_vma;
 #endif
 
+	bool fast_reflink;
+
 	CK_KABI_RESERVE(1)
 	CK_KABI_RESERVE(2)
 	CK_KABI_RESERVE(3)
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index 5f4b2f18d8d5fdbdc8ef868e13fd031dc717a563..9a90da047f49186cd364b8e9e8f18b92c12266c3 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -642,7 +642,7 @@ __PAGEFLAG(Kfence, kfence, PF_ANY)
 * Different with flags above, this flag is used only for fsdax mode.  It
 * indicates that this page->mapping is now under reflink case.
 */
-#define PAGE_MAPPING_DAX_SHARED	((void *)0x1)
+#define PAGE_MAPPING_DAX_SHARED	0x1UL
 
 static __always_inline bool folio_mapping_flags(struct folio *folio)
 {
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 3e70c51d3862c4504be99ac3066649005a6551ba..7b838458b9502d255d104516b317666b08c90a25 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -206,6 +206,8 @@ enum mapping_flags {
 	AS_RELEASE_ALWAYS,	/* Call ->release_folio(), even if no private data */
 	AS_STABLE_WRITES,	/* must wait for writeback before modifying folio contents */
+
+	AS_FSDAX_NORMAP = 30,
 };
 
 /**
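
Changing PAGE_MAPPING_DAX_SHARED from a sentinel pointer to a low bit means page->mapping can now carry the real address_space and the shared flag at the same time, which is what the failure-notification path above relies on. A stand-alone model of the encode/decode convention (hypothetical names; the real code additionally uses READ_ONCE/WRITE_ONCE as shown in fs/dax.c):

	/* Stand-alone model of the low-bit mapping tag; hypothetical names. */
	#include <assert.h>
	#include <stdint.h>

	#define PAGE_MAPPING_DAX_SHARED 0x1UL

	struct address_space { int dummy; };

	/* address_space pointers are at least word-aligned, so bit 0 is free */
	static uintptr_t encode(struct address_space *mapping)
	{
		return (uintptr_t)mapping | PAGE_MAPPING_DAX_SHARED;
	}

	static struct address_space *decode(uintptr_t raw)
	{
		return (struct address_space *)(raw & ~PAGE_MAPPING_DAX_SHARED);
	}

	static int is_shared(uintptr_t raw)
	{
		return raw & PAGE_MAPPING_DAX_SHARED;
	}

	int main(void)
	{
		struct address_space as;
		uintptr_t raw = encode(&as);

		assert(is_shared(raw));
		assert(decode(raw) == &as);
		return 0;
	}
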
diff --git a/mm/memory.c b/mm/memory.c
index e4b4221d2617b1eb4e7cf1c09668ef416ecc04ae..3bf1d648699cccdb5fa2e6899d0c7e9a0ad12dcd 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -6608,3 +6608,336 @@ void ptlock_free(struct ptdesc *ptdesc)
 	kmem_cache_free(page_ptl_cachep, ptdesc->ptl);
 }
 #endif
+
+/* Fast reflink */
+static inline bool is_pmd_tbl_wrprotect(pmd_t pmd)
+{
+#if defined(CONFIG_ARM64)
+#define PMD_SECT_AP_WRPROTECT	(_AT(pmdval_t, 2) << 61)	/* APTable[1:0] */
+	return (pmd_val(pmd) & PMD_TABLE_BIT) &&
+	       (pmd_val(pmd) & PMD_SECT_AP_WRPROTECT);
+#elif defined(CONFIG_X86)
+	return (pmd_flags(pmd) & ~_PAGE_USER) == (_KERNPG_TABLE & ~_PAGE_RW);
+#else
+	return false;
+#endif
+}
+
+static inline void pmdp_set_tbl_wrprotect(struct mm_struct *mm,
+		unsigned long addr, pmd_t *pmdp)
+{
+#if defined(CONFIG_ARM64)
+	set_pmd(pmdp, __pmd(pmd_val(*pmdp) | PMD_SECT_AP_WRPROTECT));
+#elif defined(CONFIG_X86)
+	pmdp_set_wrprotect(mm, addr, pmdp);
+#endif
+}
+
+static inline void pmdp_clear_tbl_wrprotect(pmd_t *pmdp,
+		struct vm_area_struct *vma)
+{
+#if defined(CONFIG_ARM64)
+	set_pmd(pmdp, __pmd(pmd_val(*pmdp) & ~PMD_SECT_AP_WRPROTECT));
+#elif defined(CONFIG_X86)
+	set_pmd(pmdp, pmd_mkwrite(*pmdp, vma));
+#endif
+}
+
+bool is_pmd_fast_reflink(pmd_t pmd)
+{
+	return !is_swap_pmd(pmd) && !pmd_trans_huge(pmd) &&
+	       !pmd_devmap(pmd) && is_pmd_tbl_wrprotect(pmd);
+}
+
+static int follow_pmd(struct mm_struct *mm, unsigned long address,
+		pmd_t **pmdp)
+{
+	pgd_t *pgd;
+	p4d_t *p4d;
+	pud_t *pud;
+	pmd_t *pmd;
+
+	pgd = pgd_offset(mm, address);
+	if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
+		goto out;
+
+	p4d = p4d_offset(pgd, address);
+	if (p4d_none(*p4d) || unlikely(p4d_bad(*p4d)))
+		goto out;
+
+	pud = pud_offset(p4d, address);
+	if (pud_none(*pud) || unlikely(pud_bad(*pud)))
+		goto out;
+
+	pmd = pmd_offset(pud, address);
+	if (pmd_huge(*pmd))
+		goto found;
+
+	if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd)))
+		goto out;
+
+found:
+	*pmdp = pmd;
+	return 0;
+out:
+	return -EINVAL;
+}
+
+static void fr_apply_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
+		unsigned long start, unsigned long end)
+{
+	pte_t *start_pte;
+	pte_t *ptep, pte;
+	spinlock_t *ptl;
+
+	start_pte = pte_offset_map_lock(vma->vm_mm, pmd, start, &ptl);
+	ptep = start_pte;
+
+	do {
+		pte = *ptep;
+		if (pte_none(pte))
+			continue;
+
+		if (!pte_dirty(pte) && !pte_write(pte))
+			continue;
+
+		/* The caller is responsible for the TLB flush. */
+		pte = ptep_get_and_clear(vma->vm_mm, start, ptep);
+		pte = pte_wrprotect(pte);
+		pte = pte_mkclean(pte);
+		set_pte_at(vma->vm_mm, start, ptep, pte);
+	} while (ptep++, start += PAGE_SIZE, start != end);
+
+	pte_unmap_unlock(start_pte, ptl);
+}
+
+static void fr_apply_vma(struct vm_area_struct *vma)
+{
+	struct mm_struct *mm = vma->vm_mm;
+	unsigned long start = vma->vm_start;
+	unsigned long end = vma->vm_end;
+	unsigned long next;
+	spinlock_t *pml;
+	pmd_t *pmdp = NULL;
+	pmd_t pmd;
+	bool applied = false;
+
+	do {
+		next = pmd_addr_end(start, end);
+		if (follow_pmd(mm, start, &pmdp))
+			continue;
+
+		pml = pmd_lock(mm, pmdp);
+		if (pmd_huge(*pmdp)) {
+#ifdef CONFIG_FS_DAX_PMD
+			if (!pmd_dirty(*pmdp) && !pmd_write(*pmdp))
+				goto unlock_pmd;
+
+			pmd = pmdp_invalidate(vma, start, pmdp);
+			pmd = pmd_wrprotect(pmd);
+			pmd = pmd_mkclean(pmd);
+			set_pmd_at(mm, start, pmdp, pmd);
+unlock_pmd:
+#endif
+			spin_unlock(pml);
+			continue;
+		}
+
+		if (pmd_none(*pmdp) || unlikely(pmd_bad(*pmdp))) {
+			spin_unlock(pml);
+			continue;
+		}
+
+		if (IS_ALIGNED(start, PMD_SIZE) && (start + PMD_SIZE <= end)) {
+			pmdp_set_tbl_wrprotect(mm, start, pmdp);
+			flush_tlb_range(vma, start, start + PMD_SIZE);
+			applied = true;
+			spin_unlock(pml);
+			continue;
+		} else {
+			spin_unlock(pml);
+			fr_apply_pte_range(vma, pmdp, start, next);
+			flush_tlb_range(vma, start, next);
+			continue;
+		}
+	} while (start = next, start != end);
+
+	if (applied)
+		vma->fast_reflink = applied;
+}
+
+static void fast_reflink_fixup(struct work_struct *work);
+
+int fast_reflink_apply(struct address_space *mapping, pgoff_t start,
+		pgoff_t end)
+{
+	struct vm_area_struct *vma;
+
+	i_mmap_lock_read(mapping);
+	vma_interval_tree_foreach(vma, &mapping->i_mmap, start, end) {
+		if (!(vma->vm_flags & VM_SHARED))
+			continue;
+
+		fr_apply_vma(vma);
+	}
+	i_mmap_unlock_read(mapping);
+
+	if (!mapping->fast_reflink_work) {
+		struct fast_reflink_work *fr_work;
+
+		fr_work = kmalloc(sizeof(*fr_work), GFP_KERNEL | __GFP_NOFAIL);
+		INIT_WORK(&fr_work->work, fast_reflink_fixup);
+		fr_work->mapping = mapping;
+		mapping->fast_reflink_work = fr_work;
+	}
+	schedule_work(&mapping->fast_reflink_work->work);
+
+	return 0;
+}
+
+static void fr_fixup_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
+		unsigned long start, unsigned long end)
+{
+	pte_t *start_pte;
+	pte_t *ptep, pte;
+	spinlock_t *ptl;
+
+	start_pte = pte_offset_map_lock(vma->vm_mm, pmd, start, &ptl);
+	ptep = start_pte;
+
+	/* Already fixed up */
+	if (unlikely(!is_pmd_fast_reflink(*pmd)))
+		goto out;
+
+	do {
+		pte = *ptep;
+		if (pte_none(pte))
+			continue;
+
+		if (!pte_dirty(pte) && !pte_write(pte))
+			continue;
+
+		/* The caller is responsible for the TLB flush. */
+		pte = ptep_get_and_clear(vma->vm_mm, start, ptep);
+		pte = pte_wrprotect(pte);
+		pte = pte_mkclean(pte);
+		set_pte_at(vma->vm_mm, start, ptep, pte);
+	} while (ptep++, start += PAGE_SIZE, start != end);
+
+out:
+	pte_unmap_unlock(start_pte, ptl);
+}
+
+static void fr_fixup_pmd_range(struct vm_area_struct *vma, pud_t *pud,
+		unsigned long start, unsigned long end)
+{
+	pmd_t *pmd;
+	unsigned long next;
+	spinlock_t *pml;
+
+	pmd = pmd_offset(pud, start);
+	do {
+		next = pmd_addr_end(start, end);
+		if (pmd_none(*pmd))
+			continue;
+
+		pml = pmd_lock(vma->vm_mm, pmd);
+		if (is_pmd_fast_reflink(*pmd)) {
+			spin_unlock(pml);
+			fr_fixup_pte_range(vma, pmd, start, next);
+
+			pml = pmd_lock(vma->vm_mm, pmd);
+			if (is_pmd_fast_reflink(*pmd))
+				pmdp_clear_tbl_wrprotect(pmd, vma);
+		}
+		spin_unlock(pml);
+	} while (pmd++, start = next, start != end);
+}
+
+static void fr_fixup_pud_range(struct vm_area_struct *vma, p4d_t *p4d,
+		unsigned long start, unsigned long end)
+{
+	pud_t *pud;
+	unsigned long next;
+
+	pud = pud_offset(p4d, start);
+	do {
+		next = pud_addr_end(start, end);
+		if (pud_none_or_clear_bad(pud))
+			continue;
+		fr_fixup_pmd_range(vma, pud, start, next);
+	} while (pud++, start = next, start != end);
+}
+
+static void fr_fixup_p4d_range(struct vm_area_struct *vma, pgd_t *pgd,
+		unsigned long start, unsigned long end)
+{
+	p4d_t *p4d;
+	unsigned long next;
+
+	p4d = p4d_offset(pgd, start);
+	do {
+		next = p4d_addr_end(start, end);
+		if (p4d_none_or_clear_bad(p4d))
+			continue;
+		fr_fixup_pud_range(vma, p4d, start, next);
+	} while (p4d++, start = next, start != end);
+}
+
+static void fr_fixup_page_range(struct vm_area_struct *vma,
+		unsigned long start, unsigned long end)
+{
+	pgd_t *pgd;
+	unsigned long next;
+
+	pgd = pgd_offset(vma->vm_mm, start);
+	do {
+		next = pgd_addr_end(start, end);
+		if (pgd_none_or_clear_bad(pgd))
+			continue;
+		fr_fixup_p4d_range(vma, pgd, start, next);
+	} while (pgd++, start = next, start != end);
+}
+
+/* The mmap_lock (read/write) of vma->vm_mm is held */
+void fast_reflink_fixup_vma(struct vm_area_struct *vma)
+{
+	if (!vma->fast_reflink)
+		return;
+
+	fr_fixup_page_range(vma, vma->vm_start, vma->vm_end);
+	vma->fast_reflink = false;
+#ifdef CONFIG_ARM64
+	flush_tlb_range(vma, vma->vm_start, vma->vm_end);
+#endif
+}
+
+/* The mmap_lock (read) of vma->vm_mm is held */
+void fast_reflink_fixup_pmd(struct vm_area_struct *vma, pmd_t *pmd,
+		unsigned long addr)
+{
+	if (!is_pmd_fast_reflink(*pmd) || !vma->fast_reflink)
+		return;
+
+	addr &= PMD_MASK;
+	fr_fixup_page_range(vma, addr, addr + PMD_SIZE);
+	VM_WARN_ON_ONCE(is_pmd_fast_reflink(*pmd));
+
+#ifdef CONFIG_ARM64
+	flush_tlb_range(vma, addr, addr + PMD_SIZE);
+#endif
+}
+
+static void fast_reflink_fixup(struct work_struct *work)
+{
+	struct fast_reflink_work *fr_work;
+	struct address_space *mapping;
+	struct vm_area_struct *vma;
+
+	fr_work = container_of(work, struct fast_reflink_work, work);
+	mapping = fr_work->mapping;
+
+	i_mmap_lock_read(mapping);
+	vma_interval_tree_foreach(vma, &mapping->i_mmap, 0, ULONG_MAX)
+		fast_reflink_fixup_vma(vma);
+	i_mmap_unlock_read(mapping);
+}
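
The core trick above is worth spelling out: instead of write-protecting up to 512 PTEs one by one, fr_apply_vma() sets a single permission bit on the table entry (APTable on arm64, clearing the writable bit on the kernel's PMD-level table entry on x86), and defers the per-PTE wrprotect/clean work to fast_reflink_fixup_pmd(). The fault path is assumed to reach that fixup through the pre-existing is_pmd_transient()/fixup_pmd() hooks extended in the include/linux/mm.h hunk; that hook wiring ships with the async-fork infrastructure this patch piggybacks on, not with this patch. A rough, illustrative sketch of such a call site:

	/* Illustrative only: how a fault path might consume the hooks. */
	static vm_fault_t example_handle_pmd_fault(struct vm_fault *vmf)
	{
		struct vm_area_struct *vma = vmf->vma;

		/*
		 * is_pmd_transient() covers both the async-fork and the new
		 * fast-reflink table-level write-protect states. fixup_pmd()
		 * restores per-PTE protections (write-protected and clean),
		 * after which the normal write-fault path triggers CoW.
		 */
		if (is_pmd_transient(*vmf->pmd))
			fixup_pmd(vma, vmf->pmd, vmf->address);

		return 0;	/* continue with the regular fault handling */
	}
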
diff --git a/mm/truncate.c b/mm/truncate.c
index 8e3aa9e8618ed8262357edaf550a10425fa95828..126fe71bc6a14c3bd7c4d750a65190a3b056c244 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -469,6 +469,14 @@ void truncate_inode_pages_final(struct address_space *mapping)
 	 */
 	mapping_set_exiting(mapping);
 
+	/* Flush fast reflink work if any. */
+	if (unlikely(mapping->fast_reflink_work)) {
+		flush_work(&mapping->fast_reflink_work->work);
+
+		kfree(mapping->fast_reflink_work);
+		mapping->fast_reflink_work = NULL;
+	}
+
 	if (!mapping_empty(mapping)) {
 		/*
 		 * As truncation uses a lockless tree lookup, cycle