diff --git a/drivers/dax/super.c b/drivers/dax/super.c
index 0da9232ea1754b862455fe6d2d1c0e57398cfde1..6f2bc78f6184c84d09b52d1520c86cfbf3bc956e 100644
--- a/drivers/dax/super.c
+++ b/drivers/dax/super.c
@@ -71,6 +71,20 @@ void dax_remove_host(struct gendisk *disk)
 }
 EXPORT_SYMBOL_GPL(dax_remove_host);
 
+int bdev_dax_pgoff(struct block_device *bdev, sector_t sector, size_t size,
+		pgoff_t *pgoff)
+{
+	sector_t start_sect = bdev ? get_start_sect(bdev) : 0;
+	phys_addr_t phys_off = (start_sect + sector) * 512;
+
+	if (pgoff)
+		*pgoff = PHYS_PFN(phys_off);
+	if (phys_off % PAGE_SIZE || size % PAGE_SIZE)
+		return -EINVAL;
+	return 0;
+}
+EXPORT_SYMBOL(bdev_dax_pgoff);
+
 /**
  * fs_dax_get_by_bdev() - temporary lookup mechanism for filesystem-dax
  * @bdev: block device to find a dax_device for
diff --git a/fs/dax.c b/fs/dax.c
index 3380b43cb6bbbd1289901a3e29edc903abe6156e..544a4c3c1d6277e0a9c06c3085ae378bb1c06918 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -322,23 +322,34 @@ static unsigned long dax_end_pfn(void *entry)
 
 static inline bool dax_page_is_shared(struct page *page)
 {
-	return page->mapping == PAGE_MAPPING_DAX_SHARED;
+	return (unsigned long)READ_ONCE(page->mapping) & PAGE_MAPPING_DAX_SHARED;
 }
 
 /*
  * Set the page->mapping with PAGE_MAPPING_DAX_SHARED flag, increase the
  * refcount.
  */
-static inline void dax_page_share_get(struct page *page)
+static inline void dax_page_share_get(struct page *page,
+		struct address_space *mapping, pgoff_t index)
 {
-	if (page->mapping != PAGE_MAPPING_DAX_SHARED) {
+	struct address_space *oldmapping = READ_ONCE(page->mapping);
+
+	if (!((unsigned long)oldmapping & PAGE_MAPPING_DAX_SHARED)) {
 		/*
 		 * Reset the index if the page was already mapped
 		 * regularly before.
 		 */
-		if (page->mapping)
+		if (oldmapping)
 			page->share = 1;
-		page->mapping = PAGE_MAPPING_DAX_SHARED;
+
+		if (test_bit(AS_FSDAX_NORMAP, &mapping->flags)) {
+			/* Note that we (ab)use page->private to keep index for now */
+			WRITE_ONCE(page->private, index);
+			/* paired with smp_mb() in xfs_dax_notify_ddev_failure2() */
+			smp_mb();
+		}
+		WRITE_ONCE(page->mapping,
+			   (void *)((unsigned long)mapping | PAGE_MAPPING_DAX_SHARED));
 	}
 	page->share++;
 }
@@ -367,7 +378,7 @@ static void dax_associate_entry(void *entry, struct address_space *mapping,
 		struct page *page = pfn_to_page(pfn);
 
 		if (shared) {
-			dax_page_share_get(page);
+			dax_page_share_get(page, mapping, index);
 		} else {
 			WARN_ON_ONCE(page->mapping);
 			page->mapping = mapping;
@@ -1063,6 +1074,46 @@ int dax_writeback_mapping_range(struct address_space *mapping,
 }
 EXPORT_SYMBOL_GPL(dax_writeback_mapping_range);
 
+int dax_copy_range(struct block_device *bdev, struct dax_device *dax_dev,
+		u64 src_addr, u64 dst_addr, size_t size)
+{
+	const sector_t src_sector = src_addr >> SECTOR_SHIFT;
+	const sector_t dst_sector = dst_addr >> SECTOR_SHIFT;
+	pgoff_t spgoff, dpgoff;
+	int id, rc;
+	long length;
+	void *saddr, *daddr;
+
+	rc = bdev_dax_pgoff(bdev, src_sector, size, &spgoff);
+	if (rc)
+		return rc;
+
+	rc = bdev_dax_pgoff(bdev, dst_sector, size, &dpgoff);
+	if (rc)
+		return rc;
+
+	id = dax_read_lock();
+	length = dax_direct_access(dax_dev, spgoff, PHYS_PFN(size), DAX_ACCESS,
+			&saddr, NULL);
+	if (length < 0) {
+		rc = length;
+		goto out;
+	}
+
+	length = dax_direct_access(dax_dev, dpgoff, PHYS_PFN(size), DAX_ACCESS,
+			&daddr, NULL);
+	if (length < 0) {
+		rc = length;
+		goto out;
+	}
+
+	rc = copy_mc_to_kernel(daddr, saddr, size);
+out:
+	dax_read_unlock(id);
+	return rc;
+}
+EXPORT_SYMBOL_GPL(dax_copy_range);
+
 static int dax_iomap_direct_access(const struct iomap *iomap, loff_t pos,
 		size_t size, void **kaddr, pfn_t *pfnp)
 {
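
For context, a minimal sketch of how a caller might drive the two new helpers: both addresses are byte offsets into the same DAX block device, and bdev_dax_pgoff() rejects anything that is not page-aligned with -EINVAL, so dax_copy_range() only ever copies whole pages. The wrapper name below is hypothetical and not part of this patch:

	/*
	 * Sketch only: copy one page-aligned range to another on the same
	 * DAX device. dax_copy_range() uses copy_mc_to_kernel(), so a
	 * machine check on a poisoned source page is survivable.
	 */
	static int example_unshare_copy(struct block_device *bdev,
					struct dax_device *dax_dev,
					u64 src_off, u64 dst_off, size_t len)
	{
		/* dax_copy_range() fails with -EINVAL on unaligned input */
		if (!IS_ALIGNED(src_off | dst_off | len, PAGE_SIZE))
			return -EINVAL;

		return dax_copy_range(bdev, dax_dev, src_off, dst_off, len);
	}
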
diff --git a/fs/remap_range.c b/fs/remap_range.c
index 87ae4f0dc3aa01c6099ef2fa7a66b5d84bdb9703..2fdc13c901ef30cc0186ffa7cf875c307075846d 100644
--- a/fs/remap_range.c
+++ b/fs/remap_range.c
@@ -315,10 +315,18 @@ __generic_remap_file_range_prep(struct file *file_in, loff_t pos_in,
 	if (!same_inode)
 		inode_dio_wait(inode_out);
 
-	ret = filemap_write_and_wait_range(inode_in->i_mapping,
-			pos_in, pos_in + *len - 1);
-	if (ret)
-		return ret;
+	if (remap_flags & REMAP_FILE_FAST_REFLINK) {
+		ret = fast_reflink_apply(inode_in->i_mapping,
+				pos_in >> PAGE_SHIFT,
+				(pos_in + *len - 1) >> PAGE_SHIFT);
+		if (ret)
+			return ret;
+	} else {
+		ret = filemap_write_and_wait_range(inode_in->i_mapping,
+				pos_in, pos_in + *len - 1);
+		if (ret)
+			return ret;
+	}
 
 	ret = filemap_write_and_wait_range(inode_out->i_mapping,
 			pos_out, pos_out + *len - 1);
diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h
index 6360073865dbc3136e3655e1b493a53ca4310dd3..7a38c22787bcd5f548db439244345f34e0cf1565 100644
--- a/fs/xfs/libxfs/xfs_fs.h
+++ b/fs/xfs/libxfs/xfs_fs.h
@@ -766,6 +766,11 @@ struct xfs_scrub_metadata {
 # define XFS_XATTR_LIST_MAX 65536
 #endif
 
+enum {
+	XFS_REFLINK_NORMAL	= 0,
+	XFS_REFLINK_PRIMARY	= (1 << 0),
+	XFS_REFLINK_SECONDARY	= (1 << 1),
+};
 
 /*
  * ioctl commands that are used by Linux filesystems
@@ -840,6 +845,10 @@ struct xfs_scrub_metadata {
 
 /*	XFS_IOC_GETFSUUID ---------- deprecated 140	 */
 
+#define XFS_IOC_SET_REFLINK_FLAGS	_IOW('X', 200, uint32_t)
+#define XFS_IOC_GET_REFLINK_FLAGS	_IOR('X', 201, uint32_t)
+#define XFS_IOC_WAIT_REFLINK_SECONDARY	_IOW('X', 202, uint32_t)
+
 #ifndef HAVE_BBMACROS
 /*
  * Block I/O parameterization.  A basic block (BB) is the lowest size of
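
A plausible userspace sequence for the new interface, inferred from the ioctl semantics implemented in fs/xfs/xfs_ioctl.c below (illustrative only; error handling is elided and the helper name is hypothetical):

	#include <fcntl.h>
	#include <stdint.h>
	#include <sys/ioctl.h>
	#include <unistd.h>
	#include <linux/fs.h>		/* FICLONE */

	/* mirrors fs/xfs/libxfs/xfs_fs.h above */
	#define XFS_REFLINK_PRIMARY		(1 << 0)
	#define XFS_REFLINK_SECONDARY		(1 << 1)
	#define XFS_IOC_SET_REFLINK_FLAGS	_IOW('X', 200, uint32_t)
	#define XFS_IOC_WAIT_REFLINK_SECONDARY	_IOW('X', 202, uint32_t)

	int clone_with_fast_reflink(const char *src_path, const char *snap_path)
	{
		uint32_t primary = XFS_REFLINK_PRIMARY;
		uint32_t secondary = XFS_REFLINK_SECONDARY;
		uint32_t timeout_sec = 10;	/* 0 waits without a timeout */
		int src = open(src_path, O_RDWR);
		int snap = open(snap_path, O_RDWR | O_CREAT, 0600);
		int ret;

		/* Tag both inodes first; the clone must use pos_in == pos_out. */
		ioctl(src, XFS_IOC_SET_REFLINK_FLAGS, &primary);
		ioctl(snap, XFS_IOC_SET_REFLINK_FLAGS, &secondary);
		ioctl(snap, FICLONE, src);

		/* ... the secondary is consumed and unlinked by its user ... */

		/* Block until the secondary's inactivation has finished. */
		ret = ioctl(src, XFS_IOC_WAIT_REFLINK_SECONDARY, &timeout_sec);
		close(snap);
		close(src);
		return ret;
	}
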
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index e33e5e13b95f462ffe33c232739af6c4762758a0..8b73002067830056bf57a40c90ab2a6e39de89f6 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -877,9 +877,18 @@ xfs_break_dax_layouts(
 	struct inode		*inode,
 	bool			*retry)
 {
+	struct xfs_inode	*ip = XFS_I(inode);
 	struct page		*page;
 
-	ASSERT(xfs_isilocked(XFS_I(inode), XFS_MMAPLOCK_EXCL));
+	ASSERT(xfs_isilocked(ip, XFS_MMAPLOCK_EXCL));
+
+	/*
+	 * For inodes flagged with XFS_REFLINK_{PRIMARY,SECONDARY}, users
+	 * must ensure there are no in-flight DIO operations on these
+	 * inodes, so we can bypass xfs_break_dax_layouts(BREAK_UNMAP)
+	 * safely.
+	 */
+	if (ip->i_reflink_flags & (XFS_REFLINK_PRIMARY | XFS_REFLINK_SECONDARY))
+		return 0;
 
 	page = dax_layout_busy_page(inode->i_mapping);
 	if (!page)
@@ -1216,6 +1225,19 @@ xfs_file_remap_range(
 	if (xfs_file_sync_writes(file_in) || xfs_file_sync_writes(file_out))
 		xfs_log_force_inode(dest);
+
+	if (remapped && (src->i_reflink_flags & XFS_REFLINK_PRIMARY)) {
+		mutex_lock(&mp->m_reflink_opt_lock);
+		src->i_reflink_opt_ip = dest;
+		dest->i_reflink_opt_ip = src;
+		mutex_unlock(&mp->m_reflink_opt_lock);
+
+		if (!xfs_has_rmapbt(mp)) {
+			set_bit(AS_FSDAX_NORMAP, &VFS_I(src)->i_mapping->flags);
+			set_bit(AS_FSDAX_NORMAP, &VFS_I(dest)->i_mapping->flags);
+		}
+	}
+
 out_unlock:
 	xfs_iunlock2_remapping(src, dest);
 	if (ret)
diff --git a/fs/xfs/xfs_globals.c b/fs/xfs/xfs_globals.c
index 9edc1f2bc9399eca73414ffa2f6da89db7f59964..afeeef53a7b026c1aa1436cebbc2cf191884658c 100644
--- a/fs/xfs/xfs_globals.c
+++ b/fs/xfs/xfs_globals.c
@@ -30,6 +30,7 @@ xfs_param_t xfs_params = {
 	.inherit_nodfrg	= {	0,		1,		1	},
 	.fstrm_timer	= {	1,		30*100,		3600*100},
 	.blockgc_timer	= {	1,		300,		3600*24},
+	.reflink_inactive_force_log_period = { 0, 5, 1000 },
 };
 
 struct xfs_globals xfs_globals = {
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
index 3c210ac8371368bc00a77fb9df52315b710d17ce..090c8a1a68c71a814d0049fae70f2ba6bcfabe55 100644
--- a/fs/xfs/xfs_icache.c
+++ b/fs/xfs/xfs_icache.c
@@ -107,6 +107,8 @@ xfs_inode_alloc(
 	ip->i_diflags2 = mp->m_ino_geo.new_diflags2;
 	ip->i_nblocks = 0;
 	ip->i_forkoff = 0;
+	ip->i_reflink_flags = 0;
+	ip->i_reflink_opt_ip = NULL;
 	ip->i_sick = 0;
 	ip->i_checked = 0;
 	INIT_WORK(&ip->i_ioend_work, xfs_end_io);
@@ -114,7 +116,7 @@ xfs_inode_alloc(
 	spin_lock_init(&ip->i_ioend_lock);
 	ip->i_next_unlinked = NULLAGINO;
 	ip->i_prev_unlinked = 0;
-
+	INIT_LIST_HEAD(&ip->i_reflink_opt_gclist);
 	return ip;
 }
 
@@ -385,6 +387,8 @@ xfs_iget_recycle(
 	 */
 	ip->i_flags &= ~XFS_IRECLAIM_RESET_FLAGS;
 	ip->i_flags |= XFS_INEW;
+	ip->i_reflink_flags = 0;
+	ip->i_reflink_opt_ip = NULL;
 	xfs_perag_clear_inode_tag(pag, XFS_INO_TO_AGINO(mp, ip->i_ino),
 			XFS_ICI_RECLAIM_TAG);
 	inode->i_state = I_NEW;
@@ -1824,7 +1828,7 @@ xfs_inodegc_set_reclaimable(
  * This is the last chance to make changes to an otherwise unreferenced file
  * before incore reclamation happens.
  */
-static int
+int
xfs_inodegc_inactivate(
 	struct xfs_inode	*ip)
 {
@@ -1837,6 +1841,40 @@ xfs_inodegc_inactivate(
 }
 
+void
+xfs_inodegc_reflink_opt_worker(
+	struct work_struct	*work)
+{
+	struct xfs_mount	*mp = container_of(work, struct xfs_mount,
+						   m_reflink_opt_gcwork);
+	struct xfs_inode	*ip;
+
+	while (1) {
+		spin_lock(&mp->m_reflink_opt_gclock);
+		/*
+		 * The foreground ioctl can handle a specific inode too; in
+		 * that case we will no longer see that inode on the list.
+		 */
+		if (list_empty(&mp->m_reflink_opt_gclist)) {
+			spin_unlock(&mp->m_reflink_opt_gclock);
+			break;
+		}
+		ip = list_first_entry(&mp->m_reflink_opt_gclist,
+				struct xfs_inode, i_reflink_opt_gclist);
+		/*
+		 * Otherwise we detach the inode ourselves under the gclock;
+		 * in that case the foreground ioctl will hit list_empty()
+		 * (it also checks list_empty() under the gclock).
+		 */
+		list_del_init(&ip->i_reflink_opt_gclist);
+		spin_unlock(&mp->m_reflink_opt_gclock);
+
+		ASSERT(ip->i_flags & XFS_NEED_INACTIVE);
+		xfs_iflags_set(ip, XFS_INACTIVATING);
+		xfs_inodegc_inactivate(ip);
+	}
+}
+
 void
 xfs_inodegc_worker(
 	struct work_struct	*work)
@@ -2069,6 +2107,22 @@ xfs_inodegc_queue(
 	unsigned long		queue_delay = 1;
 
 	trace_xfs_inode_set_need_inactive(ip);
+
+	if ((ip->i_reflink_flags & XFS_REFLINK_SECONDARY) &&
+	    /* ip->i_reflink_opt_ip won't change here since we're the owner */
+	    READ_ONCE(ip->i_reflink_opt_ip)) {
+		/* the gclist is attached before marking XFS_NEED_INACTIVE */
+		spin_lock(&mp->m_reflink_opt_gclock);
+		list_add_tail(&ip->i_reflink_opt_gclist,
+				&mp->m_reflink_opt_gclist);
+		queue_work(mp->m_inodegc_wq,
+				&mp->m_reflink_opt_gcwork);
+		spin_unlock(&mp->m_reflink_opt_gclock);
+		wake_up_all(&mp->m_reflink_opt_wait);
+		xfs_iflags_set(ip, XFS_NEED_INACTIVE);
+		return;
+	}
+
 	spin_lock(&ip->i_flags_lock);
 	ip->i_flags |= XFS_NEED_INACTIVE;
 	spin_unlock(&ip->i_flags_lock);
diff --git a/fs/xfs/xfs_icache.h b/fs/xfs/xfs_icache.h
index 905944dafbe539245bc30fe2df2d8681af0c5a04..6646eb2a7654af8ebbfff7ea83328fa8d5ea6b8f 100644
--- a/fs/xfs/xfs_icache.h
+++ b/fs/xfs/xfs_icache.h
@@ -70,10 +70,12 @@ void xfs_inode_clear_eofblocks_tag(struct xfs_inode *ip);
 void xfs_inode_set_cowblocks_tag(struct xfs_inode *ip);
 void xfs_inode_clear_cowblocks_tag(struct xfs_inode *ip);
 
+int xfs_inodegc_inactivate(struct xfs_inode *ip);
 void xfs_blockgc_worker(struct work_struct *work);
 void xfs_blockgc_stop(struct xfs_mount *mp);
 void xfs_blockgc_start(struct xfs_mount *mp);
 
+void xfs_inodegc_reflink_opt_worker(struct work_struct *work);
 void xfs_inodegc_worker(struct work_struct *work);
 void xfs_inodegc_push(struct xfs_mount *mp);
 int xfs_inodegc_flush(struct xfs_mount *mp);
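
The list_del_init()-based handoff above is the load-bearing part: exactly one of the background worker and the foreground ioctl detaches the inode under the gclock and performs the inactivation. A stand-alone user-space model of the idiom, with hypothetical names (the kernel code uses a spinlock and an intrusive list, modeled here with a pthread mutex and a flag):

	/* Stand-alone model of the gclist handoff; hypothetical names. */
	#include <pthread.h>
	#include <stdbool.h>
	#include <stdio.h>

	static pthread_mutex_t gclock = PTHREAD_MUTEX_INITIALIZER;
	static bool on_list;	/* models !list_empty(&ip->i_reflink_opt_gclist) */

	/* Returns true if this caller won ownership of the inactivation. */
	static bool try_detach(const char *who)
	{
		bool won;

		pthread_mutex_lock(&gclock);
		won = on_list;		/* models list_del_init() under gclock */
		on_list = false;
		pthread_mutex_unlock(&gclock);

		if (won)
			printf("%s performs inactivation\n", who);
		return won;
	}

	static void *worker(void *arg)
	{
		try_detach("background worker");
		return NULL;
	}

	int main(void)
	{
		pthread_t t;

		on_list = true;		/* models xfs_inodegc_queue() */
		pthread_create(&t, NULL, worker, NULL);
		if (!try_detach("foreground ioctl"))
			; /* lost the race: the worker owns it; the waiter retries */
		pthread_join(t, NULL);
		return 0;
	}
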
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index f9d29acd72b9eea15fd108420d5d86a839696cce..c7af345bd58e33b1dcf15e8077814aaed180461d 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -1341,6 +1341,8 @@ xfs_itruncate_extents_flags(
 	xfs_fileoff_t		first_unmap_block;
 	xfs_filblks_t		unmap_len;
 	int			error = 0;
+	bool			secondary_inactive = false;
+	int			force_count = 0;
 
 	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
 	ASSERT(!atomic_read(&VFS_I(ip)->i_count) ||
@@ -1371,9 +1373,13 @@ xfs_itruncate_extents_flags(
 		return 0;
 	}
 
+	if (!new_size && (ip->i_reflink_flags & XFS_REFLINK_SECONDARY))
+		secondary_inactive = true;
+
 	unmap_len = XFS_MAX_FILEOFF - first_unmap_block + 1;
 	while (unmap_len > 0) {
 		ASSERT(tp->t_highest_agno == NULLAGNUMBER);
+
 		error = __xfs_bunmapi(tp, ip, first_unmap_block, &unmap_len,
 				flags, XFS_ITRUNC_MAX_EXTENTS);
 		if (error)
@@ -1383,6 +1389,14 @@ xfs_itruncate_extents_flags(
 		error = xfs_defer_finish(&tp);
 		if (error)
 			goto out;
+
+		if (secondary_inactive) {
+			if (xfs_reflink_inactive_force_log_period &&
+			    ++force_count >= xfs_reflink_inactive_force_log_period) {
+				xfs_log_force(mp, 0);
+				force_count = 0;
+			}
+		}
 	}
 
 	if (whichfork == XFS_DATA_FORK) {
@@ -1685,6 +1699,33 @@ xfs_inode_needs_inactive(
 	return xfs_can_free_eofblocks(ip, true);
 }
 
+STATIC void
+xfs_reflink_opt_disconnect(
+	struct xfs_mount	*mp,
+	struct xfs_inode	*ip,
+	bool			unexpected)
+{
+	bool			valid = false;
+
+	if (!(ip->i_reflink_flags & (XFS_REFLINK_PRIMARY |
+				     XFS_REFLINK_SECONDARY)))
+		return;
+
+	mutex_lock(&mp->m_reflink_opt_lock);
+	if (ip->i_reflink_opt_ip) {
+		ip->i_reflink_opt_ip->i_reflink_opt_ip = NULL;
+		ip->i_reflink_opt_ip = NULL;
+		valid = true;
+	}
+	mutex_unlock(&mp->m_reflink_opt_lock);
+	if (valid) {
+		wake_up_all(&mp->m_reflink_opt_wait);
+		if (unexpected)
+			xfs_warn(mp, "reflink file %llu inactivated unexpectedly early",
+					ip->i_ino);
+	}
+}
+
 /*
  * xfs_inactive
  *
@@ -1741,6 +1782,7 @@ xfs_inactive(
 		if (xfs_can_free_eofblocks(ip, true))
 			error = xfs_free_eofblocks(ip);
 
+		xfs_reflink_opt_disconnect(mp, ip, true);
 		goto out;
 	}
 
@@ -1772,6 +1814,8 @@ xfs_inactive(
 	if (error)
 		goto out;
 
+	xfs_reflink_opt_disconnect(mp, ip, false);
+
 	/*
 	 * If there are attributes associated with the file then blow them away
 	 * now. The code calls a routine that recursively deconstructs the
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index 3beb470f18920d6730b32e5d1edcf5530b93b327..b99e6433b27645d3622c6059906b1af3df45ba54 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -85,6 +85,15 @@ typedef struct xfs_inode {
 	 */
 	xfs_agino_t		i_prev_unlinked;
 
+	/* flags for controlling reflink CoW behavior */
+	uint32_t		i_reflink_flags;
+	/*
+	 * Saved reflink inode pointer for the sake of quick unshare;
+	 * currently we only support one reflink file under the
+	 * XFS_REFLINK_PRIMARY flag.
+	 */
+	struct xfs_inode	*i_reflink_opt_ip;
+	struct list_head	i_reflink_opt_gclist;
+
 	/* VFS inode */
 	struct inode		i_vnode;	/* embedded VFS inode */
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index 535f6d38cdb54032c9e90cdde35ada7094a0e515..0ad3844f4ff22ab4c77e1e4dbcf041d10e303029 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -1880,6 +1880,93 @@ xfs_fs_eofblocks_from_user(
 #define XFS_IOC_ALLOCSP64	_IOW ('X', 36, struct xfs_flock64)
 #define XFS_IOC_FREESP64	_IOW ('X', 37, struct xfs_flock64)
 
+static bool
+xfs_need_wait_reflink_secondary(
+	struct xfs_mount	*mp,
+	struct xfs_inode	*ip)
+{
+	struct xfs_inode	*sip;
+
+	mutex_lock(&mp->m_reflink_opt_lock);
+	sip = ip->i_reflink_opt_ip;
+	if (!sip /* pair no longer valid */ ||
+	    (READ_ONCE(sip->i_flags) & XFS_NEED_INACTIVE) /* retry now */) {
+		mutex_unlock(&mp->m_reflink_opt_lock);
+		return false;
+	}
+	mutex_unlock(&mp->m_reflink_opt_lock);
+	return true;
+}
+
+int
+xfs_wait_reflink_secondary(
+	struct xfs_mount	*mp,
+	struct xfs_inode	*ip,
+	u32			timeout_sec)
+{
+	struct xfs_inode	*sip;
+	unsigned long		expire = 0;
+
+	if (!(ip->i_reflink_flags & XFS_REFLINK_PRIMARY))
+		return -EINVAL;
+	if (timeout_sec)
+		expire = jiffies + HZ * timeout_sec;
+retry:
+	mutex_lock(&mp->m_reflink_opt_lock);
+	sip = ip->i_reflink_opt_ip;
+	if (!sip) {
+		mutex_unlock(&mp->m_reflink_opt_lock);
+		return 0;
+	}
+	spin_lock(&sip->i_flags_lock);
+	/*
+	 * Decide here whether the secondary can be inactivated
+	 * immediately.
+	 */
+	/* already being inactivated by someone else? */
+	if ((sip->i_flags & XFS_INACTIVATING) ||
+	    /* or the inode isn't reclaimable yet (active, or a race)? */
+	    !(sip->i_flags & (XFS_NEED_INACTIVE | XFS_INACTIVATING))) {
+		spin_unlock(&sip->i_flags_lock);
+		mutex_unlock(&mp->m_reflink_opt_lock);
+		if (fatal_signal_pending(current))
+			return -EINTR;
+		if (timeout_sec) {
+			if (time_after(jiffies, expire))
+				return -ETIMEDOUT;
+			wait_event_killable_timeout(mp->m_reflink_opt_wait,
+				!xfs_need_wait_reflink_secondary(mp, ip),
+				HZ * timeout_sec);
+		} else {
+			wait_event_killable(mp->m_reflink_opt_wait,
+				!xfs_need_wait_reflink_secondary(mp, ip));
+		}
+		goto retry;
+	}
+	spin_unlock(&sip->i_flags_lock);
+
+	/*
+	 * The inode is already on the gclist since it is attached before
+	 * XFS_NEED_INACTIVE is set; try to drop it from the gclist here.
+	 */
+	spin_lock(&mp->m_reflink_opt_gclock);
+	/* if the background worker decided to handle it instead, the list is empty */
+	if (list_empty(&sip->i_reflink_opt_gclist)) {
+		spin_unlock(&mp->m_reflink_opt_gclock);
+		mutex_unlock(&mp->m_reflink_opt_lock);
+		goto retry;
+	}
+	list_del_init(&sip->i_reflink_opt_gclist);
+	spin_unlock(&mp->m_reflink_opt_gclock);
+	mutex_unlock(&mp->m_reflink_opt_lock);
+
+	/* XFS_NEED_INACTIVE will be stable here. */
+	ASSERT(sip->i_flags & XFS_NEED_INACTIVE);
+	xfs_iflags_set(sip, XFS_INACTIVATING);
+	xfs_inodegc_inactivate(sip);
+	return 0;
+}
+
 /*
  * Note: some of the ioctl's return positive numbers as a
  * byte count indicating success, such as readlink_by_handle.
@@ -2159,6 +2246,48 @@ xfs_file_ioctl(
 		return error;
 	}
 
+	case XFS_IOC_SET_REFLINK_FLAGS: {
+		uint32_t	in;
+
+		if (get_user(in, (uint32_t __user *)arg))
+			return -EFAULT;
+
+		/* invalid values */
+		if ((in & ~(XFS_REFLINK_PRIMARY | XFS_REFLINK_SECONDARY)) ||
+		    (in & (XFS_REFLINK_PRIMARY | XFS_REFLINK_SECONDARY)) ==
+		    (XFS_REFLINK_PRIMARY | XFS_REFLINK_SECONDARY))
+			return -EINVAL;
+
+		/* clearing all flags is not allowed */
+		if (!in)
+			return -EINVAL;
+
+		xfs_ilock(ip, XFS_ILOCK_EXCL);
+		if (!ip->i_reflink_flags) {
+			ip->i_reflink_flags = in;
+		} else if (ip->i_reflink_flags != in) {
+			xfs_iunlock(ip, XFS_ILOCK_EXCL);
+			return -EINVAL;
+		}
+		xfs_iunlock(ip, XFS_ILOCK_EXCL);
+		return 0;
+	}
+
+	case XFS_IOC_GET_REFLINK_FLAGS: {
+		if (put_user(ip->i_reflink_flags, (uint32_t __user *)arg))
+			return -EFAULT;
+		return 0;
+	}
+
+	case XFS_IOC_WAIT_REFLINK_SECONDARY: {
+		u32		timeout_sec;
+
+		if (get_user(timeout_sec, (uint32_t __user *)arg))
+			return -EFAULT;
+
+		return xfs_wait_reflink_secondary(mp, ip, timeout_sec);
+	}
+
 	default:
 		return -ENOTTY;
 	}
diff --git a/fs/xfs/xfs_ioctl.h b/fs/xfs/xfs_ioctl.h
index 38be600b5e1e8391c52f0fdf0db69ff6f498125a..f74bb55133d944687b8db1d760145d491f522ae6 100644
--- a/fs/xfs/xfs_ioctl.h
+++ b/fs/xfs/xfs_ioctl.h
@@ -69,4 +69,6 @@ int xfs_fsbulkstat_one_fmt(struct xfs_ibulk *breq,
 		const struct xfs_bulkstat *bstat);
 int xfs_fsinumbers_fmt(struct xfs_ibulk *breq, const struct xfs_inumbers *igrp);
 
+int xfs_wait_reflink_secondary(struct xfs_mount *mp, struct xfs_inode *ip, u32 timeout_sec);
+
 #endif
diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h
index e9d317a3dafe4e673c439e504bc7fc24ca503dbe..41828f4e5d7c31ccefcb9297ced6e1a291c87884 100644
--- a/fs/xfs/xfs_linux.h
+++ b/fs/xfs/xfs_linux.h
@@ -103,6 +103,7 @@ typedef __u32 xfs_nlink_t;
 #define xfs_inherit_nodefrag	xfs_params.inherit_nodfrg.val
 #define xfs_fstrm_centisecs	xfs_params.fstrm_timer.val
 #define xfs_blockgc_secs	xfs_params.blockgc_timer.val
+#define xfs_reflink_inactive_force_log_period	xfs_params.reflink_inactive_force_log_period.val
 
 #define current_cpu()		(raw_smp_processor_id())
 #define current_set_flags_nested(sp, f)		\
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index d19cca099bc3a701786c3e5e9871f086ac3f290c..348d2eb9f649826a3e2f888256893b2365e0ac4b 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -250,6 +250,12 @@ typedef struct xfs_mount {
 
 	/* cpus that have inodes queued for inactivation */
 	struct cpumask	m_inodegc_cpumask;
+
+	struct mutex		m_reflink_opt_lock;
+	spinlock_t		m_reflink_opt_gclock;
+	struct list_head	m_reflink_opt_gclist;
+	struct work_struct	m_reflink_opt_gcwork;
+	struct wait_queue_head	m_reflink_opt_wait;
 } xfs_mount_t;
 
 #define M_IGEO(mp)		(&(mp)->m_ino_geo)
diff --git a/fs/xfs/xfs_notify_failure.c b/fs/xfs/xfs_notify_failure.c
index a7daa522e00fe758039b0811bfc9568b28dd1466..30655551dc756bf6db32a64d00c0d6bb3392146b 100644
--- a/fs/xfs/xfs_notify_failure.c
+++ b/fs/xfs/xfs_notify_failure.c
@@ -22,6 +22,7 @@
 
 #include <linux/mm.h>
 #include <linux/dax.h>
+#include <linux/pfn_t.h>
 
 struct xfs_failure_info {
 	xfs_agblock_t		startblock;
@@ -173,6 +174,128 @@ xfs_dax_notify_ddev_failure(
 	return error;
 }
 
+static int
+xfs_mf_dax_kill_procs(
+	struct xfs_mount	*mp,
+	struct address_space	*mapping,
+	pgoff_t			pgoff,
+	unsigned long		nrpages,
+	int			mf_flags,
+	bool			share)
+{
+	int			rc, rc2 = 0;
+
+	if (share) {
+		struct xfs_inode *ip = XFS_I(mapping->host);
+
+		mutex_lock(&mp->m_reflink_opt_lock);
+		if (ip->i_reflink_opt_ip) {
+			rc2 = mf_dax_kill_procs(VFS_I(ip->i_reflink_opt_ip)->i_mapping,
+					pgoff, nrpages, mf_flags);
+		} else {
+			xfs_warn(mp, "share mode should only be used with REFLINK_PRIMARY|REFLINK_SECONDARY @ ino %llu",
+					ip->i_ino);
+		}
+		mutex_unlock(&mp->m_reflink_opt_lock);
+	}
+	rc = mf_dax_kill_procs(mapping, pgoff, nrpages, mf_flags);
+	iput(mapping->host);
+	return rc ? rc : rc2;
+}
+
+static int
+xfs_dax_notify_ddev_failure2(
+	struct dax_device	*dax_dev,
+	struct xfs_mount	*mp,
+	loff_t			pos,
+	size_t			size,
+	int			mf_flags)
+{
+	struct address_space	*lmapping = NULL;
+	bool			lshare = false;
+	pfn_t			pfn;
+	pgoff_t			pgoff, lpgoff;
+	unsigned long		nrpages;
+	long			length;
+	int			rc, id;
+
+	rc = bdev_dax_pgoff(mp->m_ddev_targp->bt_bdev, pos >> SECTOR_SHIFT,
+			size, &pgoff);
+	if (rc)
+		return rc;
+	id = dax_read_lock();
+	length = dax_direct_access(dax_dev, pgoff, PHYS_PFN(size), DAX_ACCESS,
+			NULL, &pfn);
+	if (length < 0) {
+		rc = length;
+		goto out;
+	}
+
+	if (PFN_PHYS(length) < size) {
+		rc = -EINVAL;
+		goto out;
+	}
+	rc = 0;
+	while (length) {
+		struct page *page;
+		struct address_space *mapping;
+		bool share = false;
+
+		page = pfn_t_to_page(pfn);
+		pfn.val++;
+		--length;
+
+retry:
+		rcu_read_lock();
+		mapping = page ? READ_ONCE(page->mapping) : NULL;
+		if (mapping) {
+			share = (unsigned long)mapping & PAGE_MAPPING_DAX_SHARED;
+			mapping = (void *)((unsigned long)mapping &
+					   ~PAGE_MAPPING_DAX_SHARED);
+			if (!igrab(mapping->host)) {
+				rcu_read_unlock();
+				goto retry;
+			}
+			/* paired with smp_mb() in dax_page_share_get() to ensure a valid index */
+			smp_mb();
+			if (!share) {
+				pgoff = READ_ONCE(page->index);
+			} else {
+				WARN_ON(!test_bit(AS_FSDAX_NORMAP, &mapping->flags));
+				pgoff = READ_ONCE(page->private);
+			}
+		}
+		rcu_read_unlock();
+
+		if (lmapping) {
+			if (mapping != lmapping || share != lshare ||
+			    lpgoff + nrpages != pgoff) {
+				rc = xfs_mf_dax_kill_procs(mp, lmapping, lpgoff,
+						nrpages, mf_flags, lshare);
+				if (rc)
+					break;
+			} else {
+				nrpages++;
+				continue;
+			}
+		}
+		lmapping = mapping;
+		lpgoff = pgoff;
+		lshare = share;
+		nrpages = 1;
+	}
+
+	if (lmapping) {
+		int rc2;
+
+		rc2 = xfs_mf_dax_kill_procs(mp, lmapping, lpgoff, nrpages,
+				mf_flags, lshare);
+		if (!rc)
+			rc = rc2;
+	}
+out:
+	dax_read_unlock(id);
+	return rc;
+}
+
 static int
 xfs_dax_notify_failure(
 	struct dax_device	*dax_dev,
@@ -202,11 +325,6 @@ xfs_dax_notify_failure(
 		return -EFSCORRUPTED;
 	}
 
-	if (!xfs_has_rmapbt(mp)) {
-		xfs_debug(mp, "notify_failure() needs rmapbt enabled!");
-		return -EOPNOTSUPP;
-	}
-
 	ddev_start = mp->m_ddev_targp->bt_dax_part_off;
 	ddev_end = ddev_start + bdev_nr_bytes(mp->m_ddev_targp->bt_bdev) - 1;
 
@@ -226,6 +344,9 @@ xfs_dax_notify_failure(
 	if (offset + len - 1 > ddev_end)
 		len = ddev_end - offset + 1;
 
+	if (!xfs_has_rmapbt(mp))
+		return xfs_dax_notify_ddev_failure2(dax_dev, mp, offset, len,
+				mf_flags);
 	return xfs_dax_notify_ddev_failure(mp, BTOBB(offset), BTOBB(len),
 			mf_flags);
 }
diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
index e5b62dc2846644c142b04423fbec10e57d81c34d..f51d86c4458135fd93f9ef9d2d1d91a083949cec 100644
--- a/fs/xfs/xfs_reflink.c
+++ b/fs/xfs/xfs_reflink.c
@@ -27,6 +27,8 @@
 #include "xfs_quota.h"
 #include "xfs_reflink.h"
 #include "xfs_iomap.h"
+#include "xfs_rmap.h"
+#include "xfs_rmap_btree.h"
 #include "xfs_ag.h"
 #include "xfs_ag_resv.h"
 
@@ -514,6 +516,138 @@ xfs_reflink_fill_delalloc(
 	return error;
 }
 
+#ifdef CONFIG_FS_DAX
+STATIC int
+xfs_reflink_unshare_range(
+	struct xfs_inode	*src,
+	struct xfs_bmbt_irec	*oimap,
+	bool			*secondary_evicting)
+{
+	struct xfs_mount	*mp = src->i_mount;
+	struct xfs_inode	*ip;
+	xfs_fileoff_t		offset_fsb = oimap->br_startoff;
+	xfs_filblks_t		count_fsb = oimap->br_blockcount;
+	struct xfs_trans	*tp;
+	int			nimaps, error = 0;
+	bool			shared, found;
+	xfs_filblks_t		resaligned;
+	xfs_extlen_t		resblks = 0;
+	uint			lockmode = XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL;
+	struct xfs_bmbt_irec	imap = *oimap;
+	struct xfs_bmbt_irec	cmap;
+
+	mutex_lock(&mp->m_reflink_opt_lock);
+	ip = src->i_reflink_opt_ip;
+	if (!ip || !igrab(VFS_I(ip))) {
+		mutex_unlock(&mp->m_reflink_opt_lock);
+		*secondary_evicting = true;
+		return 0;
+	}
+	mutex_unlock(&mp->m_reflink_opt_lock);
+
+	xfs_ilock(ip, lockmode);
+	xfs_flush_unmap_range(ip, XFS_FSB_TO_B(mp, imap.br_startoff),
+			XFS_FSB_TO_B(mp, imap.br_blockcount));
+
+	error = xfs_find_trim_cow_extent(ip, &imap, &cmap, &shared, &found);
+	if (error || !shared)
+		goto error;
+
+	if (found)
+		goto convert;
+
+	resaligned = xfs_aligned_fsb_count(imap.br_startoff,
+			imap.br_blockcount, xfs_get_cowextsz_hint(ip));
+	resblks = XFS_DIOSTRAT_SPACE_RES(mp, resaligned);
+
+	xfs_iunlock(ip, XFS_ILOCK_EXCL);
+	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, 0, &tp);
+	if (error) {
+		lockmode = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL;
+		goto error;
+	}
+
+	xfs_ilock(ip, XFS_ILOCK_EXCL);
+
+	error = xfs_qm_dqattach_locked(ip, false);
+	if (error)
+		goto out_trans_cancel;
+
+	/*
+	 * Check for an overlapping extent again now that we dropped the ilock.
+	 */
+	error = xfs_find_trim_cow_extent(ip, &imap, &cmap, &shared, &found);
+	if (error || !shared)
+		goto out_trans_cancel;
+	if (found) {
+		xfs_trans_cancel(tp);
+		goto convert;
+	}
+
+	error = xfs_trans_reserve_quota_nblks(tp, ip, resblks, 0,
+			XFS_QMOPT_RES_REGBLKS);
+	if (error)
+		goto out_trans_cancel;
+
+	xfs_trans_ijoin(tp, ip, 0);
+
+	/* Allocate the entire reservation as zeroed blocks. */
+	nimaps = 1;
+	error = xfs_bmapi_write(tp, ip, imap.br_startoff, imap.br_blockcount,
+			XFS_BMAPI_COWFORK | XFS_BMAPI_ZERO, resblks, &cmap,
+			&nimaps);
+	if (error)
+		goto out_trans_cancel;
+
+	xfs_inode_set_cowblocks_tag(ip);
+	error = xfs_trans_commit(tp);
+	if (error)
+		goto error;
+
+	/*
+	 * Allocation succeeded but the requested range was not even partially
+	 * satisfied?  Bail out!
+	 */
+	if (nimaps == 0) {
+		error = -ENOSPC;
+		goto error;
+	}
+convert:
+	xfs_trim_extent(&cmap, offset_fsb, count_fsb);
+	trace_xfs_reflink_convert_cow(ip, &cmap);
+	error = xfs_reflink_convert_cow_locked(ip, offset_fsb, count_fsb);
+	if (error)
+		goto error;
+	cmap.br_state = XFS_EXT_NORM;
+	dax_copy_range(xfs_inode_buftarg(ip)->bt_bdev,
+			xfs_inode_buftarg(ip)->bt_daxdev,
+			BBTOB(xfs_fsb_to_db(ip, oimap->br_startblock)),
+			BBTOB(xfs_fsb_to_db(ip, cmap.br_startblock)),
+			XFS_FSB_TO_B(mp, cmap.br_blockcount));
+	xfs_iunlock(ip, XFS_ILOCK_EXCL);
+	xfs_reflink_end_cow(ip, XFS_FSB_TO_B(mp, cmap.br_startoff),
+			XFS_FSB_TO_B(mp, cmap.br_blockcount));
+	xfs_iunlock(ip, XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL);
+	xfs_irele(ip);
+	return error;
+
+out_trans_cancel:
+	xfs_trans_cancel(tp);
+error:
+	xfs_iunlock(ip, lockmode);
+	xfs_irele(ip);
+	return error;
+}
+#else
+STATIC int
+xfs_reflink_unshare_range(
+	struct xfs_inode	*src,
+	struct xfs_bmbt_irec	*oimap,
+	bool			*secondary_evicting)
+{
+	return 0;
+}
+#endif
+
 /* Allocate all CoW reservations covering a range of blocks in a file. */
 int
 xfs_reflink_allocate_cow(
@@ -526,6 +660,7 @@ xfs_reflink_allocate_cow(
 {
 	int			error;
 	bool			found;
+	bool			secondary_evicting = false;
 
 	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
 	if (!ip->i_cowfp) {
@@ -542,6 +677,26 @@ xfs_reflink_allocate_cow(
 		return xfs_reflink_convert_unwritten(ip, imap, cmap,
 				convert_now);
 
+	if (ip->i_reflink_flags & XFS_REFLINK_PRIMARY) {
+		xfs_iunlock(ip, *lockmode);
+		error = xfs_reflink_unshare_range(ip, imap,
+				&secondary_evicting);
+		xfs_ilock(ip, *lockmode);
+		if (error) {
+			xfs_warn(ip->i_mount,
+				"failed to unshare secondary range @ ino %llu",
+				ip->i_ino);
+		} else if (secondary_evicting) {
+			/*
+			 * It's impossible for another reflink to race in here
+			 * (e.g. with FICLONE) since ip holds XFS_MMAPLOCK_SHARED
+			 * and FICLONE needs XFS_MMAPLOCK_EXCL.
+			 */
+			*shared = false;
+			return 0;
+		}
+	}
+
 	/*
 	 * CoW fork does not have an extent and data extent is shared.
 	 * Allocate a real extent in the CoW fork.
@@ -1500,6 +1655,27 @@ xfs_reflink_remap_prep(
 	if (IS_DAX(inode_in) != IS_DAX(inode_out))
 		goto out_unlock;
 
+	if (src->i_reflink_flags & XFS_REFLINK_PRIMARY) {
+		if (!(dest->i_reflink_flags & XFS_REFLINK_SECONDARY))
+			goto out_unlock;
+		if (pos_in != pos_out)
+			goto out_unlock;
+		if (src->i_reflink_opt_ip || dest->i_reflink_opt_ip) {
+			xfs_warn(src->i_mount,
+				"src (XFS_REFLINK_PRIMARY) and/or dest (XFS_REFLINK_SECONDARY) is already paired via FICLONE");
+			goto out_unlock;
+		}
+	}
+
+	/*
+	 * For inodes flagged with XFS_REFLINK_{PRIMARY,SECONDARY},
+	 * users do not need persistence, so we can apply fast reflink,
+	 * i.e., write-protect without flushing dirty pages.
+	 */
+	if (src->i_reflink_flags & (XFS_REFLINK_PRIMARY |
+				    XFS_REFLINK_SECONDARY))
+		remap_flags |= REMAP_FILE_FAST_REFLINK;
+
 	if (!IS_DAX(inode_in))
 		ret = generic_remap_file_range_prep(file_in, pos_in, file_out,
 				pos_out, len, remap_flags);
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 13007b6bc9f3378a9dd24820602a829ea10a896f..b6db17213816502c3d27dfa1ee977b09657a543a 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -1528,6 +1528,11 @@ xfs_fs_fill_super(
 #endif
 	sb->s_op = &xfs_super_operations;
 
+	spin_lock_init(&mp->m_reflink_opt_gclock);
+	INIT_LIST_HEAD(&mp->m_reflink_opt_gclist);
+	INIT_WORK(&mp->m_reflink_opt_gcwork, xfs_inodegc_reflink_opt_worker);
+	init_waitqueue_head(&mp->m_reflink_opt_wait);
+
 	/*
 	 * Delay mount work if the debug hook is set. This is debug
 	 * instrumention to coordinate simulation of xfs mount failures with
@@ -2002,6 +2007,8 @@ static int xfs_init_fs_context(
 	INIT_RADIX_TREE(&mp->m_perag_tree, GFP_ATOMIC);
 	spin_lock_init(&mp->m_perag_lock);
 	mutex_init(&mp->m_growlock);
+	mutex_init(&mp->m_reflink_opt_lock);
+
 	INIT_WORK(&mp->m_flush_inodes_work, xfs_flush_inodes_worker);
 	INIT_DELAYED_WORK(&mp->m_reclaim_work, xfs_reclaim_worker);
 	mp->m_kobj.kobject.kset = xfs_kset;
diff --git a/fs/xfs/xfs_sysctl.c b/fs/xfs/xfs_sysctl.c
index fade337353931663ec17ff72a47d74b9053c901c..6b93b230166cbe08202f19d3b83bfb670d392af9 100644
--- a/fs/xfs/xfs_sysctl.c
+++ b/fs/xfs/xfs_sysctl.c
@@ -113,6 +113,15 @@ static struct ctl_table xfs_table[] = {
 		.extra1		= &xfs_params.syncd_timer.min,
 		.extra2		= &xfs_params.syncd_timer.max
 	},
+	{
+		.procname	= "reflink_inactive_force_log_period",
+		.data		= &xfs_params.reflink_inactive_force_log_period.val,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &xfs_params.reflink_inactive_force_log_period.min,
+		.extra2		= &xfs_params.reflink_inactive_force_log_period.max
+	},
 	{
 		.procname	= "inherit_sync",
 		.data		= &xfs_params.inherit_sync.val,
diff --git a/fs/xfs/xfs_sysctl.h b/fs/xfs/xfs_sysctl.h
index f78ad6b10ea58dedb0f93773abfc5f156de08f85..726eb447bb49ff4221280f5630cab288a933b5d2 100644
--- a/fs/xfs/xfs_sysctl.h
+++ b/fs/xfs/xfs_sysctl.h
@@ -36,6 +36,7 @@ typedef struct xfs_param {
 	xfs_sysctl_val_t inherit_nodfrg;/* Inherit the "nodefrag" inode flag. */
 	xfs_sysctl_val_t fstrm_timer;	/* Filestream dir-AG assoc'n timeout. */
 	xfs_sysctl_val_t blockgc_timer;	/* Interval between blockgc scans */
+	xfs_sysctl_val_t reflink_inactive_force_log_period;
 } xfs_param_t;
 
 /*
diff --git a/include/linux/dax.h b/include/linux/dax.h
index b463502b16e17fbc08ff3a982a9597f5d70cb06b..66e663664accaf5ebb462b74dff39fc3224a73e2 100644
--- a/include/linux/dax.h
+++ b/include/linux/dax.h
@@ -128,6 +128,8 @@ void set_dax_nocache(struct dax_device *dax_dev);
 void set_dax_nomc(struct dax_device *dax_dev);
 
 struct writeback_control;
+int bdev_dax_pgoff(struct block_device *bdev, sector_t sector, size_t size,
+		pgoff_t *pgoff);
 #if defined(CONFIG_BLOCK) && defined(CONFIG_FS_DAX)
 int dax_add_host(struct dax_device *dax_dev, struct gendisk *disk);
 void dax_remove_host(struct gendisk *disk);
@@ -248,6 +250,8 @@ vm_fault_t dax_finish_sync_fault(struct vm_fault *vmf,
 int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index);
 int dax_invalidate_mapping_entry_sync(struct address_space *mapping,
 				      pgoff_t index);
+int dax_copy_range(struct block_device *bdev, struct dax_device *dax_dev,
+		u64 src_addr, u64 dst_addr, size_t size);
 int dax_dedupe_file_range_compare(struct inode *src, loff_t srcoff,
 				  struct inode *dest, loff_t destoff,
 				  loff_t len, bool *is_same,
diff --git a/include/linux/fs.h b/include/linux/fs.h
index dcf1acd8f6d02047891d34b4ffcc4f7fcb184dac..5932a74a710cd59c87c96e08ccbbe263f25e4f68 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -495,6 +495,8 @@ struct address_space {
 	struct rw_semaphore	i_mmap_rwsem;
 	void			*private_data;
 
+	struct fast_reflink_work *fast_reflink_work;
+
 	CK_KABI_RESERVE(1)
 	CK_KABI_RESERVE(2)
 	CK_KABI_RESERVE(3)
@@ -1911,6 +1913,8 @@ struct dir_context {
 */
 #define REMAP_FILE_ADVISORY		(REMAP_FILE_CAN_SHORTEN)
 
+#define REMAP_FILE_FAST_REFLINK		(1 << 2)
+
 /*
  * These flags control the behavior of vfs_copy_file_range().
  * They are not available to the user via syscall.
diff --git a/include/linux/mm.h b/include/linux/mm.h
index a8eb5c530152cf5ac14ed63ef1beb46ded8036eb..e0ceb4a6d25468330930157d311aef325aa9376d 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -4184,8 +4184,22 @@ static inline void async_fork_fixup_vma(struct vm_area_struct *mpnt)
 }
 #endif
 
+struct fast_reflink_work {
+	struct work_struct work;
+	struct address_space *mapping;
+};
+
+int fast_reflink_apply(struct address_space *mapping, pgoff_t start,
+		pgoff_t end);
+bool is_pmd_fast_reflink(pmd_t pmd);
+void fast_reflink_fixup_pmd(struct vm_area_struct *vma, pmd_t *pmd,
+		unsigned long addr);
+void fast_reflink_fixup_vma(struct vm_area_struct *vma);
+
 static inline bool is_pmd_transient(pmd_t pmd)
 {
+	if (is_pmd_fast_reflink(pmd))
+		return true;
 	if (is_pmd_async_fork(pmd))
 		return true;
 	return false;
@@ -4193,10 +4207,12 @@ static inline bool is_pmd_transient(pmd_t pmd)
 static inline void fixup_pmd(struct vm_area_struct *vma, pmd_t *pmd,
 			     unsigned long addr)
 {
+	fast_reflink_fixup_pmd(vma, pmd, addr);
 	async_fork_fixup_pmd(vma, pmd, addr);
 }
 
 static inline void fixup_vma(struct vm_area_struct *vma)
 {
+	fast_reflink_fixup_vma(vma);
 	async_fork_fixup_vma(vma);
 }
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 7383b2a530ba75aaa626bc9bfad8cdfe68e9435d..968ad624e596a06a7b3303682331ce9a78c74511 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -704,6 +704,8 @@ struct vm_area_struct {
 	struct vm_area_struct *async_fork_vma;
 #endif
 
+	bool fast_reflink;
+
 	CK_KABI_RESERVE(1)
 	CK_KABI_RESERVE(2)
 	CK_KABI_RESERVE(3)
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index 5f4b2f18d8d5fdbdc8ef868e13fd031dc717a563..9a90da047f49186cd364b8e9e8f18b92c12266c3 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -642,7 +642,7 @@ __PAGEFLAG(Kfence, kfence, PF_ANY)
 * Different with flags above, this flag is used only for fsdax mode.  It
 * indicates that this page->mapping is now under reflink case.
 */
-#define PAGE_MAPPING_DAX_SHARED	((void *)0x1)
+#define PAGE_MAPPING_DAX_SHARED	0x1UL
 
 static __always_inline bool folio_mapping_flags(struct folio *folio)
 {
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 3e70c51d3862c4504be99ac3066649005a6551ba..7b838458b9502d255d104516b317666b08c90a25 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -206,6 +206,8 @@ enum mapping_flags {
 	AS_RELEASE_ALWAYS,	/* Call ->release_folio(), even if no private data */
 	AS_STABLE_WRITES,	/* must wait for writeback before modifying folio contents */
+
+	AS_FSDAX_NORMAP = 30,
 };
 
 /**
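
Changing PAGE_MAPPING_DAX_SHARED from a sentinel pointer to a low bit means page->mapping can now carry the real address_space and the shared flag at the same time, which is what the failure-notification path above relies on. A stand-alone model of the encode/decode convention (hypothetical names; the real code additionally uses READ_ONCE/WRITE_ONCE as shown in fs/dax.c):

	/* Stand-alone model of the low-bit mapping tag; hypothetical names. */
	#include <assert.h>
	#include <stdint.h>

	#define PAGE_MAPPING_DAX_SHARED 0x1UL

	struct address_space { int dummy; };

	/* address_space pointers are at least word-aligned, so bit 0 is free */
	static uintptr_t encode(struct address_space *mapping)
	{
		return (uintptr_t)mapping | PAGE_MAPPING_DAX_SHARED;
	}

	static struct address_space *decode(uintptr_t raw)
	{
		return (struct address_space *)(raw & ~PAGE_MAPPING_DAX_SHARED);
	}

	static int is_shared(uintptr_t raw)
	{
		return raw & PAGE_MAPPING_DAX_SHARED;
	}

	int main(void)
	{
		struct address_space as;
		uintptr_t raw = encode(&as);

		assert(is_shared(raw));
		assert(decode(raw) == &as);
		return 0;
	}
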
diff --git a/mm/memory.c b/mm/memory.c
index e4b4221d2617b1eb4e7cf1c09668ef416ecc04ae..3bf1d648699cccdb5fa2e6899d0c7e9a0ad12dcd 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -6608,3 +6608,336 @@ void ptlock_free(struct ptdesc *ptdesc)
 	kmem_cache_free(page_ptl_cachep, ptdesc->ptl);
 }
 #endif
+
+/* Fast reflink */
+static inline bool is_pmd_tbl_wrprotect(pmd_t pmd)
+{
+#if defined(CONFIG_ARM64)
+#define PMD_SECT_AP_WRPROTECT	(_AT(pmdval_t, 2) << 61)	/* APTable[1:0] */
+	return (pmd_val(pmd) & PMD_TABLE_BIT) &&
+	       (pmd_val(pmd) & PMD_SECT_AP_WRPROTECT);
+#elif defined(CONFIG_X86)
+	return (pmd_flags(pmd) & ~_PAGE_USER) == (_KERNPG_TABLE & ~_PAGE_RW);
+#else
+	return false;
+#endif
+}
+
+static inline void pmdp_set_tbl_wrprotect(struct mm_struct *mm,
+		unsigned long addr, pmd_t *pmdp)
+{
+#if defined(CONFIG_ARM64)
+	set_pmd(pmdp, __pmd(pmd_val(*pmdp) | PMD_SECT_AP_WRPROTECT));
+#elif defined(CONFIG_X86)
+	pmdp_set_wrprotect(mm, addr, pmdp);
+#endif
+}
+
+static inline void pmdp_clear_tbl_wrprotect(pmd_t *pmdp,
+		struct vm_area_struct *vma)
+{
+#if defined(CONFIG_ARM64)
+	set_pmd(pmdp, __pmd(pmd_val(*pmdp) & ~PMD_SECT_AP_WRPROTECT));
+#elif defined(CONFIG_X86)
+	set_pmd(pmdp, pmd_mkwrite(*pmdp, vma));
+#endif
+}
+
+bool is_pmd_fast_reflink(pmd_t pmd)
+{
+	return !is_swap_pmd(pmd) && !pmd_trans_huge(pmd) &&
+	       !pmd_devmap(pmd) && is_pmd_tbl_wrprotect(pmd);
+}
+
+static int follow_pmd(struct mm_struct *mm, unsigned long address,
+		pmd_t **pmdp)
+{
+	pgd_t *pgd;
+	p4d_t *p4d;
+	pud_t *pud;
+	pmd_t *pmd;
+
+	pgd = pgd_offset(mm, address);
+	if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
+		goto out;
+
+	p4d = p4d_offset(pgd, address);
+	if (p4d_none(*p4d) || unlikely(p4d_bad(*p4d)))
+		goto out;
+
+	pud = pud_offset(p4d, address);
+	if (pud_none(*pud) || unlikely(pud_bad(*pud)))
+		goto out;
+
+	pmd = pmd_offset(pud, address);
+	if (pmd_huge(*pmd))
+		goto found;
+
+	if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd)))
+		goto out;
+
+found:
+	*pmdp = pmd;
+	return 0;
+out:
+	return -EINVAL;
+}
+
+static void fr_apply_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
+		unsigned long start, unsigned long end)
+{
+	pte_t *start_pte;
+	pte_t *ptep, pte;
+	spinlock_t *ptl;
+
+	start_pte = pte_offset_map_lock(vma->vm_mm, pmd, start, &ptl);
+	ptep = start_pte;
+
+	do {
+		pte = *ptep;
+		if (pte_none(pte))
+			continue;
+
+		if (!pte_dirty(pte) && !pte_write(pte))
+			continue;
+
+		/* The caller is responsible for the TLB flush. */
+		pte = ptep_get_and_clear(vma->vm_mm, start, ptep);
+		pte = pte_wrprotect(pte);
+		pte = pte_mkclean(pte);
+		set_pte_at(vma->vm_mm, start, ptep, pte);
+	} while (ptep++, start += PAGE_SIZE, start != end);
+
+	pte_unmap_unlock(start_pte, ptl);
+}
+
+static void fr_apply_vma(struct vm_area_struct *vma)
+{
+	struct mm_struct *mm = vma->vm_mm;
+	unsigned long start = vma->vm_start;
+	unsigned long end = vma->vm_end;
+	unsigned long next;
+	spinlock_t *pml;
+	pmd_t *pmdp = NULL;
+	pmd_t pmd;
+	bool applied = false;
+
+	do {
+		next = pmd_addr_end(start, end);
+		if (follow_pmd(mm, start, &pmdp))
+			continue;
+
+		pml = pmd_lock(mm, pmdp);
+		if (pmd_huge(*pmdp)) {
+#ifdef CONFIG_FS_DAX_PMD
+			if (!pmd_dirty(*pmdp) && !pmd_write(*pmdp))
+				goto unlock_pmd;
+
+			pmd = pmdp_invalidate(vma, start, pmdp);
+			pmd = pmd_wrprotect(pmd);
+			pmd = pmd_mkclean(pmd);
+			set_pmd_at(mm, start, pmdp, pmd);
+unlock_pmd:
+#endif
+			spin_unlock(pml);
+			continue;
+		}
+
+		if (pmd_none(*pmdp) || unlikely(pmd_bad(*pmdp))) {
+			spin_unlock(pml);
+			continue;
+		}
+
+		if (IS_ALIGNED(start, PMD_SIZE) && (start + PMD_SIZE <= end)) {
+			pmdp_set_tbl_wrprotect(mm, start, pmdp);
+			flush_tlb_range(vma, start, start + PMD_SIZE);
+			applied = true;
+			spin_unlock(pml);
+			continue;
+		} else {
+			spin_unlock(pml);
+			fr_apply_pte_range(vma, pmdp, start, next);
+			flush_tlb_range(vma, start, next);
+			continue;
+		}
+	} while (start = next, start != end);
+
+	if (applied)
+		vma->fast_reflink = applied;
+}
+
+static void fast_reflink_fixup(struct work_struct *work);
+
+int fast_reflink_apply(struct address_space *mapping, pgoff_t start,
+		pgoff_t end)
+{
+	struct vm_area_struct *vma;
+
+	i_mmap_lock_read(mapping);
+	vma_interval_tree_foreach(vma, &mapping->i_mmap, start, end) {
+		if (!(vma->vm_flags & VM_SHARED))
+			continue;
+
+		fr_apply_vma(vma);
+	}
+	i_mmap_unlock_read(mapping);
+
+	if (!mapping->fast_reflink_work) {
+		struct fast_reflink_work *fr_work;
+
+		fr_work = kmalloc(sizeof(*fr_work), GFP_KERNEL | __GFP_NOFAIL);
+		INIT_WORK(&fr_work->work, fast_reflink_fixup);
+		fr_work->mapping = mapping;
+		mapping->fast_reflink_work = fr_work;
+	}
+	schedule_work(&mapping->fast_reflink_work->work);
+
+	return 0;
+}
+
+static void fr_fixup_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
+		unsigned long start, unsigned long end)
+{
+	pte_t *start_pte;
+	pte_t *ptep, pte;
+	spinlock_t *ptl;
+
+	start_pte = pte_offset_map_lock(vma->vm_mm, pmd, start, &ptl);
+	ptep = start_pte;
+
+	/* Already fixed up */
+	if (unlikely(!is_pmd_fast_reflink(*pmd)))
+		goto out;
+
+	do {
+		pte = *ptep;
+		if (pte_none(pte))
+			continue;
+
+		if (!pte_dirty(pte) && !pte_write(pte))
+			continue;
+
+		/* The caller is responsible for the TLB flush. */
+		pte = ptep_get_and_clear(vma->vm_mm, start, ptep);
+		pte = pte_wrprotect(pte);
+		pte = pte_mkclean(pte);
+		set_pte_at(vma->vm_mm, start, ptep, pte);
+	} while (ptep++, start += PAGE_SIZE, start != end);
+
+out:
+	pte_unmap_unlock(start_pte, ptl);
+}
+
+static void fr_fixup_pmd_range(struct vm_area_struct *vma, pud_t *pud,
+		unsigned long start, unsigned long end)
+{
+	pmd_t *pmd;
+	unsigned long next;
+	spinlock_t *pml;
+
+	pmd = pmd_offset(pud, start);
+	do {
+		next = pmd_addr_end(start, end);
+		if (pmd_none(*pmd))
+			continue;
+
+		pml = pmd_lock(vma->vm_mm, pmd);
+		if (is_pmd_fast_reflink(*pmd)) {
+			spin_unlock(pml);
+			fr_fixup_pte_range(vma, pmd, start, next);
+
+			pml = pmd_lock(vma->vm_mm, pmd);
+			if (is_pmd_fast_reflink(*pmd))
+				pmdp_clear_tbl_wrprotect(pmd, vma);
+		}
+		spin_unlock(pml);
+	} while (pmd++, start = next, start != end);
+}
+
+static void fr_fixup_pud_range(struct vm_area_struct *vma, p4d_t *p4d,
+		unsigned long start, unsigned long end)
+{
+	pud_t *pud;
+	unsigned long next;
+
+	pud = pud_offset(p4d, start);
+	do {
+		next = pud_addr_end(start, end);
+		if (pud_none_or_clear_bad(pud))
+			continue;
+		fr_fixup_pmd_range(vma, pud, start, next);
+	} while (pud++, start = next, start != end);
+}
+
+static void fr_fixup_p4d_range(struct vm_area_struct *vma, pgd_t *pgd,
+		unsigned long start, unsigned long end)
+{
+	p4d_t *p4d;
+	unsigned long next;
+
+	p4d = p4d_offset(pgd, start);
+	do {
+		next = p4d_addr_end(start, end);
+		if (p4d_none_or_clear_bad(p4d))
+			continue;
+		fr_fixup_pud_range(vma, p4d, start, next);
+	} while (p4d++, start = next, start != end);
+}
+
+static void fr_fixup_page_range(struct vm_area_struct *vma,
+		unsigned long start, unsigned long end)
+{
+	pgd_t *pgd;
+	unsigned long next;
+
+	pgd = pgd_offset(vma->vm_mm, start);
+	do {
+		next = pgd_addr_end(start, end);
+		if (pgd_none_or_clear_bad(pgd))
+			continue;
+		fr_fixup_p4d_range(vma, pgd, start, next);
+	} while (pgd++, start = next, start != end);
+}
+
+/* The mmap_lock (read/write) of vma->vm_mm is held */
+void fast_reflink_fixup_vma(struct vm_area_struct *vma)
+{
+	if (!vma->fast_reflink)
+		return;
+
+	fr_fixup_page_range(vma, vma->vm_start, vma->vm_end);
+	vma->fast_reflink = false;
+#ifdef CONFIG_ARM64
+	flush_tlb_range(vma, vma->vm_start, vma->vm_end);
+#endif
+}
+
+/* The mmap_lock (read) of vma->vm_mm is held */
+void fast_reflink_fixup_pmd(struct vm_area_struct *vma, pmd_t *pmd,
+		unsigned long addr)
+{
+	if (!is_pmd_fast_reflink(*pmd) || !vma->fast_reflink)
+		return;
+
+	addr &= PMD_MASK;
+	fr_fixup_page_range(vma, addr, addr + PMD_SIZE);
+	VM_WARN_ON_ONCE(is_pmd_fast_reflink(*pmd));
+
+#ifdef CONFIG_ARM64
+	flush_tlb_range(vma, addr, addr + PMD_SIZE);
+#endif
+}
+
+static void fast_reflink_fixup(struct work_struct *work)
+{
+	struct fast_reflink_work *fr_work;
+	struct address_space *mapping;
+	struct vm_area_struct *vma;
+
+	fr_work = container_of(work, struct fast_reflink_work, work);
+	mapping = fr_work->mapping;
+
+	i_mmap_lock_read(mapping);
+	vma_interval_tree_foreach(vma, &mapping->i_mmap, 0, ULONG_MAX)
+		fast_reflink_fixup_vma(vma);
+	i_mmap_unlock_read(mapping);
+}
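
The core trick above is worth spelling out: instead of write-protecting up to 512 PTEs one by one, fr_apply_vma() sets a single permission bit on the table entry (APTable on arm64, clearing the writable bit on the kernel's PMD-level table entry on x86), and defers the per-PTE wrprotect/clean work to fast_reflink_fixup_pmd(). The fault path is assumed to reach that fixup through the pre-existing is_pmd_transient()/fixup_pmd() hooks extended in the include/linux/mm.h hunk; that hook wiring ships with the async-fork infrastructure this patch piggybacks on, not with this patch. A rough, illustrative sketch of such a call site:

	/* Illustrative only: how a fault path might consume the hooks. */
	static vm_fault_t example_handle_pmd_fault(struct vm_fault *vmf)
	{
		struct vm_area_struct *vma = vmf->vma;

		/*
		 * is_pmd_transient() covers both the async-fork and the new
		 * fast-reflink table-level write-protect states. fixup_pmd()
		 * restores per-PTE protections (write-protected and clean),
		 * after which the normal write-fault path triggers CoW.
		 */
		if (is_pmd_transient(*vmf->pmd))
			fixup_pmd(vma, vmf->pmd, vmf->address);

		return 0;	/* continue with the regular fault handling */
	}
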
diff --git a/mm/truncate.c b/mm/truncate.c
index 8e3aa9e8618ed8262357edaf550a10425fa95828..126fe71bc6a14c3bd7c4d750a65190a3b056c244 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -469,6 +469,14 @@ void truncate_inode_pages_final(struct address_space *mapping)
 	 */
 	mapping_set_exiting(mapping);
 
+	/* Flush fast reflink work if any. */
+	if (unlikely(mapping->fast_reflink_work)) {
+		flush_work(&mapping->fast_reflink_work->work);
+
+		kfree(mapping->fast_reflink_work);
+		mapping->fast_reflink_work = NULL;
+	}
+
 	if (!mapping_empty(mapping)) {
 		/*
 		 * As truncation uses a lockless tree lookup, cycle