diff --git a/Documentation/ABI/testing/sysfs-block b/Documentation/ABI/testing/sysfs-block index e34cdeeeb9d420a673f5236ff59ec99242630a40..6cee984819b33fbf35face4b7dda99c85e55d4cf 100644 --- a/Documentation/ABI/testing/sysfs-block +++ b/Documentation/ABI/testing/sysfs-block @@ -97,6 +97,58 @@ Description: indicates how many bytes the beginning of the device is offset from the disk's natural alignment. +What: /sys/block/<disk>/atomic_write_max_bytes +Date: February 2024 +Contact: Himanshu Madhani +Description: + [RO] This parameter specifies the maximum atomic write + size reported by the device. This parameter is relevant + for merging of writes, where a merged atomic write + operation must not exceed this number of bytes. + This parameter may be greater than the value in + atomic_write_unit_max_bytes, as + atomic_write_unit_max_bytes will be rounded down to a + power-of-two and atomic_write_unit_max_bytes may also be + limited by some other queue limits, such as max_segments. + This parameter - along with atomic_write_unit_min_bytes + and atomic_write_unit_max_bytes - will not be larger than + max_hw_sectors_kb, but may be larger than max_sectors_kb. + + +What: /sys/block/<disk>/atomic_write_unit_min_bytes +Date: February 2024 +Contact: Himanshu Madhani +Description: + [RO] This parameter specifies the smallest block which can + be written atomically with an atomic write operation. All + atomic write operations must begin at an + atomic_write_unit_min boundary and must be multiples of + atomic_write_unit_min. This value must be a power-of-two. + + +What: /sys/block/<disk>/atomic_write_unit_max_bytes +Date: February 2024 +Contact: Himanshu Madhani +Description: + [RO] This parameter defines the largest block which can be + written atomically with an atomic write operation. This + value must be a multiple of atomic_write_unit_min and must + be a power-of-two. This value will not be larger than + atomic_write_max_bytes. + + +What: /sys/block/<disk>/atomic_write_boundary_bytes +Date: February 2024 +Contact: Himanshu Madhani +Description: + [RO] A device may need to internally split I/Os which + straddle a given logical block address boundary. In that + case a single atomic write operation will be processed as + one or more sub-operations which each complete atomically. + This parameter specifies the size in bytes of the atomic + boundary if one is reported by the device. This value must + be a power-of-two. + What: /sys/block/<disk>/<partition>/alignment_offset Date: April 2009 Contact: Martin K.
Petersen diff --git a/block/blk-core.c b/block/blk-core.c index f91f8e8be482d1c93809ddb936437db45c0d8c91..df6564d773e648f6e0d0b7f622d824f64fe5b8e9 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -81,6 +81,7 @@ __setup("precise_iostat=", precise_iostat_setup); * For queue allocation */ struct kmem_cache *blk_requestq_cachep; +struct kmem_cache *queue_atomic_write_cachep; /* * Controlling structure to kblockd @@ -205,6 +206,8 @@ static const struct { [BLK_STS_ZONE_OPEN_RESOURCE] = { -ETOOMANYREFS, "open zones exceeded" }, [BLK_STS_ZONE_ACTIVE_RESOURCE] = { -EOVERFLOW, "active zones exceeded" }, + [BLK_STS_INVAL] = { -EINVAL, "invalid" }, + /* everything else not covered above: */ [BLK_STS_IOERR] = { -EIO, "I/O" }, }; @@ -530,6 +533,7 @@ static void blk_timeout_work(struct work_struct *work) struct request_queue *blk_alloc_queue(int node_id) { struct request_queue *q; + struct queue_atomic_write_limits *aw_limits; int ret; q = kmem_cache_alloc_node(blk_requestq_cachep, @@ -537,11 +541,18 @@ struct request_queue *blk_alloc_queue(int node_id) if (!q) return NULL; + aw_limits = kmem_cache_alloc_node(queue_atomic_write_cachep, + GFP_KERNEL | __GFP_ZERO, node_id); + if (!aw_limits) + goto fail_q; + + q->limits.aw_limits = aw_limits; + q->last_merge = NULL; q->id = ida_simple_get(&blk_queue_ida, 0, 0, GFP_KERNEL); if (q->id < 0) - goto fail_q; + goto fail_aw; ret = bioset_init(&q->bio_split, BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS); if (ret) @@ -592,6 +603,7 @@ struct request_queue *blk_alloc_queue(int node_id) blk_queue_dma_alignment(q, 511); blk_set_default_limits(&q->limits); + blk_set_default_atomic_write_limits(&q->limits); q->nr_requests = BLKDEV_MAX_RQ; return q; @@ -606,6 +618,8 @@ struct request_queue *blk_alloc_queue(int node_id) bioset_exit(&q->bio_split); fail_id: ida_simple_remove(&blk_queue_ida, q->id); +fail_aw: + kmem_cache_free(queue_atomic_write_cachep, aw_limits); fail_q: kmem_cache_free(blk_requestq_cachep, q); return NULL; @@ -819,6 +833,18 @@ static inline blk_status_t blk_check_zone_append(struct request_queue *q, return BLK_STS_OK; } +static blk_status_t blk_validate_atomic_write_op_size(struct request_queue *q, + struct bio *bio) +{ + if (bio->bi_iter.bi_size > queue_atomic_write_unit_max_bytes(q)) + return BLK_STS_INVAL; + + if (bio->bi_iter.bi_size % queue_atomic_write_unit_min_bytes(q)) + return BLK_STS_INVAL; + + return BLK_STS_OK; +} + static noinline_for_stack bool submit_bio_checks(struct bio *bio) { struct request_queue *q = bio->bi_disk->queue; @@ -900,6 +926,13 @@ static noinline_for_stack bool submit_bio_checks(struct bio *bio) if (!q->limits.max_write_zeroes_sectors) goto not_supported; break; + case REQ_OP_WRITE: + if (bio->bi_opf & REQ_ATOMIC) { + status = blk_validate_atomic_write_op_size(q, bio); + if (status != BLK_STS_OK) + goto end_io; + } + break; default: break; } @@ -1155,7 +1188,7 @@ EXPORT_SYMBOL(submit_bio); static blk_status_t blk_cloned_rq_check_limits(struct request_queue *q, struct request *rq) { - unsigned int max_sectors = blk_queue_get_max_sectors(q, req_op(rq)); + unsigned int max_sectors = blk_queue_get_max_sectors(rq); if (blk_rq_sectors(rq) > max_sectors) { /* @@ -1902,6 +1935,8 @@ int __init blk_dev_init(void) blk_requestq_cachep = kmem_cache_create("request_queue", sizeof(struct request_queue), 0, SLAB_PANIC, NULL); + queue_atomic_write_cachep = kmem_cache_create("queue_atomic_write", + sizeof(struct queue_atomic_write_limits), 0, SLAB_PANIC, NULL); blk_debugfs_root = debugfs_create_dir("block", NULL); diff --git a/block/blk-merge.c 
b/block/blk-merge.c index a65d1d275040d833bebe3279a5daa1e266e3016a..3b2004308e93f00a9d47806bcd49e01c83fc83c1 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -13,6 +13,46 @@ #include "blk.h" #include "blk-rq-qos.h" +/* + * rq_straddles_atomic_write_boundary - check for boundary violation + * @rq: request to check + * @front_adjust: data size to be appended to front + * @back_adjust: data size to be appended to back + * + * Determine whether merging a request or bio into another request will result + * in a merged request which straddles an atomic write boundary. + * + * The value @front_adjust is the data which would be appended to the front of + * @rq, while the value @back_adjust is the data which would be appended to the + * back of @rq. Callers will typically only have either @front_adjust or + * @back_adjust as non-zero. + * + */ +static bool rq_straddles_atomic_write_boundary(struct request *rq, + unsigned int front_adjust, + unsigned int back_adjust) +{ + unsigned int boundary = queue_atomic_write_boundary_bytes(rq->q); + u64 mask, start_rq_pos, end_rq_pos; + + if (!boundary) + return false; + + start_rq_pos = blk_rq_pos(rq) << SECTOR_SHIFT; + end_rq_pos = start_rq_pos + blk_rq_bytes(rq) - 1; + + start_rq_pos -= front_adjust; + end_rq_pos += back_adjust; + + mask = ~(boundary - 1); + + /* Top bits are different, so crossed a boundary */ + if ((start_rq_pos & mask) != (end_rq_pos & mask)) + return true; + + return false; +} + static inline bool bio_will_gap(struct request_queue *q, struct request *prev_rq, struct bio *prev, struct bio *next) { @@ -145,11 +185,20 @@ static inline unsigned get_max_io_size(struct request_queue *q, struct bio *bio) { unsigned sectors = blk_max_size_offset(q, bio->bi_iter.bi_sector, 0); - unsigned max_sectors = sectors; unsigned pbs = queue_physical_block_size(q) >> SECTOR_SHIFT; unsigned lbs = queue_logical_block_size(q) >> SECTOR_SHIFT; unsigned start_offset = bio->bi_iter.bi_sector & (pbs - 1); + /* + * We ignore lim->max_sectors for atomic writes simply because + * it may be less than the bio size, which we cannot tolerate.
+ */ + if (bio->bi_opf & REQ_ATOMIC) + max_sectors = q->limits.aw_limits->atomic_write_max_sectors; + else + max_sectors = sectors; + max_sectors += start_offset; max_sectors &= ~(pbs - 1); if (max_sectors > start_offset) @@ -278,6 +327,11 @@ static struct bio *blk_bio_segment_split(struct request_queue *q, *segs = nsegs; return NULL; split: + if (bio->bi_opf & REQ_ATOMIC) { + bio->bi_status = BLK_STS_INVAL; + bio_endio(bio); + return ERR_PTR(-EINVAL); + } *segs = nsegs; return bio_split(bio, sectors, GFP_NOIO, bs); } @@ -594,6 +648,13 @@ int ll_back_merge_fn(struct request *req, struct bio *bio, unsigned int nr_segs) return 0; } + if (req->cmd_flags & REQ_ATOMIC) { + if (rq_straddles_atomic_write_boundary(req, + 0, bio->bi_iter.bi_size)) { + return 0; + } + } + return ll_new_hw_segment(req, bio, nr_segs); } @@ -613,6 +674,13 @@ static int ll_front_merge_fn(struct request *req, struct bio *bio, return 0; } + if (req->cmd_flags & REQ_ATOMIC) { + if (rq_straddles_atomic_write_boundary(req, + bio->bi_iter.bi_size, 0)) { + return 0; + } + } + return ll_new_hw_segment(req, bio, nr_segs); } @@ -649,6 +717,13 @@ static int ll_merge_requests_fn(struct request_queue *q, struct request *req, blk_rq_get_max_sectors(req, blk_rq_pos(req))) return 0; + if (req->cmd_flags & REQ_ATOMIC) { + if (rq_straddles_atomic_write_boundary(req, + 0, blk_rq_bytes(next))) { + return 0; + } + } + total_phys_segments = req->nr_phys_segments + next->nr_phys_segments; if (total_phys_segments > blk_rq_get_max_segments(req)) return 0; @@ -721,6 +796,18 @@ static enum elv_merge blk_try_req_merge(struct request *req, return ELEVATOR_NO_MERGE; } +static bool blk_atomic_write_mergeable_rq_bio(struct request *rq, + struct bio *bio) +{ + return (rq->cmd_flags & REQ_ATOMIC) == (bio->bi_opf & REQ_ATOMIC); +} + +static bool blk_atomic_write_mergeable_rqs(struct request *rq, + struct request *next) +{ + return (rq->cmd_flags & REQ_ATOMIC) == (next->cmd_flags & REQ_ATOMIC); +} + /* * For non-mq, this has to be called with the request spinlock acquired. * For mq with scheduling, the appropriate queue wide lock should be held. */ @@ -752,6 +839,9 @@ static struct request *attempt_merge(struct request_queue *q, if (req->ioprio != next->ioprio) return NULL; + if (!blk_atomic_write_mergeable_rqs(req, next)) + return NULL; + /* * If we are allowed to merge, then append bio list * from next to rq and release next.
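For intuition, the straddle test above reduces to comparing the high bits of the first and last byte addresses under a power-of-two mask. A standalone C sketch with illustrative values only (not kernel code):

    #include <assert.h>
    #include <stdint.h>

    /*
     * Same arithmetic as rq_straddles_atomic_write_boundary(): a range
     * crosses a power-of-two boundary iff the top bits of its first and
     * last byte addresses differ under the boundary mask.
     */
    static int straddles_boundary(uint64_t start, uint64_t len, uint64_t boundary)
    {
            uint64_t mask, end = start + len - 1;

            if (!boundary)
                    return 0;
            mask = ~(boundary - 1);
            return (start & mask) != (end & mask);
    }

    int main(void)
    {
            /* a 4 KiB write at 62 KiB crosses a 64 KiB boundary... */
            assert(straddles_boundary(62 * 1024, 4 * 1024, 64 * 1024));
            /* ...the same write at 60 KiB does not */
            assert(!straddles_boundary(60 * 1024, 4 * 1024, 64 * 1024));
            return 0;
    }
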
merge_requests_fn @@ -895,6 +985,9 @@ bool blk_rq_merge_ok(struct request *rq, struct bio *bio) if (rq->ioprio != bio_prio(bio)) return false; + if (blk_atomic_write_mergeable_rq_bio(rq, bio) == false) + return false; + return true; } diff --git a/block/blk-settings.c b/block/blk-settings.c index c3aa7f8ee388357c7b96b50cd58c6591910bba84..8d378d62178c2f2f35ced6307d317e19fd583547 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -63,6 +63,20 @@ void blk_set_default_limits(struct queue_limits *lim) } EXPORT_SYMBOL(blk_set_default_limits); +void blk_set_default_atomic_write_limits(struct queue_limits *lim) +{ + if (lim->aw_limits) { + lim->aw_limits->atomic_write_hw_max = 0; + lim->aw_limits->atomic_write_max_sectors = 0; + lim->aw_limits->atomic_write_hw_boundary = 0; + lim->aw_limits->atomic_write_hw_unit_min = 0; + lim->aw_limits->atomic_write_unit_min = 0; + lim->aw_limits->atomic_write_hw_unit_max = 0; + lim->aw_limits->atomic_write_unit_max = 0; + } +} +EXPORT_SYMBOL(blk_set_default_atomic_write_limits); + /** * blk_set_stacking_limits - set default limits for stacking devices * @lim: the queue_limits structure to reset @@ -127,6 +141,44 @@ void blk_queue_bounce_limit(struct request_queue *q, u64 max_addr) } EXPORT_SYMBOL(blk_queue_bounce_limit); +/* + * Returns max guaranteed bytes which we can fit in a bio. + * + * We always assume that we can fit in at least PAGE_SIZE in a segment, apart + * from first and last segments. + */ +static +unsigned int blk_queue_max_guaranteed_bio(struct queue_limits *limits) +{ + unsigned int max_segments = min((u16)BIO_MAX_PAGES, limits->max_segments); + unsigned int length; + + length = min(max_segments, 2U) * limits->logical_block_size; + if (max_segments > 2) + length += (max_segments - 2) * PAGE_SIZE; + + return length; +} + +static void blk_atomic_writes_update_limits(struct queue_limits *limits) +{ + unsigned int unit_limit = min(limits->max_hw_sectors << SECTOR_SHIFT, + blk_queue_max_guaranteed_bio(limits)); + + unit_limit = rounddown_pow_of_two(unit_limit); + + if (!limits->aw_limits) + return; + + limits->aw_limits->atomic_write_max_sectors = + min(limits->aw_limits->atomic_write_hw_max >> SECTOR_SHIFT, + limits->max_hw_sectors); + limits->aw_limits->atomic_write_unit_min = + min(limits->aw_limits->atomic_write_hw_unit_min, unit_limit); + limits->aw_limits->atomic_write_unit_max = + min(limits->aw_limits->atomic_write_hw_unit_max, unit_limit); +} + /** * blk_queue_max_hw_sectors - set max sectors for a request for this queue * @q: the request queue for the device @@ -161,6 +213,9 @@ void blk_queue_max_hw_sectors(struct request_queue *q, unsigned int max_hw_secto max_sectors = min_not_zero(max_hw_sectors, limits->max_dev_sectors); max_sectors = min_t(unsigned int, max_sectors, BLK_DEF_MAX_SECTORS); limits->max_sectors = max_sectors; + + blk_atomic_writes_update_limits(limits); + q->backing_dev_info->io_pages = max_sectors >> (PAGE_SHIFT - 9); } EXPORT_SYMBOL(blk_queue_max_hw_sectors); @@ -196,6 +251,62 @@ void blk_queue_max_discard_sectors(struct request_queue *q, } EXPORT_SYMBOL(blk_queue_max_discard_sectors); +/** + * blk_queue_atomic_write_max_bytes - set max bytes supported by + * the device for atomic write operations. 
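Note how blk_atomic_writes_update_limits() derives the advertised unit limits rather than exporting the hardware values directly: the device's unit_max is clamped to a power-of-two no larger than what a bio is always guaranteed to carry. A minimal userspace model of that derivation, with made-up numbers:

    #include <stdint.h>
    #include <stdio.h>

    /* rounddown_pow_of_two() equivalent: clear low bits until one remains */
    static uint32_t rounddown_pow2(uint32_t x)
    {
            while (x & (x - 1))
                    x &= x - 1;
            return x;
    }

    int main(void)
    {
            uint32_t hw_unit_max = 128 * 1024; /* device-reported, assumed */
            uint32_t guaranteed = 96 * 1024;   /* blk_queue_max_guaranteed_bio() result, assumed */
            uint32_t unit_limit = rounddown_pow2(guaranteed);
            uint32_t unit_max = hw_unit_max < unit_limit ? hw_unit_max : unit_limit;

            /* prints 65536: the power-of-two clamp wins over the hw value */
            printf("atomic_write_unit_max_bytes = %u\n", unit_max);
            return 0;
    }
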
+ * @q: the request queue for the device + * @bytes: maximum bytes supported + */ +void blk_queue_atomic_write_max_bytes(struct request_queue *q, + unsigned int bytes) +{ + q->limits.aw_limits->atomic_write_hw_max = bytes; + blk_atomic_writes_update_limits(&q->limits); +} +EXPORT_SYMBOL(blk_queue_atomic_write_max_bytes); + +/** + * blk_queue_atomic_write_boundary_bytes - Device's logical block address space + * boundary which an atomic write should not cross. + * @q: the request queue for the device + * @bytes: must be a power-of-two. + */ +void blk_queue_atomic_write_boundary_bytes(struct request_queue *q, + unsigned int bytes) +{ + q->limits.aw_limits->atomic_write_hw_boundary = bytes; +} +EXPORT_SYMBOL(blk_queue_atomic_write_boundary_bytes); + +/** + * blk_queue_atomic_write_unit_min_bytes - smallest unit that can be written + * atomically to the device. + * @q: the request queue for the device + * @bytes: must be a power-of-two. + */ +void blk_queue_atomic_write_unit_min_bytes(struct request_queue *q, + unsigned int bytes) +{ + q->limits.aw_limits->atomic_write_hw_unit_min = bytes; + blk_atomic_writes_update_limits(&q->limits); +} +EXPORT_SYMBOL(blk_queue_atomic_write_unit_min_bytes); + +/** + * blk_queue_atomic_write_unit_max_bytes - largest unit that can be written + * atomically to the device. + * @q: the request queue for the device + * @bytes: must be a power-of-two. + */ +void blk_queue_atomic_write_unit_max_bytes(struct request_queue *q, + unsigned int bytes) +{ + q->limits.aw_limits->atomic_write_hw_unit_max = bytes; + blk_atomic_writes_update_limits(&q->limits); +} +EXPORT_SYMBOL(blk_queue_atomic_write_unit_max_bytes); + + /** * blk_queue_max_write_same_sectors - set max sectors for a single write same * @q: the request queue for the device diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index c95be9626a09807a07d8b5563540edd3e7a80de7..8e35d63b30c245efe8726eac0893d6b1fcd684a5 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -126,6 +126,30 @@ static ssize_t queue_max_discard_segments_show(struct request_queue *q, return queue_var_show(queue_max_discard_segments(q), (page)); } +static ssize_t queue_atomic_write_max_bytes_show(struct request_queue *q, + char *page) +{ + return queue_var_show(queue_atomic_write_max_bytes(q), page); +} + +static ssize_t queue_atomic_write_boundary_show(struct request_queue *q, + char *page) +{ + return queue_var_show(queue_atomic_write_boundary_bytes(q), page); +} + +static ssize_t queue_atomic_write_unit_min_show(struct request_queue *q, + char *page) +{ + return queue_var_show(queue_atomic_write_unit_min_bytes(q), page); +} + +static ssize_t queue_atomic_write_unit_max_show(struct request_queue *q, + char *page) +{ + return queue_var_show(queue_atomic_write_unit_max_bytes(q), page); +} + static ssize_t queue_max_integrity_segments_show(struct request_queue *q, char *page) { return queue_var_show(q->limits.max_integrity_segments, (page)); } @@ -585,6 +609,11 @@ QUEUE_RO_ENTRY(queue_discard_max_hw, "discard_max_hw_bytes"); QUEUE_RW_ENTRY(queue_discard_max, "discard_max_bytes"); QUEUE_RO_ENTRY(queue_discard_zeroes_data, "discard_zeroes_data"); +QUEUE_RO_ENTRY(queue_atomic_write_max_bytes, "atomic_write_max_bytes"); +QUEUE_RO_ENTRY(queue_atomic_write_boundary, "atomic_write_boundary_bytes"); +QUEUE_RO_ENTRY(queue_atomic_write_unit_max, "atomic_write_unit_max_bytes"); +QUEUE_RO_ENTRY(queue_atomic_write_unit_min, "atomic_write_unit_min_bytes"); + QUEUE_RO_ENTRY(queue_write_same_max, "write_same_max_bytes"); QUEUE_RO_ENTRY(queue_write_zeroes_max,
"write_zeroes_max_bytes"); QUEUE_RO_ENTRY(queue_zone_append_max, "zone_append_max_bytes"); @@ -639,6 +668,10 @@ static struct attribute *queue_attrs[] = { &queue_discard_max_entry.attr, &queue_discard_max_hw_entry.attr, &queue_discard_zeroes_data_entry.attr, + &queue_atomic_write_max_bytes_entry.attr, + &queue_atomic_write_boundary_entry.attr, + &queue_atomic_write_unit_min_entry.attr, + &queue_atomic_write_unit_max_entry.attr, &queue_write_same_max_entry.attr, &queue_write_zeroes_max_entry.attr, &queue_zone_append_max_entry.attr, @@ -731,6 +764,7 @@ static void blk_free_queue_rcu(struct rcu_head *rcu_head) rcu_head); percpu_ref_exit(&q->q_usage_counter); + kmem_cache_free(queue_atomic_write_cachep, q->limits.aw_limits); kmem_cache_free(blk_requestq_cachep, q); } diff --git a/block/blk.h b/block/blk.h index 4bbcc971d4f734d5a03163748ed5924dbb4c5389..1bdd07040d42f6e8af87574f2223210aa604bc12 100644 --- a/block/blk.h +++ b/block/blk.h @@ -29,6 +29,7 @@ struct blk_flush_queue { }; extern struct kmem_cache *blk_requestq_cachep; +extern struct kmem_cache *queue_atomic_write_cachep; extern struct kobj_type blk_queue_ktype; extern struct ida blk_queue_ida; diff --git a/drivers/block/rnbd/rnbd-srv-dev.h b/drivers/block/rnbd/rnbd-srv-dev.h index 0eb23850afb954fc90a72cae9c252296b7fff06e..e41ce52ab8f1e99f0e6895364d857980521e0781 100644 --- a/drivers/block/rnbd/rnbd-srv-dev.h +++ b/drivers/block/rnbd/rnbd-srv-dev.h @@ -66,8 +66,7 @@ static inline int rnbd_dev_get_max_discard_sects(const struct rnbd_dev *dev) if (!blk_queue_discard(bdev_get_queue(dev->bdev))) return 0; - return blk_queue_get_max_sectors(bdev_get_queue(dev->bdev), - REQ_OP_DISCARD); + return bdev_get_queue(dev->bdev)->limits.max_discard_sectors; } static inline int rnbd_dev_get_discard_granularity(const struct rnbd_dev *dev) diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index eb95b5ce7b8fe2d2f124a848d45039de6379af9b..d8f54d2a74d9ccd0e07d001950e82bbeb4613323 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -1816,6 +1816,7 @@ void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q, /* * Copy table's limits to the DM device's request_queue */ + limits->aw_limits = q->limits.aw_limits; q->limits = *limits; if (dm_table_supports_nowait(t)) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 9fcc05c4f88cc5a1907ffcebe9e20e80e7949f4f..76626f5a737f50631c4aa9b1a91c96a343f8cba5 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -752,6 +752,30 @@ static inline blk_status_t nvme_setup_write_zeroes(struct nvme_ns *ns, return BLK_STS_OK; } +static bool nvme_valid_atomic_write(struct request *req) +{ + struct request_queue *q = req->q; + u32 boundary_bytes = queue_atomic_write_boundary_bytes(q); + + if (blk_rq_bytes(req) > queue_atomic_write_unit_max_bytes(q)) + return false; + + if (boundary_bytes) { + u64 mask = boundary_bytes - 1, imask = ~mask; + u64 start = blk_rq_pos(req) << SECTOR_SHIFT; + u64 end = start + blk_rq_bytes(req) - 1; + + /* If larger than the boundary size, it must cross a boundary */ + if (blk_rq_bytes(req) > boundary_bytes) + return false; + + if ((start & imask) != (end & imask)) + return false; + } + + return true; +} + static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns, struct request *req, struct nvme_command *cmnd, enum nvme_opcode op) @@ -768,6 +792,13 @@ static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns, if (req->cmd_flags & REQ_RAHEAD) dsmgmt |= NVME_RW_DSM_FREQ_PREFETCH; + /* + * Ensure that nothing has been sent which cannot be
executed + * atomically. + */ + if (req->cmd_flags & REQ_ATOMIC && !nvme_valid_atomic_write(req)) + return BLK_STS_INVAL; + cmnd->rw.opcode = op; cmnd->rw.nsid = cpu_to_le32(ns->head->ns_id); cmnd->rw.slba = cpu_to_le64(nvme_sect_to_lba(ns, blk_rq_pos(req))); @@ -2011,6 +2042,26 @@ static int nvme_configure_metadata(struct nvme_ns *ns, struct nvme_id_ns *id) return 0; } +static void nvme_update_atomic_write_disk_info(struct nvme_ns *ns, + struct nvme_id_ns *id, struct queue_limits *lim, + u32 bs, u32 atomic_bs) +{ + unsigned int boundary = 0; + + if (id->nsfeat & NVME_NS_FEAT_ATOMICS && id->nawupf) { + if (le16_to_cpu(id->nabspf)) + boundary = (le16_to_cpu(id->nabspf) + 1) * bs; + } + + if (!lim->aw_limits) + return; + + lim->aw_limits->atomic_write_hw_max = atomic_bs; + lim->aw_limits->atomic_write_hw_boundary = boundary; + lim->aw_limits->atomic_write_hw_unit_min = bs; + lim->aw_limits->atomic_write_hw_unit_max = rounddown_pow_of_two(atomic_bs); +} + static void nvme_set_queue_limits(struct nvme_ctrl *ctrl, struct request_queue *q) { @@ -2060,6 +2111,9 @@ static void nvme_update_disk_info(struct gendisk *disk, atomic_bs = (1 + le16_to_cpu(id->nawupf)) * bs; else atomic_bs = (1 + ns->ctrl->subsys->awupf) * bs; + + nvme_update_atomic_write_disk_info(ns, id, &disk->queue->limits, + bs, atomic_bs); } if (id->nsfeat & NVME_NS_FEAT_IO_OPT) { @@ -2160,6 +2214,7 @@ static int nvme_update_ns_info(struct nvme_ns *ns, struct nvme_id_ns *id) goto out_unfreeze; nvme_set_chunk_sectors(ns, id); nvme_update_disk_info(ns->disk, ns, id); + nvme_set_queue_limits(ns->ctrl, ns->queue); blk_mq_unfreeze_queue(ns->disk->queue); if (blk_queue_is_zoned(ns->queue)) { @@ -2172,6 +2227,7 @@ static int nvme_update_ns_info(struct nvme_ns *ns, struct nvme_id_ns *id) if (ns->head->disk) { blk_mq_freeze_queue(ns->head->disk->queue); nvme_update_disk_info(ns->head->disk, ns, id); + nvme_set_queue_limits(ns->ctrl, ns->head->disk->queue); blk_stack_limits(&ns->head->disk->queue->limits, &ns->queue->limits, 0); blk_queue_update_readahead(ns->head->disk->queue); diff --git a/drivers/scsi/cxlflash/vlun.c b/drivers/scsi/cxlflash/vlun.c index f1406ac77b0d595b3cae330e53646ee2d69e70c4..ea0a08a1696a018cddbd7d008bceb99fb0a492d7 100644 --- a/drivers/scsi/cxlflash/vlun.c +++ b/drivers/scsi/cxlflash/vlun.c @@ -430,8 +430,8 @@ static int write_same16(struct scsi_device *sdev, struct device *dev = &cfg->dev->dev; const u32 s = ilog2(sdev->sector_size) - 9; const u32 to = sdev->request_queue->rq_timeout; - const u32 ws_limit = blk_queue_get_max_sectors(sdev->request_queue, - REQ_OP_WRITE_SAME) >> s; + const u32 ws_limit = + sdev->request_queue->limits.max_write_zeroes_sectors >> s; cmd_buf = kzalloc(CMD_BUFSIZE, GFP_KERNEL); scsi_cmd = kzalloc(MAX_COMMAND_SIZE, GFP_KERNEL); diff --git a/fs/aio.c b/fs/aio.c index 00641a1ad0b3ff0b1d4df7f1c47e25a09a245068..78aaeaf354362bdde67715ceaa9843dfac6e67a6 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -1458,7 +1458,7 @@ static void aio_complete_rw(struct kiocb *kiocb, long res, long res2) iocb_put(iocb); } -static int aio_prep_rw(struct kiocb *req, const struct iocb *iocb) +static int aio_prep_rw(struct kiocb *req, const struct iocb *iocb, int rw_type) { int ret; @@ -1485,7 +1485,7 @@ static int aio_prep_rw(struct kiocb *req, const struct iocb *iocb) } else req->ki_ioprio = get_current_ioprio(); - ret = kiocb_set_rw_flags(req, iocb->aio_rw_flags); + ret = kiocb_set_rw_flags(req, iocb->aio_rw_flags, rw_type); if (unlikely(ret)) return ret; @@ -1537,7 +1537,7 @@ static int aio_read(struct kiocb *req, const 
struct iocb *iocb, struct file *file; int ret; - ret = aio_prep_rw(req, iocb); + ret = aio_prep_rw(req, iocb, READ); if (ret) return ret; file = req->ki_filp; @@ -1565,7 +1565,7 @@ static int aio_write(struct kiocb *req, const struct iocb *iocb, struct file *file; int ret; - ret = aio_prep_rw(req, iocb); + ret = aio_prep_rw(req, iocb, WRITE); if (ret) return ret; file = req->ki_filp; diff --git a/fs/block_dev.c b/fs/block_dev.c index b9104fc0a395d29773d765b2bd863452e4488aba..84060c7d80a15179ac5c8a4dc223ddc9aee71a46 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -272,6 +272,9 @@ __blkdev_direct_IO_simple(struct kiocb *iocb, struct iov_iter *iter, bio.bi_end_io = blkdev_bio_end_io_simple; bio.bi_ioprio = iocb->ki_ioprio; + if (iocb->ki_flags & IOCB_ATOMIC) + bio.bi_opf |= REQ_ATOMIC; + ret = bio_iov_iter_get_pages(&bio, iter); if (unlikely(ret)) goto out; @@ -452,6 +455,10 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages) bio->bi_opf = dio_bio_write_op(iocb); task_io_account_write(bio->bi_iter.bi_size); } + + if (iocb->ki_flags & IOCB_ATOMIC) + bio->bi_opf |= REQ_ATOMIC; + if (iocb->ki_flags & IOCB_NOWAIT) bio->bi_opf |= REQ_NOWAIT; @@ -521,14 +528,30 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages) return ret; } +static bool blkdev_atomic_write_valid(struct block_device *bdev, loff_t pos, + struct iov_iter *iter) +{ + struct request_queue *q = bdev_get_queue(bdev); + unsigned int min_bytes = queue_atomic_write_unit_min_bytes(q); + unsigned int max_bytes = queue_atomic_write_unit_max_bytes(q); + + return generic_atomic_write_valid(pos, iter, min_bytes, max_bytes); +} + static ssize_t blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter) { int nr_pages; + bool is_atomic = iocb->ki_flags & IOCB_ATOMIC; + struct block_device *bdev = I_BDEV(iocb->ki_filp->f_mapping->host); nr_pages = iov_iter_npages(iter, BIO_MAX_PAGES + 1); if (!nr_pages) return 0; + + if (is_atomic && !blkdev_atomic_write_valid(bdev, iocb->ki_pos, iter)) + return -EINVAL; + if (is_sync_kiocb(iocb) && nr_pages <= BIO_MAX_PAGES) return __blkdev_direct_IO_simple(iocb, iter, nr_pages); @@ -1863,6 +1886,7 @@ EXPORT_SYMBOL(blkdev_get_by_dev); static int blkdev_open(struct inode * inode, struct file * filp) { struct block_device *bdev; + int err; /* * Preserve backwards compatibility and allow large file access @@ -1888,7 +1912,11 @@ static int blkdev_open(struct inode * inode, struct file * filp) filp->f_mapping = bdev->bd_inode->i_mapping; filp->f_wb_err = filemap_sample_wb_err(filp->f_mapping); - return blkdev_get(bdev, filp->f_mode, filp); + err = blkdev_get(bdev, filp->f_mode, filp); + if (!err && bdev_can_atomic_write(bdev) && filp->f_flags & O_DIRECT) + filp->f_mode |= FMODE_CAN_ATOMIC_WRITE; + + return err; } static void __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part) diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index 25448d5827d259d63743e8cefc042b2d5338fe4e..d4778d251d41a6e5b505a2af6a3b6b9dd94d2ad3 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -17,6 +17,7 @@ #include <linux/bio.h> #include <linux/sched/signal.h> #include <linux/migrate.h> +#include <linux/math64.h> #include "trace.h" #include "../internal.h" @@ -1046,11 +1047,10 @@ iomap_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero, EXPORT_SYMBOL_GPL(iomap_zero_range); int -iomap_truncate_page(struct inode *inode, loff_t pos, bool *did_zero, - const struct iomap_ops *ops) +iomap_truncate_page(struct inode *inode, loff_t pos, unsigned int blocksize, + bool *did_zero, const struct iomap_ops *ops)
{ - unsigned int blocksize = i_blocksize(inode); - unsigned int off = pos & (blocksize - 1); + unsigned int off = rem_u64(pos, blocksize); /* Block boundary? Nothing to do */ if (!off) diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c index 892a4f8109e50d32df2b6cc55126156bae8c65fd..063ea9eb8afb21534f8afb1db5a4e0244621516a 100644 --- a/fs/iomap/direct-io.c +++ b/fs/iomap/direct-io.c @@ -210,15 +210,22 @@ iomap_dio_zero(struct iomap_dio *dio, struct iomap *iomap, loff_t pos, struct page *page = ZERO_PAGE(0); int flags = REQ_SYNC | REQ_IDLE; struct bio *bio; + unsigned size; + unsigned nr_pages = (len + PAGE_SIZE - 1) >> PAGE_SHIFT; - bio = bio_alloc(GFP_KERNEL, 1); + bio = bio_alloc(GFP_KERNEL, nr_pages); bio_set_dev(bio, iomap->bdev); bio->bi_iter.bi_sector = iomap_sector(iomap, pos); bio->bi_private = dio; bio->bi_end_io = iomap_dio_bio_end_io; - get_page(page); - __bio_add_page(bio, page, len, 0); + while (len > 0) { + size = len > PAGE_SIZE ? PAGE_SIZE : len; + get_page(page); + __bio_add_page(bio, page, size, 0); + len -= size; + pos += size; + } bio_set_op_attrs(bio, REQ_OP_WRITE, flags); iomap_dio_submit_bio(dio, iomap, bio, pos); } @@ -227,8 +234,9 @@ static loff_t iomap_dio_bio_actor(struct inode *inode, loff_t pos, loff_t length, struct iomap_dio *dio, struct iomap *iomap) { + bool is_atomic = dio->iocb->ki_flags & IOCB_ATOMIC; unsigned int blkbits = blksize_bits(bdev_logical_block_size(iomap->bdev)); - unsigned int fs_block_size = i_blocksize(inode), pad; + unsigned int zeroing_size, pad; unsigned int align = iov_iter_alignment(dio->submit.iter); struct bio *bio; bool need_zeroout = false; @@ -237,6 +245,8 @@ iomap_dio_bio_actor(struct inode *inode, loff_t pos, loff_t length, size_t copied = 0; size_t orig_count; + zeroing_size = i_blocksize(inode) << iomap->extent_shift; + if ((pos | length | align) & ((1 << blkbits) - 1)) return -EINVAL; @@ -280,7 +290,7 @@ iomap_dio_bio_actor(struct inode *inode, loff_t pos, loff_t length, if (need_zeroout) { /* zero out from the start of the block to the write offset */ - pad = pos & (fs_block_size - 1); + pad = pos & (zeroing_size - 1); if (pad) iomap_dio_zero(dio, iomap, pos - pad, pad); } @@ -298,6 +308,7 @@ iomap_dio_bio_actor(struct inode *inode, loff_t pos, loff_t length, bio->bi_iter.bi_sector = iomap_sector(iomap, pos); bio->bi_write_hint = dio->iocb->ki_hint; bio->bi_ioprio = dio->iocb->ki_ioprio; + bio->bi_private = dio; bio->bi_end_io = iomap_dio_bio_end_io; @@ -314,8 +325,16 @@ iomap_dio_bio_actor(struct inode *inode, loff_t pos, loff_t length, } n = bio->bi_iter.bi_size; + if (is_atomic && (n != orig_count)) { + /* This bio should have covered the complete length */ + ret = -EINVAL; + bio_put(bio); + goto out; + } if (dio->flags & IOMAP_DIO_WRITE) { bio->bi_opf = REQ_OP_WRITE | REQ_SYNC | REQ_IDLE; + if (is_atomic) + bio->bi_opf |= REQ_ATOMIC; if (use_fua) bio->bi_opf |= REQ_FUA; else @@ -345,9 +364,9 @@ iomap_dio_bio_actor(struct inode *inode, loff_t pos, loff_t length, if (need_zeroout || ((dio->flags & IOMAP_DIO_WRITE) && pos >= i_size_read(inode))) { /* zero out from the end of the write to the end of the block */ - pad = pos & (fs_block_size - 1); + pad = pos & (zeroing_size - 1); if (pad) - iomap_dio_zero(dio, iomap, pos, fs_block_size - pad); + iomap_dio_zero(dio, iomap, pos, zeroing_size - pad); } out: /* Undo iter limitation to current extent */ diff --git a/fs/read_write.c b/fs/read_write.c index 371a5a76f30e05d4728480012c07ec5c3bcb9661..da03b3e65cf3be6ab98bc26302e8ac9109ebef8f 100644 --- a/fs/read_write.c +++ 
b/fs/read_write.c @@ -726,7 +726,7 @@ static ssize_t do_iter_readv_writev(struct file *filp, struct iov_iter *iter, ssize_t ret; init_sync_kiocb(&kiocb, filp); - ret = kiocb_set_rw_flags(&kiocb, flags); + ret = kiocb_set_rw_flags(&kiocb, flags, type); if (ret) return ret; kiocb.ki_pos = (ppos ? *ppos : 0); diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 15e9e335d1672823c613f76fda6c926eb0cabd95..704d457eed9345f4ae8d618504b5de0a9884f0f8 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -3487,6 +3487,18 @@ xfs_bmap_btalloc( args.fsbno = ap->blkno; args.oinfo = XFS_RMAP_OINFO_SKIP_UPDATE; + /* + * xfs_get_cowextsz_hint() returns the extent size hint when forcealign + * is set, since forcealign and the CoW extent size hint are mutually + * exclusive. + */ + if (xfs_inode_forcealign(ap->ip) && align) { + args.alignment = align; + if (stripe_align == 0 || stripe_align % align) + stripe_align = align; + } else { + args.alignment = 1; + } + /* Trim the allocation back to the maximum an AG can fit. */ args.maxlen = min(ap->length, mp->m_ag_max_usable); blen = 0; @@ -3577,7 +3589,6 @@ xfs_bmap_btalloc( args.minalignslop = 0; } } else { - args.alignment = 1; args.minalignslop = 0; } args.postallocs = 1; @@ -3604,7 +3615,8 @@ xfs_bmap_btalloc( if ((error = xfs_alloc_vextent(&args))) return error; } - if (isaligned && args.fsbno == NULLFSBLOCK) { + + if (isaligned && args.fsbno == NULLFSBLOCK && args.alignment <= 1) { /* * allocation failed, so turn off alignment and * try again. @@ -5276,6 +5288,12 @@ __xfs_bunmapi( XFS_STATS_INC(mp, xs_blk_unmap); isrt = (whichfork == XFS_DATA_FORK) && XFS_IS_REALTIME_INODE(ip); end = start + len; + if (xfs_inode_forcealign(ip) && ip->i_d.di_extsize > 1 + && S_ISREG(VFS_I(ip)->i_mode)) { + start = roundup_64(start, ip->i_d.di_extsize); + end = rounddown_64(end, ip->i_d.di_extsize); + len = end - start; + } if (!xfs_iext_lookup_extent_before(ip, ifp, &end, &icur, &got)) { *rlen = 0; diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h index 54832df8540f85b706389dcf996cff6e1ad6f520..8e4d4959588410eb528d180be3943d3dbf49bb26 100644 --- a/fs/xfs/libxfs/xfs_format.h +++ b/fs/xfs/libxfs/xfs_format.h @@ -353,11 +353,17 @@ xfs_sb_has_compat_feature( #define XFS_SB_FEAT_RO_COMPAT_RMAPBT (1 << 1) /* reverse map btree */ #define XFS_SB_FEAT_RO_COMPAT_REFLINK (1 << 2) /* reflinked files */ #define XFS_SB_FEAT_RO_COMPAT_INOBTCNT (1 << 3) /* inobt block counts */ +#define XFS_SB_FEAT_RO_COMPAT_FORCEALIGN (1 << 30) /* aligned file data extents */ +#define XFS_SB_FEAT_RO_COMPAT_ATOMICWRITES (1 << 31) /* atomicwrites enabled */ + #define XFS_SB_FEAT_RO_COMPAT_ALL \ (XFS_SB_FEAT_RO_COMPAT_FINOBT | \ XFS_SB_FEAT_RO_COMPAT_RMAPBT | \ XFS_SB_FEAT_RO_COMPAT_REFLINK| \ - XFS_SB_FEAT_RO_COMPAT_INOBTCNT) + XFS_SB_FEAT_RO_COMPAT_INOBTCNT| \ + XFS_SB_FEAT_RO_COMPAT_FORCEALIGN| \ + XFS_SB_FEAT_RO_COMPAT_ATOMICWRITES) + #define XFS_SB_FEAT_RO_COMPAT_UNKNOWN ~XFS_SB_FEAT_RO_COMPAT_ALL static inline bool xfs_sb_has_ro_compat_feature( @@ -972,15 +978,20 @@ static inline void xfs_dinode_put_rdev(struct xfs_dinode *dip, xfs_dev_t rdev) #define XFS_DIFLAG2_REFLINK_BIT 1 /* file's blocks may be shared */ #define XFS_DIFLAG2_COWEXTSIZE_BIT 2 /* copy on write extent size hint */ #define XFS_DIFLAG2_BIGTIME_BIT 3 /* big timestamps */ +/* data extent mappings for regular files must be aligned to extent size hint */ +#define XFS_DIFLAG2_FORCEALIGN_BIT 5 +#define XFS_DIFLAG2_ATOMICWRITES_BIT 6 #define XFS_DIFLAG2_DAX (1 << XFS_DIFLAG2_DAX_BIT) #define XFS_DIFLAG2_REFLINK (1
<< XFS_DIFLAG2_REFLINK_BIT) #define XFS_DIFLAG2_COWEXTSIZE (1 << XFS_DIFLAG2_COWEXTSIZE_BIT) #define XFS_DIFLAG2_BIGTIME (1 << XFS_DIFLAG2_BIGTIME_BIT) +#define XFS_DIFLAG2_FORCEALIGN (1 << XFS_DIFLAG2_FORCEALIGN_BIT) +#define XFS_DIFLAG2_ATOMICWRITES (1 << XFS_DIFLAG2_ATOMICWRITES_BIT) #define XFS_DIFLAG2_ANY \ (XFS_DIFLAG2_DAX | XFS_DIFLAG2_REFLINK | XFS_DIFLAG2_COWEXTSIZE | \ - XFS_DIFLAG2_BIGTIME) + XFS_DIFLAG2_BIGTIME | XFS_DIFLAG2_FORCEALIGN | XFS_DIFLAG2_ATOMICWRITES) static inline bool xfs_dinode_has_bigtime(const struct xfs_dinode *dip) { diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c index 0970ae3fe5382770e5e675ba88de009c5370a04b..dd9e5de65d52a48434278c6cc7aed8f9f20f7d9d 100644 --- a/fs/xfs/libxfs/xfs_inode_buf.c +++ b/fs/xfs/libxfs/xfs_inode_buf.c @@ -574,6 +574,14 @@ xfs_dinode_verify( !xfs_has_bigtime(mp)) return __this_address; + if (flags2 & XFS_DIFLAG2_FORCEALIGN) { + fa = xfs_inode_validate_forcealign(mp, mode, flags, + be32_to_cpu(dip->di_extsize), + be32_to_cpu(dip->di_cowextsize)); + if (fa) + return fa; + } + return NULL; } @@ -699,3 +707,35 @@ xfs_inode_validate_cowextsize( return NULL; } + +/* Validate the forcealign inode flag */ +xfs_failaddr_t +xfs_inode_validate_forcealign( + struct xfs_mount *mp, + uint16_t mode, + uint16_t flags, + uint32_t extsize, + uint32_t cowextsize) +{ + /* superblock rocompat feature flag */ + if (!xfs_has_forcealign(mp)) + return __this_address; + + /* Only regular files and directories */ + if (!S_ISDIR(mode) && !S_ISREG(mode)) + return __this_address; + + /* Doesn't apply to realtime files */ + if (flags & XFS_DIFLAG_REALTIME) + return __this_address; + + /* Requires a non-zero power-of-2 extent size hint */ + if (extsize == 0 || !is_power_of_2(extsize)) + return __this_address; + + /* Requires no cow extent size hint */ + if (cowextsize != 0) + return __this_address; + + return NULL; +} diff --git a/fs/xfs/libxfs/xfs_inode_buf.h b/fs/xfs/libxfs/xfs_inode_buf.h index 05c3640e135a29a3b4f207769095b44ff045c398..1bcf1415a4b5f5c8ec1333b6f552228a70cddcf4 100644 --- a/fs/xfs/libxfs/xfs_inode_buf.h +++ b/fs/xfs/libxfs/xfs_inode_buf.h @@ -62,6 +62,9 @@ xfs_failaddr_t xfs_inode_validate_extsize(struct xfs_mount *mp, xfs_failaddr_t xfs_inode_validate_cowextsize(struct xfs_mount *mp, uint32_t cowextsize, uint16_t mode, uint16_t flags, uint64_t flags2); +xfs_failaddr_t xfs_inode_validate_forcealign(struct xfs_mount *mp, + uint16_t mode, uint16_t flags, uint32_t extsize, + uint32_t cowextsize); static inline uint64_t xfs_inode_encode_bigtime(struct timespec64 tv) { diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c index c099ccf2787dbdefeed8584389c51ac40ec146b3..a4354504986ce61be4c528e7205284268abf21ac 100644 --- a/fs/xfs/libxfs/xfs_sb.c +++ b/fs/xfs/libxfs/xfs_sb.c @@ -116,6 +116,10 @@ xfs_sb_version_to_features( features |= XFS_FEAT_REFLINK; if (sbp->sb_features_ro_compat & XFS_SB_FEAT_RO_COMPAT_INOBTCNT) features |= XFS_FEAT_INOBTCNT; + if (sbp->sb_features_ro_compat & XFS_SB_FEAT_RO_COMPAT_FORCEALIGN) + features |= XFS_FEAT_FORCEALIGN; + if (sbp->sb_features_ro_compat & XFS_SB_FEAT_RO_COMPAT_ATOMICWRITES) + features |= XFS_FEAT_ATOMICWRITES; if (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_FTYPE) features |= XFS_FEAT_FTYPE; if (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_SPINODES) diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index edf62092125c62a932fe846f8e3492e464b86ccd..5879f03b8660809f1be3945d55cf040ad71e56d8 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ 
-654,6 +654,9 @@ xfs_free_eofblocks( * of the file. If not, then there is nothing to do. */ end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)XFS_ISIZE(ip)); + /* Do not free blocks when forcing extent sizes */ + if (xfs_get_extsz(ip) > 1) + end_fsb = roundup_64(end_fsb, xfs_get_extsz(ip)); last_fsb = XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes); if (last_fsb <= end_fsb) return 0; @@ -925,8 +928,11 @@ xfs_free_file_space( startoffset_fsb = XFS_B_TO_FSB(mp, offset); endoffset_fsb = XFS_B_TO_FSBT(mp, offset + len); - /* We can only free complete realtime extents. */ - if (XFS_IS_REALTIME_INODE(ip) && mp->m_sb.sb_rextsize > 1) { + /* Free only complete extents. */ + if (xfs_inode_forcealign(ip) && ip->i_d.di_extsize > 1) { + startoffset_fsb = roundup_64(startoffset_fsb, ip->i_d.di_extsize); + endoffset_fsb = rounddown_64(endoffset_fsb, ip->i_d.di_extsize); + } else if (XFS_IS_REALTIME_INODE(ip) && mp->m_sb.sb_rextsize > 1) { startoffset_fsb = roundup_64(startoffset_fsb, mp->m_sb.sb_rextsize); endoffset_fsb = rounddown_64(endoffset_fsb, diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 52643eac5d46ee692b1d171eec8effaa511e503c..10a829ffd908d131169b16e753aa5c3a4651152f 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -60,7 +60,10 @@ xfs_is_falloc_aligned( } mask = XFS_FSB_TO_B(mp, mp->m_sb.sb_rextsize) - 1; } else { - mask = mp->m_sb.sb_blocksize - 1; + if (xfs_inode_forcealign(ip) && ip->i_d.di_extsize > 1) + mask = (mp->m_sb.sb_blocksize * ip->i_d.di_extsize) - 1; + else + mask = mp->m_sb.sb_blocksize - 1; } return !((pos | len) & mask); @@ -586,6 +589,14 @@ xfs_file_dio_aio_write( size_t count = iov_iter_count(from); struct xfs_buftarg *target = xfs_inode_buftarg(ip); + if (iocb->ki_flags & IOCB_ATOMIC) { + if (!generic_atomic_write_valid(iocb->ki_pos, from, + i_blocksize(inode), + XFS_FSB_TO_B(mp, xfs_get_extsz(ip)))) { + return -EINVAL; + } + } + /* DIO must be aligned to device logical sector size */ if ((iocb->ki_pos | count) & target->bt_logical_sectormask) return -EINVAL; @@ -597,8 +608,8 @@ xfs_file_dio_aio_write( * the inode as necessary for EOF zeroing cases and fill out the new * inode size as appropriate. */ - if ((iocb->ki_pos & mp->m_blockmask) || - ((iocb->ki_pos + count) & mp->m_blockmask)) { + if ((iocb->ki_pos & (XFS_FSB_TO_B(mp, xfs_get_extsz(ip)) - 1)) || + ((iocb->ki_pos + count) & (XFS_FSB_TO_B(mp, xfs_get_extsz(ip)) - 1))) { unaligned_io = 1; /* @@ -803,6 +814,9 @@ xfs_file_write_iter( return xfs_file_dax_write(iocb, from); if (iocb->ki_flags & IOCB_DIRECT) { + + if (xfs_inode_atomicwrites(ip)) + iocb->ki_flags |= IOCB_ATOMIC; /* * Allow a directio write to fall back to a buffered * write *only* in the case that we're doing a reflink @@ -1179,6 +1193,25 @@ xfs_file_remap_range( return remapped > 0 ? 
remapped : ret; } +static bool xfs_file_open_can_atomicwrite( + struct inode *inode, + struct file *file) +{ + struct xfs_inode *ip = XFS_I(inode); + struct xfs_buftarg *target = xfs_inode_buftarg(ip); + + if (!(file->f_flags & O_DIRECT)) + return false; + + if (!xfs_inode_atomicwrites(ip)) + return false; + + if (!bdev_can_atomic_write(target->bt_bdev)) + return false; + + return true; +} + STATIC int xfs_file_open( struct inode *inode, @@ -1189,6 +1222,8 @@ xfs_file_open( if (xfs_is_shutdown(XFS_M(inode->i_sb))) return -EIO; file->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC; + if (xfs_file_open_can_atomicwrite(inode, file)) + file->f_mode |= FMODE_CAN_ATOMIC_WRITE; return 0; } diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 268bbc2d978b8cb2d4d052905dfe0d63f7d9ace7..80e1503ca8c4102343b91e35722d7e9b8b3c6500 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -68,6 +68,20 @@ xfs_get_extsz_hint( return 0; } +/* + * Helper function to extract extent size. It will return a power-of-2, + * as forcealign requires this. + */ +xfs_extlen_t +xfs_get_extsz( + struct xfs_inode *ip) +{ + if (xfs_inode_forcealign(ip) && ip->i_d.di_extsize) + return ip->i_d.di_extsize; + + return 1; +} + /* * Helper function to extract CoW extent size hint from inode. * Between the extent size hint and the CoW extent size hint, we @@ -643,6 +657,10 @@ _xfs_dic2xflags( flags |= FS_XFLAG_DAX; if (di_flags2 & XFS_DIFLAG2_COWEXTSIZE) flags |= FS_XFLAG_COWEXTSIZE; + if (di_flags2 & XFS_DIFLAG2_FORCEALIGN) + flags |= FS_XFLAG_FORCEALIGN; + if (di_flags2 & XFS_DIFLAG2_ATOMICWRITES) + flags |= FS_XFLAG_ATOMICWRITES; } if (has_attr) @@ -759,6 +777,18 @@ xfs_inode_inherit_flags2( } if (pip->i_d.di_flags2 & XFS_DIFLAG2_DAX) ip->i_d.di_flags2 |= XFS_DIFLAG2_DAX; + if (pip->i_d.di_flags2 & XFS_DIFLAG2_FORCEALIGN) + ip->i_d.di_flags2 |= XFS_DIFLAG2_FORCEALIGN; + + if (ip->i_d.di_flags2 & XFS_DIFLAG2_FORCEALIGN) { + xfs_failaddr_t failaddr; + + failaddr = xfs_inode_validate_forcealign(ip->i_mount, + VFS_I(ip)->i_mode, ip->i_d.di_flags, ip->i_d.di_extsize, + ip->i_d.di_cowextsize); + if (failaddr) + ip->i_d.di_flags2 &= ~XFS_DIFLAG2_FORCEALIGN; + } } /* @@ -3956,3 +3986,16 @@ xfs_iunlock2_io_mmap( if (!same_inode) inode_unlock(VFS_I(ip1)); } + +/* Returns the size of fundamental allocation unit for a file, in bytes. */ +unsigned int +xfs_inode_alloc_unitsize( + struct xfs_inode *ip) +{ + unsigned int blocks = 1; + + if (XFS_IS_REALTIME_INODE(ip)) + blocks = ip->i_mount->m_sb.sb_rextsize; + + return XFS_FSB_TO_B(ip->i_mount, blocks); +} diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index b552daae323fcdcac75abddcc069285ab46f5cbd..818f7622d8518a7eaa8c76170591110fe0484059 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -268,6 +268,16 @@ static inline bool xfs_inode_has_bigtime(struct xfs_inode *ip) return ip->i_d.di_flags2 & XFS_DIFLAG2_BIGTIME; } +static inline bool xfs_inode_forcealign(struct xfs_inode *ip) +{ + return ip->i_d.di_flags2 & XFS_DIFLAG2_FORCEALIGN; +} + +static inline bool xfs_inode_atomicwrites(struct xfs_inode *ip) +{ + return ip->i_d.di_flags2 & XFS_DIFLAG2_ATOMICWRITES; +} + /* * Return the buftarg used for data allocations on a given inode. 
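Putting the forcealign pieces together: with a power-of-two extent size hint, the falloc alignment check collapses to a mask test, as in xfs_is_falloc_aligned() shown earlier. A small standalone illustration with hypothetical sizes:

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    /*
     * Mirrors the mask test in xfs_is_falloc_aligned() for a forcealign
     * inode: pos and len must be multiples of blocksize * extsize, and
     * because both factors are powers of two a mask test suffices.
     */
    static bool falloc_aligned(uint64_t pos, uint64_t len,
                               uint64_t blocksize, uint64_t extsize)
    {
            uint64_t mask = blocksize * extsize - 1;

            return !((pos | len) & mask);
    }

    int main(void)
    {
            /* 4 KiB blocks, 4-block extent size hint => 16 KiB units */
            printf("%d\n", (int)falloc_aligned(32768, 16384, 4096, 4)); /* 1 */
            printf("%d\n", (int)falloc_aligned(32768, 8192, 4096, 4));  /* 0 */
            return 0;
    }
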
*/ @@ -489,6 +499,7 @@ void xfs_lock_two_inodes(struct xfs_inode *ip0, uint ip0_mode, struct xfs_inode *ip1, uint ip1_mode); xfs_extlen_t xfs_get_extsz_hint(struct xfs_inode *ip); +xfs_extlen_t xfs_get_extsz(struct xfs_inode *ip); xfs_extlen_t xfs_get_cowextsz_hint(struct xfs_inode *ip); int xfs_dir_ialloc(struct xfs_trans **, struct xfs_inode *, umode_t, @@ -560,5 +571,6 @@ void xfs_end_io(struct work_struct *work); int xfs_ilock2_io_mmap(struct xfs_inode *ip1, struct xfs_inode *ip2); void xfs_iunlock2_io_mmap(struct xfs_inode *ip1, struct xfs_inode *ip2); +unsigned int xfs_inode_alloc_unitsize(struct xfs_inode *ip); #endif /* __XFS_INODE_H__ */ diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index 2337eb272235407a7e2f7b927097a52c6b8fb516..939b91124ce12677d8bb5dcfcca5a60558c0413a 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -1198,6 +1198,10 @@ xfs_flags2diflags2( di_flags2 |= XFS_DIFLAG2_DAX; if (xflags & FS_XFLAG_COWEXTSIZE) di_flags2 |= XFS_DIFLAG2_COWEXTSIZE; + if (xflags & FS_XFLAG_FORCEALIGN) + di_flags2 |= XFS_DIFLAG2_FORCEALIGN; + if (xflags & FS_XFLAG_ATOMICWRITES) + di_flags2 |= XFS_DIFLAG2_ATOMICWRITES; return di_flags2; } @@ -1210,10 +1214,12 @@ xfs_ioctl_setattr_xflags( { struct xfs_mount *mp = ip->i_mount; uint64_t di_flags2; + bool atomic_writes = fa->fsx_xflags & FS_XFLAG_ATOMICWRITES; - /* Can't change realtime flag if any extents are allocated. */ + /* Can't change realtime or atomic flag if any extents are allocated. */ if ((ip->i_df.if_nextents || ip->i_delayed_blks) && - XFS_IS_REALTIME_INODE(ip) != (fa->fsx_xflags & FS_XFLAG_REALTIME)) + (XFS_IS_REALTIME_INODE(ip) != (fa->fsx_xflags & FS_XFLAG_REALTIME) || + atomic_writes != xfs_inode_atomicwrites(ip))) return -EINVAL; /* If realtime flag is set then must have realtime device */ @@ -1236,6 +1242,29 @@ xfs_ioctl_setattr_xflags( if (di_flags2 && !xfs_has_v3inodes(mp)) return -EINVAL; + /* + * Force-align requires a nonzero extent size hint and a zero cow + * extent size hint. It doesn't apply to realtime files. 
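The constraints listed in the comment here can also be read as a simple predicate; a sketch mirroring the hint checks of xfs_inode_validate_forcealign():

    #include <assert.h>
    #include <stdbool.h>
    #include <stdint.h>

    /*
     * Sketch of the hint checks in xfs_inode_validate_forcealign(): the
     * extent size hint must be a nonzero power of two, and the CoW extent
     * size hint must be zero since the two are mutually exclusive.
     */
    static bool forcealign_hints_valid(uint32_t extsize, uint32_t cowextsize)
    {
            return extsize != 0 && (extsize & (extsize - 1)) == 0 &&
                   cowextsize == 0;
    }

    int main(void)
    {
            assert(forcealign_hints_valid(16, 0));  /* 16-block hint: OK */
            assert(!forcealign_hints_valid(12, 0)); /* not a power of two */
            assert(!forcealign_hints_valid(16, 4)); /* CoW hint must be zero */
            return 0;
    }
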
+ */ + if (fa->fsx_xflags & FS_XFLAG_FORCEALIGN) { + if (!xfs_has_forcealign(mp)) + return -EINVAL; + if (fa->fsx_xflags & FS_XFLAG_COWEXTSIZE) + return -EINVAL; + if (!(fa->fsx_xflags & (FS_XFLAG_EXTSIZE | + FS_XFLAG_EXTSZINHERIT))) + return -EINVAL; + if (fa->fsx_xflags & FS_XFLAG_REALTIME) + return -EINVAL; + } + + if (atomic_writes) { + if (!xfs_has_atomicwrites(mp)) + return -EINVAL; + if (!(fa->fsx_xflags & FS_XFLAG_FORCEALIGN)) + return -EINVAL; + } + ip->i_d.di_flags = xfs_flags2diflags(ip, fa->fsx_xflags); ip->i_d.di_flags2 = di_flags2; @@ -1339,6 +1368,9 @@ xfs_ioctl_setattr_check_extsize( struct xfs_mount *mp = ip->i_mount; xfs_extlen_t size; xfs_fsblock_t extsize_fsb; + xfs_failaddr_t failaddr; + uint16_t new_diflags; + uint16_t new_diflags2; if (S_ISREG(VFS_I(ip)->i_mode) && ip->i_df.if_nextents && ((ip->i_d.di_extsize << mp->m_sb.sb_blocklog) != fa->fsx_extsize)) @@ -1363,6 +1395,17 @@ xfs_ioctl_setattr_check_extsize( if (fa->fsx_extsize % size) return -EINVAL; + new_diflags = xfs_flags2diflags(ip, fa->fsx_xflags); + new_diflags2 = xfs_flags2diflags2(ip, fa->fsx_xflags); + if (new_diflags2 & XFS_DIFLAG2_FORCEALIGN) { + failaddr = xfs_inode_validate_forcealign(ip->i_mount, + VFS_I(ip)->i_mode, new_diflags, + XFS_B_TO_FSB(mp, fa->fsx_extsize), + XFS_B_TO_FSB(mp, fa->fsx_cowextsize)); + if (failaddr) + return -EINVAL; + } + return 0; } @@ -1606,6 +1649,10 @@ xfs_ioc_setxflags( } xfs_fill_fsxattr(ip, false, &old_fa); + fa.fsx_extsize = old_fa.fsx_extsize; + fa.fsx_cowextsize = old_fa.fsx_cowextsize; + fa.fsx_projid = old_fa.fsx_projid; + fa.fsx_nextents = old_fa.fsx_nextents; error = vfs_ioc_fssetxattr_check(VFS_I(ip), &old_fa, &fa); if (error) { xfs_trans_cancel(tp); @@ -2069,6 +2116,26 @@ xfs_fs_eofblocks_from_user( return 0; } +static int +xfs_ioc_set_atomic_write( + struct xfs_inode *ip) +{ + struct xfs_trans *tp; + int error; + + tp = xfs_ioctl_setattr_get_trans(ip, NULL); + if (IS_ERR(tp)) { + error = PTR_ERR(tp); + goto out; + } + + ip->i_d.di_flags2 |= XFS_DIFLAG2_ATOMICWRITES; + + error = xfs_trans_commit(tp); +out: + return error; +} + /* * Note: some of the ioctl's return positive numbers as a * byte count indicating success, such as readlink_by_handle. 
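For completeness, a hedged sketch of how an application might invoke the FS_IOC_SETATOMIC ioctl added below. The ioctl macro must come from the patched uapi headers, and the enable_atomic_writes() helper is hypothetical; per the handler, the ioctl takes no argument and requires the atomicwrites feature, a regular file, and FS_XFLAG_FORCEALIGN already set:

    /*
     * Assumes the FS_IOC_SETATOMIC definition from the patched uapi
     * headers; not a standard kernel interface at the time of writing.
     */
    #include <fcntl.h>
    #include <stdio.h>
    #include <sys/ioctl.h>
    #include <unistd.h>

    int enable_atomic_writes(const char *path)
    {
            int fd = open(path, O_RDWR | O_DIRECT);

            if (fd < 0)
                    return -1;
            if (ioctl(fd, FS_IOC_SETATOMIC) < 0) {
                    perror("FS_IOC_SETATOMIC");
                    close(fd);
                    return -1;
            }
            /* success + O_DIRECT => FMODE_CAN_ATOMIC_WRITE is set */
            return fd;
    }
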
@@ -2096,6 +2163,31 @@ xfs_file_ioctl( return xfs_ioc_getlabel(mp, arg); case FS_IOC_SETFSLABEL: return xfs_ioc_setlabel(filp, mp, arg); + case FS_IOC_SETATOMIC: + if (!xfs_has_atomicwrites(mp)) + return -EOPNOTSUPP; + if (!S_ISREG(inode->i_mode)) + return -EINVAL; + if (xfs_inode_atomicwrites(ip)) + return 0; + if (!xfs_inode_forcealign(ip)) + return -EINVAL; + + xfs_ilock(ip, XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL); + error = xfs_ioc_set_atomic_write(ip); + xfs_iunlock(ip, XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL); + if (error) { + xfs_alert(mp, "%s: failed to set atomic write flag on ino 0x%llx", + __func__, XFS_I(inode)->i_ino); + return error; + } else { + struct xfs_buftarg *target = xfs_inode_buftarg(ip); + + if ((filp->f_flags & O_DIRECT) && + bdev_can_atomic_write(target->bt_bdev)) + filp->f_mode |= FMODE_CAN_ATOMIC_WRITE; + return 0; + } case XFS_IOC_ALLOCSP: case XFS_IOC_FREESP: case XFS_IOC_ALLOCSP64: diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 76285db4aaec4a642901134d447a761bb32d010b..dc01689988ab68baf060aaf3ca768c7814e59407 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -90,6 +90,7 @@ xfs_bmbt_to_iomap( { struct xfs_mount *mp = ip->i_mount; struct xfs_buftarg *target = xfs_inode_buftarg(ip); + xfs_extlen_t extsz = xfs_get_extsz(ip); if (unlikely(!xfs_valid_startblock(ip, imap->br_startblock))) return xfs_alert_fsblock_zero(ip, imap); @@ -120,6 +121,8 @@ xfs_bmbt_to_iomap( iomap->validity_cookie = sequence_cookie; iomap->page_ops = &xfs_iomap_page_ops; + if (extsz > 1) + iomap->extent_shift = ffs(extsz) - 1; return 0; } @@ -167,7 +170,9 @@ xfs_eof_alignment( * If mounted with the "-o swalloc" option the alignment is * increased from the strip unit size to the stripe width. */ - if (mp->m_swidth && xfs_has_swalloc(mp)) + if (xfs_inode_forcealign(ip)) + align = xfs_get_extsz_hint(ip); + else if (mp->m_swidth && xfs_has_swalloc(mp)) align = mp->m_swidth; else if (mp->m_dalign) align = mp->m_dalign; @@ -544,11 +549,19 @@ xfs_iomap_write_unwritten( xfs_fsize_t i_size; uint resblks; int error; + xfs_extlen_t extsz = xfs_get_extsz(ip); trace_xfs_unwritten_convert(ip, offset, count); - offset_fsb = XFS_B_TO_FSBT(mp, offset); - count_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + count); + if (extsz > 1) { + xfs_extlen_t extsize_bytes = XFS_FSB_TO_B(mp, extsz); + + offset_fsb = XFS_B_TO_FSBT(mp, round_down(offset, extsize_bytes)); + count_fsb = XFS_B_TO_FSB(mp, round_up(offset + count, extsize_bytes)); + } else { + offset_fsb = XFS_B_TO_FSBT(mp, offset); + count_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + count); + } count_fsb = (xfs_filblks_t)(count_fsb - offset_fsb); /* diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index a527a544a684170dd3ce57c1ddf09ddb0b0dfcc5..b04f7ca882d452e8c5e6163e4db7755bc09c3044 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -769,6 +769,7 @@ xfs_setattr_size( int error; uint lock_flags = 0; bool did_zeroing = false; + bool write_back = false; ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL)); ASSERT(xfs_isilocked(ip, XFS_MMAPLOCK_EXCL)); @@ -805,21 +806,10 @@ xfs_setattr_size( */ inode_dio_wait(inode); - /* - * File data changes must be complete before we start the transaction to - * modify the inode. This needs to be done before joining the inode to - * the transaction because the inode cannot be unlocked once it is a - * part of the transaction. - * - * Start with zeroing any data beyond EOF that we may expose on file - * extension, or zeroing out the rest of the block on a downward - * truncate.
- */ - if (newsize > oldsize) { - trace_xfs_zero_eof(ip, oldsize, newsize - oldsize); - error = iomap_zero_range(inode, oldsize, newsize - oldsize, - &did_zeroing, &xfs_buffered_write_iomap_ops); - } else { + write_back = newsize > ip->i_d.di_size && oldsize != ip->i_d.di_size; + if (newsize < oldsize) { + unsigned int blocksize = xfs_inode_alloc_unitsize(ip); + /* * iomap won't detect a dirty page over an unwritten block (or a * cow block over a hole) and subsequently skips zeroing the * newly post-EOF portion of the page. Flush the new EOF to * convert the block before the pagecache truncate. */ error = filemap_write_and_wait_range(inode->i_mapping, newsize, - newsize); + roundup_64(newsize, blocksize) - 1); if (error) return error; - error = iomap_truncate_page(inode, newsize, &did_zeroing, - &xfs_buffered_write_iomap_ops); - } + error = iomap_truncate_page(inode, newsize, blocksize, + &did_zeroing, &xfs_buffered_write_iomap_ops); - if (error) - return error; + /* + * We are going to log the inode size change in this transaction + * so any previous writes that are beyond the on disk EOF and + * the new EOF that have not been written out need to be written + * here. If we do not write the data out, we expose ourselves + * to the null files problem. Note that this includes any block + * zeroing we did above; otherwise those blocks may not be + * zeroed after a crash. + */ + if (did_zeroing || write_back) { + error = filemap_write_and_wait_range(inode->i_mapping, + min_t(loff_t, ip->i_d.di_size, newsize), + roundup_64(newsize, blocksize) - 1); + if (error) + return error; + } - /* - * We've already locked out new page faults, so now we can safely remove - * pages from the page cache knowing they won't get refaulted until we - * drop the XFS_MMAP_EXCL lock after the extent manipulations are - * complete. The truncate_setsize() call also cleans partial EOF page - * PTEs on extending truncates and hence ensures sub-page block size - * filesystems are correctly handled, too. - * - * We have to do all the page cache truncate work outside the - * transaction context as the "lock" order is page lock->log space - * reservation as defined by extent allocation in the writeback path. - * Hence a truncate can fail with ENOMEM from xfs_trans_alloc(), but - * having already truncated the in-memory version of the file (i.e. made - * user visible changes). There's not much we can do about this, except - * to hope that the caller sees ENOMEM and retries the truncate - * operation. - * - * And we update in-core i_size and truncate page cache beyond newsize - * before writeback the [di_size, newsize] range, so we're guaranteed - * not to write stale data past the new EOF on truncate down. - */ - truncate_setsize(inode, newsize); + /* + * Update i_size after writing back to make sure the zeroed + * blocks have been written out, then drop the page cache range + * beyond the blocksize-aligned new EOF block. + * + * We've already locked out new page faults, so now we can + * safely remove pages from the page cache knowing they won't + * get refaulted until we drop the XFS_MMAP_EXCL lock after the + * extent manipulations are complete. + */ + i_size_write(inode, newsize); + truncate_pagecache(inode, roundup_64(newsize, blocksize)); + } else { + /* + * Start with zeroing any data beyond EOF that we may expose on + * file extension.
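Note that the truncate-down path above writes back and prunes the page cache at allocation-unit granularity rather than block granularity. The rounding involved, modelled in plain C with assumed sizes:

    #include <stdint.h>
    #include <stdio.h>

    /* roundup_64()-style rounding used by the truncate-down path */
    static uint64_t roundup64(uint64_t x, uint64_t align)
    {
            return (x + align - 1) / align * align;
    }

    int main(void)
    {
            uint64_t newsize = 70000; /* new EOF, assumed */
            uint64_t unit = 16384;    /* allocation unit: 4 x 4 KiB blocks */

            /* write back through 81919, then truncate the cache at 81920 */
            printf("writeback end %llu, pagecache truncate at %llu\n",
                   (unsigned long long)(roundup64(newsize, unit) - 1),
                   (unsigned long long)roundup64(newsize, unit));
            return 0;
    }
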
+ */ + if (newsize > oldsize) { + trace_xfs_zero_eof(ip, oldsize, newsize - oldsize); + error = iomap_zero_range(inode, oldsize, newsize - oldsize, + &did_zeroing, &xfs_buffered_write_iomap_ops); + if (error) + return error; + } - /* - * We are going to log the inode size change in this transaction so - * any previous writes that are beyond the on disk EOF and the new - * EOF that have not been written out need to be written here. If we - * do not write the data out, we expose ourselves to the null files - * problem. Note that this includes any block zeroing we did above; - * otherwise those blocks may not be zeroed after a crash. - */ - if (did_zeroing || - (newsize > ip->i_d.di_size && oldsize != ip->i_d.di_size)) { - error = filemap_write_and_wait_range(VFS_I(ip)->i_mapping, - ip->i_d.di_size, newsize - 1); - if (error) - return error; + /* + * The truncate_setsize() call also cleans partial EOF page + * PTEs on extending truncates and hence ensures sub-page block + * size filesystems are correctly handled, too. + */ + truncate_setsize(inode, newsize); + + if (did_zeroing || write_back) { + error = filemap_write_and_wait_range(inode->i_mapping, + ip->i_d.di_size, newsize - 1); + if (error) + return error; + } } error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, 0, 0, 0, &tp); diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index 21547ff97b5a6782fae4c06f1b2fdec74acf2748..888d6bf9bea7adf2dd30ba73b74e7b2238ce7ed8 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -274,6 +274,9 @@ typedef struct xfs_mount { #define XFS_FEAT_INOBTCNT (1ULL << 23) /* inobt block counts */ #define XFS_FEAT_BIGTIME (1ULL << 24) /* large timestamps */ #define XFS_FEAT_NEEDSREPAIR (1ULL << 25) /* needs xfs_repair */ +#define XFS_FEAT_FORCEALIGN (1ULL << 27) /* aligned file data extents */ +#define XFS_FEAT_ATOMICWRITES (1ULL << 28) /* atomic writes support */ + /* Mount features */ #define XFS_FEAT_NOATTR2 (1ULL << 48) /* disable attr2 creation */ @@ -336,6 +339,8 @@ __XFS_HAS_FEAT(realtime, REALTIME) __XFS_HAS_FEAT(inobtcounts, INOBTCNT) __XFS_HAS_FEAT(bigtime, BIGTIME) __XFS_HAS_FEAT(needsrepair, NEEDSREPAIR) +__XFS_HAS_FEAT(forcealign, FORCEALIGN) +__XFS_HAS_FEAT(atomicwrites, ATOMICWRITES) /* * Mount features diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 502fb08bfd3812fafea2775a24be40b24da83872..d43f76a4b99a433f17697cf44ed2507cef076d35 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -1639,6 +1639,7 @@ xfs_fc_fill_super( "DAX unsupported by block device. Turning off DAX."); xfs_mount_set_dax_mode(mp, XFS_DAX_NEVER); } + if (xfs_has_reflink(mp)) { xfs_alert(mp, "DAX and reflink cannot be used together!"); @@ -1657,6 +1658,14 @@ xfs_fc_fill_super( } } + if (xfs_has_forcealign(mp)) + xfs_warn(mp, +"EXPERIMENTAL forced data extent alignment feature in use. Use at your own risk!"); + + if (xfs_has_atomicwrites(mp)) + xfs_warn(mp, +"EXPERIMENTAL atomicwrites feature in use. Use at your own risk!"); + if (xfs_has_reflink(mp)) { if (mp->m_sb.sb_rblocks) { xfs_alert(mp, diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 1853ec569b72d2d6dd52f4f8730f9ac2ebf77370..996e03d50f41ae609e1491f6de6e1c2876ffca8f 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -133,6 +133,11 @@ typedef u8 __bitwise blk_status_t; */ #define BLK_STS_ZONE_ACTIVE_RESOURCE ((__force blk_status_t)16) +/* + * Invalid size or alignment. 
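From an application's point of view, an atomic write that violates the size or alignment limits is failed early with EINVAL (the BLK_STS_INVAL mapping added earlier) rather than silently split. A hedged sketch of issuing one atomic write; RWF_ATOMIC is not shown anywhere in this diff, and the value used below is the later mainline one, assumed purely for illustration:

    #define _GNU_SOURCE
    #include <fcntl.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <sys/uio.h>

    #ifndef RWF_ATOMIC
    #define RWF_ATOMIC 0x00000040 /* assumption: value from later mainline kernels */
    #endif

    int main(int argc, char **argv)
    {
            struct iovec iov;
            void *buf;
            int fd;

            if (argc < 2)
                    return 1;
            fd = open(argv[1], O_RDWR | O_DIRECT);
            if (fd < 0 || posix_memalign(&buf, 4096, 8192))
                    return 1;
            iov.iov_base = buf;
            iov.iov_len = 8192; /* multiple of unit_min, no larger than unit_max */
            /* a misaligned or oversized atomic write fails with EINVAL */
            if (pwritev2(fd, &iov, 1, 0, RWF_ATOMIC) < 0)
                    perror("pwritev2(RWF_ATOMIC)");
            return 0;
    }
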
+ */
+#define BLK_STS_INVAL  ((__force blk_status_t)19)
+
 /**
  * blk_path_error - returns true if error may be path related
  * @error: status the request was completed with
@@ -421,6 +426,7 @@ enum req_flag_bits {
     /* for driver use */
     __REQ_DRV,
     __REQ_SWAP,        /* swapping request. */
+    __REQ_ATOMIC,      /* for atomic write operations */
     __REQ_NR_BITS,     /* stops here */
 };
 
@@ -445,6 +451,7 @@ enum req_flag_bits {
 #define REQ_DRV        (1ULL << __REQ_DRV)
 #define REQ_SWAP       (1ULL << __REQ_SWAP)
+#define REQ_ATOMIC     (1ULL << __REQ_ATOMIC)
 
 #define REQ_FAILFAST_MASK \
     (REQ_FAILFAST_DEV | REQ_FAILFAST_TRANSPORT | REQ_FAILFAST_DRIVER)
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 50b4fd0a06873b1cb196ddf37be26e343c5c7ea4..31e8e98d499096c81b482389151e613a8f31cb67 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -323,6 +323,17 @@ enum blk_zoned_model {
     BLK_ZONED_HM,      /* Host-managed zoned block device */
 };
 
+struct queue_atomic_write_limits {
+    /* atomic write limits */
+    unsigned int       atomic_write_hw_max;
+    unsigned int       atomic_write_max_sectors;
+    unsigned int       atomic_write_hw_boundary;
+    unsigned int       atomic_write_hw_unit_min;
+    unsigned int       atomic_write_unit_min;
+    unsigned int       atomic_write_hw_unit_max;
+    unsigned int       atomic_write_unit_max;
+};
+
 struct queue_limits {
     unsigned long      bounce_pfn;
     unsigned long      seg_boundary_mask;
@@ -355,7 +366,11 @@ struct queue_limits {
     unsigned char      raid_partial_stripes_expensive;
     enum blk_zoned_model   zoned;
+#ifndef __GENKSYMS__
+    struct queue_atomic_write_limits *aw_limits;
+#else
     KABI_RESERVE(1)
+#endif
 };
 
 typedef int (*report_zones_cb)(struct blk_zone *zone, unsigned int idx,
@@ -1082,9 +1097,11 @@ static inline struct bio_vec req_bvec(struct request *rq)
     return mp_bvec_iter_bvec(rq->bio->bi_io_vec, rq->bio->bi_iter);
 }
 
-static inline unsigned int blk_queue_get_max_sectors(struct request_queue *q,
-                                                     int op)
+static inline unsigned int blk_queue_get_max_sectors(struct request *rq)
 {
+    struct request_queue *q = rq->q;
+    int op = req_op(rq);
+
     if (unlikely(op == REQ_OP_DISCARD || op == REQ_OP_SECURE_ERASE))
         return min(q->limits.max_discard_sectors, UINT_MAX >> SECTOR_SHIFT);
@@ -1095,6 +1112,9 @@ static inline unsigned int blk_queue_get_max_sectors(struct request_queue *q,
     if (unlikely(op == REQ_OP_WRITE_ZEROES))
         return q->limits.max_write_zeroes_sectors;
 
+    if (rq->cmd_flags & REQ_ATOMIC)
+        return q->limits.aw_limits->atomic_write_max_sectors;
+
     return q->limits.max_sectors;
 }
 
@@ -1132,10 +1152,10 @@ static inline unsigned int blk_rq_get_max_sectors(struct request *rq,
     if (!q->limits.chunk_sectors ||
         req_op(rq) == REQ_OP_DISCARD ||
         req_op(rq) == REQ_OP_SECURE_ERASE)
-        return blk_queue_get_max_sectors(q, req_op(rq));
+        return blk_queue_get_max_sectors(rq);
 
     return min(blk_max_size_offset(q, offset, 0),
-           blk_queue_get_max_sectors(q, req_op(rq)));
+           blk_queue_get_max_sectors(rq));
 }
 
 static inline unsigned int blk_rq_count_bios(struct request *rq)
@@ -1185,6 +1205,14 @@ extern void blk_queue_max_zone_append_sectors(struct request_queue *q,
 extern void blk_queue_physical_block_size(struct request_queue *, unsigned int);
 extern void blk_queue_alignment_offset(struct request_queue *q,
                                        unsigned int alignment);
+void blk_queue_atomic_write_max_bytes(struct request_queue *q,
+                                      unsigned int bytes);
+void blk_queue_atomic_write_boundary_bytes(struct request_queue *q,
+                                           unsigned int bytes);
+void blk_queue_atomic_write_unit_max_bytes(struct request_queue *q,
+                                           unsigned int bytes);
+void blk_queue_atomic_write_unit_min_bytes(struct request_queue *q,
+                                           unsigned int bytes);
 void blk_queue_update_readahead(struct request_queue *q);
 extern void blk_limits_io_min(struct queue_limits *limits, unsigned int min);
 extern void blk_queue_io_min(struct request_queue *q, unsigned int min);
@@ -1192,6 +1220,7 @@ extern void blk_limits_io_opt(struct queue_limits *limits, unsigned int opt);
 extern void blk_queue_io_opt(struct request_queue *q, unsigned int opt);
 extern void blk_set_queue_depth(struct request_queue *q, unsigned int depth);
 extern void blk_set_default_limits(struct queue_limits *lim);
+extern void blk_set_default_atomic_write_limits(struct queue_limits *lim);
 extern void blk_set_stacking_limits(struct queue_limits *lim);
 extern int blk_stack_limits(struct queue_limits *t, struct queue_limits *b,
                             sector_t offset);
@@ -1647,6 +1676,30 @@ static inline unsigned int bdev_max_active_zones(struct block_device *bdev)
     return 0;
 }
 
+static inline unsigned int
+queue_atomic_write_unit_max_bytes(const struct request_queue *q)
+{
+    return q->limits.aw_limits->atomic_write_unit_max;
+}
+
+static inline unsigned int
+queue_atomic_write_unit_min_bytes(const struct request_queue *q)
+{
+    return q->limits.aw_limits->atomic_write_unit_min;
+}
+
+static inline unsigned int
+queue_atomic_write_boundary_bytes(const struct request_queue *q)
+{
+    return q->limits.aw_limits->atomic_write_hw_boundary;
+}
+
+static inline unsigned int
+queue_atomic_write_max_bytes(const struct request_queue *q)
+{
+    return q->limits.aw_limits->atomic_write_max_sectors << SECTOR_SHIFT;
+}
+
 static inline int queue_dma_alignment(const struct request_queue *q)
 {
     return q ? q->dma_alignment : 511;
@@ -2100,4 +2153,24 @@ int fsync_bdev(struct block_device *bdev);
 struct super_block *freeze_bdev(struct block_device *bdev);
 int thaw_bdev(struct block_device *bdev, struct super_block *sb);
 
+static inline bool bdev_can_atomic_write(struct block_device *bdev)
+{
+    struct request_queue *bd_queue = bdev_get_queue(bdev);
+    struct queue_limits *limits = &bd_queue->limits;
+
+    if (!limits->aw_limits->atomic_write_unit_min)
+        return false;
+
+    if (bdev_is_partition(bdev)) {
+        sector_t bd_start_sect = bdev->bd_part->start_sect;
+        unsigned int alignment =
+            max(limits->aw_limits->atomic_write_unit_min,
+                limits->aw_limits->atomic_write_hw_boundary);
+
+        if (!IS_ALIGNED(bd_start_sect, alignment))
+            return false;
+    }
+
+    return true;
+}
+
 #endif /* _LINUX_BLKDEV_H */
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 382a0d4dd3dd3a4b9e3cfa4925bf1ee104615f8b..5e37ba8550834040a41706981851c55ce1a2168a 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -41,6 +41,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
@@ -184,6 +185,9 @@ typedef int (dio_iodone_t)(struct kiocb *iocb, loff_t offset,
 /* File supports async buffered reads */
 #define FMODE_BUF_RASYNC       ((__force fmode_t)0x40000000)
 
+/* File supports atomic writes */
+#define FMODE_CAN_ATOMIC_WRITE ((__force fmode_t)0x80000000)
+
 /* File mode control flag, expect random access pattern */
 #define FMODE_CTL_RANDOM       ((__force fmode_t)0x1000)
 
@@ -320,6 +324,7 @@ enum rw_hint {
 #define IOCB_SYNC      (__force int) RWF_SYNC
 #define IOCB_NOWAIT    (__force int) RWF_NOWAIT
 #define IOCB_APPEND    (__force int) RWF_APPEND
+#define IOCB_ATOMIC    (__force int) RWF_ATOMIC
 
 /* non-RWF related bits - start at 16 */
 #define IOCB_EVENTFD   (1 << 16)
@@ -3406,7 +3411,8 @@ static inline int iocb_flags(struct file *file)
     return res;
 }
 
-static inline int kiocb_set_rw_flags(struct kiocb *ki, rwf_t flags)
+static inline int kiocb_set_rw_flags(struct kiocb *ki, rwf_t flags,
+                                     int rw_type)
 {
     int kiocb_flags = 0;
 
@@ -3423,6 +3429,12 @@ static inline int kiocb_set_rw_flags(struct kiocb *ki, rwf_t flags)
             return -EOPNOTSUPP;
         kiocb_flags |= IOCB_NOIO;
     }
+    if (flags & RWF_ATOMIC) {
+        if (rw_type != WRITE)
+            return -EOPNOTSUPP;
+        if (!(ki->ki_filp->f_mode & FMODE_CAN_ATOMIC_WRITE))
+            return -EOPNOTSUPP;
+    }
     kiocb_flags |= (__force int) (flags & RWF_SUPPORTED);
     if (flags & RWF_SYNC)
         kiocb_flags |= IOCB_DSYNC;
@@ -3665,4 +3677,23 @@ static inline void fs_file_read_do_trace(struct kiocb *iocb)
     if (tracepoint_enabled(fs_file_read))
         fs_file_read_update_args_by_trace(iocb);
 }
+
+static inline
+bool generic_atomic_write_valid(loff_t pos, struct iov_iter *iter,
+                                unsigned int unit_min, unsigned int unit_max)
+{
+    size_t len = iov_iter_count(iter);
+
+    if (len < unit_min || len > unit_max)
+        return false;
+
+    if (!is_power_of_2(len))
+        return false;
+
+    if (!IS_ALIGNED(pos, len))
+        return false;
+
+    return true;
+}
+
 #endif /* _LINUX_FS_H */
diff --git a/include/linux/iomap.h b/include/linux/iomap.h
index 0965d5f12858e3d9546d0baa98ac5f128dd604f1..1b6e22741d43002e8ecbad1bc980da64101eb858 100644
--- a/include/linux/iomap.h
+++ b/include/linux/iomap.h
@@ -93,6 +93,7 @@ struct iomap {
     u64                 length; /* length of mapping, bytes */
     u16                 type;   /* type of mapping */
     u16                 flags;  /* flags for mapping */
+    unsigned int        extent_shift;
     struct block_device *bdev;  /* block device for I/O */
     struct dax_device   *dax_dev; /* dax_dev for dax operations */
     void                *inline_data;
@@ -206,8 +207,8 @@ int iomap_file_unshare(struct inode *inode, loff_t pos, loff_t len,
         const struct iomap_ops *ops);
 int iomap_zero_range(struct inode *inode, loff_t pos, loff_t len,
         bool *did_zero, const struct iomap_ops *ops);
-int iomap_truncate_page(struct inode *inode, loff_t pos, bool *did_zero,
-        const struct iomap_ops *ops);
+int iomap_truncate_page(struct inode *inode, loff_t pos, unsigned int blocksize,
+        bool *did_zero, const struct iomap_ops *ops);
 vm_fault_t iomap_page_mkwrite(struct vm_fault *vmf,
         const struct iomap_ops *ops);
 int iomap_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
diff --git a/include/linux/math64.h b/include/linux/math64.h
index 66deb1fdc2ef641d1c4a917c212585be80d94751..b5c4d1df08e522248bca845f84f00c039f09ab4e 100644
--- a/include/linux/math64.h
+++ b/include/linux/math64.h
@@ -3,6 +3,7 @@
 #define _LINUX_MATH64_H
 
 #include
+#include
 #include
 
@@ -11,6 +12,20 @@
 #define div64_long(x, y) div64_s64((x), (y))
 #define div64_ul(x, y)   div64_u64((x), (y))
 
+/**
+ * rem_u64 - remainder of unsigned 64bit divide with 32bit divisor
+ * @dividend: unsigned 64bit dividend
+ * @divisor: unsigned 32bit divisor
+ *
+ * Return: dividend % divisor
+ */
+static inline u32 rem_u64(u64 dividend, u32 divisor)
+{
+    if (is_power_of_2(divisor))
+        return dividend & (divisor - 1);
+
+    return dividend % divisor;
+}
+
 /**
  * div_u64_rem - unsigned 64bit divide with 32bit divisor with remainder
  * @dividend: unsigned 64bit dividend
@@ -85,6 +100,15 @@ static inline s64 div64_s64(s64 dividend, s64 divisor)
 #define div64_long(x, y) div_s64((x), (y))
 #define div64_ul(x, y)   div_u64((x), (y))
 
+#ifndef rem_u64
+static inline u32 rem_u64(u64 dividend, u32 divisor)
+{
+    if (is_power_of_2(divisor))
+        return dividend & (divisor - 1);
+
+    return do_div(dividend, divisor);
+}
+#endif
+
 #ifndef div_u64_rem
 static inline u64 div_u64_rem(u64 dividend, u32 divisor, u32 *remainder)
 {
diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h
index f44eb0a04afdd8cea369af1395c3637a5f69122d..332b0709756b01e60a6d6480b1f0aca345d92d67 100644
--- a/include/uapi/linux/fs.h
+++ b/include/uapi/linux/fs.h
@@ -140,6 +140,9 @@ struct fsxattr {
 #define FS_XFLAG_FILESTREAM    0x00004000  /* use filestream allocator */
 #define FS_XFLAG_DAX           0x00008000  /* use DAX for IO */
 #define FS_XFLAG_COWEXTSIZE    0x00010000  /* CoW extent size allocator hint */
+/* data extent mappings for regular files must be aligned to extent size hint */
+#define FS_XFLAG_FORCEALIGN    0x00020000
+#define FS_XFLAG_ATOMICWRITES  0x00040000  /* atomic writes enabled */
 #define FS_XFLAG_HASATTR       0x80000000  /* no DIFLAG for this */
 
 /* the read-only stuff doesn't really belong here, but any other place is
@@ -214,6 +217,7 @@ struct fsxattr {
 #define FS_IOC_FSSETXATTR      _IOW('X', 32, struct fsxattr)
 #define FS_IOC_GETFSLABEL      _IOR(0x94, 49, char[FSLABEL_MAX])
 #define FS_IOC_SETFSLABEL      _IOW(0x94, 50, char[FSLABEL_MAX])
+#define FS_IOC_SETATOMIC       _IOW(0x95, 2, uint)
 
 /*
  * Inode flags (FS_IOC_GETFLAGS / FS_IOC_SETFLAGS)
@@ -300,8 +304,11 @@ typedef int __bitwise __kernel_rwf_t;
 /* per-IO O_APPEND */
 #define RWF_APPEND     ((__force __kernel_rwf_t)0x00000010)
 
+/* Atomic Write */
+#define RWF_ATOMIC     ((__force __kernel_rwf_t)0x00000040)
+
 /* mask of flags supported by the kernel */
 #define RWF_SUPPORTED  (RWF_HIPRI | RWF_DSYNC | RWF_SYNC | RWF_NOWAIT |\
-                        RWF_APPEND)
+                        RWF_APPEND | RWF_ATOMIC)
 
 #endif /* _UAPI_LINUX_FS_H */
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 65cf70874fb3ea14728ea1666cbb71ed8e6eb486..c284e9865826946e275eaea8006919e0cfcdc273 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -2956,7 +2956,7 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe,
     kiocb->ki_pos = READ_ONCE(sqe->off);
     kiocb->ki_hint = ki_hint_validate(file_write_hint(kiocb->ki_filp));
     kiocb->ki_flags = iocb_flags(kiocb->ki_filp);
-    ret = kiocb_set_rw_flags(kiocb, READ_ONCE(sqe->rw_flags));
+    ret = kiocb_set_rw_flags(kiocb, READ_ONCE(sqe->rw_flags), rw);
     if (unlikely(ret))
         return ret;
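
For reviewers, a minimal userspace sketch of how the new flag is expected to be driven through pwritev2(). This is illustrative only, not part of the patch: the RWF_ATOMIC fallback value mirrors the uapi hunk above, while the 4 KiB write size, the O_DIRECT open mode, and the file argument handling are assumptions for demonstration. A real caller would derive the write size from the queue's advertised atomic write unit limits rather than hard-coding it.

/* atomic_write_demo.c - illustrative sketch, not part of the patch */
#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/uio.h>
#include <unistd.h>

#ifndef RWF_ATOMIC
#define RWF_ATOMIC	0x00000040	/* mirrors the uapi addition above */
#endif

int main(int argc, char **argv)
{
	size_t len = 4096;	/* assumed: a power of two within [unit_min, unit_max] */
	off_t pos = 0;		/* must be naturally aligned to len */
	struct iovec iov;
	void *buf;
	ssize_t ret;
	int fd;

	if (argc != 2) {
		fprintf(stderr, "usage: %s <file>\n", argv[0]);
		return 1;
	}

	fd = open(argv[1], O_WRONLY | O_DIRECT);
	if (fd < 0) {
		perror("open");
		return 1;
	}

	if (posix_memalign(&buf, len, len)) {
		perror("posix_memalign");
		return 1;
	}
	memset(buf, 0xab, len);

	iov.iov_base = buf;
	iov.iov_len = len;

	/*
	 * Per the hunks above: the write fails with EOPNOTSUPP if the file
	 * lacks FMODE_CAN_ATOMIC_WRITE, and with EINVAL (BLK_STS_INVAL) if
	 * the size or alignment violates the device's atomic write units.
	 */
	ret = pwritev2(fd, &iov, 1, pos, RWF_ATOMIC);
	if (ret < 0)
		perror("pwritev2(RWF_ATOMIC)");
	else
		printf("atomically wrote %zd bytes at offset %lld\n",
		       ret, (long long)pos);

	free(buf);
	close(fd);
	return ret == (ssize_t)len ? 0 : 1;
}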