diff --git a/Documentation/ABI/testing/sysfs-block b/Documentation/ABI/testing/sysfs-block index e34cdeeeb9d420a673f5236ff59ec99242630a40..6cee984819b33fbf35face4b7dda99c85e55d4cf 100644 --- a/Documentation/ABI/testing/sysfs-block +++ b/Documentation/ABI/testing/sysfs-block @@ -97,6 +97,58 @@ Description: indicates how many bytes the beginning of the device is offset from the disk's natural alignment. +What: /sys/block//atomic_write_max_bytes +Date: February 2024 +Contact: Himanshu Madhani +Description: + [RO] This parameter specifies the maximum atomic write + size reported by the device. This parameter is relevant + for merging of writes, where a merged atomic write + operation must not exceed this number of bytes. + This parameter may be greater than the value in + atomic_write_unit_max_bytes as + atomic_write_unit_max_bytes will be rounded down to a + power-of-two and atomic_write_unit_max_bytes may also be + limited by some other queue limits, such as max_segments. + This parameter - along with atomic_write_unit_min_bytes + and atomic_write_unit_max_bytes - will not be larger than + max_hw_sectors_kb, but may be larger than max_sectors_kb. + + +What: /sys/block//atomic_write_unit_min_bytes +Date: February 2024 +Contact: Himanshu Madhani +Description: + [RO] This parameter specifies the smallest block which can + be written atomically with an atomic write operation. All + atomic write operations must begin at an + atomic_write_unit_min boundary and must be multiples of + atomic_write_unit_min. This value must be a power-of-two. + + +What: /sys/block//atomic_write_unit_max_bytes +Date: February 2024 +Contact: Himanshu Madhani +Description: + [RO] This parameter defines the largest block which can be + written atomically with an atomic write operation. This + value must be a multiple of atomic_write_unit_min and must + be a power-of-two. This value will not be larger than + atomic_write_max_bytes. + + +What: /sys/block//atomic_write_boundary_bytes +Date: February 2024 +Contact: Himanshu Madhani +Description: + [RO] A device may need to internally split I/Os which + straddle a given logical block address boundary. In that + case a single atomic write operation will be processed as + one or more sub-operations which each complete atomically. + This parameter specifies the size in bytes of the atomic + boundary if one is reported by the device. This value must + be a power-of-two. + What: /sys/block///alignment_offset Date: April 2009 Contact: Martin K.
Petersen diff --git a/block/blk-core.c b/block/blk-core.c index a1ebbf96d19af7d6300723b0666877626d475fd9..e3e2659d067358110c6ce6dbc189ccf31b223434 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -81,6 +81,7 @@ __setup("precise_iostat=", precise_iostat_setup); * For queue allocation */ struct kmem_cache *blk_requestq_cachep; +struct kmem_cache *queue_atomic_write_cachep; /* * Controlling structure to kblockd @@ -433,6 +434,8 @@ static const struct { [BLK_STS_ZONE_OPEN_RESOURCE] = { -ETOOMANYREFS, "open zones exceeded" }, [BLK_STS_ZONE_ACTIVE_RESOURCE] = { -EOVERFLOW, "active zones exceeded" }, + [BLK_STS_INVAL] = { -EINVAL, "invalid" }, + /* everything else not covered above: */ [BLK_STS_IOERR] = { -EIO, "I/O" }, }; @@ -758,6 +761,7 @@ static void blk_timeout_work(struct work_struct *work) struct request_queue *blk_alloc_queue(int node_id) { struct request_queue *q; + struct queue_atomic_write_limits *aw_limits; int ret; q = kmem_cache_alloc_node(blk_requestq_cachep, @@ -765,10 +769,17 @@ struct request_queue *blk_alloc_queue(int node_id) if (!q) return NULL; + aw_limits = kmem_cache_alloc_node(queue_atomic_write_cachep, + GFP_KERNEL | __GFP_ZERO, node_id); + if (!aw_limits) + goto fail_q; + + q->limits.aw_limits = aw_limits; + q->last_merge = NULL; if (blk_alloc_queue_dispatch_async(q)) - goto fail_q; + goto fail_aw; q->id = ida_simple_get(&blk_queue_ida, 0, 0, GFP_KERNEL); if (q->id < 0) @@ -823,6 +834,7 @@ struct request_queue *blk_alloc_queue(int node_id) blk_queue_dma_alignment(q, 511); blk_set_default_limits(&q->limits); + blk_set_default_atomic_write_limits(&q->limits); q->nr_requests = BLKDEV_MAX_RQ; return q; @@ -839,6 +851,8 @@ struct request_queue *blk_alloc_queue(int node_id) ida_simple_remove(&blk_queue_ida, q->id); fail_dispatch_async: blk_free_queue_dispatch_async(q); +fail_aw: + kmem_cache_free(queue_atomic_write_cachep, aw_limits); fail_q: kmem_cache_free(blk_requestq_cachep, q); return NULL; @@ -1052,6 +1066,18 @@ static inline blk_status_t blk_check_zone_append(struct request_queue *q, return BLK_STS_OK; } +static blk_status_t blk_validate_atomic_write_op_size(struct request_queue *q, + struct bio *bio) +{ + if (bio->bi_iter.bi_size > queue_atomic_write_unit_max_bytes(q)) + return BLK_STS_INVAL; + + if (bio->bi_iter.bi_size % queue_atomic_write_unit_min_bytes(q)) + return BLK_STS_INVAL; + + return BLK_STS_OK; +} + static noinline_for_stack bool submit_bio_checks(struct bio *bio) { struct request_queue *q = bio->bi_disk->queue; @@ -1133,6 +1159,13 @@ static noinline_for_stack bool submit_bio_checks(struct bio *bio) if (!q->limits.max_write_zeroes_sectors) goto not_supported; break; + case REQ_OP_WRITE: + if (bio->bi_opf & REQ_ATOMIC) { + status = blk_validate_atomic_write_op_size(q, bio); + if (status != BLK_STS_OK) + goto end_io; + } + break; default: break; } @@ -1391,7 +1424,7 @@ EXPORT_SYMBOL(submit_bio); static blk_status_t blk_cloned_rq_check_limits(struct request_queue *q, struct request *rq) { - unsigned int max_sectors = blk_queue_get_max_sectors(q, req_op(rq)); + unsigned int max_sectors = blk_queue_get_max_sectors_wrapper(rq); if (blk_rq_sectors(rq) > max_sectors) { /* @@ -2138,6 +2171,8 @@ int __init blk_dev_init(void) blk_requestq_cachep = kmem_cache_create("request_queue", sizeof(struct request_queue), 0, SLAB_PANIC, NULL); + queue_atomic_write_cachep = kmem_cache_create("queue_atomic_write", + sizeof(struct queue_atomic_write_limits), 0, SLAB_PANIC, NULL); blk_debugfs_root = debugfs_create_dir("block", NULL); diff --git a/block/blk-merge.c 
b/block/blk-merge.c index a65d1d275040d833bebe3279a5daa1e266e3016a..3b2004308e93f00a9d47806bcd49e01c83fc83c1 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -13,6 +13,46 @@ #include "blk.h" #include "blk-rq-qos.h" +/* + * rq_straddles_atomic_write_boundary - check for boundary violation + * @rq: request to check + * @front_adjust: data size to be appended to front + * @back_adjust: data size to be appended to back + * + * Determine whether merging a request or bio into another request will result + * in a merged request which straddles an atomic write boundary. + * + * The value @front_adjust is the data which would be appended to the front of + * @rq, while the value @back_adjust is the data which would be appended to the + * back of @rq. Callers will typically only have either @front_adjust or + * @back_adjust as non-zero. + * + */ +static bool rq_straddles_atomic_write_boundary(struct request *rq, + unsigned int front_adjust, + unsigned int back_adjust) +{ + unsigned int boundary = queue_atomic_write_boundary_bytes(rq->q); + u64 mask, start_rq_pos, end_rq_pos; + + if (!boundary) + return false; + + start_rq_pos = blk_rq_pos(rq) << SECTOR_SHIFT; + end_rq_pos = start_rq_pos + blk_rq_bytes(rq) - 1; + + start_rq_pos -= front_adjust; + end_rq_pos += back_adjust; + + mask = ~(boundary - 1); + + /* Top bits are different, so crossed a boundary */ + if ((start_rq_pos & mask) != (end_rq_pos & mask)) + return true; + + return false; +} + static inline bool bio_will_gap(struct request_queue *q, struct request *prev_rq, struct bio *prev, struct bio *next) { @@ -145,11 +185,20 @@ static inline unsigned get_max_io_size(struct request_queue *q, struct bio *bio) { unsigned sectors = blk_max_size_offset(q, bio->bi_iter.bi_sector, 0); - unsigned max_sectors; + unsigned max_sectors; unsigned pbs = queue_physical_block_size(q) >> SECTOR_SHIFT; unsigned lbs = queue_logical_block_size(q) >> SECTOR_SHIFT; unsigned start_offset = bio->bi_iter.bi_sector & (pbs - 1); + /* + * We ignore lim->max_sectors for atomic writes simply because + * it may be less than the bio size, which we cannot tolerate.
+ */ + if (bio->bi_opf & REQ_ATOMIC) + max_sectors = q->limits.aw_limits->atomic_write_max_sectors; + else + max_sectors = sectors; + max_sectors += start_offset; max_sectors &= ~(pbs - 1); if (max_sectors > start_offset) @@ -278,6 +327,11 @@ static struct bio *blk_bio_segment_split(struct request_queue *q, *segs = nsegs; return NULL; split: + if (bio->bi_opf & REQ_ATOMIC) { + bio->bi_status = BLK_STS_INVAL; + bio_endio(bio); + return ERR_PTR(-EINVAL); + } *segs = nsegs; return bio_split(bio, sectors, GFP_NOIO, bs); } @@ -594,6 +648,13 @@ int ll_back_merge_fn(struct request *req, struct bio *bio, unsigned int nr_segs) return 0; } + if (req->cmd_flags & REQ_ATOMIC) { + if (rq_straddles_atomic_write_boundary(req, + bio->bi_iter.bi_size, 0)) { + return 0; + } + } + return ll_new_hw_segment(req, bio, nr_segs); } @@ -613,6 +674,13 @@ static int ll_front_merge_fn(struct request *req, struct bio *bio, return 0; } + if (req->cmd_flags & REQ_ATOMIC) { + if (rq_straddles_atomic_write_boundary(req, + 0, bio->bi_iter.bi_size)) { + return 0; + } + } + return ll_new_hw_segment(req, bio, nr_segs); } @@ -649,6 +717,13 @@ static int ll_merge_requests_fn(struct request_queue *q, struct request *req, blk_rq_get_max_sectors(req, blk_rq_pos(req))) return 0; + if (req->cmd_flags & REQ_ATOMIC) { + if (rq_straddles_atomic_write_boundary(req, + 0, blk_rq_bytes(next))) { + return 0; + } + } + total_phys_segments = req->nr_phys_segments + next->nr_phys_segments; if (total_phys_segments > blk_rq_get_max_segments(req)) return 0; @@ -721,6 +796,18 @@ static enum elv_merge blk_try_req_merge(struct request *req, return ELEVATOR_NO_MERGE; } +static bool blk_atomic_write_mergeable_rq_bio(struct request *rq, + struct bio *bio) +{ + return (rq->cmd_flags & REQ_ATOMIC) == (bio->bi_opf & REQ_ATOMIC); +} + +static bool blk_atomic_write_mergeable_rqs(struct request *rq, + struct request *next) +{ + return (rq->cmd_flags & REQ_ATOMIC) == (next->cmd_flags & REQ_ATOMIC); +} + /* * For non-mq, this has to be called with the request spinlock acquired. * For mq with scheduling, the appropriate queue wide lock should be held. @@ -752,6 +839,9 @@ static struct request *attempt_merge(struct request_queue *q, if (req->ioprio != next->ioprio) return NULL; + if (!blk_atomic_write_mergeable_rqs(req, next)) + return NULL; + /* * If we are allowed to merge, then append bio list * from next to rq and release next. 
merge_requests_fn @@ -895,6 +985,9 @@ bool blk_rq_merge_ok(struct request *rq, struct bio *bio) if (rq->ioprio != bio_prio(bio)) return false; + if (blk_atomic_write_mergeable_rq_bio(rq, bio) == false) + return false; + return true; } diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c index b5b17c6ee650995fbaf7d0cc30b6980feff24656..de587a442a90158b618836b4b9caa6b6688699b6 100644 --- a/block/blk-mq-debugfs.c +++ b/block/blk-mq-debugfs.c @@ -306,6 +306,7 @@ static const char *const cmd_flag_name[] = { CMD_FLAG_NAME(NOWAIT), CMD_FLAG_NAME(NOUNMAP), CMD_FLAG_NAME(HIPRI), + CMD_FLAG_NAME(ATOMIC), }; #undef CMD_FLAG_NAME diff --git a/block/blk-settings.c b/block/blk-settings.c index c3aa7f8ee388357c7b96b50cd58c6591910bba84..d1a1f963c3eb49c4054db6d27f017639c93f0744 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -63,6 +63,20 @@ void blk_set_default_limits(struct queue_limits *lim) } EXPORT_SYMBOL(blk_set_default_limits); +void blk_set_default_atomic_write_limits(struct queue_limits *lim) +{ + if (lim->aw_limits) { + lim->aw_limits->atomic_write_hw_max = 0; + lim->aw_limits->atomic_write_max_sectors = 0; + lim->aw_limits->atomic_write_hw_boundary = 0; + lim->aw_limits->atomic_write_hw_unit_min = 0; + lim->aw_limits->atomic_write_unit_min = 0; + lim->aw_limits->atomic_write_hw_unit_max = 0; + lim->aw_limits->atomic_write_unit_max = 0; + } +} +EXPORT_SYMBOL(blk_set_default_atomic_write_limits); + /** * blk_set_stacking_limits - set default limits for stacking devices * @lim: the queue_limits structure to reset @@ -127,6 +141,46 @@ void blk_queue_bounce_limit(struct request_queue *q, u64 max_addr) } EXPORT_SYMBOL(blk_queue_bounce_limit); +/* + * Returns max guaranteed bytes which we can fit in a bio. + * + * We always assume that we can fit in at least PAGE_SIZE in a segment, apart + * from first and last segments. 
+ */ +static +unsigned int blk_queue_max_guaranteed_bio(struct queue_limits *limits) +{ + unsigned int max_segments = min((u16)BIO_MAX_PAGES, limits->max_segments); + unsigned int length; + + length = min(max_segments, 2U) * limits->logical_block_size; + if (max_segments > 2) + length += (max_segments - 2) * PAGE_SIZE; + + return length; +} + +void blk_atomic_writes_update_limits(struct queue_limits *limits) +{ + unsigned int unit_limit = min(limits->max_hw_sectors << SECTOR_SHIFT, + blk_queue_max_guaranteed_bio(limits)); + + unit_limit = rounddown_pow_of_two(unit_limit); + + if (!limits->aw_limits) + return; + + limits->aw_limits->atomic_write_max_sectors = + min(limits->aw_limits->atomic_write_hw_max >> SECTOR_SHIFT, + limits->max_hw_sectors); + limits->aw_limits->atomic_write_unit_min = + min(limits->aw_limits->atomic_write_hw_unit_min, unit_limit); + limits->aw_limits->atomic_write_unit_max = + min(limits->aw_limits->atomic_write_hw_unit_max, unit_limit); +} + +EXPORT_SYMBOL(blk_atomic_writes_update_limits); + /** * blk_queue_max_hw_sectors - set max sectors for a request for this queue * @q: the request queue for the device @@ -161,6 +215,9 @@ void blk_queue_max_hw_sectors(struct request_queue *q, unsigned int max_hw_secto max_sectors = min_not_zero(max_hw_sectors, limits->max_dev_sectors); max_sectors = min_t(unsigned int, max_sectors, BLK_DEF_MAX_SECTORS); limits->max_sectors = max_sectors; + + blk_atomic_writes_update_limits(limits); + q->backing_dev_info->io_pages = max_sectors >> (PAGE_SHIFT - 9); } EXPORT_SYMBOL(blk_queue_max_hw_sectors); diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 53598eb6affd9569097dc795259ef18c95100d91..078aace75204946f89d464975c8813be382837ea 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -126,6 +126,30 @@ static ssize_t queue_max_discard_segments_show(struct request_queue *q, return queue_var_show(queue_max_discard_segments(q), (page)); } +static ssize_t queue_atomic_write_max_bytes_show(struct request_queue *q, + char *page) +{ + return queue_var_show(queue_atomic_write_max_bytes(q), page); +} + +static ssize_t queue_atomic_write_boundary_show(struct request_queue *q, + char *page) +{ + return queue_var_show(queue_atomic_write_boundary_bytes(q), page); +} + +static ssize_t queue_atomic_write_unit_min_show(struct request_queue *q, + char *page) +{ + return queue_var_show(queue_atomic_write_unit_min_bytes(q), page); +} + +static ssize_t queue_atomic_write_unit_max_show(struct request_queue *q, + char *page) +{ + return queue_var_show(queue_atomic_write_unit_max_bytes(q), page); +} + static ssize_t queue_max_integrity_segments_show(struct request_queue *q, char *page) { return queue_var_show(q->limits.max_integrity_segments, (page)); @@ -588,6 +612,11 @@ QUEUE_RO_ENTRY(queue_discard_max_hw, "discard_max_hw_bytes"); QUEUE_RW_ENTRY(queue_discard_max, "discard_max_bytes"); QUEUE_RO_ENTRY(queue_discard_zeroes_data, "discard_zeroes_data"); +QUEUE_RO_ENTRY(queue_atomic_write_max_bytes, "atomic_write_max_bytes"); +QUEUE_RO_ENTRY(queue_atomic_write_boundary, "atomic_write_boundary_bytes"); +QUEUE_RO_ENTRY(queue_atomic_write_unit_max, "atomic_write_unit_max_bytes"); +QUEUE_RO_ENTRY(queue_atomic_write_unit_min, "atomic_write_unit_min_bytes"); + QUEUE_RO_ENTRY(queue_write_same_max, "write_same_max_bytes"); QUEUE_RO_ENTRY(queue_write_zeroes_max, "write_zeroes_max_bytes"); QUEUE_RO_ENTRY(queue_zone_append_max, "zone_append_max_bytes"); @@ -693,6 +722,10 @@ static struct attribute *queue_attrs[] = { &queue_discard_max_entry.attr, 
&queue_discard_max_hw_entry.attr, &queue_discard_zeroes_data_entry.attr, + &queue_atomic_write_max_bytes_entry.attr, + &queue_atomic_write_boundary_entry.attr, + &queue_atomic_write_unit_min_entry.attr, + &queue_atomic_write_unit_max_entry.attr, &queue_write_same_max_entry.attr, &queue_write_zeroes_max_entry.attr, &queue_zone_append_max_entry.attr, @@ -789,6 +822,7 @@ static void blk_free_queue_rcu(struct rcu_head *rcu_head) rcu_head); percpu_ref_exit(&q->q_usage_counter); + kmem_cache_free(queue_atomic_write_cachep, q->limits.aw_limits); kmem_cache_free(blk_requestq_cachep, q); } diff --git a/block/blk.h b/block/blk.h index 5e7c00356ddc3a830e30bf409282b242a2760f10..c86d27d80ba0eca94d75650a02e8be662118ec92 100644 --- a/block/blk.h +++ b/block/blk.h @@ -29,6 +29,7 @@ struct blk_flush_queue { }; extern struct kmem_cache *blk_requestq_cachep; +extern struct kmem_cache *queue_atomic_write_cachep; extern struct kobj_type blk_queue_ktype; extern struct ida blk_queue_ida; diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index eb95b5ce7b8fe2d2f124a848d45039de6379af9b..d407fe88daeaf30ff899b6e8ef762bf931eae877 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -624,7 +624,7 @@ static int validate_hardware_logical_block_alignment(struct dm_table *table, unsigned short remaining = 0; struct dm_target *ti; - struct queue_limits ti_limits; + struct queue_limits ti_limits = {0}; unsigned i; /* @@ -1482,7 +1482,7 @@ int dm_calculate_queue_limits(struct dm_table *table, struct queue_limits *limits) { struct dm_target *ti; - struct queue_limits ti_limits; + struct queue_limits ti_limits = {0}; unsigned i; enum blk_zoned_model zoned_model = BLK_ZONED_NONE; unsigned int zone_sectors = 0; @@ -1816,6 +1816,7 @@ void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q, /* * Copy table's limits to the DM device's request_queue */ + limits->aw_limits = q->limits.aw_limits; q->limits = *limits; if (dm_table_supports_nowait(t)) diff --git a/drivers/md/dm.c b/drivers/md/dm.c index e90b3e96fafcbb28948389451467a87375829f47..9048cfc0d00060a8c0c48c2863c5e9252d04295b 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -2129,7 +2129,7 @@ EXPORT_SYMBOL_GPL(dm_get_queue_limits); int dm_setup_md_queue(struct mapped_device *md, struct dm_table *t) { int r; - struct queue_limits limits; + struct queue_limits limits = {0}; enum dm_queue_mode type = dm_get_md_type(md); switch (type) { @@ -2382,7 +2382,7 @@ static void dm_queue_flush(struct mapped_device *md) struct dm_table *dm_swap_table(struct mapped_device *md, struct dm_table *table) { struct dm_table *live_map = NULL, *map = ERR_PTR(-EINVAL); - struct queue_limits limits; + struct queue_limits limits = {0}; int r; mutex_lock(&md->suspend_lock); diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 9fcc05c4f88cc5a1907ffcebe9e20e80e7949f4f..d52ea24deb45974f5d73574d2139cf22254de7db 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -752,6 +752,30 @@ static inline blk_status_t nvme_setup_write_zeroes(struct nvme_ns *ns, return BLK_STS_OK; } +static bool nvme_valid_atomic_write(struct request *req) +{ + struct request_queue *q = req->q; + u32 boundary_bytes = queue_atomic_write_boundary_bytes(q); + + if (blk_rq_bytes(req) > queue_atomic_write_unit_max_bytes(q)) + return false; + + if (boundary_bytes) { + u64 mask = boundary_bytes - 1, imask = ~mask; + u64 start = blk_rq_pos(req) << SECTOR_SHIFT; + u64 end = start + blk_rq_bytes(req) - 1; + + /* If greater then must be crossing a boundary */ + if 
(blk_rq_bytes(req) > boundary_bytes) + return false; + + if ((start & imask) != (end & imask)) + return false; + } + + return true; +} + static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns, struct request *req, struct nvme_command *cmnd, enum nvme_opcode op) @@ -768,6 +792,13 @@ static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns, if (req->cmd_flags & REQ_RAHEAD) dsmgmt |= NVME_RW_DSM_FREQ_PREFETCH; + /* + * Ensure that nothing has been sent which cannot be executed + * atomically. + */ + if (req->cmd_flags & REQ_ATOMIC && !nvme_valid_atomic_write(req)) + return BLK_STS_INVAL; + cmnd->rw.opcode = op; cmnd->rw.nsid = cpu_to_le32(ns->head->ns_id); cmnd->rw.slba = cpu_to_le64(nvme_sect_to_lba(ns, blk_rq_pos(req))); @@ -2011,6 +2042,28 @@ static int nvme_configure_metadata(struct nvme_ns *ns, struct nvme_id_ns *id) return 0; } +static void nvme_update_atomic_write_disk_info(struct nvme_ns *ns, + struct nvme_id_ns *id, struct queue_limits *lim, + u32 bs, u32 atomic_bs) +{ + unsigned int boundary = 0; + + if (id->nsfeat & NVME_NS_FEAT_ATOMICS && id->nawupf) { + if (le16_to_cpu(id->nabspf)) + boundary = (le16_to_cpu(id->nabspf) + 1) * bs; + } + + if (!lim->aw_limits) + return; + + lim->aw_limits->atomic_write_hw_max = atomic_bs; + lim->aw_limits->atomic_write_hw_boundary = boundary; + lim->aw_limits->atomic_write_hw_unit_min = bs; + lim->aw_limits->atomic_write_hw_unit_max = rounddown_pow_of_two(atomic_bs); + + blk_atomic_writes_update_limits(lim); +} + static void nvme_set_queue_limits(struct nvme_ctrl *ctrl, struct request_queue *q) { @@ -2060,6 +2113,9 @@ static void nvme_update_disk_info(struct gendisk *disk, atomic_bs = (1 + le16_to_cpu(id->nawupf)) * bs; else atomic_bs = (1 + ns->ctrl->subsys->awupf) * bs; + + nvme_update_atomic_write_disk_info(ns, id, &disk->queue->limits, + bs, atomic_bs); } if (id->nsfeat & NVME_NS_FEAT_IO_OPT) { diff --git a/fs/aio.c b/fs/aio.c index 00641a1ad0b3ff0b1d4df7f1c47e25a09a245068..78aaeaf354362bdde67715ceaa9843dfac6e67a6 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -1458,7 +1458,7 @@ static void aio_complete_rw(struct kiocb *kiocb, long res, long res2) iocb_put(iocb); } -static int aio_prep_rw(struct kiocb *req, const struct iocb *iocb) +static int aio_prep_rw(struct kiocb *req, const struct iocb *iocb, int rw_type) { int ret; @@ -1485,7 +1485,7 @@ static int aio_prep_rw(struct kiocb *req, const struct iocb *iocb) } else req->ki_ioprio = get_current_ioprio(); - ret = kiocb_set_rw_flags(req, iocb->aio_rw_flags); + ret = kiocb_set_rw_flags(req, iocb->aio_rw_flags, rw_type); if (unlikely(ret)) return ret; @@ -1537,7 +1537,7 @@ static int aio_read(struct kiocb *req, const struct iocb *iocb, struct file *file; int ret; - ret = aio_prep_rw(req, iocb); + ret = aio_prep_rw(req, iocb, READ); if (ret) return ret; file = req->ki_filp; @@ -1565,7 +1565,7 @@ static int aio_write(struct kiocb *req, const struct iocb *iocb, struct file *file; int ret; - ret = aio_prep_rw(req, iocb); + ret = aio_prep_rw(req, iocb, WRITE); if (ret) return ret; file = req->ki_filp; diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c index 892a4f8109e50d32df2b6cc55126156bae8c65fd..7ada6ec851017457aedbf87b525ed6d90b3e9941 100644 --- a/fs/iomap/direct-io.c +++ b/fs/iomap/direct-io.c @@ -210,15 +210,22 @@ iomap_dio_zero(struct iomap_dio *dio, struct iomap *iomap, loff_t pos, struct page *page = ZERO_PAGE(0); int flags = REQ_SYNC | REQ_IDLE; struct bio *bio; + unsigned size; + unsigned nr_pages = (len + PAGE_SIZE - 1) >> PAGE_SHIFT; - bio = bio_alloc(GFP_KERNEL, 1); + bio = 
bio_alloc(GFP_KERNEL, nr_pages); bio_set_dev(bio, iomap->bdev); bio->bi_iter.bi_sector = iomap_sector(iomap, pos); bio->bi_private = dio; bio->bi_end_io = iomap_dio_bio_end_io; - get_page(page); - __bio_add_page(bio, page, len, 0); + while (len > 0) { + size = len > PAGE_SIZE ? PAGE_SIZE : len; + get_page(page); + __bio_add_page(bio, page, size, 0); + len -= size; + pos += size; + } bio_set_op_attrs(bio, REQ_OP_WRITE, flags); iomap_dio_submit_bio(dio, iomap, bio, pos); } @@ -227,8 +234,9 @@ static loff_t iomap_dio_bio_actor(struct inode *inode, loff_t pos, loff_t length, struct iomap_dio *dio, struct iomap *iomap) { + bool is_atomic = dio->iocb->ki_flags & IOCB_ATOMIC; unsigned int blkbits = blksize_bits(bdev_logical_block_size(iomap->bdev)); - unsigned int fs_block_size = i_blocksize(inode), pad; + unsigned int zeroing_size, pad; unsigned int align = iov_iter_alignment(dio->submit.iter); struct bio *bio; bool need_zeroout = false; @@ -237,6 +245,8 @@ iomap_dio_bio_actor(struct inode *inode, loff_t pos, loff_t length, size_t copied = 0; size_t orig_count; + zeroing_size = i_blocksize(inode) << iomap->extent_shift; + if ((pos | length | align) & ((1 << blkbits) - 1)) return -EINVAL; @@ -280,7 +290,7 @@ iomap_dio_bio_actor(struct inode *inode, loff_t pos, loff_t length, if (need_zeroout) { /* zero out from the start of the block to the write offset */ - pad = pos & (fs_block_size - 1); + pad = pos & (zeroing_size - 1); if (pad) iomap_dio_zero(dio, iomap, pos - pad, pad); } @@ -314,8 +324,16 @@ iomap_dio_bio_actor(struct inode *inode, loff_t pos, loff_t length, } n = bio->bi_iter.bi_size; + if (is_atomic && (n != orig_count)) { + /* This bio should have covered the complete length */ + ret = -EINVAL; + bio_put(bio); + goto out; + } if (dio->flags & IOMAP_DIO_WRITE) { bio->bi_opf = REQ_OP_WRITE | REQ_SYNC | REQ_IDLE; + if (is_atomic) + bio->bi_opf |= REQ_ATOMIC; if (use_fua) bio->bi_opf |= REQ_FUA; else @@ -345,9 +363,9 @@ iomap_dio_bio_actor(struct inode *inode, loff_t pos, loff_t length, if (need_zeroout || ((dio->flags & IOMAP_DIO_WRITE) && pos >= i_size_read(inode))) { /* zero out from the end of the write to the end of the block */ - pad = pos & (fs_block_size - 1); + pad = pos & (zeroing_size - 1); if (pad) - iomap_dio_zero(dio, iomap, pos, fs_block_size - pad); + iomap_dio_zero(dio, iomap, pos, zeroing_size - pad); } out: /* Undo iter limitation to current extent */ diff --git a/fs/read_write.c b/fs/read_write.c index 371a5a76f30e05d4728480012c07ec5c3bcb9661..da03b3e65cf3be6ab98bc26302e8ac9109ebef8f 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -726,7 +726,7 @@ static ssize_t do_iter_readv_writev(struct file *filp, struct iov_iter *iter, ssize_t ret; init_sync_kiocb(&kiocb, filp); - ret = kiocb_set_rw_flags(&kiocb, flags); + ret = kiocb_set_rw_flags(&kiocb, flags, type); if (ret) return ret; kiocb.ki_pos = (ppos ? 
*ppos : 0); diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 15e9e335d1672823c613f76fda6c926eb0cabd95..7682dfe2f7010b73bf3e6fa4d3f65b9892b2c339 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -3487,6 +3487,18 @@ xfs_bmap_btalloc( args.fsbno = ap->blkno; args.oinfo = XFS_RMAP_OINFO_SKIP_UPDATE; + /* + * xfs_get_cowextsz_hint() returns extsz_hint for when forcealign is + * set as forcealign and cowextsz_hint are mutually exclusive + */ + if (xfs_inode_forcealign(ap->ip) && align) { + args.alignment = align; + if (stripe_align == 0 || stripe_align % align) + stripe_align = align; + } else { + args.alignment = 1; + } + /* Trim the allocation back to the maximum an AG can fit. */ args.maxlen = min(ap->length, mp->m_ag_max_usable); blen = 0; @@ -3558,7 +3570,6 @@ xfs_bmap_btalloc( atype = args.type; tryagain = 1; args.type = XFS_ALLOCTYPE_THIS_BNO; - args.alignment = 1; /* * Compute the minlen+alignment for the * next case. Set slop so that the value @@ -3577,7 +3588,6 @@ xfs_bmap_btalloc( args.minalignslop = 0; } } else { - args.alignment = 1; args.minalignslop = 0; } args.postallocs = 1; @@ -3604,7 +3614,9 @@ xfs_bmap_btalloc( if ((error = xfs_alloc_vextent(&args))) return error; } - if (isaligned && args.fsbno == NULLFSBLOCK) { + + if (isaligned && args.fsbno == NULLFSBLOCK && + (args.alignment <= 1 || !xfs_inode_forcealign(ap->ip))) { /* * allocation failed, so turn off alignment and * try again. @@ -5276,6 +5288,12 @@ __xfs_bunmapi( XFS_STATS_INC(mp, xs_blk_unmap); isrt = (whichfork == XFS_DATA_FORK) && XFS_IS_REALTIME_INODE(ip); end = start + len; + if (xfs_inode_forcealign(ip) && ip->i_d.di_extsize > 1 + && S_ISREG(VFS_I(ip)->i_mode)) { + start = roundup_64(start, ip->i_d.di_extsize); + end = rounddown_64(end, ip->i_d.di_extsize); + len = end - start; + } if (!xfs_iext_lookup_extent_before(ip, ifp, &end, &icur, &got)) { *rlen = 0; diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h index 54832df8540f85b706389dcf996cff6e1ad6f520..8e4d4959588410eb528d180be3943d3dbf49bb26 100644 --- a/fs/xfs/libxfs/xfs_format.h +++ b/fs/xfs/libxfs/xfs_format.h @@ -353,11 +353,17 @@ xfs_sb_has_compat_feature( #define XFS_SB_FEAT_RO_COMPAT_RMAPBT (1 << 1) /* reverse map btree */ #define XFS_SB_FEAT_RO_COMPAT_REFLINK (1 << 2) /* reflinked files */ #define XFS_SB_FEAT_RO_COMPAT_INOBTCNT (1 << 3) /* inobt block counts */ +#define XFS_SB_FEAT_RO_COMPAT_FORCEALIGN (1 << 30) /* aligned file data extents */ +#define XFS_SB_FEAT_RO_COMPAT_ATOMICWRITES (1 << 31) /* atomicwrites enabled */ + #define XFS_SB_FEAT_RO_COMPAT_ALL \ (XFS_SB_FEAT_RO_COMPAT_FINOBT | \ XFS_SB_FEAT_RO_COMPAT_RMAPBT | \ XFS_SB_FEAT_RO_COMPAT_REFLINK| \ - XFS_SB_FEAT_RO_COMPAT_INOBTCNT) + XFS_SB_FEAT_RO_COMPAT_INOBTCNT| \ + XFS_SB_FEAT_RO_COMPAT_FORCEALIGN| \ + XFS_SB_FEAT_RO_COMPAT_ATOMICWRITES) + #define XFS_SB_FEAT_RO_COMPAT_UNKNOWN ~XFS_SB_FEAT_RO_COMPAT_ALL static inline bool xfs_sb_has_ro_compat_feature( @@ -972,15 +978,20 @@ static inline void xfs_dinode_put_rdev(struct xfs_dinode *dip, xfs_dev_t rdev) #define XFS_DIFLAG2_REFLINK_BIT 1 /* file's blocks may be shared */ #define XFS_DIFLAG2_COWEXTSIZE_BIT 2 /* copy on write extent size hint */ #define XFS_DIFLAG2_BIGTIME_BIT 3 /* big timestamps */ +/* data extent mappings for regular files must be aligned to extent size hint */ +#define XFS_DIFLAG2_FORCEALIGN_BIT 5 +#define XFS_DIFLAG2_ATOMICWRITES_BIT 6 #define XFS_DIFLAG2_DAX (1 << XFS_DIFLAG2_DAX_BIT) #define XFS_DIFLAG2_REFLINK (1 << XFS_DIFLAG2_REFLINK_BIT) #define 
XFS_DIFLAG2_COWEXTSIZE (1 << XFS_DIFLAG2_COWEXTSIZE_BIT) #define XFS_DIFLAG2_BIGTIME (1 << XFS_DIFLAG2_BIGTIME_BIT) +#define XFS_DIFLAG2_FORCEALIGN (1 << XFS_DIFLAG2_FORCEALIGN_BIT) +#define XFS_DIFLAG2_ATOMICWRITES (1 << XFS_DIFLAG2_ATOMICWRITES_BIT) #define XFS_DIFLAG2_ANY \ (XFS_DIFLAG2_DAX | XFS_DIFLAG2_REFLINK | XFS_DIFLAG2_COWEXTSIZE | \ - XFS_DIFLAG2_BIGTIME) + XFS_DIFLAG2_BIGTIME | XFS_DIFLAG2_FORCEALIGN | XFS_DIFLAG2_ATOMICWRITES) static inline bool xfs_dinode_has_bigtime(const struct xfs_dinode *dip) { diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c index 0970ae3fe5382770e5e675ba88de009c5370a04b..dd9e5de65d52a48434278c6cc7aed8f9f20f7d9d 100644 --- a/fs/xfs/libxfs/xfs_inode_buf.c +++ b/fs/xfs/libxfs/xfs_inode_buf.c @@ -574,6 +574,14 @@ xfs_dinode_verify( !xfs_has_bigtime(mp)) return __this_address; + if (flags2 & XFS_DIFLAG2_FORCEALIGN) { + fa = xfs_inode_validate_forcealign(mp, mode, flags, + be32_to_cpu(dip->di_extsize), + be32_to_cpu(dip->di_cowextsize)); + if (fa) + return fa; + } + return NULL; } @@ -699,3 +707,35 @@ xfs_inode_validate_cowextsize( return NULL; } + +/* Validate the forcealign inode flag */ +xfs_failaddr_t +xfs_inode_validate_forcealign( + struct xfs_mount *mp, + uint16_t mode, + uint16_t flags, + uint32_t extsize, + uint32_t cowextsize) +{ + /* superblock rocompat feature flag */ + if (!xfs_has_forcealign(mp)) + return __this_address; + + /* Only regular files and directories */ + if (!S_ISDIR(mode) && !S_ISREG(mode)) + return __this_address; + + /* Doesn't apply to realtime files */ + if (flags & XFS_DIFLAG_REALTIME) + return __this_address; + + /* Requires a non-zero power-of-2 extent size hint */ + if (extsize == 0 || !is_power_of_2(extsize)) + return __this_address; + + /* Requires no cow extent size hint */ + if (cowextsize != 0) + return __this_address; + + return NULL; +} diff --git a/fs/xfs/libxfs/xfs_inode_buf.h b/fs/xfs/libxfs/xfs_inode_buf.h index 05c3640e135a29a3b4f207769095b44ff045c398..1bcf1415a4b5f5c8ec1333b6f552228a70cddcf4 100644 --- a/fs/xfs/libxfs/xfs_inode_buf.h +++ b/fs/xfs/libxfs/xfs_inode_buf.h @@ -62,6 +62,9 @@ xfs_failaddr_t xfs_inode_validate_extsize(struct xfs_mount *mp, xfs_failaddr_t xfs_inode_validate_cowextsize(struct xfs_mount *mp, uint32_t cowextsize, uint16_t mode, uint16_t flags, uint64_t flags2); +xfs_failaddr_t xfs_inode_validate_forcealign(struct xfs_mount *mp, + uint16_t mode, uint16_t flags, uint32_t extsize, + uint32_t cowextsize); static inline uint64_t xfs_inode_encode_bigtime(struct timespec64 tv) { diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c index c099ccf2787dbdefeed8584389c51ac40ec146b3..a4354504986ce61be4c528e7205284268abf21ac 100644 --- a/fs/xfs/libxfs/xfs_sb.c +++ b/fs/xfs/libxfs/xfs_sb.c @@ -116,6 +116,10 @@ xfs_sb_version_to_features( features |= XFS_FEAT_REFLINK; if (sbp->sb_features_ro_compat & XFS_SB_FEAT_RO_COMPAT_INOBTCNT) features |= XFS_FEAT_INOBTCNT; + if (sbp->sb_features_ro_compat & XFS_SB_FEAT_RO_COMPAT_FORCEALIGN) + features |= XFS_FEAT_FORCEALIGN; + if (sbp->sb_features_ro_compat & XFS_SB_FEAT_RO_COMPAT_ATOMICWRITES) + features |= XFS_FEAT_ATOMICWRITES; if (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_FTYPE) features |= XFS_FEAT_FTYPE; if (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_SPINODES) diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index edf62092125c62a932fe846f8e3492e464b86ccd..5879f03b8660809f1be3945d55cf040ad71e56d8 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -654,6 +654,9 @@ xfs_free_eofblocks( * 
of the file. If not, then there is nothing to do. */ end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)XFS_ISIZE(ip)); + /* Do not free blocks when forcing extent sizes */ + if (xfs_get_extsz(ip) > 1) + end_fsb = roundup_64(end_fsb, xfs_get_extsz(ip)); last_fsb = XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes); if (last_fsb <= end_fsb) return 0; @@ -925,8 +928,11 @@ xfs_free_file_space( startoffset_fsb = XFS_B_TO_FSB(mp, offset); endoffset_fsb = XFS_B_TO_FSBT(mp, offset + len); - /* We can only free complete realtime extents. */ - if (XFS_IS_REALTIME_INODE(ip) && mp->m_sb.sb_rextsize > 1) { + /* Free only complete extents. */ + if (xfs_inode_forcealign(ip) && ip->i_d.di_extsize > 1) { + startoffset_fsb = roundup_64(startoffset_fsb, ip->i_d.di_extsize); + endoffset_fsb = rounddown_64(endoffset_fsb, ip->i_d.di_extsize); + } else if (XFS_IS_REALTIME_INODE(ip) && mp->m_sb.sb_rextsize > 1) { startoffset_fsb = roundup_64(startoffset_fsb, mp->m_sb.sb_rextsize); endoffset_fsb = rounddown_64(endoffset_fsb, diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 52643eac5d46ee692b1d171eec8effaa511e503c..49db1611de96f249a02ea9ba828425e9616719fb 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -60,7 +60,10 @@ xfs_is_falloc_aligned( } mask = XFS_FSB_TO_B(mp, mp->m_sb.sb_rextsize) - 1; } else { - mask = mp->m_sb.sb_blocksize - 1; + if (xfs_inode_forcealign(ip) && ip->i_d.di_extsize > 1) + mask = (mp->m_sb.sb_blocksize * ip->i_d.di_extsize) - 1; + else + mask = mp->m_sb.sb_blocksize - 1; } return !((pos | len) & mask); @@ -586,6 +589,14 @@ xfs_file_dio_aio_write( size_t count = iov_iter_count(from); struct xfs_buftarg *target = xfs_inode_buftarg(ip); + if (iocb->ki_flags & IOCB_ATOMIC) { + if (!generic_atomic_write_valid(iocb->ki_pos, count, + i_blocksize(inode), + XFS_FSB_TO_B(mp, xfs_get_extsz(ip)))) { + return -EINVAL; + } + } + /* DIO must be aligned to device logical sector size */ if ((iocb->ki_pos | count) & target->bt_logical_sectormask) return -EINVAL; @@ -597,8 +608,8 @@ xfs_file_dio_aio_write( * the inode as necessary for EOF zeroing cases and fill out the new * inode size as appropriate. */ - if ((iocb->ki_pos & mp->m_blockmask) || - ((iocb->ki_pos + count) & mp->m_blockmask)) { + if ((iocb->ki_pos & (XFS_FSB_TO_B(mp, xfs_get_extsz(ip)) - 1)) || + ((iocb->ki_pos + count) & (XFS_FSB_TO_B(mp, xfs_get_extsz(ip)) - 1))) { unaligned_io = 1; /* @@ -1179,6 +1190,25 @@ xfs_file_remap_range( return remapped > 0 ? remapped : ret; } +static bool xfs_file_open_can_atomicwrite( + struct inode *inode, + struct file *file) +{ + struct xfs_inode *ip = XFS_I(inode); + struct xfs_buftarg *target = xfs_inode_buftarg(ip); + + if (!(file->f_flags & O_DIRECT)) + return false; + + if (!xfs_inode_atomicwrites(ip)) + return false; + + if (!bdev_can_atomic_write(target->bt_bdev)) + return false; + + return true; +} + STATIC int xfs_file_open( struct inode *inode, @@ -1189,6 +1219,8 @@ xfs_file_open( if (xfs_is_shutdown(XFS_M(inode->i_sb))) return -EIO; file->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC; + if (xfs_file_open_can_atomicwrite(inode, file)) + file->f_mode |= FMODE_CAN_ATOMIC_WRITE; return 0; } diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 268bbc2d978b8cb2d4d052905dfe0d63f7d9ace7..f77a27f73bb59b89c2913926411c90ee8a6f456b 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -68,6 +68,20 @@ xfs_get_extsz_hint( return 0; } +/* + * Helper function to extract extent size. It will return a power-of-2, + * as forcealign requires this. 
+ */ +xfs_extlen_t +xfs_get_extsz( + struct xfs_inode *ip) +{ + if (xfs_inode_forcealign(ip) && ip->i_d.di_extsize) + return ip->i_d.di_extsize; + + return 1; +} + /* * Helper function to extract CoW extent size hint from inode. * Between the extent size hint and the CoW extent size hint, we @@ -643,6 +657,10 @@ _xfs_dic2xflags( flags |= FS_XFLAG_DAX; if (di_flags2 & XFS_DIFLAG2_COWEXTSIZE) flags |= FS_XFLAG_COWEXTSIZE; + if (di_flags2 & XFS_DIFLAG2_FORCEALIGN) + flags |= FS_XFLAG_FORCEALIGN; + if (di_flags2 & XFS_DIFLAG2_ATOMICWRITES) + flags |= FS_XFLAG_ATOMICWRITES; } if (has_attr) @@ -759,6 +777,18 @@ xfs_inode_inherit_flags2( } if (pip->i_d.di_flags2 & XFS_DIFLAG2_DAX) ip->i_d.di_flags2 |= XFS_DIFLAG2_DAX; + if (pip->i_d.di_flags2 & XFS_DIFLAG2_FORCEALIGN) + ip->i_d.di_flags2 |= XFS_DIFLAG2_FORCEALIGN; + + if (ip->i_d.di_flags2 & XFS_DIFLAG2_FORCEALIGN) { + xfs_failaddr_t failaddr; + + failaddr = xfs_inode_validate_forcealign(ip->i_mount, + VFS_I(ip)->i_mode, ip->i_d.di_flags, ip->i_d.di_extsize, + ip->i_d.di_cowextsize); + if (failaddr) + ip->i_d.di_flags2 &= ~XFS_DIFLAG2_FORCEALIGN; + } } /* diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index b552daae323fcdcac75abddcc069285ab46f5cbd..b5b97be319e66e60374fb514108e2a278b328ed0 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -268,6 +268,16 @@ static inline bool xfs_inode_has_bigtime(struct xfs_inode *ip) return ip->i_d.di_flags2 & XFS_DIFLAG2_BIGTIME; } +static inline bool xfs_inode_forcealign(struct xfs_inode *ip) +{ + return ip->i_d.di_flags2 & XFS_DIFLAG2_FORCEALIGN; +} + +static inline bool xfs_inode_atomicwrites(struct xfs_inode *ip) +{ + return ip->i_d.di_flags2 & XFS_DIFLAG2_ATOMICWRITES; +} + /* * Return the buftarg used for data allocations on a given inode. */ @@ -489,6 +499,7 @@ void xfs_lock_two_inodes(struct xfs_inode *ip0, uint ip0_mode, struct xfs_inode *ip1, uint ip1_mode); xfs_extlen_t xfs_get_extsz_hint(struct xfs_inode *ip); +xfs_extlen_t xfs_get_extsz(struct xfs_inode *ip); xfs_extlen_t xfs_get_cowextsz_hint(struct xfs_inode *ip); int xfs_dir_ialloc(struct xfs_trans **, struct xfs_inode *, umode_t, diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index 2337eb272235407a7e2f7b927097a52c6b8fb516..cd2802d55def4d8bef6b478701eb5a94f573714f 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -1198,6 +1198,10 @@ xfs_flags2diflags2( di_flags2 |= XFS_DIFLAG2_DAX; if (xflags & FS_XFLAG_COWEXTSIZE) di_flags2 |= XFS_DIFLAG2_COWEXTSIZE; + if (xflags & FS_XFLAG_FORCEALIGN) + di_flags2 |= XFS_DIFLAG2_FORCEALIGN; + if (xflags & FS_XFLAG_ATOMICWRITES) + di_flags2 |= XFS_DIFLAG2_ATOMICWRITES; return di_flags2; } @@ -1210,10 +1214,12 @@ xfs_ioctl_setattr_xflags( { struct xfs_mount *mp = ip->i_mount; uint64_t di_flags2; + bool atomic_writes = fa->fsx_xflags & FS_XFLAG_ATOMICWRITES; - /* Can't change realtime flag if any extents are allocated. */ + /* Can't change realtime or atomic flag if any extents are allocated. */ if ((ip->i_df.if_nextents || ip->i_delayed_blks) && - XFS_IS_REALTIME_INODE(ip) != (fa->fsx_xflags & FS_XFLAG_REALTIME)) + (XFS_IS_REALTIME_INODE(ip) != (fa->fsx_xflags & FS_XFLAG_REALTIME) || + atomic_writes != xfs_inode_atomicwrites(ip))) return -EINVAL; /* If realtime flag is set then must have realtime device */ @@ -1236,6 +1242,29 @@ xfs_ioctl_setattr_xflags( if (di_flags2 && !xfs_has_v3inodes(mp)) return -EINVAL; + /* + * Force-align requires a nonzero extent size hint and a zero cow + * extent size hint. It doesn't apply to realtime files. 
+ */ + if (fa->fsx_xflags & FS_XFLAG_FORCEALIGN) { + if (!xfs_has_forcealign(mp)) + return -EINVAL; + if (fa->fsx_xflags & FS_XFLAG_COWEXTSIZE) + return -EINVAL; + if (!(fa->fsx_xflags & (FS_XFLAG_EXTSIZE | + FS_XFLAG_EXTSZINHERIT))) + return -EINVAL; + if (fa->fsx_xflags & FS_XFLAG_REALTIME) + return -EINVAL; + } + + if (atomic_writes) { + if (!xfs_has_atomicwrites(mp)) + return -EINVAL; + if (!(fa->fsx_xflags & FS_XFLAG_FORCEALIGN)) + return -EINVAL; + } + ip->i_d.di_flags = xfs_flags2diflags(ip, fa->fsx_xflags); ip->i_d.di_flags2 = di_flags2; @@ -1339,6 +1368,9 @@ xfs_ioctl_setattr_check_extsize( struct xfs_mount *mp = ip->i_mount; xfs_extlen_t size; xfs_fsblock_t extsize_fsb; + xfs_failaddr_t failaddr; + uint16_t new_diflags; + uint16_t new_diflags2; if (S_ISREG(VFS_I(ip)->i_mode) && ip->i_df.if_nextents && ((ip->i_d.di_extsize << mp->m_sb.sb_blocklog) != fa->fsx_extsize)) @@ -1363,6 +1395,17 @@ xfs_ioctl_setattr_check_extsize( if (fa->fsx_extsize % size) return -EINVAL; + new_diflags = xfs_flags2diflags(ip, fa->fsx_xflags); + new_diflags2 = xfs_flags2diflags2(ip, fa->fsx_xflags); + if (new_diflags2 & XFS_DIFLAG2_FORCEALIGN) { + failaddr = xfs_inode_validate_forcealign(ip->i_mount, + VFS_I(ip)->i_mode, new_diflags, + XFS_B_TO_FSB(mp, fa->fsx_extsize), + XFS_B_TO_FSB(mp, fa->fsx_cowextsize)); + if (failaddr) + return -EINVAL; + } + return 0; } @@ -1606,6 +1649,10 @@ xfs_ioc_setxflags( } xfs_fill_fsxattr(ip, false, &old_fa); + fa.fsx_extsize = old_fa.fsx_extsize; + fa.fsx_cowextsize = old_fa.fsx_cowextsize; + fa.fsx_projid = old_fa.fsx_projid; + fa.fsx_nextents = old_fa.fsx_nextents; error = vfs_ioc_fssetxattr_check(VFS_I(ip), &old_fa, &fa); if (error) { xfs_trans_cancel(tp); @@ -2069,6 +2116,28 @@ xfs_fs_eofblocks_from_user( return 0; } +static int +xfs_ioc_set_atomic_write( + struct xfs_inode *ip) +{ + struct xfs_trans *tp; + int error; + + tp = xfs_ioctl_setattr_get_trans(ip, NULL); + if (IS_ERR(tp)) { + error = PTR_ERR(tp); + goto out; + } + + ip->i_d.di_flags2 |= XFS_DIFLAG2_ATOMICWRITES; + + xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG); + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + error = xfs_trans_commit(tp); +out: + return error; +} + /* * Note: some of the ioctl's return positive numbers as a * byte count indicating success, such as readlink_by_handle. 
@@ -2096,6 +2165,31 @@ xfs_file_ioctl( return xfs_ioc_getlabel(mp, arg); case FS_IOC_SETFSLABEL: return xfs_ioc_setlabel(filp, mp, arg); + case FS_IOC_SETATOMIC: + if (!xfs_has_atomicwrites(mp)) + return -1; + if (!S_ISREG(inode->i_mode)) + return -1; + if (xfs_inode_atomicwrites(ip)) + return 0; + if (!xfs_inode_forcealign(ip)) + return -1; + + xfs_ilock(ip, XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL); + error = xfs_ioc_set_atomic_write(ip); + xfs_iunlock(ip, XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL); + if (error) { + xfs_alert(mp, "%s: set ino 0x%llx atomic write fail!", + __func__, XFS_I(inode)->i_ino); + return -1; + } else { + struct xfs_buftarg *target = xfs_inode_buftarg(ip); + + if ((filp->f_flags & O_DIRECT) && + bdev_can_atomic_write(target->bt_bdev)) + filp->f_mode |= FMODE_CAN_ATOMIC_WRITE; + return 0; + } case XFS_IOC_ALLOCSP: case XFS_IOC_FREESP: case XFS_IOC_ALLOCSP64: diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 76285db4aaec4a642901134d447a761bb32d010b..dc01689988ab68baf060aaf3ca768c7814e59407 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -90,6 +90,7 @@ xfs_bmbt_to_iomap( { struct xfs_mount *mp = ip->i_mount; struct xfs_buftarg *target = xfs_inode_buftarg(ip); + xfs_extlen_t extsz = xfs_get_extsz(ip); if (unlikely(!xfs_valid_startblock(ip, imap->br_startblock))) return xfs_alert_fsblock_zero(ip, imap); @@ -120,6 +121,8 @@ xfs_bmbt_to_iomap( iomap->validity_cookie = sequence_cookie; iomap->page_ops = &xfs_iomap_page_ops; + if (extsz > 1) + iomap->extent_shift = ffs(extsz) - 1; return 0; } @@ -167,7 +170,9 @@ xfs_eof_alignment( * If mounted with the "-o swalloc" option the alignment is * increased from the strip unit size to the stripe width. */ - if (mp->m_swidth && xfs_has_swalloc(mp)) + if (xfs_inode_forcealign(ip)) + align = xfs_get_extsz_hint(ip); + else if (mp->m_swidth && xfs_has_swalloc(mp)) align = mp->m_swidth; else if (mp->m_dalign) align = mp->m_dalign; @@ -544,11 +549,19 @@ xfs_iomap_write_unwritten( xfs_fsize_t i_size; uint resblks; int error; + xfs_extlen_t extsz = xfs_get_extsz(ip); trace_xfs_unwritten_convert(ip, offset, count); - offset_fsb = XFS_B_TO_FSBT(mp, offset); - count_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + count); + if (extsz > 1) { + xfs_extlen_t extsize_bytes = XFS_FSB_TO_B(mp, extsz); + + offset_fsb = XFS_B_TO_FSBT(mp, round_down(offset, extsize_bytes)); + count_fsb = XFS_B_TO_FSB(mp, round_up(offset + count, extsize_bytes)); + } else { + offset_fsb = XFS_B_TO_FSBT(mp, offset); + count_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + count); + } count_fsb = (xfs_filblks_t)(count_fsb - offset_fsb); /* diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index 21547ff97b5a6782fae4c06f1b2fdec74acf2748..888d6bf9bea7adf2dd30ba73b74e7b2238ce7ed8 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -274,6 +274,9 @@ typedef struct xfs_mount { #define XFS_FEAT_INOBTCNT (1ULL << 23) /* inobt block counts */ #define XFS_FEAT_BIGTIME (1ULL << 24) /* large timestamps */ #define XFS_FEAT_NEEDSREPAIR (1ULL << 25) /* needs xfs_repair */ +#define XFS_FEAT_FORCEALIGN (1ULL << 27) /* aligned file data extents */ +#define XFS_FEAT_ATOMICWRITES (1ULL << 28) /* atomic writes support */ + /* Mount features */ #define XFS_FEAT_NOATTR2 (1ULL << 48) /* disable attr2 creation */ @@ -336,6 +339,8 @@ __XFS_HAS_FEAT(realtime, REALTIME) __XFS_HAS_FEAT(inobtcounts, INOBTCNT) __XFS_HAS_FEAT(bigtime, BIGTIME) __XFS_HAS_FEAT(needsrepair, NEEDSREPAIR) +__XFS_HAS_FEAT(forcealign, FORCEALIGN) +__XFS_HAS_FEAT(atomicwrites, ATOMICWRITES) /* * Mount features 
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 502fb08bfd3812fafea2775a24be40b24da83872..d43f76a4b99a433f17697cf44ed2507cef076d35 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -1639,6 +1639,7 @@ xfs_fc_fill_super( "DAX unsupported by block device. Turning off DAX."); xfs_mount_set_dax_mode(mp, XFS_DAX_NEVER); } + if (xfs_has_reflink(mp)) { xfs_alert(mp, "DAX and reflink cannot be used together!"); @@ -1657,6 +1658,14 @@ xfs_fc_fill_super( } } + if (xfs_has_forcealign(mp)) + xfs_warn(mp, +"EXPERIMENTAL forced data extent alignment feature in use. Use at your own risk!"); + + if (xfs_has_atomicwrites(mp)) + xfs_warn(mp, +"EXPERIMENTAL atomicwrites feature in use. Use at your own risk!"); + if (xfs_has_reflink(mp)) { if (mp->m_sb.sb_rblocks) { xfs_alert(mp, diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 5445d89ae1cfb92a88cc3b75cf3aa941f175f5f1..b49d97547009113b1f812ce3178deae39fe03171 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -133,6 +133,11 @@ typedef u8 __bitwise blk_status_t; */ #define BLK_STS_ZONE_ACTIVE_RESOURCE ((__force blk_status_t)16) +/* + * Invalid size or alignment. + */ +#define BLK_STS_INVAL ((__force blk_status_t)17) + /** * blk_path_error - returns true if error may be path related * @error: status the request was completed with @@ -422,6 +427,7 @@ enum req_flag_bits { /* for driver use */ __REQ_DRV, __REQ_SWAP, /* swapping request. */ + __REQ_ATOMIC, /* for atomic write operations */ __REQ_NR_BITS, /* stops here */ }; @@ -446,6 +452,7 @@ enum req_flag_bits { #define REQ_DRV (1ULL << __REQ_DRV) #define REQ_SWAP (1ULL << __REQ_SWAP) +#define REQ_ATOMIC (1ULL << __REQ_ATOMIC) #define REQ_FAILFAST_MASK \ (REQ_FAILFAST_DEV | REQ_FAILFAST_TRANSPORT | REQ_FAILFAST_DRIVER) diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 4ddfa95f266afaff94bbcd236b210ec90a3a0208..f27a0916a75e723cf5182a5ec5253268f111c2e8 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -323,6 +323,17 @@ enum blk_zoned_model { BLK_ZONED_HM, /* Host-managed zoned block device */ }; +struct queue_atomic_write_limits { + /* atomic write limits */ + unsigned int atomic_write_hw_max; + unsigned int atomic_write_max_sectors; + unsigned int atomic_write_hw_boundary; + unsigned int atomic_write_hw_unit_min; + unsigned int atomic_write_unit_min; + unsigned int atomic_write_hw_unit_max; + unsigned int atomic_write_unit_max; +}; + struct queue_limits { unsigned long bounce_pfn; unsigned long seg_boundary_mask; @@ -355,7 +366,7 @@ struct queue_limits { unsigned char raid_partial_stripes_expensive; enum blk_zoned_model zoned; - KABI_RESERVE(1) + KABI_USE(1, struct queue_atomic_write_limits *aw_limits) }; typedef int (*report_zones_cb)(struct blk_zone *zone, unsigned int idx, @@ -1107,6 +1118,18 @@ static inline unsigned int blk_queue_get_max_sectors(struct request_queue *q, return q->limits.max_sectors; } +static inline unsigned int blk_queue_get_max_sectors_wrapper(struct request *rq) +{ + + struct request_queue *q = rq->q; + int op = req_op(rq); + + if (rq->cmd_flags & REQ_ATOMIC) + return q->limits.aw_limits->atomic_write_max_sectors; + + return blk_queue_get_max_sectors(q, op); +} + /* * Return maximum size of a request at given offset. Only valid for * file system requests. 
@@ -1141,10 +1164,10 @@ static inline unsigned int blk_rq_get_max_sectors(struct request *rq, if (!q->limits.chunk_sectors || req_op(rq) == REQ_OP_DISCARD || req_op(rq) == REQ_OP_SECURE_ERASE) - return blk_queue_get_max_sectors(q, req_op(rq)); + return blk_queue_get_max_sectors_wrapper(rq); return min(blk_max_size_offset(q, offset, 0), - blk_queue_get_max_sectors(q, req_op(rq))); + blk_queue_get_max_sectors_wrapper(rq)); } static inline unsigned int blk_rq_count_bios(struct request *rq) @@ -1181,6 +1204,7 @@ extern void blk_queue_chunk_sectors(struct request_queue *, unsigned int); extern void blk_queue_max_segments(struct request_queue *, unsigned short); extern void blk_queue_max_discard_segments(struct request_queue *, unsigned short); +extern void blk_atomic_writes_update_limits(struct queue_limits *limits); extern void blk_queue_max_segment_size(struct request_queue *, unsigned int); extern void blk_queue_max_discard_sectors(struct request_queue *q, unsigned int max_discard_sectors); @@ -1201,6 +1225,7 @@ extern void blk_limits_io_opt(struct queue_limits *limits, unsigned int opt); extern void blk_queue_io_opt(struct request_queue *q, unsigned int opt); extern void blk_set_queue_depth(struct request_queue *q, unsigned int depth); extern void blk_set_default_limits(struct queue_limits *lim); +extern void blk_set_default_atomic_write_limits(struct queue_limits *lim); extern void blk_set_stacking_limits(struct queue_limits *lim); extern int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, sector_t offset); @@ -1656,6 +1681,30 @@ static inline unsigned int bdev_max_active_zones(struct block_device *bdev) return 0; } +static inline unsigned int +queue_atomic_write_unit_max_bytes(const struct request_queue *q) +{ + return q->limits.aw_limits->atomic_write_unit_max; +} + +static inline unsigned int +queue_atomic_write_unit_min_bytes(const struct request_queue *q) +{ + return q->limits.aw_limits->atomic_write_unit_min; +} + +static inline unsigned int +queue_atomic_write_boundary_bytes(const struct request_queue *q) +{ + return q->limits.aw_limits->atomic_write_hw_boundary; +} + +static inline unsigned int +queue_atomic_write_max_bytes(const struct request_queue *q) +{ + return q->limits.aw_limits->atomic_write_max_sectors << SECTOR_SHIFT; +} + static inline int queue_dma_alignment(const struct request_queue *q) { return q ? 
q->dma_alignment : 511; @@ -2109,4 +2158,24 @@ int fsync_bdev(struct block_device *bdev); struct super_block *freeze_bdev(struct block_device *bdev); int thaw_bdev(struct block_device *bdev, struct super_block *sb); +static inline bool bdev_can_atomic_write(struct block_device *bdev) +{ + struct request_queue *bd_queue = bdev_get_queue(bdev); + struct queue_limits *limits = &bd_queue->limits; + + if (!limits->aw_limits->atomic_write_unit_min) + return false; + + if (bdev_is_partition(bdev)) { + sector_t bd_start_sect = bdev->bd_part->start_sect; + unsigned int alignment = + max(limits->aw_limits->atomic_write_unit_min, + limits->aw_limits->atomic_write_hw_boundary); + if (!IS_ALIGNED(bd_start_sect, alignment)) + return false; + } + + return true; +} + #endif /* _LINUX_BLKDEV_H */ diff --git a/include/linux/fs.h b/include/linux/fs.h index 382a0d4dd3dd3a4b9e3cfa4925bf1ee104615f8b..9d7e901b71fd505b5715821b6860cb19844e65e9 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -184,6 +184,9 @@ typedef int (dio_iodone_t)(struct kiocb *iocb, loff_t offset, /* File supports async buffered reads */ #define FMODE_BUF_RASYNC ((__force fmode_t)0x40000000) +/* File supports atomic writes */ +#define FMODE_CAN_ATOMIC_WRITE ((__force fmode_t)0x80000000) + /* File mode control flag, expect random access pattern */ #define FMODE_CTL_RANDOM ((__force fmode_t)0x1000) @@ -320,6 +323,7 @@ enum rw_hint { #define IOCB_SYNC (__force int) RWF_SYNC #define IOCB_NOWAIT (__force int) RWF_NOWAIT #define IOCB_APPEND (__force int) RWF_APPEND +#define IOCB_ATOMIC (__force int) RWF_ATOMIC /* non-RWF related bits - start at 16 */ #define IOCB_EVENTFD (1 << 16) @@ -3406,7 +3410,8 @@ static inline int iocb_flags(struct file *file) return res; } -static inline int kiocb_set_rw_flags(struct kiocb *ki, rwf_t flags) +static inline int kiocb_set_rw_flags(struct kiocb *ki, rwf_t flags, + int rw_type) { int kiocb_flags = 0; @@ -3423,6 +3428,12 @@ static inline int kiocb_set_rw_flags(struct kiocb *ki, rwf_t flags) return -EOPNOTSUPP; kiocb_flags |= IOCB_NOIO; } + if (flags & RWF_ATOMIC) { + if (rw_type != WRITE) + return -EOPNOTSUPP; + if (!(ki->ki_filp->f_mode & FMODE_CAN_ATOMIC_WRITE)) + return -EOPNOTSUPP; + } kiocb_flags |= (__force int) (flags & RWF_SUPPORTED); if (flags & RWF_SYNC) kiocb_flags |= IOCB_DSYNC; @@ -3665,4 +3676,21 @@ static inline void fs_file_read_do_trace(struct kiocb *iocb) if (tracepoint_enabled(fs_file_read)) fs_file_read_update_args_by_trace(iocb); } + +static inline +bool generic_atomic_write_valid(loff_t pos, size_t len, + unsigned int unit_min, unsigned int unit_max) +{ + if (len < unit_min || len > unit_max) + return false; + + if (!is_power_of_2(len)) + return false; + + if (!IS_ALIGNED(pos, len)) + return false; + + return true; +} + #endif /* _LINUX_FS_H */ diff --git a/include/linux/iomap.h b/include/linux/iomap.h index 0965d5f12858e3d9546d0baa98ac5f128dd604f1..d14a729d40ce3de937e8b41e9a083416bb5391a3 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -93,6 +93,7 @@ struct iomap { u64 length; /* length of mapping, bytes */ u16 type; /* type of mapping */ u16 flags; /* flags for mapping */ + unsigned int extent_shift; struct block_device *bdev; /* block device for I/O */ struct dax_device *dax_dev; /* dax_dev for dax operations */ void *inline_data; diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h index f44eb0a04afdd8cea369af1395c3637a5f69122d..332b0709756b01e60a6d6480b1f0aca345d92d67 100644 --- a/include/uapi/linux/fs.h +++ b/include/uapi/linux/fs.h @@ -140,6 
+140,9 @@ struct fsxattr { #define FS_XFLAG_FILESTREAM 0x00004000 /* use filestream allocator */ #define FS_XFLAG_DAX 0x00008000 /* use DAX for IO */ #define FS_XFLAG_COWEXTSIZE 0x00010000 /* CoW extent size allocator hint */ +/* data extent mappings for regular files must be aligned to extent size hint */ +#define FS_XFLAG_FORCEALIGN 0x00020000 +#define FS_XFLAG_ATOMICWRITES 0x00040000 /* atomic writes enabled */ #define FS_XFLAG_HASATTR 0x80000000 /* no DIFLAG for this */ /* the read-only stuff doesn't really belong here, but any other place is @@ -214,6 +217,7 @@ struct fsxattr { #define FS_IOC_FSSETXATTR _IOW('X', 32, struct fsxattr) #define FS_IOC_GETFSLABEL _IOR(0x94, 49, char[FSLABEL_MAX]) #define FS_IOC_SETFSLABEL _IOW(0x94, 50, char[FSLABEL_MAX]) +#define FS_IOC_SETATOMIC _IOW(0x95, 2, uint) /* * Inode flags (FS_IOC_GETFLAGS / FS_IOC_SETFLAGS) @@ -300,8 +304,11 @@ typedef int __bitwise __kernel_rwf_t; /* per-IO O_APPEND */ #define RWF_APPEND ((__force __kernel_rwf_t)0x00000010) +/* Atomic Write */ +#define RWF_ATOMIC ((__force __kernel_rwf_t)0x00000040) + /* mask of flags supported by the kernel */ #define RWF_SUPPORTED (RWF_HIPRI | RWF_DSYNC | RWF_SYNC | RWF_NOWAIT |\ - RWF_APPEND) + RWF_APPEND | RWF_ATOMIC) #endif /* _UAPI_LINUX_FS_H */ diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 65cf70874fb3ea14728ea1666cbb71ed8e6eb486..c284e9865826946e275eaea8006919e0cfcdc273 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -2956,7 +2956,7 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe, kiocb->ki_pos = READ_ONCE(sqe->off); kiocb->ki_hint = ki_hint_validate(file_write_hint(kiocb->ki_filp)); kiocb->ki_flags = iocb_flags(kiocb->ki_filp); - ret = kiocb_set_rw_flags(kiocb, READ_ONCE(sqe->rw_flags)); + ret = kiocb_set_rw_flags(kiocb, READ_ONCE(sqe->rw_flags), rw); if (unlikely(ret)) return ret;
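
For reference, below is a minimal userspace sketch of how the interface added by this series is intended to be driven. It is illustrative only, not part of the patch: it assumes the target file lives on an XFS filesystem created with the forcealign and atomicwrites features and that the inode already carries FS_XFLAG_ATOMICWRITES (set at creation time or via the FS_IOC_SETATOMIC ioctl added above); it reads the new limits from /sys/block/<disk>/queue/, where queue_attrs[] entries are normally exposed; and it falls back to the RWF_ATOMIC value defined in the patched include/uapi/linux/fs.h when the system headers lack it. The disk name nvme0n1 and the file path are placeholders.

/*
 * atomic_write_demo.c - hedged sketch of driving RWF_ATOMIC from userspace.
 */
#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/uio.h>
#include <unistd.h>

#ifndef RWF_ATOMIC
#define RWF_ATOMIC	0x00000040	/* mirrors the patched include/uapi/linux/fs.h */
#endif

/* Read one of the new request_queue sysfs limits for @disk. */
static long read_queue_limit(const char *disk, const char *attr)
{
	char path[256];
	long val = -1;
	FILE *f;

	snprintf(path, sizeof(path), "/sys/block/%s/queue/%s", disk, attr);
	f = fopen(path, "r");
	if (!f)
		return -1;
	if (fscanf(f, "%ld", &val) != 1)
		val = -1;
	fclose(f);
	return val;
}

int main(int argc, char **argv)
{
	const char *disk = argc > 2 ? argv[2] : "nvme0n1";	/* placeholder */
	long unit_min, unit_max;
	struct iovec iov;
	void *buf;
	size_t len;
	int fd;

	if (argc < 2) {
		fprintf(stderr, "usage: %s <file-on-xfs> [disk]\n", argv[0]);
		return 1;
	}

	unit_min = read_queue_limit(disk, "atomic_write_unit_min_bytes");
	unit_max = read_queue_limit(disk, "atomic_write_unit_max_bytes");
	printf("atomic write units: min=%ld max=%ld\n", unit_min, unit_max);

	/*
	 * Write a single minimum unit: a power-of-two length, placed at an
	 * offset aligned to that length, as generic_atomic_write_valid()
	 * requires.
	 */
	len = unit_min > 0 ? (size_t)unit_min : 4096;

	/* O_DIRECT is required for FMODE_CAN_ATOMIC_WRITE on XFS. */
	fd = open(argv[1], O_WRONLY | O_DIRECT);
	if (fd < 0) {
		perror("open");
		return 1;
	}

	if (posix_memalign(&buf, 4096, len)) {
		perror("posix_memalign");
		close(fd);
		return 1;
	}
	memset(buf, 0xab, len);

	iov.iov_base = buf;
	iov.iov_len = len;

	/* Offset 0 is trivially aligned to len; the write either completes
	 * atomically or fails without tearing. */
	if (pwritev2(fd, &iov, 1, 0, RWF_ATOMIC) < 0)
		perror("pwritev2(RWF_ATOMIC)");

	free(buf);
	close(fd);
	return 0;
}

As the checks in generic_atomic_write_valid(), xfs_file_dio_aio_write() and blk_validate_atomic_write_op_size() above show, such a write is rejected unless its length is a power of two within the advertised unit limits (block size up to the forcealign extent size on XFS) and the file offset is aligned to that length; the block layer additionally refuses to split or merge the resulting REQ_ATOMIC bio across an atomic write boundary.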