diff --git a/Documentation/ABI/testing/sysfs-block b/Documentation/ABI/testing/sysfs-block index e34cdeeeb9d420a673f5236ff59ec99242630a40..6cee984819b33fbf35face4b7dda99c85e55d4cf 100644 --- a/Documentation/ABI/testing/sysfs-block +++ b/Documentation/ABI/testing/sysfs-block @@ -97,6 +97,58 @@ Description: indicates how many bytes the beginning of the device is offset from the disk's natural alignment. +What: /sys/block//atomic_write_max_bytes +Date: February 2024 +Contact: Himanshu Madhani +Description: + [RO] This parameter specifies the maximum atomic write + size reported by the device. This parameter is relevant + for merging of writes, where a merged atomic write + operation must not exceed this number of bytes. + This parameter may be greater than the value in + atomic_write_unit_max_bytes as + atomic_write_unit_max_bytes will be rounded down to a + power-of-two and atomic_write_unit_max_bytes may also be + limited by some other queue limits, such as max_segments. + This parameter - along with atomic_write_unit_min_bytes + and atomic_write_unit_max_bytes - will not be larger than + max_hw_sectors_kb, but may be larger than max_sectors_kb. + + +What: /sys/block//atomic_write_unit_min_bytes +Date: February 2024 +Contact: Himanshu Madhani +Description: + [RO] This parameter specifies the smallest block which can + be written atomically with an atomic write operation. All + atomic write operations must begin at an + atomic_write_unit_min boundary and must be multiples of + atomic_write_unit_min. This value must be a power-of-two. + + +What: /sys/block//atomic_write_unit_max_bytes +Date: February 2024 +Contact: Himanshu Madhani +Description: + [RO] This parameter defines the largest block which can be + written atomically with an atomic write operation. This + value must be a multiple of atomic_write_unit_min and must + be a power-of-two. This value will not be larger than + atomic_write_max_bytes. + + +What: /sys/block//atomic_write_boundary_bytes +Date: February 2024 +Contact: Himanshu Madhani +Description: + [RO] A device may need to internally split I/Os which + straddle a given logical block address boundary. In that + case a single atomic write operation will be processed as + one or more sub-operations which each complete atomically. + This parameter specifies the size in bytes of the atomic + boundary if one is reported by the device. This value must + be a power-of-two. + What: /sys/block///alignment_offset Date: April 2009 Contact: Martin K.
Petersen diff --git a/block/blk-core.c b/block/blk-core.c index a1ebbf96d19af7d6300723b0666877626d475fd9..e3e2659d067358110c6ce6dbc189ccf31b223434 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -81,6 +81,7 @@ __setup("precise_iostat=", precise_iostat_setup); * For queue allocation */ struct kmem_cache *blk_requestq_cachep; +struct kmem_cache *queue_atomic_write_cachep; /* * Controlling structure to kblockd @@ -433,6 +434,8 @@ static const struct { [BLK_STS_ZONE_OPEN_RESOURCE] = { -ETOOMANYREFS, "open zones exceeded" }, [BLK_STS_ZONE_ACTIVE_RESOURCE] = { -EOVERFLOW, "active zones exceeded" }, + [BLK_STS_INVAL] = { -EINVAL, "invalid" }, + /* everything else not covered above: */ [BLK_STS_IOERR] = { -EIO, "I/O" }, }; @@ -758,6 +761,7 @@ static void blk_timeout_work(struct work_struct *work) struct request_queue *blk_alloc_queue(int node_id) { struct request_queue *q; + struct queue_atomic_write_limits *aw_limits; int ret; q = kmem_cache_alloc_node(blk_requestq_cachep, @@ -765,10 +769,17 @@ struct request_queue *blk_alloc_queue(int node_id) if (!q) return NULL; + aw_limits = kmem_cache_alloc_node(queue_atomic_write_cachep, + GFP_KERNEL | __GFP_ZERO, node_id); + if (!aw_limits) + goto fail_q; + + q->limits.aw_limits = aw_limits; + q->last_merge = NULL; if (blk_alloc_queue_dispatch_async(q)) - goto fail_q; + goto fail_aw; q->id = ida_simple_get(&blk_queue_ida, 0, 0, GFP_KERNEL); if (q->id < 0) @@ -823,6 +834,7 @@ struct request_queue *blk_alloc_queue(int node_id) blk_queue_dma_alignment(q, 511); blk_set_default_limits(&q->limits); + blk_set_default_atomic_write_limits(&q->limits); q->nr_requests = BLKDEV_MAX_RQ; return q; @@ -839,6 +851,8 @@ struct request_queue *blk_alloc_queue(int node_id) ida_simple_remove(&blk_queue_ida, q->id); fail_dispatch_async: blk_free_queue_dispatch_async(q); +fail_aw: + kmem_cache_free(queue_atomic_write_cachep, aw_limits); fail_q: kmem_cache_free(blk_requestq_cachep, q); return NULL; @@ -1052,6 +1066,18 @@ static inline blk_status_t blk_check_zone_append(struct request_queue *q, return BLK_STS_OK; } +static blk_status_t blk_validate_atomic_write_op_size(struct request_queue *q, + struct bio *bio) +{ + if (bio->bi_iter.bi_size > queue_atomic_write_unit_max_bytes(q)) + return BLK_STS_INVAL; + + if (bio->bi_iter.bi_size % queue_atomic_write_unit_min_bytes(q)) + return BLK_STS_INVAL; + + return BLK_STS_OK; +} + static noinline_for_stack bool submit_bio_checks(struct bio *bio) { struct request_queue *q = bio->bi_disk->queue; @@ -1133,6 +1159,13 @@ static noinline_for_stack bool submit_bio_checks(struct bio *bio) if (!q->limits.max_write_zeroes_sectors) goto not_supported; break; + case REQ_OP_WRITE: + if (bio->bi_opf & REQ_ATOMIC) { + status = blk_validate_atomic_write_op_size(q, bio); + if (status != BLK_STS_OK) + goto end_io; + } + break; default: break; } @@ -1391,7 +1424,7 @@ EXPORT_SYMBOL(submit_bio); static blk_status_t blk_cloned_rq_check_limits(struct request_queue *q, struct request *rq) { - unsigned int max_sectors = blk_queue_get_max_sectors(q, req_op(rq)); + unsigned int max_sectors = blk_queue_get_max_sectors_wrapper(rq); if (blk_rq_sectors(rq) > max_sectors) { /* @@ -2138,6 +2171,8 @@ int __init blk_dev_init(void) blk_requestq_cachep = kmem_cache_create("request_queue", sizeof(struct request_queue), 0, SLAB_PANIC, NULL); + queue_atomic_write_cachep = kmem_cache_create("queue_atomic_write", + sizeof(struct queue_atomic_write_limits), 0, SLAB_PANIC, NULL); blk_debugfs_root = debugfs_create_dir("block", NULL); diff --git a/block/blk-merge.c 
b/block/blk-merge.c index a65d1d275040d833bebe3279a5daa1e266e3016a..3b2004308e93f00a9d47806bcd49e01c83fc83c1 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -13,6 +13,46 @@ #include "blk.h" #include "blk-rq-qos.h" +/* + * rq_straddles_atomic_write_boundary - check for boundary violation + * @rq: request to check + * @front_adjust: data size to be appended to front + * @back_adjust: data size to be appended to back + * + * Determine whether merging a request or bio into another request will result + * in a merged request which straddles an atomic write boundary. + * + * The value @front_adjust is the data which would be appended to the front of + * @rq, while the value @back_adjust is the data which would be appended to the + * back of @rq. Callers will typically only have either @front_adjust or + * @back_adjust as non-zero. + * + */ +static bool rq_straddles_atomic_write_boundary(struct request *rq, + unsigned int front_adjust, + unsigned int back_adjust) +{ + unsigned int boundary = queue_atomic_write_boundary_bytes(rq->q); + u64 mask, start_rq_pos, end_rq_pos; + + if (!boundary) + return false; + + start_rq_pos = blk_rq_pos(rq) << SECTOR_SHIFT; + end_rq_pos = start_rq_pos + blk_rq_bytes(rq) - 1; + + start_rq_pos -= front_adjust; + end_rq_pos += back_adjust; + + mask = ~(boundary - 1); + + /* Top bits are different, so crossed a boundary */ + if ((start_rq_pos & mask) != (end_rq_pos & mask)) + return true; + + return false; +} + static inline bool bio_will_gap(struct request_queue *q, struct request *prev_rq, struct bio *prev, struct bio *next) { @@ -145,11 +185,20 @@ static inline unsigned get_max_io_size(struct request_queue *q, struct bio *bio) { unsigned sectors = blk_max_size_offset(q, bio->bi_iter.bi_sector, 0); - unsigned max_sectors; + unsigned max_sectors; unsigned pbs = queue_physical_block_size(q) >> SECTOR_SHIFT; unsigned lbs = queue_logical_block_size(q) >> SECTOR_SHIFT; unsigned start_offset = bio->bi_iter.bi_sector & (pbs - 1); + /* + * We ignore lim->max_sectors for atomic writes simply because + * it may be less than the bio size, which we cannot tolerate.
+ */ + if (bio->bi_opf & REQ_ATOMIC) + max_sectors = q->limits.aw_limits->atomic_write_max_sectors; + else + max_sectors = sectors; + max_sectors += start_offset; max_sectors &= ~(pbs - 1); if (max_sectors > start_offset) @@ -278,6 +327,11 @@ static struct bio *blk_bio_segment_split(struct request_queue *q, *segs = nsegs; return NULL; split: + if (bio->bi_opf & REQ_ATOMIC) { + bio->bi_status = BLK_STS_INVAL; + bio_endio(bio); + return ERR_PTR(-EINVAL); + } *segs = nsegs; return bio_split(bio, sectors, GFP_NOIO, bs); } @@ -594,6 +648,13 @@ int ll_back_merge_fn(struct request *req, struct bio *bio, unsigned int nr_segs) return 0; } + if (req->cmd_flags & REQ_ATOMIC) { + if (rq_straddles_atomic_write_boundary(req, + bio->bi_iter.bi_size, 0)) { + return 0; + } + } + return ll_new_hw_segment(req, bio, nr_segs); } @@ -613,6 +674,13 @@ static int ll_front_merge_fn(struct request *req, struct bio *bio, return 0; } + if (req->cmd_flags & REQ_ATOMIC) { + if (rq_straddles_atomic_write_boundary(req, + 0, bio->bi_iter.bi_size)) { + return 0; + } + } + return ll_new_hw_segment(req, bio, nr_segs); } @@ -649,6 +717,13 @@ static int ll_merge_requests_fn(struct request_queue *q, struct request *req, blk_rq_get_max_sectors(req, blk_rq_pos(req))) return 0; + if (req->cmd_flags & REQ_ATOMIC) { + if (rq_straddles_atomic_write_boundary(req, + 0, blk_rq_bytes(next))) { + return 0; + } + } + total_phys_segments = req->nr_phys_segments + next->nr_phys_segments; if (total_phys_segments > blk_rq_get_max_segments(req)) return 0; @@ -721,6 +796,18 @@ static enum elv_merge blk_try_req_merge(struct request *req, return ELEVATOR_NO_MERGE; } +static bool blk_atomic_write_mergeable_rq_bio(struct request *rq, + struct bio *bio) +{ + return (rq->cmd_flags & REQ_ATOMIC) == (bio->bi_opf & REQ_ATOMIC); +} + +static bool blk_atomic_write_mergeable_rqs(struct request *rq, + struct request *next) +{ + return (rq->cmd_flags & REQ_ATOMIC) == (next->cmd_flags & REQ_ATOMIC); +} + /* * For non-mq, this has to be called with the request spinlock acquired. * For mq with scheduling, the appropriate queue wide lock should be held. @@ -752,6 +839,9 @@ static struct request *attempt_merge(struct request_queue *q, if (req->ioprio != next->ioprio) return NULL; + if (!blk_atomic_write_mergeable_rqs(req, next)) + return NULL; + /* * If we are allowed to merge, then append bio list * from next to rq and release next. 
merge_requests_fn @@ -895,6 +985,9 @@ bool blk_rq_merge_ok(struct request *rq, struct bio *bio) if (rq->ioprio != bio_prio(bio)) return false; + if (blk_atomic_write_mergeable_rq_bio(rq, bio) == false) + return false; + return true; } diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c index b5b17c6ee650995fbaf7d0cc30b6980feff24656..de587a442a90158b618836b4b9caa6b6688699b6 100644 --- a/block/blk-mq-debugfs.c +++ b/block/blk-mq-debugfs.c @@ -306,6 +306,7 @@ static const char *const cmd_flag_name[] = { CMD_FLAG_NAME(NOWAIT), CMD_FLAG_NAME(NOUNMAP), CMD_FLAG_NAME(HIPRI), + CMD_FLAG_NAME(ATOMIC), }; #undef CMD_FLAG_NAME diff --git a/block/blk-settings.c b/block/blk-settings.c index c3aa7f8ee388357c7b96b50cd58c6591910bba84..d1a1f963c3eb49c4054db6d27f017639c93f0744 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -63,6 +63,20 @@ void blk_set_default_limits(struct queue_limits *lim) } EXPORT_SYMBOL(blk_set_default_limits); +void blk_set_default_atomic_write_limits(struct queue_limits *lim) +{ + if (lim->aw_limits) { + lim->aw_limits->atomic_write_hw_max = 0; + lim->aw_limits->atomic_write_max_sectors = 0; + lim->aw_limits->atomic_write_hw_boundary = 0; + lim->aw_limits->atomic_write_hw_unit_min = 0; + lim->aw_limits->atomic_write_unit_min = 0; + lim->aw_limits->atomic_write_hw_unit_max = 0; + lim->aw_limits->atomic_write_unit_max = 0; + } +} +EXPORT_SYMBOL(blk_set_default_atomic_write_limits); + /** * blk_set_stacking_limits - set default limits for stacking devices * @lim: the queue_limits structure to reset @@ -127,6 +141,46 @@ void blk_queue_bounce_limit(struct request_queue *q, u64 max_addr) } EXPORT_SYMBOL(blk_queue_bounce_limit); +/* + * Returns max guaranteed bytes which we can fit in a bio. + * + * We always assume that we can fit in at least PAGE_SIZE in a segment, apart + * from first and last segments. 
+ */ +static +unsigned int blk_queue_max_guaranteed_bio(struct queue_limits *limits) +{ + unsigned int max_segments = min((u16)BIO_MAX_PAGES, limits->max_segments); + unsigned int length; + + length = min(max_segments, 2U) * limits->logical_block_size; + if (max_segments > 2) + length += (max_segments - 2) * PAGE_SIZE; + + return length; +} + +void blk_atomic_writes_update_limits(struct queue_limits *limits) +{ + unsigned int unit_limit = min(limits->max_hw_sectors << SECTOR_SHIFT, + blk_queue_max_guaranteed_bio(limits)); + + unit_limit = rounddown_pow_of_two(unit_limit); + + if (!limits->aw_limits) + return; + + limits->aw_limits->atomic_write_max_sectors = + min(limits->aw_limits->atomic_write_hw_max >> SECTOR_SHIFT, + limits->max_hw_sectors); + limits->aw_limits->atomic_write_unit_min = + min(limits->aw_limits->atomic_write_hw_unit_min, unit_limit); + limits->aw_limits->atomic_write_unit_max = + min(limits->aw_limits->atomic_write_hw_unit_max, unit_limit); +} + +EXPORT_SYMBOL(blk_atomic_writes_update_limits); + /** * blk_queue_max_hw_sectors - set max sectors for a request for this queue * @q: the request queue for the device @@ -161,6 +215,9 @@ void blk_queue_max_hw_sectors(struct request_queue *q, unsigned int max_hw_secto max_sectors = min_not_zero(max_hw_sectors, limits->max_dev_sectors); max_sectors = min_t(unsigned int, max_sectors, BLK_DEF_MAX_SECTORS); limits->max_sectors = max_sectors; + + blk_atomic_writes_update_limits(limits); + q->backing_dev_info->io_pages = max_sectors >> (PAGE_SHIFT - 9); } EXPORT_SYMBOL(blk_queue_max_hw_sectors); diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 53598eb6affd9569097dc795259ef18c95100d91..078aace75204946f89d464975c8813be382837ea 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -126,6 +126,30 @@ static ssize_t queue_max_discard_segments_show(struct request_queue *q, return queue_var_show(queue_max_discard_segments(q), (page)); } +static ssize_t queue_atomic_write_max_bytes_show(struct request_queue *q, + char *page) +{ + return queue_var_show(queue_atomic_write_max_bytes(q), page); +} + +static ssize_t queue_atomic_write_boundary_show(struct request_queue *q, + char *page) +{ + return queue_var_show(queue_atomic_write_boundary_bytes(q), page); +} + +static ssize_t queue_atomic_write_unit_min_show(struct request_queue *q, + char *page) +{ + return queue_var_show(queue_atomic_write_unit_min_bytes(q), page); +} + +static ssize_t queue_atomic_write_unit_max_show(struct request_queue *q, + char *page) +{ + return queue_var_show(queue_atomic_write_unit_max_bytes(q), page); +} + static ssize_t queue_max_integrity_segments_show(struct request_queue *q, char *page) { return queue_var_show(q->limits.max_integrity_segments, (page)); @@ -588,6 +612,11 @@ QUEUE_RO_ENTRY(queue_discard_max_hw, "discard_max_hw_bytes"); QUEUE_RW_ENTRY(queue_discard_max, "discard_max_bytes"); QUEUE_RO_ENTRY(queue_discard_zeroes_data, "discard_zeroes_data"); +QUEUE_RO_ENTRY(queue_atomic_write_max_bytes, "atomic_write_max_bytes"); +QUEUE_RO_ENTRY(queue_atomic_write_boundary, "atomic_write_boundary_bytes"); +QUEUE_RO_ENTRY(queue_atomic_write_unit_max, "atomic_write_unit_max_bytes"); +QUEUE_RO_ENTRY(queue_atomic_write_unit_min, "atomic_write_unit_min_bytes"); + QUEUE_RO_ENTRY(queue_write_same_max, "write_same_max_bytes"); QUEUE_RO_ENTRY(queue_write_zeroes_max, "write_zeroes_max_bytes"); QUEUE_RO_ENTRY(queue_zone_append_max, "zone_append_max_bytes"); @@ -693,6 +722,10 @@ static struct attribute *queue_attrs[] = { &queue_discard_max_entry.attr, 
&queue_discard_max_hw_entry.attr, &queue_discard_zeroes_data_entry.attr, + &queue_atomic_write_max_bytes_entry.attr, + &queue_atomic_write_boundary_entry.attr, + &queue_atomic_write_unit_min_entry.attr, + &queue_atomic_write_unit_max_entry.attr, &queue_write_same_max_entry.attr, &queue_write_zeroes_max_entry.attr, &queue_zone_append_max_entry.attr, @@ -789,6 +822,7 @@ static void blk_free_queue_rcu(struct rcu_head *rcu_head) rcu_head); percpu_ref_exit(&q->q_usage_counter); + kmem_cache_free(queue_atomic_write_cachep, q->limits.aw_limits); kmem_cache_free(blk_requestq_cachep, q); } diff --git a/block/blk.h b/block/blk.h index 5e7c00356ddc3a830e30bf409282b242a2760f10..c86d27d80ba0eca94d75650a02e8be662118ec92 100644 --- a/block/blk.h +++ b/block/blk.h @@ -29,6 +29,7 @@ struct blk_flush_queue { }; extern struct kmem_cache *blk_requestq_cachep; +extern struct kmem_cache *queue_atomic_write_cachep; extern struct kobj_type blk_queue_ktype; extern struct ida blk_queue_ida; diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index eb95b5ce7b8fe2d2f124a848d45039de6379af9b..d407fe88daeaf30ff899b6e8ef762bf931eae877 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -624,7 +624,7 @@ static int validate_hardware_logical_block_alignment(struct dm_table *table, unsigned short remaining = 0; struct dm_target *ti; - struct queue_limits ti_limits; + struct queue_limits ti_limits = {0}; unsigned i; /* @@ -1482,7 +1482,7 @@ int dm_calculate_queue_limits(struct dm_table *table, struct queue_limits *limits) { struct dm_target *ti; - struct queue_limits ti_limits; + struct queue_limits ti_limits = {0}; unsigned i; enum blk_zoned_model zoned_model = BLK_ZONED_NONE; unsigned int zone_sectors = 0; @@ -1816,6 +1816,7 @@ void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q, /* * Copy table's limits to the DM device's request_queue */ + limits->aw_limits = q->limits.aw_limits; q->limits = *limits; if (dm_table_supports_nowait(t)) diff --git a/drivers/md/dm.c b/drivers/md/dm.c index e90b3e96fafcbb28948389451467a87375829f47..9048cfc0d00060a8c0c48c2863c5e9252d04295b 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -2129,7 +2129,7 @@ EXPORT_SYMBOL_GPL(dm_get_queue_limits); int dm_setup_md_queue(struct mapped_device *md, struct dm_table *t) { int r; - struct queue_limits limits; + struct queue_limits limits = {0}; enum dm_queue_mode type = dm_get_md_type(md); switch (type) { @@ -2382,7 +2382,7 @@ static void dm_queue_flush(struct mapped_device *md) struct dm_table *dm_swap_table(struct mapped_device *md, struct dm_table *table) { struct dm_table *live_map = NULL, *map = ERR_PTR(-EINVAL); - struct queue_limits limits; + struct queue_limits limits = {0}; int r; mutex_lock(&md->suspend_lock); diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 9fcc05c4f88cc5a1907ffcebe9e20e80e7949f4f..d52ea24deb45974f5d73574d2139cf22254de7db 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -752,6 +752,30 @@ static inline blk_status_t nvme_setup_write_zeroes(struct nvme_ns *ns, return BLK_STS_OK; } +static bool nvme_valid_atomic_write(struct request *req) +{ + struct request_queue *q = req->q; + u32 boundary_bytes = queue_atomic_write_boundary_bytes(q); + + if (blk_rq_bytes(req) > queue_atomic_write_unit_max_bytes(q)) + return false; + + if (boundary_bytes) { + u64 mask = boundary_bytes - 1, imask = ~mask; + u64 start = blk_rq_pos(req) << SECTOR_SHIFT; + u64 end = start + blk_rq_bytes(req) - 1; + + /* If greater then must be crossing a boundary */ + if 
(blk_rq_bytes(req) > boundary_bytes) + return false; + + if ((start & imask) != (end & imask)) + return false; + } + + return true; +} + static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns, struct request *req, struct nvme_command *cmnd, enum nvme_opcode op) @@ -768,6 +792,13 @@ static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns, if (req->cmd_flags & REQ_RAHEAD) dsmgmt |= NVME_RW_DSM_FREQ_PREFETCH; + /* + * Ensure that nothing has been sent which cannot be executed + * atomically. + */ + if (req->cmd_flags & REQ_ATOMIC && !nvme_valid_atomic_write(req)) + return BLK_STS_INVAL; + cmnd->rw.opcode = op; cmnd->rw.nsid = cpu_to_le32(ns->head->ns_id); cmnd->rw.slba = cpu_to_le64(nvme_sect_to_lba(ns, blk_rq_pos(req))); @@ -2011,6 +2042,28 @@ static int nvme_configure_metadata(struct nvme_ns *ns, struct nvme_id_ns *id) return 0; } +static void nvme_update_atomic_write_disk_info(struct nvme_ns *ns, + struct nvme_id_ns *id, struct queue_limits *lim, + u32 bs, u32 atomic_bs) +{ + unsigned int boundary = 0; + + if (id->nsfeat & NVME_NS_FEAT_ATOMICS && id->nawupf) { + if (le16_to_cpu(id->nabspf)) + boundary = (le16_to_cpu(id->nabspf) + 1) * bs; + } + + if (!lim->aw_limits) + return; + + lim->aw_limits->atomic_write_hw_max = atomic_bs; + lim->aw_limits->atomic_write_hw_boundary = boundary; + lim->aw_limits->atomic_write_hw_unit_min = bs; + lim->aw_limits->atomic_write_hw_unit_max = rounddown_pow_of_two(atomic_bs); + + blk_atomic_writes_update_limits(lim); +} + static void nvme_set_queue_limits(struct nvme_ctrl *ctrl, struct request_queue *q) { @@ -2060,6 +2113,9 @@ static void nvme_update_disk_info(struct gendisk *disk, atomic_bs = (1 + le16_to_cpu(id->nawupf)) * bs; else atomic_bs = (1 + ns->ctrl->subsys->awupf) * bs; + + nvme_update_atomic_write_disk_info(ns, id, &disk->queue->limits, + bs, atomic_bs); } if (id->nsfeat & NVME_NS_FEAT_IO_OPT) { diff --git a/fs/aio.c b/fs/aio.c index 00641a1ad0b3ff0b1d4df7f1c47e25a09a245068..78aaeaf354362bdde67715ceaa9843dfac6e67a6 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -1458,7 +1458,7 @@ static void aio_complete_rw(struct kiocb *kiocb, long res, long res2) iocb_put(iocb); } -static int aio_prep_rw(struct kiocb *req, const struct iocb *iocb) +static int aio_prep_rw(struct kiocb *req, const struct iocb *iocb, int rw_type) { int ret; @@ -1485,7 +1485,7 @@ static int aio_prep_rw(struct kiocb *req, const struct iocb *iocb) } else req->ki_ioprio = get_current_ioprio(); - ret = kiocb_set_rw_flags(req, iocb->aio_rw_flags); + ret = kiocb_set_rw_flags(req, iocb->aio_rw_flags, rw_type); if (unlikely(ret)) return ret; @@ -1537,7 +1537,7 @@ static int aio_read(struct kiocb *req, const struct iocb *iocb, struct file *file; int ret; - ret = aio_prep_rw(req, iocb); + ret = aio_prep_rw(req, iocb, READ); if (ret) return ret; file = req->ki_filp; @@ -1565,7 +1565,7 @@ static int aio_write(struct kiocb *req, const struct iocb *iocb, struct file *file; int ret; - ret = aio_prep_rw(req, iocb); + ret = aio_prep_rw(req, iocb, WRITE); if (ret) return ret; file = req->ki_filp; diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c index 892a4f8109e50d32df2b6cc55126156bae8c65fd..7ada6ec851017457aedbf87b525ed6d90b3e9941 100644 --- a/fs/iomap/direct-io.c +++ b/fs/iomap/direct-io.c @@ -210,15 +210,22 @@ iomap_dio_zero(struct iomap_dio *dio, struct iomap *iomap, loff_t pos, struct page *page = ZERO_PAGE(0); int flags = REQ_SYNC | REQ_IDLE; struct bio *bio; + unsigned size; + unsigned nr_pages = (len + PAGE_SIZE - 1) >> PAGE_SHIFT; - bio = bio_alloc(GFP_KERNEL, 1); + bio = 
bio_alloc(GFP_KERNEL, nr_pages); bio_set_dev(bio, iomap->bdev); bio->bi_iter.bi_sector = iomap_sector(iomap, pos); bio->bi_private = dio; bio->bi_end_io = iomap_dio_bio_end_io; - get_page(page); - __bio_add_page(bio, page, len, 0); + while (len > 0) { + size = len > PAGE_SIZE ? PAGE_SIZE : len; + get_page(page); + __bio_add_page(bio, page, size, 0); + len -= size; + pos += size; + } bio_set_op_attrs(bio, REQ_OP_WRITE, flags); iomap_dio_submit_bio(dio, iomap, bio, pos); } @@ -227,8 +234,9 @@ static loff_t iomap_dio_bio_actor(struct inode *inode, loff_t pos, loff_t length, struct iomap_dio *dio, struct iomap *iomap) { + bool is_atomic = dio->iocb->ki_flags & IOCB_ATOMIC; unsigned int blkbits = blksize_bits(bdev_logical_block_size(iomap->bdev)); - unsigned int fs_block_size = i_blocksize(inode), pad; + unsigned int zeroing_size, pad; unsigned int align = iov_iter_alignment(dio->submit.iter); struct bio *bio; bool need_zeroout = false; @@ -237,6 +245,8 @@ iomap_dio_bio_actor(struct inode *inode, loff_t pos, loff_t length, size_t copied = 0; size_t orig_count; + zeroing_size = i_blocksize(inode) << iomap->extent_shift; + if ((pos | length | align) & ((1 << blkbits) - 1)) return -EINVAL; @@ -280,7 +290,7 @@ iomap_dio_bio_actor(struct inode *inode, loff_t pos, loff_t length, if (need_zeroout) { /* zero out from the start of the block to the write offset */ - pad = pos & (fs_block_size - 1); + pad = pos & (zeroing_size - 1); if (pad) iomap_dio_zero(dio, iomap, pos - pad, pad); } @@ -314,8 +324,16 @@ iomap_dio_bio_actor(struct inode *inode, loff_t pos, loff_t length, } n = bio->bi_iter.bi_size; + if (is_atomic && (n != orig_count)) { + /* This bio should have covered the complete length */ + ret = -EINVAL; + bio_put(bio); + goto out; + } if (dio->flags & IOMAP_DIO_WRITE) { bio->bi_opf = REQ_OP_WRITE | REQ_SYNC | REQ_IDLE; + if (is_atomic) + bio->bi_opf |= REQ_ATOMIC; if (use_fua) bio->bi_opf |= REQ_FUA; else @@ -345,9 +363,9 @@ iomap_dio_bio_actor(struct inode *inode, loff_t pos, loff_t length, if (need_zeroout || ((dio->flags & IOMAP_DIO_WRITE) && pos >= i_size_read(inode))) { /* zero out from the end of the write to the end of the block */ - pad = pos & (fs_block_size - 1); + pad = pos & (zeroing_size - 1); if (pad) - iomap_dio_zero(dio, iomap, pos, fs_block_size - pad); + iomap_dio_zero(dio, iomap, pos, zeroing_size - pad); } out: /* Undo iter limitation to current extent */ diff --git a/fs/read_write.c b/fs/read_write.c index 371a5a76f30e05d4728480012c07ec5c3bcb9661..da03b3e65cf3be6ab98bc26302e8ac9109ebef8f 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -726,7 +726,7 @@ static ssize_t do_iter_readv_writev(struct file *filp, struct iov_iter *iter, ssize_t ret; init_sync_kiocb(&kiocb, filp); - ret = kiocb_set_rw_flags(&kiocb, flags); + ret = kiocb_set_rw_flags(&kiocb, flags, type); if (ret) return ret; kiocb.ki_pos = (ppos ? 
*ppos : 0); diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 15e9e335d1672823c613f76fda6c926eb0cabd95..7682dfe2f7010b73bf3e6fa4d3f65b9892b2c339 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -3487,6 +3487,18 @@ xfs_bmap_btalloc( args.fsbno = ap->blkno; args.oinfo = XFS_RMAP_OINFO_SKIP_UPDATE; + /* + * xfs_get_cowextsz_hint() returns extsz_hint for when forcealign is + * set as forcealign and cowextsz_hint are mutually exclusive + */ + if (xfs_inode_forcealign(ap->ip) && align) { + args.alignment = align; + if (stripe_align == 0 || stripe_align % align) + stripe_align = align; + } else { + args.alignment = 1; + } + /* Trim the allocation back to the maximum an AG can fit. */ args.maxlen = min(ap->length, mp->m_ag_max_usable); blen = 0; @@ -3558,7 +3570,6 @@ xfs_bmap_btalloc( atype = args.type; tryagain = 1; args.type = XFS_ALLOCTYPE_THIS_BNO; - args.alignment = 1; /* * Compute the minlen+alignment for the * next case. Set slop so that the value @@ -3577,7 +3588,6 @@ xfs_bmap_btalloc( args.minalignslop = 0; } } else { - args.alignment = 1; args.minalignslop = 0; } args.postallocs = 1; @@ -3604,7 +3614,9 @@ xfs_bmap_btalloc( if ((error = xfs_alloc_vextent(&args))) return error; } - if (isaligned && args.fsbno == NULLFSBLOCK) { + + if (isaligned && args.fsbno == NULLFSBLOCK && + (args.alignment <= 1 || !xfs_inode_forcealign(ap->ip))) { /* * allocation failed, so turn off alignment and * try again. @@ -5276,6 +5288,12 @@ __xfs_bunmapi( XFS_STATS_INC(mp, xs_blk_unmap); isrt = (whichfork == XFS_DATA_FORK) && XFS_IS_REALTIME_INODE(ip); end = start + len; + if (xfs_inode_forcealign(ip) && ip->i_d.di_extsize > 1 + && S_ISREG(VFS_I(ip)->i_mode)) { + start = roundup_64(start, ip->i_d.di_extsize); + end = rounddown_64(end, ip->i_d.di_extsize); + len = end - start; + } if (!xfs_iext_lookup_extent_before(ip, ifp, &end, &icur, &got)) { *rlen = 0; diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h index 54832df8540f85b706389dcf996cff6e1ad6f520..8e4d4959588410eb528d180be3943d3dbf49bb26 100644 --- a/fs/xfs/libxfs/xfs_format.h +++ b/fs/xfs/libxfs/xfs_format.h @@ -353,11 +353,17 @@ xfs_sb_has_compat_feature( #define XFS_SB_FEAT_RO_COMPAT_RMAPBT (1 << 1) /* reverse map btree */ #define XFS_SB_FEAT_RO_COMPAT_REFLINK (1 << 2) /* reflinked files */ #define XFS_SB_FEAT_RO_COMPAT_INOBTCNT (1 << 3) /* inobt block counts */ +#define XFS_SB_FEAT_RO_COMPAT_FORCEALIGN (1 << 30) /* aligned file data extents */ +#define XFS_SB_FEAT_RO_COMPAT_ATOMICWRITES (1 << 31) /* atomicwrites enabled */ + #define XFS_SB_FEAT_RO_COMPAT_ALL \ (XFS_SB_FEAT_RO_COMPAT_FINOBT | \ XFS_SB_FEAT_RO_COMPAT_RMAPBT | \ XFS_SB_FEAT_RO_COMPAT_REFLINK| \ - XFS_SB_FEAT_RO_COMPAT_INOBTCNT) + XFS_SB_FEAT_RO_COMPAT_INOBTCNT| \ + XFS_SB_FEAT_RO_COMPAT_FORCEALIGN| \ + XFS_SB_FEAT_RO_COMPAT_ATOMICWRITES) + #define XFS_SB_FEAT_RO_COMPAT_UNKNOWN ~XFS_SB_FEAT_RO_COMPAT_ALL static inline bool xfs_sb_has_ro_compat_feature( @@ -972,15 +978,20 @@ static inline void xfs_dinode_put_rdev(struct xfs_dinode *dip, xfs_dev_t rdev) #define XFS_DIFLAG2_REFLINK_BIT 1 /* file's blocks may be shared */ #define XFS_DIFLAG2_COWEXTSIZE_BIT 2 /* copy on write extent size hint */ #define XFS_DIFLAG2_BIGTIME_BIT 3 /* big timestamps */ +/* data extent mappings for regular files must be aligned to extent size hint */ +#define XFS_DIFLAG2_FORCEALIGN_BIT 5 +#define XFS_DIFLAG2_ATOMICWRITES_BIT 6 #define XFS_DIFLAG2_DAX (1 << XFS_DIFLAG2_DAX_BIT) #define XFS_DIFLAG2_REFLINK (1 << XFS_DIFLAG2_REFLINK_BIT) #define 
XFS_DIFLAG2_COWEXTSIZE (1 << XFS_DIFLAG2_COWEXTSIZE_BIT) #define XFS_DIFLAG2_BIGTIME (1 << XFS_DIFLAG2_BIGTIME_BIT) +#define XFS_DIFLAG2_FORCEALIGN (1 << XFS_DIFLAG2_FORCEALIGN_BIT) +#define XFS_DIFLAG2_ATOMICWRITES (1 << XFS_DIFLAG2_ATOMICWRITES_BIT) #define XFS_DIFLAG2_ANY \ (XFS_DIFLAG2_DAX | XFS_DIFLAG2_REFLINK | XFS_DIFLAG2_COWEXTSIZE | \ - XFS_DIFLAG2_BIGTIME) + XFS_DIFLAG2_BIGTIME | XFS_DIFLAG2_FORCEALIGN | XFS_DIFLAG2_ATOMICWRITES) static inline bool xfs_dinode_has_bigtime(const struct xfs_dinode *dip) { diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c index 0970ae3fe5382770e5e675ba88de009c5370a04b..dd9e5de65d52a48434278c6cc7aed8f9f20f7d9d 100644 --- a/fs/xfs/libxfs/xfs_inode_buf.c +++ b/fs/xfs/libxfs/xfs_inode_buf.c @@ -574,6 +574,14 @@ xfs_dinode_verify( !xfs_has_bigtime(mp)) return __this_address; + if (flags2 & XFS_DIFLAG2_FORCEALIGN) { + fa = xfs_inode_validate_forcealign(mp, mode, flags, + be32_to_cpu(dip->di_extsize), + be32_to_cpu(dip->di_cowextsize)); + if (fa) + return fa; + } + return NULL; } @@ -699,3 +707,35 @@ xfs_inode_validate_cowextsize( return NULL; } + +/* Validate the forcealign inode flag */ +xfs_failaddr_t +xfs_inode_validate_forcealign( + struct xfs_mount *mp, + uint16_t mode, + uint16_t flags, + uint32_t extsize, + uint32_t cowextsize) +{ + /* superblock rocompat feature flag */ + if (!xfs_has_forcealign(mp)) + return __this_address; + + /* Only regular files and directories */ + if (!S_ISDIR(mode) && !S_ISREG(mode)) + return __this_address; + + /* Doesn't apply to realtime files */ + if (flags & XFS_DIFLAG_REALTIME) + return __this_address; + + /* Requires a non-zero power-of-2 extent size hint */ + if (extsize == 0 || !is_power_of_2(extsize)) + return __this_address; + + /* Requires no cow extent size hint */ + if (cowextsize != 0) + return __this_address; + + return NULL; +} diff --git a/fs/xfs/libxfs/xfs_inode_buf.h b/fs/xfs/libxfs/xfs_inode_buf.h index 05c3640e135a29a3b4f207769095b44ff045c398..1bcf1415a4b5f5c8ec1333b6f552228a70cddcf4 100644 --- a/fs/xfs/libxfs/xfs_inode_buf.h +++ b/fs/xfs/libxfs/xfs_inode_buf.h @@ -62,6 +62,9 @@ xfs_failaddr_t xfs_inode_validate_extsize(struct xfs_mount *mp, xfs_failaddr_t xfs_inode_validate_cowextsize(struct xfs_mount *mp, uint32_t cowextsize, uint16_t mode, uint16_t flags, uint64_t flags2); +xfs_failaddr_t xfs_inode_validate_forcealign(struct xfs_mount *mp, + uint16_t mode, uint16_t flags, uint32_t extsize, + uint32_t cowextsize); static inline uint64_t xfs_inode_encode_bigtime(struct timespec64 tv) { diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c index c099ccf2787dbdefeed8584389c51ac40ec146b3..a4354504986ce61be4c528e7205284268abf21ac 100644 --- a/fs/xfs/libxfs/xfs_sb.c +++ b/fs/xfs/libxfs/xfs_sb.c @@ -116,6 +116,10 @@ xfs_sb_version_to_features( features |= XFS_FEAT_REFLINK; if (sbp->sb_features_ro_compat & XFS_SB_FEAT_RO_COMPAT_INOBTCNT) features |= XFS_FEAT_INOBTCNT; + if (sbp->sb_features_ro_compat & XFS_SB_FEAT_RO_COMPAT_FORCEALIGN) + features |= XFS_FEAT_FORCEALIGN; + if (sbp->sb_features_ro_compat & XFS_SB_FEAT_RO_COMPAT_ATOMICWRITES) + features |= XFS_FEAT_ATOMICWRITES; if (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_FTYPE) features |= XFS_FEAT_FTYPE; if (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_SPINODES) diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index edf62092125c62a932fe846f8e3492e464b86ccd..5879f03b8660809f1be3945d55cf040ad71e56d8 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -654,6 +654,9 @@ xfs_free_eofblocks( * 
of the file. If not, then there is nothing to do. */ end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)XFS_ISIZE(ip)); + /* Do not free blocks when forcing extent sizes */ + if (xfs_get_extsz(ip) > 1) + end_fsb = roundup_64(end_fsb, xfs_get_extsz(ip)); last_fsb = XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes); if (last_fsb <= end_fsb) return 0; @@ -925,8 +928,11 @@ xfs_free_file_space( startoffset_fsb = XFS_B_TO_FSB(mp, offset); endoffset_fsb = XFS_B_TO_FSBT(mp, offset + len); - /* We can only free complete realtime extents. */ - if (XFS_IS_REALTIME_INODE(ip) && mp->m_sb.sb_rextsize > 1) { + /* Free only complete extents. */ + if (xfs_inode_forcealign(ip) && ip->i_d.di_extsize > 1) { + startoffset_fsb = roundup_64(startoffset_fsb, ip->i_d.di_extsize); + endoffset_fsb = rounddown_64(endoffset_fsb, ip->i_d.di_extsize); + } else if (XFS_IS_REALTIME_INODE(ip) && mp->m_sb.sb_rextsize > 1) { startoffset_fsb = roundup_64(startoffset_fsb, mp->m_sb.sb_rextsize); endoffset_fsb = rounddown_64(endoffset_fsb, diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 52643eac5d46ee692b1d171eec8effaa511e503c..49db1611de96f249a02ea9ba828425e9616719fb 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -60,7 +60,10 @@ xfs_is_falloc_aligned( } mask = XFS_FSB_TO_B(mp, mp->m_sb.sb_rextsize) - 1; } else { - mask = mp->m_sb.sb_blocksize - 1; + if (xfs_inode_forcealign(ip) && ip->i_d.di_extsize > 1) + mask = (mp->m_sb.sb_blocksize * ip->i_d.di_extsize) - 1; + else + mask = mp->m_sb.sb_blocksize - 1; } return !((pos | len) & mask); @@ -586,6 +589,14 @@ xfs_file_dio_aio_write( size_t count = iov_iter_count(from); struct xfs_buftarg *target = xfs_inode_buftarg(ip); + if (iocb->ki_flags & IOCB_ATOMIC) { + if (!generic_atomic_write_valid(iocb->ki_pos, count, + i_blocksize(inode), + XFS_FSB_TO_B(mp, xfs_get_extsz(ip)))) { + return -EINVAL; + } + } + /* DIO must be aligned to device logical sector size */ if ((iocb->ki_pos | count) & target->bt_logical_sectormask) return -EINVAL; @@ -597,8 +608,8 @@ xfs_file_dio_aio_write( * the inode as necessary for EOF zeroing cases and fill out the new * inode size as appropriate. */ - if ((iocb->ki_pos & mp->m_blockmask) || - ((iocb->ki_pos + count) & mp->m_blockmask)) { + if ((iocb->ki_pos & (XFS_FSB_TO_B(mp, xfs_get_extsz(ip)) - 1)) || + ((iocb->ki_pos + count) & (XFS_FSB_TO_B(mp, xfs_get_extsz(ip)) - 1))) { unaligned_io = 1; /* @@ -1179,6 +1190,25 @@ xfs_file_remap_range( return remapped > 0 ? remapped : ret; } +static bool xfs_file_open_can_atomicwrite( + struct inode *inode, + struct file *file) +{ + struct xfs_inode *ip = XFS_I(inode); + struct xfs_buftarg *target = xfs_inode_buftarg(ip); + + if (!(file->f_flags & O_DIRECT)) + return false; + + if (!xfs_inode_atomicwrites(ip)) + return false; + + if (!bdev_can_atomic_write(target->bt_bdev)) + return false; + + return true; +} + STATIC int xfs_file_open( struct inode *inode, @@ -1189,6 +1219,8 @@ xfs_file_open( if (xfs_is_shutdown(XFS_M(inode->i_sb))) return -EIO; file->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC; + if (xfs_file_open_can_atomicwrite(inode, file)) + file->f_mode |= FMODE_CAN_ATOMIC_WRITE; return 0; } diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 268bbc2d978b8cb2d4d052905dfe0d63f7d9ace7..f77a27f73bb59b89c2913926411c90ee8a6f456b 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -68,6 +68,20 @@ xfs_get_extsz_hint( return 0; } +/* + * Helper function to extract extent size. It will return a power-of-2, + * as forcealign requires this. 
+ */ +xfs_extlen_t +xfs_get_extsz( + struct xfs_inode *ip) +{ + if (xfs_inode_forcealign(ip) && ip->i_d.di_extsize) + return ip->i_d.di_extsize; + + return 1; +} + /* * Helper function to extract CoW extent size hint from inode. * Between the extent size hint and the CoW extent size hint, we @@ -643,6 +657,10 @@ _xfs_dic2xflags( flags |= FS_XFLAG_DAX; if (di_flags2 & XFS_DIFLAG2_COWEXTSIZE) flags |= FS_XFLAG_COWEXTSIZE; + if (di_flags2 & XFS_DIFLAG2_FORCEALIGN) + flags |= FS_XFLAG_FORCEALIGN; + if (di_flags2 & XFS_DIFLAG2_ATOMICWRITES) + flags |= FS_XFLAG_ATOMICWRITES; } if (has_attr) @@ -759,6 +777,18 @@ xfs_inode_inherit_flags2( } if (pip->i_d.di_flags2 & XFS_DIFLAG2_DAX) ip->i_d.di_flags2 |= XFS_DIFLAG2_DAX; + if (pip->i_d.di_flags2 & XFS_DIFLAG2_FORCEALIGN) + ip->i_d.di_flags2 |= XFS_DIFLAG2_FORCEALIGN; + + if (ip->i_d.di_flags2 & XFS_DIFLAG2_FORCEALIGN) { + xfs_failaddr_t failaddr; + + failaddr = xfs_inode_validate_forcealign(ip->i_mount, + VFS_I(ip)->i_mode, ip->i_d.di_flags, ip->i_d.di_extsize, + ip->i_d.di_cowextsize); + if (failaddr) + ip->i_d.di_flags2 &= ~XFS_DIFLAG2_FORCEALIGN; + } } /* diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index b552daae323fcdcac75abddcc069285ab46f5cbd..b5b97be319e66e60374fb514108e2a278b328ed0 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -268,6 +268,16 @@ static inline bool xfs_inode_has_bigtime(struct xfs_inode *ip) return ip->i_d.di_flags2 & XFS_DIFLAG2_BIGTIME; } +static inline bool xfs_inode_forcealign(struct xfs_inode *ip) +{ + return ip->i_d.di_flags2 & XFS_DIFLAG2_FORCEALIGN; +} + +static inline bool xfs_inode_atomicwrites(struct xfs_inode *ip) +{ + return ip->i_d.di_flags2 & XFS_DIFLAG2_ATOMICWRITES; +} + /* * Return the buftarg used for data allocations on a given inode. */ @@ -489,6 +499,7 @@ void xfs_lock_two_inodes(struct xfs_inode *ip0, uint ip0_mode, struct xfs_inode *ip1, uint ip1_mode); xfs_extlen_t xfs_get_extsz_hint(struct xfs_inode *ip); +xfs_extlen_t xfs_get_extsz(struct xfs_inode *ip); xfs_extlen_t xfs_get_cowextsz_hint(struct xfs_inode *ip); int xfs_dir_ialloc(struct xfs_trans **, struct xfs_inode *, umode_t, diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index 2337eb272235407a7e2f7b927097a52c6b8fb516..cd2802d55def4d8bef6b478701eb5a94f573714f 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -1198,6 +1198,10 @@ xfs_flags2diflags2( di_flags2 |= XFS_DIFLAG2_DAX; if (xflags & FS_XFLAG_COWEXTSIZE) di_flags2 |= XFS_DIFLAG2_COWEXTSIZE; + if (xflags & FS_XFLAG_FORCEALIGN) + di_flags2 |= XFS_DIFLAG2_FORCEALIGN; + if (xflags & FS_XFLAG_ATOMICWRITES) + di_flags2 |= XFS_DIFLAG2_ATOMICWRITES; return di_flags2; } @@ -1210,10 +1214,12 @@ xfs_ioctl_setattr_xflags( { struct xfs_mount *mp = ip->i_mount; uint64_t di_flags2; + bool atomic_writes = fa->fsx_xflags & FS_XFLAG_ATOMICWRITES; - /* Can't change realtime flag if any extents are allocated. */ + /* Can't change realtime or atomic flag if any extents are allocated. */ if ((ip->i_df.if_nextents || ip->i_delayed_blks) && - XFS_IS_REALTIME_INODE(ip) != (fa->fsx_xflags & FS_XFLAG_REALTIME)) + (XFS_IS_REALTIME_INODE(ip) != (fa->fsx_xflags & FS_XFLAG_REALTIME) || + atomic_writes != xfs_inode_atomicwrites(ip))) return -EINVAL; /* If realtime flag is set then must have realtime device */ @@ -1236,6 +1242,29 @@ xfs_ioctl_setattr_xflags( if (di_flags2 && !xfs_has_v3inodes(mp)) return -EINVAL; + /* + * Force-align requires a nonzero extent size hint and a zero cow + * extent size hint. It doesn't apply to realtime files. 
+ */ + if (fa->fsx_xflags & FS_XFLAG_FORCEALIGN) { + if (!xfs_has_forcealign(mp)) + return -EINVAL; + if (fa->fsx_xflags & FS_XFLAG_COWEXTSIZE) + return -EINVAL; + if (!(fa->fsx_xflags & (FS_XFLAG_EXTSIZE | + FS_XFLAG_EXTSZINHERIT))) + return -EINVAL; + if (fa->fsx_xflags & FS_XFLAG_REALTIME) + return -EINVAL; + } + + if (atomic_writes) { + if (!xfs_has_atomicwrites(mp)) + return -EINVAL; + if (!(fa->fsx_xflags & FS_XFLAG_FORCEALIGN)) + return -EINVAL; + } + ip->i_d.di_flags = xfs_flags2diflags(ip, fa->fsx_xflags); ip->i_d.di_flags2 = di_flags2; @@ -1339,6 +1368,9 @@ xfs_ioctl_setattr_check_extsize( struct xfs_mount *mp = ip->i_mount; xfs_extlen_t size; xfs_fsblock_t extsize_fsb; + xfs_failaddr_t failaddr; + uint16_t new_diflags; + uint16_t new_diflags2; if (S_ISREG(VFS_I(ip)->i_mode) && ip->i_df.if_nextents && ((ip->i_d.di_extsize << mp->m_sb.sb_blocklog) != fa->fsx_extsize)) @@ -1363,6 +1395,17 @@ xfs_ioctl_setattr_check_extsize( if (fa->fsx_extsize % size) return -EINVAL; + new_diflags = xfs_flags2diflags(ip, fa->fsx_xflags); + new_diflags2 = xfs_flags2diflags2(ip, fa->fsx_xflags); + if (new_diflags2 & XFS_DIFLAG2_FORCEALIGN) { + failaddr = xfs_inode_validate_forcealign(ip->i_mount, + VFS_I(ip)->i_mode, new_diflags, + XFS_B_TO_FSB(mp, fa->fsx_extsize), + XFS_B_TO_FSB(mp, fa->fsx_cowextsize)); + if (failaddr) + return -EINVAL; + } + return 0; } @@ -1606,6 +1649,10 @@ xfs_ioc_setxflags( } xfs_fill_fsxattr(ip, false, &old_fa); + fa.fsx_extsize = old_fa.fsx_extsize; + fa.fsx_cowextsize = old_fa.fsx_cowextsize; + fa.fsx_projid = old_fa.fsx_projid; + fa.fsx_nextents = old_fa.fsx_nextents; error = vfs_ioc_fssetxattr_check(VFS_I(ip), &old_fa, &fa); if (error) { xfs_trans_cancel(tp); @@ -2069,6 +2116,28 @@ xfs_fs_eofblocks_from_user( return 0; } +static int +xfs_ioc_set_atomic_write( + struct xfs_inode *ip) +{ + struct xfs_trans *tp; + int error; + + tp = xfs_ioctl_setattr_get_trans(ip, NULL); + if (IS_ERR(tp)) { + error = PTR_ERR(tp); + goto out; + } + + ip->i_d.di_flags2 |= XFS_DIFLAG2_ATOMICWRITES; + + xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG); + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + error = xfs_trans_commit(tp); +out: + return error; +} + /* * Note: some of the ioctl's return positive numbers as a * byte count indicating success, such as readlink_by_handle. 
@@ -2096,6 +2165,31 @@ xfs_file_ioctl( return xfs_ioc_getlabel(mp, arg); case FS_IOC_SETFSLABEL: return xfs_ioc_setlabel(filp, mp, arg); + case FS_IOC_SETATOMIC: + if (!xfs_has_atomicwrites(mp)) + return -1; + if (!S_ISREG(inode->i_mode)) + return -1; + if (xfs_inode_atomicwrites(ip)) + return 0; + if (!xfs_inode_forcealign(ip)) + return -1; + + xfs_ilock(ip, XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL); + error = xfs_ioc_set_atomic_write(ip); + xfs_iunlock(ip, XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL); + if (error) { + xfs_alert(mp, "%s: set ino 0x%llx atomic write fail!", + __func__, XFS_I(inode)->i_ino); + return -1; + } else { + struct xfs_buftarg *target = xfs_inode_buftarg(ip); + + if ((filp->f_flags & O_DIRECT) && + bdev_can_atomic_write(target->bt_bdev)) + filp->f_mode |= FMODE_CAN_ATOMIC_WRITE; + return 0; + } case XFS_IOC_ALLOCSP: case XFS_IOC_FREESP: case XFS_IOC_ALLOCSP64: diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 76285db4aaec4a642901134d447a761bb32d010b..dc01689988ab68baf060aaf3ca768c7814e59407 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -90,6 +90,7 @@ xfs_bmbt_to_iomap( { struct xfs_mount *mp = ip->i_mount; struct xfs_buftarg *target = xfs_inode_buftarg(ip); + xfs_extlen_t extsz = xfs_get_extsz(ip); if (unlikely(!xfs_valid_startblock(ip, imap->br_startblock))) return xfs_alert_fsblock_zero(ip, imap); @@ -120,6 +121,8 @@ xfs_bmbt_to_iomap( iomap->validity_cookie = sequence_cookie; iomap->page_ops = &xfs_iomap_page_ops; + if (extsz > 1) + iomap->extent_shift = ffs(extsz) - 1; return 0; } @@ -167,7 +170,9 @@ xfs_eof_alignment( * If mounted with the "-o swalloc" option the alignment is * increased from the strip unit size to the stripe width. */ - if (mp->m_swidth && xfs_has_swalloc(mp)) + if (xfs_inode_forcealign(ip)) + align = xfs_get_extsz_hint(ip); + else if (mp->m_swidth && xfs_has_swalloc(mp)) align = mp->m_swidth; else if (mp->m_dalign) align = mp->m_dalign; @@ -544,11 +549,19 @@ xfs_iomap_write_unwritten( xfs_fsize_t i_size; uint resblks; int error; + xfs_extlen_t extsz = xfs_get_extsz(ip); trace_xfs_unwritten_convert(ip, offset, count); - offset_fsb = XFS_B_TO_FSBT(mp, offset); - count_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + count); + if (extsz > 1) { + xfs_extlen_t extsize_bytes = XFS_FSB_TO_B(mp, extsz); + + offset_fsb = XFS_B_TO_FSBT(mp, round_down(offset, extsize_bytes)); + count_fsb = XFS_B_TO_FSB(mp, round_up(offset + count, extsize_bytes)); + } else { + offset_fsb = XFS_B_TO_FSBT(mp, offset); + count_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + count); + } count_fsb = (xfs_filblks_t)(count_fsb - offset_fsb); /* diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index 21547ff97b5a6782fae4c06f1b2fdec74acf2748..888d6bf9bea7adf2dd30ba73b74e7b2238ce7ed8 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -274,6 +274,9 @@ typedef struct xfs_mount { #define XFS_FEAT_INOBTCNT (1ULL << 23) /* inobt block counts */ #define XFS_FEAT_BIGTIME (1ULL << 24) /* large timestamps */ #define XFS_FEAT_NEEDSREPAIR (1ULL << 25) /* needs xfs_repair */ +#define XFS_FEAT_FORCEALIGN (1ULL << 27) /* aligned file data extents */ +#define XFS_FEAT_ATOMICWRITES (1ULL << 28) /* atomic writes support */ + /* Mount features */ #define XFS_FEAT_NOATTR2 (1ULL << 48) /* disable attr2 creation */ @@ -336,6 +339,8 @@ __XFS_HAS_FEAT(realtime, REALTIME) __XFS_HAS_FEAT(inobtcounts, INOBTCNT) __XFS_HAS_FEAT(bigtime, BIGTIME) __XFS_HAS_FEAT(needsrepair, NEEDSREPAIR) +__XFS_HAS_FEAT(forcealign, FORCEALIGN) +__XFS_HAS_FEAT(atomicwrites, ATOMICWRITES) /* * Mount features 
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 502fb08bfd3812fafea2775a24be40b24da83872..d43f76a4b99a433f17697cf44ed2507cef076d35 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -1639,6 +1639,7 @@ xfs_fc_fill_super( "DAX unsupported by block device. Turning off DAX."); xfs_mount_set_dax_mode(mp, XFS_DAX_NEVER); } + if (xfs_has_reflink(mp)) { xfs_alert(mp, "DAX and reflink cannot be used together!"); @@ -1657,6 +1658,14 @@ xfs_fc_fill_super( } } + if (xfs_has_forcealign(mp)) + xfs_warn(mp, +"EXPERIMENTAL forced data extent alignment feature in use. Use at your own risk!"); + + if (xfs_has_atomicwrites(mp)) + xfs_warn(mp, +"EXPERIMENTAL atomicwrites feature in use. Use at your own risk!"); + if (xfs_has_reflink(mp)) { if (mp->m_sb.sb_rblocks) { xfs_alert(mp, diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 5445d89ae1cfb92a88cc3b75cf3aa941f175f5f1..b49d97547009113b1f812ce3178deae39fe03171 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -133,6 +133,11 @@ typedef u8 __bitwise blk_status_t; */ #define BLK_STS_ZONE_ACTIVE_RESOURCE ((__force blk_status_t)16) +/* + * Invalid size or alignment. + */ +#define BLK_STS_INVAL ((__force blk_status_t)17) + /** * blk_path_error - returns true if error may be path related * @error: status the request was completed with @@ -422,6 +427,7 @@ enum req_flag_bits { /* for driver use */ __REQ_DRV, __REQ_SWAP, /* swapping request. */ + __REQ_ATOMIC, /* for atomic write operations */ __REQ_NR_BITS, /* stops here */ }; @@ -446,6 +452,7 @@ enum req_flag_bits { #define REQ_DRV (1ULL << __REQ_DRV) #define REQ_SWAP (1ULL << __REQ_SWAP) +#define REQ_ATOMIC (1ULL << __REQ_ATOMIC) #define REQ_FAILFAST_MASK \ (REQ_FAILFAST_DEV | REQ_FAILFAST_TRANSPORT | REQ_FAILFAST_DRIVER) diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 4ddfa95f266afaff94bbcd236b210ec90a3a0208..f27a0916a75e723cf5182a5ec5253268f111c2e8 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -323,6 +323,17 @@ enum blk_zoned_model { BLK_ZONED_HM, /* Host-managed zoned block device */ }; +struct queue_atomic_write_limits { + /* atomic write limits */ + unsigned int atomic_write_hw_max; + unsigned int atomic_write_max_sectors; + unsigned int atomic_write_hw_boundary; + unsigned int atomic_write_hw_unit_min; + unsigned int atomic_write_unit_min; + unsigned int atomic_write_hw_unit_max; + unsigned int atomic_write_unit_max; +}; + struct queue_limits { unsigned long bounce_pfn; unsigned long seg_boundary_mask; @@ -355,7 +366,7 @@ struct queue_limits { unsigned char raid_partial_stripes_expensive; enum blk_zoned_model zoned; - KABI_RESERVE(1) + KABI_USE(1, struct queue_atomic_write_limits *aw_limits) }; typedef int (*report_zones_cb)(struct blk_zone *zone, unsigned int idx, @@ -1107,6 +1118,18 @@ static inline unsigned int blk_queue_get_max_sectors(struct request_queue *q, return q->limits.max_sectors; } +static inline unsigned int blk_queue_get_max_sectors_wrapper(struct request *rq) +{ + + struct request_queue *q = rq->q; + int op = req_op(rq); + + if (rq->cmd_flags & REQ_ATOMIC) + return q->limits.aw_limits->atomic_write_max_sectors; + + return blk_queue_get_max_sectors(q, op); +} + /* * Return maximum size of a request at given offset. Only valid for * file system requests. 
@@ -1141,10 +1164,10 @@ static inline unsigned int blk_rq_get_max_sectors(struct request *rq, if (!q->limits.chunk_sectors || req_op(rq) == REQ_OP_DISCARD || req_op(rq) == REQ_OP_SECURE_ERASE) - return blk_queue_get_max_sectors(q, req_op(rq)); + return blk_queue_get_max_sectors_wrapper(rq); return min(blk_max_size_offset(q, offset, 0), - blk_queue_get_max_sectors(q, req_op(rq))); + blk_queue_get_max_sectors_wrapper(rq)); } static inline unsigned int blk_rq_count_bios(struct request *rq) @@ -1181,6 +1204,7 @@ extern void blk_queue_chunk_sectors(struct request_queue *, unsigned int); extern void blk_queue_max_segments(struct request_queue *, unsigned short); extern void blk_queue_max_discard_segments(struct request_queue *, unsigned short); +extern void blk_atomic_writes_update_limits(struct queue_limits *limits); extern void blk_queue_max_segment_size(struct request_queue *, unsigned int); extern void blk_queue_max_discard_sectors(struct request_queue *q, unsigned int max_discard_sectors); @@ -1201,6 +1225,7 @@ extern void blk_limits_io_opt(struct queue_limits *limits, unsigned int opt); extern void blk_queue_io_opt(struct request_queue *q, unsigned int opt); extern void blk_set_queue_depth(struct request_queue *q, unsigned int depth); extern void blk_set_default_limits(struct queue_limits *lim); +extern void blk_set_default_atomic_write_limits(struct queue_limits *lim); extern void blk_set_stacking_limits(struct queue_limits *lim); extern int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, sector_t offset); @@ -1656,6 +1681,30 @@ static inline unsigned int bdev_max_active_zones(struct block_device *bdev) return 0; } +static inline unsigned int +queue_atomic_write_unit_max_bytes(const struct request_queue *q) +{ + return q->limits.aw_limits->atomic_write_unit_max; +} + +static inline unsigned int +queue_atomic_write_unit_min_bytes(const struct request_queue *q) +{ + return q->limits.aw_limits->atomic_write_unit_min; +} + +static inline unsigned int +queue_atomic_write_boundary_bytes(const struct request_queue *q) +{ + return q->limits.aw_limits->atomic_write_hw_boundary; +} + +static inline unsigned int +queue_atomic_write_max_bytes(const struct request_queue *q) +{ + return q->limits.aw_limits->atomic_write_max_sectors << SECTOR_SHIFT; +} + static inline int queue_dma_alignment(const struct request_queue *q) { return q ? 
q->dma_alignment : 511; @@ -2109,4 +2158,24 @@ int fsync_bdev(struct block_device *bdev); struct super_block *freeze_bdev(struct block_device *bdev); int thaw_bdev(struct block_device *bdev, struct super_block *sb); +static inline bool bdev_can_atomic_write(struct block_device *bdev) +{ + struct request_queue *bd_queue = bdev_get_queue(bdev); + struct queue_limits *limits = &bd_queue->limits; + + if (!limits->aw_limits->atomic_write_unit_min) + return false; + + if (bdev_is_partition(bdev)) { + sector_t bd_start_sect = bdev->bd_part->start_sect; + unsigned int alignment = + max(limits->aw_limits->atomic_write_unit_min, + limits->aw_limits->atomic_write_hw_boundary); + if (!IS_ALIGNED(bd_start_sect, alignment)) + return false; + } + + return true; +} + #endif /* _LINUX_BLKDEV_H */ diff --git a/include/linux/fs.h b/include/linux/fs.h index 382a0d4dd3dd3a4b9e3cfa4925bf1ee104615f8b..9d7e901b71fd505b5715821b6860cb19844e65e9 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -184,6 +184,9 @@ typedef int (dio_iodone_t)(struct kiocb *iocb, loff_t offset, /* File supports async buffered reads */ #define FMODE_BUF_RASYNC ((__force fmode_t)0x40000000) +/* File supports atomic writes */ +#define FMODE_CAN_ATOMIC_WRITE ((__force fmode_t)0x80000000) + /* File mode control flag, expect random access pattern */ #define FMODE_CTL_RANDOM ((__force fmode_t)0x1000) @@ -320,6 +323,7 @@ enum rw_hint { #define IOCB_SYNC (__force int) RWF_SYNC #define IOCB_NOWAIT (__force int) RWF_NOWAIT #define IOCB_APPEND (__force int) RWF_APPEND +#define IOCB_ATOMIC (__force int) RWF_ATOMIC /* non-RWF related bits - start at 16 */ #define IOCB_EVENTFD (1 << 16) @@ -3406,7 +3410,8 @@ static inline int iocb_flags(struct file *file) return res; } -static inline int kiocb_set_rw_flags(struct kiocb *ki, rwf_t flags) +static inline int kiocb_set_rw_flags(struct kiocb *ki, rwf_t flags, + int rw_type) { int kiocb_flags = 0; @@ -3423,6 +3428,12 @@ static inline int kiocb_set_rw_flags(struct kiocb *ki, rwf_t flags) return -EOPNOTSUPP; kiocb_flags |= IOCB_NOIO; } + if (flags & RWF_ATOMIC) { + if (rw_type != WRITE) + return -EOPNOTSUPP; + if (!(ki->ki_filp->f_mode & FMODE_CAN_ATOMIC_WRITE)) + return -EOPNOTSUPP; + } kiocb_flags |= (__force int) (flags & RWF_SUPPORTED); if (flags & RWF_SYNC) kiocb_flags |= IOCB_DSYNC; @@ -3665,4 +3676,21 @@ static inline void fs_file_read_do_trace(struct kiocb *iocb) if (tracepoint_enabled(fs_file_read)) fs_file_read_update_args_by_trace(iocb); } + +static inline +bool generic_atomic_write_valid(loff_t pos, size_t len, + unsigned int unit_min, unsigned int unit_max) +{ + if (len < unit_min || len > unit_max) + return false; + + if (!is_power_of_2(len)) + return false; + + if (!IS_ALIGNED(pos, len)) + return false; + + return true; +} + #endif /* _LINUX_FS_H */ diff --git a/include/linux/iomap.h b/include/linux/iomap.h index 0965d5f12858e3d9546d0baa98ac5f128dd604f1..d14a729d40ce3de937e8b41e9a083416bb5391a3 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -93,6 +93,7 @@ struct iomap { u64 length; /* length of mapping, bytes */ u16 type; /* type of mapping */ u16 flags; /* flags for mapping */ + unsigned int extent_shift; struct block_device *bdev; /* block device for I/O */ struct dax_device *dax_dev; /* dax_dev for dax operations */ void *inline_data; diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h index f44eb0a04afdd8cea369af1395c3637a5f69122d..332b0709756b01e60a6d6480b1f0aca345d92d67 100644 --- a/include/uapi/linux/fs.h +++ b/include/uapi/linux/fs.h @@ -140,6 
+140,9 @@ struct fsxattr { #define FS_XFLAG_FILESTREAM 0x00004000 /* use filestream allocator */ #define FS_XFLAG_DAX 0x00008000 /* use DAX for IO */ #define FS_XFLAG_COWEXTSIZE 0x00010000 /* CoW extent size allocator hint */ +/* data extent mappings for regular files must be aligned to extent size hint */ +#define FS_XFLAG_FORCEALIGN 0x00020000 +#define FS_XFLAG_ATOMICWRITES 0x00040000 /* atomic writes enabled */ #define FS_XFLAG_HASATTR 0x80000000 /* no DIFLAG for this */ /* the read-only stuff doesn't really belong here, but any other place is @@ -214,6 +217,7 @@ struct fsxattr { #define FS_IOC_FSSETXATTR _IOW('X', 32, struct fsxattr) #define FS_IOC_GETFSLABEL _IOR(0x94, 49, char[FSLABEL_MAX]) #define FS_IOC_SETFSLABEL _IOW(0x94, 50, char[FSLABEL_MAX]) +#define FS_IOC_SETATOMIC _IOW(0x95, 2, uint) /* * Inode flags (FS_IOC_GETFLAGS / FS_IOC_SETFLAGS) @@ -300,8 +304,11 @@ typedef int __bitwise __kernel_rwf_t; /* per-IO O_APPEND */ #define RWF_APPEND ((__force __kernel_rwf_t)0x00000010) +/* Atomic Write */ +#define RWF_ATOMIC ((__force __kernel_rwf_t)0x00000040) + /* mask of flags supported by the kernel */ #define RWF_SUPPORTED (RWF_HIPRI | RWF_DSYNC | RWF_SYNC | RWF_NOWAIT |\ - RWF_APPEND) + RWF_APPEND | RWF_ATOMIC) #endif /* _UAPI_LINUX_FS_H */ diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 65cf70874fb3ea14728ea1666cbb71ed8e6eb486..c284e9865826946e275eaea8006919e0cfcdc273 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -2956,7 +2956,7 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe, kiocb->ki_pos = READ_ONCE(sqe->off); kiocb->ki_hint = ki_hint_validate(file_write_hint(kiocb->ki_filp)); kiocb->ki_flags = iocb_flags(kiocb->ki_filp); - ret = kiocb_set_rw_flags(kiocb, READ_ONCE(sqe->rw_flags)); + ret = kiocb_set_rw_flags(kiocb, READ_ONCE(sqe->rw_flags), rw); if (unlikely(ret)) return ret;
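
For reference, below is a minimal userspace sketch of how the interface added by this series is intended to be driven. It is illustrative only, not part of the patch: it assumes the target file lives on an XFS filesystem created with the forcealign and atomicwrites features and that the inode already carries FS_XFLAG_ATOMICWRITES (set at creation time or via the FS_IOC_SETATOMIC ioctl added above); it reads the new limits from /sys/block/<disk>/queue/, where queue_attrs[] entries are normally exposed; and it falls back to the RWF_ATOMIC value defined in the patched include/uapi/linux/fs.h when the system headers lack it. The disk name nvme0n1 and the file path are placeholders.

/*
 * atomic_write_demo.c - hedged sketch of driving RWF_ATOMIC from userspace.
 */
#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/uio.h>
#include <unistd.h>

#ifndef RWF_ATOMIC
#define RWF_ATOMIC	0x00000040	/* mirrors the patched include/uapi/linux/fs.h */
#endif

/* Read one of the new request_queue sysfs limits for @disk. */
static long read_queue_limit(const char *disk, const char *attr)
{
	char path[256];
	long val = -1;
	FILE *f;

	snprintf(path, sizeof(path), "/sys/block/%s/queue/%s", disk, attr);
	f = fopen(path, "r");
	if (!f)
		return -1;
	if (fscanf(f, "%ld", &val) != 1)
		val = -1;
	fclose(f);
	return val;
}

int main(int argc, char **argv)
{
	const char *disk = argc > 2 ? argv[2] : "nvme0n1";	/* placeholder */
	long unit_min, unit_max;
	struct iovec iov;
	void *buf;
	size_t len;
	int fd;

	if (argc < 2) {
		fprintf(stderr, "usage: %s <file-on-xfs> [disk]\n", argv[0]);
		return 1;
	}

	unit_min = read_queue_limit(disk, "atomic_write_unit_min_bytes");
	unit_max = read_queue_limit(disk, "atomic_write_unit_max_bytes");
	printf("atomic write units: min=%ld max=%ld\n", unit_min, unit_max);

	/*
	 * Write a single minimum unit: a power-of-two length, placed at an
	 * offset aligned to that length, as generic_atomic_write_valid()
	 * requires.
	 */
	len = unit_min > 0 ? (size_t)unit_min : 4096;

	/* O_DIRECT is required for FMODE_CAN_ATOMIC_WRITE on XFS. */
	fd = open(argv[1], O_WRONLY | O_DIRECT);
	if (fd < 0) {
		perror("open");
		return 1;
	}

	if (posix_memalign(&buf, 4096, len)) {
		perror("posix_memalign");
		close(fd);
		return 1;
	}
	memset(buf, 0xab, len);

	iov.iov_base = buf;
	iov.iov_len = len;

	/* Offset 0 is trivially aligned to len; the write either completes
	 * atomically or fails without tearing. */
	if (pwritev2(fd, &iov, 1, 0, RWF_ATOMIC) < 0)
		perror("pwritev2(RWF_ATOMIC)");

	free(buf);
	close(fd);
	return 0;
}

As the checks in generic_atomic_write_valid(), xfs_file_dio_aio_write() and blk_validate_atomic_write_op_size() above show, such a write is rejected unless its length is a power of two within the advertised unit limits (block size up to the forcealign extent size on XFS) and the file offset is aligned to that length; the block layer additionally refuses to split or merge the resulting REQ_ATOMIC bio across an atomic write boundary.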