diff --git a/Documentation/ABI/testing/sysfs-block b/Documentation/ABI/testing/sysfs-block index e34cdeeeb9d420a673f5236ff59ec99242630a40..6cee984819b33fbf35face4b7dda99c85e55d4cf 100644 --- a/Documentation/ABI/testing/sysfs-block +++ b/Documentation/ABI/testing/sysfs-block @@ -97,6 +97,58 @@ Description: indicates how many bytes the beginning of the device is offset from the disk's natural alignment. +What: /sys/block/<disk>/atomic_write_max_bytes +Date: February 2024 +Contact: Himanshu Madhani +Description: + [RO] This parameter specifies the maximum atomic write + size reported by the device. This parameter is relevant + for merging of writes, where a merged atomic write + operation must not exceed this number of bytes. + This parameter may be greater than the value in + atomic_write_unit_max_bytes, as + atomic_write_unit_max_bytes will be rounded down to a + power-of-two and atomic_write_unit_max_bytes may also be + limited by some other queue limits, such as max_segments. + This parameter - along with atomic_write_unit_min_bytes + and atomic_write_unit_max_bytes - will not be larger than + max_hw_sectors_kb, but may be larger than max_sectors_kb. + + +What: /sys/block/<disk>/atomic_write_unit_min_bytes +Date: February 2024 +Contact: Himanshu Madhani +Description: + [RO] This parameter specifies the smallest block which can + be written atomically with an atomic write operation. All + atomic write operations must begin at an + atomic_write_unit_min boundary and must be multiples of + atomic_write_unit_min. This value must be a power-of-two. + + +What: /sys/block/<disk>/atomic_write_unit_max_bytes +Date: February 2024 +Contact: Himanshu Madhani +Description: + [RO] This parameter defines the largest block which can be + written atomically with an atomic write operation. This + value must be a multiple of atomic_write_unit_min and must + be a power-of-two. This value will not be larger than + atomic_write_max_bytes. + + +What: /sys/block/<disk>/atomic_write_boundary_bytes +Date: February 2024 +Contact: Himanshu Madhani +Description: + [RO] A device may need to internally split I/Os which + straddle a given logical block address boundary. In that + case a single atomic write operation will be processed as + one or more sub-operations which each complete atomically. + This parameter specifies the size in bytes of the atomic + boundary if one is reported by the device. This value must + be a power-of-two. + What: /sys/block/<disk>/<partition>/alignment_offset Date: April 2009 Contact: Martin K.
Petersen diff --git a/block/blk-core.c b/block/blk-core.c index f91f8e8be482d1c93809ddb936437db45c0d8c91..df6564d773e648f6e0d0b7f622d824f64fe5b8e9 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -81,6 +81,7 @@ __setup("precise_iostat=", precise_iostat_setup); * For queue allocation */ struct kmem_cache *blk_requestq_cachep; +struct kmem_cache *queue_atomic_write_cachep; /* * Controlling structure to kblockd @@ -205,6 +206,8 @@ static const struct { [BLK_STS_ZONE_OPEN_RESOURCE] = { -ETOOMANYREFS, "open zones exceeded" }, [BLK_STS_ZONE_ACTIVE_RESOURCE] = { -EOVERFLOW, "active zones exceeded" }, + [BLK_STS_INVAL] = { -EINVAL, "invalid" }, + /* everything else not covered above: */ [BLK_STS_IOERR] = { -EIO, "I/O" }, }; @@ -530,6 +533,7 @@ static void blk_timeout_work(struct work_struct *work) struct request_queue *blk_alloc_queue(int node_id) { struct request_queue *q; + struct queue_atomic_write_limits *aw_limits; int ret; q = kmem_cache_alloc_node(blk_requestq_cachep, @@ -537,11 +541,18 @@ struct request_queue *blk_alloc_queue(int node_id) if (!q) return NULL; + aw_limits = kmem_cache_alloc_node(queue_atomic_write_cachep, + GFP_KERNEL | __GFP_ZERO, node_id); + if (!aw_limits) + goto fail_q; + + q->limits.aw_limits = aw_limits; + q->last_merge = NULL; q->id = ida_simple_get(&blk_queue_ida, 0, 0, GFP_KERNEL); if (q->id < 0) - goto fail_q; + goto fail_aw; ret = bioset_init(&q->bio_split, BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS); if (ret) @@ -592,6 +603,7 @@ struct request_queue *blk_alloc_queue(int node_id) blk_queue_dma_alignment(q, 511); blk_set_default_limits(&q->limits); + blk_set_default_atomic_write_limits(&q->limits); q->nr_requests = BLKDEV_MAX_RQ; return q; @@ -606,6 +618,8 @@ struct request_queue *blk_alloc_queue(int node_id) bioset_exit(&q->bio_split); fail_id: ida_simple_remove(&blk_queue_ida, q->id); +fail_aw: + kmem_cache_free(queue_atomic_write_cachep, aw_limits); fail_q: kmem_cache_free(blk_requestq_cachep, q); return NULL; @@ -819,6 +833,18 @@ static inline blk_status_t blk_check_zone_append(struct request_queue *q, return BLK_STS_OK; } +static blk_status_t blk_validate_atomic_write_op_size(struct request_queue *q, + struct bio *bio) +{ + if (bio->bi_iter.bi_size > queue_atomic_write_unit_max_bytes(q)) + return BLK_STS_INVAL; + + if (bio->bi_iter.bi_size % queue_atomic_write_unit_min_bytes(q)) + return BLK_STS_INVAL; + + return BLK_STS_OK; +} + static noinline_for_stack bool submit_bio_checks(struct bio *bio) { struct request_queue *q = bio->bi_disk->queue; @@ -900,6 +926,13 @@ static noinline_for_stack bool submit_bio_checks(struct bio *bio) if (!q->limits.max_write_zeroes_sectors) goto not_supported; break; + case REQ_OP_WRITE: + if (bio->bi_opf & REQ_ATOMIC) { + status = blk_validate_atomic_write_op_size(q, bio); + if (status != BLK_STS_OK) + goto end_io; + } + break; default: break; } @@ -1155,7 +1188,7 @@ EXPORT_SYMBOL(submit_bio); static blk_status_t blk_cloned_rq_check_limits(struct request_queue *q, struct request *rq) { - unsigned int max_sectors = blk_queue_get_max_sectors(q, req_op(rq)); + unsigned int max_sectors = blk_queue_get_max_sectors(rq); if (blk_rq_sectors(rq) > max_sectors) { /* @@ -1902,6 +1935,8 @@ int __init blk_dev_init(void) blk_requestq_cachep = kmem_cache_create("request_queue", sizeof(struct request_queue), 0, SLAB_PANIC, NULL); + queue_atomic_write_cachep = kmem_cache_create("queue_atomic_write", + sizeof(struct queue_atomic_write_limits), 0, SLAB_PANIC, NULL); blk_debugfs_root = debugfs_create_dir("block", NULL); diff --git a/block/blk-merge.c 
b/block/blk-merge.c index a65d1d275040d833bebe3279a5daa1e266e3016a..3b2004308e93f00a9d47806bcd49e01c83fc83c1 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -13,6 +13,46 @@ #include "blk.h" #include "blk-rq-qos.h" +/* + * rq_straddles_atomic_write_boundary - check for boundary violation + * @rq: request to check + * @front_adjust: data size to be appended to front + * @back_adjust: data size to be appended to back + * + * Determine whether merging a request or bio into another request will result + * in a merged request which straddles an atomic write boundary. + * + * The value @front_adjust is the data which would be appended to the front of + * @rq, while the value @back_adjust is the data which would be appended to the + * back of @rq. Callers will typically only have either @front_adjust or + * @back_adjust as non-zero. + * + */ +static bool rq_straddles_atomic_write_boundary(struct request *rq, + unsigned int front_adjust, + unsigned int back_adjust) +{ + unsigned int boundary = queue_atomic_write_boundary_bytes(rq->q); + u64 mask, start_rq_pos, end_rq_pos; + + if (!boundary) + return false; + + start_rq_pos = blk_rq_pos(rq) << SECTOR_SHIFT; + end_rq_pos = start_rq_pos + blk_rq_bytes(rq) - 1; + + start_rq_pos -= front_adjust; + end_rq_pos += back_adjust; + + mask = ~(boundary - 1); + + /* Top bits are different, so crossed a boundary */ + if ((start_rq_pos & mask) != (end_rq_pos & mask)) + return true; + + return false; +} + static inline bool bio_will_gap(struct request_queue *q, struct request *prev_rq, struct bio *prev, struct bio *next) { @@ -145,11 +185,20 @@ static inline unsigned get_max_io_size(struct request_queue *q, struct bio *bio) { unsigned sectors = blk_max_size_offset(q, bio->bi_iter.bi_sector, 0); - unsigned max_sectors = sectors; unsigned pbs = queue_physical_block_size(q) >> SECTOR_SHIFT; unsigned lbs = queue_logical_block_size(q) >> SECTOR_SHIFT; unsigned start_offset = bio->bi_iter.bi_sector & (pbs - 1); + /* + * We ignore lim->max_sectors for atomic writes simply because + * it may be less than the bio size, which we cannot tolerate.
+ */ + if (bio->bi_opf & REQ_ATOMIC) + max_sectors = q->limits.aw_limits->atomic_write_max_sectors; + else + max_sectors = sectors; + max_sectors += start_offset; max_sectors &= ~(pbs - 1); if (max_sectors > start_offset) @@ -278,6 +327,11 @@ static struct bio *blk_bio_segment_split(struct request_queue *q, *segs = nsegs; return NULL; split: + if (bio->bi_opf & REQ_ATOMIC) { + bio->bi_status = BLK_STS_INVAL; + bio_endio(bio); + return ERR_PTR(-EINVAL); + } *segs = nsegs; return bio_split(bio, sectors, GFP_NOIO, bs); } @@ -594,6 +648,13 @@ int ll_back_merge_fn(struct request *req, struct bio *bio, unsigned int nr_segs) return 0; } + if (req->cmd_flags & REQ_ATOMIC) { + if (rq_straddles_atomic_write_boundary(req, + 0, bio->bi_iter.bi_size)) { + return 0; + } + } + return ll_new_hw_segment(req, bio, nr_segs); } @@ -613,6 +674,13 @@ static int ll_front_merge_fn(struct request *req, struct bio *bio, return 0; } + if (req->cmd_flags & REQ_ATOMIC) { + if (rq_straddles_atomic_write_boundary(req, + bio->bi_iter.bi_size, 0)) { + return 0; + } + } + return ll_new_hw_segment(req, bio, nr_segs); } @@ -649,6 +717,13 @@ static int ll_merge_requests_fn(struct request_queue *q, struct request *req, blk_rq_get_max_sectors(req, blk_rq_pos(req))) return 0; + if (req->cmd_flags & REQ_ATOMIC) { + if (rq_straddles_atomic_write_boundary(req, + 0, blk_rq_bytes(next))) { + return 0; + } + } + total_phys_segments = req->nr_phys_segments + next->nr_phys_segments; if (total_phys_segments > blk_rq_get_max_segments(req)) return 0; @@ -721,6 +796,18 @@ static enum elv_merge blk_try_req_merge(struct request *req, return ELEVATOR_NO_MERGE; } +static bool blk_atomic_write_mergeable_rq_bio(struct request *rq, + struct bio *bio) +{ + return (rq->cmd_flags & REQ_ATOMIC) == (bio->bi_opf & REQ_ATOMIC); +} + +static bool blk_atomic_write_mergeable_rqs(struct request *rq, + struct request *next) +{ + return (rq->cmd_flags & REQ_ATOMIC) == (next->cmd_flags & REQ_ATOMIC); +} + /* * For non-mq, this has to be called with the request spinlock acquired. * For mq with scheduling, the appropriate queue wide lock should be held. */ @@ -752,6 +839,9 @@ static struct request *attempt_merge(struct request_queue *q, if (req->ioprio != next->ioprio) return NULL; + if (!blk_atomic_write_mergeable_rqs(req, next)) + return NULL; + /* * If we are allowed to merge, then append bio list * from next to rq and release next.
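For intuition, the straddle test above reduces to comparing the high bits of the first and last byte addresses under a power-of-two mask. A standalone C sketch with illustrative values only (not kernel code):

    #include <assert.h>
    #include <stdint.h>

    /*
     * Same arithmetic as rq_straddles_atomic_write_boundary(): a range
     * crosses a power-of-two boundary iff the top bits of its first and
     * last byte addresses differ under the boundary mask.
     */
    static int straddles_boundary(uint64_t start, uint64_t len, uint64_t boundary)
    {
            uint64_t mask, end = start + len - 1;

            if (!boundary)
                    return 0;
            mask = ~(boundary - 1);
            return (start & mask) != (end & mask);
    }

    int main(void)
    {
            /* a 4 KiB write at 62 KiB crosses a 64 KiB boundary... */
            assert(straddles_boundary(62 * 1024, 4 * 1024, 64 * 1024));
            /* ...the same write at 60 KiB does not */
            assert(!straddles_boundary(60 * 1024, 4 * 1024, 64 * 1024));
            return 0;
    }
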
merge_requests_fn @@ -895,6 +985,9 @@ bool blk_rq_merge_ok(struct request *rq, struct bio *bio) if (rq->ioprio != bio_prio(bio)) return false; + if (blk_atomic_write_mergeable_rq_bio(rq, bio) == false) + return false; + return true; } diff --git a/block/blk-settings.c b/block/blk-settings.c index c3aa7f8ee388357c7b96b50cd58c6591910bba84..8d378d62178c2f2f35ced6307d317e19fd583547 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -63,6 +63,20 @@ void blk_set_default_limits(struct queue_limits *lim) } EXPORT_SYMBOL(blk_set_default_limits); +void blk_set_default_atomic_write_limits(struct queue_limits *lim) +{ + if (lim->aw_limits) { + lim->aw_limits->atomic_write_hw_max = 0; + lim->aw_limits->atomic_write_max_sectors = 0; + lim->aw_limits->atomic_write_hw_boundary = 0; + lim->aw_limits->atomic_write_hw_unit_min = 0; + lim->aw_limits->atomic_write_unit_min = 0; + lim->aw_limits->atomic_write_hw_unit_max = 0; + lim->aw_limits->atomic_write_unit_max = 0; + } +} +EXPORT_SYMBOL(blk_set_default_atomic_write_limits); + /** * blk_set_stacking_limits - set default limits for stacking devices * @lim: the queue_limits structure to reset @@ -127,6 +141,44 @@ void blk_queue_bounce_limit(struct request_queue *q, u64 max_addr) } EXPORT_SYMBOL(blk_queue_bounce_limit); +/* + * Returns max guaranteed bytes which we can fit in a bio. + * + * We always assume that we can fit in at least PAGE_SIZE in a segment, apart + * from first and last segments. + */ +static +unsigned int blk_queue_max_guaranteed_bio(struct queue_limits *limits) +{ + unsigned int max_segments = min((u16)BIO_MAX_PAGES, limits->max_segments); + unsigned int length; + + length = min(max_segments, 2U) * limits->logical_block_size; + if (max_segments > 2) + length += (max_segments - 2) * PAGE_SIZE; + + return length; +} + +static void blk_atomic_writes_update_limits(struct queue_limits *limits) +{ + unsigned int unit_limit = min(limits->max_hw_sectors << SECTOR_SHIFT, + blk_queue_max_guaranteed_bio(limits)); + + unit_limit = rounddown_pow_of_two(unit_limit); + + if (!limits->aw_limits) + return; + + limits->aw_limits->atomic_write_max_sectors = + min(limits->aw_limits->atomic_write_hw_max >> SECTOR_SHIFT, + limits->max_hw_sectors); + limits->aw_limits->atomic_write_unit_min = + min(limits->aw_limits->atomic_write_hw_unit_min, unit_limit); + limits->aw_limits->atomic_write_unit_max = + min(limits->aw_limits->atomic_write_hw_unit_max, unit_limit); +} + /** * blk_queue_max_hw_sectors - set max sectors for a request for this queue * @q: the request queue for the device @@ -161,6 +213,9 @@ void blk_queue_max_hw_sectors(struct request_queue *q, unsigned int max_hw_secto max_sectors = min_not_zero(max_hw_sectors, limits->max_dev_sectors); max_sectors = min_t(unsigned int, max_sectors, BLK_DEF_MAX_SECTORS); limits->max_sectors = max_sectors; + + blk_atomic_writes_update_limits(limits); + q->backing_dev_info->io_pages = max_sectors >> (PAGE_SHIFT - 9); } EXPORT_SYMBOL(blk_queue_max_hw_sectors); @@ -196,6 +251,62 @@ void blk_queue_max_discard_sectors(struct request_queue *q, } EXPORT_SYMBOL(blk_queue_max_discard_sectors); +/** + * blk_queue_atomic_write_max_bytes - set max bytes supported by + * the device for atomic write operations. 
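Note how blk_atomic_writes_update_limits() derives the advertised unit limits rather than exporting the hardware values directly: the device's unit_max is clamped to a power-of-two no larger than what a bio is always guaranteed to carry. A minimal userspace model of that derivation, with made-up numbers:

    #include <stdint.h>
    #include <stdio.h>

    /* rounddown_pow_of_two() equivalent: clear low bits until one remains */
    static uint32_t rounddown_pow2(uint32_t x)
    {
            while (x & (x - 1))
                    x &= x - 1;
            return x;
    }

    int main(void)
    {
            uint32_t hw_unit_max = 128 * 1024; /* device-reported, assumed */
            uint32_t guaranteed = 96 * 1024;   /* blk_queue_max_guaranteed_bio() result, assumed */
            uint32_t unit_limit = rounddown_pow2(guaranteed);
            uint32_t unit_max = hw_unit_max < unit_limit ? hw_unit_max : unit_limit;

            /* prints 65536: the power-of-two clamp wins over the hw value */
            printf("atomic_write_unit_max_bytes = %u\n", unit_max);
            return 0;
    }
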
+ * @q: the request queue for the device + * @bytes: maximum bytes supported + */ +void blk_queue_atomic_write_max_bytes(struct request_queue *q, + unsigned int bytes) +{ + q->limits.aw_limits->atomic_write_hw_max = bytes; + blk_atomic_writes_update_limits(&q->limits); +} +EXPORT_SYMBOL(blk_queue_atomic_write_max_bytes); + +/** + * blk_queue_atomic_write_boundary_bytes - Device's logical block address space + * boundary which an atomic write should not cross. + * @q: the request queue for the device + * @bytes: must be a power-of-two. + */ +void blk_queue_atomic_write_boundary_bytes(struct request_queue *q, + unsigned int bytes) +{ + q->limits.aw_limits->atomic_write_hw_boundary = bytes; +} +EXPORT_SYMBOL(blk_queue_atomic_write_boundary_bytes); + +/** + * blk_queue_atomic_write_unit_min_bytes - smallest unit that can be written + * atomically to the device. + * @q: the request queue for the device + * @bytes: must be a power-of-two. + */ +void blk_queue_atomic_write_unit_min_bytes(struct request_queue *q, + unsigned int bytes) +{ + q->limits.aw_limits->atomic_write_hw_unit_min = bytes; + blk_atomic_writes_update_limits(&q->limits); +} +EXPORT_SYMBOL(blk_queue_atomic_write_unit_min_bytes); + +/** + * blk_queue_atomic_write_unit_max_bytes - largest unit that can be written + * atomically to the device. + * @q: the request queue for the device + * @bytes: must be a power-of-two. + */ +void blk_queue_atomic_write_unit_max_bytes(struct request_queue *q, + unsigned int bytes) +{ + q->limits.aw_limits->atomic_write_hw_unit_max = bytes; + blk_atomic_writes_update_limits(&q->limits); +} +EXPORT_SYMBOL(blk_queue_atomic_write_unit_max_bytes); + + /** * blk_queue_max_write_same_sectors - set max sectors for a single write same * @q: the request queue for the device diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index c95be9626a09807a07d8b5563540edd3e7a80de7..8e35d63b30c245efe8726eac0893d6b1fcd684a5 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -126,6 +126,30 @@ static ssize_t queue_max_discard_segments_show(struct request_queue *q, return queue_var_show(queue_max_discard_segments(q), (page)); } +static ssize_t queue_atomic_write_max_bytes_show(struct request_queue *q, + char *page) +{ + return queue_var_show(queue_atomic_write_max_bytes(q), page); +} + +static ssize_t queue_atomic_write_boundary_show(struct request_queue *q, + char *page) +{ + return queue_var_show(queue_atomic_write_boundary_bytes(q), page); +} + +static ssize_t queue_atomic_write_unit_min_show(struct request_queue *q, + char *page) +{ + return queue_var_show(queue_atomic_write_unit_min_bytes(q), page); +} + +static ssize_t queue_atomic_write_unit_max_show(struct request_queue *q, + char *page) +{ + return queue_var_show(queue_atomic_write_unit_max_bytes(q), page); +} + static ssize_t queue_max_integrity_segments_show(struct request_queue *q, char *page) { return queue_var_show(q->limits.max_integrity_segments, (page)); } @@ -585,6 +609,11 @@ QUEUE_RO_ENTRY(queue_discard_max_hw, "discard_max_hw_bytes"); QUEUE_RW_ENTRY(queue_discard_max, "discard_max_bytes"); QUEUE_RO_ENTRY(queue_discard_zeroes_data, "discard_zeroes_data"); +QUEUE_RO_ENTRY(queue_atomic_write_max_bytes, "atomic_write_max_bytes"); +QUEUE_RO_ENTRY(queue_atomic_write_boundary, "atomic_write_boundary_bytes"); +QUEUE_RO_ENTRY(queue_atomic_write_unit_max, "atomic_write_unit_max_bytes"); +QUEUE_RO_ENTRY(queue_atomic_write_unit_min, "atomic_write_unit_min_bytes"); + QUEUE_RO_ENTRY(queue_write_same_max, "write_same_max_bytes"); QUEUE_RO_ENTRY(queue_write_zeroes_max,
"write_zeroes_max_bytes"); QUEUE_RO_ENTRY(queue_zone_append_max, "zone_append_max_bytes"); @@ -639,6 +668,10 @@ static struct attribute *queue_attrs[] = { &queue_discard_max_entry.attr, &queue_discard_max_hw_entry.attr, &queue_discard_zeroes_data_entry.attr, + &queue_atomic_write_max_bytes_entry.attr, + &queue_atomic_write_boundary_entry.attr, + &queue_atomic_write_unit_min_entry.attr, + &queue_atomic_write_unit_max_entry.attr, &queue_write_same_max_entry.attr, &queue_write_zeroes_max_entry.attr, &queue_zone_append_max_entry.attr, @@ -731,6 +764,7 @@ static void blk_free_queue_rcu(struct rcu_head *rcu_head) rcu_head); percpu_ref_exit(&q->q_usage_counter); + kmem_cache_free(queue_atomic_write_cachep, q->limits.aw_limits); kmem_cache_free(blk_requestq_cachep, q); } diff --git a/block/blk.h b/block/blk.h index 4bbcc971d4f734d5a03163748ed5924dbb4c5389..1bdd07040d42f6e8af87574f2223210aa604bc12 100644 --- a/block/blk.h +++ b/block/blk.h @@ -29,6 +29,7 @@ struct blk_flush_queue { }; extern struct kmem_cache *blk_requestq_cachep; +extern struct kmem_cache *queue_atomic_write_cachep; extern struct kobj_type blk_queue_ktype; extern struct ida blk_queue_ida; diff --git a/drivers/block/rnbd/rnbd-srv-dev.h b/drivers/block/rnbd/rnbd-srv-dev.h index 0eb23850afb954fc90a72cae9c252296b7fff06e..e41ce52ab8f1e99f0e6895364d857980521e0781 100644 --- a/drivers/block/rnbd/rnbd-srv-dev.h +++ b/drivers/block/rnbd/rnbd-srv-dev.h @@ -66,8 +66,7 @@ static inline int rnbd_dev_get_max_discard_sects(const struct rnbd_dev *dev) if (!blk_queue_discard(bdev_get_queue(dev->bdev))) return 0; - return blk_queue_get_max_sectors(bdev_get_queue(dev->bdev), - REQ_OP_DISCARD); + return bdev_get_queue(dev->bdev)->limits.max_discard_sectors; } static inline int rnbd_dev_get_discard_granularity(const struct rnbd_dev *dev) diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index eb95b5ce7b8fe2d2f124a848d45039de6379af9b..d8f54d2a74d9ccd0e07d001950e82bbeb4613323 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -1816,6 +1816,7 @@ void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q, /* * Copy table's limits to the DM device's request_queue */ + limits->aw_limits = q->limits.aw_limits; q->limits = *limits; if (dm_table_supports_nowait(t)) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 9fcc05c4f88cc5a1907ffcebe9e20e80e7949f4f..76626f5a737f50631c4aa9b1a91c96a343f8cba5 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -752,6 +752,30 @@ static inline blk_status_t nvme_setup_write_zeroes(struct nvme_ns *ns, return BLK_STS_OK; } +static bool nvme_valid_atomic_write(struct request *req) +{ + struct request_queue *q = req->q; + u32 boundary_bytes = queue_atomic_write_boundary_bytes(q); + + if (blk_rq_bytes(req) > queue_atomic_write_unit_max_bytes(q)) + return false; + + if (boundary_bytes) { + u64 mask = boundary_bytes - 1, imask = ~mask; + u64 start = blk_rq_pos(req) << SECTOR_SHIFT; + u64 end = start + blk_rq_bytes(req) - 1; + + /* If larger than the boundary size, it must cross a boundary */ + if (blk_rq_bytes(req) > boundary_bytes) + return false; + + if ((start & imask) != (end & imask)) + return false; + } + + return true; +} + static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns, struct request *req, struct nvme_command *cmnd, enum nvme_opcode op) @@ -768,6 +792,13 @@ static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns, if (req->cmd_flags & REQ_RAHEAD) dsmgmt |= NVME_RW_DSM_FREQ_PREFETCH; + /* + * Ensure that nothing has been sent which cannot be
executed + * atomically. + */ + if (req->cmd_flags & REQ_ATOMIC && !nvme_valid_atomic_write(req)) + return BLK_STS_INVAL; + cmnd->rw.opcode = op; cmnd->rw.nsid = cpu_to_le32(ns->head->ns_id); cmnd->rw.slba = cpu_to_le64(nvme_sect_to_lba(ns, blk_rq_pos(req))); @@ -2011,6 +2042,26 @@ static int nvme_configure_metadata(struct nvme_ns *ns, struct nvme_id_ns *id) return 0; } +static void nvme_update_atomic_write_disk_info(struct nvme_ns *ns, + struct nvme_id_ns *id, struct queue_limits *lim, + u32 bs, u32 atomic_bs) +{ + unsigned int boundary = 0; + + if (id->nsfeat & NVME_NS_FEAT_ATOMICS && id->nawupf) { + if (le16_to_cpu(id->nabspf)) + boundary = (le16_to_cpu(id->nabspf) + 1) * bs; + } + + if (!lim->aw_limits) + return; + + lim->aw_limits->atomic_write_hw_max = atomic_bs; + lim->aw_limits->atomic_write_hw_boundary = boundary; + lim->aw_limits->atomic_write_hw_unit_min = bs; + lim->aw_limits->atomic_write_hw_unit_max = rounddown_pow_of_two(atomic_bs); +} + static void nvme_set_queue_limits(struct nvme_ctrl *ctrl, struct request_queue *q) { @@ -2060,6 +2111,9 @@ static void nvme_update_disk_info(struct gendisk *disk, atomic_bs = (1 + le16_to_cpu(id->nawupf)) * bs; else atomic_bs = (1 + ns->ctrl->subsys->awupf) * bs; + + nvme_update_atomic_write_disk_info(ns, id, &disk->queue->limits, + bs, atomic_bs); } if (id->nsfeat & NVME_NS_FEAT_IO_OPT) { @@ -2160,6 +2214,7 @@ static int nvme_update_ns_info(struct nvme_ns *ns, struct nvme_id_ns *id) goto out_unfreeze; nvme_set_chunk_sectors(ns, id); nvme_update_disk_info(ns->disk, ns, id); + nvme_set_queue_limits(ns->ctrl, ns->queue); blk_mq_unfreeze_queue(ns->disk->queue); if (blk_queue_is_zoned(ns->queue)) { @@ -2172,6 +2227,7 @@ static int nvme_update_ns_info(struct nvme_ns *ns, struct nvme_id_ns *id) if (ns->head->disk) { blk_mq_freeze_queue(ns->head->disk->queue); nvme_update_disk_info(ns->head->disk, ns, id); + nvme_set_queue_limits(ns->ctrl, ns->head->disk->queue); blk_stack_limits(&ns->head->disk->queue->limits, &ns->queue->limits, 0); blk_queue_update_readahead(ns->head->disk->queue); diff --git a/drivers/scsi/cxlflash/vlun.c b/drivers/scsi/cxlflash/vlun.c index f1406ac77b0d595b3cae330e53646ee2d69e70c4..ea0a08a1696a018cddbd7d008bceb99fb0a492d7 100644 --- a/drivers/scsi/cxlflash/vlun.c +++ b/drivers/scsi/cxlflash/vlun.c @@ -430,8 +430,8 @@ static int write_same16(struct scsi_device *sdev, struct device *dev = &cfg->dev->dev; const u32 s = ilog2(sdev->sector_size) - 9; const u32 to = sdev->request_queue->rq_timeout; - const u32 ws_limit = blk_queue_get_max_sectors(sdev->request_queue, - REQ_OP_WRITE_SAME) >> s; + const u32 ws_limit = + sdev->request_queue->limits.max_write_zeroes_sectors >> s; cmd_buf = kzalloc(CMD_BUFSIZE, GFP_KERNEL); scsi_cmd = kzalloc(MAX_COMMAND_SIZE, GFP_KERNEL); diff --git a/fs/aio.c b/fs/aio.c index 00641a1ad0b3ff0b1d4df7f1c47e25a09a245068..78aaeaf354362bdde67715ceaa9843dfac6e67a6 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -1458,7 +1458,7 @@ static void aio_complete_rw(struct kiocb *kiocb, long res, long res2) iocb_put(iocb); } -static int aio_prep_rw(struct kiocb *req, const struct iocb *iocb) +static int aio_prep_rw(struct kiocb *req, const struct iocb *iocb, int rw_type) { int ret; @@ -1485,7 +1485,7 @@ static int aio_prep_rw(struct kiocb *req, const struct iocb *iocb) } else req->ki_ioprio = get_current_ioprio(); - ret = kiocb_set_rw_flags(req, iocb->aio_rw_flags); + ret = kiocb_set_rw_flags(req, iocb->aio_rw_flags, rw_type); if (unlikely(ret)) return ret; @@ -1537,7 +1537,7 @@ static int aio_read(struct kiocb *req, const 
struct iocb *iocb, struct file *file; int ret; - ret = aio_prep_rw(req, iocb); + ret = aio_prep_rw(req, iocb, READ); if (ret) return ret; file = req->ki_filp; @@ -1565,7 +1565,7 @@ static int aio_write(struct kiocb *req, const struct iocb *iocb, struct file *file; int ret; - ret = aio_prep_rw(req, iocb); + ret = aio_prep_rw(req, iocb, WRITE); if (ret) return ret; file = req->ki_filp; diff --git a/fs/block_dev.c b/fs/block_dev.c index b9104fc0a395d29773d765b2bd863452e4488aba..84060c7d80a15179ac5c8a4dc223ddc9aee71a46 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -272,6 +272,9 @@ __blkdev_direct_IO_simple(struct kiocb *iocb, struct iov_iter *iter, bio.bi_end_io = blkdev_bio_end_io_simple; bio.bi_ioprio = iocb->ki_ioprio; + if (iocb->ki_flags & IOCB_ATOMIC) + bio.bi_opf |= REQ_ATOMIC; + ret = bio_iov_iter_get_pages(&bio, iter); if (unlikely(ret)) goto out; @@ -452,6 +455,10 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages) bio->bi_opf = dio_bio_write_op(iocb); task_io_account_write(bio->bi_iter.bi_size); } + + if (iocb->ki_flags & IOCB_ATOMIC) + bio->bi_opf |= REQ_ATOMIC; + if (iocb->ki_flags & IOCB_NOWAIT) bio->bi_opf |= REQ_NOWAIT; @@ -521,14 +528,30 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages) return ret; } +static bool blkdev_atomic_write_valid(struct block_device *bdev, loff_t pos, + struct iov_iter *iter) +{ + struct request_queue *q = bdev_get_queue(bdev); + unsigned int min_bytes = queue_atomic_write_unit_min_bytes(q); + unsigned int max_bytes = queue_atomic_write_unit_max_bytes(q); + + return generic_atomic_write_valid(pos, iter, min_bytes, max_bytes); +} + static ssize_t blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter) { int nr_pages; + bool is_atomic = iocb->ki_flags & IOCB_ATOMIC; + struct block_device *bdev = I_BDEV(iocb->ki_filp->f_mapping->host); nr_pages = iov_iter_npages(iter, BIO_MAX_PAGES + 1); if (!nr_pages) return 0; + + if (is_atomic && !blkdev_atomic_write_valid(bdev, iocb->ki_pos, iter)) + return -EINVAL; + if (is_sync_kiocb(iocb) && nr_pages <= BIO_MAX_PAGES) return __blkdev_direct_IO_simple(iocb, iter, nr_pages); @@ -1863,6 +1886,7 @@ EXPORT_SYMBOL(blkdev_get_by_dev); static int blkdev_open(struct inode * inode, struct file * filp) { struct block_device *bdev; + int err; /* * Preserve backwards compatibility and allow large file access @@ -1888,7 +1912,11 @@ static int blkdev_open(struct inode * inode, struct file * filp) filp->f_mapping = bdev->bd_inode->i_mapping; filp->f_wb_err = filemap_sample_wb_err(filp->f_mapping); - return blkdev_get(bdev, filp->f_mode, filp); + err = blkdev_get(bdev, filp->f_mode, filp); + if (!err && bdev_can_atomic_write(bdev) && filp->f_flags & O_DIRECT) + filp->f_mode |= FMODE_CAN_ATOMIC_WRITE; + + return err; } static void __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part) diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index 25448d5827d259d63743e8cefc042b2d5338fe4e..d4778d251d41a6e5b505a2af6a3b6b9dd94d2ad3 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -17,6 +17,7 @@ #include <linux/bio.h> #include <linux/sched/signal.h> #include <linux/migrate.h> +#include <linux/math64.h> #include "trace.h" #include "../internal.h" @@ -1046,11 +1047,10 @@ iomap_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero, EXPORT_SYMBOL_GPL(iomap_zero_range); int -iomap_truncate_page(struct inode *inode, loff_t pos, bool *did_zero, - const struct iomap_ops *ops) +iomap_truncate_page(struct inode *inode, loff_t pos, unsigned int blocksize, + bool *did_zero, const struct iomap_ops *ops)
{ - unsigned int blocksize = i_blocksize(inode); - unsigned int off = pos & (blocksize - 1); + unsigned int off = rem_u64(pos, blocksize); /* Block boundary? Nothing to do */ if (!off) diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c index 892a4f8109e50d32df2b6cc55126156bae8c65fd..063ea9eb8afb21534f8afb1db5a4e0244621516a 100644 --- a/fs/iomap/direct-io.c +++ b/fs/iomap/direct-io.c @@ -210,15 +210,22 @@ iomap_dio_zero(struct iomap_dio *dio, struct iomap *iomap, loff_t pos, struct page *page = ZERO_PAGE(0); int flags = REQ_SYNC | REQ_IDLE; struct bio *bio; + unsigned size; + unsigned nr_pages = (len + PAGE_SIZE - 1) >> PAGE_SHIFT; - bio = bio_alloc(GFP_KERNEL, 1); + bio = bio_alloc(GFP_KERNEL, nr_pages); bio_set_dev(bio, iomap->bdev); bio->bi_iter.bi_sector = iomap_sector(iomap, pos); bio->bi_private = dio; bio->bi_end_io = iomap_dio_bio_end_io; - get_page(page); - __bio_add_page(bio, page, len, 0); + while (len > 0) { + size = len > PAGE_SIZE ? PAGE_SIZE : len; + get_page(page); + __bio_add_page(bio, page, size, 0); + len -= size; + pos += size; + } bio_set_op_attrs(bio, REQ_OP_WRITE, flags); iomap_dio_submit_bio(dio, iomap, bio, pos); } @@ -227,8 +234,9 @@ static loff_t iomap_dio_bio_actor(struct inode *inode, loff_t pos, loff_t length, struct iomap_dio *dio, struct iomap *iomap) { + bool is_atomic = dio->iocb->ki_flags & IOCB_ATOMIC; unsigned int blkbits = blksize_bits(bdev_logical_block_size(iomap->bdev)); - unsigned int fs_block_size = i_blocksize(inode), pad; + unsigned int zeroing_size, pad; unsigned int align = iov_iter_alignment(dio->submit.iter); struct bio *bio; bool need_zeroout = false; @@ -237,6 +245,8 @@ iomap_dio_bio_actor(struct inode *inode, loff_t pos, loff_t length, size_t copied = 0; size_t orig_count; + zeroing_size = i_blocksize(inode) << iomap->extent_shift; + if ((pos | length | align) & ((1 << blkbits) - 1)) return -EINVAL; @@ -280,7 +290,7 @@ iomap_dio_bio_actor(struct inode *inode, loff_t pos, loff_t length, if (need_zeroout) { /* zero out from the start of the block to the write offset */ - pad = pos & (fs_block_size - 1); + pad = pos & (zeroing_size - 1); if (pad) iomap_dio_zero(dio, iomap, pos - pad, pad); } @@ -298,6 +308,7 @@ iomap_dio_bio_actor(struct inode *inode, loff_t pos, loff_t length, bio->bi_iter.bi_sector = iomap_sector(iomap, pos); bio->bi_write_hint = dio->iocb->ki_hint; bio->bi_ioprio = dio->iocb->ki_ioprio; + bio->bi_private = dio; bio->bi_end_io = iomap_dio_bio_end_io; @@ -314,8 +325,16 @@ iomap_dio_bio_actor(struct inode *inode, loff_t pos, loff_t length, } n = bio->bi_iter.bi_size; + if (is_atomic && (n != orig_count)) { + /* This bio should have covered the complete length */ + ret = -EINVAL; + bio_put(bio); + goto out; + } if (dio->flags & IOMAP_DIO_WRITE) { bio->bi_opf = REQ_OP_WRITE | REQ_SYNC | REQ_IDLE; + if (is_atomic) + bio->bi_opf |= REQ_ATOMIC; if (use_fua) bio->bi_opf |= REQ_FUA; else @@ -345,9 +364,9 @@ iomap_dio_bio_actor(struct inode *inode, loff_t pos, loff_t length, if (need_zeroout || ((dio->flags & IOMAP_DIO_WRITE) && pos >= i_size_read(inode))) { /* zero out from the end of the write to the end of the block */ - pad = pos & (fs_block_size - 1); + pad = pos & (zeroing_size - 1); if (pad) - iomap_dio_zero(dio, iomap, pos, fs_block_size - pad); + iomap_dio_zero(dio, iomap, pos, zeroing_size - pad); } out: /* Undo iter limitation to current extent */ diff --git a/fs/read_write.c b/fs/read_write.c index 371a5a76f30e05d4728480012c07ec5c3bcb9661..da03b3e65cf3be6ab98bc26302e8ac9109ebef8f 100644 --- a/fs/read_write.c +++ 
b/fs/read_write.c @@ -726,7 +726,7 @@ static ssize_t do_iter_readv_writev(struct file *filp, struct iov_iter *iter, ssize_t ret; init_sync_kiocb(&kiocb, filp); - ret = kiocb_set_rw_flags(&kiocb, flags); + ret = kiocb_set_rw_flags(&kiocb, flags, type); if (ret) return ret; kiocb.ki_pos = (ppos ? *ppos : 0); diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 15e9e335d1672823c613f76fda6c926eb0cabd95..704d457eed9345f4ae8d618504b5de0a9884f0f8 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -3487,6 +3487,18 @@ xfs_bmap_btalloc( args.fsbno = ap->blkno; args.oinfo = XFS_RMAP_OINFO_SKIP_UPDATE; + /* + * xfs_get_cowextsz_hint() returns the extent size hint when forcealign + * is set, since forcealign and the CoW extent size hint are mutually + * exclusive. + */ + if (xfs_inode_forcealign(ap->ip) && align) { + args.alignment = align; + if (stripe_align == 0 || stripe_align % align) + stripe_align = align; + } else { + args.alignment = 1; + } + /* Trim the allocation back to the maximum an AG can fit. */ args.maxlen = min(ap->length, mp->m_ag_max_usable); blen = 0; @@ -3577,7 +3589,6 @@ xfs_bmap_btalloc( args.minalignslop = 0; } } else { - args.alignment = 1; args.minalignslop = 0; } args.postallocs = 1; @@ -3604,7 +3615,8 @@ xfs_bmap_btalloc( if ((error = xfs_alloc_vextent(&args))) return error; } - if (isaligned && args.fsbno == NULLFSBLOCK) { + + if (isaligned && args.fsbno == NULLFSBLOCK && args.alignment <= 1) { /* * allocation failed, so turn off alignment and * try again. @@ -5276,6 +5288,12 @@ __xfs_bunmapi( XFS_STATS_INC(mp, xs_blk_unmap); isrt = (whichfork == XFS_DATA_FORK) && XFS_IS_REALTIME_INODE(ip); end = start + len; + if (xfs_inode_forcealign(ip) && ip->i_d.di_extsize > 1 + && S_ISREG(VFS_I(ip)->i_mode)) { + start = roundup_64(start, ip->i_d.di_extsize); + end = rounddown_64(end, ip->i_d.di_extsize); + len = end - start; + } if (!xfs_iext_lookup_extent_before(ip, ifp, &end, &icur, &got)) { *rlen = 0; diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h index 54832df8540f85b706389dcf996cff6e1ad6f520..8e4d4959588410eb528d180be3943d3dbf49bb26 100644 --- a/fs/xfs/libxfs/xfs_format.h +++ b/fs/xfs/libxfs/xfs_format.h @@ -353,11 +353,17 @@ xfs_sb_has_compat_feature( #define XFS_SB_FEAT_RO_COMPAT_RMAPBT (1 << 1) /* reverse map btree */ #define XFS_SB_FEAT_RO_COMPAT_REFLINK (1 << 2) /* reflinked files */ #define XFS_SB_FEAT_RO_COMPAT_INOBTCNT (1 << 3) /* inobt block counts */ +#define XFS_SB_FEAT_RO_COMPAT_FORCEALIGN (1 << 30) /* aligned file data extents */ +#define XFS_SB_FEAT_RO_COMPAT_ATOMICWRITES (1 << 31) /* atomicwrites enabled */ + #define XFS_SB_FEAT_RO_COMPAT_ALL \ (XFS_SB_FEAT_RO_COMPAT_FINOBT | \ XFS_SB_FEAT_RO_COMPAT_RMAPBT | \ XFS_SB_FEAT_RO_COMPAT_REFLINK| \ - XFS_SB_FEAT_RO_COMPAT_INOBTCNT) + XFS_SB_FEAT_RO_COMPAT_INOBTCNT| \ + XFS_SB_FEAT_RO_COMPAT_FORCEALIGN| \ + XFS_SB_FEAT_RO_COMPAT_ATOMICWRITES) + #define XFS_SB_FEAT_RO_COMPAT_UNKNOWN ~XFS_SB_FEAT_RO_COMPAT_ALL static inline bool xfs_sb_has_ro_compat_feature( @@ -972,15 +978,20 @@ static inline void xfs_dinode_put_rdev(struct xfs_dinode *dip, xfs_dev_t rdev) #define XFS_DIFLAG2_REFLINK_BIT 1 /* file's blocks may be shared */ #define XFS_DIFLAG2_COWEXTSIZE_BIT 2 /* copy on write extent size hint */ #define XFS_DIFLAG2_BIGTIME_BIT 3 /* big timestamps */ +/* data extent mappings for regular files must be aligned to extent size hint */ +#define XFS_DIFLAG2_FORCEALIGN_BIT 5 +#define XFS_DIFLAG2_ATOMICWRITES_BIT 6 #define XFS_DIFLAG2_DAX (1 << XFS_DIFLAG2_DAX_BIT) #define XFS_DIFLAG2_REFLINK (1
<< XFS_DIFLAG2_REFLINK_BIT) #define XFS_DIFLAG2_COWEXTSIZE (1 << XFS_DIFLAG2_COWEXTSIZE_BIT) #define XFS_DIFLAG2_BIGTIME (1 << XFS_DIFLAG2_BIGTIME_BIT) +#define XFS_DIFLAG2_FORCEALIGN (1 << XFS_DIFLAG2_FORCEALIGN_BIT) +#define XFS_DIFLAG2_ATOMICWRITES (1 << XFS_DIFLAG2_ATOMICWRITES_BIT) #define XFS_DIFLAG2_ANY \ (XFS_DIFLAG2_DAX | XFS_DIFLAG2_REFLINK | XFS_DIFLAG2_COWEXTSIZE | \ - XFS_DIFLAG2_BIGTIME) + XFS_DIFLAG2_BIGTIME | XFS_DIFLAG2_FORCEALIGN | XFS_DIFLAG2_ATOMICWRITES) static inline bool xfs_dinode_has_bigtime(const struct xfs_dinode *dip) { diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c index 0970ae3fe5382770e5e675ba88de009c5370a04b..dd9e5de65d52a48434278c6cc7aed8f9f20f7d9d 100644 --- a/fs/xfs/libxfs/xfs_inode_buf.c +++ b/fs/xfs/libxfs/xfs_inode_buf.c @@ -574,6 +574,14 @@ xfs_dinode_verify( !xfs_has_bigtime(mp)) return __this_address; + if (flags2 & XFS_DIFLAG2_FORCEALIGN) { + fa = xfs_inode_validate_forcealign(mp, mode, flags, + be32_to_cpu(dip->di_extsize), + be32_to_cpu(dip->di_cowextsize)); + if (fa) + return fa; + } + return NULL; } @@ -699,3 +707,35 @@ xfs_inode_validate_cowextsize( return NULL; } + +/* Validate the forcealign inode flag */ +xfs_failaddr_t +xfs_inode_validate_forcealign( + struct xfs_mount *mp, + uint16_t mode, + uint16_t flags, + uint32_t extsize, + uint32_t cowextsize) +{ + /* superblock rocompat feature flag */ + if (!xfs_has_forcealign(mp)) + return __this_address; + + /* Only regular files and directories */ + if (!S_ISDIR(mode) && !S_ISREG(mode)) + return __this_address; + + /* Doesn't apply to realtime files */ + if (flags & XFS_DIFLAG_REALTIME) + return __this_address; + + /* Requires a non-zero power-of-2 extent size hint */ + if (extsize == 0 || !is_power_of_2(extsize)) + return __this_address; + + /* Requires no cow extent size hint */ + if (cowextsize != 0) + return __this_address; + + return NULL; +} diff --git a/fs/xfs/libxfs/xfs_inode_buf.h b/fs/xfs/libxfs/xfs_inode_buf.h index 05c3640e135a29a3b4f207769095b44ff045c398..1bcf1415a4b5f5c8ec1333b6f552228a70cddcf4 100644 --- a/fs/xfs/libxfs/xfs_inode_buf.h +++ b/fs/xfs/libxfs/xfs_inode_buf.h @@ -62,6 +62,9 @@ xfs_failaddr_t xfs_inode_validate_extsize(struct xfs_mount *mp, xfs_failaddr_t xfs_inode_validate_cowextsize(struct xfs_mount *mp, uint32_t cowextsize, uint16_t mode, uint16_t flags, uint64_t flags2); +xfs_failaddr_t xfs_inode_validate_forcealign(struct xfs_mount *mp, + uint16_t mode, uint16_t flags, uint32_t extsize, + uint32_t cowextsize); static inline uint64_t xfs_inode_encode_bigtime(struct timespec64 tv) { diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c index c099ccf2787dbdefeed8584389c51ac40ec146b3..a4354504986ce61be4c528e7205284268abf21ac 100644 --- a/fs/xfs/libxfs/xfs_sb.c +++ b/fs/xfs/libxfs/xfs_sb.c @@ -116,6 +116,10 @@ xfs_sb_version_to_features( features |= XFS_FEAT_REFLINK; if (sbp->sb_features_ro_compat & XFS_SB_FEAT_RO_COMPAT_INOBTCNT) features |= XFS_FEAT_INOBTCNT; + if (sbp->sb_features_ro_compat & XFS_SB_FEAT_RO_COMPAT_FORCEALIGN) + features |= XFS_FEAT_FORCEALIGN; + if (sbp->sb_features_ro_compat & XFS_SB_FEAT_RO_COMPAT_ATOMICWRITES) + features |= XFS_FEAT_ATOMICWRITES; if (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_FTYPE) features |= XFS_FEAT_FTYPE; if (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_SPINODES) diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index edf62092125c62a932fe846f8e3492e464b86ccd..5879f03b8660809f1be3945d55cf040ad71e56d8 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ 
-654,6 +654,9 @@ xfs_free_eofblocks( * of the file. If not, then there is nothing to do. */ end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)XFS_ISIZE(ip)); + /* Do not free blocks when forcing extent sizes */ + if (xfs_get_extsz(ip) > 1) + end_fsb = roundup_64(end_fsb, xfs_get_extsz(ip)); last_fsb = XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes); if (last_fsb <= end_fsb) return 0; @@ -925,8 +928,11 @@ xfs_free_file_space( startoffset_fsb = XFS_B_TO_FSB(mp, offset); endoffset_fsb = XFS_B_TO_FSBT(mp, offset + len); - /* We can only free complete realtime extents. */ - if (XFS_IS_REALTIME_INODE(ip) && mp->m_sb.sb_rextsize > 1) { + /* Free only complete extents. */ + if (xfs_inode_forcealign(ip) && ip->i_d.di_extsize > 1) { + startoffset_fsb = roundup_64(startoffset_fsb, ip->i_d.di_extsize); + endoffset_fsb = rounddown_64(endoffset_fsb, ip->i_d.di_extsize); + } else if (XFS_IS_REALTIME_INODE(ip) && mp->m_sb.sb_rextsize > 1) { startoffset_fsb = roundup_64(startoffset_fsb, mp->m_sb.sb_rextsize); endoffset_fsb = rounddown_64(endoffset_fsb, diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 52643eac5d46ee692b1d171eec8effaa511e503c..10a829ffd908d131169b16e753aa5c3a4651152f 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -60,7 +60,10 @@ xfs_is_falloc_aligned( } mask = XFS_FSB_TO_B(mp, mp->m_sb.sb_rextsize) - 1; } else { - mask = mp->m_sb.sb_blocksize - 1; + if (xfs_inode_forcealign(ip) && ip->i_d.di_extsize > 1) + mask = (mp->m_sb.sb_blocksize * ip->i_d.di_extsize) - 1; + else + mask = mp->m_sb.sb_blocksize - 1; } return !((pos | len) & mask); @@ -586,6 +589,14 @@ xfs_file_dio_aio_write( size_t count = iov_iter_count(from); struct xfs_buftarg *target = xfs_inode_buftarg(ip); + if (iocb->ki_flags & IOCB_ATOMIC) { + if (!generic_atomic_write_valid(iocb->ki_pos, from, + i_blocksize(inode), + XFS_FSB_TO_B(mp, xfs_get_extsz(ip)))) { + return -EINVAL; + } + } + /* DIO must be aligned to device logical sector size */ if ((iocb->ki_pos | count) & target->bt_logical_sectormask) return -EINVAL; @@ -597,8 +608,8 @@ xfs_file_dio_aio_write( * the inode as necessary for EOF zeroing cases and fill out the new * inode size as appropriate. */ - if ((iocb->ki_pos & mp->m_blockmask) || - ((iocb->ki_pos + count) & mp->m_blockmask)) { + if ((iocb->ki_pos & (XFS_FSB_TO_B(mp, xfs_get_extsz(ip)) - 1)) || + ((iocb->ki_pos + count) & (XFS_FSB_TO_B(mp, xfs_get_extsz(ip)) - 1))) { unaligned_io = 1; /* @@ -803,6 +814,9 @@ xfs_file_write_iter( return xfs_file_dax_write(iocb, from); if (iocb->ki_flags & IOCB_DIRECT) { + + if (xfs_inode_atomicwrites(ip)) + iocb->ki_flags |= IOCB_ATOMIC; /* * Allow a directio write to fall back to a buffered * write *only* in the case that we're doing a reflink @@ -1179,6 +1193,25 @@ xfs_file_remap_range( return remapped > 0 ? 
remapped : ret; } +static bool xfs_file_open_can_atomicwrite( + struct inode *inode, + struct file *file) +{ + struct xfs_inode *ip = XFS_I(inode); + struct xfs_buftarg *target = xfs_inode_buftarg(ip); + + if (!(file->f_flags & O_DIRECT)) + return false; + + if (!xfs_inode_atomicwrites(ip)) + return false; + + if (!bdev_can_atomic_write(target->bt_bdev)) + return false; + + return true; +} + STATIC int xfs_file_open( struct inode *inode, @@ -1189,6 +1222,8 @@ xfs_file_open( if (xfs_is_shutdown(XFS_M(inode->i_sb))) return -EIO; file->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC; + if (xfs_file_open_can_atomicwrite(inode, file)) + file->f_mode |= FMODE_CAN_ATOMIC_WRITE; return 0; } diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 268bbc2d978b8cb2d4d052905dfe0d63f7d9ace7..80e1503ca8c4102343b91e35722d7e9b8b3c6500 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -68,6 +68,20 @@ xfs_get_extsz_hint( return 0; } +/* + * Helper function to extract extent size. It will return a power-of-2, + * as forcealign requires this. + */ +xfs_extlen_t +xfs_get_extsz( + struct xfs_inode *ip) +{ + if (xfs_inode_forcealign(ip) && ip->i_d.di_extsize) + return ip->i_d.di_extsize; + + return 1; +} + /* * Helper function to extract CoW extent size hint from inode. * Between the extent size hint and the CoW extent size hint, we @@ -643,6 +657,10 @@ _xfs_dic2xflags( flags |= FS_XFLAG_DAX; if (di_flags2 & XFS_DIFLAG2_COWEXTSIZE) flags |= FS_XFLAG_COWEXTSIZE; + if (di_flags2 & XFS_DIFLAG2_FORCEALIGN) + flags |= FS_XFLAG_FORCEALIGN; + if (di_flags2 & XFS_DIFLAG2_ATOMICWRITES) + flags |= FS_XFLAG_ATOMICWRITES; } if (has_attr) @@ -759,6 +777,18 @@ xfs_inode_inherit_flags2( } if (pip->i_d.di_flags2 & XFS_DIFLAG2_DAX) ip->i_d.di_flags2 |= XFS_DIFLAG2_DAX; + if (pip->i_d.di_flags2 & XFS_DIFLAG2_FORCEALIGN) + ip->i_d.di_flags2 |= XFS_DIFLAG2_FORCEALIGN; + + if (ip->i_d.di_flags2 & XFS_DIFLAG2_FORCEALIGN) { + xfs_failaddr_t failaddr; + + failaddr = xfs_inode_validate_forcealign(ip->i_mount, + VFS_I(ip)->i_mode, ip->i_d.di_flags, ip->i_d.di_extsize, + ip->i_d.di_cowextsize); + if (failaddr) + ip->i_d.di_flags2 &= ~XFS_DIFLAG2_FORCEALIGN; + } } /* @@ -3956,3 +3986,16 @@ xfs_iunlock2_io_mmap( if (!same_inode) inode_unlock(VFS_I(ip1)); } + +/* Returns the size of fundamental allocation unit for a file, in bytes. */ +unsigned int +xfs_inode_alloc_unitsize( + struct xfs_inode *ip) +{ + unsigned int blocks = 1; + + if (XFS_IS_REALTIME_INODE(ip)) + blocks = ip->i_mount->m_sb.sb_rextsize; + + return XFS_FSB_TO_B(ip->i_mount, blocks); +} diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index b552daae323fcdcac75abddcc069285ab46f5cbd..818f7622d8518a7eaa8c76170591110fe0484059 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -268,6 +268,16 @@ static inline bool xfs_inode_has_bigtime(struct xfs_inode *ip) return ip->i_d.di_flags2 & XFS_DIFLAG2_BIGTIME; } +static inline bool xfs_inode_forcealign(struct xfs_inode *ip) +{ + return ip->i_d.di_flags2 & XFS_DIFLAG2_FORCEALIGN; +} + +static inline bool xfs_inode_atomicwrites(struct xfs_inode *ip) +{ + return ip->i_d.di_flags2 & XFS_DIFLAG2_ATOMICWRITES; +} + /* * Return the buftarg used for data allocations on a given inode. 
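Putting the forcealign pieces together: with a power-of-two extent size hint, the falloc alignment check collapses to a mask test, as in xfs_is_falloc_aligned() shown earlier. A small standalone illustration with hypothetical sizes:

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    /*
     * Mirrors the mask test in xfs_is_falloc_aligned() for a forcealign
     * inode: pos and len must be multiples of blocksize * extsize, and
     * because both factors are powers of two a mask test suffices.
     */
    static bool falloc_aligned(uint64_t pos, uint64_t len,
                               uint64_t blocksize, uint64_t extsize)
    {
            uint64_t mask = blocksize * extsize - 1;

            return !((pos | len) & mask);
    }

    int main(void)
    {
            /* 4 KiB blocks, 4-block extent size hint => 16 KiB units */
            printf("%d\n", (int)falloc_aligned(32768, 16384, 4096, 4)); /* 1 */
            printf("%d\n", (int)falloc_aligned(32768, 8192, 4096, 4));  /* 0 */
            return 0;
    }
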
*/ @@ -489,6 +499,7 @@ void xfs_lock_two_inodes(struct xfs_inode *ip0, uint ip0_mode, struct xfs_inode *ip1, uint ip1_mode); xfs_extlen_t xfs_get_extsz_hint(struct xfs_inode *ip); +xfs_extlen_t xfs_get_extsz(struct xfs_inode *ip); xfs_extlen_t xfs_get_cowextsz_hint(struct xfs_inode *ip); int xfs_dir_ialloc(struct xfs_trans **, struct xfs_inode *, umode_t, @@ -560,5 +571,6 @@ void xfs_end_io(struct work_struct *work); int xfs_ilock2_io_mmap(struct xfs_inode *ip1, struct xfs_inode *ip2); void xfs_iunlock2_io_mmap(struct xfs_inode *ip1, struct xfs_inode *ip2); +unsigned int xfs_inode_alloc_unitsize(struct xfs_inode *ip); #endif /* __XFS_INODE_H__ */ diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index 2337eb272235407a7e2f7b927097a52c6b8fb516..939b91124ce12677d8bb5dcfcca5a60558c0413a 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -1198,6 +1198,10 @@ xfs_flags2diflags2( di_flags2 |= XFS_DIFLAG2_DAX; if (xflags & FS_XFLAG_COWEXTSIZE) di_flags2 |= XFS_DIFLAG2_COWEXTSIZE; + if (xflags & FS_XFLAG_FORCEALIGN) + di_flags2 |= XFS_DIFLAG2_FORCEALIGN; + if (xflags & FS_XFLAG_ATOMICWRITES) + di_flags2 |= XFS_DIFLAG2_ATOMICWRITES; return di_flags2; } @@ -1210,10 +1214,12 @@ xfs_ioctl_setattr_xflags( { struct xfs_mount *mp = ip->i_mount; uint64_t di_flags2; + bool atomic_writes = fa->fsx_xflags & FS_XFLAG_ATOMICWRITES; - /* Can't change realtime flag if any extents are allocated. */ + /* Can't change realtime or atomic flag if any extents are allocated. */ if ((ip->i_df.if_nextents || ip->i_delayed_blks) && - XFS_IS_REALTIME_INODE(ip) != (fa->fsx_xflags & FS_XFLAG_REALTIME)) + (XFS_IS_REALTIME_INODE(ip) != (fa->fsx_xflags & FS_XFLAG_REALTIME) || + atomic_writes != xfs_inode_atomicwrites(ip))) return -EINVAL; /* If realtime flag is set then must have realtime device */ @@ -1236,6 +1242,29 @@ xfs_ioctl_setattr_xflags( if (di_flags2 && !xfs_has_v3inodes(mp)) return -EINVAL; + /* + * Force-align requires a nonzero extent size hint and a zero cow + * extent size hint. It doesn't apply to realtime files. 
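The constraints listed in the comment here can also be read as a simple predicate; a sketch mirroring the hint checks of xfs_inode_validate_forcealign():

    #include <assert.h>
    #include <stdbool.h>
    #include <stdint.h>

    /*
     * Sketch of the hint checks in xfs_inode_validate_forcealign(): the
     * extent size hint must be a nonzero power of two, and the CoW extent
     * size hint must be zero since the two are mutually exclusive.
     */
    static bool forcealign_hints_valid(uint32_t extsize, uint32_t cowextsize)
    {
            return extsize != 0 && (extsize & (extsize - 1)) == 0 &&
                   cowextsize == 0;
    }

    int main(void)
    {
            assert(forcealign_hints_valid(16, 0));  /* 16-block hint: OK */
            assert(!forcealign_hints_valid(12, 0)); /* not a power of two */
            assert(!forcealign_hints_valid(16, 4)); /* CoW hint must be zero */
            return 0;
    }
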
+ */ + if (fa->fsx_xflags & FS_XFLAG_FORCEALIGN) { + if (!xfs_has_forcealign(mp)) + return -EINVAL; + if (fa->fsx_xflags & FS_XFLAG_COWEXTSIZE) + return -EINVAL; + if (!(fa->fsx_xflags & (FS_XFLAG_EXTSIZE | + FS_XFLAG_EXTSZINHERIT))) + return -EINVAL; + if (fa->fsx_xflags & FS_XFLAG_REALTIME) + return -EINVAL; + } + + if (atomic_writes) { + if (!xfs_has_atomicwrites(mp)) + return -EINVAL; + if (!(fa->fsx_xflags & FS_XFLAG_FORCEALIGN)) + return -EINVAL; + } + ip->i_d.di_flags = xfs_flags2diflags(ip, fa->fsx_xflags); ip->i_d.di_flags2 = di_flags2; @@ -1339,6 +1368,9 @@ xfs_ioctl_setattr_check_extsize( struct xfs_mount *mp = ip->i_mount; xfs_extlen_t size; xfs_fsblock_t extsize_fsb; + xfs_failaddr_t failaddr; + uint16_t new_diflags; + uint16_t new_diflags2; if (S_ISREG(VFS_I(ip)->i_mode) && ip->i_df.if_nextents && ((ip->i_d.di_extsize << mp->m_sb.sb_blocklog) != fa->fsx_extsize)) @@ -1363,6 +1395,17 @@ xfs_ioctl_setattr_check_extsize( if (fa->fsx_extsize % size) return -EINVAL; + new_diflags = xfs_flags2diflags(ip, fa->fsx_xflags); + new_diflags2 = xfs_flags2diflags2(ip, fa->fsx_xflags); + if (new_diflags2 & XFS_DIFLAG2_FORCEALIGN) { + failaddr = xfs_inode_validate_forcealign(ip->i_mount, + VFS_I(ip)->i_mode, new_diflags, + XFS_B_TO_FSB(mp, fa->fsx_extsize), + XFS_B_TO_FSB(mp, fa->fsx_cowextsize)); + if (failaddr) + return -EINVAL; + } + return 0; } @@ -1606,6 +1649,10 @@ xfs_ioc_setxflags( } xfs_fill_fsxattr(ip, false, &old_fa); + fa.fsx_extsize = old_fa.fsx_extsize; + fa.fsx_cowextsize = old_fa.fsx_cowextsize; + fa.fsx_projid = old_fa.fsx_projid; + fa.fsx_nextents = old_fa.fsx_nextents; error = vfs_ioc_fssetxattr_check(VFS_I(ip), &old_fa, &fa); if (error) { xfs_trans_cancel(tp); @@ -2069,6 +2116,26 @@ xfs_fs_eofblocks_from_user( return 0; } +static int +xfs_ioc_set_atomic_write( + struct xfs_inode *ip) +{ + struct xfs_trans *tp; + int error; + + tp = xfs_ioctl_setattr_get_trans(ip, NULL); + if (IS_ERR(tp)) { + error = PTR_ERR(tp); + goto out; + } + + ip->i_d.di_flags2 |= XFS_DIFLAG2_ATOMICWRITES; + + error = xfs_trans_commit(tp); +out: + return error; +} + /* * Note: some of the ioctl's return positive numbers as a * byte count indicating success, such as readlink_by_handle. 
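For completeness, a hedged sketch of how an application might invoke the FS_IOC_SETATOMIC ioctl added below. The ioctl macro must come from the patched uapi headers, and the enable_atomic_writes() helper is hypothetical; per the handler, the ioctl takes no argument and requires the atomicwrites feature, a regular file, and FS_XFLAG_FORCEALIGN already set:

    /*
     * Assumes the FS_IOC_SETATOMIC definition from the patched uapi
     * headers; not a standard kernel interface at the time of writing.
     */
    #include <fcntl.h>
    #include <stdio.h>
    #include <sys/ioctl.h>
    #include <unistd.h>

    int enable_atomic_writes(const char *path)
    {
            int fd = open(path, O_RDWR | O_DIRECT);

            if (fd < 0)
                    return -1;
            if (ioctl(fd, FS_IOC_SETATOMIC) < 0) {
                    perror("FS_IOC_SETATOMIC");
                    close(fd);
                    return -1;
            }
            /* success + O_DIRECT => FMODE_CAN_ATOMIC_WRITE is set */
            return fd;
    }
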
@@ -2096,6 +2163,31 @@ xfs_file_ioctl( return xfs_ioc_getlabel(mp, arg); case FS_IOC_SETFSLABEL: return xfs_ioc_setlabel(filp, mp, arg); + case FS_IOC_SETATOMIC: + if (!xfs_has_atomicwrites(mp)) + return -EOPNOTSUPP; + if (!S_ISREG(inode->i_mode)) + return -EINVAL; + if (xfs_inode_atomicwrites(ip)) + return 0; + if (!xfs_inode_forcealign(ip)) + return -EINVAL; + + xfs_ilock(ip, XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL); + error = xfs_ioc_set_atomic_write(ip); + xfs_iunlock(ip, XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL); + if (error) { + xfs_alert(mp, "%s: failed to set atomic write flag on ino 0x%llx", + __func__, XFS_I(inode)->i_ino); + return error; + } else { + struct xfs_buftarg *target = xfs_inode_buftarg(ip); + + if ((filp->f_flags & O_DIRECT) && + bdev_can_atomic_write(target->bt_bdev)) + filp->f_mode |= FMODE_CAN_ATOMIC_WRITE; + return 0; + } case XFS_IOC_ALLOCSP: case XFS_IOC_FREESP: case XFS_IOC_ALLOCSP64: diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 76285db4aaec4a642901134d447a761bb32d010b..dc01689988ab68baf060aaf3ca768c7814e59407 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -90,6 +90,7 @@ xfs_bmbt_to_iomap( { struct xfs_mount *mp = ip->i_mount; struct xfs_buftarg *target = xfs_inode_buftarg(ip); + xfs_extlen_t extsz = xfs_get_extsz(ip); if (unlikely(!xfs_valid_startblock(ip, imap->br_startblock))) return xfs_alert_fsblock_zero(ip, imap); @@ -120,6 +121,8 @@ xfs_bmbt_to_iomap( iomap->validity_cookie = sequence_cookie; iomap->page_ops = &xfs_iomap_page_ops; + if (extsz > 1) + iomap->extent_shift = ffs(extsz) - 1; return 0; } @@ -167,7 +170,9 @@ xfs_eof_alignment( * If mounted with the "-o swalloc" option the alignment is * increased from the strip unit size to the stripe width. */ - if (mp->m_swidth && xfs_has_swalloc(mp)) + if (xfs_inode_forcealign(ip)) + align = xfs_get_extsz_hint(ip); + else if (mp->m_swidth && xfs_has_swalloc(mp)) align = mp->m_swidth; else if (mp->m_dalign) align = mp->m_dalign; @@ -544,11 +549,19 @@ xfs_iomap_write_unwritten( xfs_fsize_t i_size; uint resblks; int error; + xfs_extlen_t extsz = xfs_get_extsz(ip); trace_xfs_unwritten_convert(ip, offset, count); - offset_fsb = XFS_B_TO_FSBT(mp, offset); - count_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + count); + if (extsz > 1) { + xfs_extlen_t extsize_bytes = XFS_FSB_TO_B(mp, extsz); + + offset_fsb = XFS_B_TO_FSBT(mp, round_down(offset, extsize_bytes)); + count_fsb = XFS_B_TO_FSB(mp, round_up(offset + count, extsize_bytes)); + } else { + offset_fsb = XFS_B_TO_FSBT(mp, offset); + count_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + count); + } count_fsb = (xfs_filblks_t)(count_fsb - offset_fsb); /* diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index a527a544a684170dd3ce57c1ddf09ddb0b0dfcc5..b04f7ca882d452e8c5e6163e4db7755bc09c3044 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -769,6 +769,7 @@ xfs_setattr_size( int error; uint lock_flags = 0; bool did_zeroing = false; + bool write_back = false; ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL)); ASSERT(xfs_isilocked(ip, XFS_MMAPLOCK_EXCL)); @@ -805,21 +806,10 @@ xfs_setattr_size( */ inode_dio_wait(inode); - /* - * File data changes must be complete before we start the transaction to - * modify the inode. This needs to be done before joining the inode to - * the transaction because the inode cannot be unlocked once it is a - * part of the transaction. - * - * Start with zeroing any data beyond EOF that we may expose on file - * extension, or zeroing out the rest of the block on a downward - * truncate.
- */ - if (newsize > oldsize) { - trace_xfs_zero_eof(ip, oldsize, newsize - oldsize); - error = iomap_zero_range(inode, oldsize, newsize - oldsize, - &did_zeroing, &xfs_buffered_write_iomap_ops); - } else { + write_back = newsize > ip->i_d.di_size && oldsize != ip->i_d.di_size; + if (newsize < oldsize) { + unsigned int blocksize = xfs_inode_alloc_unitsize(ip); + /* * iomap won't detect a dirty page over an unwritten block (or a * cow block over a hole) and subsequently skips zeroing the * newly post-EOF portion of the page. Flush the new EOF to * convert the block before the pagecache truncate. */ error = filemap_write_and_wait_range(inode->i_mapping, newsize, - newsize); + roundup_64(newsize, blocksize) - 1); if (error) return error; - error = iomap_truncate_page(inode, newsize, &did_zeroing, - &xfs_buffered_write_iomap_ops); - } + error = iomap_truncate_page(inode, newsize, blocksize, + &did_zeroing, &xfs_buffered_write_iomap_ops); - if (error) - return error; + /* + * We are going to log the inode size change in this transaction + * so any previous writes that are beyond the on disk EOF and + * the new EOF that have not been written out need to be written + * here. If we do not write the data out, we expose ourselves + * to the null files problem. Note that this includes any block + * zeroing we did above; otherwise those blocks may not be + * zeroed after a crash. + */ + if (did_zeroing || write_back) { + error = filemap_write_and_wait_range(inode->i_mapping, + min_t(loff_t, ip->i_d.di_size, newsize), + roundup_64(newsize, blocksize) - 1); + if (error) + return error; + } - /* - * We've already locked out new page faults, so now we can safely remove - * pages from the page cache knowing they won't get refaulted until we - * drop the XFS_MMAP_EXCL lock after the extent manipulations are - * complete. The truncate_setsize() call also cleans partial EOF page - * PTEs on extending truncates and hence ensures sub-page block size - * filesystems are correctly handled, too. - * - * We have to do all the page cache truncate work outside the - * transaction context as the "lock" order is page lock->log space - * reservation as defined by extent allocation in the writeback path. - * Hence a truncate can fail with ENOMEM from xfs_trans_alloc(), but - * having already truncated the in-memory version of the file (i.e. made - * user visible changes). There's not much we can do about this, except - * to hope that the caller sees ENOMEM and retries the truncate - * operation. - * - * And we update in-core i_size and truncate page cache beyond newsize - * before writeback the [di_size, newsize] range, so we're guaranteed - * not to write stale data past the new EOF on truncate down. - */ - truncate_setsize(inode, newsize); + /* + * Update i_size after writing back to make sure the zeroed + * blocks have been written out, then drop the page cache range + * beyond the blocksize-aligned new EOF block. + * + * We've already locked out new page faults, so now we can + * safely remove pages from the page cache knowing they won't + * get refaulted until we drop the XFS_MMAP_EXCL lock after the + * extent manipulations are complete. + */ + i_size_write(inode, newsize); + truncate_pagecache(inode, roundup_64(newsize, blocksize)); + } else { + /* + * Start with zeroing any data beyond EOF that we may expose on + * file extension.
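Note that the truncate-down path above writes back and prunes the page cache at allocation-unit granularity rather than block granularity. The rounding involved, modelled in plain C with assumed sizes:

    #include <stdint.h>
    #include <stdio.h>

    /* roundup_64()-style rounding used by the truncate-down path */
    static uint64_t roundup64(uint64_t x, uint64_t align)
    {
            return (x + align - 1) / align * align;
    }

    int main(void)
    {
            uint64_t newsize = 70000; /* new EOF, assumed */
            uint64_t unit = 16384;    /* allocation unit: 4 x 4 KiB blocks */

            /* write back through 81919, then truncate the cache at 81920 */
            printf("writeback end %llu, pagecache truncate at %llu\n",
                   (unsigned long long)(roundup64(newsize, unit) - 1),
                   (unsigned long long)roundup64(newsize, unit));
            return 0;
    }
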
+ */ + if (newsize > oldsize) { + trace_xfs_zero_eof(ip, oldsize, newsize - oldsize); + error = iomap_zero_range(inode, oldsize, newsize - oldsize, + &did_zeroing, &xfs_buffered_write_iomap_ops); + if (error) + return error; + } - /* - * We are going to log the inode size change in this transaction so - * any previous writes that are beyond the on disk EOF and the new - * EOF that have not been written out need to be written here. If we - * do not write the data out, we expose ourselves to the null files - * problem. Note that this includes any block zeroing we did above; - * otherwise those blocks may not be zeroed after a crash. - */ - if (did_zeroing || - (newsize > ip->i_d.di_size && oldsize != ip->i_d.di_size)) { - error = filemap_write_and_wait_range(VFS_I(ip)->i_mapping, - ip->i_d.di_size, newsize - 1); - if (error) - return error; + /* + * The truncate_setsize() call also cleans partial EOF page + * PTEs on extending truncates and hence ensures sub-page block + * size filesystems are correctly handled, too. + */ + truncate_setsize(inode, newsize); + + if (did_zeroing || write_back) { + error = filemap_write_and_wait_range(inode->i_mapping, + ip->i_d.di_size, newsize - 1); + if (error) + return error; + } } error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, 0, 0, 0, &tp); diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index 21547ff97b5a6782fae4c06f1b2fdec74acf2748..888d6bf9bea7adf2dd30ba73b74e7b2238ce7ed8 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -274,6 +274,9 @@ typedef struct xfs_mount { #define XFS_FEAT_INOBTCNT (1ULL << 23) /* inobt block counts */ #define XFS_FEAT_BIGTIME (1ULL << 24) /* large timestamps */ #define XFS_FEAT_NEEDSREPAIR (1ULL << 25) /* needs xfs_repair */ +#define XFS_FEAT_FORCEALIGN (1ULL << 27) /* aligned file data extents */ +#define XFS_FEAT_ATOMICWRITES (1ULL << 28) /* atomic writes support */ + /* Mount features */ #define XFS_FEAT_NOATTR2 (1ULL << 48) /* disable attr2 creation */ @@ -336,6 +339,8 @@ __XFS_HAS_FEAT(realtime, REALTIME) __XFS_HAS_FEAT(inobtcounts, INOBTCNT) __XFS_HAS_FEAT(bigtime, BIGTIME) __XFS_HAS_FEAT(needsrepair, NEEDSREPAIR) +__XFS_HAS_FEAT(forcealign, FORCEALIGN) +__XFS_HAS_FEAT(atomicwrites, ATOMICWRITES) /* * Mount features diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 502fb08bfd3812fafea2775a24be40b24da83872..d43f76a4b99a433f17697cf44ed2507cef076d35 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -1639,6 +1639,7 @@ xfs_fc_fill_super( "DAX unsupported by block device. Turning off DAX."); xfs_mount_set_dax_mode(mp, XFS_DAX_NEVER); } + if (xfs_has_reflink(mp)) { xfs_alert(mp, "DAX and reflink cannot be used together!"); @@ -1657,6 +1658,14 @@ xfs_fc_fill_super( } } + if (xfs_has_forcealign(mp)) + xfs_warn(mp, +"EXPERIMENTAL forced data extent alignment feature in use. Use at your own risk!"); + + if (xfs_has_atomicwrites(mp)) + xfs_warn(mp, +"EXPERIMENTAL atomicwrites feature in use. Use at your own risk!"); + if (xfs_has_reflink(mp)) { if (mp->m_sb.sb_rblocks) { xfs_alert(mp, diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 1853ec569b72d2d6dd52f4f8730f9ac2ebf77370..996e03d50f41ae609e1491f6de6e1c2876ffca8f 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -133,6 +133,11 @@ typedef u8 __bitwise blk_status_t; */ #define BLK_STS_ZONE_ACTIVE_RESOURCE ((__force blk_status_t)16) +/* + * Invalid size or alignment. 
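From an application's point of view, an atomic write that violates the size or alignment limits is failed early with EINVAL (the BLK_STS_INVAL mapping added earlier) rather than silently split. A hedged sketch of issuing one atomic write; RWF_ATOMIC is not shown anywhere in this diff, and the value used below is the later mainline one, assumed purely for illustration:

    #define _GNU_SOURCE
    #include <fcntl.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <sys/uio.h>

    #ifndef RWF_ATOMIC
    #define RWF_ATOMIC 0x00000040 /* assumption: value from later mainline kernels */
    #endif

    int main(int argc, char **argv)
    {
            struct iovec iov;
            void *buf;
            int fd;

            if (argc < 2)
                    return 1;
            fd = open(argv[1], O_RDWR | O_DIRECT);
            if (fd < 0 || posix_memalign(&buf, 4096, 8192))
                    return 1;
            iov.iov_base = buf;
            iov.iov_len = 8192; /* multiple of unit_min, no larger than unit_max */
            /* a misaligned or oversized atomic write fails with EINVAL */
            if (pwritev2(fd, &iov, 1, 0, RWF_ATOMIC) < 0)
                    perror("pwritev2(RWF_ATOMIC)");
            return 0;
    }
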
+ */
+#define BLK_STS_INVAL  ((__force blk_status_t)19)
+
 /**
  * blk_path_error - returns true if error may be path related
  * @error: status the request was completed with
@@ -421,6 +426,7 @@ enum req_flag_bits {
     /* for driver use */
     __REQ_DRV,
     __REQ_SWAP,        /* swapping request. */
+    __REQ_ATOMIC,      /* for atomic write operations */
     __REQ_NR_BITS,     /* stops here */
 };
 
@@ -445,6 +451,7 @@ enum req_flag_bits {
 #define REQ_DRV        (1ULL << __REQ_DRV)
 #define REQ_SWAP       (1ULL << __REQ_SWAP)
+#define REQ_ATOMIC     (1ULL << __REQ_ATOMIC)
 
 #define REQ_FAILFAST_MASK \
     (REQ_FAILFAST_DEV | REQ_FAILFAST_TRANSPORT | REQ_FAILFAST_DRIVER)
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 50b4fd0a06873b1cb196ddf37be26e343c5c7ea4..31e8e98d499096c81b482389151e613a8f31cb67 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -323,6 +323,17 @@ enum blk_zoned_model {
     BLK_ZONED_HM,      /* Host-managed zoned block device */
 };
 
+struct queue_atomic_write_limits {
+    /* atomic write limits */
+    unsigned int       atomic_write_hw_max;
+    unsigned int       atomic_write_max_sectors;
+    unsigned int       atomic_write_hw_boundary;
+    unsigned int       atomic_write_hw_unit_min;
+    unsigned int       atomic_write_unit_min;
+    unsigned int       atomic_write_hw_unit_max;
+    unsigned int       atomic_write_unit_max;
+};
+
 struct queue_limits {
     unsigned long      bounce_pfn;
     unsigned long      seg_boundary_mask;
@@ -355,7 +366,11 @@ struct queue_limits {
     unsigned char      raid_partial_stripes_expensive;
     enum blk_zoned_model   zoned;
+#ifndef __GENKSYMS__
+    struct queue_atomic_write_limits *aw_limits;
+#else
     KABI_RESERVE(1)
+#endif
 };
 
 typedef int (*report_zones_cb)(struct blk_zone *zone, unsigned int idx,
@@ -1082,9 +1097,11 @@ static inline struct bio_vec req_bvec(struct request *rq)
     return mp_bvec_iter_bvec(rq->bio->bi_io_vec, rq->bio->bi_iter);
 }
 
-static inline unsigned int blk_queue_get_max_sectors(struct request_queue *q,
-                                                     int op)
+static inline unsigned int blk_queue_get_max_sectors(struct request *rq)
 {
+    struct request_queue *q = rq->q;
+    int op = req_op(rq);
+
     if (unlikely(op == REQ_OP_DISCARD || op == REQ_OP_SECURE_ERASE))
         return min(q->limits.max_discard_sectors, UINT_MAX >> SECTOR_SHIFT);
@@ -1095,6 +1112,9 @@ static inline unsigned int blk_queue_get_max_sectors(struct request_queue *q,
     if (unlikely(op == REQ_OP_WRITE_ZEROES))
         return q->limits.max_write_zeroes_sectors;
 
+    if (rq->cmd_flags & REQ_ATOMIC)
+        return q->limits.aw_limits->atomic_write_max_sectors;
+
     return q->limits.max_sectors;
 }
 
@@ -1132,10 +1152,10 @@ static inline unsigned int blk_rq_get_max_sectors(struct request *rq,
     if (!q->limits.chunk_sectors ||
         req_op(rq) == REQ_OP_DISCARD ||
         req_op(rq) == REQ_OP_SECURE_ERASE)
-        return blk_queue_get_max_sectors(q, req_op(rq));
+        return blk_queue_get_max_sectors(rq);
 
     return min(blk_max_size_offset(q, offset, 0),
-           blk_queue_get_max_sectors(q, req_op(rq)));
+           blk_queue_get_max_sectors(rq));
 }
 
 static inline unsigned int blk_rq_count_bios(struct request *rq)
@@ -1185,6 +1205,14 @@ extern void blk_queue_max_zone_append_sectors(struct request_queue *q,
 extern void blk_queue_physical_block_size(struct request_queue *, unsigned int);
 extern void blk_queue_alignment_offset(struct request_queue *q,
                                        unsigned int alignment);
+void blk_queue_atomic_write_max_bytes(struct request_queue *q,
+                                      unsigned int bytes);
+void blk_queue_atomic_write_boundary_bytes(struct request_queue *q,
+                                           unsigned int bytes);
+void blk_queue_atomic_write_unit_max_bytes(struct request_queue *q,
+                                           unsigned int bytes);
+void blk_queue_atomic_write_unit_min_bytes(struct request_queue *q,
+                                           unsigned int bytes);
 void blk_queue_update_readahead(struct request_queue *q);
 extern void blk_limits_io_min(struct queue_limits *limits, unsigned int min);
 extern void blk_queue_io_min(struct request_queue *q, unsigned int min);
@@ -1192,6 +1220,7 @@ extern void blk_limits_io_opt(struct queue_limits *limits, unsigned int opt);
 extern void blk_queue_io_opt(struct request_queue *q, unsigned int opt);
 extern void blk_set_queue_depth(struct request_queue *q, unsigned int depth);
 extern void blk_set_default_limits(struct queue_limits *lim);
+extern void blk_set_default_atomic_write_limits(struct queue_limits *lim);
 extern void blk_set_stacking_limits(struct queue_limits *lim);
 extern int blk_stack_limits(struct queue_limits *t, struct queue_limits *b,
                             sector_t offset);
@@ -1647,6 +1676,30 @@ static inline unsigned int bdev_max_active_zones(struct block_device *bdev)
     return 0;
 }
 
+static inline unsigned int
+queue_atomic_write_unit_max_bytes(const struct request_queue *q)
+{
+    return q->limits.aw_limits->atomic_write_unit_max;
+}
+
+static inline unsigned int
+queue_atomic_write_unit_min_bytes(const struct request_queue *q)
+{
+    return q->limits.aw_limits->atomic_write_unit_min;
+}
+
+static inline unsigned int
+queue_atomic_write_boundary_bytes(const struct request_queue *q)
+{
+    return q->limits.aw_limits->atomic_write_hw_boundary;
+}
+
+static inline unsigned int
+queue_atomic_write_max_bytes(const struct request_queue *q)
+{
+    return q->limits.aw_limits->atomic_write_max_sectors << SECTOR_SHIFT;
+}
+
 static inline int queue_dma_alignment(const struct request_queue *q)
 {
     return q ? q->dma_alignment : 511;
@@ -2100,4 +2153,24 @@ int fsync_bdev(struct block_device *bdev);
 struct super_block *freeze_bdev(struct block_device *bdev);
 int thaw_bdev(struct block_device *bdev, struct super_block *sb);
 
+static inline bool bdev_can_atomic_write(struct block_device *bdev)
+{
+    struct request_queue *bd_queue = bdev_get_queue(bdev);
+    struct queue_limits *limits = &bd_queue->limits;
+
+    if (!limits->aw_limits->atomic_write_unit_min)
+        return false;
+
+    if (bdev_is_partition(bdev)) {
+        sector_t bd_start_sect = bdev->bd_part->start_sect;
+        unsigned int alignment =
+            max(limits->aw_limits->atomic_write_unit_min,
+                limits->aw_limits->atomic_write_hw_boundary);
+
+        if (!IS_ALIGNED(bd_start_sect, alignment))
+            return false;
+    }
+
+    return true;
+}
+
 #endif /* _LINUX_BLKDEV_H */
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 382a0d4dd3dd3a4b9e3cfa4925bf1ee104615f8b..5e37ba8550834040a41706981851c55ce1a2168a 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -41,6 +41,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
@@ -184,6 +185,9 @@ typedef int (dio_iodone_t)(struct kiocb *iocb, loff_t offset,
 /* File supports async buffered reads */
 #define FMODE_BUF_RASYNC       ((__force fmode_t)0x40000000)
 
+/* File supports atomic writes */
+#define FMODE_CAN_ATOMIC_WRITE ((__force fmode_t)0x80000000)
+
 /* File mode control flag, expect random access pattern */
 #define FMODE_CTL_RANDOM       ((__force fmode_t)0x1000)
 
@@ -320,6 +324,7 @@ enum rw_hint {
 #define IOCB_SYNC      (__force int) RWF_SYNC
 #define IOCB_NOWAIT    (__force int) RWF_NOWAIT
 #define IOCB_APPEND    (__force int) RWF_APPEND
+#define IOCB_ATOMIC    (__force int) RWF_ATOMIC
 
 /* non-RWF related bits - start at 16 */
 #define IOCB_EVENTFD   (1 << 16)
@@ -3406,7 +3411,8 @@ static inline int iocb_flags(struct file *file)
     return res;
 }
 
-static inline int kiocb_set_rw_flags(struct kiocb *ki, rwf_t flags)
+static inline int kiocb_set_rw_flags(struct kiocb *ki, rwf_t flags,
+                                     int rw_type)
 {
     int kiocb_flags = 0;
 
@@ -3423,6 +3429,12 @@ static inline int kiocb_set_rw_flags(struct kiocb *ki, rwf_t flags)
             return -EOPNOTSUPP;
         kiocb_flags |= IOCB_NOIO;
     }
+    if (flags & RWF_ATOMIC) {
+        if (rw_type != WRITE)
+            return -EOPNOTSUPP;
+        if (!(ki->ki_filp->f_mode & FMODE_CAN_ATOMIC_WRITE))
+            return -EOPNOTSUPP;
+    }
     kiocb_flags |= (__force int) (flags & RWF_SUPPORTED);
     if (flags & RWF_SYNC)
         kiocb_flags |= IOCB_DSYNC;
@@ -3665,4 +3677,23 @@ static inline void fs_file_read_do_trace(struct kiocb *iocb)
     if (tracepoint_enabled(fs_file_read))
         fs_file_read_update_args_by_trace(iocb);
 }
+
+static inline
+bool generic_atomic_write_valid(loff_t pos, struct iov_iter *iter,
+                                unsigned int unit_min, unsigned int unit_max)
+{
+    size_t len = iov_iter_count(iter);
+
+    if (len < unit_min || len > unit_max)
+        return false;
+
+    if (!is_power_of_2(len))
+        return false;
+
+    if (!IS_ALIGNED(pos, len))
+        return false;
+
+    return true;
+}
+
 #endif /* _LINUX_FS_H */
diff --git a/include/linux/iomap.h b/include/linux/iomap.h
index 0965d5f12858e3d9546d0baa98ac5f128dd604f1..1b6e22741d43002e8ecbad1bc980da64101eb858 100644
--- a/include/linux/iomap.h
+++ b/include/linux/iomap.h
@@ -93,6 +93,7 @@ struct iomap {
     u64                 length; /* length of mapping, bytes */
     u16                 type;   /* type of mapping */
     u16                 flags;  /* flags for mapping */
+    unsigned int        extent_shift;
     struct block_device *bdev;  /* block device for I/O */
     struct dax_device   *dax_dev; /* dax_dev for dax operations */
     void                *inline_data;
@@ -206,8 +207,8 @@ int iomap_file_unshare(struct inode *inode, loff_t pos, loff_t len,
         const struct iomap_ops *ops);
 int iomap_zero_range(struct inode *inode, loff_t pos, loff_t len,
         bool *did_zero, const struct iomap_ops *ops);
-int iomap_truncate_page(struct inode *inode, loff_t pos, bool *did_zero,
-        const struct iomap_ops *ops);
+int iomap_truncate_page(struct inode *inode, loff_t pos, unsigned int blocksize,
+        bool *did_zero, const struct iomap_ops *ops);
 vm_fault_t iomap_page_mkwrite(struct vm_fault *vmf,
         const struct iomap_ops *ops);
 int iomap_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
diff --git a/include/linux/math64.h b/include/linux/math64.h
index 66deb1fdc2ef641d1c4a917c212585be80d94751..b5c4d1df08e522248bca845f84f00c039f09ab4e 100644
--- a/include/linux/math64.h
+++ b/include/linux/math64.h
@@ -3,6 +3,7 @@
 #define _LINUX_MATH64_H
 
 #include
+#include
 #include
 
@@ -11,6 +12,20 @@
 #define div64_long(x, y) div64_s64((x), (y))
 #define div64_ul(x, y)   div64_u64((x), (y))
 
+/**
+ * rem_u64 - remainder of unsigned 64bit divide with 32bit divisor
+ * @dividend: unsigned 64bit dividend
+ * @divisor: unsigned 32bit divisor
+ *
+ * Return: dividend % divisor
+ */
+static inline u32 rem_u64(u64 dividend, u32 divisor)
+{
+    if (is_power_of_2(divisor))
+        return dividend & (divisor - 1);
+
+    return dividend % divisor;
+}
+
 /**
  * div_u64_rem - unsigned 64bit divide with 32bit divisor with remainder
  * @dividend: unsigned 64bit dividend
@@ -85,6 +100,15 @@ static inline s64 div64_s64(s64 dividend, s64 divisor)
 #define div64_long(x, y) div_s64((x), (y))
 #define div64_ul(x, y)   div_u64((x), (y))
 
+#ifndef rem_u64
+static inline u32 rem_u64(u64 dividend, u32 divisor)
+{
+    if (is_power_of_2(divisor))
+        return dividend & (divisor - 1);
+
+    return do_div(dividend, divisor);
+}
+#endif
+
 #ifndef div_u64_rem
 static inline u64 div_u64_rem(u64 dividend, u32 divisor, u32 *remainder)
 {
diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h
index f44eb0a04afdd8cea369af1395c3637a5f69122d..332b0709756b01e60a6d6480b1f0aca345d92d67 100644
--- a/include/uapi/linux/fs.h
+++ b/include/uapi/linux/fs.h
@@ -140,6 +140,9 @@ struct fsxattr {
 #define FS_XFLAG_FILESTREAM    0x00004000  /* use filestream allocator */
 #define FS_XFLAG_DAX           0x00008000  /* use DAX for IO */
 #define FS_XFLAG_COWEXTSIZE    0x00010000  /* CoW extent size allocator hint */
+/* data extent mappings for regular files must be aligned to extent size hint */
+#define FS_XFLAG_FORCEALIGN    0x00020000
+#define FS_XFLAG_ATOMICWRITES  0x00040000  /* atomic writes enabled */
 #define FS_XFLAG_HASATTR       0x80000000  /* no DIFLAG for this */
 
 /* the read-only stuff doesn't really belong here, but any other place is
@@ -214,6 +217,7 @@ struct fsxattr {
 #define FS_IOC_FSSETXATTR      _IOW('X', 32, struct fsxattr)
 #define FS_IOC_GETFSLABEL      _IOR(0x94, 49, char[FSLABEL_MAX])
 #define FS_IOC_SETFSLABEL      _IOW(0x94, 50, char[FSLABEL_MAX])
+#define FS_IOC_SETATOMIC       _IOW(0x95, 2, uint)
 
 /*
  * Inode flags (FS_IOC_GETFLAGS / FS_IOC_SETFLAGS)
@@ -300,8 +304,11 @@ typedef int __bitwise __kernel_rwf_t;
 /* per-IO O_APPEND */
 #define RWF_APPEND     ((__force __kernel_rwf_t)0x00000010)
 
+/* Atomic Write */
+#define RWF_ATOMIC     ((__force __kernel_rwf_t)0x00000040)
+
 /* mask of flags supported by the kernel */
 #define RWF_SUPPORTED  (RWF_HIPRI | RWF_DSYNC | RWF_SYNC | RWF_NOWAIT |\
-                        RWF_APPEND)
+                        RWF_APPEND | RWF_ATOMIC)
 
 #endif /* _UAPI_LINUX_FS_H */
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 65cf70874fb3ea14728ea1666cbb71ed8e6eb486..c284e9865826946e275eaea8006919e0cfcdc273 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -2956,7 +2956,7 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe,
     kiocb->ki_pos = READ_ONCE(sqe->off);
     kiocb->ki_hint = ki_hint_validate(file_write_hint(kiocb->ki_filp));
     kiocb->ki_flags = iocb_flags(kiocb->ki_filp);
-    ret = kiocb_set_rw_flags(kiocb, READ_ONCE(sqe->rw_flags));
+    ret = kiocb_set_rw_flags(kiocb, READ_ONCE(sqe->rw_flags), rw);
     if (unlikely(ret))
         return ret;
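
For reviewers, a minimal userspace sketch of how the new flag is expected to be driven through pwritev2(). This is illustrative only, not part of the patch: the RWF_ATOMIC fallback value mirrors the uapi hunk above, while the 4 KiB write size, the O_DIRECT open mode, and the file argument handling are assumptions for demonstration. A real caller would derive the write size from the queue's advertised atomic write unit limits rather than hard-coding it.

/* atomic_write_demo.c - illustrative sketch, not part of the patch */
#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/uio.h>
#include <unistd.h>

#ifndef RWF_ATOMIC
#define RWF_ATOMIC	0x00000040	/* mirrors the uapi addition above */
#endif

int main(int argc, char **argv)
{
	size_t len = 4096;	/* assumed: a power of two within [unit_min, unit_max] */
	off_t pos = 0;		/* must be naturally aligned to len */
	struct iovec iov;
	void *buf;
	ssize_t ret;
	int fd;

	if (argc != 2) {
		fprintf(stderr, "usage: %s <file>\n", argv[0]);
		return 1;
	}

	fd = open(argv[1], O_WRONLY | O_DIRECT);
	if (fd < 0) {
		perror("open");
		return 1;
	}

	if (posix_memalign(&buf, len, len)) {
		perror("posix_memalign");
		return 1;
	}
	memset(buf, 0xab, len);

	iov.iov_base = buf;
	iov.iov_len = len;

	/*
	 * Per the hunks above: the write fails with EOPNOTSUPP if the file
	 * lacks FMODE_CAN_ATOMIC_WRITE, and with EINVAL (BLK_STS_INVAL) if
	 * the size or alignment violates the device's atomic write units.
	 */
	ret = pwritev2(fd, &iov, 1, pos, RWF_ATOMIC);
	if (ret < 0)
		perror("pwritev2(RWF_ATOMIC)");
	else
		printf("atomically wrote %zd bytes at offset %lld\n",
		       ret, (long long)pos);

	free(buf);
	close(fd);
	return ret == (ssize_t)len ? 0 : 1;
}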