diff --git a/Documentation/ABI/stable/sysfs-block b/Documentation/ABI/stable/sysfs-block
index 1fe9a553c37b71779a52f24627307d9003079777..e5fedecf7bdf327eb1894f6517d59e103af5d714 100644
--- a/Documentation/ABI/stable/sysfs-block
+++ b/Documentation/ABI/stable/sysfs-block
@@ -358,8 +358,12 @@ What:		/sys/block/<disk>/queue/iostats
 Date:		January 2009
 Contact:	linux-block@vger.kernel.org
 Description:
-		[RW] This file is used to control (on/off) the iostats
-		accounting of the disk.
+		[RW] This file is used to control the iostats accounting of the
+		disk. If this value is 0, iostats accounting is disabled; if
+		this value is 1, iostats accounting is enabled, but io_ticks is
+		accounted by sampling and the result is not accurate; if this
+		value is 2, iostats accounting is enabled and io_ticks is
+		accounted precisely, at the cost of a slight overhead.
 
 
 What:		/sys/block/<disk>/queue/logical_block_size
diff --git a/block/blk-core.c b/block/blk-core.c
index fdf25b8d6e784f9904ee7892277acdb25430f3d3..13a27d1d8540a434a07ffb195739a1054a43edbc 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -71,6 +71,25 @@ static struct kmem_cache *blk_requestq_cachep;
  */
 static struct workqueue_struct *kblockd_workqueue;
 
+/*
+ * Default io_ticks accounting mode for newly allocated queues, taken from
+ * the "precise_iostat=" kernel command line option.
+ */
+static bool precise_iostat;
+
+static int __init precise_iostat_setup(char *str)
+{
+	bool precise;
+
+	if (!kstrtobool(str, &precise)) {
+		precise_iostat = precise;
+		pr_info("precise iostat %d\n", precise_iostat);
+	}
+
+	return 1;
+}
+__setup("precise_iostat=", precise_iostat_setup);
+
 /**
  * blk_queue_flag_set - atomically set a queue flag
  * @flag: flag to be set
@@ -441,6 +460,8 @@ struct request_queue *blk_alloc_queue(int node_id)
 
 	blk_set_default_limits(&q->limits);
 	q->nr_requests = BLKDEV_DEFAULT_RQ;
+	if (precise_iostat)
+		blk_queue_flag_set(QUEUE_FLAG_PRECISE_IO_STAT, q);
 
 	return q;
 
@@ -938,11 +959,20 @@ EXPORT_SYMBOL_GPL(iocb_bio_iopoll);
 void update_io_ticks(struct block_device *part, unsigned long now, bool end)
 {
 	unsigned long stamp;
+	bool precise = blk_queue_precise_io_stat(part->bd_queue);
 again:
 	stamp = READ_ONCE(part->bd_stamp);
-	if (unlikely(time_after(now, stamp))) {
-		if (likely(try_cmpxchg(&part->bd_stamp, &stamp, now)))
-			__part_stat_add(part, io_ticks, end ? now - stamp : 1);
+	if (unlikely(time_after(now, stamp)) &&
+	    likely(try_cmpxchg(&part->bd_stamp, &stamp, now))) {
+		/*
+		 * A completion always accounts the time since the last stamp.
+		 * On IO start, precise mode accounts it only if
+		 * part_in_flight() is non-zero; sampling mode adds 1.
+		 */
+		if (end || (precise && part_in_flight(part)))
+			__part_stat_add(part, io_ticks, now - stamp);
+		else if (!precise)
+			__part_stat_add(part, io_ticks, 1);
 	}
 	if (part->bd_partno) {
 		part = bdev_whole(part);
diff --git a/block/blk-merge.c b/block/blk-merge.c
index 65e75efa9bd3663c787b4b0c5b310479419467a2..5db8228c46fcf9f834d0c326b78200b1158f3107 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -783,6 +783,10 @@ static void blk_account_io_merge_request(struct request *req)
 	if (blk_do_io_stat(req)) {
 		part_stat_lock();
 		part_stat_inc(req->part, merges[op_stat_group(req_op(req))]);
+		/* Pairs with the increment in blk_account_io_start(). */
+		if (req->rq_flags & RQF_PRECISE_IO_STAT)
+			part_stat_local_dec(req->part,
+					in_flight[op_is_write(req_op(req))]);
 		part_stat_unlock();
 	}
 }
diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
index c3b5930106b288bec70850a90f88f2fdc1ed5a38..ed64a3b16484b54da484523b14268f2aa13c4b03 100644
--- a/block/blk-mq-debugfs.c
+++ b/block/blk-mq-debugfs.c
@@ -86,6 +86,7 @@ static const char *const blk_queue_flag_name[] = {
 	QUEUE_FLAG_NAME(FAIL_IO),
 	QUEUE_FLAG_NAME(NONROT),
 	QUEUE_FLAG_NAME(IO_STAT),
+	QUEUE_FLAG_NAME(PRECISE_IO_STAT),
 	QUEUE_FLAG_NAME(NOXMERGES),
 	QUEUE_FLAG_NAME(ADD_RANDOM),
 	QUEUE_FLAG_NAME(SYNCHRONOUS),
@@ -253,6 +254,7 @@ static const char *const rqf_name[] = {
 	RQF_NAME(FAILED),
 	RQF_NAME(QUIET),
 	RQF_NAME(IO_STAT),
+	RQF_NAME(PRECISE_IO_STAT),
 	RQF_NAME(PM),
 	RQF_NAME(HASHED),
 	RQF_NAME(STATS),
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 6ab7f360ff2ac711bdadb9b6492dc80cb90759fa..0740ba04851f5f41ff22d24db463fd4ab2678254 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -360,8 +360,11 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data,
 
 	if (data->flags & BLK_MQ_REQ_PM)
 		data->rq_flags |= RQF_PM;
-	if (blk_queue_io_stat(q))
+	if (blk_queue_io_stat(q)) {
 		data->rq_flags |= RQF_IO_STAT;
+		if (blk_queue_precise_io_stat(q))
+			data->rq_flags |= RQF_PRECISE_IO_STAT;
+	}
 	rq->rq_flags = data->rq_flags;
 
 	if (data->rq_flags & RQF_SCHED_TAGS) {
@@ -994,6 +997,10 @@ static inline void blk_account_io_done(struct request *req, u64 now)
 		update_io_ticks(req->part, jiffies, true);
 		part_stat_inc(req->part, ios[sgrp]);
 		part_stat_add(req->part, nsecs[sgrp], now - req->start_time_ns);
+		/* Pairs with the increment in blk_account_io_start(). */
+		if (req->rq_flags & RQF_PRECISE_IO_STAT)
+			part_stat_local_dec(req->part,
+					in_flight[op_is_write(req_op(req))]);
 		part_stat_unlock();
 	}
 }
@@ -1016,6 +1023,13 @@ static inline void blk_account_io_start(struct request *req)
 
 		part_stat_lock();
 		update_io_ticks(req->part, jiffies, false);
+		/*
+		 * Bump in_flight only after update_io_ticks(), so that call
+		 * does not count this request as already in flight.
+		 */
+		if (req->rq_flags & RQF_PRECISE_IO_STAT)
+			part_stat_local_inc(req->part,
+					in_flight[op_is_write(req_op(req))]);
 		part_stat_unlock();
 	}
 }
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 63e4812623361ddde759809c4e04b715aa871e43..d498cf0942d3796258fb3d7c250cd05773c12540 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -303,7 +303,6 @@ queue_##name##_store(struct request_queue *q, const char *page, size_t count) \
 
 QUEUE_SYSFS_BIT_FNS(nonrot, NONROT, 1);
 QUEUE_SYSFS_BIT_FNS(random, ADD_RANDOM, 0);
-QUEUE_SYSFS_BIT_FNS(iostats, IO_STAT, 0);
 QUEUE_SYSFS_BIT_FNS(stable_writes, STABLE_WRITES, 0);
 #undef QUEUE_SYSFS_BIT_FNS
 
@@ -473,6 +472,47 @@ static ssize_t queue_dax_show(struct request_queue *q, char *page)
 	return queue_var_show(blk_queue_dax(q), page);
 }
 
+/* Report the iostats mode: 0 = off, 1 = sampled io_ticks, 2 = precise. */
+static ssize_t queue_iostats_show(struct request_queue *q, char *page)
+{
+	unsigned long val = 0;
+
+	if (blk_queue_io_stat(q))
+		val = blk_queue_precise_io_stat(q) ? 2 : 1;
+
+	return queue_var_show(val, page);
+}
+
+/* Set the iostats mode to 0, 1 or 2; any other value yields -EINVAL. */
+static ssize_t
+queue_iostats_store(struct request_queue *q, const char *page, size_t count)
+{
+	unsigned long nr;
+	ssize_t ret = queue_var_store(&nr, page, count);
+
+	if (ret < 0)
+		return ret;
+
+	switch (nr) {
+	case 0:
+		blk_queue_flag_clear(QUEUE_FLAG_IO_STAT, q);
+		blk_queue_flag_clear(QUEUE_FLAG_PRECISE_IO_STAT, q);
+		break;
+	case 1:
+		blk_queue_flag_set(QUEUE_FLAG_IO_STAT, q);
+		blk_queue_flag_clear(QUEUE_FLAG_PRECISE_IO_STAT, q);
+		break;
+	case 2:
+		blk_queue_flag_set(QUEUE_FLAG_IO_STAT, q);
+		blk_queue_flag_set(QUEUE_FLAG_PRECISE_IO_STAT, q);
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	return count;
+}
+
 #define QUEUE_RO_ENTRY(_prefix, _name)			\
 static struct queue_sysfs_entry _prefix##_entry = {	\
 	.attr	= { .name = _name, .mode = 0444 },	\
@@ -494,6 +534,7 @@ QUEUE_RO_ENTRY(queue_max_segments, "max_segments");
 QUEUE_RO_ENTRY(queue_max_integrity_segments, "max_integrity_segments");
 QUEUE_RO_ENTRY(queue_max_segment_size, "max_segment_size");
 QUEUE_RW_ENTRY(elv_iosched, "scheduler");
+QUEUE_RW_ENTRY(queue_iostats, "iostats");
 
 QUEUE_RO_ENTRY(queue_logical_block_size, "logical_block_size");
 QUEUE_RO_ENTRY(queue_physical_block_size, "physical_block_size");
@@ -539,7 +580,6 @@ static struct queue_sysfs_entry queue_hw_sector_size_entry = {
 };
 
 QUEUE_RW_ENTRY(queue_nonrot, "rotational");
-QUEUE_RW_ENTRY(queue_iostats, "iostats");
 QUEUE_RW_ENTRY(queue_random, "add_random");
 QUEUE_RW_ENTRY(queue_stable_writes, "stable_writes");
 
diff --git a/block/blk.h b/block/blk.h
index 08a358bc0919e2e8f5a53bbca4519ba6ad9d0b15..67915b04b3c179329d34af45a48f2d4a6264c35c 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -344,6 +344,7 @@ static inline bool blk_do_io_stat(struct request *rq)
 }
 
 void update_io_ticks(struct block_device *part, unsigned long now, bool end);
+unsigned int part_in_flight(struct block_device *part);
 
 static inline void req_set_nomerge(struct request_queue *q, struct request *req)
 {
diff --git a/block/genhd.c b/block/genhd.c
index 4a16a424f57d4f77b115e985a4db14d337c8ceba..b3ff653f3e50c180f98d4abaa80f234c42c30a7b 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -118,7 +118,7 @@ static void part_stat_read_all(struct block_device *part,
 	}
 }
 
-static unsigned int part_in_flight(struct block_device *part)
+unsigned int part_in_flight(struct block_device *part)
 {
 	unsigned int inflight = 0;
 	int cpu;
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 958ed7e89b301e0fa968ed27dd8ae764d47ab68f..71833b174a9d02936413bab03c09f9cfc473687f 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -46,6 +46,8 @@ typedef __u32 __bitwise req_flags_t;
 #define RQF_QUIET		((__force req_flags_t)(1 << 11))
 /* account into disk and partition IO statistics */
 #define RQF_IO_STAT		((__force req_flags_t)(1 << 13))
+/* track this request in the partition's in_flight counters */
+#define RQF_PRECISE_IO_STAT	((__force req_flags_t)(1 << 14))
 /* runtime pm request */
 #define RQF_PM			((__force req_flags_t)(1 << 15))
 /* on IO scheduler merge hash */
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 9f3bcbcb156d797d616fd603bf2d1f6176e2bf92..bea0b5fdac7475c5236e059e4ef082d2d3628754 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -536,6 +536,7 @@ struct request_queue {
 #define QUEUE_FLAG_NONROT	6	/* non-rotational device (SSD) */
 #define QUEUE_FLAG_VIRT		QUEUE_FLAG_NONROT /* paravirt device */
 #define QUEUE_FLAG_IO_STAT	7	/* do disk/partitions IO accounting */
+#define QUEUE_FLAG_PRECISE_IO_STAT 8	/* account io_ticks precisely */
 #define QUEUE_FLAG_NOXMERGES	9	/* No extended merges */
 #define QUEUE_FLAG_ADD_RANDOM	10	/* Contributes to random pool */
 #define QUEUE_FLAG_SYNCHRONOUS	11	/* always completes in submit context */
@@ -576,6 +577,8 @@ bool blk_queue_flag_test_and_set(unsigned int flag, struct request_queue *q);
 #define blk_queue_stable_writes(q) \
 	test_bit(QUEUE_FLAG_STABLE_WRITES, &(q)->queue_flags)
 #define blk_queue_io_stat(q)	test_bit(QUEUE_FLAG_IO_STAT, &(q)->queue_flags)
+#define blk_queue_precise_io_stat(q) \
+	test_bit(QUEUE_FLAG_PRECISE_IO_STAT, &(q)->queue_flags)
 #define blk_queue_add_random(q)	test_bit(QUEUE_FLAG_ADD_RANDOM, &(q)->queue_flags)
 #define blk_queue_zone_resetall(q)	\
 	test_bit(QUEUE_FLAG_ZONE_RESETALL, &(q)->queue_flags)