diff --git a/Documentation/ABI/stable/sysfs-block b/Documentation/ABI/stable/sysfs-block index 1fe9a553c37b71779a52f24627307d9003079777..e5fedecf7bdf327eb1894f6517d59e103af5d714 100644 --- a/Documentation/ABI/stable/sysfs-block +++ b/Documentation/ABI/stable/sysfs-block @@ -358,8 +358,12 @@ What: /sys/block/<disk>/queue/iostats Date: January 2009 Contact: linux-block@vger.kernel.org Description: - [RW] This file is used to control (on/off) the iostats - accounting of the disk. + [RW] This file is used to control the iostats accounting of the + disk. If this value is 0, iostats accounting is disabled; if + this value is 1, iostats accounting is enabled, but io_ticks is + accounted by sampling and the result is not accurate; if this + value is 2, iostats accounting is enabled and io_ticks is + accounted precisely, but there will be slight overhead. What: /sys/block/<disk>/queue/logical_block_size diff --git a/block/blk-core.c b/block/blk-core.c index fdf25b8d6e784f9904ee7892277acdb25430f3d3..d599d9c3fddedbbb00b9039d1025329e295232c3 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -71,6 +71,21 @@ static struct kmem_cache *blk_requestq_cachep; */ static struct workqueue_struct *kblockd_workqueue; +static bool precise_iostat; + +static int __init precise_iostat_setup(char *str) +{ + bool precise; + + if (!strtobool(str, &precise)) { + precise_iostat = precise; + pr_info("precise iostat %d\n", precise_iostat); + } + + return 1; +} +__setup("precise_iostat=", precise_iostat_setup); + /** * blk_queue_flag_set - atomically set a queue flag * @flag: flag to be set @@ -441,6 +456,8 @@ struct request_queue *blk_alloc_queue(int node_id) blk_set_default_limits(&q->limits); q->nr_requests = BLKDEV_DEFAULT_RQ; + if (precise_iostat) + blk_queue_flag_set(QUEUE_FLAG_PRECISE_IO_STAT, q); return q; @@ -935,14 +952,20 @@ int iocb_bio_iopoll(struct kiocb *kiocb, struct io_comp_batch *iob, } EXPORT_SYMBOL_GPL(iocb_bio_iopoll); -void update_io_ticks(struct block_device *part, unsigned long
now, bool end) +void update_io_ticks(struct block_device *part, unsigned long now, bool end, + bool precise) { unsigned long stamp; again: stamp = READ_ONCE(part->bd_stamp); - if (unlikely(time_after(now, stamp))) { - if (likely(try_cmpxchg(&part->bd_stamp, &stamp, now))) + if (unlikely(time_after(now, stamp)) && + likely(try_cmpxchg(&part->bd_stamp, &stamp, now))) { + if (precise) { + if (end || part_in_flight(part)) + __part_stat_add(part, io_ticks, now - stamp); + } else { __part_stat_add(part, io_ticks, end ? now - stamp : 1); + } } if (part->bd_partno) { part = bdev_whole(part); @@ -954,7 +977,8 @@ unsigned long bdev_start_io_acct(struct block_device *bdev, enum req_op op, unsigned long start_time) { part_stat_lock(); - update_io_ticks(bdev, start_time, false); + update_io_ticks(bdev, start_time, false, + blk_queue_precise_io_stat(bdev->bd_queue)); part_stat_local_inc(bdev, in_flight[op_is_write(op)]); part_stat_unlock(); @@ -982,7 +1006,7 @@ void bdev_end_io_acct(struct block_device *bdev, enum req_op op, unsigned long duration = now - start_time; part_stat_lock(); - update_io_ticks(bdev, now, true); + update_io_ticks(bdev, now, true, true); part_stat_inc(bdev, ios[sgrp]); part_stat_add(bdev, sectors[sgrp], sectors); part_stat_add(bdev, nsecs[sgrp], jiffies_to_nsecs(duration)); diff --git a/block/blk-merge.c b/block/blk-merge.c index 65e75efa9bd3663c787b4b0c5b310479419467a2..5db8228c46fcf9f834d0c326b78200b1158f3107 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -783,6 +783,9 @@ static void blk_account_io_merge_request(struct request *req) if (blk_do_io_stat(req)) { part_stat_lock(); part_stat_inc(req->part, merges[op_stat_group(req_op(req))]); + if (req->rq_flags & RQF_PRECISE_IO_STAT) + part_stat_local_dec(req->part, + in_flight[op_is_write(req_op(req))]); part_stat_unlock(); } } diff --git a/block/blk-mq.c b/block/blk-mq.c index 6ab7f360ff2ac711bdadb9b6492dc80cb90759fa..8cff2eaf3c35f9992e91cdd7fadbb7be64439974 100644 --- a/block/blk-mq.c +++ 
b/block/blk-mq.c @@ -360,8 +360,11 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data, if (data->flags & BLK_MQ_REQ_PM) data->rq_flags |= RQF_PM; - if (blk_queue_io_stat(q)) + if (blk_queue_io_stat(q)) { data->rq_flags |= RQF_IO_STAT; + if (blk_queue_precise_io_stat(q)) + data->rq_flags |= RQF_PRECISE_IO_STAT; + } rq->rq_flags = data->rq_flags; if (data->rq_flags & RQF_SCHED_TAGS) { @@ -991,17 +994,21 @@ static inline void blk_account_io_done(struct request *req, u64 now) const int sgrp = op_stat_group(req_op(req)); part_stat_lock(); - update_io_ticks(req->part, jiffies, true); + update_io_ticks(req->part, jiffies, true, true); part_stat_inc(req->part, ios[sgrp]); part_stat_add(req->part, nsecs[sgrp], now - req->start_time_ns); + if (req->rq_flags & RQF_PRECISE_IO_STAT) + part_stat_local_dec(req->part, + in_flight[op_is_write(req_op(req))]); part_stat_unlock(); } } static inline void blk_account_io_start(struct request *req) { - trace_block_io_start(req); + bool precise = req->rq_flags & RQF_PRECISE_IO_STAT; + trace_block_io_start(req); if (blk_do_io_stat(req)) { /* * All non-passthrough requests are created from a bio with one @@ -1015,7 +1022,10 @@ static inline void blk_account_io_start(struct request *req) req->part = req->q->disk->part0; part_stat_lock(); - update_io_ticks(req->part, jiffies, false); + update_io_ticks(req->part, jiffies, false, precise); + if (precise) + part_stat_local_inc(req->part, + in_flight[op_is_write(req_op(req))]); part_stat_unlock(); } } diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 63e4812623361ddde759809c4e04b715aa871e43..d3e07d86575515b6fa9676b1dc08f78b8886ed1b 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -303,7 +303,6 @@ queue_##name##_store(struct request_queue *q, const char *page, size_t count) \ QUEUE_SYSFS_BIT_FNS(nonrot, NONROT, 1); QUEUE_SYSFS_BIT_FNS(random, ADD_RANDOM, 0); -QUEUE_SYSFS_BIT_FNS(iostats, IO_STAT, 0); QUEUE_SYSFS_BIT_FNS(stable_writes, STABLE_WRITES, 0); 
#undef QUEUE_SYSFS_BIT_FNS @@ -473,6 +472,45 @@ static ssize_t queue_dax_show(struct request_queue *q, char *page) return queue_var_show(blk_queue_dax(q), page); } +/* Show the iostats mode: 0 = off, 1 = sampled io_ticks, 2 = precise. */ +static ssize_t queue_iostats_show(struct request_queue *q, char *page) +{ + int val = 0; + + if (blk_queue_io_stat(q)) + val = blk_queue_precise_io_stat(q) ? 2 : 1; + + return sprintf(page, "%d\n", val); +} + +/* Set the iostats mode (0, 1 or 2); larger values return -EINVAL. */ +static ssize_t +queue_iostats_store(struct request_queue *q, const char *page, size_t count) +{ + unsigned long nr; + int ret; + + ret = queue_var_store(&nr, page, count); + if (ret < 0) + return ret; + + if (nr > 2) + return -EINVAL; + + if (nr == 0) { + blk_queue_flag_clear(QUEUE_FLAG_IO_STAT, q); + blk_queue_flag_clear(QUEUE_FLAG_PRECISE_IO_STAT, q); + } else if (nr == 1) { + blk_queue_flag_set(QUEUE_FLAG_IO_STAT, q); + blk_queue_flag_clear(QUEUE_FLAG_PRECISE_IO_STAT, q); + } else { + blk_queue_flag_set(QUEUE_FLAG_IO_STAT, q); + blk_queue_flag_set(QUEUE_FLAG_PRECISE_IO_STAT, q); + } + + return count; +} + #define QUEUE_RO_ENTRY(_prefix, _name) \ static struct queue_sysfs_entry _prefix##_entry = { \ .attr = { .name = _name, .mode = 0444 }, \ @@ -494,6 +532,7 @@ QUEUE_RO_ENTRY(queue_max_segments, "max_segments"); QUEUE_RO_ENTRY(queue_max_integrity_segments, "max_integrity_segments"); QUEUE_RO_ENTRY(queue_max_segment_size, "max_segment_size"); QUEUE_RW_ENTRY(elv_iosched, "scheduler"); +QUEUE_RW_ENTRY(queue_iostats, "iostats"); QUEUE_RO_ENTRY(queue_logical_block_size, "logical_block_size"); QUEUE_RO_ENTRY(queue_physical_block_size, "physical_block_size"); @@ -539,7 +578,6 @@ static struct queue_sysfs_entry queue_hw_sector_size_entry = { }; QUEUE_RW_ENTRY(queue_nonrot, "rotational"); -QUEUE_RW_ENTRY(queue_iostats, "iostats"); QUEUE_RW_ENTRY(queue_random, "add_random"); QUEUE_RW_ENTRY(queue_stable_writes, "stable_writes"); diff --git a/block/blk.h b/block/blk.h index 08a358bc0919e2e8f5a53bbca4519ba6ad9d0b15..d2321709a9ce3e23fcf0a502b3c3a885115e63ee 100644 --- a/block/blk.h +++ b/block/blk.h @@ -343,7
+343,9 @@ static inline bool blk_do_io_stat(struct request *rq) return (rq->rq_flags & RQF_IO_STAT) && !blk_rq_is_passthrough(rq); } -void update_io_ticks(struct block_device *part, unsigned long now, bool end); +void update_io_ticks(struct block_device *part, unsigned long now, bool end, + bool precise); +unsigned int part_in_flight(struct block_device *part); static inline void req_set_nomerge(struct request_queue *q, struct request *req) { diff --git a/block/genhd.c b/block/genhd.c index 4a16a424f57d4f77b115e985a4db14d337c8ceba..5db94cb326345e07f89be7267de0f7c1b3f87df2 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -118,7 +118,7 @@ static void part_stat_read_all(struct block_device *part, } } -static unsigned int part_in_flight(struct block_device *part) +unsigned int part_in_flight(struct block_device *part) { unsigned int inflight = 0; int cpu; @@ -955,7 +955,7 @@ ssize_t part_stat_show(struct device *dev, if (inflight) { part_stat_lock(); - update_io_ticks(bdev, jiffies, true); + update_io_ticks(bdev, jiffies, true, true); part_stat_unlock(); } part_stat_read_all(bdev, &stat); @@ -1248,7 +1248,7 @@ static int diskstats_show(struct seq_file *seqf, void *v) if (inflight) { part_stat_lock(); - update_io_ticks(hd, jiffies, true); + update_io_ticks(hd, jiffies, true, true); part_stat_unlock(); } part_stat_read_all(hd, &stat); diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 958ed7e89b301e0fa968ed27dd8ae764d47ab68f..71833b174a9d02936413bab03c09f9cfc473687f 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -46,6 +46,7 @@ typedef __u32 __bitwise req_flags_t; #define RQF_QUIET ((__force req_flags_t)(1 << 11)) /* account into disk and partition IO statistics */ #define RQF_IO_STAT ((__force req_flags_t)(1 << 13)) +#define RQF_PRECISE_IO_STAT ((__force req_flags_t)(1 << 14)) /* runtime pm request */ #define RQF_PM ((__force req_flags_t)(1 << 15)) /* on IO scheduler merge hash */ diff --git a/include/linux/blkdev.h 
b/include/linux/blkdev.h index 9f3bcbcb156d797d616fd603bf2d1f6176e2bf92..bea0b5fdac7475c5236e059e4ef082d2d3628754 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -536,6 +536,7 @@ struct request_queue { #define QUEUE_FLAG_NONROT 6 /* non-rotational device (SSD) */ #define QUEUE_FLAG_VIRT QUEUE_FLAG_NONROT /* paravirt device */ #define QUEUE_FLAG_IO_STAT 7 /* do disk/partitions IO accounting */ +#define QUEUE_FLAG_PRECISE_IO_STAT 8 /* do disk/partitions IO accounting precisely */ #define QUEUE_FLAG_NOXMERGES 9 /* No extended merges */ #define QUEUE_FLAG_ADD_RANDOM 10 /* Contributes to random pool */ #define QUEUE_FLAG_SYNCHRONOUS 11 /* always completes in submit context */ @@ -576,6 +577,8 @@ bool blk_queue_flag_test_and_set(unsigned int flag, struct request_queue *q); #define blk_queue_stable_writes(q) \ test_bit(QUEUE_FLAG_STABLE_WRITES, &(q)->queue_flags) #define blk_queue_io_stat(q) test_bit(QUEUE_FLAG_IO_STAT, &(q)->queue_flags) +#define blk_queue_precise_io_stat(q) \ + test_bit(QUEUE_FLAG_PRECISE_IO_STAT, &(q)->queue_flags) #define blk_queue_add_random(q) test_bit(QUEUE_FLAG_ADD_RANDOM, &(q)->queue_flags) #define blk_queue_zone_resetall(q) \ test_bit(QUEUE_FLAG_ZONE_RESETALL, &(q)->queue_flags)