diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst
index 8ad0b27813175ddf716c59cc52b10c71a0132d70..13357edd28df5fc0453d1a3f7d6a7a2aa6b04d3a 100644
--- a/Documentation/admin-guide/cgroup-v2.rst
+++ b/Documentation/admin-guide/cgroup-v2.rst
@@ -2012,6 +2012,30 @@ IO Interface Files
 	  8:16 rbytes=1459200 wbytes=314773504 rios=192 wios=353 dbytes=0 dios=0
 	  8:0 rbytes=90430464 wbytes=299008000 rios=8950 wios=1252 dbytes=50331648 dios=3021
 
+  io.extstat
+	A read-only nested-keyed file.
+
+	Lines are keyed by $MAJ:$MIN device numbers and not ordered.
+	The following nested keys are defined; all values are cumulative.
+
+	  ========	===================================================
+	  rwait		Total time read IOs waited in blk-throttle (ns)
+	  wwait		Total time write IOs waited in blk-throttle (ns)
+	  rserv		Total time read IOs spent below blk-throttle (ns)
+	  wserv		Total time write IOs spent below blk-throttle (ns)
+	  rcomp		Number of completed read IOs
+	  wcomp		Number of completed write IOs
+	  rbytesq	Total bytes of read IOs queued by blk-throttle
+	  wbytesq	Total bytes of write IOs queued by blk-throttle
+	  riosq		Number of read IOs queued by blk-throttle
+	  wiosq		Number of write IOs queued by blk-throttle
+	  ========	===================================================
+
+	An example read output follows::
+
+	  253:16 rwait=0 wwait=3300 rserv=0 wserv=414366321956 rcomp=0 wcomp=12 rbytesq=0 wbytesq=40960000 riosq=0 wiosq=12
+	  253:0 rwait=0 wwait=0 rserv=0 wserv=0 rcomp=0 wcomp=0 rbytesq=0 wbytesq=0 riosq=0 wiosq=0
+
   io.cost.qos
 	A read-write nested-keyed file which exists only on the root
 	cgroup.
diff --git a/block/bio.c b/block/bio.c
index 4d500beef061b5777401d6c7e7f4e90c96ecde0f..6db6499119cc07771268a426658886351629056c 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -237,6 +237,12 @@ void bio_init(struct bio *bio, struct block_device *bdev, struct bio_vec *table,
 	bio->issue_time_ns = 0;
 	if (bdev)
 		bio_associate_blkg(bio);
+#ifdef CONFIG_BLK_DEV_THROTTLING
+	bio->start_time_ns = 0;
+	bio->io_start_time_ns = 0;
+	bio->bi_tg_end_io = NULL;
+	bio->bi_tg_private = NULL;
+#endif
 #ifdef CONFIG_BLK_CGROUP_IOCOST
 	bio->bi_iocost_cost = 0;
 #endif
@@ -256,6 +262,7 @@ void bio_init(struct bio *bio, struct block_device *bdev, struct bio_vec *table,
 	bio->bi_max_vecs = max_vecs;
 	bio->bi_io_vec = table;
 	bio->bi_pool = NULL;
+	bio->bi_ext_flags = 0;
 }
 EXPORT_SYMBOL(bio_init);
 
@@ -1792,6 +1799,10 @@ void bio_endio(struct bio *bio)
 	}
 #endif
 
+#ifdef CONFIG_BLK_DEV_THROTTLING
+	if (bio->bi_tg_end_io)
+		bio->bi_tg_end_io(bio);
+#endif
 	if (bio->bi_end_io)
 		bio->bi_end_io(bio);
 }
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index 97188a7958486ffa38e178d072071a9460535abc..f148ea4fd4490a6268bc9d8357fa50f5d80b6215 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -268,11 +268,14 @@ static struct blkg_policy_data *throtl_pd_alloc(struct gendisk *disk,
 	if (!tg)
 		return NULL;
 
-	if (blkg_rwstat_init(&tg->stat_bytes, gfp))
-		goto err_free_tg;
-
-	if (blkg_rwstat_init(&tg->stat_ios, gfp))
-		goto err_exit_stat_bytes;
+	if (blkg_rwstat_init(&tg->stat_bytes, gfp) ||
+	    blkg_rwstat_init(&tg->stat_ios, gfp) ||
+	    blkg_rwstat_init(&tg->service_time, gfp) ||
+	    blkg_rwstat_init(&tg->wait_time, gfp) ||
+	    blkg_rwstat_init(&tg->completed, gfp) ||
+	    blkg_rwstat_init(&tg->total_bytes_queued, gfp) ||
+	    blkg_rwstat_init(&tg->total_io_queued, gfp))
+		goto err;
 
 	throtl_service_queue_init(&tg->service_queue);
 
@@ -289,9 +292,14 @@ static struct blkg_policy_data *throtl_pd_alloc(struct gendisk *disk,
 
 	return &tg->pd;
 
-err_exit_stat_bytes:
+err:
 	blkg_rwstat_exit(&tg->stat_bytes);
-err_free_tg:
+	blkg_rwstat_exit(&tg->stat_ios);
+	blkg_rwstat_exit(&tg->service_time);
+	blkg_rwstat_exit(&tg->wait_time);
+	blkg_rwstat_exit(&tg->completed);
+	blkg_rwstat_exit(&tg->total_bytes_queued);
+	blkg_rwstat_exit(&tg->total_io_queued);
 	kfree(tg);
 	return NULL;
 }
@@ -318,7 +326,9 @@ static void throtl_pd_init(struct blkg_policy_data *pd)
 	 * regardless of the position of the group in the hierarchy.
 	 */
 	sq->parent_sq = &td->service_queue;
-	if (cgroup_subsys_on_dfl(io_cgrp_subsys) && blkg->parent)
+
+	/* Enable hierarchical throttling even on traditional hierarchy */
+	if (blkg->parent)
 		sq->parent_sq = &blkg_to_tg(blkg->parent)->service_queue;
 	tg->td = td;
 }
@@ -360,9 +370,25 @@ static void throtl_pd_free(struct blkg_policy_data *pd)
 	timer_delete_sync(&tg->service_queue.pending_timer);
 	blkg_rwstat_exit(&tg->stat_bytes);
 	blkg_rwstat_exit(&tg->stat_ios);
+	blkg_rwstat_exit(&tg->service_time);
+	blkg_rwstat_exit(&tg->wait_time);
+	blkg_rwstat_exit(&tg->completed);
+	blkg_rwstat_exit(&tg->total_bytes_queued);
+	blkg_rwstat_exit(&tg->total_io_queued);
 	kfree(tg);
 }
 
+static void throtl_pd_reset(struct blkg_policy_data *pd)
+{
+	struct throtl_grp *tg = pd_to_tg(pd);
+
+	blkg_rwstat_reset(&tg->service_time);
+	blkg_rwstat_reset(&tg->wait_time);
+	blkg_rwstat_reset(&tg->completed);
+	blkg_rwstat_reset(&tg->total_bytes_queued);
+	blkg_rwstat_reset(&tg->total_io_queued);
+}
+
 static struct throtl_grp *
 throtl_rb_first(struct throtl_service_queue *parent_sq)
 {
@@ -915,6 +941,73 @@ static unsigned long tg_dispatch_time(struct throtl_grp *tg, struct bio *bio)
 	return tg_dispatch_iops_time(tg, bio);
 }
 
+/*
+ * Fold one finished bio into the per-group extended statistics.
+ *
+ * @start_time is when the bio entered blk-throttle and @io_start_time is
+ * when it was released to the lower layers.  @io_start_time is still zero
+ * (its bio_init() value) if the bio ends without ever being released, so
+ * no service time is charged in that case.
+ */
+static void throtl_stats_update_completion(struct throtl_grp *tg,
+					   uint64_t start_time,
+					   uint64_t io_start_time,
+					   blk_opf_t opf)
+{
+	unsigned long flags;
+	uint64_t now = sched_clock();
+
+	local_irq_save(flags);
+	if (io_start_time && time_after64(now, io_start_time))
+		blkg_rwstat_add(&tg->service_time, opf, now - io_start_time);
+	if (time_after64(io_start_time, start_time))
+		blkg_rwstat_add(&tg->wait_time, opf, io_start_time - start_time);
+	blkg_rwstat_add(&tg->completed, opf, 1);
+	local_irq_restore(flags);
+}
+
+static void throtl_bio_end_io(struct bio *bio)
+{
+	struct throtl_grp *tg;
+
+	rcu_read_lock();
+	/* account each bio once; see throtl_bio_stats_start() */
+	if (!bio_ext_flagged(bio, BIO_THROTL_STATED))
+		goto out;
+
+	tg = bio->bi_tg_private;
+	if (!tg)
+		goto out;
+
+	throtl_stats_update_completion(tg, bio_start_time_ns(bio),
+				       bio_io_start_time_ns(bio),
+				       bio->bi_opf);
+	blkg_put(tg_to_blkg(tg));
+	bio_clear_ext_flag(bio, BIO_THROTL_STATED);
+out:
+	rcu_read_unlock();
+}
+
+static inline void throtl_bio_stats_start(struct bio *bio, struct throtl_grp *tg)
+{
+	int op = bio_op(bio);
+
+	/*
+	 * end_io may run twice for one bio: dm-thin, for example, saves the
+	 * original end_io, installs its own, and later calls the saved one.
+	 * BIO_THROTL_STATED ensures the statistics are started, and later
+	 * accounted, only once per bio.
+	 */
+	if ((op == REQ_OP_READ || op == REQ_OP_WRITE) &&
+	    !bio_ext_flagged(bio, BIO_THROTL_STATED)) {
+		blkg_get(tg_to_blkg(tg));
+		bio_set_ext_flag(bio, BIO_THROTL_STATED);
+		bio->bi_tg_end_io = throtl_bio_end_io;
+		bio->bi_tg_private = tg;
+		bio_set_start_time_ns(bio);
+	}
+}
+
 /**
  * throtl_add_bio_tg - add a bio to the specified throtl_grp
  * @bio: bio to add
@@ -953,6 +1046,9 @@ static void throtl_add_bio_tg(struct bio *bio, struct throtl_qnode *qn,
 	    bio == throtl_peek_queued(&sq->queued[rw]))
 		tg->flags |= THROTL_TG_IOPS_WAS_EMPTY;
 
+	blkg_rwstat_add(&tg->total_bytes_queued, bio->bi_opf,
+			throtl_bio_data_size(bio));
+	blkg_rwstat_add(&tg->total_io_queued, bio->bi_opf, 1);
 	throtl_enqueue_tg(tg);
 }
 
@@ -1216,8 +1312,10 @@ static void blk_throtl_dispatch_work_fn(struct work_struct *work)
 
 	if (!bio_list_empty(&bio_list_on_stack)) {
 		blk_start_plug(&plug);
-		while ((bio = bio_list_pop(&bio_list_on_stack)))
+		while ((bio = bio_list_pop(&bio_list_on_stack))) {
+			bio_set_io_start_time_ns(bio);
 			submit_bio_noacct_nocheck(bio, false);
+		}
 		blk_finish_plug(&plug);
 	}
 }
@@ -1469,6 +1567,31 @@ static struct cftype throtl_legacy_files[] = {
 		.private = offsetof(struct throtl_grp, stat_ios),
 		.seq_show = tg_print_rwstat_recursive,
 	},
+	{
+		.name = "throttle.io_service_time",
+		.private = offsetof(struct throtl_grp, service_time),
+		.seq_show = tg_print_rwstat,
+	},
+	{
+		.name = "throttle.io_wait_time",
+		.private = offsetof(struct throtl_grp, wait_time),
+		.seq_show = tg_print_rwstat,
+	},
+	{
+		.name = "throttle.io_completed",
+		.private = offsetof(struct throtl_grp, completed),
+		.seq_show = tg_print_rwstat,
+	},
+	{
+		.name = "throttle.total_bytes_queued",
+		.private = offsetof(struct throtl_grp, total_bytes_queued),
+		.seq_show = tg_print_rwstat,
+	},
+	{
+		.name = "throttle.total_io_queued",
+		.private = offsetof(struct throtl_grp, total_io_queued),
+		.seq_show = tg_print_rwstat,
+	},
 	{ }	/* terminate */
 };
 
@@ -1517,6 +1640,38 @@ static u64 tg_prfill_limit(struct seq_file *sf, struct blkg_policy_data *pd,
 	return 0;
 }
 
+/*
+ * Print one io.extstat line for @pd's device: read/write wait time, service
+ * time, completed IOs, and the bytes and IOs queued by throttling.
+ */
+static u64 tg_prfill_extstat(struct seq_file *sf, struct blkg_policy_data *pd,
+			     int off)
+{
+	struct throtl_grp *tg = pd_to_tg(pd);
+	const char *dname = blkg_dev_name(pd->blkg);
+	struct blkg_rwstat_sample wait, serv, comp, bytesq, iosq;
+
+	if (!dname)
+		return 0;
+
+	blkg_rwstat_read(&tg->wait_time, &wait);
+	blkg_rwstat_read(&tg->service_time, &serv);
+	blkg_rwstat_read(&tg->completed, &comp);
+	blkg_rwstat_read(&tg->total_bytes_queued, &bytesq);
+	blkg_rwstat_read(&tg->total_io_queued, &iosq);
+
+	seq_printf(sf, "%s rwait=%llu wwait=%llu rserv=%llu wserv=%llu "
+		   "rcomp=%llu wcomp=%llu rbytesq=%llu wbytesq=%llu "
+		   "riosq=%llu wiosq=%llu\n", dname,
+		   wait.cnt[BLKG_RWSTAT_READ], wait.cnt[BLKG_RWSTAT_WRITE],
+		   serv.cnt[BLKG_RWSTAT_READ], serv.cnt[BLKG_RWSTAT_WRITE],
+		   comp.cnt[BLKG_RWSTAT_READ], comp.cnt[BLKG_RWSTAT_WRITE],
+		   bytesq.cnt[BLKG_RWSTAT_READ], bytesq.cnt[BLKG_RWSTAT_WRITE],
+		   iosq.cnt[BLKG_RWSTAT_READ], iosq.cnt[BLKG_RWSTAT_WRITE]);
+
+	return 0;
+}
+
 static int tg_print_limit(struct seq_file *sf, void *v)
 {
 	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), tg_prfill_limit,
@@ -1524,6 +1679,13 @@ static int tg_print_limit(struct seq_file *sf, void *v)
 	return 0;
 }
 
+static int tg_print_extstat(struct seq_file *sf, void *v)
+{
+	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), tg_prfill_extstat,
+			  &blkcg_policy_throtl, 0, false);
+	return 0;
+}
+
 static ssize_t tg_set_limit(struct kernfs_open_file *of,
 			  char *buf, size_t nbytes, loff_t off)
 {
@@ -1611,6 +1773,10 @@ static struct cftype throtl_files[] = {
 		.seq_show = tg_print_limit,
 		.write = tg_set_limit,
 	},
+	{
+		.name = "extstat",
+		.seq_show = tg_print_extstat,
+	},
 	{ }	/* terminate */
 };
 
@@ -1654,7 +1820,24 @@ static void tg_flush_bios(struct throtl_grp *tg)
 
 static void throtl_pd_offline(struct blkg_policy_data *pd)
 {
-	tg_flush_bios(pd_to_tg(pd));
+	struct throtl_grp *tg = pd_to_tg(pd);
+	struct blkcg_gq *blkg = pd_to_blkg(pd);
+	struct blkcg_gq *parent = blkg->parent;
+
+	tg_flush_bios(tg);
+
+	if (parent) {
+		blkg_rwstat_add_aux(&blkg_to_tg(parent)->service_time,
+				    &tg->service_time);
+		blkg_rwstat_add_aux(&blkg_to_tg(parent)->wait_time,
+				    &tg->wait_time);
+		blkg_rwstat_add_aux(&blkg_to_tg(parent)->completed,
+				    &tg->completed);
+		blkg_rwstat_add_aux(&blkg_to_tg(parent)->total_bytes_queued,
+				    &tg->total_bytes_queued);
+		blkg_rwstat_add_aux(&blkg_to_tg(parent)->total_io_queued,
+				    &tg->total_io_queued);
+	}
 }
 
 struct blkcg_policy blkcg_policy_throtl = {
@@ -1666,6 +1849,7 @@ struct blkcg_policy blkcg_policy_throtl = {
 	.pd_online_fn		= throtl_pd_online,
 	.pd_offline_fn		= throtl_pd_offline,
 	.pd_free_fn		= throtl_pd_free,
+	.pd_reset_stats_fn	= throtl_pd_reset,
 };
 
 void blk_throtl_cancel_bios(struct gendisk *disk)
@@ -1739,6 +1923,8 @@ bool __blk_throtl_bio(struct bio *bio)
 	struct throtl_data *td = tg->td;
 
 	rcu_read_lock();
+	throtl_bio_stats_start(bio, tg);
+
 	spin_lock_irq(&q->queue_lock);
 	sq = &tg->service_queue;
 
@@ -1816,6 +2002,8 @@ bool __blk_throtl_bio(struct bio *bio)
 
 out_unlock:
 	spin_unlock_irq(&q->queue_lock);
+	if (!throttled)
+		bio_set_io_start_time_ns(bio);
 
 	rcu_read_unlock();
 	return throttled;
diff --git a/block/blk-throttle.h b/block/blk-throttle.h
index 9d7a42c039a15e34d1c11421a142294ec501c189..8dcc9dca1ef429a79e06f61b0f38b94d85c24254 100644
--- a/block/blk-throttle.h
+++ b/block/blk-throttle.h
@@ -128,6 +128,16 @@ struct throtl_grp {
 
 	struct blkg_rwstat stat_bytes;
 	struct blkg_rwstat stat_ios;
+	/* total time spent on lower layer: scheduler, device and others */
+	struct blkg_rwstat service_time;
+	/* total time spent on block throttle */
+	struct blkg_rwstat wait_time;
+	/* total IOs completed */
+	struct blkg_rwstat completed;
+	/* total bytes throttled */
+	struct blkg_rwstat total_bytes_queued;
+	/* total IOs throttled */
+	struct blkg_rwstat total_io_queued;
 };
 
 extern struct blkcg_policy blkcg_policy_throtl;
diff --git a/include/linux/bio.h b/include/linux/bio.h
index 3f71e3e78ecad72ffccae79e51c42853acd0a0b0..92da13d3fdea14d06508ffce0204d4b5d78ea08f 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -241,6 +241,26 @@ static inline void bio_cnt_set(struct bio *bio, unsigned int count)
 	atomic_set(&bio->__bi_cnt, count);
 }
 
+/*
+ * Helpers for bio->bi_ext_flags.  The mask is built as unsigned long to
+ * match the field, so bits above 31 work on 64-bit and clearing one bit
+ * never wipes the upper half of the word.
+ */
+static inline bool bio_ext_flagged(struct bio *bio, unsigned int bit)
+{
+	return (bio->bi_ext_flags & (1UL << bit)) != 0;
+}
+
+static inline void bio_set_ext_flag(struct bio *bio, unsigned int bit)
+{
+	bio->bi_ext_flags |= (1UL << bit);
+}
+
+static inline void bio_clear_ext_flag(struct bio *bio, unsigned int bit)
+{
+	bio->bi_ext_flags &= ~(1UL << bit);
+}
+
 static inline struct bio_vec *bio_first_bvec_all(struct bio *bio)
 {
 	WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED));
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index f5005861058ea2effeaad2971b3f14a6858ff510..1158a8dae73bbc919eddb45f15aa720e84e6dc12 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -11,6 +11,7 @@
 #include
 #include
 #include
+#include
 #include
 
 struct bio_set;
@@ -260,6 +261,12 @@ struct bio {
 	struct blkcg_gq		*bi_blkg;
 	/* Time that this bio was issued. */
 	u64			issue_time_ns;
+#ifdef CONFIG_BLK_DEV_THROTTLING
+	unsigned long long	start_time_ns;	/* when passed to block throttle */
+	unsigned long long	io_start_time_ns;	/* when no more throttle */
+	bio_end_io_t		*bi_tg_end_io;
+	void			*bi_tg_private;
+#endif
 #ifdef CONFIG_BLK_CGROUP_IOCOST
 	u64			bi_iocost_cost;
 #endif
@@ -291,6 +298,8 @@ struct bio {
 
 	struct bio_set		*bi_pool;
 
+	unsigned long		bi_ext_flags;	/* extend the bi_flags */
+
 	CK_KABI_RESERVE(1)
 	CK_KABI_RESERVE(2)
 	CK_KABI_RESERVE(3)
@@ -336,6 +345,37 @@ enum {
 	BIO_FLAG_LAST
 };
 
+/*
+ * Extended bio flags (bit numbers in bio->bi_ext_flags) are defined here.
+ */
+#define BIO_THROTL_STATED	0	/* blk-throttle stats started for bio */
+
+#ifdef CONFIG_BLK_DEV_THROTTLING
+static inline void bio_set_start_time_ns(struct bio *bio)
+{
+	preempt_disable();
+	bio->start_time_ns = sched_clock();
+	preempt_enable();
+}
+
+static inline void bio_set_io_start_time_ns(struct bio *bio)
+{
+	preempt_disable();
+	bio->io_start_time_ns = sched_clock();
+	preempt_enable();
+}
+
+static inline uint64_t bio_start_time_ns(struct bio *bio)
+{
+	return bio->start_time_ns;
+}
+
+static inline uint64_t bio_io_start_time_ns(struct bio *bio)
+{
+	return bio->io_start_time_ns;
+}
+#endif
+
 typedef __u32 __bitwise blk_mq_req_flags_t;
 
 #define REQ_OP_BITS	8