From 68c80d0492070bdea4a25aec75df6b319d48fd97 Mon Sep 17 00:00:00 2001 From: Ferry Meng Date: Wed, 6 Mar 2024 17:51:12 +0800 Subject: [PATCH 1/7] anolis: block-throttle: enable hierarchical throttling even on traditional hierarchy ANBZ: #34471 cherry picked from devel-6.6 commit e6c04cbe004a00a52df27751447144992b613901. ECI may have a use case of configuring each device mapper disk throttling policy just under the root blkio cgroup, but actually using them in different containers. Since hierarchical throttling is now only supported on cgroup v2 and ECI uses cgroup v1, we have to enable hierarchical throttling on cgroup v1. This is ported from redhat 7u, and a year ago Jiufei already ported it to alikernel 4.9 as well. So I think this change should be acceptable. Signed-off-by: Joseph Qi Signed-off-by: Ferry Meng --- block/blk-throttle.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/block/blk-throttle.c b/block/blk-throttle.c index 97188a795848..54a59af002ba 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -318,7 +318,9 @@ static void throtl_pd_init(struct blkg_policy_data *pd) * regardless of the position of the group in the hierarchy. */ sq->parent_sq = &td->service_queue; - if (cgroup_subsys_on_dfl(io_cgrp_subsys) && blkg->parent) + + /* Enable hierarchical throttling even on traditional hierarchy */ + if (blkg->parent) sq->parent_sq = &blkg_to_tg(blkg->parent)->service_queue; tg->td = td; } -- Gitee From 8533736a06704026a77348d5c6bbd3a8bbc96765 Mon Sep 17 00:00:00 2001 From: Ferry Meng Date: Wed, 6 Mar 2024 17:51:21 +0800 Subject: [PATCH 2/7] anolis: blk-throttle: support io delay stats ANBZ: #34471 cherry picked from devel-6.6 commit 472c1f70337e8fb5c87e28837ddefdf4ad6acecc. Add blkio.throttle.io_service_time and blkio.throttle.io_wait_time to get per-cgroup io delay statistics. io_service_time represents the time spent after io throttle to io completion, while io_wait_time represents the time spent on throttle queue. 
Signed-off-by: Joseph Qi [ initialize bi_ext_flags in bio_init() ] Signed-off-by: Ferry Meng --- block/bio.c | 11 ++++ block/blk-throttle.c | 112 +++++++++++++++++++++++++++++++++++--- block/blk-throttle.h | 4 ++ include/linux/bio.h | 15 +++++ include/linux/blk_types.h | 40 ++++++++++++++ 5 files changed, 174 insertions(+), 8 deletions(-) diff --git a/block/bio.c b/block/bio.c index 4d500beef061..6db6499119cc 100644 --- a/block/bio.c +++ b/block/bio.c @@ -237,6 +237,12 @@ void bio_init(struct bio *bio, struct block_device *bdev, struct bio_vec *table, bio->issue_time_ns = 0; if (bdev) bio_associate_blkg(bio); +#ifdef CONFIG_BLK_DEV_THROTTLING + bio->start_time_ns = 0; + bio->io_start_time_ns = 0; + bio->bi_tg_end_io = NULL; + bio->bi_tg_private = NULL; +#endif #ifdef CONFIG_BLK_CGROUP_IOCOST bio->bi_iocost_cost = 0; #endif @@ -256,6 +262,7 @@ void bio_init(struct bio *bio, struct block_device *bdev, struct bio_vec *table, bio->bi_max_vecs = max_vecs; bio->bi_io_vec = table; bio->bi_pool = NULL; + bio->bi_ext_flags = 0; } EXPORT_SYMBOL(bio_init); @@ -1792,6 +1799,10 @@ void bio_endio(struct bio *bio) } #endif +#ifdef CONFIG_BLK_DEV_THROTTLING + if (bio->bi_tg_end_io) + bio->bi_tg_end_io(bio); +#endif if (bio->bi_end_io) bio->bi_end_io(bio); } diff --git a/block/blk-throttle.c b/block/blk-throttle.c index 54a59af002ba..f7bc8b40bd16 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -268,11 +268,11 @@ static struct blkg_policy_data *throtl_pd_alloc(struct gendisk *disk, if (!tg) return NULL; - if (blkg_rwstat_init(&tg->stat_bytes, gfp)) - goto err_free_tg; - - if (blkg_rwstat_init(&tg->stat_ios, gfp)) - goto err_exit_stat_bytes; + if (blkg_rwstat_init(&tg->stat_bytes, gfp) || + blkg_rwstat_init(&tg->stat_ios, gfp) || + blkg_rwstat_init(&tg->service_time, gfp) || + blkg_rwstat_init(&tg->wait_time, gfp)) + goto err; throtl_service_queue_init(&tg->service_queue); @@ -289,9 +289,11 @@ static struct blkg_policy_data *throtl_pd_alloc(struct gendisk *disk, 
return &tg->pd; -err_exit_stat_bytes: +err: blkg_rwstat_exit(&tg->stat_bytes); -err_free_tg: + blkg_rwstat_exit(&tg->stat_ios); + blkg_rwstat_exit(&tg->service_time); + blkg_rwstat_exit(&tg->wait_time); kfree(tg); return NULL; } @@ -362,9 +364,19 @@ static void throtl_pd_free(struct blkg_policy_data *pd) timer_delete_sync(&tg->service_queue.pending_timer); blkg_rwstat_exit(&tg->stat_bytes); blkg_rwstat_exit(&tg->stat_ios); + blkg_rwstat_exit(&tg->service_time); + blkg_rwstat_exit(&tg->wait_time); kfree(tg); } +static void throtl_pd_reset(struct blkg_policy_data *pd) +{ + struct throtl_grp *tg = pd_to_tg(pd); + + blkg_rwstat_reset(&tg->service_time); + blkg_rwstat_reset(&tg->wait_time); +} + static struct throtl_grp * throtl_rb_first(struct throtl_service_queue *parent_sq) { @@ -917,6 +929,64 @@ static unsigned long tg_dispatch_time(struct throtl_grp *tg, struct bio *bio) return tg_dispatch_iops_time(tg, bio); } +static void throtl_stats_update_completion(struct throtl_grp *tg, + uint64_t start_time, + uint64_t io_start_time, + int op) +{ + unsigned long flags; + uint64_t now = sched_clock(); + + local_irq_save(flags); + if (time_after64(now, io_start_time)) + blkg_rwstat_add(&tg->service_time, op, now - io_start_time); + if (time_after64(io_start_time, start_time)) + blkg_rwstat_add(&tg->wait_time, op, io_start_time - start_time); + local_irq_restore(flags); +} + +static void throtl_bio_end_io(struct bio *bio) +{ + struct throtl_grp *tg; + + rcu_read_lock(); + /* see comments in throtl_bio_stats_start() */ + if (!bio_ext_flagged(bio, BIO_THROTL_STATED)) + goto out; + + tg = (struct throtl_grp *)bio->bi_tg_private; + if (!tg) + goto out; + + throtl_stats_update_completion(tg, bio_start_time_ns(bio), + bio_io_start_time_ns(bio), + bio_op(bio)); + blkg_put(tg_to_blkg(tg)); + bio_clear_ext_flag(bio, BIO_THROTL_STATED); +out: + rcu_read_unlock(); +} + +static inline void throtl_bio_stats_start(struct bio *bio, struct throtl_grp *tg) +{ + int op = bio_op(bio); + + /* + * 
It may happen that end_io will be called twice like dm-thin, + * which will save origin end_io first, and call its overwrite + * end_io and then the saved end_io. We use bio flag + * BIO_THROTL_STATED to do only once statistics. + */ + if ((op == REQ_OP_READ || op == REQ_OP_WRITE) && + !bio_ext_flagged(bio, BIO_THROTL_STATED)) { + blkg_get(tg_to_blkg(tg)); + bio_set_ext_flag(bio, BIO_THROTL_STATED); + bio->bi_tg_end_io = throtl_bio_end_io; + bio->bi_tg_private = tg; + bio_set_start_time_ns(bio); + } +} + /** * throtl_add_bio_tg - add a bio to the specified throtl_grp * @bio: bio to add @@ -1471,6 +1541,16 @@ static struct cftype throtl_legacy_files[] = { .private = offsetof(struct throtl_grp, stat_ios), .seq_show = tg_print_rwstat_recursive, }, + { + .name = "throttle.io_service_time", + .private = offsetof(struct throtl_grp, service_time), + .seq_show = tg_print_rwstat, + }, + { + .name = "throttle.io_wait_time", + .private = offsetof(struct throtl_grp, wait_time), + .seq_show = tg_print_rwstat, + }, { } /* terminate */ }; @@ -1656,7 +1736,18 @@ static void tg_flush_bios(struct throtl_grp *tg) static void throtl_pd_offline(struct blkg_policy_data *pd) { - tg_flush_bios(pd_to_tg(pd)); + struct throtl_grp *tg = pd_to_tg(pd); + struct blkcg_gq *blkg = pd_to_blkg(pd); + struct blkcg_gq *parent = blkg->parent; + + tg_flush_bios(tg); + + if (parent) { + blkg_rwstat_add_aux(&blkg_to_tg(parent)->service_time, + &tg->service_time); + blkg_rwstat_add_aux(&blkg_to_tg(parent)->wait_time, + &tg->wait_time); + } } struct blkcg_policy blkcg_policy_throtl = { @@ -1668,6 +1759,7 @@ struct blkcg_policy blkcg_policy_throtl = { .pd_online_fn = throtl_pd_online, .pd_offline_fn = throtl_pd_offline, .pd_free_fn = throtl_pd_free, + .pd_reset_stats_fn = throtl_pd_reset, }; void blk_throtl_cancel_bios(struct gendisk *disk) @@ -1741,6 +1833,8 @@ bool __blk_throtl_bio(struct bio *bio) struct throtl_data *td = tg->td; rcu_read_lock(); + throtl_bio_stats_start(bio, tg); + 
spin_lock_irq(&q->queue_lock); sq = &tg->service_queue; @@ -1818,6 +1912,8 @@ bool __blk_throtl_bio(struct bio *bio) out_unlock: spin_unlock_irq(&q->queue_lock); + if (!throttled) + bio_set_io_start_time_ns(bio); rcu_read_unlock(); return throttled; diff --git a/block/blk-throttle.h b/block/blk-throttle.h index 9d7a42c039a1..4fe8250785be 100644 --- a/block/blk-throttle.h +++ b/block/blk-throttle.h @@ -128,6 +128,10 @@ struct throtl_grp { struct blkg_rwstat stat_bytes; struct blkg_rwstat stat_ios; + /* total time spent on lower layer: scheduler, device and others */ + struct blkg_rwstat service_time; + /* total time spent on block throttle */ + struct blkg_rwstat wait_time; }; extern struct blkcg_policy blkcg_policy_throtl; diff --git a/include/linux/bio.h b/include/linux/bio.h index 3f71e3e78eca..92da13d3fdea 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -241,6 +241,21 @@ static inline void bio_cnt_set(struct bio *bio, unsigned int count) atomic_set(&bio->__bi_cnt, count); } +static inline bool bio_ext_flagged(struct bio *bio, unsigned int bit) +{ + return (bio->bi_ext_flags & (1U << bit)) != 0; +} + +static inline void bio_set_ext_flag(struct bio *bio, unsigned int bit) +{ + bio->bi_ext_flags |= (1U << bit); +} + +static inline void bio_clear_ext_flag(struct bio *bio, unsigned int bit) +{ + bio->bi_ext_flags &= ~(1U << bit); +} + static inline struct bio_vec *bio_first_bvec_all(struct bio *bio) { WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED)); diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index f5005861058e..1158a8dae73b 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -11,6 +11,7 @@ #include #include #include +#include #include struct bio_set; @@ -260,6 +261,12 @@ struct bio { struct blkcg_gq *bi_blkg; /* Time that this bio was issued. 
*/ u64 issue_time_ns; +#ifdef CONFIG_BLK_DEV_THROTTLING + unsigned long long start_time_ns; /* when passed to block throttle */ + unsigned long long io_start_time_ns; /* when no more throttle */ + bio_end_io_t *bi_tg_end_io; + void *bi_tg_private; +#endif #ifdef CONFIG_BLK_CGROUP_IOCOST u64 bi_iocost_cost; #endif @@ -291,6 +298,8 @@ struct bio { struct bio_set *bi_pool; + unsigned long bi_ext_flags; /* extend the bi_flags */ + CK_KABI_RESERVE(1) CK_KABI_RESERVE(2) CK_KABI_RESERVE(3) @@ -336,6 +345,37 @@ enum { BIO_FLAG_LAST }; +/* + * Extend bio flags should be added in here + */ +#define BIO_THROTL_STATED 0 /* bio already stated */ + +#ifdef CONFIG_BLK_DEV_THROTTLING +static inline void bio_set_start_time_ns(struct bio *bio) +{ + preempt_disable(); + bio->start_time_ns = sched_clock(); + preempt_enable(); +} + +static inline void bio_set_io_start_time_ns(struct bio *bio) +{ + preempt_disable(); + bio->io_start_time_ns = sched_clock(); + preempt_enable(); +} + +static inline uint64_t bio_start_time_ns(struct bio *bio) +{ + return bio->start_time_ns; +} + +static inline uint64_t bio_io_start_time_ns(struct bio *bio) +{ + return bio->io_start_time_ns; +} +#endif + typedef __u32 __bitwise blk_mq_req_flags_t; #define REQ_OP_BITS 8 -- Gitee From dc5108d2b6d5b6719ef0625e3fb823bd09ab754a Mon Sep 17 00:00:00 2001 From: Ferry Meng Date: Wed, 6 Mar 2024 17:51:39 +0800 Subject: [PATCH 3/7] anolis: blk-throttle: add throttled io/bytes counter ANBZ: #34471 cherry picked from devel-6.6 commit 7256805e79fdd0db40e234a2b479bef5a7072e51. Add 2 interfaces to stat io throttle information: blkio.throttle.total_io_queued blkio.throttle.total_bytes_queued These interfaces are used for monitoring throttled io/bytes and analyzing if delay has relation with io throttle. 
Signed-off-by: Joseph Qi Signed-off-by: Ferry Meng --- block/blk-throttle.c | 27 ++++++++++++++++++++++++++- block/blk-throttle.h | 4 ++++ 2 files changed, 30 insertions(+), 1 deletion(-) diff --git a/block/blk-throttle.c b/block/blk-throttle.c index f7bc8b40bd16..622113b46211 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -271,7 +271,9 @@ static struct blkg_policy_data *throtl_pd_alloc(struct gendisk *disk, if (blkg_rwstat_init(&tg->stat_bytes, gfp) || blkg_rwstat_init(&tg->stat_ios, gfp) || blkg_rwstat_init(&tg->service_time, gfp) || - blkg_rwstat_init(&tg->wait_time, gfp)) + blkg_rwstat_init(&tg->wait_time, gfp) || + blkg_rwstat_init(&tg->total_bytes_queued, gfp) || + blkg_rwstat_init(&tg->total_io_queued, gfp)) goto err; throtl_service_queue_init(&tg->service_queue); @@ -294,6 +296,8 @@ static struct blkg_policy_data *throtl_pd_alloc(struct gendisk *disk, blkg_rwstat_exit(&tg->stat_ios); blkg_rwstat_exit(&tg->service_time); blkg_rwstat_exit(&tg->wait_time); + blkg_rwstat_exit(&tg->total_bytes_queued); + blkg_rwstat_exit(&tg->total_io_queued); kfree(tg); return NULL; } @@ -366,6 +370,8 @@ static void throtl_pd_free(struct blkg_policy_data *pd) blkg_rwstat_exit(&tg->stat_ios); blkg_rwstat_exit(&tg->service_time); blkg_rwstat_exit(&tg->wait_time); + blkg_rwstat_exit(&tg->total_bytes_queued); + blkg_rwstat_exit(&tg->total_io_queued); kfree(tg); } @@ -375,6 +381,8 @@ static void throtl_pd_reset(struct blkg_policy_data *pd) blkg_rwstat_reset(&tg->service_time); blkg_rwstat_reset(&tg->wait_time); + blkg_rwstat_reset(&tg->total_bytes_queued); + blkg_rwstat_reset(&tg->total_io_queued); } static struct throtl_grp * @@ -1025,6 +1033,9 @@ static void throtl_add_bio_tg(struct bio *bio, struct throtl_qnode *qn, bio == throtl_peek_queued(&sq->queued[rw])) tg->flags |= THROTL_TG_IOPS_WAS_EMPTY; + blkg_rwstat_add(&tg->total_bytes_queued, bio_op(bio), + throtl_bio_data_size(bio)); + blkg_rwstat_add(&tg->total_io_queued, bio_op(bio), 1); throtl_enqueue_tg(tg); } 
@@ -1551,6 +1562,16 @@ static struct cftype throtl_legacy_files[] = { .private = offsetof(struct throtl_grp, wait_time), .seq_show = tg_print_rwstat, }, + { + .name = "throttle.total_bytes_queued", + .private = offsetof(struct throtl_grp, total_bytes_queued), + .seq_show = tg_print_rwstat, + }, + { + .name = "throttle.total_io_queued", + .private = offsetof(struct throtl_grp, total_io_queued), + .seq_show = tg_print_rwstat, + }, { } /* terminate */ }; @@ -1747,6 +1768,10 @@ static void throtl_pd_offline(struct blkg_policy_data *pd) &tg->service_time); blkg_rwstat_add_aux(&blkg_to_tg(parent)->wait_time, &tg->wait_time); + blkg_rwstat_add_aux(&blkg_to_tg(parent)->total_bytes_queued, + &tg->total_bytes_queued); + blkg_rwstat_add_aux(&blkg_to_tg(parent)->total_io_queued, + &tg->total_io_queued); } } diff --git a/block/blk-throttle.h b/block/blk-throttle.h index 4fe8250785be..2b8bbb3d4b5e 100644 --- a/block/blk-throttle.h +++ b/block/blk-throttle.h @@ -132,6 +132,10 @@ struct throtl_grp { struct blkg_rwstat service_time; /* total time spent on block throttle */ struct blkg_rwstat wait_time; + /* total bytes throttled */ + struct blkg_rwstat total_bytes_queued; + /* total IOs throttled */ + struct blkg_rwstat total_io_queued; }; extern struct blkcg_policy blkcg_policy_throtl; -- Gitee From a3e01dd4666ea29e5e0ce70132e1d0ee15940af0 Mon Sep 17 00:00:00 2001 From: Ferry Meng Date: Wed, 6 Mar 2024 17:51:53 +0800 Subject: [PATCH 4/7] anolis: block-throttle: add counters for completed io ANBZ: #34471 cherry picked from devel-6.6 commit 8c33aa0298e854c037fda71aeb88311785d70945. Now we have counters for wait_time and service_time, but no completed ios, so the average latency can not be measured. 
Signed-off-by: Jiufei Xue Signed-off-by: Joseph Qi Signed-off-by: Ferry Meng --- block/blk-throttle.c | 12 ++++++++++++ block/blk-throttle.h | 2 ++ 2 files changed, 14 insertions(+) diff --git a/block/blk-throttle.c b/block/blk-throttle.c index 622113b46211..3f73029d42e4 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -272,6 +272,7 @@ static struct blkg_policy_data *throtl_pd_alloc(struct gendisk *disk, blkg_rwstat_init(&tg->stat_ios, gfp) || blkg_rwstat_init(&tg->service_time, gfp) || blkg_rwstat_init(&tg->wait_time, gfp) || + blkg_rwstat_init(&tg->completed, gfp) || blkg_rwstat_init(&tg->total_bytes_queued, gfp) || blkg_rwstat_init(&tg->total_io_queued, gfp)) goto err; @@ -296,6 +297,7 @@ static struct blkg_policy_data *throtl_pd_alloc(struct gendisk *disk, blkg_rwstat_exit(&tg->stat_ios); blkg_rwstat_exit(&tg->service_time); blkg_rwstat_exit(&tg->wait_time); + blkg_rwstat_exit(&tg->completed); blkg_rwstat_exit(&tg->total_bytes_queued); blkg_rwstat_exit(&tg->total_io_queued); kfree(tg); @@ -370,6 +372,7 @@ static void throtl_pd_free(struct blkg_policy_data *pd) blkg_rwstat_exit(&tg->stat_ios); blkg_rwstat_exit(&tg->service_time); blkg_rwstat_exit(&tg->wait_time); + blkg_rwstat_exit(&tg->completed); blkg_rwstat_exit(&tg->total_bytes_queued); blkg_rwstat_exit(&tg->total_io_queued); kfree(tg); @@ -381,6 +384,7 @@ static void throtl_pd_reset(struct blkg_policy_data *pd) blkg_rwstat_reset(&tg->service_time); blkg_rwstat_reset(&tg->wait_time); + blkg_rwstat_reset(&tg->completed); blkg_rwstat_reset(&tg->total_bytes_queued); blkg_rwstat_reset(&tg->total_io_queued); } @@ -950,6 +954,7 @@ static void throtl_stats_update_completion(struct throtl_grp *tg, blkg_rwstat_add(&tg->service_time, op, now - io_start_time); if (time_after64(io_start_time, start_time)) blkg_rwstat_add(&tg->wait_time, op, io_start_time - start_time); + blkg_rwstat_add(&tg->completed, op, 1); local_irq_restore(flags); } @@ -1562,6 +1567,11 @@ static struct cftype throtl_legacy_files[] = 
{ .private = offsetof(struct throtl_grp, wait_time), .seq_show = tg_print_rwstat, }, + { + .name = "throttle.io_completed", + .private = offsetof(struct throtl_grp, completed), + .seq_show = tg_print_rwstat, + }, { .name = "throttle.total_bytes_queued", .private = offsetof(struct throtl_grp, total_bytes_queued), @@ -1768,6 +1778,8 @@ static void throtl_pd_offline(struct blkg_policy_data *pd) &tg->service_time); blkg_rwstat_add_aux(&blkg_to_tg(parent)->wait_time, &tg->wait_time); + blkg_rwstat_add_aux(&blkg_to_tg(parent)->completed, + &tg->completed); blkg_rwstat_add_aux(&blkg_to_tg(parent)->total_bytes_queued, &tg->total_bytes_queued); blkg_rwstat_add_aux(&blkg_to_tg(parent)->total_io_queued, diff --git a/block/blk-throttle.h b/block/blk-throttle.h index 2b8bbb3d4b5e..8dcc9dca1ef4 100644 --- a/block/blk-throttle.h +++ b/block/blk-throttle.h @@ -132,6 +132,8 @@ struct throtl_grp { struct blkg_rwstat service_time; /* total time spent on block throttle */ struct blkg_rwstat wait_time; + /* total IOs completed */ + struct blkg_rwstat completed; /* total bytes throttled */ struct blkg_rwstat total_bytes_queued; /* total IOs throttled */ -- Gitee From 271a1f105c73e21f80e868803f24bfb0f9cbd1ae Mon Sep 17 00:00:00 2001 From: Ferry Meng Date: Wed, 6 Mar 2024 17:52:18 +0800 Subject: [PATCH 5/7] anolis: blk-throttle: add io latency indicators in cgroupV2 ANBZ: #34471 cherry picked from devel-6.6 commit 013b1c893728820d57307feb7b657458fcb97528. Currently we have already supported io_{wait_time/completed/service_time} and total_{bytes_queued/io_queued} counters in cgroupV1 (blkio cgroup). Now we offer the same interface in cgroupV2 (under io cgroup). Integrate all indicators into one file, named "io.extstat". Before you read it in subcgroup, remember to enable "io" in ancestor's "cgroup.subtree_control". 
Signed-off-by: Ferry Meng Reviewed-by: Joseph Qi Signed-off-by: Joseph Qi --- Documentation/admin-guide/cgroup-v2.rst | 24 ++++++++++ block/blk-throttle.c | 61 +++++++++++++++++++++++++ 2 files changed, 85 insertions(+) diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst index 8ad0b2781317..13357edd28df 100644 --- a/Documentation/admin-guide/cgroup-v2.rst +++ b/Documentation/admin-guide/cgroup-v2.rst @@ -2012,6 +2012,30 @@ IO Interface Files 8:16 rbytes=1459200 wbytes=314773504 rios=192 wios=353 dbytes=0 dios=0 8:0 rbytes=90430464 wbytes=299008000 rios=8950 wios=1252 dbytes=50331648 dios=3021 + io.extstat + A read-only nested-keyed file. + + Lines are keyed by $MAJ:$MIN device numbers and not ordered. + The following nested keys are defined. + + ======== ============================= + rwait IO read wait time + wwait IO write wait time + rserv IO read service time + wserv IO write service time + rcomp Number of completed read IOs + wcomp Number of completed write IOs + rbytesq Bytes of queued read IOs + wbytesq Bytes of queued write IOs + riosq Number of queued read IOs + wiosq Number of queued write IOs + ======== ============================= + + An example read output follows:: + + 253:16 rwait=0 wwait=3300 rserv=0 wserv=414366321956 rcomp=0 wcomp=12 rbytesq=0 wbytesq=40960000 riosq=0 wiosq=12 + 253:0 rwait=0 wwait=0 rserv=0 wserv=0 rcomp=0 wcomp=0 rbytesq=0 wbytesq=0 riosq=0 wiosq=0 + io.cost.qos A read-write nested-keyed file which exists only on the root cgroup. 
diff --git a/block/blk-throttle.c b/block/blk-throttle.c index 3f73029d42e4..42600149eb23 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -1630,6 +1630,56 @@ static u64 tg_prfill_limit(struct seq_file *sf, struct blkg_policy_data *pd, return 0; } +static u64 tg_prfill_extstat(struct seq_file *sf, struct blkg_policy_data *pd, + int off) +{ + struct throtl_grp *tg = pd_to_tg(pd); + const char *dname = blkg_dev_name(pd->blkg); + char bufs[10][21] = { "0", "0", "0", "0", "0", "0", "0", "0", "0", "0" }; + struct blkg_rwstat_sample tmp = { }; + + if (!dname) + return 0; + + /* read/write IOs wait time */ + blkg_rwstat_read(&tg->wait_time, &tmp); + snprintf(bufs[0], sizeof(bufs[0]), "%llu", + tmp.cnt[BLKG_RWSTAT_READ]); + snprintf(bufs[1], sizeof(bufs[1]), "%llu", + tmp.cnt[BLKG_RWSTAT_WRITE]); + /* read/write IOs service time */ + blkg_rwstat_read(&tg->service_time, &tmp); + snprintf(bufs[2], sizeof(bufs[2]), "%llu", + tmp.cnt[BLKG_RWSTAT_READ]); + snprintf(bufs[3], sizeof(bufs[3]), "%llu", + tmp.cnt[BLKG_RWSTAT_WRITE]); + /* read/write completed IOs */ + blkg_rwstat_read(&tg->completed, &tmp); + snprintf(bufs[4], sizeof(bufs[4]), "%llu", + tmp.cnt[BLKG_RWSTAT_READ]); + snprintf(bufs[5], sizeof(bufs[5]), "%llu", + tmp.cnt[BLKG_RWSTAT_WRITE]); + /* read/write queued bytes */ + blkg_rwstat_read(&tg->total_bytes_queued, &tmp); + snprintf(bufs[6], sizeof(bufs[6]), "%llu", + tmp.cnt[BLKG_RWSTAT_READ]); + snprintf(bufs[7], sizeof(bufs[7]), "%llu", + tmp.cnt[BLKG_RWSTAT_WRITE]); + /* read/write queued IOs */ + blkg_rwstat_read(&tg->total_io_queued, &tmp); + snprintf(bufs[8], sizeof(bufs[8]), "%llu", + tmp.cnt[BLKG_RWSTAT_READ]); + snprintf(bufs[9], sizeof(bufs[9]), "%llu", + tmp.cnt[BLKG_RWSTAT_WRITE]); + + seq_printf(sf, "%s rwait=%s wwait=%s rserv=%s wserv=%s rcomp=%s wcomp=%s " + "rbytesq=%s wbytesq=%s riosq=%s wiosq=%s\n", + dname, bufs[0], bufs[1], bufs[2], bufs[3], bufs[4], + bufs[5], bufs[6], bufs[7], bufs[8], bufs[9]); + + return 0; +} + static int 
tg_print_limit(struct seq_file *sf, void *v) { blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), tg_prfill_limit, @@ -1637,6 +1687,13 @@ static int tg_print_limit(struct seq_file *sf, void *v) return 0; } +static int tg_print_extstat(struct seq_file *sf, void *v) +{ + blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), tg_prfill_extstat, + &blkcg_policy_throtl, 0, false); + return 0; +} + static ssize_t tg_set_limit(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { @@ -1724,6 +1781,10 @@ static struct cftype throtl_files[] = { .seq_show = tg_print_limit, .write = tg_set_limit, }, + { + .name = "extstat", + .seq_show = tg_print_extstat, + }, { } /* terminate */ }; -- Gitee From 0fe8c42202b442e5add69d66ca3d542fc2cfd846 Mon Sep 17 00:00:00 2001 From: Joseph Qi Date: Wed, 8 Apr 2026 17:31:14 +0800 Subject: [PATCH 6/7] anolis: blk-throttle: fix io_start_time_ns not set for throttled bios ANBZ: #34471 cherry picked from devel-6.6 commit a6ca4e4560e4df57bc6d04c1eb681a20b8ee7cae. When a bio is throttled and queued in the throttle queue, bio_set_io_start_time_ns() is never called because it is only set in blk_throtl_bio(). But after the bio is dispatched from the throttle queue, it goes through blk_throtl_dispatch_work_fn() -> submit_bio_noacct_nocheck(), which bypasses blk_throtl_bio() entirely. As a result, bio->io_start_time_ns remains 0 for all throttled bios. This causes completely wrong statistics in throtl_stats_update_completion(): - service_time = sched_clock() - 0 = nanoseconds since boot (huge bogus value) - wait_time is never recorded (time_after64(0, start_time) is false) Fix this by calling bio_set_io_start_time_ns() in blk_throtl_dispatch_work_fn() right before the bio is submitted, correctly recording the moment the bio leaves the throttle queue. 
With this fix: - wait_time = io_start_time_ns - start_time_ns (throttle queue time) - service_time = completion_time - io_start_time_ns (device processing time) Fixes: 25789f9ed2f7 ("ck: blk-throttle: support io delay stats") Signed-off-by: Joseph Qi Reviewed-by: Baokun Li Reviewed-by: Ferry Meng --- block/blk-throttle.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/block/blk-throttle.c b/block/blk-throttle.c index 42600149eb23..2eba414081f2 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -1304,8 +1304,10 @@ static void blk_throtl_dispatch_work_fn(struct work_struct *work) if (!bio_list_empty(&bio_list_on_stack)) { blk_start_plug(&plug); - while ((bio = bio_list_pop(&bio_list_on_stack))) + while ((bio = bio_list_pop(&bio_list_on_stack))) { + bio_set_io_start_time_ns(bio); submit_bio_noacct_nocheck(bio, false); + } blk_finish_plug(&plug); } } -- Gitee From ff35beecd7f7f993b88d8a6600357251acf619e5 Mon Sep 17 00:00:00 2001 From: Joseph Qi Date: Wed, 8 Apr 2026 19:15:33 +0800 Subject: [PATCH 7/7] anolis: blk-throttle: pass bi_opf instead of bio_op() to preserve SYNC semantics ANBZ: #34471 cherry picked from devel-6.6 commit a23ee2fbff8d697f067bae1e329845ceb9aa4e6d. blkg_rwstat_add() uses op_is_sync(op) to classify IO into SYNC vs ASYNC buckets. However, the callers in blk-throttle io delay stats pass bio_op(bio) which returns only the operation type (REQ_OP_MASK), stripping all flags including REQ_SYNC. This causes all IO to be incorrectly classified as ASYNC in the following stats: - service_time - wait_time - completed - total_bytes_queued - total_io_queued Fix by passing bio->bi_opf which includes the full operation flags. Also update the throtl_stats_update_completion() parameter type from 'int op' to 'unsigned int opf' to match the bi_opf type and clarify that operation flags are being passed. 
Fixes: 25789f9ed2f7 ("ck: blk-throttle: support io delay stats") Signed-off-by: Joseph Qi Reviewed-by: Baokun Li Reviewed-by: Ferry Meng --- block/blk-throttle.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/block/blk-throttle.c b/block/blk-throttle.c index 2eba414081f2..f148ea4fd449 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -944,17 +944,17 @@ static unsigned long tg_dispatch_time(struct throtl_grp *tg, struct bio *bio) static void throtl_stats_update_completion(struct throtl_grp *tg, uint64_t start_time, uint64_t io_start_time, - int op) + blk_opf_t opf) { unsigned long flags; uint64_t now = sched_clock(); local_irq_save(flags); if (time_after64(now, io_start_time)) - blkg_rwstat_add(&tg->service_time, op, now - io_start_time); + blkg_rwstat_add(&tg->service_time, opf, now - io_start_time); if (time_after64(io_start_time, start_time)) - blkg_rwstat_add(&tg->wait_time, op, io_start_time - start_time); - blkg_rwstat_add(&tg->completed, op, 1); + blkg_rwstat_add(&tg->wait_time, opf, io_start_time - start_time); + blkg_rwstat_add(&tg->completed, opf, 1); local_irq_restore(flags); } @@ -973,7 +973,7 @@ static void throtl_bio_end_io(struct bio *bio) throtl_stats_update_completion(tg, bio_start_time_ns(bio), bio_io_start_time_ns(bio), - bio_op(bio)); + bio->bi_opf); blkg_put(tg_to_blkg(tg)); bio_clear_ext_flag(bio, BIO_THROTL_STATED); out: @@ -1038,9 +1038,9 @@ static void throtl_add_bio_tg(struct bio *bio, struct throtl_qnode *qn, bio == throtl_peek_queued(&sq->queued[rw])) tg->flags |= THROTL_TG_IOPS_WAS_EMPTY; - blkg_rwstat_add(&tg->total_bytes_queued, bio_op(bio), + blkg_rwstat_add(&tg->total_bytes_queued, bio->bi_opf, throtl_bio_data_size(bio)); - blkg_rwstat_add(&tg->total_io_queued, bio_op(bio), 1); + blkg_rwstat_add(&tg->total_io_queued, bio->bi_opf, 1); throtl_enqueue_tg(tg); } -- Gitee