diff --git a/arch/arm64/configs/openeuler_defconfig b/arch/arm64/configs/openeuler_defconfig index 4de42999f905a849de7f1eca4874c13ee56d4f6d..71e12eb64467e6792a7ad44f74ed61836c016448 100644 --- a/arch/arm64/configs/openeuler_defconfig +++ b/arch/arm64/configs/openeuler_defconfig @@ -822,6 +822,7 @@ CONFIG_BLK_DEBUG_FS=y CONFIG_BLK_DEBUG_FS_ZONED=y # CONFIG_BLK_SED_OPAL is not set # CONFIG_BLK_BIO_DISPATCH_ASYNC is not set +# CONFIG_BLK_IO_HIERARCHY_STATS is not set # # Partition Types diff --git a/arch/x86/configs/openeuler_defconfig b/arch/x86/configs/openeuler_defconfig index 928b4379af4d0c59ef3d76c6cbcc4c984dafaf25..7993f0f3e7a42a4da7fd5a5af5258f793f928e52 100644 --- a/arch/x86/configs/openeuler_defconfig +++ b/arch/x86/configs/openeuler_defconfig @@ -891,6 +891,7 @@ CONFIG_BLK_WBT_MQ=y CONFIG_BLK_DEBUG_FS=y # CONFIG_BLK_SED_OPAL is not set # CONFIG_BLK_BIO_DISPATCH_ASYNC is not set +# CONFIG_BLK_IO_HIERARCHY_STATS is not set # # Partition Types diff --git a/block/Kconfig b/block/Kconfig index da71e56f8682c029cd7d4b796d19b63fc9fd59a9..8804f21df1519c6bd5ad9678e99cfe98f26b36a7 100644 --- a/block/Kconfig +++ b/block/Kconfig @@ -26,6 +26,12 @@ menuconfig BLOCK if BLOCK +config BLK_BIO_ALLOC_TIME + bool + +config BLK_BIO_ALLOC_TASK + bool + config LBDAF bool "Support for large (2TB+) block devices and files" depends on !64BIT @@ -213,6 +219,8 @@ config BLK_BIO_DISPATCH_ASYNC feature will require special care in the driver to work. If unsure, say N here. +source "block/blk-io-hierarchy/Kconfig" + menu "Partition Types" source "block/partitions/Kconfig" diff --git a/block/Makefile b/block/Makefile index 572b33f32c07cf7056fb1121abba753a9ba8a0ac..bb711b0c307a6c45c84a8d6685b7b9dc790e73da 100644 --- a/block/Makefile +++ b/block/Makefile @@ -37,3 +37,4 @@ obj-$(CONFIG_BLK_WBT) += blk-wbt.o obj-$(CONFIG_BLK_DEBUG_FS) += blk-mq-debugfs.o obj-$(CONFIG_BLK_DEBUG_FS_ZONED)+= blk-mq-debugfs-zoned.o obj-$(CONFIG_BLK_SED_OPAL) += sed-opal.o +obj-$(CONFIG_BLK_IO_HIERARCHY_STATS) += blk-io-hierarchy/ diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c index b663cd8b9e46f68d6854119f2a960f4d4213dcba..25a407e5142dbf8ba8fd7848a197cadad020fb1a 100644 --- a/block/bfq-cgroup.c +++ b/block/bfq-cgroup.c @@ -22,6 +22,7 @@ #include #include +#include "blk.h" #include "bfq-iosched.h" #if defined(CONFIG_BFQ_GROUP_IOSCHED) && defined(CONFIG_DEBUG_BLK_CGROUP) @@ -60,7 +61,7 @@ static void bfqg_stats_update_group_wait_time(struct bfqg_stats *stats) if (!bfqg_stats_waiting(stats)) return; - now = ktime_get_ns(); + now = blk_time_get_ns(); if (now > stats->start_group_wait_time) blkg_stat_add(&stats->group_wait_time, now - stats->start_group_wait_time); @@ -77,7 +78,7 @@ static void bfqg_stats_set_start_group_wait_time(struct bfq_group *bfqg, return; if (bfqg == curr_bfqg) return; - stats->start_group_wait_time = ktime_get_ns(); + stats->start_group_wait_time = blk_time_get_ns(); bfqg_stats_mark_waiting(stats); } @@ -89,7 +90,7 @@ static void bfqg_stats_end_empty_time(struct bfqg_stats *stats) if (!bfqg_stats_empty(stats)) return; - now = ktime_get_ns(); + now = blk_time_get_ns(); if (now > stats->start_empty_time) blkg_stat_add(&stats->empty_time, now - stats->start_empty_time); @@ -116,7 +117,7 @@ void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg) if (bfqg_stats_empty(stats)) return; - stats->start_empty_time = ktime_get_ns(); + stats->start_empty_time = blk_time_get_ns(); bfqg_stats_mark_empty(stats); } @@ -125,7 +126,7 @@ void bfqg_stats_update_idle_time(struct bfq_group *bfqg) struct bfqg_stats *stats = &bfqg->stats; if (bfqg_stats_idling(stats)) { - u64 now = ktime_get_ns(); + u64 now = blk_time_get_ns(); if (now > stats->start_idle_time) blkg_stat_add(&stats->idle_time, @@ -138,7 +139,7 @@ void bfqg_stats_set_start_idle_time(struct bfq_group *bfqg) { struct bfqg_stats *stats = &bfqg->stats; - stats->start_idle_time = ktime_get_ns(); + stats->start_idle_time = blk_time_get_ns(); bfqg_stats_mark_idling(stats); } @@ -175,7 +176,7 @@ void bfqg_stats_update_completion(struct bfq_group *bfqg, u64 start_time_ns, u64 io_start_time_ns, unsigned int op) { struct bfqg_stats *stats = &bfqg->stats; - u64 now = ktime_get_ns(); + u64 now = blk_time_get_ns(); if (now > io_start_time_ns) blkg_rwstat_add(&stats->service_time, op, diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 473d9e31ff8771793ab1baa07422efc719882c1c..5e94c8cf76b506dd9cf557a8c9c052b649a516ce 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -140,6 +140,7 @@ #include "blk-mq-sched.h" #include "bfq-iosched.h" #include "blk-wbt.h" +#include "blk-io-hierarchy/stats.h" #define BFQ_BFQQ_FNS(name) \ void bfq_mark_bfqq_##name(struct bfq_queue *bfqq) \ @@ -844,7 +845,7 @@ static struct request *bfq_check_fifo(struct bfq_queue *bfqq, rq = rq_entry_fifo(bfqq->fifo.next); - if (rq == last || ktime_get_ns() < rq->fifo_time) + if (rq == last || blk_time_get_ns() < rq->fifo_time) return NULL; bfq_log_bfqq(bfqq->bfqd, bfqq, "check_fifo: returned %p", rq); @@ -1566,7 +1567,7 @@ static void bfq_bfqq_handle_idle_busy_switch(struct bfq_data *bfqd, * bfq_bfqq_update_budg_for_activation for * details on the usage of the next variable. */ - arrived_in_time = ktime_get_ns() <= + arrived_in_time = blk_time_get_ns() <= bfqq->ttime.last_end_request + bfqd->bfq_slice_idle * 3; @@ -1882,8 +1883,10 @@ static bool bfq_bio_merge(struct blk_mq_hw_ctx *hctx, struct bio *bio) ret = blk_mq_sched_try_merge(q, bio, &free); spin_unlock_irq(&bfqd->lock); - if (free) + if (free) { + rq_hierarchy_end_io_acct(free, STAGE_BFQ); blk_mq_free_request(free); + } return ret; } @@ -2468,7 +2471,7 @@ static void bfq_set_budget_timeout(struct bfq_data *bfqd, else timeout_coeff = bfqq->entity.weight / bfqq->entity.orig_weight; - bfqd->last_budget_start = ktime_get(); + bfqd->last_budget_start = blk_time_get(); bfqq->budget_timeout = jiffies + bfqd->bfq_timeout * timeout_coeff; @@ -2568,7 +2571,7 @@ static void bfq_arm_slice_timer(struct bfq_data *bfqd) else if (bfqq->wr_coeff > 1) sl = max_t(u32, sl, 20ULL * NSEC_PER_MSEC); - bfqd->last_idling_start = ktime_get(); + bfqd->last_idling_start = blk_time_get(); hrtimer_start(&bfqd->idle_slice_timer, ns_to_ktime(sl), HRTIMER_MODE_REL); bfqg_stats_set_start_idle_time(bfqq_group(bfqq)); @@ -2605,7 +2608,7 @@ static void bfq_reset_rate_computation(struct bfq_data *bfqd, struct request *rq) { if (rq != NULL) { /* new rq dispatch now, reset accordingly */ - bfqd->last_dispatch = bfqd->first_dispatch = ktime_get_ns(); + bfqd->last_dispatch = bfqd->first_dispatch = blk_time_get_ns(); bfqd->peak_rate_samples = 1; bfqd->sequential_samples = 0; bfqd->tot_sectors_dispatched = bfqd->last_rq_max_size = @@ -2762,7 +2765,7 @@ static void bfq_update_rate_reset(struct bfq_data *bfqd, struct request *rq) */ static void bfq_update_peak_rate(struct bfq_data *bfqd, struct request *rq) { - u64 now_ns = ktime_get_ns(); + u64 now_ns = blk_time_get_ns(); if (bfqd->peak_rate_samples == 0) { /* first dispatch */ bfq_log(bfqd, "update_peak_rate: goto reset, samples %d", @@ -3099,7 +3102,7 @@ static bool bfq_bfqq_is_slow(struct bfq_data *bfqd, struct bfq_queue *bfqq, if (compensate) delta_ktime = bfqd->last_idling_start; else - delta_ktime = ktime_get(); + delta_ktime = blk_time_get(); delta_ktime = ktime_sub(delta_ktime, bfqd->last_budget_start); delta_usecs = ktime_to_us(delta_ktime); @@ -4168,6 +4171,8 @@ static struct request *bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) idle_timer_disabled ? in_serv_queue : NULL, idle_timer_disabled); + if (rq) + rq_hierarchy_end_io_acct(rq, STAGE_BFQ); return rq; } @@ -4410,7 +4415,7 @@ static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, bfq_clear_bfqq_sync(bfqq); /* set end request to minus infinity from now */ - bfqq->ttime.last_end_request = ktime_get_ns() + 1; + bfqq->ttime.last_end_request = blk_time_get_ns() + 1; bfq_mark_bfqq_IO_bound(bfqq); @@ -4528,7 +4533,7 @@ static void bfq_update_io_thinktime(struct bfq_data *bfqd, struct bfq_queue *bfqq) { struct bfq_ttime *ttime = &bfqq->ttime; - u64 elapsed = ktime_get_ns() - bfqq->ttime.last_end_request; + u64 elapsed = blk_time_get_ns() - bfqq->ttime.last_end_request; elapsed = min_t(u64, elapsed, 2ULL * bfqd->bfq_slice_idle); @@ -4697,7 +4702,8 @@ static bool __bfq_insert_request(struct bfq_data *bfqd, struct request *rq) bfq_add_request(rq); idle_timer_disabled = waiting && !bfq_bfqq_wait_request(bfqq); - rq->fifo_time = ktime_get_ns() + bfqd->bfq_fifo_expire[rq_is_sync(rq)]; + rq->fifo_time = blk_time_get_ns() + + bfqd->bfq_fifo_expire[rq_is_sync(rq)]; list_add_tail(&rq->queuelist, &bfqq->fifo); bfq_rq_enqueued(bfqd, bfqq, rq); @@ -4750,6 +4756,7 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, spin_lock_irq(&bfqd->lock); if (blk_mq_sched_try_insert_merge(q, rq, &free)) { spin_unlock_irq(&bfqd->lock); + rq_list_hierarchy_end_io_acct(&free, STAGE_BFQ); blk_mq_free_requests(&free); return; } @@ -4797,6 +4804,7 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, static void bfq_insert_requests(struct blk_mq_hw_ctx *hctx, struct list_head *list, bool at_head) { + rq_list_hierarchy_start_io_acct(list, STAGE_BFQ); while (!list_empty(list)) { struct request *rq; @@ -4853,7 +4861,7 @@ static void bfq_completed_request(struct bfq_queue *bfqq, struct bfq_data *bfqd) bfq_weights_tree_remove(bfqd, bfqq); } - now_ns = ktime_get_ns(); + now_ns = blk_time_get_ns(); bfqq->ttime.last_end_request = now_ns; @@ -5394,6 +5402,7 @@ static void bfq_exit_queue(struct elevator_queue *e) struct bfq_queue *bfqq, *n; struct request_queue *q = bfqd->queue; + blk_mq_unregister_hierarchy(q, STAGE_BFQ); hrtimer_cancel(&bfqd->idle_slice_timer); spin_lock_irq(&bfqd->lock); @@ -5560,6 +5569,7 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e) bfq_init_entity(&bfqd->oom_bfqq.entity, bfqd->root_group); wbt_disable_default(q); + blk_mq_register_hierarchy(q, STAGE_BFQ); return 0; out_free: diff --git a/block/bio.c b/block/bio.c index 06193e854577e1ecff96d66b4f382084ceedd2a5..ff18f6839063016bb5c2d0419c0400b4235afe64 100644 --- a/block/bio.c +++ b/block/bio.c @@ -33,6 +33,7 @@ #include #include "blk.h" #include "blk-rq-qos.h" +#include "blk-io-hierarchy/stats.h" /* * Test patch to inline a certain number of bi_io_vec's inside the bio @@ -245,6 +246,14 @@ struct bio_vec *bvec_alloc(gfp_t gfp_mask, int nr, unsigned long *idx, void bio_uninit(struct bio *bio) { bio_disassociate_task(bio); +#ifdef CONFIG_BLK_BIO_ALLOC_TASK + if (bio->pid) { + put_pid(bio->pid); + bio->pid = NULL; + } +#endif + bio_hierarchy_end(bio); + bio_free_hierarchy_data(bio); } EXPORT_SYMBOL(bio_uninit); @@ -285,6 +294,14 @@ void bio_init(struct bio *bio, struct bio_vec *table, bio->bi_io_vec = table; bio->bi_max_vecs = max_vecs; + +#ifdef CONFIG_BLK_BIO_ALLOC_TIME + bio->bi_alloc_time_ns = blk_time_get_ns(); +#endif + +#ifdef CONFIG_BLK_BIO_ALLOC_TASK + bio->pid = get_pid(task_pid(current)); +#endif } EXPORT_SYMBOL(bio_init); diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index d4a8d8fbe1a0ebe443515c80ccb060ab0d0e8a9e..c0187bf00f714c63671d0459d7d4125f54ec4b26 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -1729,7 +1729,7 @@ static void blkcg_scale_delay(struct blkcg_gq *blkg, u64 now) */ static void blkcg_maybe_throttle_blkg(struct blkcg_gq *blkg, bool use_memdelay) { - u64 now = ktime_to_ns(ktime_get()); + u64 now = blk_time_get_ns(); u64 exp; u64 delay_nsec = 0; int tok; diff --git a/block/blk-core.c b/block/blk-core.c index acf5585b055766d1e9c9b557c8550dd1987bb88c..0c74101424dc186b999de4db90f7c306813b7d03 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -43,6 +43,7 @@ #include "blk-mq.h" #include "blk-mq-sched.h" #include "blk-rq-qos.h" +#include "blk-io-hierarchy/stats.h" #ifdef CONFIG_DEBUG_FS struct dentry *blk_debugfs_root; @@ -454,7 +455,7 @@ void __blk_rq_init(struct request_queue *q, struct request *rq) RB_CLEAR_NODE(&rq->rb_node); rq->tag = -1; rq->internal_tag = -1; - rq->start_time_ns = ktime_get_ns(); + rq->start_time_ns = blk_time_get_ns(); rq->part = NULL; } @@ -537,8 +538,10 @@ static void req_bio_endio(struct request *rq, struct bio *bio, bio_advance(bio, nbytes); /* don't actually finish bio if it's part of flush sequence */ - if (bio->bi_iter.bi_size == 0 && !(rq->rq_flags & RQF_FLUSH_SEQ)) + if (bio->bi_iter.bi_size == 0 && !(rq->rq_flags & RQF_FLUSH_SEQ)) { + req_bio_hierarchy_end(rq, bio); bio_endio(bio); + } } void blk_dump_rq_flags(struct request *rq, char *msg) @@ -1001,6 +1004,15 @@ void blk_exit_queue(struct request_queue *q) bdi_put(q->backing_dev_info); } +static void blk_mq_unregister_default_hierarchy(struct request_queue *q) +{ + blk_mq_unregister_hierarchy(q, STAGE_GETTAG); + blk_mq_unregister_hierarchy(q, STAGE_PLUG); + blk_mq_unregister_hierarchy(q, STAGE_HCTX); + blk_mq_unregister_hierarchy(q, STAGE_REQUEUE); + blk_mq_unregister_hierarchy(q, STAGE_RQ_DRIVER); +} + /** * blk_cleanup_queue - shutdown a request queue * @q: request queue to shutdown @@ -1088,6 +1100,7 @@ void blk_cleanup_queue(struct request_queue *q) blk_exit_queue(q); if (q->mq_ops) { + blk_mq_unregister_default_hierarchy(q); blk_mq_cancel_work_sync(q); blk_mq_exit_queue(q); } @@ -2106,6 +2119,7 @@ bool bio_attempt_back_merge(struct request_queue *q, struct request *req, req->biotail->bi_next = bio; req->biotail = bio; req->__data_len += bio->bi_iter.bi_size; + blk_rq_update_bi_alloc_time(req, bio, NULL); blk_account_io_start(req, false); return true; @@ -2129,6 +2143,7 @@ bool bio_attempt_front_merge(struct request_queue *q, struct request *req, req->__sector = bio->bi_iter.bi_sector; req->__data_len += bio->bi_iter.bi_size; + blk_rq_update_bi_alloc_time(req, bio, NULL); blk_account_io_start(req, false); return true; @@ -2149,6 +2164,7 @@ bool bio_attempt_discard_merge(struct request_queue *q, struct request *req, req->biotail = bio; req->__data_len += bio->bi_iter.bi_size; req->nr_phys_segments = segments + 1; + blk_rq_update_bi_alloc_time(req, bio, NULL); blk_account_io_start(req, false); return true; @@ -2613,6 +2629,12 @@ generic_make_request_checks(struct bio *bio) */ create_io_context(GFP_ATOMIC, q->node); + /* + * On the one hand REQ_PREFLUSH | REQ_FUA can be cleared above, on the + * other hand it doesn't make sense to count invalid bio. Split bio will + * be accounted separately. + */ + bio_hierarchy_start(bio); if (!blkcg_bio_issue_check(q, bio)) return false; @@ -2952,7 +2974,7 @@ blk_status_t __blk_insert_cloned_request(struct request_queue *q, u64 now = 0; if (blk_mq_need_time_stamp(rq)) - now = ktime_get_ns(); + now = blk_time_get_ns(); blk_account_io_done(rq, now); } @@ -3304,7 +3326,7 @@ void blk_start_request(struct request *req) blk_dequeue_request(req); if (test_bit(QUEUE_FLAG_STATS, &req->q->queue_flags)) { - req->io_start_time_ns = ktime_get_ns(); + req->io_start_time_ns = blk_time_get_ns(); #ifdef CONFIG_BLK_DEV_THROTTLING_LOW req->throtl_size = blk_rq_sectors(req); #endif @@ -3509,7 +3531,7 @@ EXPORT_SYMBOL_GPL(blk_unprep_request); void blk_finish_request(struct request *req, blk_status_t error) { struct request_queue *q = req->q; - u64 now = ktime_get_ns(); + u64 now = blk_time_get_ns(); lockdep_assert_held(req->q->queue_lock); WARN_ON_ONCE(q->mq_ops); @@ -3727,6 +3749,7 @@ void blk_rq_bio_prep(struct request_queue *q, struct request *rq, rq->__data_len = bio->bi_iter.bi_size; rq->bio = rq->biotail = bio; + blk_rq_update_bi_alloc_time(rq, bio, NULL); if (bio->bi_disk) rq->rq_disk = bio->bi_disk; @@ -3923,6 +3946,7 @@ void blk_start_plug(struct blk_plug *plug) * Store ordering should not be needed here, since a potential * preempt will imply a full memory barrier */ + tsk->_resvd->cur_ktime = 0; tsk->plug = plug; } EXPORT_SYMBOL(blk_start_plug); @@ -4060,6 +4084,9 @@ void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule) */ if (q) queue_unplugged(q, depth, from_schedule); + + current->_resvd->cur_ktime = 0; + current->flags &= ~PF_BLOCK_TS; } void blk_finish_plug(struct blk_plug *plug) diff --git a/block/blk-flush.c b/block/blk-flush.c index c1bfcde165af5ae993faa73d890bc8fb06ad9b54..e788e5513c9e0d8729fd724c57bf972214fac701 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -75,6 +75,7 @@ #include "blk-mq.h" #include "blk-mq-tag.h" #include "blk-mq-sched.h" +#include "blk-io-hierarchy/stats.h" /* PREFLUSH/FUA sequences */ enum { @@ -187,6 +188,7 @@ static bool blk_flush_complete_seq(struct request *rq, if (list_empty(pending)) fq->flush_pending_since = jiffies; list_move_tail(&rq->flush.list, pending); + rq_hierarchy_start_io_acct(rq, STAGE_HCTX); break; case REQ_FSEQ_DATA: @@ -245,6 +247,8 @@ static void flush_end_io(struct request *flush_rq, blk_status_t error) * avoiding use-after-free. */ WRITE_ONCE(flush_rq->state, MQ_RQ_IDLE); + blk_mq_put_alloc_task(flush_rq); + blk_rq_hierarchy_stats_complete(flush_rq); if (fq->rq_status != BLK_STS_OK) { error = fq->rq_status; fq->rq_status = BLK_STS_OK; @@ -274,6 +278,7 @@ static void flush_end_io(struct request *flush_rq, blk_status_t error) unsigned int seq = blk_flush_cur_seq(rq); BUG_ON(seq != REQ_FSEQ_PREFLUSH && seq != REQ_FSEQ_POSTFLUSH); + rq_hierarchy_end_io_acct(rq, STAGE_HCTX); queued |= blk_flush_complete_seq(rq, fq, seq, error); } @@ -378,6 +383,11 @@ static bool blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq, flush_rq->rq_disk = first_rq->rq_disk; flush_rq->end_io = flush_end_io; + blk_rq_hierarchy_stats_init(flush_rq); + blk_rq_init_bi_alloc_time(flush_rq, first_rq); + if (q->mq_ops) + blk_mq_get_alloc_task(flush_rq, first_rq->bio); + /* * Order WRITE ->end_io and WRITE rq->ref, and its pair is the one * implied in refcount_inc_not_zero() called from @@ -448,6 +458,8 @@ static void mq_flush_data_end_io(struct request *rq, blk_status_t error) blk_mq_put_driver_tag_hctx(hctx, rq); } + blk_rq_hierarchy_set_flush_done(rq); + /* * After populating an empty queue, kick it to avoid stall. Read * the comment in flush_end_io(). @@ -601,7 +613,8 @@ struct blk_flush_queue *blk_alloc_flush_queue(struct request_queue *q, int node, int cmd_size, gfp_t flags) { struct blk_flush_queue *fq; - int rq_sz = sizeof(struct request); + struct request_wrapper *wrapper; + int rq_sz = sizeof(struct request) + sizeof(struct request_wrapper); fq = kzalloc_node(sizeof(*fq), flags, node); if (!fq) @@ -611,10 +624,11 @@ struct blk_flush_queue *blk_alloc_flush_queue(struct request_queue *q, spin_lock_init(&fq->mq_flush_lock); rq_sz = round_up(rq_sz + cmd_size, cache_line_size()); - fq->flush_rq = kzalloc_node(rq_sz, flags, node); - if (!fq->flush_rq) + wrapper = kzalloc_node(rq_sz, flags, node); + if (!wrapper) goto fail_rq; + fq->flush_rq = (struct request *)(wrapper + 1); INIT_LIST_HEAD(&fq->flush_queue[0]); INIT_LIST_HEAD(&fq->flush_queue[1]); INIT_LIST_HEAD(&fq->flush_data_in_flight); @@ -633,6 +647,6 @@ void blk_free_flush_queue(struct blk_flush_queue *fq) if (!fq) return; - kfree(fq->flush_rq); + kfree(request_to_wrapper(fq->flush_rq)); kfree(fq); } diff --git a/block/blk-io-hierarchy/Kconfig b/block/blk-io-hierarchy/Kconfig new file mode 100644 index 0000000000000000000000000000000000000000..01019f6aa4252cb9fe3c6c85e5a8d49c96f21443 --- /dev/null +++ b/block/blk-io-hierarchy/Kconfig @@ -0,0 +1,145 @@ +# SPDX-License-Identifier: GPL-2.0-only + +menuconfig BLK_IO_HIERARCHY_STATS + bool "Enable hierarchy io stats" + default n + depends on BLK_DEBUG_FS=y + help + Enabling this lets the block layer to record additional information + in different io stages. Such information can be helpful to debug + performance and problems like io hang. + + If unsure, say N. + +if BLK_IO_HIERARCHY_STATS + +config HIERARCHY_BIO + bool "Support to record stats for bio lifetime" + default n + select BLK_BIO_ALLOC_TIME + help + Enabling this lets blk hierarchy stats to record additional information + for bio. Such information can be helpful to debug performance and + problems like io hang. + + If unsure, say N. + +config HIERARCHY_IO_DUMP + bool "Support to dump io that is throttled" + default n + select BLK_BIO_ALLOC_TIME + select BLK_BIO_ALLOC_TASK + depends on BLK_DEV_IO_TRACE + help + Enable this will create new debugfs entries to show user the detailed + information of IO that are submitted and not done yet, and user can + filter the result by IO stage or IO latency. + + If unsure, say N. + +config HIERARCHY_THROTTLE + bool "Enable hierarchy stats layer blk-throttle" + default n + depends on BLK_DEV_THROTTLING=y + help + Enabling this lets blk hierarchy stats to record additional information + for blk-throttle. Such information can be helpful to debug performance + and problems like io hang. + + If unsure, say N. + +config HIERARCHY_WBT + bool "Enable hierarchy stats layer blk-wbt" + default n + depends on BLK_WBT + help + Enabling this lets blk hierarchy stats to record additional information + for blk-wbt. Such information can be helpful to debug performance + and problems like io hang. + + If unsure, say N. + +config HIERARCHY_GETTAG + bool "Enable hierarchy stats layer gettag" + default n + help + Enabling this lets blk hierarchy stats to record additional information + for gettag. Such information can be helpful to debug performance + and problems like io hang. + + If unsure, say N. + +config HIERARCHY_PLUG + bool "Enable hierarchy stats layer plug" + default n + help + Enabling this lets blk hierarchy stats to record additional information + for plug. Such information can be helpful to debug performance + and problems like io hang. + + If unsure, say N. + +config HIERARCHY_DEADLINE + bool "Enable hierarchy stats layer mq-deadline" + default n + depends on MQ_IOSCHED_DEADLINE + help + Enabling this lets blk hierarchy stats to record additional information + for mq-deadline. Such information can be helpful to debug performance + and problems like io hang. + + If unsure, say N. + +config HIERARCHY_BFQ + bool "Enable hierarchy stats layer bfq" + default n + depends on IOSCHED_BFQ + help + Enabling this lets blk hierarchy stats to record additional information + for bfq. Such information can be helpful to debug performance + and problems like io hang. + + If unsure, say N. + +config HIERARCHY_KYBER + bool "Enable hierarchy stats layer kyber" + default n + depends on MQ_IOSCHED_KYBER + help + Enabling this lets blk hierarchy stats to record additional information + for kyber. Such information can be helpful to debug performance + and problems like io hang. + + If unsure, say N. + +config HIERARCHY_HCTX + bool "Enable hierarchy stats layer hctx" + default n + help + Enabling this lets blk hierarchy stats to record additional information + for hctx. Such information can be helpful to debug performance + and problems like io hang. + + If unsure, say N. + +config HIERARCHY_REQUEUE + bool "Enable hierarchy stats layer requeue" + default n + help + Enabling this lets blk hierarchy stats to record additional information + for requeue. Such information can be helpful to debug performance + and problems like io hang. + + If unsure, say N. + +config HIERARCHY_RQ_DRIVER + bool "Enable hierarchy stats layer rq_driver" + default n + help + Enabling this lets blk hierarchy stats to record additional information + for rq_driver. Such information can be helpful to debug performance + and problems like io hang. + + If unsure, say N. + +endif diff --git a/block/blk-io-hierarchy/Makefile b/block/blk-io-hierarchy/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..9b989d379e5807a5827c62cf62269ec168084df8 --- /dev/null +++ b/block/blk-io-hierarchy/Makefile @@ -0,0 +1,8 @@ +# +# Make file for blk_io_hierarchy_stats +# + +obj-$(CONFIG_BLK_IO_HIERARCHY_STATS) += blk_io_hierarchy_stats.o + +blk_io_hierarchy_stats-y := stats.o debugfs.o +obj-$(CONFIG_HIERARCHY_IO_DUMP) += iodump.o diff --git a/block/blk-io-hierarchy/debugfs.c b/block/blk-io-hierarchy/debugfs.c new file mode 100644 index 0000000000000000000000000000000000000000..29c17e116773b11b2804fbf36e679dc6c69153d3 --- /dev/null +++ b/block/blk-io-hierarchy/debugfs.c @@ -0,0 +1,230 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2024. Huawei Technologies Co., Ltd. All rights reserved. + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 and + * only version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#include +#include + +#include "../blk-mq-debugfs.h" +#include "stats.h" +#include "iodump.h" + +static const char *stage_name[NR_STAGE_GROUPS] = { +#ifdef CONFIG_HIERARCHY_THROTTLE + [STAGE_THROTTLE] = "throtl", +#endif +#ifdef CONFIG_HIERARCHY_WBT + [STAGE_WBT] = "wbt", +#endif +#ifdef CONFIG_HIERARCHY_GETTAG + [STAGE_GETTAG] = "gettag", +#endif +#ifdef CONFIG_HIERARCHY_PLUG + [STAGE_PLUG] = "plug", +#endif +#ifdef CONFIG_HIERARCHY_DEADLINE + [STAGE_DEADLINE] = "deadline", +#endif +#ifdef CONFIG_HIERARCHY_BFQ + [STAGE_BFQ] = "bfq", +#endif +#ifdef CONFIG_HIERARCHY_KYBER + [STAGE_KYBER] = "kyber", +#endif +#ifdef CONFIG_HIERARCHY_HCTX + [STAGE_HCTX] = "hctx", +#endif +#ifdef CONFIG_HIERARCHY_REQUEUE + [STAGE_REQUEUE] = "requeue", +#endif +#ifdef CONFIG_HIERARCHY_RQ_DRIVER + [STAGE_RQ_DRIVER] = "rq_driver", +#endif +#ifdef CONFIG_HIERARCHY_BIO + [STAGE_BIO] = "bio", +#endif +}; + +const char *hierarchy_stage_name(enum stage_group stage) +{ + return stage_name[stage]; +} + +static int __hierarchy_stats_show(struct hierarchy_stats_data *hstats_data, + struct seq_file *m, enum stage_group stage) +{ + u64 dispatched[NR_NEW_STAT_GROUPS] = {0}; + u64 completed[NR_NEW_STAT_GROUPS] = {0}; + u64 latency[NR_NEW_STAT_GROUPS] = {0}; + int cpu; + int i; + + for_each_possible_cpu(cpu) { + struct hierarchy_stats *stat = + per_cpu_ptr(hstats_data->hstats, cpu); + + for (i = 0; i < NR_NEW_STAT_GROUPS; ++i) { + dispatched[i] += stat->dispatched[i]; + completed[i] += stat->completed[i]; + latency[i] += stage_is_rq(stage) ? + stat->jiffies[i] : stat->nsecs[i]; + } + } + + if (stage_is_rq(stage)) + for (i = 0; i < NR_NEW_STAT_GROUPS; ++i) + latency[i] = + jiffies_to_msecs(latency[i]) * NSEC_PER_MSEC; + + seq_printf(m, "%llu %llu %llu %llu %llu %llu %llu %llu %llu %llu %llu %llu", + dispatched[STAT_READ], completed[STAT_READ], + latency[STAT_READ], dispatched[STAT_WRITE], + completed[STAT_WRITE], latency[STAT_WRITE], + dispatched[STAT_DISCARD], completed[STAT_DISCARD], + latency[STAT_DISCARD], dispatched[STAT_FLUSH], + completed[STAT_FLUSH], latency[STAT_FLUSH]); + + hierarchy_show_slow_io(hstats_data, m); + seq_putc(m, '\n'); + return 0; +} + +static void *hierarchy_stats_start(struct seq_file *m, loff_t *pos) +{ + enum stage_group stage = *pos; + + if (stage < 0 || stage >= NR_STAGE_GROUPS) + return NULL; + + return pos; +} + +static void *hierarchy_stats_next(struct seq_file *m, void *v, loff_t *pos) +{ + enum stage_group stage = ++(*pos); + + if (stage >= 0 && stage < NR_STAGE_GROUPS) + return pos; + + return NULL; +} + +static void hierarchy_stats_stop(struct seq_file *m, void *v) +{ +} + +static int hierarchy_stats_show(struct seq_file *m, void *v) +{ + enum stage_group stage = (*(loff_t *)v); + struct blk_io_hierarchy_stats *stats = m->private; + struct hierarchy_stats_data *hstats_data = + get_hstats_data(stats, stage); + + if (!hstats_data) + return 0; + + seq_printf(m, "%s ", hierarchy_stage_name(stage)); + __hierarchy_stats_show(hstats_data, m, stage); + put_hstats_data(stats, hstats_data); + return 0; +} + +static const struct seq_operations hierarchy_stats_ops = { + .start = hierarchy_stats_start, + .next = hierarchy_stats_next, + .stop = hierarchy_stats_stop, + .show = hierarchy_stats_show, +}; + +static int hierarchy_stats_show_single(void *v, struct seq_file *m) +{ + struct hierarchy_stage *hstage = v; + + return __hierarchy_stats_show(hstage->hstats_data, m, hstage->stage); +} + +static const struct blk_mq_debugfs_attr hierarchy_debugfs_attrs[] = { + {"stats", 0400, hierarchy_stats_show_single}, + {}, +}; + +static const struct blk_mq_debugfs_attr hierarchy_stats_attr[] = { + {"stats", 0400, .seq_ops = &hierarchy_stats_ops}, + {}, +}; + +static void hierarchy_register_stage(struct blk_io_hierarchy_stats *stats, + enum stage_group stage) +{ + struct hierarchy_stage *hstage = stats->hstage[stage]; + struct dentry *dir; + + if (!stage_name[stage] || hstage->debugfs_dir) + return; + + dir = debugfs_create_dir(stage_name[stage], stats->debugfs_dir); + if (IS_ERR(dir)) + return; + + hstage->debugfs_dir = dir; + debugfs_create_files(dir, hstage, hierarchy_debugfs_attrs); + io_hierarchy_register_iodump(hstage); +} + +static void hierarchy_unregister_stage(struct blk_io_hierarchy_stats *stats, + enum stage_group stage) +{ + struct hierarchy_stage *hstage = stats->hstage[stage]; + + if (!stage_name[stage] || !hstage->debugfs_dir) + return; + + debugfs_remove_recursive(hstage->debugfs_dir); + hstage->debugfs_dir = NULL; +} + +void blk_mq_debugfs_register_hierarchy(struct request_queue *q, + enum stage_group stage) +{ + struct blk_io_hierarchy_stats *stats = + queue_to_wrapper(q)->io_hierarchy_stats; + + if (!blk_mq_hierarchy_registered(q, stage) || + !blk_mq_debugfs_enabled(q)) + return; + + hierarchy_register_stage(stats, stage); +} + +void blk_mq_debugfs_unregister_hierarchy(struct request_queue *q, + enum stage_group stage) +{ + struct blk_io_hierarchy_stats *stats = + queue_to_wrapper(q)->io_hierarchy_stats; + + if (!blk_mq_hierarchy_registered(q, stage) || + !blk_mq_debugfs_enabled(q)) + return; + + hierarchy_unregister_stage(stats, stage); +} + +void blk_mq_debugfs_create_default_hierarchy_attr(struct request_queue *q) +{ + struct blk_io_hierarchy_stats *stats = + queue_to_wrapper(q)->io_hierarchy_stats; + + if (!blk_mq_debugfs_enabled(q)) + return; + + debugfs_create_files(stats->debugfs_dir, stats, hierarchy_stats_attr); +} diff --git a/block/blk-io-hierarchy/iodump.c b/block/blk-io-hierarchy/iodump.c new file mode 100644 index 0000000000000000000000000000000000000000..18bd813665f8d53bc2eecd7c6c5bf769fa24f68b --- /dev/null +++ b/block/blk-io-hierarchy/iodump.c @@ -0,0 +1,756 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2024. Huawei Technologies Co., Ltd. All rights reserved. + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 and + * only version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#include +#include +#include +#include + +#include "iodump.h" +#include "../blk.h" +#include "../blk-mq-debugfs.h" + +#define RWB_LEN 6 +#define PATH_LEN 64 +#define ms_to_ns(time) (time * NSEC_PER_MSEC) +#define DEFAULT_THRESHOLD 1000 + +static DEFINE_MUTEX(dump_mutex); + +struct bio_dump_data { + u64 stat_time; + struct list_head head; + spinlock_t lock; +}; + +struct rq_dump_data { + struct request_queue *q; + enum stage_group stage; + unsigned int tag; + unsigned int total_tags; + bool has_elevator; + bool enter_queue; +}; + +#ifdef CONFIG_HIERARCHY_BIO +struct pos_data { + enum stage_group stage; + unsigned int count; +}; + +struct bio_stage_dump_data { + union { + loff_t pos; + struct pos_data pdata; + }; + struct rq_dump_data rq_ddata; + u64 stat_time; +}; +#endif + +int blk_io_hierarchy_iodump_init(struct request_queue *q, + struct hierarchy_stage *hstage) +{ + hstage->threshold = DEFAULT_THRESHOLD; + + if (stage_is_bio(hstage->stage)) { + struct bio_dump_data *bio_ddata = + kmalloc(sizeof(*bio_ddata), GFP_KERNEL); + + if (!bio_ddata) + return -ENOMEM; + + INIT_LIST_HEAD(&bio_ddata->head); + spin_lock_init(&bio_ddata->lock); + hstage->dump_data = bio_ddata; + return 0; + } + + if (stage_is_rq(hstage->stage)) { + struct rq_dump_data *rq_ddata = + kzalloc(sizeof(*rq_ddata), GFP_KERNEL); + + if (!rq_ddata) + return -ENOMEM; + + rq_ddata->q = q; + rq_ddata->stage = hstage->stage; + hstage->dump_data = rq_ddata; + return 0; + } + +#ifdef CONFIG_HIERARCHY_BIO + BUILD_BUG_ON(sizeof(struct pos_data) != sizeof(loff_t)); + + if (hstage->stage == STAGE_BIO) { + struct bio_stage_dump_data *bstage_ddata = + kzalloc(sizeof(*bstage_ddata), GFP_KERNEL); + + if (!bstage_ddata) + return -ENOMEM; + + bstage_ddata->rq_ddata.q = q; + bstage_ddata->rq_ddata.stage = hstage->stage; + hstage->dump_data = bstage_ddata; + return 0; + } +#endif + + return -EINVAL; +} + +void blk_io_hierarchy_iodump_exit(struct request_queue *q, + enum stage_group stage) +{ + struct hierarchy_stage *hstage = + queue_to_wrapper(q)->io_hierarchy_stats->hstage[stage]; + + if (stage_is_bio(hstage->stage)) { + struct bio_dump_data *bio_ddata = hstage->dump_data; + + WARN(!list_empty(&bio_ddata->head), + "blk-io-hierarchy: disk %s stage %s unregistered whih throttled IO.\n", + kobject_name(q->kobj.parent), hierarchy_stage_name(stage)); + } + + kfree(hstage->dump_data); + hstage->dump_data = NULL; +} + +void hierarchy_add_bio(struct hierarchy_stage *hstage, struct bio *bio) +{ + unsigned long flags; + struct bio_hierarchy_data *data = bio->hdata; + struct bio_dump_data *bio_ddata = hstage->dump_data; + + spin_lock_irqsave(&bio_ddata->lock, flags); + list_add_tail(&data->hierarchy_list, &bio_ddata->head); + spin_unlock_irqrestore(&bio_ddata->lock, flags); +} + +void hierarchy_remove_bio(struct hierarchy_stage *hstage, struct bio *bio) +{ + unsigned long flags; + struct bio_hierarchy_data *data = bio->hdata; + struct bio_dump_data *bio_ddata = hstage->dump_data; + + spin_lock_irqsave(&bio_ddata->lock, flags); + list_del_init(&data->hierarchy_list); + spin_unlock_irqrestore(&bio_ddata->lock, flags); +} + +void bio_hierarchy_data_init(struct bio *bio, struct bio_hierarchy_data *hdata) +{ + hdata->bio = bio; + INIT_LIST_HEAD(&hdata->hierarchy_list); +} + +static void *bio_hierarchy_list_start(struct seq_file *m, loff_t *pos) + __acquires(&bio_ddata->lock) +{ + struct hierarchy_stage *hstage = m->private; + struct bio_dump_data *bio_ddata = hstage->dump_data; + + spin_lock_irq(&bio_ddata->lock); + bio_ddata->stat_time = blk_time_get_ns(); + + return seq_list_start(&bio_ddata->head, *pos); +} + +static void *bio_hierarchy_list_next(struct seq_file *m, void *v, loff_t *pos) +{ + struct hierarchy_stage *hstage = m->private; + struct bio_dump_data *bio_ddata = hstage->dump_data; + + return seq_list_next(v, &bio_ddata->head, pos); +} + +static void bio_hierarchy_list_stop(struct seq_file *m, void *v) + __releases(&hstage->lock) +{ + struct hierarchy_stage *hstage = m->private; + struct bio_dump_data *bio_ddata = hstage->dump_data; + + spin_unlock_irq(&bio_ddata->lock); +} + +static void __hierarchy_show_bio(struct seq_file *m, + struct bio_hierarchy_data *data, + enum stage_group stage, u64 duration) +{ + char rwbs[RWB_LEN]; + char path[PATH_LEN] = {0}; + struct bio *bio = data->bio; + struct task_struct *task = get_pid_task(bio->pid, PIDTYPE_PID); + + blk_fill_rwbs(rwbs, bio->bi_opf, bio->bi_iter.bi_size); +#ifdef CONFIG_BLK_CGROUP + cgroup_path(bio->bi_css->cgroup, path, PATH_LEN); +#endif + + seq_printf(m, "%s-%d %s stage %s bio %s %lu + %u cgroup %s started %llu ns ago\n", + task ? task->comm : "null", task ? task->pid : 0, + bio->bi_disk->disk_name, hierarchy_stage_name(stage), + rwbs, bio->bi_iter.bi_sector, bio_sectors(bio), path, + duration); + + if (task) + put_task_struct(task); +} + +static u64 get_duration(u64 a, u64 b) +{ + return a > b ? a - b : 0; +} + +static void hierarchy_show_bio(struct seq_file *m, + struct bio_hierarchy_data *data) +{ + u64 duration; + struct hierarchy_stage *hstage = m->private; + struct bio_dump_data *bio_ddata = hstage->dump_data; + + duration = get_duration(bio_ddata->stat_time, data->time); + if (hstage->threshold > ns_to_ms(duration)) + return; + + __hierarchy_show_bio(m, data, hstage->stage, duration); +} + +static int bio_hierarchy_list_show(struct seq_file *m, void *v) +{ + struct bio_hierarchy_data *data = + list_entry(v, struct bio_hierarchy_data, hierarchy_list); + + hierarchy_show_bio(m, data); + return 0; +} + +static const struct seq_operations hierarchy_bio_dump_ops = { + .start = bio_hierarchy_list_start, + .next = bio_hierarchy_list_next, + .stop = bio_hierarchy_list_stop, + .show = bio_hierarchy_list_show, +}; + +static int threshold_show(void *data, struct seq_file *m) +{ + struct hierarchy_stage *hstage = data; + + seq_printf(m, "%lu\n", hstage->threshold); + return 0; +} + +/* + * max size needed by different bases to express U64 + * HEX: "0xFFFFFFFFFFFFFFFF" --> 18 + * DEC: "18446744073709551615" --> 20 + * OCT: "01777777777777777777777" --> 23 + * pick the max one to define NUMBER_BUF_LEN + */ +#define MAX_BUF_LEN 24 +static ssize_t threshold_store(void *data, const char __user *buf, size_t count, + loff_t *ppos) +{ + int err; + unsigned long val; + char b[MAX_BUF_LEN + 1]; + struct hierarchy_stage *hstage = data; + + if (count > MAX_BUF_LEN) + return -EINVAL; + + if (copy_from_user(b, buf, count)) + return -EFAULT; + + b[count] = 0; + err = kstrtoul(b, 0, &val); + if (!err) + hstage->threshold = val; + + return err ? err : count; +} + +static void rq_hierarchy_init_dump_data(struct rq_dump_data *rq_ddata) +{ + struct request_queue *q = rq_ddata->q; + + rq_ddata->has_elevator = !!q->elevator; + + if (rq_ddata->has_elevator) + rq_ddata->total_tags = q->nr_hw_queues * q->nr_requests; + else + rq_ddata->total_tags = q->nr_hw_queues * + q->tag_set->queue_depth; +} + +static bool __rq_hierarchy_start(struct rq_dump_data *rq_ddata, + unsigned int tag) +{ + /* + * Grab .q_usage_counter so request pool won't go away, then no + * request use-after-free is possible during iteration. If queue is + * frozen, there won't be any inflight requests. + */ + if (!percpu_ref_tryget(&rq_ddata->q->q_usage_counter)) { + rq_ddata->enter_queue = false; + return false; + } + + rq_ddata->enter_queue = true; + rq_hierarchy_init_dump_data(rq_ddata); + rq_ddata->tag = tag; + + return tag < rq_ddata->total_tags + rq_ddata->q->nr_hw_queues; +} + +static bool __rq_hierarchy_next(struct rq_dump_data *rq_ddata) +{ + rq_ddata->tag++; + + return rq_ddata->tag < rq_ddata->total_tags + rq_ddata->q->nr_hw_queues; +} + +static void __rq_hierarchy_stop(struct rq_dump_data *rq_ddata) +{ + if (rq_ddata->enter_queue) { + percpu_ref_put(&rq_ddata->q->q_usage_counter); + rq_ddata->enter_queue = false; + } +} + +static void *rq_hierarchy_start(struct seq_file *m, loff_t *pos) + __acquires(&dump_mutex) +{ + struct hierarchy_stage *hstage = m->private; + struct rq_dump_data *rq_ddata = hstage->dump_data; + + mutex_lock(&dump_mutex); + + if (__rq_hierarchy_start(rq_ddata, *pos)) + return rq_ddata; + + return NULL; +} + +static void *rq_hierarchy_next(struct seq_file *m, void *v, loff_t *pos) +{ + struct rq_dump_data *rq_ddata = v; + + if (__rq_hierarchy_next(rq_ddata)) { + *pos = rq_ddata->tag; + return rq_ddata; + } + + (*pos)++; + return NULL; +} + +static void rq_hierarchy_stop(struct seq_file *m, void *v) + __releases(&dump_mutex) +{ + struct hierarchy_stage *hstage = m->private; + struct rq_dump_data *rq_ddata = hstage->dump_data; + + __rq_hierarchy_stop(rq_ddata); + mutex_unlock(&dump_mutex); +} + +static struct request *hierarchy_find_and_get_rq(struct rq_dump_data *rq_ddata) +{ + struct request *rq; + struct request_wrapper *rq_wrapper; + struct blk_mq_hw_ctx *hctx; + struct request_queue *q = rq_ddata->q; + unsigned int nr_tag = rq_ddata->tag; + unsigned int hctx_id; + + if (nr_tag >= rq_ddata->total_tags) { + hctx_id = nr_tag - rq_ddata->total_tags; + if (hctx_id >= q->nr_hw_queues) + return NULL; + + hctx = q->queue_hw_ctx[hctx_id]; + rq = hctx->fq->flush_rq; + } else if (rq_ddata->has_elevator) { + hctx_id = nr_tag / q->nr_requests; + if (hctx_id >= q->nr_hw_queues) + return NULL; + + hctx = q->queue_hw_ctx[hctx_id]; + rq = hctx->sched_tags->static_rqs[nr_tag % q->nr_requests]; + } else { + hctx_id = nr_tag / q->tag_set->queue_depth; + if (hctx_id >= q->nr_hw_queues) + return NULL; + + hctx = q->queue_hw_ctx[hctx_id]; + if (!hctx->tags) + return NULL; + + rq = hctx->tags->static_rqs[nr_tag % q->tag_set->queue_depth]; + } + + rq_wrapper = request_to_wrapper(rq); + /* + * fast path to avoid refcount cas operations for the request that + * is from other shared request_queue or other stages. + */ + if (rq->q != q || (rq_ddata->stage != STAGE_BIO && + READ_ONCE(rq_wrapper->stage) != rq_ddata->stage)) + return NULL; + + if (!refcount_inc_not_zero(&rq->ref)) + return NULL; + + /* Check again after request is pinned, in case request is resued. */ + if (rq->q != q) { + blk_mq_put_rq_ref(rq); + return NULL; + } + + if (rq_ddata->stage == STAGE_BIO) + return rq; + + /* + * Barrier is paired with the smp_store_release() in + * rq_hierarchy_start_io_acct(), so that if stage is read, uninitialized + * hierarchy_time won't be read. + */ + if (smp_load_acquire(&rq_wrapper->stage) != rq_ddata->stage) { + blk_mq_put_rq_ref(rq); + return NULL; + } + + return rq; +} + +static void hierarchy_show_rq(struct seq_file *m, struct request *rq, + u64 duration) +{ + struct request_wrapper *rq_wrapper = request_to_wrapper(rq); + struct task_struct *task = get_pid_task(rq_wrapper->pid, PIDTYPE_PID); + const char *name = hierarchy_stage_name(rq_wrapper->stage); + + seq_printf(m, "%s-%d %s stage %s ", task ? task->comm : "null", + task ? task->pid : 0, + rq->rq_disk ? rq->rq_disk->disk_name : "?", + name ? name : "?"); + debugfs_rq_show(m, rq); + seq_printf(m, " started %llu ns ago}\n", duration); + + if (task) + put_task_struct(task); +} + +static int rq_hierarchy_show(struct seq_file *m, void *v) +{ + u64 duration; + unsigned long htime; + struct hierarchy_stage *hstage = m->private; + struct request_wrapper *rq_wrapper; + struct request *rq = hierarchy_find_and_get_rq(v); + + if (!rq) + return 0; + + rq_wrapper = request_to_wrapper(rq); + htime = READ_ONCE(rq_wrapper->hierarchy_time); + htime = time_after(jiffies, htime) ? jiffies - htime : 0; + duration = jiffies_to_msecs(htime); + if (hstage->threshold <= duration) + hierarchy_show_rq(m, rq, ms_to_ns(duration)); + + blk_mq_put_rq_ref(rq); + return 0; +} + +static const struct seq_operations hierarchy_rq_dump_ops = { + .start = rq_hierarchy_start, + .next = rq_hierarchy_next, + .stop = rq_hierarchy_stop, + .show = rq_hierarchy_show, +}; + +static const struct blk_mq_debugfs_attr hierarchy_threshold_attr[] = { + { + "threshold", + 0600, + threshold_show, + threshold_store, + }, + {}, +}; + +static const struct blk_mq_debugfs_attr hierarchy_bio_dump_attr[] = { + { + "io_dump", + 0400, + .seq_ops = &hierarchy_bio_dump_ops, + }, + {}, +}; + +static const struct blk_mq_debugfs_attr hierarchy_rq_dump_attr[] = { + { + "io_dump", + 0400, + .seq_ops = &hierarchy_rq_dump_ops, + }, + {}, +}; + +#ifdef CONFIG_HIERARCHY_BIO +static struct bio_dump_data *get_bio_stage_ddata(struct request_queue *q, + enum stage_group stage) +{ + struct blk_io_hierarchy_stats *stats = + queue_to_wrapper(q)->io_hierarchy_stats; + struct hierarchy_stage *hstage = READ_ONCE(stats->hstage[stage]); + + if (!hstage) + return NULL; + + return hstage->dump_data; +} + +static void bio_stage_start_next_stage(struct bio_stage_dump_data *bstage_ddata, + loff_t *pos) +{ + struct pos_data *pdata = &bstage_ddata->pdata; + + pdata->stage++; + if (!stage_is_bio(pdata->stage)) + pdata->stage = STAGE_BIO; + pdata->count = 0; + + *pos = bstage_ddata->pos; +} + +static void bio_stage_start_next_io(struct bio_stage_dump_data *bstage_ddata, + loff_t *pos) +{ + struct pos_data *pdata = &bstage_ddata->pdata; + + if (stage_is_bio(pdata->stage)) + pdata->count++; + else + pdata->count = bstage_ddata->rq_ddata.tag; + + *pos = bstage_ddata->pos; +} + +static void __bio_stage_hierarchy_stop(struct bio_stage_dump_data *bstage_ddata) +{ + struct pos_data *pdata = &bstage_ddata->pdata; + struct rq_dump_data *rq_ddata = &bstage_ddata->rq_ddata; + + if (stage_is_bio(pdata->stage)) { + struct bio_dump_data *bio_ddata = + get_bio_stage_ddata(rq_ddata->q, pdata->stage); + + spin_unlock_irq(&bio_ddata->lock); + } + + if (rq_ddata->enter_queue) { + percpu_ref_put(&rq_ddata->q->q_usage_counter); + rq_ddata->enter_queue = false; + } +} + +void *__bio_stage_hierarchy_start(struct bio_stage_dump_data *bstage_ddata, + loff_t *pos) +{ + struct pos_data *pdata = &bstage_ddata->pdata; + struct rq_dump_data *rq_ddata = &bstage_ddata->rq_ddata; + +retry: + if (stage_is_bio(pdata->stage)) { + struct list_head *list; + struct bio_dump_data *bio_ddata = + get_bio_stage_ddata(rq_ddata->q, pdata->stage); + + if (!bio_ddata) { + bio_stage_start_next_stage(bstage_ddata, pos); + goto retry; + } + + spin_lock_irq(&bio_ddata->lock); + list = seq_list_start(&bio_ddata->head, pdata->count); + if (list) + return list; + + spin_unlock_irq(&bio_ddata->lock); + bio_stage_start_next_stage(bstage_ddata, pos); + goto retry; + } + + if (pdata->stage == STAGE_BIO && + __rq_hierarchy_start(rq_ddata, pdata->count)) + return bstage_ddata; + + return NULL; +} + +static void *bio_stage_hierarchy_start(struct seq_file *m, loff_t *pos) +{ + struct hierarchy_stage *hstage = m->private; + struct bio_stage_dump_data *bstage_ddata = hstage->dump_data; + + mutex_lock(&dump_mutex); + bstage_ddata->pos = *pos; + bstage_ddata->stat_time = blk_time_get_ns(); + + return __bio_stage_hierarchy_start(bstage_ddata, pos); +} + +static void *bio_stage_hierarchy_next(struct seq_file *m, void *v, loff_t *pos) +{ + struct hierarchy_stage *hstage = m->private; + struct bio_stage_dump_data *bstage_ddata = hstage->dump_data; + struct rq_dump_data *rq_ddata = &bstage_ddata->rq_ddata; + struct pos_data *pdata = &bstage_ddata->pdata; + + if (stage_is_bio(pdata->stage)) { + struct bio_dump_data *bio_ddata = + get_bio_stage_ddata(rq_ddata->q, pdata->stage); + struct list_head *list = ((struct list_head *)v)->next; + + if (list != &bio_ddata->head) { + bio_stage_start_next_io(bstage_ddata, pos); + return list; + } + + spin_unlock_irq(&bio_ddata->lock); + + bio_stage_start_next_stage(bstage_ddata, pos); + return __bio_stage_hierarchy_start(bstage_ddata, pos); + } + + if (pdata->stage == STAGE_BIO && + __rq_hierarchy_next(rq_ddata)) { + bio_stage_start_next_io(bstage_ddata, pos); + return bstage_ddata; + } + + (*pos)++; + return NULL; +} + +static void bio_stage_hierarchy_stop(struct seq_file *m, void *v) +{ + struct hierarchy_stage *hstage = m->private; + struct bio_stage_dump_data *bstage_ddata = hstage->dump_data; + + __bio_stage_hierarchy_stop(bstage_ddata); + mutex_unlock(&dump_mutex); +} + +static int bio_stage_hierarchy_show(struct seq_file *m, void *v) +{ + struct hierarchy_stage *hstage = m->private; + struct bio_stage_dump_data *bstage_ddata = hstage->dump_data; + struct rq_dump_data *rq_ddata = &bstage_ddata->rq_ddata; + struct pos_data *pdata = &bstage_ddata->pdata; + u64 duration; + + if (stage_is_bio(pdata->stage)) { + struct bio_hierarchy_data *data = list_entry( + v, struct bio_hierarchy_data, hierarchy_list); + + duration = get_duration(bstage_ddata->stat_time, + data->bio->bi_alloc_time_ns); + if (hstage->threshold <= ns_to_ms(duration)) + __hierarchy_show_bio(m, data, pdata->stage, duration); + } else if (pdata->stage == STAGE_BIO) { + struct request *rq = hierarchy_find_and_get_rq(rq_ddata); + + if (rq) { + duration = get_duration(bstage_ddata->stat_time, + request_to_wrapper(rq)->bi_alloc_time_ns); + if (hstage->threshold <= ns_to_ms(duration)) + hierarchy_show_rq(m, rq, duration); + blk_mq_put_rq_ref(rq); + } + } + + return 0; +} + +static const struct seq_operations bio_stage_hierarchy_ops = { + .start = bio_stage_hierarchy_start, + .next = bio_stage_hierarchy_next, + .stop = bio_stage_hierarchy_stop, + .show = bio_stage_hierarchy_show, +}; + +static const struct blk_mq_debugfs_attr bio_stage_dump_attr[] = { + { + "io_dump", + 0400, + .seq_ops = &bio_stage_hierarchy_ops, + }, + {}, +}; + +#else /* CONFIG_HIERARCHY_BIO */ +static const struct blk_mq_debugfs_attr bio_stage_dump_attr[] = { + {}, +}; + +#endif + +void io_hierarchy_register_iodump(struct hierarchy_stage *hstage) +{ + const struct blk_mq_debugfs_attr *attr; + + if (stage_is_bio(hstage->stage)) + attr = hierarchy_bio_dump_attr; + else if (stage_is_rq(hstage->stage)) + attr = hierarchy_rq_dump_attr; + else if (hstage->stage == STAGE_BIO) + attr = bio_stage_dump_attr; + else + attr = NULL; + + debugfs_create_files(hstage->debugfs_dir, hstage, + hierarchy_threshold_attr); + if (attr) + debugfs_create_files(hstage->debugfs_dir, hstage, attr); +} + +void hierarchy_account_slow_io(struct hierarchy_stage *hstage, + enum stat_group op, unsigned long duration) +{ + if (hstage->threshold <= duration) + this_cpu_inc(hstage->hstats_data->hstats->slow[op]); +} + +void hierarchy_show_slow_io(struct hierarchy_stats_data *hstats_data, + struct seq_file *m) +{ + u64 slow[NR_NEW_STAT_GROUPS] = {0}; + int cpu; + int i; + + for_each_possible_cpu(cpu) { + struct hierarchy_stats *stat = + per_cpu_ptr(hstats_data->hstats, cpu); + + for (i = 0; i < NR_NEW_STAT_GROUPS; ++i) + slow[i] += stat->slow[i]; + } + + seq_printf(m, " %llu %llu %llu %llu", slow[STAT_READ], slow[STAT_WRITE], + slow[STAT_DISCARD], slow[STAT_FLUSH]); +} diff --git a/block/blk-io-hierarchy/iodump.h b/block/blk-io-hierarchy/iodump.h new file mode 100644 index 0000000000000000000000000000000000000000..f8ef0d8669f621bf2c97728ed5c4d7fa676d7e66 --- /dev/null +++ b/block/blk-io-hierarchy/iodump.h @@ -0,0 +1,103 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2024. Huawei Technologies Co., Ltd. All rights reserved. + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 and + * only version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#ifndef BLK_IO_HIERARCHY_IODUMP_H +#define BLK_IO_HIERARCHY_IODUMP_H + +#ifdef CONFIG_HIERARCHY_IO_DUMP + +#include "stats.h" + +#define ns_to_ms(time) div_u64(time, NSEC_PER_MSEC) + +int blk_io_hierarchy_iodump_init(struct request_queue *q, + struct hierarchy_stage *hstage); +void blk_io_hierarchy_iodump_exit(struct request_queue *q, + enum stage_group stage); +void hierarchy_add_bio(struct hierarchy_stage *hstage, struct bio *bio); +void hierarchy_remove_bio(struct hierarchy_stage *hstage, struct bio *bio); +void bio_hierarchy_data_init(struct bio *bio, struct bio_hierarchy_data *hdata); +void io_hierarchy_register_iodump(struct hierarchy_stage *hstage); + +void hierarchy_account_slow_io(struct hierarchy_stage *hstage, + enum stat_group op, unsigned long duration); +void hierarchy_show_slow_io(struct hierarchy_stats_data *hstats_data, + struct seq_file *m); + +static inline void +hierarchy_account_slow_io_ns(struct hierarchy_stage *hstage, + enum stat_group op, u64 duration) +{ + hierarchy_account_slow_io(hstage, op, ns_to_ms(duration)); +} + +static inline void +hierarchy_account_slow_io_jiffies(struct hierarchy_stage *hstage, + enum stat_group op, unsigned long duration) +{ + hierarchy_account_slow_io(hstage, op, jiffies_to_msecs(duration)); +} + +#else +static inline int +blk_io_hierarchy_iodump_init(struct request_queue *q, + struct hierarchy_stage *hstage) +{ + return 0; +} + +static inline void +blk_io_hierarchy_iodump_exit(struct request_queue *q, enum stage_group stage) +{ +} + +static inline void +hierarchy_add_bio(struct hierarchy_stage *hstage, struct bio *bio) +{ +} + +static inline void +hierarchy_remove_bio(struct hierarchy_stage *hstage, struct bio *bio) +{ +} + +static inline void +bio_hierarchy_data_init(struct bio *bio, struct bio_hierarchy_data *hdata) +{ +} + +static inline void +io_hierarchy_register_iodump(struct hierarchy_stage *hstage) +{ +} + +static inline void +hierarchy_account_slow_io(struct hierarchy_stage *hstage, + enum stat_group op, unsigned long duration) +{ +} + +static inline void +hierarchy_account_slow_io_ns(struct hierarchy_stage *hstage, + enum stat_group op, u64 duration) +{ +} + +static inline void +hierarchy_account_slow_io_jiffies(struct hierarchy_stage *hstage, + enum stat_group op, unsigned long duration) +{ +} + +#endif +#endif diff --git a/block/blk-io-hierarchy/stats.c b/block/blk-io-hierarchy/stats.c new file mode 100644 index 0000000000000000000000000000000000000000..b9e79b43514951945526fe20e3064c8ce5ef05e5 --- /dev/null +++ b/block/blk-io-hierarchy/stats.c @@ -0,0 +1,429 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2024. Huawei Technologies Co., Ltd. All rights reserved. + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 and + * only version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#include +#include + +#include "stats.h" +#include "iodump.h" +#include "../blk.h" +#include "../blk-mq-debugfs.h" + +#define io_hierarchy_add(statsp, field, group, nr) \ + this_cpu_add((statsp)->hstats->field[group], nr) +#define io_hierarchy_inc(statsp, field, group) \ + io_hierarchy_add(statsp, field, group, 1) + +#define PRE_ALLOC_BIO_CNT 8 + +static mempool_t *hdata_pool; + +void blk_mq_debugfs_register_hierarchy_stats(struct request_queue *q) +{ + struct blk_io_hierarchy_stats *stats; + enum stage_group stage; + + stats = queue_to_wrapper(q)->io_hierarchy_stats; + if (!stats || !blk_mq_debugfs_enabled(q)) + return; + + stats->debugfs_dir = debugfs_create_dir("blk_io_hierarchy", + q->debugfs_dir); + blk_mq_debugfs_create_default_hierarchy_attr(q); + + for (stage = 0; stage < NR_STAGE_GROUPS; ++stage) + blk_mq_debugfs_register_hierarchy(q, stage); +} + +static void bio_alloc_hierarchy_data(struct bio *bio) +{ + if (!bio->hdata) { + struct bio_hierarchy_data *hdata = + mempool_alloc(hdata_pool, GFP_NOIO); + + bio_hierarchy_data_init(bio, hdata); + bio->hdata = hdata; + } +} + +void bio_free_hierarchy_data(struct bio *bio) +{ + if (!bio->hdata) + return; + + mempool_free(bio->hdata, hdata_pool); + bio->hdata = NULL; +} + +int blk_io_hierarchy_stats_alloc(struct request_queue *q) +{ + struct blk_io_hierarchy_stats *stats; + + if (!q->mq_ops) + return 0; + + stats = kzalloc(sizeof(struct blk_io_hierarchy_stats), GFP_KERNEL); + if (!stats) + return -ENOMEM; + + spin_lock_init(&stats->hstage_lock); + stats->q = q; + queue_to_wrapper(q)->io_hierarchy_stats = stats; + + return 0; +} + +void blk_io_hierarchy_stats_free(struct request_queue *q) +{ + struct blk_io_hierarchy_stats *stats = + queue_to_wrapper(q)->io_hierarchy_stats; + + if (!stats) + return; + + queue_to_wrapper(q)->io_hierarchy_stats = NULL; + kfree(stats); +} + +bool blk_mq_hierarchy_registered(struct request_queue *q, + enum stage_group stage) +{ + struct blk_io_hierarchy_stats *stats = + queue_to_wrapper(q)->io_hierarchy_stats; + + if (!stats) + return false; + + return stats->hstage[stage] != NULL; +} +EXPORT_SYMBOL_GPL(blk_mq_hierarchy_registered); + +static struct hierarchy_stats_data *alloc_hstats_data(void) +{ + struct hierarchy_stats_data *hstats_data; + + hstats_data = kmalloc(sizeof(*hstats_data), GFP_KERNEL); + if (!hstats_data) + return NULL; + + hstats_data->hstats = alloc_percpu(struct hierarchy_stats); + if (!hstats_data->hstats) { + kfree(hstats_data); + return NULL; + } + + hstats_data->ref = 1; + return hstats_data; +} + +struct hierarchy_stats_data *get_hstats_data( + struct blk_io_hierarchy_stats *stats, + enum stage_group stage) +{ + struct hierarchy_stage *hstage; + struct hierarchy_stats_data *hstats_data = NULL; + + spin_lock(&stats->hstage_lock); + hstage = stats->hstage[stage]; + if (hstage) { + hstats_data = hstage->hstats_data; + if (hstats_data) + hstats_data->ref++; + } + spin_unlock(&stats->hstage_lock); + + return hstats_data; +} + +static void __put_hstats_data(struct blk_io_hierarchy_stats *stats, + struct hierarchy_stats_data *hstats_data) +{ + if (--hstats_data->ref == 0) { + free_percpu(hstats_data->hstats); + kfree(hstats_data); + } +} + +void put_hstats_data(struct blk_io_hierarchy_stats *stats, + struct hierarchy_stats_data *hstats_data) +{ + spin_lock(&stats->hstage_lock); + __put_hstats_data(stats, hstats_data); + spin_unlock(&stats->hstage_lock); +} + +void blk_mq_register_hierarchy(struct request_queue *q, enum stage_group stage) +{ + struct blk_io_hierarchy_stats *stats = + queue_to_wrapper(q)->io_hierarchy_stats; + struct hierarchy_stage *hstage; + + if (!stats || !hierarchy_stage_name(stage)) + return; + + if (blk_mq_hierarchy_registered(q, stage)) { + pr_warn("blk-io-hierarchy: disk %s is registering stage %s again.", + kobject_name(q->kobj.parent), + hierarchy_stage_name(stage)); + return; + } + + /* + * Alloc memory before freeze queue, prevent deadlock if new IO is + * issued by memory reclaim. + */ + hstage = kmalloc(sizeof(*hstage), GFP_KERNEL); + if (!hstage) + return; + + hstage->hstats_data = alloc_hstats_data(); + if (!hstage->hstats_data) { + kfree(hstage); + return; + } + + hstage->stage = stage; + hstage->unbalanced_warned = false; + hstage->debugfs_dir = NULL; + if (blk_io_hierarchy_iodump_init(q, hstage) < 0) { + put_hstats_data(stats, hstage->hstats_data); + kfree(hstage); + return; + } + + blk_mq_freeze_queue(q); + + WRITE_ONCE(stats->hstage[stage], hstage); + blk_mq_debugfs_register_hierarchy(q, stage); + + blk_mq_unfreeze_queue(q); +} +EXPORT_SYMBOL_GPL(blk_mq_register_hierarchy); + +void blk_mq_unregister_hierarchy(struct request_queue *q, + enum stage_group stage) +{ + struct blk_io_hierarchy_stats *stats = + queue_to_wrapper(q)->io_hierarchy_stats; + struct hierarchy_stage *hstage; + + if (!blk_mq_hierarchy_registered(q, stage)) + return; + + blk_mq_debugfs_unregister_hierarchy(q, stage); + blk_io_hierarchy_iodump_exit(q, stage); + + spin_lock(&stats->hstage_lock); + hstage = stats->hstage[stage]; + stats->hstage[stage] = NULL; + __put_hstats_data(stats, hstage->hstats_data); + spin_unlock(&stats->hstage_lock); + + kfree(hstage); +} +EXPORT_SYMBOL_GPL(blk_mq_unregister_hierarchy); + +static enum stat_group bio_hierarchy_op(struct bio *bio) +{ + if (op_is_discard(bio->bi_opf)) + return STAT_DISCARD; + + if (op_is_flush(bio->bi_opf) && + !(bio_sectors(bio) || (bio->bi_opf & REQ_HAS_DATA))) + return STAT_FLUSH; + + if (op_is_write(bio->bi_opf)) + return STAT_WRITE; + + return STAT_READ; +} + + +void bio_hierarchy_start_io_acct(struct bio *bio, enum stage_group stage) +{ + struct request_queue *q = bio->bi_disk->queue; + struct hierarchy_stage *hstage; + + if (!blk_mq_hierarchy_registered(q, stage)) + return; + + hstage = queue_to_wrapper(q)->io_hierarchy_stats->hstage[stage]; + bio_alloc_hierarchy_data(bio); + io_hierarchy_inc(hstage->hstats_data, dispatched, + bio_hierarchy_op(bio)); + bio->hdata->time = blk_time_get_ns(); + hierarchy_add_bio(hstage, bio); +} + +void __bio_hierarchy_end_io_acct(struct bio *bio, enum stage_group stage, + u64 time) +{ + struct request_queue *q = bio->bi_disk->queue; + struct hierarchy_stage *hstage; + u64 duration; + enum stat_group op; + + if (!blk_mq_hierarchy_registered(q, stage)) + return; + + op = bio_hierarchy_op(bio); + duration = time - bio->hdata->time; + hstage = queue_to_wrapper(q)->io_hierarchy_stats->hstage[stage]; + + hierarchy_remove_bio(hstage, bio); + io_hierarchy_inc(hstage->hstats_data, completed, op); + io_hierarchy_add(hstage->hstats_data, nsecs, op, duration); + hierarchy_account_slow_io_ns(hstage, op, duration); +} + +static enum stat_group rq_hierarchy_op(struct request *rq) +{ + if (op_is_discard(rq->cmd_flags)) + return STAT_DISCARD; + + if (is_flush_rq(rq)) + return STAT_FLUSH; + + if (op_is_write(rq->cmd_flags)) + return STAT_WRITE; + + return STAT_READ; +} + +static void rq_hierarchy_warn_unbalanced(struct request *rq, + struct hierarchy_stage *hstage, + enum stage_group old_stage, + enum stage_group new_stage) +{ + if (hstage->unbalanced_warned) + return; + + pr_warn("blk-io-hierarchy: disk %s stage %d(%s) -> %d(%s) unbalanced accounting.", + kobject_name(rq->q->kobj.parent), + old_stage, hierarchy_stage_name(old_stage), + new_stage, hierarchy_stage_name(new_stage)); + hstage->unbalanced_warned = true; +} + +void blk_rq_hierarchy_stats_complete(struct request *rq) +{ + struct hierarchy_stage *hstage; + enum stage_group stage; + + stage = request_to_wrapper(rq)->stage; + if (stage == NR_RQ_STAGE_GROUPS) + return; + + if (!blk_mq_hierarchy_registered(rq->q, stage)) + return; + + hstage = queue_to_wrapper(rq->q)->io_hierarchy_stats->hstage[stage]; + rq_hierarchy_warn_unbalanced(rq, hstage, stage, NR_RQ_STAGE_GROUPS); + __rq_hierarchy_end_io_acct(rq, hstage); +} + +void __rq_hierarchy_start_io_acct(struct request *rq, + struct hierarchy_stage *hstage) +{ + struct request_wrapper *rq_wrapper = request_to_wrapper(rq); + + blk_rq_hierarchy_stats_complete(rq); + io_hierarchy_inc(hstage->hstats_data, dispatched, rq_hierarchy_op(rq)); + WRITE_ONCE(rq_wrapper->hierarchy_time, jiffies); + + /* + * Paired with barrier in hierarchy_show_rq_fn(), make sure + * hierarchy_time is set before stage. + */ + smp_store_release(&rq_wrapper->stage, hstage->stage); +} +EXPORT_SYMBOL_GPL(__rq_hierarchy_start_io_acct); + +void __rq_hierarchy_end_io_acct(struct request *rq, + struct hierarchy_stage *hstage) +{ + enum stat_group op; + unsigned long duration; + struct request_wrapper *rq_wrapper; + + rq_wrapper = request_to_wrapper(rq); + if (rq_wrapper->stage != hstage->stage) { + rq_hierarchy_warn_unbalanced(rq, hstage, rq_wrapper->stage, + hstage->stage); + return; + } + + op = rq_hierarchy_op(rq); + duration = jiffies - rq_wrapper->hierarchy_time; + + io_hierarchy_inc(hstage->hstats_data, completed, op); + io_hierarchy_add(hstage->hstats_data, jiffies, op, duration); + hierarchy_account_slow_io_jiffies(hstage, op, duration); + WRITE_ONCE(rq_wrapper->stage, NR_RQ_STAGE_GROUPS); +} +EXPORT_SYMBOL_GPL(__rq_hierarchy_end_io_acct); + +#ifdef CONFIG_HIERARCHY_BIO +void bio_hierarchy_start(struct bio *bio) +{ + struct request_queue_wrapper *q_wrapper; + struct gendisk *disk = bio->bi_disk; + struct hierarchy_stage *hstage; + + if (bio_flagged(bio, BIO_HIERARCHY_ACCT)) + return; + + if (!blk_mq_hierarchy_registered(disk->queue, STAGE_BIO)) + return; + + bio_set_flag(bio, BIO_HIERARCHY_ACCT); + if (bio_has_data(bio)) + bio->bi_opf |= REQ_HAS_DATA; + q_wrapper = queue_to_wrapper(disk->queue); + hstage = q_wrapper->io_hierarchy_stats->hstage[STAGE_BIO]; + io_hierarchy_inc(hstage->hstats_data, dispatched, + bio_hierarchy_op(bio)); +} + +void __bio_hierarchy_end(struct bio *bio, u64 now) +{ + struct request_queue_wrapper *q_wrapper; + struct gendisk *disk = bio->bi_disk; + struct hierarchy_stage *hstage; + enum stat_group op; + u64 duration; + + op = bio_hierarchy_op(bio); + duration = now - bio->bi_alloc_time_ns; + q_wrapper = queue_to_wrapper(disk->queue); + hstage = q_wrapper->io_hierarchy_stats->hstage[STAGE_BIO]; + + io_hierarchy_inc(hstage->hstats_data, completed, op); + io_hierarchy_add(hstage->hstats_data, nsecs, op, duration); + hierarchy_account_slow_io_ns(hstage, op, duration); + + bio_clear_flag(bio, BIO_HIERARCHY_ACCT); + bio->bi_opf &= ~REQ_HAS_DATA; +} +#endif + +static int __init hierarchy_stats_init(void) +{ + hdata_pool = mempool_create_kmalloc_pool(PRE_ALLOC_BIO_CNT, + sizeof(struct bio_hierarchy_data)); + if (!hdata_pool) + panic("Failed to create hdata_pool\n"); + + return 0; +} +module_init(hierarchy_stats_init); diff --git a/block/blk-io-hierarchy/stats.h b/block/blk-io-hierarchy/stats.h new file mode 100644 index 0000000000000000000000000000000000000000..d3c6a26dfacbd344022f5f062aaf64562a845f75 --- /dev/null +++ b/block/blk-io-hierarchy/stats.h @@ -0,0 +1,366 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2024. Huawei Technologies Co., Ltd. All rights reserved. + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 and + * only version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#ifndef BLK_IO_HIERARCHY_STATS_H +#define BLK_IO_HIERARCHY_STATS_H + +#ifdef CONFIG_BLK_IO_HIERARCHY_STATS + +#include +#include +#include "../blk.h" + +struct bio_hierarchy_data { + u64 time; +#ifdef CONFIG_HIERARCHY_IO_DUMP + struct bio *bio; + struct list_head hierarchy_list; +#endif +}; + +struct hierarchy_stats { + union { + /* for bio based stages. */ + u64 nsecs[NR_NEW_STAT_GROUPS]; + /* for request based stages. */ + unsigned long jiffies[NR_NEW_STAT_GROUPS]; + }; + unsigned long dispatched[NR_NEW_STAT_GROUPS]; + unsigned long completed[NR_NEW_STAT_GROUPS]; +#ifdef CONFIG_HIERARCHY_IO_DUMP + unsigned long slow[NR_NEW_STAT_GROUPS]; +#endif +}; + +struct hierarchy_stats_data { + int ref; + struct hierarchy_stats __percpu *hstats; +}; + +struct hierarchy_stage { + enum stage_group stage; + bool unbalanced_warned; + struct dentry *debugfs_dir; + struct hierarchy_stats_data *hstats_data; +#ifdef CONFIG_HIERARCHY_IO_DUMP + unsigned long threshold; + void *dump_data; +#endif +}; + +struct blk_io_hierarchy_stats { + struct request_queue *q; + struct dentry *debugfs_dir; + spinlock_t hstage_lock; + struct hierarchy_stage *hstage[NR_STAGE_GROUPS]; +}; + +static inline bool stage_is_bio(enum stage_group stage) +{ + return stage >= 0 && stage < NR_BIO_STAGE_GROUPS; +} + +static inline bool stage_is_rq(enum stage_group stage) +{ + return stage >= NR_BIO_STAGE_GROUPS && stage < NR_RQ_STAGE_GROUPS; +} + +const char *hierarchy_stage_name(enum stage_group stage); +int blk_io_hierarchy_stats_alloc(struct request_queue *q); +void blk_io_hierarchy_stats_free(struct request_queue *q); + +/* APIs for stage registration */ +bool blk_mq_hierarchy_registered(struct request_queue *q, + enum stage_group stage); +void blk_mq_register_hierarchy(struct request_queue *q, enum stage_group stage); +void blk_mq_unregister_hierarchy(struct request_queue *q, + enum stage_group stage); + +/* APIs for disk level debugfs */ +void blk_mq_debugfs_register_hierarchy_stats(struct request_queue *q); +void blk_mq_debugfs_create_default_hierarchy_attr(struct request_queue *q); + +/* APIs for stage level debugfs */ +void blk_mq_debugfs_register_hierarchy(struct request_queue *q, + enum stage_group stage); +void blk_mq_debugfs_unregister_hierarchy(struct request_queue *q, + enum stage_group stage); +struct hierarchy_stats_data *get_hstats_data( + struct blk_io_hierarchy_stats *stats, + enum stage_group stage); +void put_hstats_data(struct blk_io_hierarchy_stats *stats, + struct hierarchy_stats_data *hstats_data); + +/* APIs for bio based stage io accounting */ +void bio_hierarchy_start_io_acct(struct bio *bio, enum stage_group stage); +void __bio_hierarchy_end_io_acct(struct bio *bio, enum stage_group stage, + u64 time); +void bio_free_hierarchy_data(struct bio *bio); + +static inline void bio_hierarchy_end_io_acct(struct bio *bio, + enum stage_group stage) +{ + __bio_hierarchy_end_io_acct(bio, stage, blk_time_get_ns()); +} + +static inline void bio_list_hierarchy_end_io_acct(struct bio_list *list, + enum stage_group stage) +{ + u64 time = blk_time_get_ns(); + struct bio *bio; + + bio_list_for_each(bio, list) + __bio_hierarchy_end_io_acct(bio, stage, time); +} + +/* APIs for request based stage io accounting */ +void blk_rq_hierarchy_stats_complete(struct request *rq); +void __rq_hierarchy_start_io_acct(struct request *rq, + struct hierarchy_stage *hstage); +void __rq_hierarchy_end_io_acct(struct request *rq, + struct hierarchy_stage *hstage); + +static inline void rq_hierarchy_start_io_acct(struct request *rq, + enum stage_group stage) +{ + if (!blk_mq_hierarchy_registered(rq->q, stage)) + return; + + __rq_hierarchy_start_io_acct(rq, + queue_to_wrapper(rq->q)->io_hierarchy_stats->hstage[stage]); +} + +static inline void rq_hierarchy_end_io_acct(struct request *rq, + enum stage_group stage) +{ + if (!blk_mq_hierarchy_registered(rq->q, stage)) + return; + + __rq_hierarchy_end_io_acct(rq, + queue_to_wrapper(rq->q)->io_hierarchy_stats->hstage[stage]); +} + +static inline void rq_list_hierarchy_start_io_acct(struct list_head *head, + enum stage_group stage) +{ + struct request *rq; + struct hierarchy_stage *hstage; + + if (list_empty(head)) + return; + + rq = list_first_entry(head, struct request, queuelist); + if (!blk_mq_hierarchy_registered(rq->q, stage)) + return; + + hstage = queue_to_wrapper(rq->q)->io_hierarchy_stats->hstage[stage]; + list_for_each_entry(rq, head, queuelist) + __rq_hierarchy_start_io_acct(rq, hstage); +} + +static inline void rq_list_hierarchy_end_io_acct(struct list_head *head, + enum stage_group stage) +{ + struct request *rq; + struct hierarchy_stage *hstage; + + if (list_empty(head)) + return; + + rq = list_first_entry(head, struct request, queuelist); + if (!blk_mq_hierarchy_registered(rq->q, stage)) + return; + + hstage = queue_to_wrapper(rq->q)->io_hierarchy_stats->hstage[stage]; + list_for_each_entry(rq, head, queuelist) + __rq_hierarchy_end_io_acct(rq, hstage); +} + +static inline void blk_rq_hierarchy_stats_init(struct request *rq) +{ + request_to_wrapper(rq)->stage = NR_RQ_STAGE_GROUPS; + request_to_wrapper(rq)->flush_done = false; +} + +static inline void blk_rq_hierarchy_set_flush_done(struct request *rq) +{ + request_to_wrapper(rq)->flush_done = true; +} + +static inline bool blk_rq_hierarchy_is_flush_done(struct request *rq) +{ + return request_to_wrapper(rq)->flush_done; +} + +#ifdef CONFIG_HIERARCHY_BIO +void bio_hierarchy_start(struct bio *bio); +void __bio_hierarchy_end(struct bio *bio, u64 now); + +static inline void bio_hierarchy_end(struct bio *bio) +{ + if (!bio_flagged(bio, BIO_HIERARCHY_ACCT)) + return; + + if (!blk_mq_hierarchy_registered(bio->bi_disk->queue, STAGE_BIO)) + return; + + __bio_hierarchy_end(bio, blk_time_get_ns()); +} + +static inline void req_bio_hierarchy_end(struct request *rq, struct bio *bio) +{ + u64 now; + + if (!bio_flagged(bio, BIO_HIERARCHY_ACCT)) + return; + + if (!blk_mq_hierarchy_registered(bio->bi_disk->queue, STAGE_BIO)) + return; + + now = request_to_wrapper(rq)->io_end_time_ns; + if (!now) { + now = blk_time_get_ns(); + request_to_wrapper(rq)->io_end_time_ns = now; + } + + __bio_hierarchy_end(bio, now); +} +#endif + +#else /* CONFIG_BLK_IO_HIERARCHY_STATS */ + +static inline int +blk_io_hierarchy_stats_alloc(struct request_queue *q) +{ + return 0; +} + +static inline void +blk_io_hierarchy_stats_free(struct request_queue *q) +{ +} + +static inline bool +blk_mq_hierarchy_registered(struct request_queue *q, enum stage_group stage) +{ + return false; +} + +static inline void +blk_mq_register_hierarchy(struct request_queue *q, enum stage_group stage) +{ +} + +static inline void +blk_mq_unregister_hierarchy(struct request_queue *q, enum stage_group stage) +{ +} + +static inline void +blk_mq_debugfs_register_hierarchy_stats(struct request_queue *q) +{ +} + +static inline void +blk_mq_debugfs_register_hierarchy(struct request_queue *q, + enum stage_group stage) +{ +} + +static inline void +blk_mq_debugfs_unregister_hierarchy(struct request_queue *q, + enum stage_group stage) +{ +} + +static inline void +bio_hierarchy_start_io_acct(struct bio *bio, enum stage_group stage) +{ +} + +static inline void +bio_hierarchy_end_io_acct(struct bio *bio, enum stage_group stage) +{ +} + +static inline void +bio_list_hierarchy_end_io_acct(struct bio_list *list, enum stage_group stage) +{ +} + +static inline void +bio_free_hierarchy_data(struct bio *bio) +{ +} + +static inline void +blk_rq_hierarchy_set_flush_done(struct request *rq) +{ +} + +static inline bool +blk_rq_hierarchy_is_flush_done(struct request *rq) +{ + return false; +} + +static inline void +blk_rq_hierarchy_stats_complete(struct request *rq) +{ +} + +static inline void +rq_hierarchy_start_io_acct(struct request *rq, enum stage_group stage) +{ +} + +static inline void +rq_hierarchy_end_io_acct(struct request *rq, enum stage_group stage) +{ +} + +static inline void +rq_list_hierarchy_start_io_acct(struct list_head *head, enum stage_group stage) +{ +} + +static inline void +rq_list_hierarchy_end_io_acct(struct list_head *head, enum stage_group stage) +{ +} + +static inline void +blk_rq_hierarchy_stats_init(struct request *rq) +{ +} + +#endif /* CONFIG_BLK_IO_HIERARCHY_STATS */ + +#if !defined(CONFIG_BLK_IO_HIERARCHY_STATS) || !defined(CONFIG_HIERARCHY_BIO) +static inline void +bio_hierarchy_start(struct bio *bio) +{ +} + +static inline void +bio_hierarchy_end(struct bio *bio) +{ +} + +static inline void +req_bio_hierarchy_end(struct request *rq, struct bio *bio) +{ +} +#endif + +#endif /* BLK_IO_HIERARCHY_STATS_H */ diff --git a/block/blk-iolatency.c b/block/blk-iolatency.c index 1baa3c49e2e3b8386b900b9f18f9469d7cfc45d5..6f81794eb6e6d6cb0790ba7e13b1f201e13d7284 100644 --- a/block/blk-iolatency.c +++ b/block/blk-iolatency.c @@ -557,7 +557,7 @@ static void blkcg_iolatency_done_bio(struct rq_qos *rqos, struct bio *bio) struct rq_wait *rqw; struct iolatency_grp *iolat; u64 window_start; - u64 now = ktime_to_ns(ktime_get()); + u64 now = blk_time_get_ns(); bool issue_as_root = bio_issue_as_root_blkg(bio); bool enabled = false; int inflight = 0; @@ -624,7 +624,7 @@ static void blkiolatency_timer_fn(struct timer_list *t) struct blk_iolatency *blkiolat = from_timer(blkiolat, t, timer); struct blkcg_gq *blkg; struct cgroup_subsys_state *pos_css; - u64 now = ktime_to_ns(ktime_get()); + u64 now = blk_time_get_ns(); rcu_read_lock(); blkg_for_each_descendant_pre(blkg, pos_css, @@ -895,7 +895,7 @@ static void iolatency_pd_init(struct blkg_policy_data *pd) struct blkcg_gq *blkg = lat_to_blkg(iolat); struct rq_qos *rqos = blkcg_rq_qos(blkg->q); struct blk_iolatency *blkiolat = BLKIOLATENCY(rqos); - u64 now = ktime_to_ns(ktime_get()); + u64 now = blk_time_get_ns(); int cpu; for_each_possible_cpu(cpu) { diff --git a/block/blk-merge.c b/block/blk-merge.c index d2fabe1fdf3264d9cf8bb47888da7a30a6a8ccb6..9f9d803e064b05b81abdc8052e2253dd65e973a5 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -784,6 +784,7 @@ static struct request *attempt_merge(struct request_queue *q, req->biotail = next->biotail; req->__data_len += blk_rq_bytes(next); + blk_rq_update_bi_alloc_time(req, NULL, next); if (!blk_discard_mergable(req)) elv_merge_requests(q, req, next); diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c index f0865b6ea1e19fb723246eedcdc5fdcba5607303..5ee91901d9ccc1f3b9da7dd6bcaa12e38c6fb833 100644 --- a/block/blk-mq-debugfs.c +++ b/block/blk-mq-debugfs.c @@ -23,6 +23,7 @@ #include "blk-mq.h" #include "blk-mq-debugfs.h" #include "blk-mq-tag.h" +#include "blk-io-hierarchy/stats.h" static void print_stat(struct seq_file *m, struct blk_rq_stat *stat) { @@ -355,9 +356,13 @@ static const char *blk_mq_rq_state_name(enum mq_rq_state rq_state) return blk_mq_rq_state_name_array[rq_state]; } -int __blk_mq_debugfs_rq_show(struct seq_file *m, struct request *rq) +/* + * This helper will dump general information for @rq into @m, started with '{' + * and doesn't end with '}', caller must include a closing curly brace '}' at + * the end after adding the custom string. + */ +void debugfs_rq_show(struct seq_file *m, struct request *rq) { - const struct blk_mq_ops *const mq_ops = rq->q->mq_ops; const unsigned int op = rq->cmd_flags & REQ_OP_MASK; seq_printf(m, "%p {.op=", rq); @@ -374,6 +379,13 @@ int __blk_mq_debugfs_rq_show(struct seq_file *m, struct request *rq) seq_printf(m, ", .state=%s", blk_mq_rq_state_name(blk_mq_rq_state(rq))); seq_printf(m, ", .tag=%d, .internal_tag=%d", rq->tag, rq->internal_tag); +} + +int __blk_mq_debugfs_rq_show(struct seq_file *m, struct request *rq) +{ + const struct blk_mq_ops *const mq_ops = rq->q->mq_ops; + + debugfs_rq_show(m, rq); if (mq_ops->show_rq) mq_ops->show_rq(m, rq); seq_puts(m, "}\n"); @@ -811,8 +823,8 @@ static const struct blk_mq_debugfs_attr blk_mq_debugfs_ctx_attrs[] = { {}, }; -static bool debugfs_create_files(struct dentry *parent, void *data, - const struct blk_mq_debugfs_attr *attr) +bool debugfs_create_files(struct dentry *parent, void *data, + const struct blk_mq_debugfs_attr *attr) { if (IS_ERR_OR_NULL(parent)) return false; @@ -861,6 +873,7 @@ int blk_mq_debugfs_register(struct request_queue *q) goto err; } + blk_mq_debugfs_register_hierarchy_stats(q); return 0; err: diff --git a/block/blk-mq-debugfs.h b/block/blk-mq-debugfs.h index a9160be12be05a527c0d968a6534e4a244f989e1..70549712b0a270d7774a67f8e03f701a08966e10 100644 --- a/block/blk-mq-debugfs.h +++ b/block/blk-mq-debugfs.h @@ -15,6 +15,7 @@ struct blk_mq_debugfs_attr { const struct seq_operations *seq_ops; }; +void debugfs_rq_show(struct seq_file *m, struct request *rq); int __blk_mq_debugfs_rq_show(struct seq_file *m, struct request *rq); int blk_mq_debugfs_rq_show(struct seq_file *m, void *v); @@ -31,6 +32,15 @@ void blk_mq_debugfs_unregister_sched(struct request_queue *q); int blk_mq_debugfs_register_sched_hctx(struct request_queue *q, struct blk_mq_hw_ctx *hctx); void blk_mq_debugfs_unregister_sched_hctx(struct blk_mq_hw_ctx *hctx); + +bool debugfs_create_files(struct dentry *parent, void *data, + const struct blk_mq_debugfs_attr *attr); + +static inline bool blk_mq_debugfs_enabled(struct request_queue *q) +{ + return !IS_ERR_OR_NULL(q->debugfs_dir); +} + #else static inline int blk_mq_debugfs_register(struct request_queue *q) { diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index 0fb33abac3f62fe86c28d913bdcf1bb7ecfd45ea..1c8befbe7b69842de0bcd2629100ebaa52189c6f 100644 --- a/block/blk-mq-sched.c +++ b/block/blk-mq-sched.c @@ -15,6 +15,7 @@ #include "blk-mq-sched.h" #include "blk-mq-tag.h" #include "blk-wbt.h" +#include "blk-io-hierarchy/stats.h" void blk_mq_sched_free_hctx_data(struct request_queue *q, void (*exit)(struct blk_mq_hw_ctx *)) @@ -250,6 +251,7 @@ int __blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx) */ if (!list_empty(&rq_list)) { blk_mq_sched_mark_restart_hctx(hctx); + rq_list_hierarchy_end_io_acct(&rq_list, STAGE_HCTX); if (blk_mq_dispatch_rq_list(q, &rq_list, false)) { if (has_sched_dispatch) ret = blk_mq_do_dispatch_sched(hctx); @@ -389,10 +391,8 @@ bool __blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio) struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu); bool ret = false; - if (e && e->type->ops.mq.bio_merge) { - blk_mq_put_ctx(ctx); + if (e && e->type->ops.mq.bio_merge) return e->type->ops.mq.bio_merge(hctx, bio); - } if ((hctx->flags & BLK_MQ_F_SHOULD_MERGE) && !list_empty_careful(&ctx->rq_list)) { @@ -402,7 +402,6 @@ bool __blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio) spin_unlock(&ctx->lock); } - blk_mq_put_ctx(ctx); return ret; } diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index bee92ab06a5e33aab4039a63efebf943b1288a32..f7b21d7f136e77d2f6620c4a04dc469b4b61fbf3 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c @@ -13,6 +13,7 @@ #include "blk.h" #include "blk-mq.h" #include "blk-mq-tag.h" +#include "blk-io-hierarchy/stats.h" bool blk_mq_has_free_tags(struct blk_mq_tags *tags) { @@ -113,7 +114,6 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data) struct sbq_wait_state *ws; DEFINE_WAIT(wait); unsigned int tag_offset; - bool drop_ctx; int tag; if (data->flags & BLK_MQ_REQ_RESERVED) { @@ -135,8 +135,9 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data) if (data->flags & BLK_MQ_REQ_NOWAIT) return BLK_MQ_TAG_FAIL; + if (data->bio) + bio_hierarchy_start_io_acct(data->bio, STAGE_GETTAG); ws = bt_wait_ptr(bt, data->hctx); - drop_ctx = data->ctx == NULL; do { struct sbitmap_queue *bt_prev; @@ -162,9 +163,6 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data) if (tag != -1) break; - if (data->ctx) - blk_mq_put_ctx(data->ctx); - bt_prev = bt; io_schedule(); @@ -189,10 +187,9 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data) ws = bt_wait_ptr(bt, data->hctx); } while (1); - if (drop_ctx && data->ctx) - blk_mq_put_ctx(data->ctx); - finish_wait(&ws->wait, &wait); + if (data->bio) + bio_hierarchy_end_io_acct(data->bio, STAGE_GETTAG); found_tag: return tag + tag_offset; diff --git a/block/blk-mq.c b/block/blk-mq.c index aa4b3c6082496bbe476e12c2fe16b6063e0c1bb3..8502e7495d58672477fd9e3bdfcdf0d07b12881b 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -36,6 +36,7 @@ #include "blk-stat.h" #include "blk-mq-sched.h" #include "blk-rq-qos.h" +#include "blk-io-hierarchy/stats.h" static bool blk_mq_poll(struct request_queue *q, blk_qc_t cookie); static void blk_mq_poll_stats_start(struct request_queue *q); @@ -366,8 +367,13 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data, RB_CLEAR_NODE(&rq->rb_node); rq->rq_disk = NULL; rq->part = NULL; - rq->start_time_ns = ktime_get_ns(); + rq->start_time_ns = blk_time_get_ns(); + blk_rq_init_bi_alloc_time(rq, NULL); + blk_mq_get_alloc_task(rq, data->bio); + blk_rq_hierarchy_stats_init(rq); + rq->io_start_time_ns = 0; + request_to_wrapper(rq)->io_end_time_ns = 0; rq->nr_phys_segments = 0; #if defined(CONFIG_BLK_DEV_INTEGRITY) rq->nr_integrity_segments = 0; @@ -400,13 +406,13 @@ static struct request *blk_mq_get_request(struct request_queue *q, struct elevator_queue *e = q->elevator; struct request *rq; unsigned int tag; - bool put_ctx_on_error = false; + bool clear_ctx_on_error = false; blk_queue_enter_live(q); data->q = q; if (likely(!data->ctx)) { data->ctx = blk_mq_get_ctx(q); - put_ctx_on_error = true; + clear_ctx_on_error = true; } if (likely(!data->hctx)) data->hctx = blk_mq_map_queue(q, data->ctx->cpu); @@ -430,10 +436,8 @@ static struct request *blk_mq_get_request(struct request_queue *q, tag = blk_mq_get_tag(data); if (tag == BLK_MQ_TAG_FAIL) { - if (put_ctx_on_error) { - blk_mq_put_ctx(data->ctx); + if (clear_ctx_on_error) data->ctx = NULL; - } blk_queue_exit(q); return NULL; } @@ -470,8 +474,6 @@ struct request *blk_mq_alloc_request(struct request_queue *q, unsigned int op, if (!rq) return ERR_PTR(-EWOULDBLOCK); - blk_mq_put_ctx(alloc_data.ctx); - rq->__data_len = 0; rq->__sector = (sector_t) -1; rq->bio = rq->biotail = NULL; @@ -532,6 +534,8 @@ static void __blk_mq_free_request(struct request *rq) struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu); const int sched_tag = rq->internal_tag; + blk_rq_hierarchy_stats_complete(rq); + blk_mq_put_alloc_task(rq); if (rq->tag != -1) blk_mq_put_tag(hctx, hctx->tags, ctx, rq->tag); if (sched_tag != -1) @@ -576,13 +580,22 @@ EXPORT_SYMBOL_GPL(blk_mq_free_request); inline void __blk_mq_end_request(struct request *rq, blk_status_t error) { - u64 now = ktime_get_ns(); + u64 now = request_to_wrapper(rq)->io_end_time_ns; + + if (!now) + now = blk_time_get_ns(); if (rq->rq_flags & RQF_STATS) { blk_mq_poll_stats_start(rq->q); blk_stat_add(rq, now); } + /* + * Avoid accounting flush request with data twice and request that is + * not started. + */ + if (blk_mq_request_started(rq) && !blk_rq_hierarchy_is_flush_done(rq)) + rq_hierarchy_end_io_acct(rq, STAGE_RQ_DRIVER); blk_account_io_done(rq, now); if (rq->end_io) { @@ -722,9 +735,10 @@ void blk_mq_start_request(struct request *rq) blk_mq_sched_started_request(rq); trace_block_rq_issue(q, rq); + rq_hierarchy_start_io_acct(rq, STAGE_RQ_DRIVER); if (test_bit(QUEUE_FLAG_STATS, &q->queue_flags)) { - rq->io_start_time_ns = ktime_get_ns(); + rq->io_start_time_ns = blk_time_get_ns(); #ifdef CONFIG_BLK_DEV_THROTTLING_LOW rq->throtl_size = blk_rq_sectors(rq); #endif @@ -760,6 +774,7 @@ static void __blk_mq_requeue_request(struct request *rq) if (blk_mq_request_started(rq)) { WRITE_ONCE(rq->state, MQ_RQ_IDLE); rq->rq_flags &= ~RQF_TIMED_OUT; + rq_hierarchy_end_io_acct(rq, STAGE_RQ_DRIVER); if (q->dma_drain_size && blk_rq_bytes(rq)) rq->nr_phys_segments--; } @@ -787,6 +802,7 @@ static void blk_mq_requeue_work(struct work_struct *work) spin_lock_irq(&q->requeue_lock); list_splice_init(&q->requeue_list, &rq_list); spin_unlock_irq(&q->requeue_lock); + rq_list_hierarchy_end_io_acct(&rq_list, STAGE_REQUEUE); list_for_each_entry_safe(rq, next, &rq_list, queuelist) { if (!(rq->rq_flags & (RQF_SOFTBARRIER | RQF_DONTPREP))) @@ -826,6 +842,7 @@ void blk_mq_add_to_requeue_list(struct request *rq, bool at_head, */ BUG_ON(rq->rq_flags & RQF_SOFTBARRIER); + rq_hierarchy_start_io_acct(rq, STAGE_REQUEUE); spin_lock_irqsave(&q->requeue_lock, flags); if (at_head) { rq->rq_flags |= RQF_SOFTBARRIER; @@ -1317,6 +1334,7 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list, if (!list_empty(list)) { bool needs_restart; + rq_list_hierarchy_start_io_acct(list, STAGE_HCTX); spin_lock(&hctx->lock); list_splice_tail_init(list, &hctx->dispatch); spin_unlock(&hctx->lock); @@ -1726,6 +1744,7 @@ void blk_mq_request_bypass_insert(struct request *rq, bool at_head, struct blk_mq_ctx *ctx = rq->mq_ctx; struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(rq->q, ctx->cpu); + rq_hierarchy_start_io_acct(rq, STAGE_HCTX); spin_lock(&hctx->lock); if (at_head) list_add(&rq->queuelist, &hctx->dispatch); @@ -1792,6 +1811,8 @@ void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule) if (rq->mq_ctx != this_ctx) { if (this_ctx) { trace_block_unplug(this_q, depth, !from_schedule); + rq_list_hierarchy_end_io_acct(&ctx_list, + STAGE_PLUG); blk_mq_sched_insert_requests(this_q, this_ctx, &ctx_list, from_schedule); @@ -1812,6 +1833,7 @@ void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule) */ if (this_ctx) { trace_block_unplug(this_q, depth, !from_schedule); + rq_list_hierarchy_end_io_acct(&ctx_list, STAGE_PLUG); blk_mq_sched_insert_requests(this_q, this_ctx, &ctx_list, from_schedule); } @@ -1975,7 +1997,10 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio) { const int is_sync = op_is_sync(bio->bi_opf); const int is_flush_fua = op_is_flush(bio->bi_opf); - struct blk_mq_alloc_data data = { .flags = 0 }; + struct blk_mq_alloc_data data = { + .flags = 0, + .bio = bio + }; struct request *rq; unsigned int request_count = 0; struct blk_plug *plug; @@ -1991,6 +2016,9 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio) blk_queue_split(q, &bio); + /* account for split bio. */ + bio_hierarchy_start(bio); + if (!bio_integrity_prep(bio)) return BLK_QC_T_NONE; @@ -2019,7 +2047,6 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio) plug = current->plug; if (unlikely(is_flush_fua)) { - blk_mq_put_ctx(data.ctx); blk_mq_bio_to_request(rq, bio); /* bypass scheduler for flush rq */ @@ -2028,7 +2055,6 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio) } else if (plug && q->nr_hw_queues == 1) { struct request *last = NULL; - blk_mq_put_ctx(data.ctx); blk_mq_bio_to_request(rq, bio); /* @@ -2051,6 +2077,7 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio) trace_block_plug(q); } + rq_hierarchy_start_io_acct(rq, STAGE_PLUG); list_add_tail(&rq->queuelist, &plug->mq_list); } else if (plug && !blk_queue_nomerges(q)) { blk_mq_bio_to_request(rq, bio); @@ -2066,23 +2093,21 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio) same_queue_rq = NULL; if (same_queue_rq) list_del_init(&same_queue_rq->queuelist); + rq_hierarchy_start_io_acct(rq, STAGE_PLUG); list_add_tail(&rq->queuelist, &plug->mq_list); - blk_mq_put_ctx(data.ctx); - if (same_queue_rq) { data.hctx = blk_mq_map_queue(q, same_queue_rq->mq_ctx->cpu); + rq_hierarchy_end_io_acct(same_queue_rq, STAGE_PLUG); blk_mq_try_issue_directly(data.hctx, same_queue_rq, &cookie); } } else if ((q->nr_hw_queues > 1 && is_sync) || (!q->elevator && !data.hctx->dispatch_busy)) { - blk_mq_put_ctx(data.ctx); blk_mq_bio_to_request(rq, bio); blk_mq_try_issue_directly(data.hctx, rq, &cookie); } else { - blk_mq_put_ctx(data.ctx); blk_mq_bio_to_request(rq, bio); blk_mq_sched_insert_request(rq, false, true, true); } @@ -2240,7 +2265,8 @@ int blk_mq_alloc_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags, * rq_size is the size of the request plus driver payload, rounded * to the cacheline size */ - rq_size = round_up(sizeof(struct request) + set->cmd_size, + rq_size = round_up(sizeof(struct request) + + sizeof(struct request_wrapper) + set->cmd_size, cache_line_size()); left = rq_size * depth; @@ -2281,7 +2307,7 @@ int blk_mq_alloc_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags, to_do = min(entries_per_page, depth - i); left -= to_do * rq_size; for (j = 0; j < to_do; j++) { - struct request *rq = p; + struct request *rq = p + sizeof(struct request_wrapper); tags->static_rqs[i] = rq; if (blk_mq_init_request(set, rq, hctx_idx, node)) { @@ -2324,6 +2350,7 @@ static int blk_mq_hctx_notify_dead(unsigned int cpu, struct hlist_node *node) if (list_empty(&tmp)) return 0; + rq_list_hierarchy_start_io_acct(&tmp, STAGE_HCTX); spin_lock(&hctx->lock); list_splice_tail_init(&tmp, &hctx->dispatch); spin_unlock(&hctx->lock); @@ -2758,6 +2785,9 @@ void blk_mq_release(struct request_queue *q) struct blk_mq_hw_ctx *hctx, *next; int i; + blk_mq_unregister_hierarchy(q, STAGE_BIO); + blk_io_hierarchy_stats_free(q); + queue_for_each_hw_ctx(q, hctx, i) WARN_ON_ONCE(hctx && list_empty(&hctx->hctx_list)); @@ -2895,14 +2925,17 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, /* mark the queue as mq asap */ q->mq_ops = set->ops; + if (blk_io_hierarchy_stats_alloc(q)) + goto err_exit; + q->poll_cb = blk_stat_alloc_callback(blk_mq_poll_stats_fn, blk_mq_poll_stats_bkt, BLK_MQ_POLL_STATS_BKTS, q); if (!q->poll_cb) - goto err_exit; + goto err_hierarchy_exit; if (blk_mq_alloc_ctxs(q)) - goto err_exit; + goto err_hierarchy_exit; /* init q->mq_kobj and sw queues' kobjects */ blk_mq_sysfs_init(q); @@ -2972,6 +3005,8 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, q->nr_hw_queues = 0; err_sys_init: blk_mq_sysfs_deinit(q); +err_hierarchy_exit: + blk_io_hierarchy_stats_free(q); err_exit: q->mq_ops = NULL; return ERR_PTR(-ENOMEM); diff --git a/block/blk-mq.h b/block/blk-mq.h index c6ec9aa12fb2a11f488a0d2dfb9e532bea3b4482..80ad4bc91fa283a634172004a3219872a8b2921f 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -36,6 +36,26 @@ struct blk_mq_ctx { struct kobject kobj; } ____cacheline_aligned_in_smp; +struct request_wrapper { + u64 io_end_time_ns; +#ifdef CONFIG_BLK_BIO_ALLOC_TIME + u64 bi_alloc_time_ns; +#endif +#ifdef CONFIG_BLK_BIO_ALLOC_TASK + struct pid *pid; +#endif +#ifdef CONFIG_BLK_IO_HIERARCHY_STATS + bool flush_done; + enum stage_group stage; + unsigned long hierarchy_time; +#endif +} ____cacheline_aligned_in_smp; + +static inline struct request_wrapper *request_to_wrapper(void *rq) +{ + return rq - sizeof(struct request_wrapper); +} + void blk_mq_freeze_queue(struct request_queue *q); void blk_mq_exit_queue(struct request_queue *q); int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr); @@ -125,12 +145,7 @@ static inline struct blk_mq_ctx *__blk_mq_get_ctx(struct request_queue *q, */ static inline struct blk_mq_ctx *blk_mq_get_ctx(struct request_queue *q) { - return __blk_mq_get_ctx(q, get_cpu()); -} - -static inline void blk_mq_put_ctx(struct blk_mq_ctx *ctx) -{ - put_cpu(); + return __blk_mq_get_ctx(q, raw_smp_processor_id()); } struct blk_mq_alloc_data { @@ -142,6 +157,10 @@ struct blk_mq_alloc_data { /* input & output parameter */ struct blk_mq_ctx *ctx; struct blk_mq_hw_ctx *hctx; + +#ifndef __GENKSYMS__ + struct bio *bio; +#endif }; static inline struct blk_mq_tags *blk_mq_tags_from_data(struct blk_mq_alloc_data *data) @@ -234,4 +253,29 @@ static inline void blk_mq_free_requests(struct list_head *list) } } +#ifdef CONFIG_BLK_BIO_ALLOC_TASK +static inline void blk_mq_get_alloc_task(struct request *rq, struct bio *bio) +{ + request_to_wrapper(rq)->pid = bio ? get_pid(bio->pid) : + get_pid(task_pid(current)); +} + +static inline void blk_mq_put_alloc_task(struct request *rq) +{ + struct request_wrapper *rq_wrapper = request_to_wrapper(rq); + + if (rq_wrapper->pid) { + put_pid(rq_wrapper->pid); + rq_wrapper->pid = NULL; + } +} +#else +static inline void blk_mq_get_alloc_task(struct request *rq, struct bio *bio) +{ +} +static inline void blk_mq_put_alloc_task(struct request *rq) +{ +} +#endif + #endif diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 1c4d795bbdc47df56b39a71aa4a20640f1e1a2fb..719687a394eaf6afa7fd32ab664deb1df892b82a 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -17,6 +17,7 @@ #include "blk-mq.h" #include "blk-mq-debugfs.h" #include "blk-wbt.h" +#include "blk-io-hierarchy/stats.h" struct queue_sysfs_entry { struct attribute attr; @@ -924,6 +925,19 @@ struct kobj_type blk_queue_ktype = { .release = blk_release_queue, }; +static void blk_mq_register_default_hierarchy(struct request_queue *q) +{ + if (!q->mq_ops) + return; + + blk_mq_register_hierarchy(q, STAGE_GETTAG); + blk_mq_register_hierarchy(q, STAGE_PLUG); + blk_mq_register_hierarchy(q, STAGE_HCTX); + blk_mq_register_hierarchy(q, STAGE_REQUEUE); + blk_mq_register_hierarchy(q, STAGE_RQ_DRIVER); + blk_mq_register_hierarchy(q, STAGE_BIO); +} + /** * blk_register_queue - register a block layer queue with sysfs * @disk: Disk of which the request queue should be registered with sysfs. @@ -973,6 +987,8 @@ int blk_register_queue(struct gendisk *disk) has_elevator = true; } + blk_mq_register_default_hierarchy(q); + blk_queue_flag_set(QUEUE_FLAG_REGISTERED, q); wbt_enable_default(q); blk_throtl_register_queue(q); diff --git a/block/blk-throttle.c b/block/blk-throttle.c index 5981912865572f455f63750d009473f0a0a372fc..a1867a2f4f181c164a163b75049f402fef6ae0ad 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -14,6 +14,7 @@ #include #include #include "blk.h" +#include "blk-io-hierarchy/stats.h" /* Max dispatch from a group in 1 round */ static int throtl_grp_quantum = 8; @@ -1350,6 +1351,8 @@ static void blk_throtl_dispatch_work_fn(struct work_struct *work) bio_list_add(&bio_list_on_stack, bio); spin_unlock_irq(q->queue_lock); + bio_list_hierarchy_end_io_acct(&bio_list_on_stack, STAGE_THROTTLE); + if (!bio_list_empty(&bio_list_on_stack)) { blk_start_plug(&plug); while((bio = bio_list_pop(&bio_list_on_stack))) @@ -1910,7 +1913,7 @@ static bool throtl_tg_is_idle(struct throtl_grp *tg) time = min_t(unsigned long, MAX_IDLE_TIME, 4 * tg->idletime_threshold); ret = tg->latency_target == DFL_LATENCY_TARGET || tg->idletime_threshold == DFL_IDLE_THRESHOLD || - (ktime_get_ns() >> 10) - tg->last_finish_time > time || + (blk_time_get_ns() >> 10) - tg->last_finish_time > time || tg->avg_idletime > tg->idletime_threshold || (tg->latency_target && tg->bio_cnt && tg->bad_bio_cnt * 5 < tg->bio_cnt); @@ -2140,7 +2143,7 @@ static void throtl_downgrade_check(struct throtl_grp *tg) static void blk_throtl_update_idletime(struct throtl_grp *tg) { - unsigned long now = ktime_get_ns() >> 10; + unsigned long now = blk_time_get_ns() >> 10; unsigned long last_finish_time = tg->last_finish_time; if (now <= last_finish_time || last_finish_time == 0 || @@ -2333,6 +2336,20 @@ bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg, tg->last_low_overflow_time[rw] = jiffies; + /* + * This is slow path now, bio_hierarchy_start_io_acct() might spend + * some time to allocate memory. However, it's safe because 'tg' is + * pinned by this bio, and io charge should still be accurate because + * slice is already started from tg_may_dispatch(). + */ + spin_unlock_irq(q->queue_lock); + rcu_read_unlock(); + + bio_hierarchy_start_io_acct(bio, STAGE_THROTTLE); + + rcu_read_lock(); + spin_lock_irq(q->queue_lock); + td->nr_queued[rw]++; throtl_add_bio_tg(bio, qn, tg); throttled = true; @@ -2403,7 +2420,7 @@ void blk_throtl_bio_endio(struct bio *bio) return; tg = blkg_to_tg(blkg); - finish_time_ns = ktime_get_ns(); + finish_time_ns = blk_time_get_ns(); tg->last_finish_time = finish_time_ns >> 10; start_time = bio_issue_time(&bio->bi_issue) >> 10; @@ -2505,6 +2522,8 @@ void blk_throtl_drain(struct request_queue *q) bio_list_add(&bio_list_on_stack, bio); spin_unlock_irq(q->queue_lock); + bio_list_hierarchy_end_io_acct(&bio_list_on_stack, STAGE_THROTTLE); + if (!bio_list_empty(&bio_list_on_stack)) while ((bio = bio_list_pop(&bio_list_on_stack))) generic_make_request(bio); @@ -2561,6 +2580,8 @@ void blk_throtl_exit(struct request_queue *q) del_timer_sync(&q->td->service_queue.pending_timer); throtl_shutdown_wq(q); blkcg_deactivate_policy(q, &blkcg_policy_throtl); + blk_mq_unregister_hierarchy(q, STAGE_THROTTLE); + free_percpu(q->td->latency_buckets[READ]); free_percpu(q->td->latency_buckets[WRITE]); kfree(q->td); @@ -2593,6 +2614,8 @@ void blk_throtl_register_queue(struct request_queue *q) td->track_bio_latency = !queue_is_rq_based(q); if (!td->track_bio_latency) blk_stat_enable_accounting(q); + + blk_mq_register_hierarchy(q, STAGE_THROTTLE); } #ifdef CONFIG_BLK_DEV_THROTTLING_LOW diff --git a/block/blk-wbt.c b/block/blk-wbt.c index 94b5eff0cd3aea56cae5be436b49da1ca428feac..cf098d2a7262c094345d386f7da8cda87900de0b 100644 --- a/block/blk-wbt.c +++ b/block/blk-wbt.c @@ -25,10 +25,14 @@ #include #ifndef __GENKSYMS__ #include +#include "blk.h" #endif #include "blk-wbt.h" #include "blk-rq-qos.h" +#ifndef __GENKSYMS__ +#include "blk-io-hierarchy/stats.h" +#endif #define CREATE_TRACE_POINTS #include @@ -223,7 +227,7 @@ static u64 rwb_sync_issue_lat(struct rq_wb *rwb) if (!issue || !rwb->sync_cookie) return 0; - now = ktime_to_ns(ktime_get()); + now = blk_time_get_ns(); return now - issue; } @@ -533,11 +537,12 @@ static int wbt_wake_function(struct wait_queue_entry *curr, unsigned int mode, * the timer to kick off queuing again. */ static void __wbt_wait(struct rq_wb *rwb, enum wbt_flags wb_acct, - unsigned long rw, spinlock_t *lock) + struct bio *bio, spinlock_t *lock) __releases(lock) __acquires(lock) { struct rq_wait *rqw = get_rq_wait(rwb, wb_acct); + unsigned long rw = bio->bi_opf; struct wbt_wait_data data = { .wq = { .func = wbt_wake_function, @@ -554,6 +559,7 @@ static void __wbt_wait(struct rq_wb *rwb, enum wbt_flags wb_acct, if (!has_sleeper && rq_wait_inc_below(rqw, get_limit(rwb, rw))) return; + bio_hierarchy_start_io_acct(bio, STAGE_WBT); has_sleeper = !__prepare_to_wait_exclusive(&rqw->wait, &data.wq, TASK_UNINTERRUPTIBLE); do { @@ -588,6 +594,7 @@ static void __wbt_wait(struct rq_wb *rwb, enum wbt_flags wb_acct, } while (1); finish_wait(&rqw->wait, &data.wq); + bio_hierarchy_end_io_acct(bio, STAGE_WBT); } static inline bool wbt_should_throttle(struct rq_wb *rwb, struct bio *bio) @@ -652,7 +659,7 @@ static void wbt_wait(struct rq_qos *rqos, struct bio *bio, spinlock_t *lock) return; } - __wbt_wait(rwb, flags, bio->bi_opf, lock); + __wbt_wait(rwb, flags, bio, lock); if (!blk_stat_is_active(rwb->cb)) rwb_arm_timer(rwb); @@ -770,6 +777,7 @@ static void wbt_exit(struct rq_qos *rqos) struct rq_wb *rwb = RQWB(rqos); struct request_queue *q = rqos->q; + blk_mq_unregister_hierarchy(q, STAGE_WBT); blk_stat_remove_callback(q, rwb->cb); blk_stat_free_callback(rwb->cb); kfree(rwb); @@ -848,6 +856,7 @@ int wbt_init(struct request_queue *q) blk_mq_unfreeze_queue(q); wbt_set_write_cache(q, test_bit(QUEUE_FLAG_WC, &q->queue_flags)); + blk_mq_register_hierarchy(q, STAGE_WBT); return 0; } diff --git a/block/blk.h b/block/blk.h index 965e9c507654e0dac626289eabffb883e20b16f1..99a57be837654b0722158d73f1f07c75886ea932 100644 --- a/block/blk.h +++ b/block/blk.h @@ -55,10 +55,13 @@ struct request_queue_wrapper { int __percpu *last_dispatch_cpu; #endif struct mutex sysfs_dir_lock; +#ifdef CONFIG_BLK_IO_HIERARCHY_STATS + struct blk_io_hierarchy_stats *io_hierarchy_stats; +#endif }; -#define queue_to_wrapper(q) \ - container_of(q, struct request_queue_wrapper, q) +#define queue_to_wrapper(__q) \ + container_of((__q), struct request_queue_wrapper, q) extern struct kmem_cache *blk_requestq_cachep; extern struct kmem_cache *request_cachep; @@ -147,6 +150,64 @@ static inline void __blk_get_queue(struct request_queue *q) kobject_get(&q->kobj); } +#ifdef CONFIG_BLK_BIO_ALLOC_TIME +static inline u64 blk_time_get_ns(void); +static inline void blk_rq_init_bi_alloc_time(struct request *rq, + struct request *first_rq) +{ + if (!rq->q->mq_ops) + return; + + request_to_wrapper(rq)->bi_alloc_time_ns = + first_rq ? request_to_wrapper(first_rq)->bi_alloc_time_ns : + blk_time_get_ns(); +} + +/* + * Used in following cases to updated request bi_alloc_time_ns: + * + * 1) Allocate a new @rq for @bio; + * 2) @bio is merged to @rq, in this case @merged_rq should be NULL; + * 3) @merged_rq is merged to @rq, in this case @bio should be NULL; + */ +static inline void blk_rq_update_bi_alloc_time(struct request *rq, + struct bio *bio, + struct request *merged_rq) +{ + struct request_wrapper *rq_wrapper; + struct request_wrapper *merged_rq_wrapper; + + if (!rq->q->mq_ops) + return; + + rq_wrapper = request_to_wrapper(rq); + if (bio) { + if (rq_wrapper->bi_alloc_time_ns > bio->bi_alloc_time_ns) + rq_wrapper->bi_alloc_time_ns = bio->bi_alloc_time_ns; + return; + } + + if (WARN_ON_ONCE(!merged_rq)) + return; + + merged_rq_wrapper = request_to_wrapper(merged_rq); + if (rq_wrapper->bi_alloc_time_ns > merged_rq_wrapper->bi_alloc_time_ns) + rq_wrapper->bi_alloc_time_ns = + merged_rq_wrapper->bi_alloc_time_ns; +} +#else /* CONFIG_BLK_BIO_ALLOC_TIME */ +static inline void blk_rq_init_bi_alloc_time(struct request *rq, + struct request *first_rq) +{ +} + +static inline void blk_rq_update_bi_alloc_time(struct request *rq, + struct bio *bio, + struct request *merged_rq) +{ +} +#endif /* CONFIG_BLK_BIO_ALLOC_TIME */ + bool is_flush_rq(struct request *req); struct blk_flush_queue *blk_alloc_flush_queue(struct request_queue *q, @@ -479,4 +540,28 @@ static inline void blk_free_queue_dispatch_async(struct request_queue *q) } #endif +static inline u64 blk_time_get_ns(void) +{ + struct task_struct *tsk = current; + struct blk_plug *plug = tsk->plug; + + if (!plug || !in_task()) + return ktime_get_ns(); + + /* + * 0 could very well be a valid time, but rather than flag "this is + * a valid timestamp" separately, just accept that we'll do an extra + * ktime_get_ns() if we just happen to get 0 as the current time. + */ + if (!tsk->_resvd->cur_ktime) { + tsk->_resvd->cur_ktime = ktime_get_ns(); + tsk->flags |= PF_BLOCK_TS; + } + return tsk->_resvd->cur_ktime; +} + +static inline ktime_t blk_time_get(void) +{ + return ns_to_ktime(blk_time_get_ns()); +} #endif /* BLK_INTERNAL_H */ diff --git a/block/kyber-iosched.c b/block/kyber-iosched.c index 833e9eaae640bf74612599ba5230ece52a02af56..fe0cb5ab76af6636af7b3feed634c2110d30609f 100644 --- a/block/kyber-iosched.c +++ b/block/kyber-iosched.c @@ -30,6 +30,7 @@ #include "blk-mq-sched.h" #include "blk-mq-tag.h" #include "blk-stat.h" +#include "blk-io-hierarchy/stats.h" /* Scheduling domains. */ enum { @@ -365,6 +366,7 @@ static int kyber_init_sched(struct request_queue *q, struct elevator_type *e) blk_stat_add_callback(q, kqd->cb); + blk_mq_register_hierarchy(q, STAGE_KYBER); return 0; } @@ -374,6 +376,7 @@ static void kyber_exit_sched(struct elevator_queue *e) struct request_queue *q = kqd->q; int i; + blk_mq_unregister_hierarchy(q, STAGE_KYBER); blk_stat_remove_callback(q, kqd->cb); for (i = 0; i < KYBER_NUM_DOMAINS; i++) @@ -517,7 +520,6 @@ static bool kyber_bio_merge(struct blk_mq_hw_ctx *hctx_q, struct bio *bio) spin_lock(&kcq->lock); merged = blk_mq_bio_list_merge(hctx->queue, rq_list, bio); spin_unlock(&kcq->lock); - blk_mq_put_ctx(ctx); return merged; } @@ -533,6 +535,7 @@ static void kyber_insert_requests(struct blk_mq_hw_ctx *hctx, struct kyber_hctx_data *khd = hctx->sched_data; struct request *rq, *next; + rq_list_hierarchy_start_io_acct(rq_list, STAGE_KYBER); list_for_each_entry_safe(rq, next, rq_list, queuelist) { unsigned int sched_domain = kyber_sched_domain(rq->cmd_flags); struct kyber_ctx_queue *kcq = &khd->kcqs[rq->mq_ctx->index_hw]; @@ -584,7 +587,7 @@ static void kyber_completed_request(struct request *rq) if (blk_stat_is_active(kqd->cb)) return; - now = ktime_get_ns(); + now = blk_time_get_ns(); if (now < rq->io_start_time_ns) return; @@ -772,6 +775,9 @@ static struct request *kyber_dispatch_request(struct blk_mq_hw_ctx *hctx) rq = NULL; out: spin_unlock(&khd->lock); + + if (rq) + rq_hierarchy_end_io_acct(rq, STAGE_KYBER); return rq; } diff --git a/block/mq-deadline.c b/block/mq-deadline.c index 7ad820050675395857cd139d65f44f49135b5668..aa51abb3eaa4e7c6bf3615d120f19ca299319d74 100644 --- a/block/mq-deadline.c +++ b/block/mq-deadline.c @@ -22,6 +22,7 @@ #include "blk-mq-debugfs.h" #include "blk-mq-tag.h" #include "blk-mq-sched.h" +#include "blk-io-hierarchy/stats.h" /* * See Documentation/block/deadline-iosched.txt @@ -61,6 +62,8 @@ struct deadline_data { spinlock_t lock; spinlock_t zone_lock; struct list_head dispatch; + + struct request_queue *q; }; static inline struct rb_root * @@ -386,6 +389,8 @@ static struct request *dd_dispatch_request(struct blk_mq_hw_ctx *hctx) rq = __dd_dispatch_request(dd); spin_unlock(&dd->lock); + if (rq) + rq_hierarchy_end_io_acct(rq, STAGE_DEADLINE); return rq; } @@ -396,6 +401,7 @@ static void dd_exit_queue(struct elevator_queue *e) BUG_ON(!list_empty(&dd->fifo_list[READ])); BUG_ON(!list_empty(&dd->fifo_list[WRITE])); + blk_mq_unregister_hierarchy(dd->q, STAGE_DEADLINE); kfree(dd); } @@ -427,11 +433,13 @@ static int dd_init_queue(struct request_queue *q, struct elevator_type *e) dd->writes_starved = writes_starved; dd->front_merges = 1; dd->fifo_batch = fifo_batch; + dd->q = q; spin_lock_init(&dd->lock); spin_lock_init(&dd->zone_lock); INIT_LIST_HEAD(&dd->dispatch); q->elevator = eq; + blk_mq_register_hierarchy(q, STAGE_DEADLINE); return 0; } @@ -469,8 +477,10 @@ static bool dd_bio_merge(struct blk_mq_hw_ctx *hctx, struct bio *bio) ret = blk_mq_sched_try_merge(q, bio, &free); spin_unlock(&dd->lock); - if (free) + if (free) { + rq_hierarchy_end_io_acct(free, STAGE_DEADLINE); blk_mq_free_request(free); + } return ret; } @@ -493,6 +503,7 @@ static void dd_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, blk_req_zone_write_unlock(rq); if (blk_mq_sched_try_insert_merge(q, rq, &free)) { + rq_list_hierarchy_end_io_acct(&free, STAGE_DEADLINE); blk_mq_free_requests(&free); return; } @@ -527,6 +538,8 @@ static void dd_insert_requests(struct blk_mq_hw_ctx *hctx, struct request_queue *q = hctx->queue; struct deadline_data *dd = q->elevator->elevator_data; + rq_list_hierarchy_start_io_acct(list, STAGE_DEADLINE); + spin_lock(&dd->lock); while (!list_empty(list)) { struct request *rq; diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 8075b9955bb3cfd00132a618afe5d87ad46d718b..f77227229f58d7a5aa1a516fde7233e070f4c022 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -203,9 +203,21 @@ struct bio { struct bio_set *bi_pool; +#if defined(CONFIG_BLK_BIO_ALLOC_TIME) && !defined(__GENKSYMS__) + u64 bi_alloc_time_ns; +#else KABI_RESERVE(1) +#endif +#if defined(CONFIG_BLK_BIO_ALLOC_TASK) && !defined(__GENKSYMS__) + struct pid *pid; +#else KABI_RESERVE(2) +#endif +#if defined(CONFIG_BLK_IO_HIERARCHY_STATS) && !defined(__GENKSYMS__) + struct bio_hierarchy_data *hdata; +#else KABI_RESERVE(3) +#endif /* * We can inline a number of vecs at the end of the bio, to avoid @@ -220,6 +232,12 @@ struct bio { /* * bio flags */ +#ifdef CONFIG_BLK_IO_HIERARCHY_STATS +#define BIO_HIERARCHY_ACCT 0 /* + * This bio has already been subjected to + * blk-io-hierarchy, don't do it again. + */ +#endif #define BIO_SEG_VALID 1 /* bi_phys_segments valid */ #define BIO_CLONED 2 /* doesn't own data */ #define BIO_BOUNCED 3 /* bio is a bounce bio */ @@ -334,6 +352,9 @@ enum req_flag_bits { /* for driver use */ __REQ_DRV, __REQ_SWAP, /* swapping request. */ +#ifdef CONFIG_BLK_IO_HIERARCHY_STATS + _REQ_HAS_DATA, /* io contain data. */ +#endif __REQ_NR_BITS, /* stops here */ }; @@ -356,6 +377,9 @@ enum req_flag_bits { #define REQ_DRV (1ULL << __REQ_DRV) #define REQ_SWAP (1ULL << __REQ_SWAP) +#ifdef CONFIG_BLK_IO_HIERARCHY_STATS +#define REQ_HAS_DATA (1UL << _REQ_HAS_DATA) +#endif #define REQ_FAILFAST_MASK \ (REQ_FAILFAST_DEV | REQ_FAILFAST_TRANSPORT | REQ_FAILFAST_DRIVER) @@ -368,7 +392,36 @@ enum stat_group { STAT_WRITE, STAT_DISCARD, - NR_STAT_GROUPS + NR_STAT_GROUPS, + STAT_FLUSH = NR_STAT_GROUPS, + NR_NEW_STAT_GROUPS, +}; + +enum stage_group { +#ifdef CONFIG_BLK_DEV_THROTTLING + STAGE_THROTTLE, +#endif +#ifdef CONFIG_BLK_WBT + STAGE_WBT, +#endif + STAGE_GETTAG, + NR_BIO_STAGE_GROUPS, + STAGE_PLUG = NR_BIO_STAGE_GROUPS, +#if IS_ENABLED(CONFIG_MQ_IOSCHED_DEADLINE) + STAGE_DEADLINE, +#endif +#if IS_ENABLED(CONFIG_IOSCHED_BFQ) + STAGE_BFQ, +#endif +#if IS_ENABLED(CONFIG_MQ_IOSCHED_KYBER) + STAGE_KYBER, +#endif + STAGE_HCTX, + STAGE_REQUEUE, + STAGE_RQ_DRIVER, + NR_RQ_STAGE_GROUPS, + STAGE_BIO = NR_RQ_STAGE_GROUPS, + NR_STAGE_GROUPS, }; #define bio_op(bio) \ diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index c848f4205729ec722877f2b324ed68a890e50546..241f59eb5b64ad04e579312cf01578798063650b 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1435,6 +1435,18 @@ static inline struct request *blk_map_queue_find_tag(struct blk_queue_tag *bqt, return bqt->tag_index[tag]; } +/* + * tsk == current here + */ +static inline void blk_plug_invalidate_ts(struct task_struct *tsk) +{ + struct blk_plug *plug = tsk->plug; + + if (plug) + current->_resvd->cur_ktime = 0; + current->flags &= ~PF_BLOCK_TS; +} + extern int blkdev_issue_flush(struct block_device *, gfp_t, sector_t *); extern int blkdev_issue_write_same(struct block_device *bdev, sector_t sector, sector_t nr_sects, gfp_t gfp_mask, struct page *page); @@ -2150,6 +2162,10 @@ static inline bool blk_needs_flush_plug(struct task_struct *tsk) return false; } +static inline void blk_plug_invalidate_ts(struct task_struct *tsk) +{ +} + static inline int blkdev_issue_flush(struct block_device *bdev, gfp_t gfp_mask, sector_t *error_sector) { diff --git a/include/linux/sched.h b/include/linux/sched.h index 26255b76ca525abe952c8932722a8348cd7fb004..d2eceea955b0968d6d90ef2cbb0a55f0386fd974 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -635,6 +635,7 @@ struct task_struct_resvd { #ifdef CONFIG_QOS_SCHED_SMART_GRID struct sched_grid_qos *grid_qos; #endif + u64 cur_ktime; }; struct task_struct { @@ -1495,6 +1496,7 @@ extern struct pid *cad_pid; #define PF_UCE_KERNEL_RECOVERY 0x02000000 /* Task in uce kernel recovery state */ #define PF_NO_SETAFFINITY 0x04000000 /* Userland is not allowed to meddle with cpus_allowed */ #define PF_MCE_EARLY 0x08000000 /* Early kill for mce process policy */ +#define PF_BLOCK_TS 0x10000000 /* plug has ts that needs updating */ #define PF_IO_WORKER 0x20000000 /* Task is an IO worker */ #define PF_FREEZER_SKIP 0x40000000 /* Freezer should not count it as freezable */ #define PF_SUSPEND_TASK 0x80000000 /* This thread called freeze_processes() and should not be frozen */ diff --git a/kernel/sched/core.c b/kernel/sched/core.c index fe9f91f39e2fb6dd9e9ac1718560e93f675d33e1..e37428598155eafe2ef7c233c1d887d585407e7a 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3581,10 +3581,12 @@ static inline void sched_submit_work(struct task_struct *tsk) static void sched_update_worker(struct task_struct *tsk) { - if (tsk->flags & (PF_WQ_WORKER | PF_IO_WORKER)) { + if (tsk->flags & (PF_WQ_WORKER | PF_IO_WORKER | PF_BLOCK_TS)) { + if (tsk->flags & PF_BLOCK_TS) + blk_plug_invalidate_ts(tsk); if (tsk->flags & PF_WQ_WORKER) wq_worker_running(tsk); - else + else if (tsk->flags & PF_IO_WORKER) io_wq_worker_running(tsk); } }