diff --git a/arch/arm64/configs/openeuler_defconfig b/arch/arm64/configs/openeuler_defconfig
index 4de42999f905a849de7f1eca4874c13ee56d4f6d..71e12eb64467e6792a7ad44f74ed61836c016448 100644
--- a/arch/arm64/configs/openeuler_defconfig
+++ b/arch/arm64/configs/openeuler_defconfig
@@ -822,6 +822,7 @@ CONFIG_BLK_DEBUG_FS=y
 CONFIG_BLK_DEBUG_FS_ZONED=y
 # CONFIG_BLK_SED_OPAL is not set
 # CONFIG_BLK_BIO_DISPATCH_ASYNC is not set
+# CONFIG_BLK_IO_HIERARCHY_STATS is not set
 
 #
 # Partition Types
diff --git a/arch/x86/configs/openeuler_defconfig b/arch/x86/configs/openeuler_defconfig
index 928b4379af4d0c59ef3d76c6cbcc4c984dafaf25..7993f0f3e7a42a4da7fd5a5af5258f793f928e52 100644
--- a/arch/x86/configs/openeuler_defconfig
+++ b/arch/x86/configs/openeuler_defconfig
@@ -891,6 +891,7 @@ CONFIG_BLK_WBT_MQ=y
 CONFIG_BLK_DEBUG_FS=y
 # CONFIG_BLK_SED_OPAL is not set
 # CONFIG_BLK_BIO_DISPATCH_ASYNC is not set
+# CONFIG_BLK_IO_HIERARCHY_STATS is not set
 
 #
 # Partition Types
diff --git a/block/Kconfig b/block/Kconfig
index da71e56f8682c029cd7d4b796d19b63fc9fd59a9..8804f21df1519c6bd5ad9678e99cfe98f26b36a7 100644
--- a/block/Kconfig
+++ b/block/Kconfig
@@ -26,6 +26,12 @@ menuconfig BLOCK
 
 if BLOCK
 
+config BLK_BIO_ALLOC_TIME
+	bool
+
+config BLK_BIO_ALLOC_TASK
+	bool
+
 config LBDAF
 	bool "Support for large (2TB+) block devices and files"
 	depends on !64BIT
@@ -213,6 +219,8 @@ config BLK_BIO_DISPATCH_ASYNC
 	feature will require special care in the driver to work. If unsure,
 	say N here.
 
+source "block/blk-io-hierarchy/Kconfig"
+
 menu "Partition Types"
 
 source "block/partitions/Kconfig"
diff --git a/block/Makefile b/block/Makefile
index 572b33f32c07cf7056fb1121abba753a9ba8a0ac..bb711b0c307a6c45c84a8d6685b7b9dc790e73da 100644
--- a/block/Makefile
+++ b/block/Makefile
@@ -37,3 +37,4 @@ obj-$(CONFIG_BLK_WBT)		+= blk-wbt.o
 obj-$(CONFIG_BLK_DEBUG_FS)	+= blk-mq-debugfs.o
 obj-$(CONFIG_BLK_DEBUG_FS_ZONED)+= blk-mq-debugfs-zoned.o
 obj-$(CONFIG_BLK_SED_OPAL)	+= sed-opal.o
+obj-$(CONFIG_BLK_IO_HIERARCHY_STATS)    += blk-io-hierarchy/
diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c
index b663cd8b9e46f68d6854119f2a960f4d4213dcba..25a407e5142dbf8ba8fd7848a197cadad020fb1a 100644
--- a/block/bfq-cgroup.c
+++ b/block/bfq-cgroup.c
@@ -22,6 +22,7 @@
 #include <linux/sbitmap.h>
 #include <linux/delay.h>
 
+#include "blk.h"
 #include "bfq-iosched.h"
 
 #if defined(CONFIG_BFQ_GROUP_IOSCHED) &&  defined(CONFIG_DEBUG_BLK_CGROUP)
@@ -60,7 +61,7 @@ static void bfqg_stats_update_group_wait_time(struct bfqg_stats *stats)
 	if (!bfqg_stats_waiting(stats))
 		return;
 
-	now = ktime_get_ns();
+	now = blk_time_get_ns();
 	if (now > stats->start_group_wait_time)
 		blkg_stat_add(&stats->group_wait_time,
 			      now - stats->start_group_wait_time);
@@ -77,7 +78,7 @@ static void bfqg_stats_set_start_group_wait_time(struct bfq_group *bfqg,
 		return;
 	if (bfqg == curr_bfqg)
 		return;
-	stats->start_group_wait_time = ktime_get_ns();
+	stats->start_group_wait_time = blk_time_get_ns();
 	bfqg_stats_mark_waiting(stats);
 }
 
@@ -89,7 +90,7 @@ static void bfqg_stats_end_empty_time(struct bfqg_stats *stats)
 	if (!bfqg_stats_empty(stats))
 		return;
 
-	now = ktime_get_ns();
+	now = blk_time_get_ns();
 	if (now > stats->start_empty_time)
 		blkg_stat_add(&stats->empty_time,
 			      now - stats->start_empty_time);
@@ -116,7 +117,7 @@ void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg)
 	if (bfqg_stats_empty(stats))
 		return;
 
-	stats->start_empty_time = ktime_get_ns();
+	stats->start_empty_time = blk_time_get_ns();
 	bfqg_stats_mark_empty(stats);
 }
 
@@ -125,7 +126,7 @@ void bfqg_stats_update_idle_time(struct bfq_group *bfqg)
 	struct bfqg_stats *stats = &bfqg->stats;
 
 	if (bfqg_stats_idling(stats)) {
-		u64 now = ktime_get_ns();
+		u64 now = blk_time_get_ns();
 
 		if (now > stats->start_idle_time)
 			blkg_stat_add(&stats->idle_time,
@@ -138,7 +139,7 @@ void bfqg_stats_set_start_idle_time(struct bfq_group *bfqg)
 {
 	struct bfqg_stats *stats = &bfqg->stats;
 
-	stats->start_idle_time = ktime_get_ns();
+	stats->start_idle_time = blk_time_get_ns();
 	bfqg_stats_mark_idling(stats);
 }
 
@@ -175,7 +176,7 @@ void bfqg_stats_update_completion(struct bfq_group *bfqg, u64 start_time_ns,
 				  u64 io_start_time_ns, unsigned int op)
 {
 	struct bfqg_stats *stats = &bfqg->stats;
-	u64 now = ktime_get_ns();
+	u64 now = blk_time_get_ns();
 
 	if (now > io_start_time_ns)
 		blkg_rwstat_add(&stats->service_time, op,
diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
index 473d9e31ff8771793ab1baa07422efc719882c1c..5e94c8cf76b506dd9cf557a8c9c052b649a516ce 100644
--- a/block/bfq-iosched.c
+++ b/block/bfq-iosched.c
@@ -140,6 +140,7 @@
 #include "blk-mq-sched.h"
 #include "bfq-iosched.h"
 #include "blk-wbt.h"
+#include "blk-io-hierarchy/stats.h"
 
 #define BFQ_BFQQ_FNS(name)						\
 void bfq_mark_bfqq_##name(struct bfq_queue *bfqq)			\
@@ -844,7 +845,7 @@ static struct request *bfq_check_fifo(struct bfq_queue *bfqq,
 
 	rq = rq_entry_fifo(bfqq->fifo.next);
 
-	if (rq == last || ktime_get_ns() < rq->fifo_time)
+	if (rq == last || blk_time_get_ns() < rq->fifo_time)
 		return NULL;
 
 	bfq_log_bfqq(bfqq->bfqd, bfqq, "check_fifo: returned %p", rq);
@@ -1566,7 +1567,7 @@ static void bfq_bfqq_handle_idle_busy_switch(struct bfq_data *bfqd,
 		 * bfq_bfqq_update_budg_for_activation for
 		 * details on the usage of the next variable.
 		 */
-		arrived_in_time =  ktime_get_ns() <=
+		arrived_in_time =  blk_time_get_ns() <=
 			bfqq->ttime.last_end_request +
 			bfqd->bfq_slice_idle * 3;
 
@@ -1882,8 +1883,10 @@ static bool bfq_bio_merge(struct blk_mq_hw_ctx *hctx, struct bio *bio)
 	ret = blk_mq_sched_try_merge(q, bio, &free);
 
 	spin_unlock_irq(&bfqd->lock);
-	if (free)
+	if (free) {
+		rq_hierarchy_end_io_acct(free, STAGE_BFQ);
 		blk_mq_free_request(free);
+	}
 
 	return ret;
 }
@@ -2468,7 +2471,7 @@ static void bfq_set_budget_timeout(struct bfq_data *bfqd,
 	else
 		timeout_coeff = bfqq->entity.weight / bfqq->entity.orig_weight;
 
-	bfqd->last_budget_start = ktime_get();
+	bfqd->last_budget_start = blk_time_get();
 
 	bfqq->budget_timeout = jiffies +
 		bfqd->bfq_timeout * timeout_coeff;
@@ -2568,7 +2571,7 @@ static void bfq_arm_slice_timer(struct bfq_data *bfqd)
 	else if (bfqq->wr_coeff > 1)
 		sl = max_t(u32, sl, 20ULL * NSEC_PER_MSEC);
 
-	bfqd->last_idling_start = ktime_get();
+	bfqd->last_idling_start = blk_time_get();
 	hrtimer_start(&bfqd->idle_slice_timer, ns_to_ktime(sl),
 		      HRTIMER_MODE_REL);
 	bfqg_stats_set_start_idle_time(bfqq_group(bfqq));
@@ -2605,7 +2608,7 @@ static void bfq_reset_rate_computation(struct bfq_data *bfqd,
 				       struct request *rq)
 {
 	if (rq != NULL) { /* new rq dispatch now, reset accordingly */
-		bfqd->last_dispatch = bfqd->first_dispatch = ktime_get_ns();
+		bfqd->last_dispatch = bfqd->first_dispatch = blk_time_get_ns();
 		bfqd->peak_rate_samples = 1;
 		bfqd->sequential_samples = 0;
 		bfqd->tot_sectors_dispatched = bfqd->last_rq_max_size =
@@ -2762,7 +2765,7 @@ static void bfq_update_rate_reset(struct bfq_data *bfqd, struct request *rq)
  */
 static void bfq_update_peak_rate(struct bfq_data *bfqd, struct request *rq)
 {
-	u64 now_ns = ktime_get_ns();
+	u64 now_ns = blk_time_get_ns();
 
 	if (bfqd->peak_rate_samples == 0) { /* first dispatch */
 		bfq_log(bfqd, "update_peak_rate: goto reset, samples %d",
@@ -3099,7 +3102,7 @@ static bool bfq_bfqq_is_slow(struct bfq_data *bfqd, struct bfq_queue *bfqq,
 	if (compensate)
 		delta_ktime = bfqd->last_idling_start;
 	else
-		delta_ktime = ktime_get();
+		delta_ktime = blk_time_get();
 	delta_ktime = ktime_sub(delta_ktime, bfqd->last_budget_start);
 	delta_usecs = ktime_to_us(delta_ktime);
 
@@ -4168,6 +4171,8 @@ static struct request *bfq_dispatch_request(struct blk_mq_hw_ctx *hctx)
 			idle_timer_disabled ? in_serv_queue : NULL,
 				idle_timer_disabled);
 
+	if (rq)
+		rq_hierarchy_end_io_acct(rq, STAGE_BFQ);
 	return rq;
 }
 
@@ -4410,7 +4415,7 @@ static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq,
 		bfq_clear_bfqq_sync(bfqq);
 
 	/* set end request to minus infinity from now */
-	bfqq->ttime.last_end_request = ktime_get_ns() + 1;
+	bfqq->ttime.last_end_request = blk_time_get_ns() + 1;
 
 	bfq_mark_bfqq_IO_bound(bfqq);
 
@@ -4528,7 +4533,7 @@ static void bfq_update_io_thinktime(struct bfq_data *bfqd,
 				    struct bfq_queue *bfqq)
 {
 	struct bfq_ttime *ttime = &bfqq->ttime;
-	u64 elapsed = ktime_get_ns() - bfqq->ttime.last_end_request;
+	u64 elapsed = blk_time_get_ns() - bfqq->ttime.last_end_request;
 
 	elapsed = min_t(u64, elapsed, 2ULL * bfqd->bfq_slice_idle);
 
@@ -4697,7 +4702,8 @@ static bool __bfq_insert_request(struct bfq_data *bfqd, struct request *rq)
 	bfq_add_request(rq);
 	idle_timer_disabled = waiting && !bfq_bfqq_wait_request(bfqq);
 
-	rq->fifo_time = ktime_get_ns() + bfqd->bfq_fifo_expire[rq_is_sync(rq)];
+	rq->fifo_time = blk_time_get_ns() +
+			bfqd->bfq_fifo_expire[rq_is_sync(rq)];
 	list_add_tail(&rq->queuelist, &bfqq->fifo);
 
 	bfq_rq_enqueued(bfqd, bfqq, rq);
@@ -4750,6 +4756,7 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
 	spin_lock_irq(&bfqd->lock);
 	if (blk_mq_sched_try_insert_merge(q, rq, &free)) {
 		spin_unlock_irq(&bfqd->lock);
+		rq_list_hierarchy_end_io_acct(&free, STAGE_BFQ);
 		blk_mq_free_requests(&free);
 		return;
 	}
@@ -4797,6 +4804,7 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
 static void bfq_insert_requests(struct blk_mq_hw_ctx *hctx,
 				struct list_head *list, bool at_head)
 {
+	rq_list_hierarchy_start_io_acct(list, STAGE_BFQ);
 	while (!list_empty(list)) {
 		struct request *rq;
 
@@ -4853,7 +4861,7 @@ static void bfq_completed_request(struct bfq_queue *bfqq, struct bfq_data *bfqd)
 		bfq_weights_tree_remove(bfqd, bfqq);
 	}
 
-	now_ns = ktime_get_ns();
+	now_ns = blk_time_get_ns();
 
 	bfqq->ttime.last_end_request = now_ns;
 
@@ -5394,6 +5402,7 @@ static void bfq_exit_queue(struct elevator_queue *e)
 	struct bfq_queue *bfqq, *n;
 	struct request_queue *q = bfqd->queue;
 
+	blk_mq_unregister_hierarchy(q, STAGE_BFQ);
 	hrtimer_cancel(&bfqd->idle_slice_timer);
 
 	spin_lock_irq(&bfqd->lock);
@@ -5560,6 +5569,7 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e)
 	bfq_init_entity(&bfqd->oom_bfqq.entity, bfqd->root_group);
 
 	wbt_disable_default(q);
+	blk_mq_register_hierarchy(q, STAGE_BFQ);
 	return 0;
 
 out_free:
diff --git a/block/bio.c b/block/bio.c
index 06193e854577e1ecff96d66b4f382084ceedd2a5..ff18f6839063016bb5c2d0419c0400b4235afe64 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -33,6 +33,7 @@
 #include <trace/events/block.h>
 #include "blk.h"
 #include "blk-rq-qos.h"
+#include "blk-io-hierarchy/stats.h"
 
 /*
  * Test patch to inline a certain number of bi_io_vec's inside the bio
@@ -245,6 +246,14 @@ struct bio_vec *bvec_alloc(gfp_t gfp_mask, int nr, unsigned long *idx,
 void bio_uninit(struct bio *bio)
 {
 	bio_disassociate_task(bio);
+#ifdef CONFIG_BLK_BIO_ALLOC_TASK
+	if (bio->pid) {
+		put_pid(bio->pid);
+		bio->pid = NULL;
+	}
+#endif
+	bio_hierarchy_end(bio);
+	bio_free_hierarchy_data(bio);
 }
 EXPORT_SYMBOL(bio_uninit);
 
@@ -285,6 +294,14 @@ void bio_init(struct bio *bio, struct bio_vec *table,
 
 	bio->bi_io_vec = table;
 	bio->bi_max_vecs = max_vecs;
+
+#ifdef CONFIG_BLK_BIO_ALLOC_TIME
+	bio->bi_alloc_time_ns = blk_time_get_ns();
+#endif
+
+#ifdef CONFIG_BLK_BIO_ALLOC_TASK
+	bio->pid = get_pid(task_pid(current));
+#endif
 }
 EXPORT_SYMBOL(bio_init);
 
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index d4a8d8fbe1a0ebe443515c80ccb060ab0d0e8a9e..c0187bf00f714c63671d0459d7d4125f54ec4b26 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -1729,7 +1729,7 @@ static void blkcg_scale_delay(struct blkcg_gq *blkg, u64 now)
  */
 static void blkcg_maybe_throttle_blkg(struct blkcg_gq *blkg, bool use_memdelay)
 {
-	u64 now = ktime_to_ns(ktime_get());
+	u64 now = blk_time_get_ns();
 	u64 exp;
 	u64 delay_nsec = 0;
 	int tok;
diff --git a/block/blk-core.c b/block/blk-core.c
index acf5585b055766d1e9c9b557c8550dd1987bb88c..0c74101424dc186b999de4db90f7c306813b7d03 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -43,6 +43,7 @@
 #include "blk-mq.h"
 #include "blk-mq-sched.h"
 #include "blk-rq-qos.h"
+#include "blk-io-hierarchy/stats.h"
 
 #ifdef CONFIG_DEBUG_FS
 struct dentry *blk_debugfs_root;
@@ -454,7 +455,7 @@ void __blk_rq_init(struct request_queue *q, struct request *rq)
 	RB_CLEAR_NODE(&rq->rb_node);
 	rq->tag = -1;
 	rq->internal_tag = -1;
-	rq->start_time_ns = ktime_get_ns();
+	rq->start_time_ns = blk_time_get_ns();
 	rq->part = NULL;
 }
 
@@ -537,8 +538,10 @@ static void req_bio_endio(struct request *rq, struct bio *bio,
 	bio_advance(bio, nbytes);
 
 	/* don't actually finish bio if it's part of flush sequence */
-	if (bio->bi_iter.bi_size == 0 && !(rq->rq_flags & RQF_FLUSH_SEQ))
+	if (bio->bi_iter.bi_size == 0 && !(rq->rq_flags & RQF_FLUSH_SEQ)) {
+		req_bio_hierarchy_end(rq, bio);
 		bio_endio(bio);
+	}
 }
 
 void blk_dump_rq_flags(struct request *rq, char *msg)
@@ -1001,6 +1004,15 @@ void blk_exit_queue(struct request_queue *q)
 	bdi_put(q->backing_dev_info);
 }
 
+static void blk_mq_unregister_default_hierarchy(struct request_queue *q)
+{
+	blk_mq_unregister_hierarchy(q, STAGE_GETTAG);
+	blk_mq_unregister_hierarchy(q, STAGE_PLUG);
+	blk_mq_unregister_hierarchy(q, STAGE_HCTX);
+	blk_mq_unregister_hierarchy(q, STAGE_REQUEUE);
+	blk_mq_unregister_hierarchy(q, STAGE_RQ_DRIVER);
+}
+
 /**
  * blk_cleanup_queue - shutdown a request queue
  * @q: request queue to shutdown
@@ -1088,6 +1100,7 @@ void blk_cleanup_queue(struct request_queue *q)
 	blk_exit_queue(q);
 
 	if (q->mq_ops) {
+		blk_mq_unregister_default_hierarchy(q);
 		blk_mq_cancel_work_sync(q);
 		blk_mq_exit_queue(q);
 	}
@@ -2106,6 +2119,7 @@ bool bio_attempt_back_merge(struct request_queue *q, struct request *req,
 	req->biotail->bi_next = bio;
 	req->biotail = bio;
 	req->__data_len += bio->bi_iter.bi_size;
+	blk_rq_update_bi_alloc_time(req, bio, NULL);
 
 	blk_account_io_start(req, false);
 	return true;
@@ -2129,6 +2143,7 @@ bool bio_attempt_front_merge(struct request_queue *q, struct request *req,
 
 	req->__sector = bio->bi_iter.bi_sector;
 	req->__data_len += bio->bi_iter.bi_size;
+	blk_rq_update_bi_alloc_time(req, bio, NULL);
 
 	blk_account_io_start(req, false);
 	return true;
@@ -2149,6 +2164,7 @@ bool bio_attempt_discard_merge(struct request_queue *q, struct request *req,
 	req->biotail = bio;
 	req->__data_len += bio->bi_iter.bi_size;
 	req->nr_phys_segments = segments + 1;
+	blk_rq_update_bi_alloc_time(req, bio, NULL);
 
 	blk_account_io_start(req, false);
 	return true;
@@ -2613,6 +2629,12 @@ generic_make_request_checks(struct bio *bio)
 	 */
 	create_io_context(GFP_ATOMIC, q->node);
 
+	/*
+	 * Start accounting only here: REQ_PREFLUSH | REQ_FUA may have been
+	 * cleared above, and an invalid bio should not be counted. A split
+	 * bio is accounted separately.
+	 */
+	bio_hierarchy_start(bio);
 	if (!blkcg_bio_issue_check(q, bio))
 		return false;
 
@@ -2952,7 +2974,7 @@ blk_status_t __blk_insert_cloned_request(struct request_queue *q,
 			u64 now = 0;
 
 			if (blk_mq_need_time_stamp(rq))
-				now = ktime_get_ns();
+				now = blk_time_get_ns();
 
 			blk_account_io_done(rq, now);
 		}
@@ -3304,7 +3326,7 @@ void blk_start_request(struct request *req)
 	blk_dequeue_request(req);
 
 	if (test_bit(QUEUE_FLAG_STATS, &req->q->queue_flags)) {
-		req->io_start_time_ns = ktime_get_ns();
+		req->io_start_time_ns = blk_time_get_ns();
 #ifdef CONFIG_BLK_DEV_THROTTLING_LOW
 		req->throtl_size = blk_rq_sectors(req);
 #endif
@@ -3509,7 +3531,7 @@ EXPORT_SYMBOL_GPL(blk_unprep_request);
 void blk_finish_request(struct request *req, blk_status_t error)
 {
 	struct request_queue *q = req->q;
-	u64 now = ktime_get_ns();
+	u64 now = blk_time_get_ns();
 
 	lockdep_assert_held(req->q->queue_lock);
 	WARN_ON_ONCE(q->mq_ops);
@@ -3727,6 +3749,7 @@ void blk_rq_bio_prep(struct request_queue *q, struct request *rq,
 
 	rq->__data_len = bio->bi_iter.bi_size;
 	rq->bio = rq->biotail = bio;
+	blk_rq_update_bi_alloc_time(rq, bio, NULL);
 
 	if (bio->bi_disk)
 		rq->rq_disk = bio->bi_disk;
@@ -3923,6 +3946,7 @@ void blk_start_plug(struct blk_plug *plug)
 	 * Store ordering should not be needed here, since a potential
 	 * preempt will imply a full memory barrier
 	 */
+	tsk->_resvd->cur_ktime = 0;
 	tsk->plug = plug;
 }
 EXPORT_SYMBOL(blk_start_plug);
@@ -4060,6 +4084,9 @@ void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule)
 	 */
 	if (q)
 		queue_unplugged(q, depth, from_schedule);
+
+	current->_resvd->cur_ktime = 0;
+	current->flags &= ~PF_BLOCK_TS;
 }
 
 void blk_finish_plug(struct blk_plug *plug)
diff --git a/block/blk-flush.c b/block/blk-flush.c
index c1bfcde165af5ae993faa73d890bc8fb06ad9b54..e788e5513c9e0d8729fd724c57bf972214fac701 100644
--- a/block/blk-flush.c
+++ b/block/blk-flush.c
@@ -75,6 +75,7 @@
 #include "blk-mq.h"
 #include "blk-mq-tag.h"
 #include "blk-mq-sched.h"
+#include "blk-io-hierarchy/stats.h"
 
 /* PREFLUSH/FUA sequences */
 enum {
@@ -187,6 +188,7 @@ static bool blk_flush_complete_seq(struct request *rq,
 		if (list_empty(pending))
 			fq->flush_pending_since = jiffies;
 		list_move_tail(&rq->flush.list, pending);
+		rq_hierarchy_start_io_acct(rq, STAGE_HCTX);
 		break;
 
 	case REQ_FSEQ_DATA:
@@ -245,6 +247,8 @@ static void flush_end_io(struct request *flush_rq, blk_status_t error)
 		 * avoiding use-after-free.
 		 */
 		WRITE_ONCE(flush_rq->state, MQ_RQ_IDLE);
+		blk_mq_put_alloc_task(flush_rq);
+		blk_rq_hierarchy_stats_complete(flush_rq);
 		if (fq->rq_status != BLK_STS_OK) {
 			error = fq->rq_status;
 			fq->rq_status = BLK_STS_OK;
@@ -274,6 +278,7 @@ static void flush_end_io(struct request *flush_rq, blk_status_t error)
 		unsigned int seq = blk_flush_cur_seq(rq);
 
 		BUG_ON(seq != REQ_FSEQ_PREFLUSH && seq != REQ_FSEQ_POSTFLUSH);
+		rq_hierarchy_end_io_acct(rq, STAGE_HCTX);
 		queued |= blk_flush_complete_seq(rq, fq, seq, error);
 	}
 
@@ -378,6 +383,11 @@ static bool blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq,
 	flush_rq->rq_disk = first_rq->rq_disk;
 	flush_rq->end_io = flush_end_io;
 
+	blk_rq_hierarchy_stats_init(flush_rq);
+	blk_rq_init_bi_alloc_time(flush_rq, first_rq);
+	if (q->mq_ops)
+		blk_mq_get_alloc_task(flush_rq, first_rq->bio);
+
 	/*
 	 * Order WRITE ->end_io and WRITE rq->ref, and its pair is the one
 	 * implied in refcount_inc_not_zero() called from
@@ -448,6 +458,8 @@ static void mq_flush_data_end_io(struct request *rq, blk_status_t error)
 		blk_mq_put_driver_tag_hctx(hctx, rq);
 	}
 
+	blk_rq_hierarchy_set_flush_done(rq);
+
 	/*
 	 * After populating an empty queue, kick it to avoid stall.  Read
 	 * the comment in flush_end_io().
@@ -601,7 +613,8 @@ struct blk_flush_queue *blk_alloc_flush_queue(struct request_queue *q,
 		int node, int cmd_size, gfp_t flags)
 {
 	struct blk_flush_queue *fq;
-	int rq_sz = sizeof(struct request);
+	struct request_wrapper *wrapper;
+	int rq_sz = sizeof(struct request) + sizeof(struct request_wrapper);
 
 	fq = kzalloc_node(sizeof(*fq), flags, node);
 	if (!fq)
@@ -611,10 +624,11 @@ struct blk_flush_queue *blk_alloc_flush_queue(struct request_queue *q,
 		spin_lock_init(&fq->mq_flush_lock);
 
 	rq_sz = round_up(rq_sz + cmd_size, cache_line_size());
-	fq->flush_rq = kzalloc_node(rq_sz, flags, node);
-	if (!fq->flush_rq)
+	wrapper = kzalloc_node(rq_sz, flags, node);
+	if (!wrapper)
 		goto fail_rq;
 
+	fq->flush_rq = (struct request *)(wrapper + 1);
 	INIT_LIST_HEAD(&fq->flush_queue[0]);
 	INIT_LIST_HEAD(&fq->flush_queue[1]);
 	INIT_LIST_HEAD(&fq->flush_data_in_flight);
@@ -633,6 +647,6 @@ void blk_free_flush_queue(struct blk_flush_queue *fq)
 	if (!fq)
 		return;
 
-	kfree(fq->flush_rq);
+	kfree(request_to_wrapper(fq->flush_rq));
 	kfree(fq);
 }
diff --git a/block/blk-io-hierarchy/Kconfig b/block/blk-io-hierarchy/Kconfig
new file mode 100644
index 0000000000000000000000000000000000000000..01019f6aa4252cb9fe3c6c85e5a8d49c96f21443
--- /dev/null
+++ b/block/blk-io-hierarchy/Kconfig
@@ -0,0 +1,145 @@
+# SPDX-License-Identifier: GPL-2.0-only
+
+menuconfig BLK_IO_HIERARCHY_STATS
+	bool "Enable hierarchy io stats"
+	default n
+	depends on BLK_DEBUG_FS=y
+	help
+	Enabling this lets the block layer record additional information
+	at different I/O stages. Such information can help debug
+	performance problems and issues such as I/O hangs.
+
+	If unsure, say N.
+
+if BLK_IO_HIERARCHY_STATS
+
+config HIERARCHY_BIO
+	bool "Support to record stats for bio lifetime"
+	default n
+	select BLK_BIO_ALLOC_TIME
+	help
+	Enabling this lets blk hierarchy stats record additional information
+	for the bio lifetime. Such information can help debug performance
+	problems and issues such as I/O hangs.
+
+	If unsure, say N.
+
+config HIERARCHY_IO_DUMP
+	bool "Support to dump io that is throttled"
+	default n
+	select BLK_BIO_ALLOC_TIME
+	select BLK_BIO_ALLOC_TASK
+	depends on BLK_DEV_IO_TRACE
+	help
+	Enabling this creates new debugfs entries that show detailed
+	information about I/O that has been submitted but not yet completed;
+	the result can be filtered by I/O stage or I/O latency.
+
+	If unsure, say N.
+
+config HIERARCHY_THROTTLE
+	bool "Enable hierarchy stats layer blk-throttle"
+	default n
+	depends on BLK_DEV_THROTTLING=y
+	help
+	Enabling this lets blk hierarchy stats record additional information
+	for blk-throttle. Such information can help debug performance
+	problems and issues such as I/O hangs.
+
+	If unsure, say N.
+
+config HIERARCHY_WBT
+	bool "Enable hierarchy stats layer blk-wbt"
+	default n
+	depends on BLK_WBT
+	help
+	Enabling this lets blk hierarchy stats record additional information
+	for blk-wbt. Such information can help debug performance
+	problems and issues such as I/O hangs.
+
+	If unsure, say N.
+
+config HIERARCHY_GETTAG
+	bool "Enable hierarchy stats layer gettag"
+	default n
+	help
+	Enabling this lets blk hierarchy stats record additional information
+	for gettag. Such information can help debug performance
+	problems and issues such as I/O hangs.
+
+	If unsure, say N.
+
+config HIERARCHY_PLUG
+	bool "Enable hierarchy stats layer plug"
+	default n
+	help
+	Enabling this lets blk hierarchy stats record additional information
+	for plug. Such information can help debug performance
+	problems and issues such as I/O hangs.
+
+	If unsure, say N.
+
+config HIERARCHY_DEADLINE
+	bool "Enable hierarchy stats layer mq-deadline"
+	default n
+	depends on MQ_IOSCHED_DEADLINE
+	help
+	Enabling this lets blk hierarchy stats record additional information
+	for mq-deadline. Such information can help debug performance
+	problems and issues such as I/O hangs.
+
+	If unsure, say N.
+
+config HIERARCHY_BFQ
+	bool "Enable hierarchy stats layer bfq"
+	default n
+	depends on IOSCHED_BFQ
+	help
+	Enabling this lets blk hierarchy stats record additional information
+	for bfq. Such information can help debug performance
+	problems and issues such as I/O hangs.
+
+	If unsure, say N.
+
+config HIERARCHY_KYBER
+	bool "Enable hierarchy stats layer kyber"
+	default n
+	depends on MQ_IOSCHED_KYBER
+	help
+	Enabling this lets blk hierarchy stats record additional information
+	for kyber. Such information can help debug performance
+	problems and issues such as I/O hangs.
+
+	If unsure, say N.
+
+config HIERARCHY_HCTX
+	bool "Enable hierarchy stats layer hctx"
+	default n
+	help
+	Enabling this lets blk hierarchy stats record additional information
+	for hctx. Such information can help debug performance
+	problems and issues such as I/O hangs.
+
+	If unsure, say N.
+
+config HIERARCHY_REQUEUE
+	bool "Enable hierarchy stats layer requeue"
+	default n
+	help
+	Enabling this lets blk hierarchy stats record additional information
+	for requeue. Such information can help debug performance
+	problems and issues such as I/O hangs.
+
+	If unsure, say N.
+
+config HIERARCHY_RQ_DRIVER
+	bool "Enable hierarchy stats layer rq_driver"
+	default n
+	help
+	Enabling this lets blk hierarchy stats record additional information
+	for rq_driver. Such information can help debug performance
+	problems and issues such as I/O hangs.
+
+	If unsure, say N.
+
+endif
diff --git a/block/blk-io-hierarchy/Makefile b/block/blk-io-hierarchy/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..9b989d379e5807a5827c62cf62269ec168084df8
--- /dev/null
+++ b/block/blk-io-hierarchy/Makefile
@@ -0,0 +1,8 @@
+#
+# Makefile for blk_io_hierarchy_stats
+#
+
+obj-$(CONFIG_BLK_IO_HIERARCHY_STATS) += blk_io_hierarchy_stats.o
+
+blk_io_hierarchy_stats-y := stats.o debugfs.o
+obj-$(CONFIG_HIERARCHY_IO_DUMP) += iodump.o
diff --git a/block/blk-io-hierarchy/debugfs.c b/block/blk-io-hierarchy/debugfs.c
new file mode 100644
index 0000000000000000000000000000000000000000..29c17e116773b11b2804fbf36e679dc6c69153d3
--- /dev/null
+++ b/block/blk-io-hierarchy/debugfs.c
@@ -0,0 +1,230 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2024. Huawei Technologies Co., Ltd. All rights reserved.
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 and
+ * only version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+#include <linux/debugfs.h>
+#include <linux/blkdev.h>
+
+#include "../blk-mq-debugfs.h"
+#include "stats.h"
+#include "iodump.h"
+
+static const char * const stage_name[NR_STAGE_GROUPS] = { /* unset: NULL */
+#ifdef CONFIG_HIERARCHY_THROTTLE
+	[STAGE_THROTTLE]	= "throtl",
+#endif
+#ifdef CONFIG_HIERARCHY_WBT
+	[STAGE_WBT]		= "wbt",
+#endif
+#ifdef CONFIG_HIERARCHY_GETTAG
+	[STAGE_GETTAG]		= "gettag",
+#endif
+#ifdef CONFIG_HIERARCHY_PLUG
+	[STAGE_PLUG]		= "plug",
+#endif
+#ifdef CONFIG_HIERARCHY_DEADLINE
+	[STAGE_DEADLINE]	= "deadline",
+#endif
+#ifdef CONFIG_HIERARCHY_BFQ
+	[STAGE_BFQ]		= "bfq",
+#endif
+#ifdef CONFIG_HIERARCHY_KYBER
+	[STAGE_KYBER]		= "kyber",
+#endif
+#ifdef CONFIG_HIERARCHY_HCTX
+	[STAGE_HCTX]		= "hctx",
+#endif
+#ifdef CONFIG_HIERARCHY_REQUEUE
+	[STAGE_REQUEUE]		= "requeue",
+#endif
+#ifdef CONFIG_HIERARCHY_RQ_DRIVER
+	[STAGE_RQ_DRIVER]	= "rq_driver",
+#endif
+#ifdef CONFIG_HIERARCHY_BIO
+	[STAGE_BIO]		= "bio",
+#endif
+};
+
+const char *hierarchy_stage_name(enum stage_group stage)
+{
+	return stage_name[stage];	/* no range check on @stage */
+}
+
+/* Sum the per-cpu counters of one stage and print them on one line. */
+static int __hierarchy_stats_show(struct hierarchy_stats_data *hstats_data,
+				  struct seq_file *m, enum stage_group stage)
+{
+	u64 dispatched[NR_NEW_STAT_GROUPS] = {0};
+	u64 completed[NR_NEW_STAT_GROUPS] = {0};
+	u64 latency[NR_NEW_STAT_GROUPS] = {0};
+	int cpu, i;
+
+	for_each_possible_cpu(cpu) {
+		struct hierarchy_stats *stat =
+			per_cpu_ptr(hstats_data->hstats, cpu);
+
+		for (i = 0; i < NR_NEW_STAT_GROUPS; ++i) {
+			dispatched[i] += stat->dispatched[i];
+			completed[i] += stat->completed[i];
+			latency[i] += stage_is_rq(stage) ?
+				      stat->jiffies[i] : stat->nsecs[i];
+		}
+	}
+
+	/* jiffies_to_msecs() returns unsigned int; convert in 64 bits. */
+	if (stage_is_rq(stage))
+		for (i = 0; i < NR_NEW_STAT_GROUPS; ++i)
+			latency[i] = jiffies64_to_nsecs(latency[i]);
+
+	seq_printf(m, "%llu %llu %llu %llu %llu %llu %llu %llu %llu %llu %llu %llu",
+		   dispatched[STAT_READ], completed[STAT_READ],
+		   latency[STAT_READ], dispatched[STAT_WRITE],
+		   completed[STAT_WRITE], latency[STAT_WRITE],
+		   dispatched[STAT_DISCARD], completed[STAT_DISCARD],
+		   latency[STAT_DISCARD], dispatched[STAT_FLUSH],
+		   completed[STAT_FLUSH], latency[STAT_FLUSH]);
+
+	hierarchy_show_slow_io(hstats_data, m);
+	seq_putc(m, '\n');
+	return 0;
+}
+
+/* seq_file ->start: the cursor is valid while *pos names a stage. */
+static void *hierarchy_stats_start(struct seq_file *m, loff_t *pos)
+{
+	enum stage_group stage = *pos;
+
+	if (stage >= 0 && stage < NR_STAGE_GROUPS)
+		return pos;
+	return NULL;
+}
+
+/* seq_file ->next: advance *pos and stop once it is past the last stage. */
+static void *hierarchy_stats_next(struct seq_file *m, void *v, loff_t *pos)
+{
+	enum stage_group stage = ++(*pos);
+
+	if (stage < 0 || stage >= NR_STAGE_GROUPS)
+		return NULL;
+	return pos;
+}
+
+static void hierarchy_stats_stop(struct seq_file *m, void *v)
+{
+}
+
+/* seq_file ->show: print "<stage name> <counters>" for the stage at *v. */
+static int hierarchy_stats_show(struct seq_file *m, void *v)
+{
+	struct blk_io_hierarchy_stats *stats = m->private;
+	enum stage_group stage = *(loff_t *)v;
+	struct hierarchy_stats_data *data = get_hstats_data(stats, stage);
+
+	if (data) {
+		seq_printf(m, "%s ", hierarchy_stage_name(stage));
+		__hierarchy_stats_show(data, m, stage);
+		put_hstats_data(stats, data);
+	}
+
+	return 0;
+}
+
+static const struct seq_operations hierarchy_stats_ops = {
+	.start	= hierarchy_stats_start,
+	.next	= hierarchy_stats_next,
+	.stop	= hierarchy_stats_stop,
+	.show	= hierarchy_stats_show,
+};
+
+static int hierarchy_stats_show_single(void *v, struct seq_file *m)
+{
+	struct hierarchy_stage *hstage = v;
+
+	return __hierarchy_stats_show(hstage->hstats_data, m, hstage->stage);
+}
+
+static const struct blk_mq_debugfs_attr hierarchy_debugfs_attrs[] = {
+	{"stats", 0400, hierarchy_stats_show_single},
+	{},
+};
+
+static const struct blk_mq_debugfs_attr hierarchy_stats_attr[] = {
+	{"stats", 0400, .seq_ops = &hierarchy_stats_ops},
+	{},
+};
+
+/* Set up debugfs for @stage; debugfs_create_dir() may fail with NULL. */
+static void hierarchy_register_stage(struct blk_io_hierarchy_stats *stats,
+				     enum stage_group stage)
+{
+	struct hierarchy_stage *hstage = stats->hstage[stage];
+	struct dentry *dir;
+
+	if (!stage_name[stage] || hstage->debugfs_dir)
+		return;
+
+	dir = debugfs_create_dir(stage_name[stage], stats->debugfs_dir);
+	if (IS_ERR_OR_NULL(dir))
+		return;
+	hstage->debugfs_dir = dir;
+	debugfs_create_files(dir, hstage, hierarchy_debugfs_attrs);
+	io_hierarchy_register_iodump(hstage);
+}
+
+/* Remove the debugfs directory of @stage, if one was created. */
+static void hierarchy_unregister_stage(struct blk_io_hierarchy_stats *stats,
+				       enum stage_group stage)
+{
+	struct hierarchy_stage *hstage = stats->hstage[stage];
+
+	if (stage_name[stage] && hstage->debugfs_dir) {
+		debugfs_remove_recursive(hstage->debugfs_dir);
+		hstage->debugfs_dir = NULL;
+	}
+}
+
+/* Create the debugfs entries of @stage if it is registered on @q. */
+void blk_mq_debugfs_register_hierarchy(struct request_queue *q,
+				       enum stage_group stage)
+{
+	struct blk_io_hierarchy_stats *stats =
+		queue_to_wrapper(q)->io_hierarchy_stats;
+
+	if (blk_mq_hierarchy_registered(q, stage) &&
+	    blk_mq_debugfs_enabled(q)) {
+		hierarchy_register_stage(stats, stage);
+	}
+}
+
+/* Remove the debugfs entries of @stage if it is registered on @q. */
+void blk_mq_debugfs_unregister_hierarchy(struct request_queue *q,
+					 enum stage_group stage)
+{
+	struct blk_io_hierarchy_stats *stats =
+		queue_to_wrapper(q)->io_hierarchy_stats;
+
+	if (blk_mq_hierarchy_registered(q, stage) &&
+	    blk_mq_debugfs_enabled(q)) {
+		hierarchy_unregister_stage(stats, stage);
+	}
+}
+
+/* Create the "stats" file of @q that iterates over all stages. */
+void blk_mq_debugfs_create_default_hierarchy_attr(struct request_queue *q)
+{
+	struct blk_io_hierarchy_stats *stats =
+		queue_to_wrapper(q)->io_hierarchy_stats;
+
+	if (blk_mq_debugfs_enabled(q))
+		debugfs_create_files(stats->debugfs_dir, stats,
+				     hierarchy_stats_attr);
+}
diff --git a/block/blk-io-hierarchy/iodump.c b/block/blk-io-hierarchy/iodump.c
new file mode 100644
index 0000000000000000000000000000000000000000..18bd813665f8d53bc2eecd7c6c5bf769fa24f68b
--- /dev/null
+++ b/block/blk-io-hierarchy/iodump.c
@@ -0,0 +1,756 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2024. Huawei Technologies Co., Ltd. All rights reserved.
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 and
+ * only version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+#include <linux/seq_file.h>
+#include <linux/blktrace_api.h>
+#include <linux/blk-cgroup.h>
+#include <linux/sched/task.h>
+
+#include "iodump.h"
+#include "../blk.h"
+#include "../blk-mq-debugfs.h"
+
+#define RWB_LEN 6		/* size of the rwbs text buffer */
+#define PATH_LEN 64		/* size of the cgroup path buffer */
+#define ms_to_ns(time) ((time) * NSEC_PER_MSEC)
+#define DEFAULT_THRESHOLD 1000	/* initial slow-IO threshold, milliseconds */
+
+static DEFINE_MUTEX(dump_mutex);
+
+struct bio_dump_data {
+	u64 stat_time;		/* timestamp taken when a dump pass starts */
+	struct list_head head;	/* bios currently inside this stage */
+	spinlock_t lock;	/* protects @head */
+};
+
+struct rq_dump_data {
+	struct request_queue *q;	/* queue whose requests are walked */
+	enum stage_group stage;		/* stage this dump reports */
+	unsigned int tag;		/* current slot index of the walk */
+	unsigned int total_tags;	/* regular slots; flush rqs follow */
+	bool has_elevator;		/* walk sched_tags instead of tags */
+	bool enter_queue;		/* q_usage_counter reference is held */
+};
+
+#ifdef CONFIG_HIERARCHY_BIO
+struct pos_data {
+	enum stage_group stage;
+	unsigned int count;
+};
+
+struct bio_stage_dump_data {
+	union {
+		loff_t pos;
+		struct pos_data pdata;
+	};
+	struct rq_dump_data rq_ddata;
+	u64 stat_time;
+};
+#endif
+
+int blk_io_hierarchy_iodump_init(struct request_queue *q,
+				 struct hierarchy_stage *hstage)
+{
+	hstage->threshold = DEFAULT_THRESHOLD;
+	/* Allocate the dump state that matches the kind of stage. */
+	if (stage_is_bio(hstage->stage)) {
+		struct bio_dump_data *bio_ddata =
+			kmalloc(sizeof(*bio_ddata), GFP_KERNEL);
+
+		if (!bio_ddata)
+			return -ENOMEM;
+		/* stat_time is left unset here; list start fills it per dump. */
+		INIT_LIST_HEAD(&bio_ddata->head);
+		spin_lock_init(&bio_ddata->lock);
+		hstage->dump_data = bio_ddata;
+		return 0;
+	}
+
+	if (stage_is_rq(hstage->stage)) {
+		struct rq_dump_data *rq_ddata =
+			kzalloc(sizeof(*rq_ddata), GFP_KERNEL);
+
+		if (!rq_ddata)
+			return -ENOMEM;
+
+		rq_ddata->q = q;
+		rq_ddata->stage = hstage->stage;
+		hstage->dump_data = rq_ddata;
+		return 0;
+	}
+
+#ifdef CONFIG_HIERARCHY_BIO
+	BUILD_BUG_ON(sizeof(struct pos_data) != sizeof(loff_t));
+	/* pos_data overlays the loff_t cursor, hence the size check above. */
+	if (hstage->stage == STAGE_BIO) {
+		struct bio_stage_dump_data *bstage_ddata =
+			kzalloc(sizeof(*bstage_ddata), GFP_KERNEL);
+
+		if (!bstage_ddata)
+			return -ENOMEM;
+
+		bstage_ddata->rq_ddata.q = q;
+		bstage_ddata->rq_ddata.stage = hstage->stage;
+		hstage->dump_data = bstage_ddata;
+		return 0;
+	}
+#endif
+	/* No branch above claimed this stage. */
+	return -EINVAL;
+}
+
+void blk_io_hierarchy_iodump_exit(struct request_queue *q,
+				  enum stage_group stage)
+{
+	struct hierarchy_stage *hstage =
+		queue_to_wrapper(q)->io_hierarchy_stats->hstage[stage];
+
+	if (stage_is_bio(hstage->stage)) {
+		struct bio_dump_data *bio_ddata = hstage->dump_data;
+		/* Bios still linked here would reference the list head freed below. */
+		WARN(!list_empty(&bio_ddata->head),
+		     "blk-io-hierarchy: disk %s stage %s unregistered with throttled IO.\n",
+		     kobject_name(q->kobj.parent), hierarchy_stage_name(stage));
+	}
+	/* Release the dump state set up by blk_io_hierarchy_iodump_init(). */
+	kfree(hstage->dump_data);
+	hstage->dump_data = NULL;
+}
+
+void hierarchy_add_bio(struct hierarchy_stage *hstage, struct bio *bio)
+{
+	struct bio_dump_data *ddata = hstage->dump_data;
+	struct list_head *node = &bio->hdata->hierarchy_list;
+	unsigned long irq_flags;
+
+	spin_lock_irqsave(&ddata->lock, irq_flags);
+	list_add_tail(node, &ddata->head);	/* append to the stage's list */
+	spin_unlock_irqrestore(&ddata->lock, irq_flags);
+}
+
+void hierarchy_remove_bio(struct hierarchy_stage *hstage, struct bio *bio)
+{
+	struct bio_dump_data *ddata = hstage->dump_data;
+	struct list_head *node = &bio->hdata->hierarchy_list;
+	unsigned long irq_flags;
+
+	spin_lock_irqsave(&ddata->lock, irq_flags);
+	list_del_init(node);	/* unlink and re-initialise the node */
+	spin_unlock_irqrestore(&ddata->lock, irq_flags);
+}
+
+void bio_hierarchy_data_init(struct bio *bio, struct bio_hierarchy_data *hdata)
+{
+	INIT_LIST_HEAD(&hdata->hierarchy_list);	/* not on any stage list yet */
+	hdata->bio = bio;			/* back-pointer read when dumping */
+}
+
+static void *bio_hierarchy_list_start(struct seq_file *m, loff_t *pos)
+	 __acquires(&bio_ddata->lock)
+{
+	struct bio_dump_data *ddata;
+
+	/* Take the list lock and snapshot "now" for this dump pass. */
+	ddata = ((struct hierarchy_stage *)m->private)->dump_data;
+	spin_lock_irq(&ddata->lock);
+	ddata->stat_time = blk_time_get_ns();
+	return seq_list_start(&ddata->head, *pos);
+}
+
+static void *bio_hierarchy_list_next(struct seq_file *m, void *v, loff_t *pos)
+{
+	struct bio_dump_data *ddata;
+
+	/* Advance to the node after @v on the stage's bio list. */
+	ddata = ((struct hierarchy_stage *)m->private)->dump_data;
+	return seq_list_next(v, &ddata->head, pos);
+}
+
+static void bio_hierarchy_list_stop(struct seq_file *m, void *v)
+	__releases(&bio_ddata->lock)
+{
+	struct hierarchy_stage *hstage = m->private;
+	struct bio_dump_data *bio_ddata = hstage->dump_data;
+	/* Pairs with the spin_lock_irq() in bio_hierarchy_list_start(). */
+	spin_unlock_irq(&bio_ddata->lock);
+}
+
+static void __hierarchy_show_bio(struct seq_file *m,
+				 struct bio_hierarchy_data *data,
+				 enum stage_group stage, u64 duration)
+{
+	char rwbs[RWB_LEN];
+	char path[PATH_LEN] = {0};	/* stays "" when no cgroup is known */
+	struct bio *bio = data->bio;
+	struct task_struct *task = get_pid_task(bio->pid, PIDTYPE_PID);
+
+	blk_fill_rwbs(rwbs, bio->bi_opf, bio->bi_iter.bi_size);
+#ifdef CONFIG_BLK_CGROUP
+	if (bio->bi_css)	/* guard against a bio with no css attached */
+		cgroup_path(bio->bi_css->cgroup, path, PATH_LEN);
+#endif
+	seq_printf(m, "%s-%d %s stage %s bio %s %llu + %u cgroup %s started %llu ns ago\n",
+		   task ? task->comm : "null", task ? task->pid : 0,
+		   bio->bi_disk->disk_name, hierarchy_stage_name(stage),
+		   rwbs, (unsigned long long)bio->bi_iter.bi_sector,
+		   bio_sectors(bio), path, duration);
+
+	if (task)
+		put_task_struct(task);	/* pairs with get_pid_task() */
+}
+
+static u64 get_duration(u64 a, u64 b)
+{
+	return a <= b ? 0 : a - b;	/* a - b clamped at zero */
+}
+
+static void hierarchy_show_bio(struct seq_file *m,
+			       struct bio_hierarchy_data *data)
+{
+	struct hierarchy_stage *hs = m->private;
+	struct bio_dump_data *ddata = hs->dump_data;
+	u64 elapsed = get_duration(ddata->stat_time, data->time);
+
+	/*
+	 * Report only bios at least hs->threshold milliseconds old.
+	 */
+	if (ns_to_ms(elapsed) >= hs->threshold)
+		__hierarchy_show_bio(m, data, hs->stage, elapsed);
+}
+
+static int bio_hierarchy_list_show(struct seq_file *m, void *v)
+{
+	/* @v is the hierarchy_list node embedded in a bio_hierarchy_data. */
+	hierarchy_show_bio(m, list_entry(v, struct bio_hierarchy_data,
+					 hierarchy_list));
+
+	return 0;
+}
+
+static const struct seq_operations hierarchy_bio_dump_ops = {
+	.start	= bio_hierarchy_list_start,
+	.next	= bio_hierarchy_list_next,
+	.stop	= bio_hierarchy_list_stop,
+	.show	= bio_hierarchy_list_show,
+};
+
+static int threshold_show(void *data, struct seq_file *m)
+{
+	const struct hierarchy_stage *hs = data;
+
+	seq_printf(m, "%lu\n", hs->threshold);	/* compared against msecs */
+	return 0;
+}
+
+/*
+ * max size needed by different bases to express U64
+ * HEX: "0xFFFFFFFFFFFFFFFF" --> 18
+ * DEC: "18446744073709551615" --> 20
+ * OCT: "01777777777777777777777" --> 23
+ * pick the max one to define MAX_BUF_LEN
+ */
+#define MAX_BUF_LEN 24
+static ssize_t threshold_store(void *data, const char __user *buf, size_t count,
+			       loff_t *ppos)
+{
+	struct hierarchy_stage *hs = data;
+	char kbuf[MAX_BUF_LEN + 1];	/* +1 for the terminating NUL */
+	unsigned long new_threshold;
+	int ret;
+
+	if (count > MAX_BUF_LEN)
+		return -EINVAL;
+	if (copy_from_user(kbuf, buf, count))
+		return -EFAULT;
+
+	kbuf[count] = '\0';
+	ret = kstrtoul(kbuf, 0, &new_threshold);
+	if (ret)
+		return ret;	/* leave the old threshold untouched */
+
+	hs->threshold = new_threshold;
+	return count;
+}
+
+static void rq_hierarchy_init_dump_data(struct rq_dump_data *rq_ddata)
+{
+	struct request_queue *q = rq_ddata->q;
+
+	/* Slots per hw queue: nr_requests with an elevator, else queue_depth. */
+	rq_ddata->has_elevator = !!q->elevator;
+	if (!rq_ddata->has_elevator)
+		rq_ddata->total_tags = q->nr_hw_queues *
+				       q->tag_set->queue_depth;
+	else
+		rq_ddata->total_tags = q->nr_hw_queues * q->nr_requests;
+}
+
+static bool __rq_hierarchy_start(struct rq_dump_data *rq_ddata,
+				 unsigned int tag)
+{
+	struct request_queue *q = rq_ddata->q;
+
+	/*
+	 * Pin the queue through .q_usage_counter so the request pool cannot
+	 * go away while it is walked. If the queue is frozen there are no
+	 * inflight requests to show, so failing to pin ends the dump.
+	 */
+	rq_ddata->enter_queue = percpu_ref_tryget(&q->q_usage_counter);
+	if (!rq_ddata->enter_queue)
+		return false;
+
+	rq_hierarchy_init_dump_data(rq_ddata);
+	rq_ddata->tag = tag;
+
+	return tag < rq_ddata->total_tags + q->nr_hw_queues;
+}
+
+static bool __rq_hierarchy_next(struct rq_dump_data *rq_ddata)
+{
+	unsigned int limit = rq_ddata->total_tags + rq_ddata->q->nr_hw_queues;
+
+	return ++rq_ddata->tag < limit;	/* slots past total_tags: flush rqs */
+}
+
+static void __rq_hierarchy_stop(struct rq_dump_data *rq_ddata)
+{
+	if (!rq_ddata->enter_queue)
+		return;	/* the queue was never pinned */
+	percpu_ref_put(&rq_ddata->q->q_usage_counter);
+	rq_ddata->enter_queue = false;
+}
+
+static void *rq_hierarchy_start(struct seq_file *m, loff_t *pos)
+	 __acquires(&dump_mutex)
+{
+	struct hierarchy_stage *hs = m->private;
+	struct rq_dump_data *ddata = hs->dump_data;
+
+	/* Serialise dumps; the mutex is dropped in rq_hierarchy_stop(). */
+	mutex_lock(&dump_mutex);
+	if (!__rq_hierarchy_start(ddata, *pos))
+		return NULL;
+
+	return ddata;
+}
+
+static void *rq_hierarchy_next(struct seq_file *m, void *v, loff_t *pos)
+{
+	struct rq_dump_data *ddata = v;
+
+	if (!__rq_hierarchy_next(ddata)) {
+		/* Walked past the last slot: bump *pos and end iteration. */
+		(*pos)++;
+		return NULL;
+	}
+	*pos = ddata->tag;
+	return ddata;
+}
+
+static void rq_hierarchy_stop(struct seq_file *m, void *v)
+	__releases(&dump_mutex)
+{
+	struct hierarchy_stage *hs = m->private;
+
+	/* Drop the queue reference (if taken) before releasing the mutex. */
+	__rq_hierarchy_stop(hs->dump_data);
+	mutex_unlock(&dump_mutex);
+}
+
+static struct request *hierarchy_find_and_get_rq(struct rq_dump_data *rq_ddata)
+{
+	struct request *rq;
+	struct request_wrapper *rq_wrapper;
+	struct blk_mq_hw_ctx *hctx;
+	struct request_queue *q = rq_ddata->q;
+	unsigned int nr_tag = rq_ddata->tag;
+	unsigned int hctx_id;
+	/* Map the flat dump index onto a request slot, then try to pin it. */
+	if (nr_tag >= rq_ddata->total_tags) {
+		hctx_id = nr_tag - rq_ddata->total_tags;
+		if (hctx_id >= q->nr_hw_queues)
+			return NULL;
+
+		hctx = q->queue_hw_ctx[hctx_id];
+		rq = hctx->fq->flush_rq;	/* one flush request per hctx */
+	} else if (rq_ddata->has_elevator) {
+		hctx_id = nr_tag / q->nr_requests;
+		if (hctx_id >= q->nr_hw_queues)
+			return NULL;
+
+		hctx = q->queue_hw_ctx[hctx_id];
+		rq = hctx->sched_tags->static_rqs[nr_tag % q->nr_requests];
+	} else {
+		hctx_id = nr_tag / q->tag_set->queue_depth;
+		if (hctx_id >= q->nr_hw_queues)
+			return NULL;
+
+		hctx = q->queue_hw_ctx[hctx_id];
+		if (!hctx->tags)
+			return NULL;
+
+		rq = hctx->tags->static_rqs[nr_tag % q->tag_set->queue_depth];
+	}
+
+	rq_wrapper = request_to_wrapper(rq);
+	/*
+	 * fast path to avoid refcount cas operations for the request that
+	 * is from other shared request_queue or other stages.
+	 */
+	if (rq->q != q || (rq_ddata->stage != STAGE_BIO &&
+			   READ_ONCE(rq_wrapper->stage) != rq_ddata->stage))
+		return NULL;
+
+	if (!refcount_inc_not_zero(&rq->ref))
+		return NULL;	/* request is not in use */
+
+	/* Check again after request is pinned, in case request is reused. */
+	if (rq->q != q) {
+		blk_mq_put_rq_ref(rq);
+		return NULL;
+	}
+
+	if (rq_ddata->stage == STAGE_BIO)
+		return rq;	/* STAGE_BIO dumps requests of every stage */
+
+	/*
+	 * Barrier is paired with the smp_store_release() in
+	 * rq_hierarchy_start_io_acct(), so that if stage is read, uninitialized
+	 * hierarchy_time won't be read.
+	 */
+	if (smp_load_acquire(&rq_wrapper->stage) != rq_ddata->stage) {
+		blk_mq_put_rq_ref(rq);
+		return NULL;
+	}
+	/* The caller owns the reference and must call blk_mq_put_rq_ref(). */
+	return rq;
+}
+
+static void hierarchy_show_rq(struct seq_file *m, struct request *rq,
+			      u64 duration)
+{
+	struct request_wrapper *wrapper = request_to_wrapper(rq);
+	struct task_struct *owner = get_pid_task(wrapper->pid, PIDTYPE_PID);
+	const char *stage = hierarchy_stage_name(wrapper->stage);
+	const char *disk = rq->rq_disk ? rq->rq_disk->disk_name : "?";
+
+	/* "<comm>-<pid> <disk> stage <name> <request dump> started N ns ago}" */
+	seq_printf(m, "%s-%d %s stage %s ", owner ? owner->comm : "null",
+		   owner ? owner->pid : 0, disk, stage ? stage : "?");
+	debugfs_rq_show(m, rq);
+	seq_printf(m, " started %llu ns ago}\n", duration);
+
+	if (owner)
+		put_task_struct(owner);	/* pairs with get_pid_task() */
+}
+
+static int rq_hierarchy_show(struct seq_file *m, void *v)
+{
+	struct hierarchy_stage *hs = m->private;
+	struct request *rq = hierarchy_find_and_get_rq(v);
+	unsigned long since;
+	u64 elapsed_ms;
+
+	if (!rq)
+		return 0;
+
+	/* Jiffies spent in the stage so far, clamped at zero. */
+	since = READ_ONCE(request_to_wrapper(rq)->hierarchy_time);
+	since = time_after(jiffies, since) ? jiffies - since : 0;
+	elapsed_ms = jiffies_to_msecs(since);
+	/* Only requests at least hs->threshold milliseconds old are shown. */
+	if (elapsed_ms >= hs->threshold)
+		hierarchy_show_rq(m, rq, ms_to_ns(elapsed_ms));
+
+	blk_mq_put_rq_ref(rq);
+	return 0;
+}
+
+static const struct seq_operations hierarchy_rq_dump_ops = {
+	.start	= rq_hierarchy_start,
+	.next	= rq_hierarchy_next,
+	.stop	= rq_hierarchy_stop,
+	.show	= rq_hierarchy_show,
+};
+
+static const struct blk_mq_debugfs_attr hierarchy_threshold_attr[] = {
+	{
+		"threshold",
+		0600,
+		threshold_show,
+		threshold_store,
+	},
+	{},
+};
+
+static const struct blk_mq_debugfs_attr hierarchy_bio_dump_attr[] = {
+	{
+		"io_dump",
+		0400,
+		.seq_ops = &hierarchy_bio_dump_ops,
+	},
+	{},
+};
+
+static const struct blk_mq_debugfs_attr hierarchy_rq_dump_attr[] = {
+	{
+		"io_dump",
+		0400,
+		.seq_ops = &hierarchy_rq_dump_ops,
+	},
+	{},
+};
+
+#ifdef CONFIG_HIERARCHY_BIO
+static struct bio_dump_data *get_bio_stage_ddata(struct request_queue *q,
+						 enum stage_group stage)
+{
+	struct hierarchy_stage *hs;
+
+	/*
+	 * The slot is read once; an empty slot yields NULL instead of a
+	 * dereference.
+	 */
+	hs = READ_ONCE(queue_to_wrapper(q)->io_hierarchy_stats->hstage[stage]);
+	return hs ? hs->dump_data : NULL;
+}
+
+static void bio_stage_start_next_stage(struct bio_stage_dump_data *bstage_ddata,
+				       loff_t *pos)
+{
+	struct pos_data *pdata = &bstage_ddata->pdata;
+	/* Move the cursor to the first entry of the following stage. */
+	pdata->stage++;
+	if (!stage_is_bio(pdata->stage))
+		pdata->stage = STAGE_BIO;	/* bio stages done: walk requests */
+	pdata->count = 0;
+
+	*pos = bstage_ddata->pos;	/* pos shares storage with pdata (union) */
+}
+
+static void bio_stage_start_next_io(struct bio_stage_dump_data *bstage_ddata,
+				    loff_t *pos)
+{
+	struct pos_data *pdata = &bstage_ddata->pdata;
+	/* Advance the cursor by one entry inside the current stage. */
+	if (stage_is_bio(pdata->stage))
+		pdata->count++;
+	else
+		pdata->count = bstage_ddata->rq_ddata.tag;	/* request slot */
+
+	*pos = bstage_ddata->pos;	/* pos shares storage with pdata (union) */
+}
+
+static void __bio_stage_hierarchy_stop(struct bio_stage_dump_data *bstage_ddata)
+{
+	struct rq_dump_data *rq_ddata = &bstage_ddata->rq_ddata;
+	enum stage_group cur = bstage_ddata->pdata.stage;
+
+	/*
+	 * Stopping while the cursor is inside a bio stage: release that
+	 * stage's list lock.
+	 */
+	if (stage_is_bio(cur))
+		spin_unlock_irq(&get_bio_stage_ddata(rq_ddata->q, cur)->lock);
+
+	/*
+	 * Drop the queue reference taken for the request walk, if any.
+	 */
+	__rq_hierarchy_stop(rq_ddata);
+}
+
+static void *__bio_stage_hierarchy_start(
+		struct bio_stage_dump_data *bstage_ddata, loff_t *pos)
+{
+	struct pos_data *pdata = &bstage_ddata->pdata;
+	struct rq_dump_data *rq_ddata = &bstage_ddata->rq_ddata;
+	/* Resume at (stage, count): bio stage lists first, then requests. */
+retry:
+	if (stage_is_bio(pdata->stage)) {
+		struct list_head *list;
+		struct bio_dump_data *bio_ddata =
+			get_bio_stage_ddata(rq_ddata->q, pdata->stage);
+
+		if (!bio_ddata) {	/* stage not registered: skip it */
+			bio_stage_start_next_stage(bstage_ddata, pos);
+			goto retry;
+		}
+
+		spin_lock_irq(&bio_ddata->lock);
+		list = seq_list_start(&bio_ddata->head, pdata->count);
+		if (list)
+			return list;	/* lock stays held for next/stop */
+
+		spin_unlock_irq(&bio_ddata->lock);
+		bio_stage_start_next_stage(bstage_ddata, pos);
+		goto retry;
+	}
+	/* No bio stage left: continue with the request walk for STAGE_BIO. */
+	if (pdata->stage == STAGE_BIO &&
+	    __rq_hierarchy_start(rq_ddata, pdata->count))
+		return bstage_ddata;
+
+	return NULL;
+}
+
+static void *bio_stage_hierarchy_start(struct seq_file *m, loff_t *pos)
+{
+	struct hierarchy_stage *hs = m->private;
+	struct bio_stage_dump_data *ddata = hs->dump_data;
+
+	mutex_lock(&dump_mutex);
+	/* Storing *pos also loads the (stage, count) cursor via the union. */
+	ddata->stat_time = blk_time_get_ns();
+	ddata->pos = *pos;
+	return __bio_stage_hierarchy_start(ddata, pos);
+}
+
+static void *bio_stage_hierarchy_next(struct seq_file *m, void *v, loff_t *pos)
+{
+	struct hierarchy_stage *hstage = m->private;
+	struct bio_stage_dump_data *bstage_ddata = hstage->dump_data;
+	struct rq_dump_data *rq_ddata = &bstage_ddata->rq_ddata;
+	struct pos_data *pdata = &bstage_ddata->pdata;
+	/* Step within the current stage, or roll over to the next one. */
+	if (stage_is_bio(pdata->stage)) {
+		struct bio_dump_data *bio_ddata =
+			get_bio_stage_ddata(rq_ddata->q, pdata->stage);
+		struct list_head *list = ((struct list_head *)v)->next;
+
+		if (list != &bio_ddata->head) {
+			bio_stage_start_next_io(bstage_ddata, pos);
+			return list;	/* still inside this stage's list */
+		}
+
+		spin_unlock_irq(&bio_ddata->lock);	/* list exhausted */
+
+		bio_stage_start_next_stage(bstage_ddata, pos);
+		return __bio_stage_hierarchy_start(bstage_ddata, pos);
+	}
+
+	if (pdata->stage == STAGE_BIO &&
+	    __rq_hierarchy_next(rq_ddata)) {
+		bio_stage_start_next_io(bstage_ddata, pos);
+		return bstage_ddata;
+	}
+	/* Request walk finished as well: bump *pos and end the dump. */
+	(*pos)++;
+	return NULL;
+}
+
+static void bio_stage_hierarchy_stop(struct seq_file *m, void *v)
+{
+	struct hierarchy_stage *hs = m->private;
+
+	/* Release what start/next left held, then let the next dump run. */
+	__bio_stage_hierarchy_stop(hs->dump_data);
+	mutex_unlock(&dump_mutex);
+}
+
+static int bio_stage_hierarchy_show(struct seq_file *m, void *v)
+{
+	struct hierarchy_stage *hstage = m->private;
+	struct bio_stage_dump_data *bstage_ddata = hstage->dump_data;
+	struct rq_dump_data *rq_ddata = &bstage_ddata->rq_ddata;
+	struct pos_data *pdata = &bstage_ddata->pdata;
+	u64 duration;
+	/* Ages here are measured from bi_alloc_time_ns, not from stage entry. */
+	if (stage_is_bio(pdata->stage)) {
+		struct bio_hierarchy_data *data = list_entry(
+				v, struct bio_hierarchy_data, hierarchy_list);
+
+		duration = get_duration(bstage_ddata->stat_time,
+					data->bio->bi_alloc_time_ns);
+		if (hstage->threshold <= ns_to_ms(duration))
+			__hierarchy_show_bio(m, data, pdata->stage, duration);
+	} else if (pdata->stage == STAGE_BIO) {
+		struct request *rq = hierarchy_find_and_get_rq(rq_ddata);
+
+		if (rq) {
+			duration = get_duration(bstage_ddata->stat_time,
+				request_to_wrapper(rq)->bi_alloc_time_ns);
+			if (hstage->threshold <= ns_to_ms(duration))
+				hierarchy_show_rq(m, rq, duration);
+			blk_mq_put_rq_ref(rq);	/* drop the pin taken above */
+		}
+	}
+
+	return 0;
+}
+
+static const struct seq_operations bio_stage_hierarchy_ops = {
+	.start  = bio_stage_hierarchy_start,
+	.next   = bio_stage_hierarchy_next,
+	.stop   = bio_stage_hierarchy_stop,
+	.show   = bio_stage_hierarchy_show,
+};
+
+static const struct blk_mq_debugfs_attr bio_stage_dump_attr[] = {
+	{
+		"io_dump",
+		0400,
+		.seq_ops = &bio_stage_hierarchy_ops,
+	},
+	{},
+};
+
+#else /* CONFIG_HIERARCHY_BIO */
+static const struct blk_mq_debugfs_attr bio_stage_dump_attr[] = {
+	{},
+};
+
+#endif
+
+void io_hierarchy_register_iodump(struct hierarchy_stage *hstage)
+{
+	const struct blk_mq_debugfs_attr *dump_attr = NULL;
+
+	/* Pick the io_dump flavour matching the kind of stage. */
+	if (stage_is_bio(hstage->stage))
+		dump_attr = hierarchy_bio_dump_attr;
+	else if (stage_is_rq(hstage->stage))
+		dump_attr = hierarchy_rq_dump_attr;
+	else if (hstage->stage == STAGE_BIO)
+		dump_attr = bio_stage_dump_attr;
+
+	/* Every stage gets "threshold"; the dump table only when one applies. */
+	debugfs_create_files(hstage->debugfs_dir, hstage,
+			     hierarchy_threshold_attr);
+	if (dump_attr)
+		debugfs_create_files(hstage->debugfs_dir, hstage, dump_attr);
+}
+
+void hierarchy_account_slow_io(struct hierarchy_stage *hstage,
+			       enum stat_group op, unsigned long duration)
+{
+	if (duration >= hstage->threshold)	/* count IO at/over threshold */
+		this_cpu_inc(hstage->hstats_data->hstats->slow[op]);
+}
+
+void hierarchy_show_slow_io(struct hierarchy_stats_data *hstats_data,
+			    struct seq_file *m)
+{
+	u64 total[NR_NEW_STAT_GROUPS] = {0};
+	int cpu, grp;
+
+	/* Fold every CPU's slow-IO counters into one total per group. */
+	for_each_possible_cpu(cpu) {
+		const struct hierarchy_stats *pcpu =
+			per_cpu_ptr(hstats_data->hstats, cpu);
+
+		for (grp = 0; grp < NR_NEW_STAT_GROUPS; grp++)
+			total[grp] += pcpu->slow[grp];
+	}
+
+	seq_printf(m, " %llu %llu %llu %llu", total[STAT_READ],
+		   total[STAT_WRITE], total[STAT_DISCARD], total[STAT_FLUSH]);
+}
diff --git a/block/blk-io-hierarchy/iodump.h b/block/blk-io-hierarchy/iodump.h
new file mode 100644
index 0000000000000000000000000000000000000000..f8ef0d8669f621bf2c97728ed5c4d7fa676d7e66
--- /dev/null
+++ b/block/blk-io-hierarchy/iodump.h
@@ -0,0 +1,103 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (C) 2024. Huawei Technologies Co., Ltd. All rights reserved.
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 and
+ * only version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+#ifndef BLK_IO_HIERARCHY_IODUMP_H
+#define BLK_IO_HIERARCHY_IODUMP_H
+
+#ifdef CONFIG_HIERARCHY_IO_DUMP
+
+#include "stats.h"
+
+#define ns_to_ms(time) div_u64(time, NSEC_PER_MSEC)
+
+int blk_io_hierarchy_iodump_init(struct request_queue *q,
+				 struct hierarchy_stage *hstage);
+void blk_io_hierarchy_iodump_exit(struct request_queue *q,
+				  enum stage_group stage);
+void hierarchy_add_bio(struct hierarchy_stage *hstage, struct bio *bio);
+void hierarchy_remove_bio(struct hierarchy_stage *hstage, struct bio *bio);
+void bio_hierarchy_data_init(struct bio *bio, struct bio_hierarchy_data *hdata);
+void io_hierarchy_register_iodump(struct hierarchy_stage *hstage);
+
+void hierarchy_account_slow_io(struct hierarchy_stage *hstage,
+			       enum stat_group op, unsigned long duration);
+void hierarchy_show_slow_io(struct hierarchy_stats_data *hstats_data,
+			    struct seq_file *m);
+
+static inline void
+hierarchy_account_slow_io_ns(struct hierarchy_stage *hstage,
+			     enum stat_group op, u64 duration)
+{
+	hierarchy_account_slow_io(hstage, op, ns_to_ms(duration));
+}
+
+static inline void
+hierarchy_account_slow_io_jiffies(struct hierarchy_stage *hstage,
+				  enum stat_group op, unsigned long duration)
+{
+	hierarchy_account_slow_io(hstage, op, jiffies_to_msecs(duration));
+}
+
+#else
+static inline int
+blk_io_hierarchy_iodump_init(struct request_queue *q,
+			     struct hierarchy_stage *hstage)
+{
+	return 0;
+}
+
+static inline void
+blk_io_hierarchy_iodump_exit(struct request_queue *q, enum stage_group stage)
+{
+}
+
+static inline void
+hierarchy_add_bio(struct hierarchy_stage *hstage, struct bio *bio)
+{
+}
+
+static inline void
+hierarchy_remove_bio(struct hierarchy_stage *hstage, struct bio *bio)
+{
+}
+
+static inline void
+bio_hierarchy_data_init(struct bio *bio, struct bio_hierarchy_data *hdata)
+{
+}
+
+static inline void
+io_hierarchy_register_iodump(struct hierarchy_stage *hstage)
+{
+}
+
+static inline void
+hierarchy_account_slow_io(struct hierarchy_stage *hstage,
+			  enum stat_group op, unsigned long duration)
+{
+}
+
+static inline void
+hierarchy_account_slow_io_ns(struct hierarchy_stage *hstage,
+			     enum stat_group op, u64 duration)
+{
+}
+
+static inline void
+hierarchy_account_slow_io_jiffies(struct hierarchy_stage *hstage,
+				  enum stat_group op, unsigned long duration)
+{
+}
+
+#endif
+#endif
diff --git a/block/blk-io-hierarchy/stats.c b/block/blk-io-hierarchy/stats.c
new file mode 100644
index 0000000000000000000000000000000000000000..b9e79b43514951945526fe20e3064c8ce5ef05e5
--- /dev/null
+++ b/block/blk-io-hierarchy/stats.c
@@ -0,0 +1,429 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2024. Huawei Technologies Co., Ltd. All rights reserved.
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 and
+ * only version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+#include <linux/module.h>
+#include <linux/debugfs.h>
+
+#include "stats.h"
+#include "iodump.h"
+#include "../blk.h"
+#include "../blk-mq-debugfs.h"
+
+#define io_hierarchy_add(statsp, field, group, nr) \
+	this_cpu_add((statsp)->hstats->field[group], nr)
+#define io_hierarchy_inc(statsp, field, group) \
+	io_hierarchy_add(statsp, field, group, 1)
+
+#define PRE_ALLOC_BIO_CNT 8
+
+static mempool_t *hdata_pool;
+
+void blk_mq_debugfs_register_hierarchy_stats(struct request_queue *q)
+{
+	struct blk_io_hierarchy_stats *hier_stats =
+		queue_to_wrapper(q)->io_hierarchy_stats;
+	enum stage_group sg;
+
+	if (!hier_stats || !blk_mq_debugfs_enabled(q))
+		return;
+
+	/* Root directory first, then one sub-directory per registered stage. */
+	hier_stats->debugfs_dir = debugfs_create_dir("blk_io_hierarchy",
+						     q->debugfs_dir);
+	blk_mq_debugfs_create_default_hierarchy_attr(q);
+	for (sg = 0; sg < NR_STAGE_GROUPS; ++sg)
+		blk_mq_debugfs_register_hierarchy(q, sg);
+}
+
+static void bio_alloc_hierarchy_data(struct bio *bio)
+{
+	struct bio_hierarchy_data *fresh;
+
+	if (bio->hdata)
+		return;	/* already attached */
+	fresh = mempool_alloc(hdata_pool, GFP_NOIO);
+	bio_hierarchy_data_init(bio, fresh);
+	bio->hdata = fresh;
+}
+
+void bio_free_hierarchy_data(struct bio *bio)
+{
+	/* Return the per-bio accounting data to the pool, if there is any. */
+	if (bio->hdata) {
+		mempool_free(bio->hdata, hdata_pool);
+		bio->hdata = NULL;
+	}
+}
+
+int blk_io_hierarchy_stats_alloc(struct request_queue *q)
+{
+	struct blk_io_hierarchy_stats *hier_stats;
+
+	/* Queues without mq_ops get no hierarchy stats; that is not an error. */
+	if (!q->mq_ops)
+		return 0;
+
+	hier_stats = kzalloc(sizeof(*hier_stats), GFP_KERNEL);
+	if (!hier_stats)
+		return -ENOMEM;
+
+	hier_stats->q = q;
+	spin_lock_init(&hier_stats->hstage_lock);
+	queue_to_wrapper(q)->io_hierarchy_stats = hier_stats;
+	return 0;
+}
+
+void blk_io_hierarchy_stats_free(struct request_queue *q)
+{
+	struct request_queue_wrapper *qw = queue_to_wrapper(q);
+	struct blk_io_hierarchy_stats *hier_stats = qw->io_hierarchy_stats;
+
+	/* Detach from the queue before freeing; no-op when never allocated. */
+	if (hier_stats) {
+		qw->io_hierarchy_stats = NULL;
+		kfree(hier_stats);
+	}
+}
+
+bool blk_mq_hierarchy_registered(struct request_queue *q,
+				 enum stage_group stage)
+{
+	const struct blk_io_hierarchy_stats *hier_stats;
+
+	/*
+	 * True only when the queue has stats and the stage slot is filled.
+	 */
+	hier_stats = queue_to_wrapper(q)->io_hierarchy_stats;
+	return hier_stats && hier_stats->hstage[stage];
+}
+EXPORT_SYMBOL_GPL(blk_mq_hierarchy_registered);
+
+static struct hierarchy_stats_data *alloc_hstats_data(void)
+{
+	struct hierarchy_stats_data *hsd = kmalloc(sizeof(*hsd), GFP_KERNEL);
+
+	if (!hsd)
+		return NULL;
+
+	hsd->hstats = alloc_percpu(struct hierarchy_stats);
+	if (hsd->hstats) {
+		hsd->ref = 1;	/* the creator's reference */
+		return hsd;
+	}
+
+	/* Per-cpu allocation failed: undo the container allocation. */
+	kfree(hsd);
+	return NULL;
+}
+
+struct hierarchy_stats_data *get_hstats_data(
+		struct blk_io_hierarchy_stats *stats,
+		enum stage_group stage)
+{
+	struct hierarchy_stats_data *found = NULL;
+	struct hierarchy_stage *hs;
+
+	/* The slot lookup and the reference bump happen under hstage_lock. */
+	spin_lock(&stats->hstage_lock);
+	hs = stats->hstage[stage];
+	if (hs && hs->hstats_data) {
+		found = hs->hstats_data;
+		found->ref++;
+	}
+	spin_unlock(&stats->hstage_lock);
+
+	return found;
+}
+
+static void __put_hstats_data(struct blk_io_hierarchy_stats *stats,
+			      struct hierarchy_stats_data *hstats_data)
+{
+	if (--hstats_data->ref != 0)
+		return;	/* other references remain */
+	free_percpu(hstats_data->hstats);
+	kfree(hstats_data);
+}
+
+void put_hstats_data(struct blk_io_hierarchy_stats *stats,
+		     struct hierarchy_stats_data *hstats_data)
+{
+	spin_lock(&stats->hstage_lock);
+	__put_hstats_data(stats, hstats_data);
+	spin_unlock(&stats->hstage_lock);
+}
+
+void blk_mq_register_hierarchy(struct request_queue *q, enum stage_group stage)
+{
+	struct blk_io_hierarchy_stats *stats =
+		queue_to_wrapper(q)->io_hierarchy_stats;
+	struct hierarchy_stage *hstage;
+
+	if (!stats || !hierarchy_stage_name(stage))
+		return;	/* queue has no stats, or the stage has no name */
+
+	if (blk_mq_hierarchy_registered(q, stage)) {
+		pr_warn("blk-io-hierarchy: disk %s is registering stage %s again.\n",
+			kobject_name(q->kobj.parent),
+			hierarchy_stage_name(stage));
+		return;
+	}
+
+	/*
+	 * Alloc memory before freeze queue, prevent deadlock if new IO is
+	 * issued by memory reclaim.
+	 */
+	hstage = kmalloc(sizeof(*hstage), GFP_KERNEL);
+	if (!hstage)
+		return;
+
+	hstage->hstats_data = alloc_hstats_data();
+	if (!hstage->hstats_data) {
+		kfree(hstage);
+		return;
+	}
+
+	hstage->stage = stage;
+	hstage->unbalanced_warned = false;
+	hstage->debugfs_dir = NULL;
+	if (blk_io_hierarchy_iodump_init(q, hstage) < 0) {
+		put_hstats_data(stats, hstage->hstats_data);
+		kfree(hstage);
+		return;
+	}
+	/* Publish the fully initialised stage while the queue is frozen. */
+	blk_mq_freeze_queue(q);
+
+	WRITE_ONCE(stats->hstage[stage], hstage);
+	blk_mq_debugfs_register_hierarchy(q, stage);
+
+	blk_mq_unfreeze_queue(q);
+}
+EXPORT_SYMBOL_GPL(blk_mq_register_hierarchy);
+
+void blk_mq_unregister_hierarchy(struct request_queue *q,
+				 enum stage_group stage)
+{
+	struct blk_io_hierarchy_stats *stats =
+		queue_to_wrapper(q)->io_hierarchy_stats;
+	struct hierarchy_stage *hstage;
+
+	if (!blk_mq_hierarchy_registered(q, stage))
+		return;
+	/* Remove the debugfs files before the data they show is freed. */
+	blk_mq_debugfs_unregister_hierarchy(q, stage);
+	blk_io_hierarchy_iodump_exit(q, stage);
+	/* Clear the slot and drop the stage's stats reference under the lock. */
+	spin_lock(&stats->hstage_lock);
+	hstage = stats->hstage[stage];
+	stats->hstage[stage] = NULL;
+	__put_hstats_data(stats, hstage->hstats_data);
+	spin_unlock(&stats->hstage_lock);
+
+	kfree(hstage);
+}
+EXPORT_SYMBOL_GPL(blk_mq_unregister_hierarchy);
+
+static enum stat_group bio_hierarchy_op(struct bio *bio)
+{
+	if (op_is_discard(bio->bi_opf))
+		return STAT_DISCARD;
+
+	/*
+	 * A flush flag only makes this a flush when the bio carries no data.
+	 */
+	if (op_is_flush(bio->bi_opf) && !bio_sectors(bio) &&
+	    !(bio->bi_opf & REQ_HAS_DATA))
+		return STAT_FLUSH;
+
+	return op_is_write(bio->bi_opf) ? STAT_WRITE : STAT_READ;
+}
+
+
+void bio_hierarchy_start_io_acct(struct bio *bio, enum stage_group stage)
+{
+	struct request_queue *q = bio->bi_disk->queue;
+	struct hierarchy_stage *hs;
+
+	if (!blk_mq_hierarchy_registered(q, stage))
+		return;
+
+	/* Count the dispatch, stamp the start time, then list the bio. */
+	hs = queue_to_wrapper(q)->io_hierarchy_stats->hstage[stage];
+	bio_alloc_hierarchy_data(bio);
+	io_hierarchy_inc(hs->hstats_data, dispatched, bio_hierarchy_op(bio));
+	bio->hdata->time = blk_time_get_ns();
+	hierarchy_add_bio(hs, bio);
+}
+
+void __bio_hierarchy_end_io_acct(struct bio *bio, enum stage_group stage,
+				 u64 time)
+{
+	struct request_queue *q = bio->bi_disk->queue;
+	struct hierarchy_stage *hs;
+	enum stat_group grp;
+	u64 spent;
+
+	if (!blk_mq_hierarchy_registered(q, stage))
+		return;
+
+	hs = queue_to_wrapper(q)->io_hierarchy_stats->hstage[stage];
+	grp = bio_hierarchy_op(bio);
+	spent = time - bio->hdata->time;
+	/* Unlink from the dump list, then fold the time into the stats. */
+	hierarchy_remove_bio(hs, bio);
+	io_hierarchy_inc(hs->hstats_data, completed, grp);
+	io_hierarchy_add(hs->hstats_data, nsecs, grp, spent);
+	hierarchy_account_slow_io_ns(hs, grp, spent);
+}
+
+static enum stat_group rq_hierarchy_op(struct request *rq)
+{
+	if (op_is_discard(rq->cmd_flags))
+		return STAT_DISCARD;
+
+	if (is_flush_rq(rq))
+		return STAT_FLUSH;
+
+	/*
+	 * Neither discard nor flush: classify by data direction.
+	 */
+	return op_is_write(rq->cmd_flags) ? STAT_WRITE : STAT_READ;
+}
+
+static void rq_hierarchy_warn_unbalanced(struct request *rq,
+					 struct hierarchy_stage *hstage,
+					 enum stage_group old_stage,
+					 enum stage_group new_stage)
+{
+	if (hstage->unbalanced_warned)
+		return;	/* warn only once per stage */
+
+	pr_warn("blk-io-hierarchy: disk %s stage %d(%s) -> %d(%s) unbalanced accounting.\n",
+		kobject_name(rq->q->kobj.parent),
+		old_stage, hierarchy_stage_name(old_stage),
+		new_stage, hierarchy_stage_name(new_stage));
+	hstage->unbalanced_warned = true;
+}
+
+void blk_rq_hierarchy_stats_complete(struct request *rq)
+{
+	enum stage_group cur = request_to_wrapper(rq)->stage;
+	struct hierarchy_stage *hs;
+
+	/* NR_RQ_STAGE_GROUPS marks a request that is in no stage. */
+	if (cur == NR_RQ_STAGE_GROUPS ||
+	    !blk_mq_hierarchy_registered(rq->q, cur))
+		return;
+
+	/* Still inside a stage: flag the imbalance and close the accounting. */
+	hs = queue_to_wrapper(rq->q)->io_hierarchy_stats->hstage[cur];
+
+	rq_hierarchy_warn_unbalanced(rq, hs, cur, NR_RQ_STAGE_GROUPS);
+	__rq_hierarchy_end_io_acct(rq, hs);
+}
+
+void __rq_hierarchy_start_io_acct(struct request *rq,
+				  struct hierarchy_stage *hstage)
+{
+	struct request_wrapper *rq_wrapper = request_to_wrapper(rq);
+	/* Close any stage the request is still accounted in before starting. */
+	blk_rq_hierarchy_stats_complete(rq);
+	io_hierarchy_inc(hstage->hstats_data, dispatched, rq_hierarchy_op(rq));
+	WRITE_ONCE(rq_wrapper->hierarchy_time, jiffies);
+
+	/*
+	 * Paired with the smp_load_acquire() in hierarchy_find_and_get_rq(),
+	 * make sure hierarchy_time is set before stage.
+	 */
+	smp_store_release(&rq_wrapper->stage, hstage->stage);
+}
+EXPORT_SYMBOL_GPL(__rq_hierarchy_start_io_acct);
+
+void __rq_hierarchy_end_io_acct(struct request *rq,
+				struct hierarchy_stage *hstage)
+{
+	struct request_wrapper *wrapper = request_to_wrapper(rq);
+	unsigned long spent;
+	enum stat_group grp;
+
+	/* Ending a stage the request is not in: start/end were not paired. */
+	if (wrapper->stage != hstage->stage) {
+		rq_hierarchy_warn_unbalanced(rq, hstage, wrapper->stage,
+					     hstage->stage);
+		return;
+	}
+
+	grp = rq_hierarchy_op(rq);
+	spent = jiffies - wrapper->hierarchy_time;
+
+	io_hierarchy_inc(hstage->hstats_data, completed, grp);
+	io_hierarchy_add(hstage->hstats_data, jiffies, grp, spent);
+	hierarchy_account_slow_io_jiffies(hstage, grp, spent);
+	WRITE_ONCE(wrapper->stage, NR_RQ_STAGE_GROUPS);
+}
+EXPORT_SYMBOL_GPL(__rq_hierarchy_end_io_acct);
+
+#ifdef CONFIG_HIERARCHY_BIO
+void bio_hierarchy_start(struct bio *bio)
+{
+	struct gendisk *gd = bio->bi_disk;
+	struct request_queue_wrapper *qw;
+	struct hierarchy_stage *hs;
+
+	/* Account each bio once, and only when STAGE_BIO is registered. */
+	if (bio_flagged(bio, BIO_HIERARCHY_ACCT) ||
+	    !blk_mq_hierarchy_registered(gd->queue, STAGE_BIO))
+		return;
+
+	bio_set_flag(bio, BIO_HIERARCHY_ACCT);
+	/* Remember whether data was attached; bio_hierarchy_op() reads it. */
+	if (bio_has_data(bio))
+		bio->bi_opf |= REQ_HAS_DATA;
+
+	qw = queue_to_wrapper(gd->queue);
+	hs = qw->io_hierarchy_stats->hstage[STAGE_BIO];
+	io_hierarchy_inc(hs->hstats_data, dispatched, bio_hierarchy_op(bio));
+}
+
+void __bio_hierarchy_end(struct bio *bio, u64 now)
+{
+	struct request_queue_wrapper *qw;
+	struct hierarchy_stage *hs;
+	enum stat_group grp = bio_hierarchy_op(bio);
+	u64 lifetime = now - bio->bi_alloc_time_ns;
+
+	qw = queue_to_wrapper(bio->bi_disk->queue);
+	hs = qw->io_hierarchy_stats->hstage[STAGE_BIO];
+
+	io_hierarchy_inc(hs->hstats_data, completed, grp);
+	io_hierarchy_add(hs->hstats_data, nsecs, grp, lifetime);
+	hierarchy_account_slow_io_ns(hs, grp, lifetime);
+
+	/*
+	 * Undo the marks left by bio_hierarchy_start().
+	 */
+	bio->bi_opf &= ~REQ_HAS_DATA;
+	bio_clear_flag(bio, BIO_HIERARCHY_ACCT);
+}
+#endif
+
+static int __init hierarchy_stats_init(void)
+{
+	hdata_pool = mempool_create_kmalloc_pool(PRE_ALLOC_BIO_CNT,
+			sizeof(struct bio_hierarchy_data));
+	if (!hdata_pool)	/* bio_alloc_hierarchy_data() needs the pool */
+		panic("Failed to create hdata_pool\n");
+
+	return 0;
+}
+module_init(hierarchy_stats_init);
diff --git a/block/blk-io-hierarchy/stats.h b/block/blk-io-hierarchy/stats.h
new file mode 100644
index 0000000000000000000000000000000000000000..d3c6a26dfacbd344022f5f062aaf64562a845f75
--- /dev/null
+++ b/block/blk-io-hierarchy/stats.h
@@ -0,0 +1,366 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (C) 2024. Huawei Technologies Co., Ltd. All rights reserved.
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 and
+ * only version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+#ifndef BLK_IO_HIERARCHY_STATS_H
+#define BLK_IO_HIERARCHY_STATS_H
+
+#ifdef CONFIG_BLK_IO_HIERARCHY_STATS
+
+#include <linux/blkdev.h>
+#include <linux/blk_types.h>
+#include "../blk.h"
+
+struct bio_hierarchy_data {
+	u64 time;	/* NOTE(review): presumably a stage timestamp; confirm */
+#ifdef CONFIG_HIERARCHY_IO_DUMP
+	struct bio *bio;
+	struct list_head hierarchy_list;
+#endif /* CONFIG_HIERARCHY_IO_DUMP */
+};
+
+struct hierarchy_stats {
+	union {
+		/* Latency accumulated by bio based stages, in nanoseconds. */
+		u64 nsecs[NR_NEW_STAT_GROUPS];
+		/* Latency accumulated by request based stages, in jiffies. */
+		unsigned long jiffies[NR_NEW_STAT_GROUPS];
+	};
+	unsigned long dispatched[NR_NEW_STAT_GROUPS];	/* IOs that started */
+	unsigned long completed[NR_NEW_STAT_GROUPS];	/* IOs that ended */
+#ifdef CONFIG_HIERARCHY_IO_DUMP
+	unsigned long slow[NR_NEW_STAT_GROUPS];	/* slow IOs, presumably */
+#endif /* CONFIG_HIERARCHY_IO_DUMP */
+};
+
+struct hierarchy_stats_data {
+	int ref;	/* NOTE(review): looks like a get/put refcount; confirm */
+	struct hierarchy_stats __percpu *hstats;	/* per-cpu counters */
+};
+
+struct hierarchy_stage {
+	enum stage_group stage;
+	bool unbalanced_warned;
+	struct dentry *debugfs_dir;
+	struct hierarchy_stats_data *hstats_data;	/* stage counters */
+#ifdef CONFIG_HIERARCHY_IO_DUMP
+	unsigned long threshold;	/* NOTE(review): slow IO limit? */
+	void *dump_data;
+#endif /* CONFIG_HIERARCHY_IO_DUMP */
+};
+
+struct blk_io_hierarchy_stats {
+	struct request_queue *q;
+	struct dentry *debugfs_dir;
+	spinlock_t hstage_lock;
+	struct hierarchy_stage *hstage[NR_STAGE_GROUPS]; /* indexed by stage */
+};
+
+static inline bool stage_is_bio(enum stage_group stage)
+{
+	return stage >= 0 && stage < NR_BIO_STAGE_GROUPS;
+}
+
+static inline bool stage_is_rq(enum stage_group stage)
+{
+	return stage >= NR_BIO_STAGE_GROUPS && stage < NR_RQ_STAGE_GROUPS;
+}
+
+const char *hierarchy_stage_name(enum stage_group stage);
+int blk_io_hierarchy_stats_alloc(struct request_queue *q);
+void blk_io_hierarchy_stats_free(struct request_queue *q);
+
+/* APIs for stage registration */
+bool blk_mq_hierarchy_registered(struct request_queue *q,
+				 enum stage_group stage);
+void blk_mq_register_hierarchy(struct request_queue *q, enum stage_group stage);
+void blk_mq_unregister_hierarchy(struct request_queue *q,
+				 enum stage_group stage);
+
+/* APIs for disk level debugfs */
+void blk_mq_debugfs_register_hierarchy_stats(struct request_queue *q);
+void blk_mq_debugfs_create_default_hierarchy_attr(struct request_queue *q);
+
+/* APIs for stage level debugfs */
+void blk_mq_debugfs_register_hierarchy(struct request_queue *q,
+				       enum stage_group stage);
+void blk_mq_debugfs_unregister_hierarchy(struct request_queue *q,
+					 enum stage_group stage);
+struct hierarchy_stats_data *get_hstats_data(
+		struct blk_io_hierarchy_stats *stats,
+		enum stage_group stage);
+void put_hstats_data(struct blk_io_hierarchy_stats *stats,
+		     struct hierarchy_stats_data *hstats_data);
+
+/* APIs for bio based stage io accounting */
+void bio_hierarchy_start_io_acct(struct bio *bio, enum stage_group stage);
+void __bio_hierarchy_end_io_acct(struct bio *bio, enum stage_group stage,
+				 u64 time);
+void bio_free_hierarchy_data(struct bio *bio);
+
+static inline void bio_hierarchy_end_io_acct(struct bio *bio,
+					     enum stage_group stage)
+{
+	__bio_hierarchy_end_io_acct(bio, stage, blk_time_get_ns());
+}
+
+static inline void bio_list_hierarchy_end_io_acct(struct bio_list *list,
+						  enum stage_group stage)
+{
+	u64 time = blk_time_get_ns();	/* one timestamp shared by all bios */
+	struct bio *bio;
+
+	bio_list_for_each(bio, list)
+		__bio_hierarchy_end_io_acct(bio, stage, time);
+}
+
+/* APIs for request based stage io accounting */
+void blk_rq_hierarchy_stats_complete(struct request *rq);
+void __rq_hierarchy_start_io_acct(struct request *rq,
+				  struct hierarchy_stage *hstage);
+void __rq_hierarchy_end_io_acct(struct request *rq,
+				struct hierarchy_stage *hstage);
+
+static inline void rq_hierarchy_start_io_acct(struct request *rq,
+					      enum stage_group stage)
+{
+	if (!blk_mq_hierarchy_registered(rq->q, stage))
+		return;
+
+	__rq_hierarchy_start_io_acct(rq,
+		queue_to_wrapper(rq->q)->io_hierarchy_stats->hstage[stage]);
+}
+
+static inline void rq_hierarchy_end_io_acct(struct request *rq,
+					    enum stage_group stage)
+{
+	if (!blk_mq_hierarchy_registered(rq->q, stage))
+		return;
+
+	__rq_hierarchy_end_io_acct(rq,
+		queue_to_wrapper(rq->q)->io_hierarchy_stats->hstage[stage]);
+}
+
+/* Start @stage accounting for all of @head; the first rq picks the queue. */
+static inline void rq_list_hierarchy_start_io_acct(struct list_head *head,
+						   enum stage_group stage)
+{
+	struct hierarchy_stage *hstage;
+	struct request_queue *q;
+	struct request *rq;
+
+	if (list_empty(head))
+		return;
+	q = list_first_entry(head, struct request, queuelist)->q;
+	if (!blk_mq_hierarchy_registered(q, stage))
+		return;
+	hstage = queue_to_wrapper(q)->io_hierarchy_stats->hstage[stage];
+	list_for_each_entry(rq, head, queuelist)
+		__rq_hierarchy_start_io_acct(rq, hstage);
+}
+
+/* End @stage accounting for all of @head; the first rq picks the queue. */
+static inline void rq_list_hierarchy_end_io_acct(struct list_head *head,
+						 enum stage_group stage)
+{
+	struct hierarchy_stage *hstage;
+	struct request_queue *q;
+	struct request *rq;
+
+	if (list_empty(head))
+		return;
+	q = list_first_entry(head, struct request, queuelist)->q;
+	if (!blk_mq_hierarchy_registered(q, stage))
+		return;
+	hstage = queue_to_wrapper(q)->io_hierarchy_stats->hstage[stage];
+	list_for_each_entry(rq, head, queuelist)
+		__rq_hierarchy_end_io_acct(rq, hstage);
+}
+
+static inline void blk_rq_hierarchy_stats_init(struct request *rq)
+{
+	request_to_wrapper(rq)->stage = NR_RQ_STAGE_GROUPS;
+	request_to_wrapper(rq)->flush_done = false;
+}
+
+static inline void blk_rq_hierarchy_set_flush_done(struct request *rq)
+{
+	request_to_wrapper(rq)->flush_done = true;
+}
+
+static inline bool blk_rq_hierarchy_is_flush_done(struct request *rq)
+{
+	return request_to_wrapper(rq)->flush_done;
+}
+
+#ifdef CONFIG_HIERARCHY_BIO
+void bio_hierarchy_start(struct bio *bio);
+void __bio_hierarchy_end(struct bio *bio, u64 now);
+
+/*
+ * Finish STAGE_BIO accounting for @bio; a no-op unless the bio was flagged
+ * by bio_hierarchy_start() and STAGE_BIO is still registered on its queue.
+ */
+static inline void bio_hierarchy_end(struct bio *bio)
+{
+	if (bio_flagged(bio, BIO_HIERARCHY_ACCT) &&
+	    blk_mq_hierarchy_registered(bio->bi_disk->queue, STAGE_BIO))
+		__bio_hierarchy_end(bio, blk_time_get_ns());
+}
+
+/*
+ * Finish STAGE_BIO accounting for a @bio completed as part of @rq. The end
+ * timestamp is taken once per request and cached in its wrapper, so later
+ * bios of the same @rq are accounted with the same completion time.
+ */
+static inline void req_bio_hierarchy_end(struct request *rq, struct bio *bio)
+{
+	struct request_wrapper *rq_wrapper;
+
+	if (!bio_flagged(bio, BIO_HIERARCHY_ACCT) ||
+	    !blk_mq_hierarchy_registered(bio->bi_disk->queue, STAGE_BIO))
+		return;
+
+	rq_wrapper = request_to_wrapper(rq);
+	if (!rq_wrapper->io_end_time_ns)
+		rq_wrapper->io_end_time_ns = blk_time_get_ns();
+	__bio_hierarchy_end(bio, rq_wrapper->io_end_time_ns);
+}
+#endif
+
+#else /* CONFIG_BLK_IO_HIERARCHY_STATS */
+
+static inline int
+blk_io_hierarchy_stats_alloc(struct request_queue *q)
+{
+	return 0;
+}
+
+static inline void
+blk_io_hierarchy_stats_free(struct request_queue *q)
+{
+}
+
+static inline bool
+blk_mq_hierarchy_registered(struct request_queue *q, enum stage_group stage)
+{
+	return false;
+}
+
+static inline void
+blk_mq_register_hierarchy(struct request_queue *q, enum stage_group stage)
+{
+}
+
+static inline void
+blk_mq_unregister_hierarchy(struct request_queue *q, enum stage_group stage)
+{
+}
+
+static inline void
+blk_mq_debugfs_register_hierarchy_stats(struct request_queue *q)
+{
+}
+
+static inline void
+blk_mq_debugfs_register_hierarchy(struct request_queue *q,
+				  enum stage_group stage)
+{
+}
+
+static inline void
+blk_mq_debugfs_unregister_hierarchy(struct request_queue *q,
+				    enum stage_group stage)
+{
+}
+
+static inline void
+bio_hierarchy_start_io_acct(struct bio *bio, enum stage_group stage)
+{
+}
+
+static inline void
+bio_hierarchy_end_io_acct(struct bio *bio, enum stage_group stage)
+{
+}
+
+static inline void
+bio_list_hierarchy_end_io_acct(struct bio_list *list, enum stage_group stage)
+{
+}
+
+static inline void
+bio_free_hierarchy_data(struct bio *bio)
+{
+}
+
+static inline void
+blk_rq_hierarchy_set_flush_done(struct request *rq)
+{
+}
+
+static inline bool
+blk_rq_hierarchy_is_flush_done(struct request *rq)
+{
+	return false;
+}
+
+static inline void
+blk_rq_hierarchy_stats_complete(struct request *rq)
+{
+}
+
+static inline void
+rq_hierarchy_start_io_acct(struct request *rq, enum stage_group stage)
+{
+}
+
+static inline void
+rq_hierarchy_end_io_acct(struct request *rq, enum stage_group stage)
+{
+}
+
+static inline void
+rq_list_hierarchy_start_io_acct(struct list_head *head, enum stage_group stage)
+{
+}
+
+static inline void
+rq_list_hierarchy_end_io_acct(struct list_head *head, enum stage_group stage)
+{
+}
+
+static inline void
+blk_rq_hierarchy_stats_init(struct request *rq)
+{
+}
+
+#endif /* CONFIG_BLK_IO_HIERARCHY_STATS */
+
+#if !defined(CONFIG_BLK_IO_HIERARCHY_STATS) || !defined(CONFIG_HIERARCHY_BIO)
+static inline void
+bio_hierarchy_start(struct bio *bio)
+{
+}
+
+static inline void
+bio_hierarchy_end(struct bio *bio)
+{
+}
+
+static inline void
+req_bio_hierarchy_end(struct request *rq, struct bio *bio)
+{
+}
+#endif
+
+#endif /* BLK_IO_HIERARCHY_STATS_H */
diff --git a/block/blk-iolatency.c b/block/blk-iolatency.c
index 1baa3c49e2e3b8386b900b9f18f9469d7cfc45d5..6f81794eb6e6d6cb0790ba7e13b1f201e13d7284 100644
--- a/block/blk-iolatency.c
+++ b/block/blk-iolatency.c
@@ -557,7 +557,7 @@ static void blkcg_iolatency_done_bio(struct rq_qos *rqos, struct bio *bio)
 	struct rq_wait *rqw;
 	struct iolatency_grp *iolat;
 	u64 window_start;
-	u64 now = ktime_to_ns(ktime_get());
+	u64 now = blk_time_get_ns();
 	bool issue_as_root = bio_issue_as_root_blkg(bio);
 	bool enabled = false;
 	int inflight = 0;
@@ -624,7 +624,7 @@ static void blkiolatency_timer_fn(struct timer_list *t)
 	struct blk_iolatency *blkiolat = from_timer(blkiolat, t, timer);
 	struct blkcg_gq *blkg;
 	struct cgroup_subsys_state *pos_css;
-	u64 now = ktime_to_ns(ktime_get());
+	u64 now = blk_time_get_ns();
 
 	rcu_read_lock();
 	blkg_for_each_descendant_pre(blkg, pos_css,
@@ -895,7 +895,7 @@ static void iolatency_pd_init(struct blkg_policy_data *pd)
 	struct blkcg_gq *blkg = lat_to_blkg(iolat);
 	struct rq_qos *rqos = blkcg_rq_qos(blkg->q);
 	struct blk_iolatency *blkiolat = BLKIOLATENCY(rqos);
-	u64 now = ktime_to_ns(ktime_get());
+	u64 now = blk_time_get_ns();
 	int cpu;
 
 	for_each_possible_cpu(cpu) {
diff --git a/block/blk-merge.c b/block/blk-merge.c
index d2fabe1fdf3264d9cf8bb47888da7a30a6a8ccb6..9f9d803e064b05b81abdc8052e2253dd65e973a5 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -784,6 +784,7 @@ static struct request *attempt_merge(struct request_queue *q,
 	req->biotail = next->biotail;
 
 	req->__data_len += blk_rq_bytes(next);
+	blk_rq_update_bi_alloc_time(req, NULL, next);
 
 	if (!blk_discard_mergable(req))
 		elv_merge_requests(q, req, next);
diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
index f0865b6ea1e19fb723246eedcdc5fdcba5607303..5ee91901d9ccc1f3b9da7dd6bcaa12e38c6fb833 100644
--- a/block/blk-mq-debugfs.c
+++ b/block/blk-mq-debugfs.c
@@ -23,6 +23,7 @@
 #include "blk-mq.h"
 #include "blk-mq-debugfs.h"
 #include "blk-mq-tag.h"
+#include "blk-io-hierarchy/stats.h"
 
 static void print_stat(struct seq_file *m, struct blk_rq_stat *stat)
 {
@@ -355,9 +356,13 @@ static const char *blk_mq_rq_state_name(enum mq_rq_state rq_state)
 	return blk_mq_rq_state_name_array[rq_state];
 }
 
-int __blk_mq_debugfs_rq_show(struct seq_file *m, struct request *rq)
+/*
+ * Dump the general information of @rq into @m. The output starts with '{' but
+ * is not terminated with '}': the caller may append its own fields and must
+ * then emit the closing curly brace '}' itself.
+ */
+void debugfs_rq_show(struct seq_file *m, struct request *rq)
 {
-	const struct blk_mq_ops *const mq_ops = rq->q->mq_ops;
 	const unsigned int op = rq->cmd_flags & REQ_OP_MASK;
 
 	seq_printf(m, "%p {.op=", rq);
@@ -374,6 +379,13 @@ int __blk_mq_debugfs_rq_show(struct seq_file *m, struct request *rq)
 	seq_printf(m, ", .state=%s", blk_mq_rq_state_name(blk_mq_rq_state(rq)));
 	seq_printf(m, ", .tag=%d, .internal_tag=%d", rq->tag,
 		   rq->internal_tag);
+}
+
+int __blk_mq_debugfs_rq_show(struct seq_file *m, struct request *rq)
+{
+	const struct blk_mq_ops *const mq_ops = rq->q->mq_ops;
+
+	debugfs_rq_show(m, rq);
 	if (mq_ops->show_rq)
 		mq_ops->show_rq(m, rq);
 	seq_puts(m, "}\n");
@@ -811,8 +823,8 @@ static const struct blk_mq_debugfs_attr blk_mq_debugfs_ctx_attrs[] = {
 	{},
 };
 
-static bool debugfs_create_files(struct dentry *parent, void *data,
-				 const struct blk_mq_debugfs_attr *attr)
+bool debugfs_create_files(struct dentry *parent, void *data,
+			  const struct blk_mq_debugfs_attr *attr)
 {
 	if (IS_ERR_OR_NULL(parent))
 		return false;
@@ -861,6 +873,7 @@ int blk_mq_debugfs_register(struct request_queue *q)
 			goto err;
 	}
 
+	blk_mq_debugfs_register_hierarchy_stats(q);
 	return 0;
 
 err:
diff --git a/block/blk-mq-debugfs.h b/block/blk-mq-debugfs.h
index a9160be12be05a527c0d968a6534e4a244f989e1..70549712b0a270d7774a67f8e03f701a08966e10 100644
--- a/block/blk-mq-debugfs.h
+++ b/block/blk-mq-debugfs.h
@@ -15,6 +15,7 @@ struct blk_mq_debugfs_attr {
 	const struct seq_operations *seq_ops;
 };
 
+void debugfs_rq_show(struct seq_file *m, struct request *rq);
 int __blk_mq_debugfs_rq_show(struct seq_file *m, struct request *rq);
 int blk_mq_debugfs_rq_show(struct seq_file *m, void *v);
 
@@ -31,6 +32,15 @@ void blk_mq_debugfs_unregister_sched(struct request_queue *q);
 int blk_mq_debugfs_register_sched_hctx(struct request_queue *q,
 				       struct blk_mq_hw_ctx *hctx);
 void blk_mq_debugfs_unregister_sched_hctx(struct blk_mq_hw_ctx *hctx);
+
+bool debugfs_create_files(struct dentry *parent, void *data,
+			  const struct blk_mq_debugfs_attr *attr);
+
+static inline bool blk_mq_debugfs_enabled(struct request_queue *q)
+{
+	return !IS_ERR_OR_NULL(q->debugfs_dir);
+}
+
 #else
 static inline int blk_mq_debugfs_register(struct request_queue *q)
 {
diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
index 0fb33abac3f62fe86c28d913bdcf1bb7ecfd45ea..1c8befbe7b69842de0bcd2629100ebaa52189c6f 100644
--- a/block/blk-mq-sched.c
+++ b/block/blk-mq-sched.c
@@ -15,6 +15,7 @@
 #include "blk-mq-sched.h"
 #include "blk-mq-tag.h"
 #include "blk-wbt.h"
+#include "blk-io-hierarchy/stats.h"
 
 void blk_mq_sched_free_hctx_data(struct request_queue *q,
 				 void (*exit)(struct blk_mq_hw_ctx *))
@@ -250,6 +251,7 @@ int __blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
 	 */
 	if (!list_empty(&rq_list)) {
 		blk_mq_sched_mark_restart_hctx(hctx);
+		rq_list_hierarchy_end_io_acct(&rq_list, STAGE_HCTX);
 		if (blk_mq_dispatch_rq_list(q, &rq_list, false)) {
 			if (has_sched_dispatch)
 				ret = blk_mq_do_dispatch_sched(hctx);
@@ -389,10 +391,8 @@ bool __blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio)
 	struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu);
 	bool ret = false;
 
-	if (e && e->type->ops.mq.bio_merge) {
-		blk_mq_put_ctx(ctx);
+	if (e && e->type->ops.mq.bio_merge)
 		return e->type->ops.mq.bio_merge(hctx, bio);
-	}
 
 	if ((hctx->flags & BLK_MQ_F_SHOULD_MERGE) &&
 			!list_empty_careful(&ctx->rq_list)) {
@@ -402,7 +402,6 @@ bool __blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio)
 		spin_unlock(&ctx->lock);
 	}
 
-	blk_mq_put_ctx(ctx);
 	return ret;
 }
 
diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index bee92ab06a5e33aab4039a63efebf943b1288a32..f7b21d7f136e77d2f6620c4a04dc469b4b61fbf3 100644
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -13,6 +13,7 @@
 #include "blk.h"
 #include "blk-mq.h"
 #include "blk-mq-tag.h"
+#include "blk-io-hierarchy/stats.h"
 
 bool blk_mq_has_free_tags(struct blk_mq_tags *tags)
 {
@@ -113,7 +114,6 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
 	struct sbq_wait_state *ws;
 	DEFINE_WAIT(wait);
 	unsigned int tag_offset;
-	bool drop_ctx;
 	int tag;
 
 	if (data->flags & BLK_MQ_REQ_RESERVED) {
@@ -135,8 +135,9 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
 	if (data->flags & BLK_MQ_REQ_NOWAIT)
 		return BLK_MQ_TAG_FAIL;
 
+	if (data->bio)
+		bio_hierarchy_start_io_acct(data->bio, STAGE_GETTAG);
 	ws = bt_wait_ptr(bt, data->hctx);
-	drop_ctx = data->ctx == NULL;
 	do {
 		struct sbitmap_queue *bt_prev;
 
@@ -162,9 +163,6 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
 		if (tag != -1)
 			break;
 
-		if (data->ctx)
-			blk_mq_put_ctx(data->ctx);
-
 		bt_prev = bt;
 		io_schedule();
 
@@ -189,10 +187,9 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
 		ws = bt_wait_ptr(bt, data->hctx);
 	} while (1);
 
-	if (drop_ctx && data->ctx)
-		blk_mq_put_ctx(data->ctx);
-
 	finish_wait(&ws->wait, &wait);
+	if (data->bio)
+		bio_hierarchy_end_io_acct(data->bio, STAGE_GETTAG);
 
 found_tag:
 	return tag + tag_offset;
diff --git a/block/blk-mq.c b/block/blk-mq.c
index aa4b3c6082496bbe476e12c2fe16b6063e0c1bb3..8502e7495d58672477fd9e3bdfcdf0d07b12881b 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -36,6 +36,7 @@
 #include "blk-stat.h"
 #include "blk-mq-sched.h"
 #include "blk-rq-qos.h"
+#include "blk-io-hierarchy/stats.h"
 
 static bool blk_mq_poll(struct request_queue *q, blk_qc_t cookie);
 static void blk_mq_poll_stats_start(struct request_queue *q);
@@ -366,8 +367,13 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data,
 	RB_CLEAR_NODE(&rq->rb_node);
 	rq->rq_disk = NULL;
 	rq->part = NULL;
-	rq->start_time_ns = ktime_get_ns();
+	rq->start_time_ns = blk_time_get_ns();
+	blk_rq_init_bi_alloc_time(rq, NULL);
+	blk_mq_get_alloc_task(rq, data->bio);
+	blk_rq_hierarchy_stats_init(rq);
+
 	rq->io_start_time_ns = 0;
+	request_to_wrapper(rq)->io_end_time_ns = 0;
 	rq->nr_phys_segments = 0;
 #if defined(CONFIG_BLK_DEV_INTEGRITY)
 	rq->nr_integrity_segments = 0;
@@ -400,13 +406,13 @@ static struct request *blk_mq_get_request(struct request_queue *q,
 	struct elevator_queue *e = q->elevator;
 	struct request *rq;
 	unsigned int tag;
-	bool put_ctx_on_error = false;
+	bool clear_ctx_on_error = false;
 
 	blk_queue_enter_live(q);
 	data->q = q;
 	if (likely(!data->ctx)) {
 		data->ctx = blk_mq_get_ctx(q);
-		put_ctx_on_error = true;
+		clear_ctx_on_error = true;
 	}
 	if (likely(!data->hctx))
 		data->hctx = blk_mq_map_queue(q, data->ctx->cpu);
@@ -430,10 +436,8 @@ static struct request *blk_mq_get_request(struct request_queue *q,
 
 	tag = blk_mq_get_tag(data);
 	if (tag == BLK_MQ_TAG_FAIL) {
-		if (put_ctx_on_error) {
-			blk_mq_put_ctx(data->ctx);
+		if (clear_ctx_on_error)
 			data->ctx = NULL;
-		}
 		blk_queue_exit(q);
 		return NULL;
 	}
@@ -470,8 +474,6 @@ struct request *blk_mq_alloc_request(struct request_queue *q, unsigned int op,
 	if (!rq)
 		return ERR_PTR(-EWOULDBLOCK);
 
-	blk_mq_put_ctx(alloc_data.ctx);
-
 	rq->__data_len = 0;
 	rq->__sector = (sector_t) -1;
 	rq->bio = rq->biotail = NULL;
@@ -532,6 +534,8 @@ static void __blk_mq_free_request(struct request *rq)
 	struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu);
 	const int sched_tag = rq->internal_tag;
 
+	blk_rq_hierarchy_stats_complete(rq);
+	blk_mq_put_alloc_task(rq);
 	if (rq->tag != -1)
 		blk_mq_put_tag(hctx, hctx->tags, ctx, rq->tag);
 	if (sched_tag != -1)
@@ -576,13 +580,22 @@ EXPORT_SYMBOL_GPL(blk_mq_free_request);
 
 inline void __blk_mq_end_request(struct request *rq, blk_status_t error)
 {
-	u64 now = ktime_get_ns();
+	u64 now = request_to_wrapper(rq)->io_end_time_ns;
+
+	if (!now)
+		now = blk_time_get_ns();
 
 	if (rq->rq_flags & RQF_STATS) {
 		blk_mq_poll_stats_start(rq->q);
 		blk_stat_add(rq, now);
 	}
 
+	/*
+	 * Do not account a flush request with data twice, and skip requests
+	 * that were never started.
+	 */
+	if (blk_mq_request_started(rq) && !blk_rq_hierarchy_is_flush_done(rq))
+		rq_hierarchy_end_io_acct(rq, STAGE_RQ_DRIVER);
 	blk_account_io_done(rq, now);
 
 	if (rq->end_io) {
@@ -722,9 +735,10 @@ void blk_mq_start_request(struct request *rq)
 	blk_mq_sched_started_request(rq);
 
 	trace_block_rq_issue(q, rq);
+	rq_hierarchy_start_io_acct(rq, STAGE_RQ_DRIVER);
 
 	if (test_bit(QUEUE_FLAG_STATS, &q->queue_flags)) {
-		rq->io_start_time_ns = ktime_get_ns();
+		rq->io_start_time_ns = blk_time_get_ns();
 #ifdef CONFIG_BLK_DEV_THROTTLING_LOW
 		rq->throtl_size = blk_rq_sectors(rq);
 #endif
@@ -760,6 +774,7 @@ static void __blk_mq_requeue_request(struct request *rq)
 	if (blk_mq_request_started(rq)) {
 		WRITE_ONCE(rq->state, MQ_RQ_IDLE);
 		rq->rq_flags &= ~RQF_TIMED_OUT;
+		rq_hierarchy_end_io_acct(rq, STAGE_RQ_DRIVER);
 		if (q->dma_drain_size && blk_rq_bytes(rq))
 			rq->nr_phys_segments--;
 	}
@@ -787,6 +802,7 @@ static void blk_mq_requeue_work(struct work_struct *work)
 	spin_lock_irq(&q->requeue_lock);
 	list_splice_init(&q->requeue_list, &rq_list);
 	spin_unlock_irq(&q->requeue_lock);
+	rq_list_hierarchy_end_io_acct(&rq_list, STAGE_REQUEUE);
 
 	list_for_each_entry_safe(rq, next, &rq_list, queuelist) {
 		if (!(rq->rq_flags & (RQF_SOFTBARRIER | RQF_DONTPREP)))
@@ -826,6 +842,7 @@ void blk_mq_add_to_requeue_list(struct request *rq, bool at_head,
 	 */
 	BUG_ON(rq->rq_flags & RQF_SOFTBARRIER);
 
+	rq_hierarchy_start_io_acct(rq, STAGE_REQUEUE);
 	spin_lock_irqsave(&q->requeue_lock, flags);
 	if (at_head) {
 		rq->rq_flags |= RQF_SOFTBARRIER;
@@ -1317,6 +1334,7 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list,
 	if (!list_empty(list)) {
 		bool needs_restart;
 
+		rq_list_hierarchy_start_io_acct(list, STAGE_HCTX);
 		spin_lock(&hctx->lock);
 		list_splice_tail_init(list, &hctx->dispatch);
 		spin_unlock(&hctx->lock);
@@ -1726,6 +1744,7 @@ void blk_mq_request_bypass_insert(struct request *rq, bool at_head,
 	struct blk_mq_ctx *ctx = rq->mq_ctx;
 	struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(rq->q, ctx->cpu);
 
+	rq_hierarchy_start_io_acct(rq, STAGE_HCTX);
 	spin_lock(&hctx->lock);
 	if (at_head)
 		list_add(&rq->queuelist, &hctx->dispatch);
@@ -1792,6 +1811,8 @@ void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule)
 		if (rq->mq_ctx != this_ctx) {
 			if (this_ctx) {
 				trace_block_unplug(this_q, depth, !from_schedule);
+				rq_list_hierarchy_end_io_acct(&ctx_list,
+							      STAGE_PLUG);
 				blk_mq_sched_insert_requests(this_q, this_ctx,
 								&ctx_list,
 								from_schedule);
@@ -1812,6 +1833,7 @@ void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule)
 	 */
 	if (this_ctx) {
 		trace_block_unplug(this_q, depth, !from_schedule);
+		rq_list_hierarchy_end_io_acct(&ctx_list, STAGE_PLUG);
 		blk_mq_sched_insert_requests(this_q, this_ctx, &ctx_list,
 						from_schedule);
 	}
@@ -1975,7 +1997,10 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
 {
 	const int is_sync = op_is_sync(bio->bi_opf);
 	const int is_flush_fua = op_is_flush(bio->bi_opf);
-	struct blk_mq_alloc_data data = { .flags = 0 };
+	struct blk_mq_alloc_data data = {
+		.flags	= 0,
+		.bio	= bio
+	};
 	struct request *rq;
 	unsigned int request_count = 0;
 	struct blk_plug *plug;
@@ -1991,6 +2016,9 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
 
 	blk_queue_split(q, &bio);
 
+	/* account for split bio. */
+	bio_hierarchy_start(bio);
+
 	if (!bio_integrity_prep(bio))
 		return BLK_QC_T_NONE;
 
@@ -2019,7 +2047,6 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
 
 	plug = current->plug;
 	if (unlikely(is_flush_fua)) {
-		blk_mq_put_ctx(data.ctx);
 		blk_mq_bio_to_request(rq, bio);
 
 		/* bypass scheduler for flush rq */
@@ -2028,7 +2055,6 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
 	} else if (plug && q->nr_hw_queues == 1) {
 		struct request *last = NULL;
 
-		blk_mq_put_ctx(data.ctx);
 		blk_mq_bio_to_request(rq, bio);
 
 		/*
@@ -2051,6 +2077,7 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
 			trace_block_plug(q);
 		}
 
+		rq_hierarchy_start_io_acct(rq, STAGE_PLUG);
 		list_add_tail(&rq->queuelist, &plug->mq_list);
 	} else if (plug && !blk_queue_nomerges(q)) {
 		blk_mq_bio_to_request(rq, bio);
@@ -2066,23 +2093,21 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
 			same_queue_rq = NULL;
 		if (same_queue_rq)
 			list_del_init(&same_queue_rq->queuelist);
+		rq_hierarchy_start_io_acct(rq, STAGE_PLUG);
 		list_add_tail(&rq->queuelist, &plug->mq_list);
 
-		blk_mq_put_ctx(data.ctx);
-
 		if (same_queue_rq) {
 			data.hctx = blk_mq_map_queue(q,
 					same_queue_rq->mq_ctx->cpu);
+			rq_hierarchy_end_io_acct(same_queue_rq, STAGE_PLUG);
 			blk_mq_try_issue_directly(data.hctx, same_queue_rq,
 					&cookie);
 		}
 	} else if ((q->nr_hw_queues > 1 && is_sync) || (!q->elevator &&
 			!data.hctx->dispatch_busy)) {
-		blk_mq_put_ctx(data.ctx);
 		blk_mq_bio_to_request(rq, bio);
 		blk_mq_try_issue_directly(data.hctx, rq, &cookie);
 	} else {
-		blk_mq_put_ctx(data.ctx);
 		blk_mq_bio_to_request(rq, bio);
 		blk_mq_sched_insert_request(rq, false, true, true);
 	}
@@ -2240,7 +2265,8 @@ int blk_mq_alloc_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
 	 * rq_size is the size of the request plus driver payload, rounded
 	 * to the cacheline size
 	 */
-	rq_size = round_up(sizeof(struct request) + set->cmd_size,
+	rq_size = round_up(sizeof(struct request) +
+			   sizeof(struct request_wrapper) + set->cmd_size,
 				cache_line_size());
 	left = rq_size * depth;
 
@@ -2281,7 +2307,7 @@ int blk_mq_alloc_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
 		to_do = min(entries_per_page, depth - i);
 		left -= to_do * rq_size;
 		for (j = 0; j < to_do; j++) {
-			struct request *rq = p;
+			struct request *rq = p + sizeof(struct request_wrapper);
 
 			tags->static_rqs[i] = rq;
 			if (blk_mq_init_request(set, rq, hctx_idx, node)) {
@@ -2324,6 +2350,7 @@ static int blk_mq_hctx_notify_dead(unsigned int cpu, struct hlist_node *node)
 	if (list_empty(&tmp))
 		return 0;
 
+	rq_list_hierarchy_start_io_acct(&tmp, STAGE_HCTX);
 	spin_lock(&hctx->lock);
 	list_splice_tail_init(&tmp, &hctx->dispatch);
 	spin_unlock(&hctx->lock);
@@ -2758,6 +2785,9 @@ void blk_mq_release(struct request_queue *q)
 	struct blk_mq_hw_ctx *hctx, *next;
 	int i;
 
+	blk_mq_unregister_hierarchy(q, STAGE_BIO);
+	blk_io_hierarchy_stats_free(q);
+
 	queue_for_each_hw_ctx(q, hctx, i)
 		WARN_ON_ONCE(hctx && list_empty(&hctx->hctx_list));
 
@@ -2895,14 +2925,17 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
 	/* mark the queue as mq asap */
 	q->mq_ops = set->ops;
 
+	if (blk_io_hierarchy_stats_alloc(q))
+		goto err_exit;
+
 	q->poll_cb = blk_stat_alloc_callback(blk_mq_poll_stats_fn,
 					     blk_mq_poll_stats_bkt,
 					     BLK_MQ_POLL_STATS_BKTS, q);
 	if (!q->poll_cb)
-		goto err_exit;
+		goto err_hierarchy_exit;
 
 	if (blk_mq_alloc_ctxs(q))
-		goto err_exit;
+		goto err_hierarchy_exit;
 
 	/* init q->mq_kobj and sw queues' kobjects */
 	blk_mq_sysfs_init(q);
@@ -2972,6 +3005,8 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
 	q->nr_hw_queues = 0;
 err_sys_init:
 	blk_mq_sysfs_deinit(q);
+err_hierarchy_exit:
+	blk_io_hierarchy_stats_free(q);
 err_exit:
 	q->mq_ops = NULL;
 	return ERR_PTR(-ENOMEM);
diff --git a/block/blk-mq.h b/block/blk-mq.h
index c6ec9aa12fb2a11f488a0d2dfb9e532bea3b4482..80ad4bc91fa283a634172004a3219872a8b2921f 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -36,6 +36,26 @@ struct blk_mq_ctx {
 	struct kobject		kobj;
 } ____cacheline_aligned_in_smp;
 
+struct request_wrapper {
+	u64 io_end_time_ns;	/* cached end time; 0 = not taken */
+#ifdef CONFIG_BLK_BIO_ALLOC_TIME
+	u64 bi_alloc_time_ns;
+#endif
+#ifdef CONFIG_BLK_BIO_ALLOC_TASK
+	struct pid *pid;	/* ref held, see blk_mq_get_alloc_task() */
+#endif
+#ifdef CONFIG_BLK_IO_HIERARCHY_STATS
+	bool flush_done;
+	enum stage_group stage;	/* NR_RQ_STAGE_GROUPS if in no stage */
+	unsigned long hierarchy_time;
+#endif
+} ____cacheline_aligned_in_smp;
+
+static inline struct request_wrapper *request_to_wrapper(void *rq)
+{
+	return rq - sizeof(struct request_wrapper);	/* sits before rq */
+}
+
 void blk_mq_freeze_queue(struct request_queue *q);
 void blk_mq_exit_queue(struct request_queue *q);
 int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr);
@@ -125,12 +145,7 @@ static inline struct blk_mq_ctx *__blk_mq_get_ctx(struct request_queue *q,
  */
 static inline struct blk_mq_ctx *blk_mq_get_ctx(struct request_queue *q)
 {
-	return __blk_mq_get_ctx(q, get_cpu());
-}
-
-static inline void blk_mq_put_ctx(struct blk_mq_ctx *ctx)
-{
-	put_cpu();
+	return __blk_mq_get_ctx(q, raw_smp_processor_id());
 }
 
 struct blk_mq_alloc_data {
@@ -142,6 +157,10 @@ struct blk_mq_alloc_data {
 	/* input & output parameter */
 	struct blk_mq_ctx *ctx;
 	struct blk_mq_hw_ctx *hctx;
+
+#ifndef __GENKSYMS__
+	struct bio *bio;
+#endif
 };
 
 static inline struct blk_mq_tags *blk_mq_tags_from_data(struct blk_mq_alloc_data *data)
@@ -234,4 +253,29 @@ static inline void blk_mq_free_requests(struct list_head *list)
 	}
 }
 
+#ifdef CONFIG_BLK_BIO_ALLOC_TASK
+static inline void blk_mq_get_alloc_task(struct request *rq, struct bio *bio)
+{
+	request_to_wrapper(rq)->pid = bio ? get_pid(bio->pid) :
+					    get_pid(task_pid(current));
+}
+
+static inline void blk_mq_put_alloc_task(struct request *rq)
+{
+	struct request_wrapper *rq_wrapper = request_to_wrapper(rq);
+
+	if (rq_wrapper->pid) {
+		put_pid(rq_wrapper->pid);
+		rq_wrapper->pid = NULL;
+	}
+}
+#else
+static inline void blk_mq_get_alloc_task(struct request *rq, struct bio *bio)
+{
+}
+static inline void blk_mq_put_alloc_task(struct request *rq)
+{
+}
+#endif
+
 #endif
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 1c4d795bbdc47df56b39a71aa4a20640f1e1a2fb..719687a394eaf6afa7fd32ab664deb1df892b82a 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -17,6 +17,7 @@
 #include "blk-mq.h"
 #include "blk-mq-debugfs.h"
 #include "blk-wbt.h"
+#include "blk-io-hierarchy/stats.h"
 
 struct queue_sysfs_entry {
 	struct attribute attr;
@@ -924,6 +925,19 @@ struct kobj_type blk_queue_ktype = {
 	.release	= blk_release_queue,
 };
 
+/* Register the default hierarchy stages; only blk-mq queues have them. */
+static void blk_mq_register_default_hierarchy(struct request_queue *q)
+{
+	static const enum stage_group stages[] = { STAGE_GETTAG, STAGE_PLUG,
+		STAGE_HCTX, STAGE_REQUEUE, STAGE_RQ_DRIVER, STAGE_BIO };
+	unsigned int i;
+
+	if (!q->mq_ops)
+		return;
+	for (i = 0; i < ARRAY_SIZE(stages); i++)
+		blk_mq_register_hierarchy(q, stages[i]);
+}
+
 /**
  * blk_register_queue - register a block layer queue with sysfs
  * @disk: Disk of which the request queue should be registered with sysfs.
@@ -973,6 +987,8 @@ int blk_register_queue(struct gendisk *disk)
 		has_elevator = true;
 	}
 
+	blk_mq_register_default_hierarchy(q);
+
 	blk_queue_flag_set(QUEUE_FLAG_REGISTERED, q);
 	wbt_enable_default(q);
 	blk_throtl_register_queue(q);
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index 5981912865572f455f63750d009473f0a0a372fc..a1867a2f4f181c164a163b75049f402fef6ae0ad 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -14,6 +14,7 @@
 #include <linux/sched/signal.h>
 #include <linux/delay.h>
 #include "blk.h"
+#include "blk-io-hierarchy/stats.h"
 
 /* Max dispatch from a group in 1 round */
 static int throtl_grp_quantum = 8;
@@ -1350,6 +1351,8 @@ static void blk_throtl_dispatch_work_fn(struct work_struct *work)
 			bio_list_add(&bio_list_on_stack, bio);
 	spin_unlock_irq(q->queue_lock);
 
+	bio_list_hierarchy_end_io_acct(&bio_list_on_stack, STAGE_THROTTLE);
+
 	if (!bio_list_empty(&bio_list_on_stack)) {
 		blk_start_plug(&plug);
 		while((bio = bio_list_pop(&bio_list_on_stack)))
@@ -1910,7 +1913,7 @@ static bool throtl_tg_is_idle(struct throtl_grp *tg)
 	time = min_t(unsigned long, MAX_IDLE_TIME, 4 * tg->idletime_threshold);
 	ret = tg->latency_target == DFL_LATENCY_TARGET ||
 	      tg->idletime_threshold == DFL_IDLE_THRESHOLD ||
-	      (ktime_get_ns() >> 10) - tg->last_finish_time > time ||
+	      (blk_time_get_ns() >> 10) - tg->last_finish_time > time ||
 	      tg->avg_idletime > tg->idletime_threshold ||
 	      (tg->latency_target && tg->bio_cnt &&
 		tg->bad_bio_cnt * 5 < tg->bio_cnt);
@@ -2140,7 +2143,7 @@ static void throtl_downgrade_check(struct throtl_grp *tg)
 
 static void blk_throtl_update_idletime(struct throtl_grp *tg)
 {
-	unsigned long now = ktime_get_ns() >> 10;
+	unsigned long now = blk_time_get_ns() >> 10;
 	unsigned long last_finish_time = tg->last_finish_time;
 
 	if (now <= last_finish_time || last_finish_time == 0 ||
@@ -2333,6 +2336,20 @@ bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg,
 
 	tg->last_low_overflow_time[rw] = jiffies;
 
+	/*
+	 * This is the slow path; bio_hierarchy_start_io_acct() might spend
+	 * some time allocating memory. Dropping the locks here is safe
+	 * because 'tg' is pinned by this bio, and io charge should still be
+	 * accurate because the slice was already started by tg_may_dispatch().
+	 */
+	spin_unlock_irq(q->queue_lock);
+	rcu_read_unlock();
+
+	bio_hierarchy_start_io_acct(bio, STAGE_THROTTLE);
+
+	rcu_read_lock();
+	spin_lock_irq(q->queue_lock);
+
 	td->nr_queued[rw]++;
 	throtl_add_bio_tg(bio, qn, tg);
 	throttled = true;
@@ -2403,7 +2420,7 @@ void blk_throtl_bio_endio(struct bio *bio)
 		return;
 	tg = blkg_to_tg(blkg);
 
-	finish_time_ns = ktime_get_ns();
+	finish_time_ns = blk_time_get_ns();
 	tg->last_finish_time = finish_time_ns >> 10;
 
 	start_time = bio_issue_time(&bio->bi_issue) >> 10;
@@ -2505,6 +2522,8 @@ void blk_throtl_drain(struct request_queue *q)
 			bio_list_add(&bio_list_on_stack, bio);
 	spin_unlock_irq(q->queue_lock);
 
+	bio_list_hierarchy_end_io_acct(&bio_list_on_stack, STAGE_THROTTLE);
+
 	if (!bio_list_empty(&bio_list_on_stack))
 		while ((bio = bio_list_pop(&bio_list_on_stack)))
 			generic_make_request(bio);
@@ -2561,6 +2580,8 @@ void blk_throtl_exit(struct request_queue *q)
 	del_timer_sync(&q->td->service_queue.pending_timer);
 	throtl_shutdown_wq(q);
 	blkcg_deactivate_policy(q, &blkcg_policy_throtl);
+	blk_mq_unregister_hierarchy(q, STAGE_THROTTLE);
+
 	free_percpu(q->td->latency_buckets[READ]);
 	free_percpu(q->td->latency_buckets[WRITE]);
 	kfree(q->td);
@@ -2593,6 +2614,8 @@ void blk_throtl_register_queue(struct request_queue *q)
 	td->track_bio_latency = !queue_is_rq_based(q);
 	if (!td->track_bio_latency)
 		blk_stat_enable_accounting(q);
+
+	blk_mq_register_hierarchy(q, STAGE_THROTTLE);
 }
 
 #ifdef CONFIG_BLK_DEV_THROTTLING_LOW
diff --git a/block/blk-wbt.c b/block/blk-wbt.c
index 94b5eff0cd3aea56cae5be436b49da1ca428feac..cf098d2a7262c094345d386f7da8cda87900de0b 100644
--- a/block/blk-wbt.c
+++ b/block/blk-wbt.c
@@ -25,10 +25,14 @@
 #include <linux/swap.h>
 #ifndef __GENKSYMS__
 #include <linux/blk-mq.h>
+#include "blk.h"
 #endif
 
 #include "blk-wbt.h"
 #include "blk-rq-qos.h"
+#ifndef __GENKSYMS__
+#include "blk-io-hierarchy/stats.h"
+#endif
 
 #define CREATE_TRACE_POINTS
 #include <trace/events/wbt.h>
@@ -223,7 +227,7 @@ static u64 rwb_sync_issue_lat(struct rq_wb *rwb)
 	if (!issue || !rwb->sync_cookie)
 		return 0;
 
-	now = ktime_to_ns(ktime_get());
+	now = blk_time_get_ns();
 	return now - issue;
 }
 
@@ -533,11 +537,12 @@ static int wbt_wake_function(struct wait_queue_entry *curr, unsigned int mode,
  * the timer to kick off queuing again.
  */
 static void __wbt_wait(struct rq_wb *rwb, enum wbt_flags wb_acct,
-		       unsigned long rw, spinlock_t *lock)
+		       struct bio *bio, spinlock_t *lock)
 	__releases(lock)
 	__acquires(lock)
 {
 	struct rq_wait *rqw = get_rq_wait(rwb, wb_acct);
+	unsigned long rw = bio->bi_opf;
 	struct wbt_wait_data data = {
 		.wq = {
 			.func	= wbt_wake_function,
@@ -554,6 +559,7 @@ static void __wbt_wait(struct rq_wb *rwb, enum wbt_flags wb_acct,
 	if (!has_sleeper && rq_wait_inc_below(rqw, get_limit(rwb, rw)))
 		return;
 
+	bio_hierarchy_start_io_acct(bio, STAGE_WBT);
 	has_sleeper = !__prepare_to_wait_exclusive(&rqw->wait, &data.wq,
 						 TASK_UNINTERRUPTIBLE);
 	do {
@@ -588,6 +594,7 @@ static void __wbt_wait(struct rq_wb *rwb, enum wbt_flags wb_acct,
 	} while (1);
 
 	finish_wait(&rqw->wait, &data.wq);
+	bio_hierarchy_end_io_acct(bio, STAGE_WBT);
 }
 
 static inline bool wbt_should_throttle(struct rq_wb *rwb, struct bio *bio)
@@ -652,7 +659,7 @@ static void wbt_wait(struct rq_qos *rqos, struct bio *bio, spinlock_t *lock)
 		return;
 	}
 
-	__wbt_wait(rwb, flags, bio->bi_opf, lock);
+	__wbt_wait(rwb, flags, bio, lock);
 
 	if (!blk_stat_is_active(rwb->cb))
 		rwb_arm_timer(rwb);
@@ -770,6 +777,7 @@ static void wbt_exit(struct rq_qos *rqos)
 	struct rq_wb *rwb = RQWB(rqos);
 	struct request_queue *q = rqos->q;
 
+	blk_mq_unregister_hierarchy(q, STAGE_WBT);
 	blk_stat_remove_callback(q, rwb->cb);
 	blk_stat_free_callback(rwb->cb);
 	kfree(rwb);
@@ -848,6 +856,7 @@ int wbt_init(struct request_queue *q)
 
 	blk_mq_unfreeze_queue(q);
 	wbt_set_write_cache(q, test_bit(QUEUE_FLAG_WC, &q->queue_flags));
+	blk_mq_register_hierarchy(q, STAGE_WBT);
 
 	return 0;
 }
diff --git a/block/blk.h b/block/blk.h
index 965e9c507654e0dac626289eabffb883e20b16f1..99a57be837654b0722158d73f1f07c75886ea932 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -55,10 +55,13 @@ struct request_queue_wrapper {
 	int __percpu		*last_dispatch_cpu;
 #endif
 	struct mutex		sysfs_dir_lock;
+#ifdef CONFIG_BLK_IO_HIERARCHY_STATS
+	struct blk_io_hierarchy_stats *io_hierarchy_stats;
+#endif
 };
 
-#define queue_to_wrapper(q) \
-	container_of(q, struct request_queue_wrapper, q)
+#define queue_to_wrapper(__q) \
+	container_of((__q), struct request_queue_wrapper, q)
 
 extern struct kmem_cache *blk_requestq_cachep;
 extern struct kmem_cache *request_cachep;
@@ -147,6 +150,64 @@ static inline void __blk_get_queue(struct request_queue *q)
 	kobject_get(&q->kobj);
 }
 
+#ifdef CONFIG_BLK_BIO_ALLOC_TIME
+static inline u64 blk_time_get_ns(void);
+static inline void blk_rq_init_bi_alloc_time(struct request *rq,
+					     struct request *first_rq)
+{
+	if (!rq->q->mq_ops)
+		return;
+
+	request_to_wrapper(rq)->bi_alloc_time_ns =
+		first_rq ? request_to_wrapper(first_rq)->bi_alloc_time_ns :
+			   blk_time_get_ns();
+}
+
+/*
+ * Lower request bi_alloc_time_ns to that of @bio or @merged_rq if theirs is
+ * smaller (no-op when the queue has no mq_ops). Used in the following cases:
+ * 1) Allocate a new @rq for @bio;
+ * 2) @bio is merged to @rq, in this case @merged_rq should be NULL;
+ * 3) @merged_rq is merged to @rq, in this case @bio should be NULL;
+ */
+static inline void blk_rq_update_bi_alloc_time(struct request *rq,
+					       struct bio *bio,
+					       struct request *merged_rq)
+{
+	struct request_wrapper *rq_wrapper;
+	struct request_wrapper *merged_rq_wrapper;
+
+	if (!rq->q->mq_ops)
+		return;
+
+	rq_wrapper = request_to_wrapper(rq);
+	if (bio) {
+		if (rq_wrapper->bi_alloc_time_ns > bio->bi_alloc_time_ns)
+			rq_wrapper->bi_alloc_time_ns = bio->bi_alloc_time_ns;
+		return;
+	}
+
+	if (WARN_ON_ONCE(!merged_rq))
+		return;
+
+	merged_rq_wrapper = request_to_wrapper(merged_rq);
+	if (rq_wrapper->bi_alloc_time_ns > merged_rq_wrapper->bi_alloc_time_ns)
+		rq_wrapper->bi_alloc_time_ns =
+			merged_rq_wrapper->bi_alloc_time_ns;
+}
+#else /* CONFIG_BLK_BIO_ALLOC_TIME */
+static inline void blk_rq_init_bi_alloc_time(struct request *rq,
+					     struct request *first_rq)
+{
+}
+
+static inline void blk_rq_update_bi_alloc_time(struct request *rq,
+					       struct bio *bio,
+					       struct request *merged_rq)
+{
+}
+#endif /* CONFIG_BLK_BIO_ALLOC_TIME */
+
 bool is_flush_rq(struct request *req);
 
 struct blk_flush_queue *blk_alloc_flush_queue(struct request_queue *q,
@@ -479,4 +540,28 @@ static inline void blk_free_queue_dispatch_async(struct request_queue *q)
 }
 #endif
 
+static inline u64 blk_time_get_ns(void)
+{
+	struct task_struct *tsk = current;
+	struct blk_plug *plug = tsk->plug;
+
+	if (!plug || !in_task())
+		return ktime_get_ns();
+
+	/*
+	 * 0 could very well be a valid time, but rather than flag "this is
+	 * a valid timestamp" separately, just accept that we'll do an extra
+	 * ktime_get_ns() if we just happen to get 0 as the current time.
+	 */
+	if (!tsk->_resvd->cur_ktime) {
+		tsk->_resvd->cur_ktime = ktime_get_ns();
+		tsk->flags |= PF_BLOCK_TS;
+	}
+	return tsk->_resvd->cur_ktime;
+}
+
+static inline ktime_t blk_time_get(void)
+{
+	return ns_to_ktime(blk_time_get_ns());
+}
 #endif /* BLK_INTERNAL_H */
diff --git a/block/kyber-iosched.c b/block/kyber-iosched.c
index 833e9eaae640bf74612599ba5230ece52a02af56..fe0cb5ab76af6636af7b3feed634c2110d30609f 100644
--- a/block/kyber-iosched.c
+++ b/block/kyber-iosched.c
@@ -30,6 +30,7 @@
 #include "blk-mq-sched.h"
 #include "blk-mq-tag.h"
 #include "blk-stat.h"
+#include "blk-io-hierarchy/stats.h"
 
 /* Scheduling domains. */
 enum {
@@ -365,6 +366,7 @@ static int kyber_init_sched(struct request_queue *q, struct elevator_type *e)
 
 	blk_stat_add_callback(q, kqd->cb);
 
+	blk_mq_register_hierarchy(q, STAGE_KYBER);
 	return 0;
 }
 
@@ -374,6 +376,7 @@ static void kyber_exit_sched(struct elevator_queue *e)
 	struct request_queue *q = kqd->q;
 	int i;
 
+	blk_mq_unregister_hierarchy(q, STAGE_KYBER);
 	blk_stat_remove_callback(q, kqd->cb);
 
 	for (i = 0; i < KYBER_NUM_DOMAINS; i++)
@@ -517,7 +520,6 @@ static bool kyber_bio_merge(struct blk_mq_hw_ctx *hctx_q, struct bio *bio)
 	spin_lock(&kcq->lock);
 	merged = blk_mq_bio_list_merge(hctx->queue, rq_list, bio);
 	spin_unlock(&kcq->lock);
-	blk_mq_put_ctx(ctx);
 
 	return merged;
 }
@@ -533,6 +535,7 @@ static void kyber_insert_requests(struct blk_mq_hw_ctx *hctx,
 	struct kyber_hctx_data *khd = hctx->sched_data;
 	struct request *rq, *next;
 
+	rq_list_hierarchy_start_io_acct(rq_list, STAGE_KYBER);
 	list_for_each_entry_safe(rq, next, rq_list, queuelist) {
 		unsigned int sched_domain = kyber_sched_domain(rq->cmd_flags);
 		struct kyber_ctx_queue *kcq = &khd->kcqs[rq->mq_ctx->index_hw];
@@ -584,7 +587,7 @@ static void kyber_completed_request(struct request *rq)
 	if (blk_stat_is_active(kqd->cb))
 		return;
 
-	now = ktime_get_ns();
+	now = blk_time_get_ns();
 	if (now < rq->io_start_time_ns)
 		return;
 
@@ -772,6 +775,9 @@ static struct request *kyber_dispatch_request(struct blk_mq_hw_ctx *hctx)
 	rq = NULL;
 out:
 	spin_unlock(&khd->lock);
+
+	if (rq)
+		rq_hierarchy_end_io_acct(rq, STAGE_KYBER);
 	return rq;
 }
 
diff --git a/block/mq-deadline.c b/block/mq-deadline.c
index 7ad820050675395857cd139d65f44f49135b5668..aa51abb3eaa4e7c6bf3615d120f19ca299319d74 100644
--- a/block/mq-deadline.c
+++ b/block/mq-deadline.c
@@ -22,6 +22,7 @@
 #include "blk-mq-debugfs.h"
 #include "blk-mq-tag.h"
 #include "blk-mq-sched.h"
+#include "blk-io-hierarchy/stats.h"
 
 /*
  * See Documentation/block/deadline-iosched.txt
@@ -61,6 +62,8 @@ struct deadline_data {
 	spinlock_t lock;
 	spinlock_t zone_lock;
 	struct list_head dispatch;
+
+	struct request_queue *q;
 };
 
 static inline struct rb_root *
@@ -386,6 +389,8 @@ static struct request *dd_dispatch_request(struct blk_mq_hw_ctx *hctx)
 	rq = __dd_dispatch_request(dd);
 	spin_unlock(&dd->lock);
 
+	if (rq)
+		rq_hierarchy_end_io_acct(rq, STAGE_DEADLINE);
 	return rq;
 }
 
@@ -396,6 +401,7 @@ static void dd_exit_queue(struct elevator_queue *e)
 	BUG_ON(!list_empty(&dd->fifo_list[READ]));
 	BUG_ON(!list_empty(&dd->fifo_list[WRITE]));
 
+	blk_mq_unregister_hierarchy(dd->q, STAGE_DEADLINE);
 	kfree(dd);
 }
 
@@ -427,11 +433,13 @@ static int dd_init_queue(struct request_queue *q, struct elevator_type *e)
 	dd->writes_starved = writes_starved;
 	dd->front_merges = 1;
 	dd->fifo_batch = fifo_batch;
+	dd->q = q;
 	spin_lock_init(&dd->lock);
 	spin_lock_init(&dd->zone_lock);
 	INIT_LIST_HEAD(&dd->dispatch);
 
 	q->elevator = eq;
+	blk_mq_register_hierarchy(q, STAGE_DEADLINE);
 	return 0;
 }
 
@@ -469,8 +477,10 @@ static bool dd_bio_merge(struct blk_mq_hw_ctx *hctx, struct bio *bio)
 	ret = blk_mq_sched_try_merge(q, bio, &free);
 	spin_unlock(&dd->lock);
 
-	if (free)
+	if (free) {
+		rq_hierarchy_end_io_acct(free, STAGE_DEADLINE);
 		blk_mq_free_request(free);
+	}
 
 	return ret;
 }
@@ -493,6 +503,7 @@ static void dd_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
 	blk_req_zone_write_unlock(rq);
 
 	if (blk_mq_sched_try_insert_merge(q, rq, &free)) {
+		rq_list_hierarchy_end_io_acct(&free, STAGE_DEADLINE);
 		blk_mq_free_requests(&free);
 		return;
 	}
@@ -527,6 +538,8 @@ static void dd_insert_requests(struct blk_mq_hw_ctx *hctx,
 	struct request_queue *q = hctx->queue;
 	struct deadline_data *dd = q->elevator->elevator_data;
 
+	rq_list_hierarchy_start_io_acct(list, STAGE_DEADLINE);
+
 	spin_lock(&dd->lock);
 	while (!list_empty(list)) {
 		struct request *rq;
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 8075b9955bb3cfd00132a618afe5d87ad46d718b..f77227229f58d7a5aa1a516fde7233e070f4c022 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -203,9 +203,21 @@ struct bio {
 
 	struct bio_set		*bi_pool;
 
+#if defined(CONFIG_BLK_BIO_ALLOC_TIME) && !defined(__GENKSYMS__)
+	u64			bi_alloc_time_ns;
+#else
 	KABI_RESERVE(1)
+#endif
+#if defined(CONFIG_BLK_BIO_ALLOC_TASK) && !defined(__GENKSYMS__)
+	struct pid		*pid;
+#else
 	KABI_RESERVE(2)
+#endif
+#if defined(CONFIG_BLK_IO_HIERARCHY_STATS) && !defined(__GENKSYMS__)
+	struct bio_hierarchy_data *hdata;
+#else
 	KABI_RESERVE(3)
+#endif
 
 	/*
 	 * We can inline a number of vecs at the end of the bio, to avoid
@@ -220,6 +232,12 @@ struct bio {
 /*
  * bio flags
  */
+#ifdef CONFIG_BLK_IO_HIERARCHY_STATS
+#define BIO_HIERARCHY_ACCT 0	/*
+				 * This bio has already been subjected to
+				 * blk-io-hierarchy, don't do it again.
+				 */
+#endif
 #define BIO_SEG_VALID	1	/* bi_phys_segments valid */
 #define BIO_CLONED	2	/* doesn't own data */
 #define BIO_BOUNCED	3	/* bio is a bounce bio */
@@ -334,6 +352,9 @@ enum req_flag_bits {
 	/* for driver use */
 	__REQ_DRV,
 	__REQ_SWAP,		/* swapping request. */
+#ifdef CONFIG_BLK_IO_HIERARCHY_STATS
+	_REQ_HAS_DATA,		/* io contain data. */
+#endif
 	__REQ_NR_BITS,		/* stops here */
 };
 
@@ -356,6 +377,9 @@ enum req_flag_bits {
 
 #define REQ_DRV			(1ULL << __REQ_DRV)
 #define REQ_SWAP		(1ULL << __REQ_SWAP)
+#ifdef CONFIG_BLK_IO_HIERARCHY_STATS
+#define REQ_HAS_DATA		(1ULL << _REQ_HAS_DATA)
+#endif /* CONFIG_BLK_IO_HIERARCHY_STATS */
 
 #define REQ_FAILFAST_MASK \
 	(REQ_FAILFAST_DEV | REQ_FAILFAST_TRANSPORT | REQ_FAILFAST_DRIVER)
@@ -368,7 +392,36 @@ enum stat_group {
 	STAT_WRITE,
 	STAT_DISCARD,
 
-	NR_STAT_GROUPS
+	NR_STAT_GROUPS,
+	STAT_FLUSH = NR_STAT_GROUPS,
+	NR_NEW_STAT_GROUPS,
+};
+
+enum stage_group {
+#ifdef CONFIG_BLK_DEV_THROTTLING
+	STAGE_THROTTLE,
+#endif
+#ifdef CONFIG_BLK_WBT
+	STAGE_WBT,
+#endif
+	STAGE_GETTAG,
+	NR_BIO_STAGE_GROUPS,
+	STAGE_PLUG = NR_BIO_STAGE_GROUPS,
+#if IS_ENABLED(CONFIG_MQ_IOSCHED_DEADLINE)
+	STAGE_DEADLINE,
+#endif
+#if IS_ENABLED(CONFIG_IOSCHED_BFQ)
+	STAGE_BFQ,
+#endif
+#if IS_ENABLED(CONFIG_MQ_IOSCHED_KYBER)
+	STAGE_KYBER,
+#endif
+	STAGE_HCTX,
+	STAGE_REQUEUE,
+	STAGE_RQ_DRIVER,
+	NR_RQ_STAGE_GROUPS,
+	STAGE_BIO = NR_RQ_STAGE_GROUPS,
+	NR_STAGE_GROUPS,
 };
 
 #define bio_op(bio) \
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index c848f4205729ec722877f2b324ed68a890e50546..241f59eb5b64ad04e579312cf01578798063650b 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1435,6 +1435,18 @@ static inline struct request *blk_map_queue_find_tag(struct blk_queue_tag *bqt,
 	return bqt->tag_index[tag];
 }
 
+/*
+ * Drop the timestamp cached by blk_time_get_ns(); tsk == current here.
+ */
+static inline void blk_plug_invalidate_ts(struct task_struct *tsk)
+{
+	struct blk_plug *plug = tsk->plug;
+
+	if (plug)
+		tsk->_resvd->cur_ktime = 0;
+	tsk->flags &= ~PF_BLOCK_TS;
+}
+
 extern int blkdev_issue_flush(struct block_device *, gfp_t, sector_t *);
 extern int blkdev_issue_write_same(struct block_device *bdev, sector_t sector,
 		sector_t nr_sects, gfp_t gfp_mask, struct page *page);
@@ -2150,6 +2162,10 @@ static inline bool blk_needs_flush_plug(struct task_struct *tsk)
 	return false;
 }
 
+static inline void blk_plug_invalidate_ts(struct task_struct *tsk)
+{
+}
+
 static inline int blkdev_issue_flush(struct block_device *bdev, gfp_t gfp_mask,
 				     sector_t *error_sector)
 {
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 26255b76ca525abe952c8932722a8348cd7fb004..d2eceea955b0968d6d90ef2cbb0a55f0386fd974 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -635,6 +635,7 @@ struct task_struct_resvd {
 #ifdef CONFIG_QOS_SCHED_SMART_GRID
 	struct sched_grid_qos	*grid_qos;
 #endif
+	u64 cur_ktime;
 };
 
 struct task_struct {
@@ -1495,6 +1496,7 @@ extern struct pid *cad_pid;
 #define PF_UCE_KERNEL_RECOVERY	0x02000000	/* Task in uce kernel recovery state */
 #define PF_NO_SETAFFINITY	0x04000000	/* Userland is not allowed to meddle with cpus_allowed */
 #define PF_MCE_EARLY		0x08000000      /* Early kill for mce process policy */
+#define PF_BLOCK_TS		0x10000000      /* plug has ts that needs updating */
 #define PF_IO_WORKER		0x20000000	/* Task is an IO worker */
 #define PF_FREEZER_SKIP		0x40000000	/* Freezer should not count it as freezable */
 #define PF_SUSPEND_TASK		0x80000000      /* This thread called freeze_processes() and should not be frozen */
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index fe9f91f39e2fb6dd9e9ac1718560e93f675d33e1..e37428598155eafe2ef7c233c1d887d585407e7a 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3581,10 +3581,12 @@ static inline void sched_submit_work(struct task_struct *tsk)
 
 static void sched_update_worker(struct task_struct *tsk)
 {
-	if (tsk->flags & (PF_WQ_WORKER | PF_IO_WORKER)) {
+	if (tsk->flags & (PF_WQ_WORKER | PF_IO_WORKER | PF_BLOCK_TS)) {
+		if (tsk->flags & PF_BLOCK_TS)
+			blk_plug_invalidate_ts(tsk);
 		if (tsk->flags & PF_WQ_WORKER)
 			wq_worker_running(tsk);
-		else
+		else if (tsk->flags & PF_IO_WORKER)
 			io_wq_worker_running(tsk);
 	}
 }