diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst
index 394fb18c9e3dc1cb9ebdaddf77dd8a6ad9395068..20c35c289253b3d1348e02b009a185cffbe8f8a7 100644
--- a/Documentation/admin-guide/cgroup-v2.rst
+++ b/Documentation/admin-guide/cgroup-v2.rst
@@ -951,6 +951,12 @@ All cgroup core files are prefixed with "cgroup."
	it's possible to delete a frozen (and empty) cgroup, as well as
	create new sub-cgroups.
 
+  irq.pressure
+	A read-write nested-keyed file.
+
+	Shows pressure stall information for IRQ/SOFTIRQ. See
+	:ref:`Documentation/accounting/psi.rst <psi>` for details.
+
 Controllers
 ===========
 
diff --git a/arch/arm64/configs/openeuler_defconfig b/arch/arm64/configs/openeuler_defconfig
index 34e593350ee775389bc345e84aacf7f363dd2460..7f3daf89bb8edc220cc82a4c19150342d0c01f23 100644
--- a/arch/arm64/configs/openeuler_defconfig
+++ b/arch/arm64/configs/openeuler_defconfig
@@ -104,6 +104,8 @@ CONFIG_TASK_XACCT=y
 CONFIG_TASK_IO_ACCOUNTING=y
 CONFIG_PSI=y
 CONFIG_PSI_DEFAULT_DISABLED=y
+CONFIG_PSI_CGROUP_V1=y
+CONFIG_PSI_FINE_GRAINED=y
 # end of CPU/Task time and stats accounting
 
 CONFIG_CPU_ISOLATION=y
diff --git a/arch/x86/configs/openeuler_defconfig b/arch/x86/configs/openeuler_defconfig
index 82e806651b996b4c117eb2b0e226b1ac928b3f2b..28920c2ed40b45529b0f3f37746420c24af841ec 100644
--- a/arch/x86/configs/openeuler_defconfig
+++ b/arch/x86/configs/openeuler_defconfig
@@ -108,6 +108,8 @@ CONFIG_TASK_XACCT=y
 CONFIG_TASK_IO_ACCOUNTING=y
 CONFIG_PSI=y
 CONFIG_PSI_DEFAULT_DISABLED=y
+CONFIG_PSI_CGROUP_V1=y
+CONFIG_PSI_FINE_GRAINED=y
 # end of CPU/Task time and stats accounting
 
 CONFIG_CPU_ISOLATION=y
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 92ce202bd8e5fbdc31d4cd108d43793df49cc5c8..1f2c93e9daa11ce08f1b78726b0567d8da3e63f1 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -1689,7 +1689,7 @@ static void blkcg_scale_delay(struct blkcg_gq *blkg, u64 now)
  */
 static void blkcg_maybe_throttle_blkg(struct blkcg_gq *blkg, bool use_memdelay)
 {
-	unsigned long pflags;
+	unsigned long pflags = 0;
	bool clamp;
	u64 now = ktime_to_ns(ktime_get());
	u64 exp;
diff --git a/block/blk-core.c b/block/blk-core.c
index 01f0782668ce76a3886421d1a251b923d745548b..71d60ec24a8a8fb40fd130ef49f3db0e0c6cdefa 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1116,7 +1116,7 @@ blk_qc_t submit_bio(struct bio *bio)
	 */
	if (unlikely(bio_op(bio) == REQ_OP_READ &&
		     bio_flagged(bio, BIO_WORKINGSET))) {
-		unsigned long pflags;
+		unsigned long pflags = 0;
		blk_qc_t ret;
 
		psi_memstall_enter(&pflags);
diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
index 47263cecb12f4c099829c162ef6d28cda488d943..09f2d58d119b168555d1d83082b1f56c7b9dfad2 100644
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -492,9 +492,9 @@ struct cgroup {
	/*
	 * It is accessed only the cgroup core code and so changes made to
	 * the cgroup structure should not affect third-party kernel modules.
+	 * The psi member below is now unused; it is kept only to preserve
+	 * the KABI layout.
	 */
-	struct psi_group psi;
-
+	KABI_DEPRECATE(struct psi_group, psi)
	/* used to store eBPF programs */
	struct cgroup_bpf bpf;
@@ -504,7 +504,7 @@ struct cgroup {
	/* Used to store internal freezer state */
	struct cgroup_freezer_state freezer;
 
-	KABI_RESERVE(1)
+	KABI_USE(1, struct psi_group *psi)
	KABI_RESERVE(2)
	KABI_RESERVE(3)
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index e706ff15ec883e0ff4f7a4346e08bdd9f3720eb6..5b8089c6b3207e56e7802b7918f9c1fb25ff56de 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -675,7 +675,7 @@ static inline void pr_cont_cgroup_path(struct cgroup *cgrp)
 
 static inline struct psi_group *cgroup_psi(struct cgroup *cgrp)
 {
-	return &cgrp->psi;
+	return cgrp->psi;
 }
 
 static inline void cgroup_init_kthreadd(void)
diff --git a/include/linux/psi.h b/include/linux/psi.h
index 86635a5630bab6265cf38bbd0eb1fe5798acfb1c..55bb63a4fd6530526fd08c784b84052dcdbc5d38 100644
--- a/include/linux/psi.h
+++ b/include/linux/psi.h
@@ -14,13 +14,16 @@ struct css_set;
 
 extern struct static_key_false psi_disabled;
 extern struct psi_group psi_system;
-extern struct static_key_false psi_v1_disabled;
+#ifdef CONFIG_PSI_CGROUP_V1
+extern struct static_key_true psi_v1_disabled;
+#endif
 
 void psi_init(void);
 
 void psi_task_change(struct task_struct *task, int clear, int set);
 void psi_task_switch(struct task_struct *prev, struct task_struct *next,
		     bool sleep);
+void psi_account_irqtime(struct task_struct *task, u32 delta);
 
 void psi_memstall_enter(unsigned long *flags);
 void psi_memstall_leave(unsigned long *flags);
@@ -34,6 +37,10 @@ void psi_trigger_destroy(struct psi_trigger *t);
 __poll_t psi_trigger_poll(void **trigger_ptr, struct file *file,
			  poll_table *wait);
 
+#ifdef CONFIG_PSI_FINE_GRAINED
+int psi_stat_show(struct seq_file *s, struct psi_group *group);
+#endif
+
 #ifdef CONFIG_CGROUPS
 int psi_cgroup_alloc(struct cgroup *cgrp);
 void psi_cgroup_free(struct cgroup *cgrp);
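NOTE (review annotation, not part of the patch): the psi.h hunk above exports
psi_account_irqtime() and, under CONFIG_PSI_FINE_GRAINED, psi_stat_show().
For readers who want to poke at the result, a minimal userspace sketch that
dumps the new IRQ pressure file follows. The file path and the single "full"
output line match psi_proc_init() and psi_show() later in this patch; the
buffer size and error handling are illustrative only.

/* Sketch: dump /proc/pressure/irq; IRQ pressure reports only "full". */
#include <stdio.h>

int main(void)
{
	char line[256];
	FILE *f = fopen("/proc/pressure/irq", "r");

	if (!f) {
		perror("fopen");	/* e.g. CONFIG_IRQ_TIME_ACCOUNTING=n */
		return 1;
	}
	while (fgets(line, sizeof(line), f))
		fputs(line, stdout);	/* "full avg10=... avg60=... total=..." */
	fclose(f);
	return 0;
}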
diff --git a/include/linux/psi_types.h b/include/linux/psi_types.h
index 0b6e17e7f84f050a6f3bdb5a51ad35bd9a1c3a02..bdefb0b1cd80beb9ba5cec168905bf962f947d27 100644
--- a/include/linux/psi_types.h
+++ b/include/linux/psi_types.h
@@ -36,13 +36,6 @@ enum psi_task_count {
	NR_IOWAIT,
	NR_MEMSTALL,
	NR_RUNNING,
-	/*
-	 * This can't have values other than 0 or 1 and could be
-	 * implemented as a bit flag. But for now we still have room
-	 * in the first cacheline of psi_group_cpu, and this way we
-	 * don't have to special case any state tracking for it.
-	 */
-	NR_ONCPU,
	/*
	 * For IO and CPU stalls the presence of running/oncpu tasks
	 * in the domain means a partial rather than a full stall.
@@ -53,7 +46,7 @@ enum psi_task_count {
	 * threads and memstall ones.
	 */
	NR_MEMSTALL_RUNNING,
-	NR_PSI_TASK_COUNTS = 5,
+	NR_PSI_TASK_COUNTS = 4,
 };
 #endif
 
@@ -61,15 +54,19 @@ enum psi_task_count {
 #define TSK_IOWAIT	(1 << NR_IOWAIT)
 #define TSK_MEMSTALL	(1 << NR_MEMSTALL)
 #define TSK_RUNNING	(1 << NR_RUNNING)
-#define TSK_ONCPU	(1 << NR_ONCPU)
 #define TSK_MEMSTALL_RUNNING	(1 << NR_MEMSTALL_RUNNING)
+/* Only one task can be on the CPU at a time; it needs no task count */
+#define TSK_ONCPU	(1 << NR_PSI_TASK_COUNTS)
 
 /* Resources that workloads could be stalled on */
 enum psi_res {
	PSI_IO,
	PSI_MEM,
	PSI_CPU,
-	NR_PSI_RESOURCES = 3,
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+	PSI_IRQ,
+#endif
+	NR_PSI_RESOURCES,
 };
 
 /*
@@ -104,12 +101,17 @@ enum psi_states {
	PSI_MEM_FULL,
	PSI_CPU_SOME,
	PSI_CPU_FULL,
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+	PSI_IRQ_FULL,
+#endif
	/* Only per-CPU, to weigh the CPU in the global average: */
	PSI_NONIDLE,
-	NR_PSI_STATES = 7,
+	NR_PSI_STATES,
 };
 #endif
 
+/* Use one bit in the state mask to track TSK_ONCPU */
+#define PSI_ONCPU	(1 << NR_PSI_STATES)
+
 enum psi_aggregators {
	PSI_AVGS = 0,
@@ -229,10 +231,85 @@ struct psi_group {
	u64 polling_until;
 };
 
+#ifdef CONFIG_PSI_FINE_GRAINED
+
+enum psi_stat_states {
+	PSI_MEMCG_RECLAIM_SOME,
+	PSI_MEMCG_RECLAIM_FULL,
+	PSI_GLOBAL_RECLAIM_SOME,
+	PSI_GLOBAL_RECLAIM_FULL,
+	PSI_COMPACT_SOME,
+	PSI_COMPACT_FULL,
+	PSI_ASYNC_MEMCG_RECLAIM_SOME,
+	PSI_ASYNC_MEMCG_RECLAIM_FULL,
+	PSI_SWAP_SOME,
+	PSI_SWAP_FULL,
+	PSI_CPU_CFS_BANDWIDTH_FULL,
+#ifdef CONFIG_QOS_SCHED
+	PSI_CPU_QOS_FULL,
+#endif
+	NR_PSI_STAT_STATES,
+};
+
+enum psi_stat_task_count {
+	NR_MEMCG_RECLAIM,
+	NR_MEMCG_RECLAIM_RUNNING,
+	NR_GLOBAL_RECLAIM,
+	NR_GLOBAL_RECLAIM_RUNNING,
+	NR_COMPACT,
+	NR_COMPACT_RUNNING,
+	NR_ASYNC_MEMCG_RECLAIM,
+	NR_ASYNC_MEMCG_RECLAIM_RUNNING,
+	NR_SWAP,
+	NR_SWAP_RUNNING,
+	NR_PSI_STAT_TASK_COUNTS,
+};
+
+#define CPU_CFS_BANDWIDTH	1
+
+struct psi_group_stat_cpu {
+	u32 state_mask;
+	u32 times[NR_PSI_STAT_STATES];
+	u32 psi_delta;
+	unsigned int tasks[NR_PSI_STAT_TASK_COUNTS];
+	u32 times_delta;
+	u32 times_prev[NR_PSI_AGGREGATORS][NR_PSI_STAT_STATES];
+	int prev_throttle;
+	int cur_throttle;
+};
+
+struct psi_group_ext {
+	struct psi_group psi;
+	struct psi_group_stat_cpu __percpu *pcpu;
+	/* Running fine grained pressure averages */
+	u64 avg_total[NR_PSI_STAT_STATES];
+	/* Total fine grained stall times and sampled pressure averages */
+	u64 total[NR_PSI_AGGREGATORS][NR_PSI_STAT_STATES];
+	unsigned long avg[NR_PSI_STAT_STATES][3];
+};
+#endif /* CONFIG_PSI_FINE_GRAINED */
+
 #else /* CONFIG_PSI */
 
 struct psi_group { };
 
 #endif /* CONFIG_PSI */
 
+/*
+ * Each memstall type has two task counts: regular running threads and
+ * memstall threads, for the same reason as NR_MEMSTALL_RUNNING.
+ * Because psi_memstall_type starts at 1, the correspondence between
+ * psi_memstall_type and psi_stat_task_count is:
+ *
+ * memstall : psi_memstall_type * 2 - 2;
+ * running  : psi_memstall_type * 2 - 1;
+ */
+enum psi_memstall_type {
+	PSI_MEMCG_RECLAIM = 1,
+	PSI_GLOBAL_RECLAIM,
+	PSI_COMPACT,
+	PSI_ASYNC_MEMCG_RECLAIM,
+	PSI_SWAP,
+};
+
 #endif /* _LINUX_PSI_TYPES_H */
diff --git a/include/linux/sched.h b/include/linux/sched.h
index d39427f8044d3a5a4b57f4ee9e7aefbe82b3c4a5..0a4c6a6214c4778e83e4d5f8031bbb4a1e810c09 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1449,7 +1449,11 @@ struct task_struct {
	KABI_RESERVE(10)
	KABI_RESERVE(11)
 #endif
+#ifdef CONFIG_PSI_FINE_GRAINED
+	KABI_USE(12, int memstall_type)
+#else
	KABI_RESERVE(12)
+#endif
	KABI_RESERVE(13)
	KABI_RESERVE(14)
	KABI_RESERVE(15)
diff --git a/init/Kconfig b/init/Kconfig
index 83714edd7bf9f2db51b76c36dad71ede11ba6006..f5e32e1ba26f65b1315e492c70a31a01cb07b7d7 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -653,6 +653,27 @@ config PSI_DEFAULT_DISABLED
 
	  Say N if unsure.
 
+config PSI_CGROUP_V1
+	bool "Support PSI under cgroup v1"
+	default n
+	depends on PSI
+	help
+	  If set, pressure stall information tracking will be used
+	  for cgroup v1 instead of v2.
+
+	  Say N if unsure.
+
+config PSI_FINE_GRAINED
+	bool "Support fine grained PSI under cgroup v1 and system"
+	default n
+	depends on PSI
+	help
+	  If set, fine grained pressure stall information tracking will
+	  be available for cgroup v1 and the system, covering stall
+	  sources such as memory reclaim and memory compaction.
+
+	  Say N if unsure.
+
 endmenu # "CPU/Task time and stats accounting"
 
 config CPU_ISOLATION
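NOTE (review annotation, not part of the patch): the psi_memstall_type
mapping documented in the psi_types.h hunk above is easy to get wrong by
one. A standalone illustration follows; the enum values are copied from
that hunk, and the two helpers are hypothetical, not part of the patch.

/* Standalone illustration of the index arithmetic from psi_types.h. */
enum psi_memstall_type {
	PSI_MEMCG_RECLAIM = 1,
	PSI_GLOBAL_RECLAIM,
	PSI_COMPACT,
	PSI_ASYNC_MEMCG_RECLAIM,
	PSI_SWAP,
};

static inline int stat_memstall_idx(enum psi_memstall_type type)
{
	return 2 * type - 2;	/* PSI_MEMCG_RECLAIM (1) -> NR_MEMCG_RECLAIM (0) */
}

static inline int stat_running_idx(enum psi_memstall_type type)
{
	return 2 * type - 1;	/* PSI_SWAP (5) -> NR_SWAP_RUNNING (9) */
}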
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 3d778636f2e8195fb60827f60ccf56bcb85fd508..c68b81a0c57360115b2e327bb28d65a44d029f3e 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -3677,21 +3677,21 @@ static int cpu_stat_show(struct seq_file *seq, void *v)
 static int cgroup_io_pressure_show(struct seq_file *seq, void *v)
 {
	struct cgroup *cgrp = seq_css(seq)->cgroup;
-	struct psi_group *psi = cgroup_ino(cgrp) == 1 ? &psi_system : &cgrp->psi;
+	struct psi_group *psi = cgroup_ino(cgrp) == 1 ? &psi_system : cgrp->psi;
 
	return psi_show(seq, psi, PSI_IO);
 }
 static int cgroup_memory_pressure_show(struct seq_file *seq, void *v)
 {
	struct cgroup *cgrp = seq_css(seq)->cgroup;
-	struct psi_group *psi = cgroup_ino(cgrp) == 1 ? &psi_system : &cgrp->psi;
+	struct psi_group *psi = cgroup_ino(cgrp) == 1 ? &psi_system : cgrp->psi;
 
	return psi_show(seq, psi, PSI_MEM);
 }
 static int cgroup_cpu_pressure_show(struct seq_file *seq, void *v)
 {
	struct cgroup *cgrp = seq_css(seq)->cgroup;
-	struct psi_group *psi = cgroup_ino(cgrp) == 1 ? &psi_system : &cgrp->psi;
+	struct psi_group *psi = cgroup_ino(cgrp) == 1 ? &psi_system : cgrp->psi;
 
	return psi_show(seq, psi, PSI_CPU);
 }
@@ -3717,7 +3717,7 @@ static ssize_t cgroup_pressure_write(struct kernfs_open_file *of, char *buf,
		return -EBUSY;
	}
 
-	psi = cgroup_ino(cgrp) == 1 ? &psi_system : &cgrp->psi;
+	psi = cgroup_ino(cgrp) == 1 ? &psi_system : cgrp->psi;
	new = psi_trigger_create(psi, buf, nbytes, res, of);
	if (IS_ERR(new)) {
		cgroup_put(cgrp);
@@ -3751,6 +3751,23 @@ static ssize_t cgroup_cpu_pressure_write(struct kernfs_open_file *of,
	return cgroup_pressure_write(of, buf, nbytes, PSI_CPU);
 }
 
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+static int cgroup_irq_pressure_show(struct seq_file *seq, void *v)
+{
+	struct cgroup *cgrp = seq_css(seq)->cgroup;
+	struct psi_group *psi = cgroup_ino(cgrp) == 1 ?
+				&psi_system : cgrp->psi;
+
+	return psi_show(seq, psi, PSI_IRQ);
+}
+
+static ssize_t cgroup_irq_pressure_write(struct kernfs_open_file *of,
+					 char *buf, size_t nbytes,
+					 loff_t off)
+{
+	return cgroup_pressure_write(of, buf, nbytes, PSI_IRQ);
+}
+#endif
+
 static __poll_t cgroup_pressure_poll(struct kernfs_open_file *of,
				     poll_table *pt)
 {
@@ -3766,6 +3783,17 @@ static void cgroup_pressure_release(struct kernfs_open_file *of)
	psi_trigger_destroy(ctx->psi.trigger);
 }
 
+#ifdef CONFIG_PSI_FINE_GRAINED
+static int cgroup_psi_stat_show(struct seq_file *seq, void *v)
+{
+	struct cgroup *cgrp = seq_css(seq)->cgroup;
+	struct psi_group *psi = cgroup_ino(cgrp) == 1 ? &psi_system : cgrp->psi;
+
+	return psi_stat_show(seq, psi);
+}
+#endif
+
+#ifdef CONFIG_PSI_CGROUP_V1
 struct cftype cgroup_v1_psi_files[] = {
	{
		.name = "io.pressure",
@@ -3791,8 +3819,27 @@ struct cftype cgroup_v1_psi_files[] = {
		.poll = cgroup_pressure_poll,
		.release = cgroup_pressure_release,
	},
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+	{
+		.name = "irq.pressure",
+		.flags = CFTYPE_NO_PREFIX,
+		.seq_show = cgroup_irq_pressure_show,
+		.write = cgroup_irq_pressure_write,
+		.poll = cgroup_pressure_poll,
+		.release = cgroup_pressure_release,
+	},
+#endif
+#ifdef CONFIG_PSI_FINE_GRAINED
+	{
+		.name = "pressure.stat",
+		.flags = CFTYPE_NO_PREFIX,
+		.seq_show = cgroup_psi_stat_show,
+	},
+#endif
	{ }	/* terminate */
 };
+#endif
+
 #endif /* CONFIG_PSI */
 
 static int cgroup_freeze_show(struct seq_file *seq, void *v)
@@ -5155,6 +5202,15 @@ static struct cftype cgroup_base_files[] = {
		.poll = cgroup_pressure_poll,
		.release = cgroup_pressure_release,
	},
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+	{
+		.name = "irq.pressure",
+		.seq_show = cgroup_irq_pressure_show,
+		.write = cgroup_irq_pressure_write,
+		.poll = cgroup_pressure_poll,
+		.release = cgroup_pressure_release,
+	},
+#endif
 #endif /* CONFIG_PSI */
	{ }	/* terminate */
 };
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index ebd8c3a6a964f71eb483e62b5006daa5c9eb2b55..92ba14c0bcaa04a0c832e9c730d1834b1d305eea 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -629,6 +629,8 @@ static void update_rq_clock_task(struct rq *rq, s64 delta)
 
	rq->prev_irq_time += irq_delta;
	delta -= irq_delta;
+	if (irq_delta)
+		psi_account_irqtime(rq->curr, irq_delta);
 #endif
 #ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
	if (static_key_false((&paravirt_steal_rq_enabled))) {
diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c
index 28ed182b6801cda325d7c93ebf7cb938168ff714..7a7a0dec8c4e053ff00a73077b3c3d6ac10747df 100644
--- a/kernel/sched/cpuacct.c
+++ b/kernel/sched/cpuacct.c
@@ -375,7 +375,7 @@ struct cgroup_subsys cpuacct_cgrp_subsys = {
	.early_init	= true,
 };
 
-#ifdef CONFIG_PSI
+#ifdef CONFIG_PSI_CGROUP_V1
 static bool psi_v1_enable;
 static int __init setup_psi_v1(char *str)
@@ -383,8 +383,8 @@ static int __init setup_psi_v1(char *str)
	int ret;
 
	ret = kstrtobool(str, &psi_v1_enable);
-	if (!psi_v1_enable)
-		static_branch_enable(&psi_v1_disabled);
+	if (psi_v1_enable)
+		static_branch_disable(&psi_v1_disabled);
 
	return ret == 0;
 }
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 06c6318b3ba3cd0fe25880f0a6773969a0be6284..61b077d630d63ba52d29ac78ffa4d72c57dcf708 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -125,12 +125,6 @@ int __weak arch_asym_cpu_priority(int cpu)
 
 #ifdef CONFIG_QOS_SCHED
 
-/*
- * To distinguish cfs bw, use QOS_THROTTLED mark cfs_rq->throttled
- * when qos throttled(and cfs bw throttle mark cfs_rq->throttled as 1).
- */
-#define QOS_THROTTLED	2
-
 static DEFINE_PER_CPU_SHARED_ALIGNED(struct list_head, qos_throttled_cfs_rq);
 static DEFINE_PER_CPU_SHARED_ALIGNED(struct hrtimer, qos_overload_timer);
 static DEFINE_PER_CPU(int, qos_cpu_overload);
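NOTE (review annotation, not part of the patch): with the irq.pressure
files wired up above, the existing PSI trigger API applies to the new
resource. A minimal userspace sketch follows, assuming the trigger format
documented for PSI ("full <stall us> <window us>"; IRQ accepts only "full"
per the psi_trigger_create() change later in this patch). The threshold
values are invented.

/* Sketch: arm a trigger for 150ms of IRQ stall per 1s window, then wait. */
#include <fcntl.h>
#include <poll.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	const char trig[] = "full 150000 1000000";
	struct pollfd fds;
	int fd = open("/proc/pressure/irq", O_RDWR | O_NONBLOCK);

	if (fd < 0 || write(fd, trig, strlen(trig) + 1) < 0) {
		perror("trigger setup");
		return 1;
	}
	fds.fd = fd;
	fds.events = POLLPRI;
	if (poll(&fds, 1, -1) > 0 && (fds.revents & POLLPRI))
		puts("irq pressure threshold crossed");
	close(fd);
	return 0;
}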
diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
index 11a43dccb7fcbaad9279eb294376b8d1fa6d65a2..5789b07e59dfb65f192c601db6192defa57e7749 100644
--- a/kernel/sched/psi.c
+++ b/kernel/sched/psi.c
@@ -156,7 +156,10 @@ static int psi_bug __read_mostly;
 
 DEFINE_STATIC_KEY_FALSE(psi_disabled);
-DEFINE_STATIC_KEY_FALSE(psi_v1_disabled);
+
+#ifdef CONFIG_PSI_CGROUP_V1
+DEFINE_STATIC_KEY_TRUE(psi_v1_disabled);
+#endif
 
 #ifdef CONFIG_PSI_DEFAULT_DISABLED
 static bool psi_enable;
@@ -189,6 +192,27 @@ struct psi_group psi_system = {
	.pcpu = &system_group_pcpu,
 };
 
+#ifdef CONFIG_PSI_FINE_GRAINED
+/* System-level fine grained pressure and stall tracking */
+static DEFINE_PER_CPU(struct psi_group_stat_cpu, system_stat_group_pcpu);
+struct psi_group_ext psi_stat_system = {
+	.pcpu = &system_stat_group_pcpu,
+};
+
+struct psi_group_ext *to_psi_group_ext(struct psi_group *psi)
+{
+	if (psi == &psi_system)
+		return &psi_stat_system;
+	else
+		return container_of(psi, struct psi_group_ext, psi);
+}
+#else
+static inline struct psi_group_ext *to_psi_group_ext(struct psi_group *psi)
+{
+	return NULL;
+}
+#endif
+
 static void psi_avgs_work(struct work_struct *work);
 
 static void poll_timer_fn(struct timer_list *t);
@@ -206,12 +230,8 @@ static void group_init(struct psi_group *group)
	/* Init trigger-related members */
	mutex_init(&group->trigger_lock);
	INIT_LIST_HEAD(&group->triggers);
-	memset(group->nr_triggers, 0, sizeof(group->nr_triggers));
-	group->poll_states = 0;
	group->poll_min_period = U32_MAX;
-	memset(group->polling_total, 0, sizeof(group->polling_total));
	group->polling_next_update = ULLONG_MAX;
-	group->polling_until = 0;
	init_waitqueue_head(&group->poll_wait);
	timer_setup(&group->poll_timer, poll_timer_fn, 0);
	rcu_assign_pointer(group->poll_task, NULL);
@@ -228,7 +248,7 @@ void __init psi_init(void)
	group_init(&psi_system);
 }
 
-static bool test_state(unsigned int *tasks, enum psi_states state)
+static bool test_state(unsigned int *tasks, enum psi_states state, bool oncpu)
 {
	switch (state) {
	case PSI_IO_SOME:
@@ -241,9 +261,9 @@ static bool test_state(unsigned int *tasks, enum psi_states state)
		return unlikely(tasks[NR_MEMSTALL] &&
			tasks[NR_RUNNING] == tasks[NR_MEMSTALL_RUNNING]);
	case PSI_CPU_SOME:
-		return unlikely(tasks[NR_RUNNING] > tasks[NR_ONCPU]);
+		return unlikely(tasks[NR_RUNNING] > oncpu);
	case PSI_CPU_FULL:
-		return unlikely(tasks[NR_RUNNING] && !tasks[NR_ONCPU]);
+		return unlikely(tasks[NR_RUNNING] && !oncpu);
	case PSI_NONIDLE:
		return tasks[NR_IOWAIT] || tasks[NR_MEMSTALL] ||
			tasks[NR_RUNNING];
@@ -256,6 +276,10 @@ static void get_recent_times(struct psi_group *group, int cpu,
			     enum psi_aggregators aggregator, u32 *times,
			     u32 *pchanged_states)
 {
+#ifdef CONFIG_PSI_FINE_GRAINED
+	struct psi_group_ext *psi_ext = to_psi_group_ext(group);
+	struct psi_group_stat_cpu *ext_groupc = per_cpu_ptr(psi_ext->pcpu, cpu);
+#endif
	struct psi_group_cpu *groupc = per_cpu_ptr(group->pcpu, cpu);
	u64 now, state_start;
	enum psi_states s;
@@ -295,6 +319,9 @@ static void get_recent_times(struct psi_group *group, int cpu,
		if (delta)
			*pchanged_states |= (1 << s);
	}
+#ifdef CONFIG_PSI_FINE_GRAINED
+	ext_groupc->times_delta = now - state_start;
+#endif
 }
 
 static void calc_avgs(unsigned long avg[3], int missed_periods,
@@ -317,10 +344,240 @@ static void calc_avgs(unsigned long avg[3], int missed_periods,
	avg[2] = calc_load(avg[2], EXP_300s, pct);
 }
 
+#ifdef CONFIG_PSI_FINE_GRAINED
+
+static void record_stat_times(struct psi_group_ext *psi_ext, int cpu)
+{
+	struct psi_group_stat_cpu *ext_grpc = per_cpu_ptr(psi_ext->pcpu, cpu);
+	u32 delta = ext_grpc->psi_delta;
+
+	if (ext_grpc->state_mask & (1 << PSI_MEMCG_RECLAIM_SOME)) {
+		ext_grpc->times[PSI_MEMCG_RECLAIM_SOME] += delta;
+		if (ext_grpc->state_mask & (1 << PSI_MEMCG_RECLAIM_FULL))
+			ext_grpc->times[PSI_MEMCG_RECLAIM_FULL] += delta;
+	}
+	if (ext_grpc->state_mask & (1 << PSI_GLOBAL_RECLAIM_SOME)) {
+		ext_grpc->times[PSI_GLOBAL_RECLAIM_SOME] += delta;
+		if (ext_grpc->state_mask & (1 << PSI_GLOBAL_RECLAIM_FULL))
+			ext_grpc->times[PSI_GLOBAL_RECLAIM_FULL] += delta;
+	}
+	if (ext_grpc->state_mask & (1 << PSI_COMPACT_SOME)) {
+		ext_grpc->times[PSI_COMPACT_SOME] += delta;
+		if (ext_grpc->state_mask & (1 << PSI_COMPACT_FULL))
+			ext_grpc->times[PSI_COMPACT_FULL] += delta;
+	}
+	if (ext_grpc->state_mask & (1 << PSI_ASYNC_MEMCG_RECLAIM_SOME)) {
+		ext_grpc->times[PSI_ASYNC_MEMCG_RECLAIM_SOME] += delta;
+		if (ext_grpc->state_mask & (1 << PSI_ASYNC_MEMCG_RECLAIM_FULL))
+			ext_grpc->times[PSI_ASYNC_MEMCG_RECLAIM_FULL] += delta;
+	}
+	if (ext_grpc->state_mask & (1 << PSI_SWAP_SOME)) {
+		ext_grpc->times[PSI_SWAP_SOME] += delta;
+		if (ext_grpc->state_mask & (1 << PSI_SWAP_FULL))
+			ext_grpc->times[PSI_SWAP_FULL] += delta;
+	}
+}
+
+static bool test_fine_grained_stat(unsigned int *stat_tasks,
+				   unsigned int nr_running,
+				   enum psi_stat_states state)
+{
+	switch (state) {
+	case PSI_MEMCG_RECLAIM_SOME:
+		return unlikely(stat_tasks[NR_MEMCG_RECLAIM]);
+	case PSI_MEMCG_RECLAIM_FULL:
+		return unlikely(stat_tasks[NR_MEMCG_RECLAIM] &&
+				nr_running == stat_tasks[NR_MEMCG_RECLAIM_RUNNING]);
+	case PSI_GLOBAL_RECLAIM_SOME:
+		return unlikely(stat_tasks[NR_GLOBAL_RECLAIM]);
+	case PSI_GLOBAL_RECLAIM_FULL:
+		return unlikely(stat_tasks[NR_GLOBAL_RECLAIM] &&
+				nr_running == stat_tasks[NR_GLOBAL_RECLAIM_RUNNING]);
+	case PSI_COMPACT_SOME:
+		return unlikely(stat_tasks[NR_COMPACT]);
+	case PSI_COMPACT_FULL:
+		return unlikely(stat_tasks[NR_COMPACT] &&
+				nr_running == stat_tasks[NR_COMPACT_RUNNING]);
+	case PSI_ASYNC_MEMCG_RECLAIM_SOME:
+		return unlikely(stat_tasks[NR_ASYNC_MEMCG_RECLAIM]);
+	case PSI_ASYNC_MEMCG_RECLAIM_FULL:
+		return unlikely(stat_tasks[NR_ASYNC_MEMCG_RECLAIM] &&
+				nr_running == stat_tasks[NR_ASYNC_MEMCG_RECLAIM_RUNNING]);
+	case PSI_SWAP_SOME:
+		return unlikely(stat_tasks[NR_SWAP]);
+	case PSI_SWAP_FULL:
+		return unlikely(stat_tasks[NR_SWAP] &&
+				nr_running == stat_tasks[NR_SWAP_RUNNING]);
+	default:
+		return false;
+	}
+}
+
+static void psi_group_stat_change(struct psi_group *group, int cpu,
+				  int clear, int set)
+{
+	int t;
+	u32 state_mask = 0;
+	enum psi_stat_states s;
+	struct psi_group_ext *psi_ext = to_psi_group_ext(group);
+	struct psi_group_cpu *groupc = per_cpu_ptr(group->pcpu, cpu);
+	struct psi_group_stat_cpu *ext_groupc = per_cpu_ptr(psi_ext->pcpu, cpu);
+
+	write_seqcount_begin(&groupc->seq);
+	record_stat_times(psi_ext, cpu);
+
+	for (t = 0; clear; clear &= ~(1 << t), t++)
+		if (clear & (1 << t))
+			ext_groupc->tasks[t]--;
+	for (t = 0; set; set &= ~(1 << t), t++)
+		if (set & (1 << t))
+			ext_groupc->tasks[t]++;
+	for (s = 0; s < PSI_CPU_CFS_BANDWIDTH_FULL; s++)
+		if (test_fine_grained_stat(ext_groupc->tasks,
+					   groupc->tasks[NR_RUNNING], s))
+			state_mask |= (1 << s);
+	if (unlikely(groupc->state_mask & PSI_ONCPU) &&
+	    cpu_curr(cpu)->memstall_type)
+		state_mask |= (1 << (cpu_curr(cpu)->memstall_type * 2 - 1));
+
+	ext_groupc->state_mask = state_mask;
+	write_seqcount_end(&groupc->seq);
+}
+
+static void update_psi_stat_delta(struct psi_group *group, int cpu, u64 now)
+{
+	struct psi_group_ext *psi_ext = to_psi_group_ext(group);
+	struct psi_group_stat_cpu *ext_groupc = per_cpu_ptr(psi_ext->pcpu, cpu);
+	struct psi_group_cpu *groupc = per_cpu_ptr(group->pcpu, cpu);
+
+	ext_groupc->psi_delta = now - groupc->state_start;
+}
+
+static void psi_stat_flags_change(struct task_struct *task, int *stat_set,
+				  int *stat_clear, int set, int clear)
+{
+	if (!task->memstall_type)
+		return;
+
+	if (clear) {
+		if (clear & TSK_MEMSTALL)
+			*stat_clear |= 1 << (2 * task->memstall_type - 2);
+		if (clear & TSK_MEMSTALL_RUNNING)
+			*stat_clear |= 1 << (2 * task->memstall_type - 1);
+	}
+	if (set) {
+		if (set & TSK_MEMSTALL)
+			*stat_set |= 1 << (2 * task->memstall_type - 2);
+		if (set & TSK_MEMSTALL_RUNNING)
+			*stat_set |= 1 << (2 * task->memstall_type - 1);
+	}
+	if (!task->in_memstall)
+		task->memstall_type = 0;
+}
+
+static void get_recent_stat_times(struct psi_group *group, int cpu,
+				  enum psi_aggregators aggregator, u32 *times)
+{
+	struct psi_group_ext *psi_ext = to_psi_group_ext(group);
+	struct psi_group_stat_cpu *ext_groupc = per_cpu_ptr(psi_ext->pcpu, cpu);
+	enum psi_stat_states s;
+	u32 delta;
+
+	memcpy(times, ext_groupc->times, sizeof(ext_groupc->times));
+	for (s = 0; s < NR_PSI_STAT_STATES; s++) {
+		if (ext_groupc->state_mask & (1 << s))
+			times[s] += ext_groupc->times_delta;
+		delta = times[s] - ext_groupc->times_prev[aggregator][s];
+		ext_groupc->times_prev[aggregator][s] = times[s];
+		times[s] = delta;
+	}
+}
+
+static void update_stat_averages(struct psi_group_ext *psi_ext,
+				 unsigned long missed_periods, u64 period)
+{
+	int s;
+
+	for (s = 0; s < NR_PSI_STAT_STATES; s++) {
+		u32 sample;
+
+		sample = psi_ext->total[PSI_AVGS][s] - psi_ext->avg_total[s];
+		if (sample > period)
+			sample = period;
+		psi_ext->avg_total[s] += sample;
+		calc_avgs(psi_ext->avg[s], missed_periods, sample, period);
+	}
+}
+#else
+static inline void psi_group_stat_change(struct psi_group *group, int cpu,
+					 int clear, int set) {}
+static inline void update_psi_stat_delta(struct psi_group *group, int cpu,
+					 u64 now) {}
+static inline void psi_stat_flags_change(struct task_struct *task,
+					 int *stat_set, int *stat_clear,
+					 int set, int clear) {}
+static inline void record_stat_times(struct psi_group_ext *psi_ext, int cpu) {}
+static inline void update_stat_averages(struct psi_group_ext *psi_ext,
+					unsigned long missed_periods,
+					u64 period) {}
+#endif
+
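NOTE (review annotation, not part of the patch): test_fine_grained_stat()
above mirrors the classic PSI SOME/FULL split: SOME means at least one task
is stalled on the source, FULL means every runnable task is stalled on it.
A condensed sketch of that predicate pair follows; the generic index names
(base = the NR_* count, base + 1 = its *_RUNNING companion) are hypothetical,
the patch deliberately spells each source out in a flat switch instead.

/* Hypothetical condensed form of the per-source SOME/FULL test. */
static bool stat_state_some(const unsigned int *tasks, int base)
{
	return tasks[base] != 0;	/* someone is stalled on this source */
}

static bool stat_state_full(const unsigned int *tasks,
			    unsigned int nr_running, int base)
{
	/* every runnable task is a stalled-and-running task of this source */
	return tasks[base] && nr_running == tasks[base + 1];
}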
+#if defined(CONFIG_CFS_BANDWIDTH) && defined(CONFIG_CGROUP_CPUACCT) && \
+	defined(CONFIG_PSI_FINE_GRAINED)
+static void record_cpu_stat_times(struct psi_group *group, int cpu)
+{
+	struct psi_group_ext *psi_ext = to_psi_group_ext(group);
+	struct psi_group_cpu *groupc = per_cpu_ptr(group->pcpu, cpu);
+	struct psi_group_stat_cpu *ext_groupc = per_cpu_ptr(psi_ext->pcpu, cpu);
+	u32 delta = ext_groupc->psi_delta;
+
+	if (groupc->state_mask & (1 << PSI_CPU_FULL)) {
+		if (ext_groupc->prev_throttle == CPU_CFS_BANDWIDTH)
+			ext_groupc->times[PSI_CPU_CFS_BANDWIDTH_FULL] += delta;
+#ifdef CONFIG_QOS_SCHED
+		else if (ext_groupc->prev_throttle == QOS_THROTTLED)
+			ext_groupc->times[PSI_CPU_QOS_FULL] += delta;
+#endif
+	}
+}
+
+static void update_throttle_type(struct task_struct *task, int cpu, bool next)
+{
+	struct cgroup *cpuacct_cgrp;
+	struct psi_group_ext *psi_ext;
+	struct psi_group_stat_cpu *groupc;
+	struct task_group *tsk_grp;
+
+	if (!cgroup_subsys_on_dfl(cpuacct_cgrp_subsys)) {
+		rcu_read_lock();
+		cpuacct_cgrp = task_cgroup(task, cpuacct_cgrp_id);
+		if (cgroup_parent(cpuacct_cgrp)) {
+			psi_ext = to_psi_group_ext(cgroup_psi(cpuacct_cgrp));
+			groupc = per_cpu_ptr(psi_ext->pcpu, cpu);
+			tsk_grp = task_group(task);
+			if (next)
+				groupc->prev_throttle = groupc->cur_throttle;
+			groupc->cur_throttle = tsk_grp->cfs_rq[cpu]->throttled;
+		}
+		rcu_read_unlock();
+	}
+}
+#else
+static inline void record_cpu_stat_times(struct psi_group *group, int cpu) {}
+static inline void update_throttle_type(struct task_struct *task, int cpu,
+					bool next) {}
+#endif
+
 static void collect_percpu_times(struct psi_group *group,
				 enum psi_aggregators aggregator,
				 u32 *pchanged_states)
 {
+#ifdef CONFIG_PSI_FINE_GRAINED
+	u64 stat_delta[NR_PSI_STAT_STATES] = { 0 };
+	u32 stat_times[NR_PSI_STAT_STATES] = { 0 };
+	struct psi_group_ext *psi_ext = to_psi_group_ext(group);
+#endif
	u64 deltas[NR_PSI_STATES - 1] = { 0, };
	unsigned long nonidle_total = 0;
	u32 changed_states = 0;
@@ -349,6 +606,11 @@ static void collect_percpu_times(struct psi_group *group,
 
		for (s = 0; s < PSI_NONIDLE; s++)
			deltas[s] += (u64)times[s] * nonidle;
+#ifdef CONFIG_PSI_FINE_GRAINED
+		get_recent_stat_times(group, cpu, aggregator, stat_times);
+		for (s = 0; s < NR_PSI_STAT_STATES; s++)
+			stat_delta[s] += (u64)stat_times[s] * nonidle;
+#endif
	}
 
	/*
@@ -368,12 +630,19 @@ static void collect_percpu_times(struct psi_group *group,
		group->total[aggregator][s] +=
				div_u64(deltas[s], max(nonidle_total, 1UL));
 
+#ifdef CONFIG_PSI_FINE_GRAINED
+	for (s = 0; s < NR_PSI_STAT_STATES; s++)
+		psi_ext->total[aggregator][s] +=
+			div_u64(stat_delta[s], max(nonidle_total, 1UL));
+#endif
+
	if (pchanged_states)
		*pchanged_states = changed_states;
 }
 
 static u64 update_averages(struct psi_group *group, u64 now)
 {
+	struct psi_group_ext *psi_ext = to_psi_group_ext(group);
	unsigned long missed_periods = 0;
	u64 expires, period;
	u64 avg_next_update;
@@ -422,6 +691,7 @@ static u64 update_averages(struct psi_group *group, u64 now)
		calc_avgs(group->avg[s], missed_periods, sample, period);
	}
 
+	update_stat_averages(psi_ext, missed_periods, period);
	return avg_next_update;
 }
 
@@ -696,9 +966,9 @@ static void psi_group_change(struct psi_group *group, int cpu,
			     bool wake_clock)
 {
	struct psi_group_cpu *groupc;
-	u32 state_mask = 0;
	unsigned int t, m;
	enum psi_states s;
+	u32 state_mask;
 
	groupc = per_cpu_ptr(group->pcpu, cpu);
 
@@ -713,18 +983,38 @@ static void psi_group_change(struct psi_group *group, int cpu,
	write_seqcount_begin(&groupc->seq);
 
	record_times(groupc, now);
+	record_cpu_stat_times(group, cpu);
+
+	/*
+	 * Start with TSK_ONCPU, which doesn't have a corresponding
+	 * task count - it's just a boolean flag directly encoded in
+	 * the state mask. Clear, set, or carry the current state if
+	 * no changes are requested.
+	 */
+	if (unlikely(clear & TSK_ONCPU)) {
+		state_mask = 0;
+		clear &= ~TSK_ONCPU;
+	} else if (unlikely(set & TSK_ONCPU)) {
+		state_mask = PSI_ONCPU;
+		set &= ~TSK_ONCPU;
+	} else {
+		state_mask = groupc->state_mask & PSI_ONCPU;
+	}
 
+	/*
+	 * The rest of the state mask is calculated based on the task
+	 * counts. Update those first, then construct the mask.
+	 */
	for (t = 0, m = clear; m; m &= ~(1 << t), t++) {
		if (!(m & (1 << t)))
			continue;
		if (groupc->tasks[t]) {
			groupc->tasks[t]--;
		} else if (!psi_bug) {
-			printk_deferred(KERN_ERR "psi: task underflow! cpu=%d t=%d tasks=[%u %u %u %u %u] clear=%x set=%x\n",
+			printk_deferred(KERN_ERR "psi: task underflow! cpu=%d t=%d tasks=[%u %u %u %u] clear=%x set=%x\n",
					cpu, t, groupc->tasks[0],
					groupc->tasks[1], groupc->tasks[2],
-					groupc->tasks[3], groupc->tasks[4],
-					clear, set);
+					groupc->tasks[3], clear, set);
			psi_bug = 1;
		}
	}
@@ -733,9 +1023,8 @@ static void psi_group_change(struct psi_group *group, int cpu,
		if (set & (1 << t))
			groupc->tasks[t]++;
 
-	/* Calculate state mask representing active states */
	for (s = 0; s < NR_PSI_STATES; s++) {
-		if (test_state(groupc->tasks, s))
+		if (test_state(groupc->tasks, s, state_mask & PSI_ONCPU))
			state_mask |= (1 << s);
	}
 
@@ -747,7 +1036,7 @@ static void psi_group_change(struct psi_group *group, int cpu,
	 * task in a cgroup is in_memstall, the corresponding groupc
	 * on that cpu is in PSI_MEM_FULL state.
	 */
-	if (unlikely(groupc->tasks[NR_ONCPU] && cpu_curr(cpu)->in_memstall))
+	if (unlikely((state_mask & PSI_ONCPU) && cpu_curr(cpu)->in_memstall))
		state_mask |= (1 << PSI_MEM_FULL);
 
	groupc->state_mask = state_mask;
@@ -767,21 +1056,23 @@ static struct psi_group *iterate_groups(struct task_struct *task, void **iter)
	struct cgroup *cgroup = NULL;
 
	if (!*iter) {
-		if (static_branch_likely(&psi_v1_disabled))
-			cgroup = task->cgroups->dfl_cgrp;
-		else {
+#ifndef CONFIG_PSI_CGROUP_V1
+		cgroup = task->cgroups->dfl_cgrp;
+#else
 #ifdef CONFIG_CGROUP_CPUACCT
-			if (!cgroup_subsys_on_dfl(cpuacct_cgrp_subsys)) {
+		if (!cgroup_subsys_on_dfl(cpuacct_cgrp_subsys)) {
+			if (!static_branch_likely(&psi_v1_disabled)) {
				rcu_read_lock();
				cgroup = task_cgroup(task, cpuacct_cgrp_id);
				rcu_read_unlock();
-			} else {
-				cgroup = task->cgroups->dfl_cgrp;
			}
+		} else {
+			cgroup = task->cgroups->dfl_cgrp;
+		}
 #else
-			cgroup = NULL;
+		cgroup = NULL;
+#endif
 #endif
-		}
	} else if (*iter == &psi_system)
		return NULL;
	else
@@ -818,29 +1109,24 @@ void psi_task_change(struct task_struct *task, int clear, int set)
 {
	int cpu = task_cpu(task);
	struct psi_group *group;
-	bool wake_clock = true;
	void *iter = NULL;
	u64 now;
+	int stat_set = 0;
+	int stat_clear = 0;
 
	if (!task->pid)
		return;
 
	psi_flags_change(task, clear, set);
+	psi_stat_flags_change(task, &stat_set, &stat_clear, set, clear);
 
	now = cpu_clock(cpu);
-	/*
-	 * Periodic aggregation shuts off if there is a period of no
-	 * task changes, so we wake it back up if necessary. However,
-	 * don't do this if the task change is the aggregation worker
-	 * itself going to sleep, or we'll ping-pong forever.
-	 */
-	if (unlikely((clear & TSK_RUNNING) &&
-		     (task->flags & PF_WQ_WORKER) &&
-		     wq_worker_last_func(task) == psi_avgs_work))
-		wake_clock = false;
 
-	while ((group = iterate_groups(task, &iter)))
-		psi_group_change(group, cpu, clear, set, now, wake_clock);
+	while ((group = iterate_groups(task, &iter))) {
+		update_psi_stat_delta(group, cpu, now);
+		psi_group_change(group, cpu, clear, set, now, true);
+		psi_group_stat_change(group, cpu, stat_clear, stat_set);
+	}
 }
 
 void psi_task_switch(struct task_struct *prev, struct task_struct *next,
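NOTE (review annotation, not part of the patch): this section backports the
upstream rework that folds TSK_ONCPU into the per-CPU state mask (PSI_ONCPU)
instead of keeping a task count, since at most one task can be on the CPU.
A standalone illustration of the resulting bit layout follows; the values
are taken from the psi_types.h enums in this patch, assuming
CONFIG_IRQ_TIME_ACCOUNTING=y so that NR_PSI_STATES is 8.

/* Illustrative only: low bits are PSI states, one extra bit is on-CPU. */
#include <stdio.h>

enum { NR_PSI_STATES = 8 };			/* PSI_IO_SOME..PSI_NONIDLE */
#define PSI_ONCPU	(1 << NR_PSI_STATES)	/* bit 8, as in psi_types.h */

int main(void)
{
	unsigned int state_mask = PSI_ONCPU | (1 << 0);	/* on-CPU + IO_SOME */

	printf("oncpu=%d io_some=%d\n",
	       !!(state_mask & PSI_ONCPU), !!(state_mask & (1 << 0)));
	return 0;
}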
@@ -852,32 +1138,35 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next,
	u64 now = cpu_clock(cpu);
 
	if (next->pid) {
-		bool identical_state;
-
+		update_throttle_type(next, cpu, true);
		psi_flags_change(next, 0, TSK_ONCPU);
		/*
-		 * When switching between tasks that have an identical
-		 * runtime state, the cgroup that contains both tasks
-		 * we reach the first common ancestor. Iterate @next's
-		 * ancestors only until we encounter @prev's ONCPU.
+		 * Set TSK_ONCPU on @next's cgroups. If @next shares any
+		 * ancestors with @prev, those will already have @prev's
+		 * TSK_ONCPU bit set, and we can stop the iteration there.
		 */
-		identical_state = prev->psi_flags == next->psi_flags;
		iter = NULL;
		while ((group = iterate_groups(next, &iter))) {
-			if (identical_state &&
-			    per_cpu_ptr(group->pcpu, cpu)->tasks[NR_ONCPU]) {
+			if (per_cpu_ptr(group->pcpu, cpu)->state_mask &
+			    PSI_ONCPU) {
				common = group;
				break;
			}
 
+			update_psi_stat_delta(group, cpu, now);
			psi_group_change(group, cpu, 0, TSK_ONCPU, now, true);
+			psi_group_stat_change(group, cpu, 0, 0);
		}
	}
 
	if (prev->pid) {
		int clear = TSK_ONCPU, set = 0;
+		bool wake_clock = true;
+		int stat_set = 0;
+		int stat_clear = 0;
+		bool memstall_type_change = false;
 
+		update_throttle_type(prev, cpu, false);
		/*
		 * When we're going to sleep, psi_dequeue() lets us
		 * handle TSK_RUNNING, TSK_MEMSTALL_RUNNING and
@@ -890,26 +1179,83 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next,
				clear |= TSK_MEMSTALL_RUNNING;
			if (prev->in_iowait)
				set |= TSK_IOWAIT;
+
+			/*
+			 * Periodic aggregation shuts off if there is a period of no
+			 * task changes, so we wake it back up if necessary. However,
+			 * don't do this if the task change is the aggregation worker
+			 * itself going to sleep, or we'll ping-pong forever.
+			 */
+			if (unlikely((prev->flags & PF_WQ_WORKER) &&
+				     wq_worker_last_func(prev) == psi_avgs_work))
+				wake_clock = false;
		}
 
		psi_flags_change(prev, clear, set);
+		psi_stat_flags_change(prev, &stat_set, &stat_clear, set, clear);
 
		iter = NULL;
-		while ((group = iterate_groups(prev, &iter)) && group != common)
-			psi_group_change(group, cpu, clear, set, now, true);
-
+		while ((group = iterate_groups(prev, &iter)) && group != common) {
+			update_psi_stat_delta(group, cpu, now);
+			psi_group_change(group, cpu, clear, set, now, wake_clock);
+			psi_group_stat_change(group, cpu, stat_clear, stat_set);
+		}
+
+#ifdef CONFIG_PSI_FINE_GRAINED
+		if (next->memstall_type != prev->memstall_type)
+			memstall_type_change = true;
+#endif
		/*
-		 * TSK_ONCPU is handled up to the common ancestor. If we're tasked
-		 * with dequeuing too, finish that for the rest of the hierarchy.
+		 * TSK_ONCPU is handled up to the common ancestor. If there are
+		 * any other differences between the two tasks (e.g. prev goes
+		 * to sleep, or only one task is memstall), finish propagating
+		 * those differences all the way up to the root.
		 */
-		if (sleep) {
+		if ((prev->psi_flags ^ next->psi_flags) & ~TSK_ONCPU ||
+		    memstall_type_change) {
			clear &= ~TSK_ONCPU;
-			for (; group; group = iterate_groups(prev, &iter))
-				psi_group_change(group, cpu, clear, set, now, true);
+			for (; group; group = iterate_groups(prev, &iter)) {
+				update_psi_stat_delta(group, cpu, now);
+				psi_group_change(group, cpu, clear, set, now, wake_clock);
+				psi_group_stat_change(group, cpu, stat_clear,
						      stat_set);
+			}
		}
	}
 }
 
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+void psi_account_irqtime(struct task_struct *task, u32 delta)
+{
+	int cpu = task_cpu(task);
+	void *iter = NULL;
+	struct psi_group *group;
+	struct psi_group_cpu *groupc;
+	u64 now;
+
+	if (!task->pid)
+		return;
+
+	now = cpu_clock(cpu);
+
+	while ((group = iterate_groups(task, &iter))) {
+		groupc = per_cpu_ptr(group->pcpu, cpu);
+
+		write_seqcount_begin(&groupc->seq);
+
+		update_psi_stat_delta(group, cpu, now);
+		record_stat_times(to_psi_group_ext(group), cpu);
+		record_times(groupc, now);
+		record_cpu_stat_times(group, cpu);
+		groupc->times[PSI_IRQ_FULL] += delta;
+
+		write_seqcount_end(&groupc->seq);
+
+		if (group->poll_states & (1 << PSI_IRQ_FULL))
+			psi_schedule_poll_work(group, 1);
+	}
+}
+#endif
+
 /**
  * psi_memstall_enter - mark the beginning of a memory stall section
  * @flags: flags to handle nested sections
@@ -921,6 +1267,9 @@ void psi_memstall_enter(unsigned long *flags)
 {
	struct rq_flags rf;
	struct rq *rq;
+#ifdef CONFIG_PSI_FINE_GRAINED
+	unsigned long stat_flags = *flags;
+#endif
 
	if (static_branch_likely(&psi_disabled))
		return;
@@ -938,6 +1287,10 @@ void psi_memstall_enter(unsigned long *flags)
	rq = this_rq_lock_irq(&rf);
 
	current->in_memstall = 1;
+#ifdef CONFIG_PSI_FINE_GRAINED
+	if (stat_flags)
+		current->memstall_type = stat_flags;
+#endif
	psi_task_change(current, 0, TSK_MEMSTALL | TSK_MEMSTALL_RUNNING);
 
	rq_unlock_irq(rq, &rf);
@@ -961,6 +1314,7 @@ void psi_memstall_leave(unsigned long *flags)
		return;
 
	trace_psi_memstall_leave(_RET_IP_);
+
	/*
	 * in_memstall clearing & accounting needs to be atomic wrt
	 * changes to the task's scheduling state, otherwise we could
@@ -977,13 +1331,40 @@ void psi_memstall_leave(unsigned long *flags)
 #ifdef CONFIG_CGROUPS
 int psi_cgroup_alloc(struct cgroup *cgroup)
 {
+#ifdef CONFIG_PSI_FINE_GRAINED
+	struct psi_group_ext *psi_ext;
+#endif
+
	if (static_branch_likely(&psi_disabled))
		return 0;
 
-	cgroup->psi.pcpu = alloc_percpu(struct psi_group_cpu);
-	if (!cgroup->psi.pcpu)
+#ifdef CONFIG_PSI_FINE_GRAINED
+	psi_ext = kzalloc(sizeof(struct psi_group_ext), GFP_KERNEL);
+	if (!psi_ext)
+		return -ENOMEM;
+	psi_ext->pcpu = alloc_percpu(struct psi_group_stat_cpu);
+	if (!psi_ext->pcpu) {
+		kfree(psi_ext);
+		return -ENOMEM;
+	}
+	cgroup->psi = &psi_ext->psi;
+#else
+	cgroup->psi = kzalloc(sizeof(struct psi_group), GFP_KERNEL);
+	if (!cgroup->psi)
+		return -ENOMEM;
+#endif
+	cgroup->psi->pcpu = alloc_percpu(struct psi_group_cpu);
+	if (!cgroup->psi->pcpu) {
+#ifdef CONFIG_PSI_FINE_GRAINED
+		free_percpu(psi_ext->pcpu);
+		kfree(psi_ext);
+#else
+		kfree(cgroup->psi);
+#endif
		return -ENOMEM;
-	group_init(&cgroup->psi);
+	}
+	group_init(cgroup->psi);
	return 0;
 }
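NOTE (review annotation, not part of the patch): psi_cgroup_alloc() above
relies on struct psi_group being the first member of struct psi_group_ext,
so to_psi_group_ext() can container_of() its way back from the pointer that
cgroup->psi stores. A standalone userspace approximation of that round-trip
follows; the struct members besides the embedded group are invented.

/* Illustrative only: the embed/container_of round-trip behind to_psi_group_ext(). */
#include <stddef.h>
#include <stdio.h>

struct psi_group { int dummy; };
struct psi_group_ext {
	struct psi_group psi;	/* must stay the first member */
	int extra_stats;
};

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

int main(void)
{
	struct psi_group_ext ext = { .extra_stats = 42 };
	struct psi_group *grp = &ext.psi;	/* what cgroup->psi points at */
	struct psi_group_ext *back =
		container_of(grp, struct psi_group_ext, psi);

	printf("%d\n", back->extra_stats);	/* 42 */
	return 0;
}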
@@ -992,10 +1373,16 @@ void psi_cgroup_free(struct cgroup *cgroup)
	if (static_branch_likely(&psi_disabled))
		return;
 
-	cancel_delayed_work_sync(&cgroup->psi.avgs_work);
-	free_percpu(cgroup->psi.pcpu);
+	cancel_delayed_work_sync(&cgroup->psi->avgs_work);
+	free_percpu(cgroup->psi->pcpu);
	/* All triggers must be removed by now */
-	WARN_ONCE(cgroup->psi.poll_states, "psi: trigger leak\n");
+	WARN_ONCE(cgroup->psi->poll_states, "psi: trigger leak\n");
+#ifdef CONFIG_PSI_FINE_GRAINED
+	free_percpu(to_psi_group_ext(cgroup->psi)->pcpu);
+	kfree(to_psi_group_ext(cgroup->psi));
+#else
+	kfree(cgroup->psi);
+#endif
 }
 
 /**
@@ -1068,6 +1455,7 @@ void cgroup_move_task(struct task_struct *task, struct css_set *to)
 
 int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res)
 {
+	bool only_full = false;
	int full;
	u64 now;
 
@@ -1082,7 +1470,11 @@ int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res)
		group->avg_next_update = update_averages(group, now);
	mutex_unlock(&group->avgs_lock);
 
-	for (full = 0; full < 2; full++) {
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+	only_full = res == PSI_IRQ;
+#endif
+
+	for (full = 0; full < 2 - only_full; full++) {
		unsigned long avg[3] = { 0, };
		u64 total = 0;
		int w;
@@ -1096,7 +1488,7 @@ int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res)
		}
 
		seq_printf(m, "%s avg10=%lu.%02lu avg60=%lu.%02lu avg300=%lu.%02lu total=%llu\n",
-			   full ? "full" : "some",
+			   full || only_full ? "full" : "some",
			   LOAD_INT(avg[0]), LOAD_FRAC(avg[0]),
			   LOAD_INT(avg[1]), LOAD_FRAC(avg[1]),
			   LOAD_INT(avg[2]), LOAD_FRAC(avg[2]),
@@ -1106,36 +1498,6 @@ int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res)
	return 0;
 }
 
-static int psi_io_show(struct seq_file *m, void *v)
-{
-	return psi_show(m, &psi_system, PSI_IO);
-}
-
-static int psi_memory_show(struct seq_file *m, void *v)
-{
-	return psi_show(m, &psi_system, PSI_MEM);
-}
-
-static int psi_cpu_show(struct seq_file *m, void *v)
-{
-	return psi_show(m, &psi_system, PSI_CPU);
-}
-
-static int psi_io_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, psi_io_show, NULL);
-}
-
-static int psi_memory_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, psi_memory_show, NULL);
-}
-
-static int psi_cpu_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, psi_cpu_show, NULL);
-}
-
 struct psi_trigger *psi_trigger_create(struct psi_group *group, char *buf,
				       size_t nbytes, enum psi_res res,
				       struct kernfs_open_file *of)
@@ -1155,6 +1517,11 @@ struct psi_trigger *psi_trigger_create(struct psi_group *group, char *buf,
	else
		return ERR_PTR(-EINVAL);
 
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+	if (res == PSI_IRQ && --state != PSI_IRQ_FULL)
+		return ERR_PTR(-EINVAL);
+#endif
+
	if (state >= PSI_NONIDLE)
		return ERR_PTR(-EINVAL);
 
@@ -1304,6 +1671,37 @@ __poll_t psi_trigger_poll(void **trigger_ptr,
	return ret;
 }
 
+#ifdef CONFIG_PROC_FS
+static int psi_io_show(struct seq_file *m, void *v)
+{
+	return psi_show(m, &psi_system, PSI_IO);
+}
+
+static int psi_memory_show(struct seq_file *m, void *v)
+{
+	return psi_show(m, &psi_system, PSI_MEM);
+}
+
+static int psi_cpu_show(struct seq_file *m, void *v)
+{
+	return psi_show(m, &psi_system, PSI_CPU);
+}
+
+static int psi_io_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, psi_io_show, NULL);
+}
+
+static int psi_memory_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, psi_memory_show, NULL);
+}
+
+static int psi_cpu_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, psi_cpu_show, NULL);
+}
+
 static ssize_t psi_write(struct file *file, const char __user *user_buf,
			 size_t nbytes, enum psi_res res)
 {
@@ -1407,6 +1805,107 @@ static const struct proc_ops psi_cpu_proc_ops = {
	.proc_release	= psi_fop_release,
 };
 
+#ifdef CONFIG_PSI_FINE_GRAINED
+static const char *const psi_stat_names[] = {
+	"cgroup_memory_reclaim",
+	"global_memory_reclaim",
+	"compact",
+	"cgroup_async_memory_reclaim",
+	"swap",
+	"cpu_cfs_bandwidth",
+	"cpu_qos",
+};
+
+static void get_stat_names(struct seq_file *m, int i, bool is_full)
+{
+	if (i <= PSI_SWAP_FULL && !is_full)
+		seq_printf(m, "%s\n", psi_stat_names[i / 2]);
+	else if (i == PSI_CPU_CFS_BANDWIDTH_FULL)
+		seq_printf(m, "%s\n", "cpu_cfs_bandwidth");
+#ifdef CONFIG_QOS_SCHED
+	else if (i == PSI_CPU_QOS_FULL)
+		seq_printf(m, "%s\n", "cpu_qos");
+#endif
+}
+
+int psi_stat_show(struct seq_file *m, struct psi_group *group)
+{
+	struct psi_group_ext *psi_ext;
+	unsigned long avg[3] = {0, };
+	int i, w;
+	bool is_full;
+	u64 now, total;
+
+	if (static_branch_likely(&psi_disabled))
+		return -EOPNOTSUPP;
+
+	psi_ext = to_psi_group_ext(group);
+	mutex_lock(&group->avgs_lock);
+	now = sched_clock();
+	collect_percpu_times(group, PSI_AVGS, NULL);
+	if (now >= group->avg_next_update)
+		group->avg_next_update = update_averages(group, now);
+	mutex_unlock(&group->avgs_lock);
+	for (i = 0; i < NR_PSI_STAT_STATES; i++) {
+		is_full = i % 2 || i > PSI_SWAP_FULL;
+		for (w = 0; w < 3; w++)
+			avg[w] = psi_ext->avg[i][w];
+		total = div_u64(psi_ext->total[PSI_AVGS][i], NSEC_PER_USEC);
+		get_stat_names(m, i, is_full);
+		seq_printf(m, "%s avg10=%lu.%02lu avg60=%lu.%02lu avg300=%lu.%02lu total=%llu\n",
+			   is_full ? "full" : "some",
+			   LOAD_INT(avg[0]), LOAD_FRAC(avg[0]),
+			   LOAD_INT(avg[1]), LOAD_FRAC(avg[1]),
+			   LOAD_INT(avg[2]), LOAD_FRAC(avg[2]),
+			   total);
+	}
+	return 0;
+}
+
+static int system_psi_stat_show(struct seq_file *m, void *v)
+{
+	return psi_stat_show(m, &psi_system);
+}
+
+static int psi_stat_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, system_psi_stat_show, NULL);
+}
+
+static const struct proc_ops psi_stat_proc_ops = {
+	.proc_open	= psi_stat_open,
+	.proc_read	= seq_read,
+	.proc_lseek	= seq_lseek,
+	.proc_release	= psi_fop_release,
+};
+#endif
+
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+static int psi_irq_show(struct seq_file *m, void *v)
+{
+	return psi_show(m, &psi_system, PSI_IRQ);
+}
+
+static int psi_irq_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, psi_irq_show, NULL);
+}
+
+static ssize_t psi_irq_write(struct file *file, const char __user *user_buf,
+			     size_t nbytes, loff_t *ppos)
+{
+	return psi_write(file, user_buf, nbytes, PSI_IRQ);
+}
+
+static const struct proc_ops psi_irq_proc_ops = {
+	.proc_open	= psi_irq_open,
+	.proc_read	= seq_read,
+	.proc_lseek	= seq_lseek,
+	.proc_write	= psi_irq_write,
+	.proc_poll	= psi_fop_poll,
+	.proc_release	= psi_fop_release,
+};
+#endif
+
 static int __init psi_proc_init(void)
 {
	if (psi_enable) {
@@ -1414,7 +1913,15 @@ static int __init psi_proc_init(void)
		proc_create("pressure/io", 0, NULL, &psi_io_proc_ops);
		proc_create("pressure/memory", 0, NULL, &psi_memory_proc_ops);
		proc_create("pressure/cpu", 0, NULL, &psi_cpu_proc_ops);
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+		proc_create("pressure/irq", 0, NULL, &psi_irq_proc_ops);
+#endif
+#ifdef CONFIG_PSI_FINE_GRAINED
+		proc_create("pressure/stat", 0, NULL, &psi_stat_proc_ops);
+#endif
	}
 
	return 0;
 }
 module_init(psi_proc_init);
+
+#endif /* CONFIG_PROC_FS */
diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h
index b8b4e5b2694e94017e6c4e41451ca8bbb9c0a616..4fc84b0e29450d97cf12062eb956c60aff1210cd 100644
--- a/kernel/sched/stats.h
+++ b/kernel/sched/stats.h
@@ -75,6 +75,14 @@ static inline void rq_sched_info_depart(struct rq *rq, unsigned long long delta)
 # define   schedstat_end_time(rq, t)	do { } while (0)
 #endif /* CONFIG_SCHEDSTATS */
 
+#ifdef CONFIG_QOS_SCHED
+/*
+ * Use QOS_THROTTLED to mark cfs_rq->throttled when a cfs_rq is throttled
+ * by QoS (plain cfs bandwidth throttling sets cfs_rq->throttled to 1).
+ */
+#define QOS_THROTTLED	2
+#endif
+
 #ifdef CONFIG_PSI
 /*
  * PSI tracks state that persists across sleeps, such as iowaits and
@@ -170,6 +178,7 @@ static inline void psi_ttwu_dequeue(struct task_struct *p) {}
 static inline void psi_sched_switch(struct task_struct *prev,
				    struct task_struct *next,
				    bool sleep) {}
+static inline void psi_account_irqtime(struct task_struct *task, u32 delta) {}
 #endif /* CONFIG_PSI */
 
 #ifdef CONFIG_SCHED_INFO
diff --git a/mm/compaction.c b/mm/compaction.c
index a193af836ee6994705d892d271d6eb2544ac062a..bdcde6ea7f97eb572bf87b1bf12ba0d79868dfc4 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -2852,7 +2852,7 @@ static int kcompactd(void *p)
	pgdat->kcompactd_highest_zoneidx = pgdat->nr_zones - 1;
 
	while (!kthread_should_stop()) {
-		unsigned long pflags;
+		unsigned long pflags = 0;
 
		trace_mm_compaction_kcompactd_sleep(pgdat->node_id);
		if (wait_event_freezable_timeout(pgdat->kcompactd_wait,
diff --git a/mm/filemap.c b/mm/filemap.c
index fd4aae06ff150cbee001838854eb95e065695d94..04e4aad7ed67e42541bfd4bbe7ed7fdbb56c1053 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1194,7 +1194,7 @@ static inline int wait_on_page_bit_common(wait_queue_head_t *q,
	wait_queue_entry_t *wait = &wait_page.wait;
	bool thrashing = false;
	bool delayacct = false;
-	unsigned long pflags;
+	unsigned long pflags = 0;
 
	if (bit_nr == PG_locked &&
	    !PageUptodate(page) && PageWorkingset(page)) {
@@ -1351,7 +1351,7 @@ void migration_entry_wait_on_locked(swp_entry_t entry, pte_t *ptep,
	wait_queue_entry_t *wait = &wait_page.wait;
	bool thrashing = false;
	bool delayacct = false;
-	unsigned long pflags;
+	unsigned long pflags = 0;
	wait_queue_head_t *q;
	struct page *page = compound_head(migration_entry_to_page(entry));
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 2da152a09ea3a90956fd9383f18550c19e0f5452..b4607e8e557b8c9491bb675b2e67c2c650e8bd8a 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -110,18 +110,14 @@ static bool do_memsw_account(void)
 #define SOFTLIMIT_EVENTS_TARGET 1024
 
 /*
- * when memcg->high_async_ratio is HIGH_ASYNC_RATIO_DEFAULT, memcg async
+ * memcg warning watermark = memory.high * memcg->high_async_ratio /
+ *   HIGH_ASYNC_RATIO_BASE.
+ * When memcg usage is larger than the warning watermark but smaller than
+ * memory.high, memcg async reclaim starts;
+ * when memcg->high_async_ratio is HIGH_ASYNC_RATIO_BASE, memcg async
  * relcaim is disabled;
- * when mem_usage is larger than memory.high * memcg->high_async_ratio/
- *   HIGH_ASYNC_RATIO_BASE, start async reclaim;
- * if mem_usage is larger than memory.high * (memcg->high_async_ratio -
- * HIGH_ASYNC_RATIO_GAP) / HIGH_ASYNC_RATIO_BASE, the aim reclaim page is
- * the diff of mem_usage and memory.high * (memcg->high_async_ratio -
- * HIGH_ASYNC_RATIO_GAP) / HIGH_ASYNC_RATIO_BASE else the aim reclaim
- * page is MEMCG_CHARGE_BATCH;
- */
+ */
 
-#define HIGH_ASYNC_RATIO_DEFAULT 0
 #define HIGH_ASYNC_RATIO_BASE 100
 #define HIGH_ASYNC_RATIO_GAP 10
 
@@ -2370,15 +2366,13 @@ static int memcg_hotplug_cpu_dead(unsigned int cpu)
 static bool is_high_async_reclaim(struct mem_cgroup *memcg)
 {
	int ratio = READ_ONCE(memcg->high_async_ratio);
+	unsigned long memcg_high = READ_ONCE(memcg->memory.high);
 
-	if (ratio == HIGH_ASYNC_RATIO_DEFAULT)
-		return false;
-
-	if (READ_ONCE(memcg->memory.high) == PAGE_COUNTER_MAX)
+	if (ratio == HIGH_ASYNC_RATIO_BASE || memcg_high == PAGE_COUNTER_MAX)
		return false;
 
	return page_counter_read(&memcg->memory) >
-		(READ_ONCE(memcg->memory.high) * ratio / HIGH_ASYNC_RATIO_BASE);
+		memcg_high * ratio / HIGH_ASYNC_RATIO_BASE;
 }
 
 static unsigned long reclaim_high(struct mem_cgroup *memcg,
@@ -2386,25 +2380,19 @@ static unsigned long reclaim_high(struct mem_cgroup *memcg,
				  gfp_t gfp_mask)
 {
	unsigned long nr_reclaimed = 0;
-	bool high_async_reclaim = READ_ONCE(memcg->high_async_reclaim);
-
-	if (high_async_reclaim)
-		WRITE_ONCE(memcg->high_async_reclaim, false);
 
	do {
		unsigned long pflags;
 
-		if (high_async_reclaim) {
-			if (!is_high_async_reclaim(memcg))
-				continue;
-		} else {
-			if (page_counter_read(&memcg->memory) <=
-			    READ_ONCE(memcg->memory.high))
-				continue;
-		}
+		if (page_counter_read(&memcg->memory) <=
+		    READ_ONCE(memcg->memory.high))
+			continue;
 
		memcg_memory_event(memcg, MEMCG_HIGH);
 
+#ifdef CONFIG_PSI_FINE_GRAINED
+		pflags = PSI_MEMCG_RECLAIM;
+#endif
		psi_memstall_enter(&pflags);
		nr_reclaimed += try_to_free_mem_cgroup_pages(memcg, nr_pages,
							     gfp_mask,
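NOTE (review annotation, not part of the patch): the watermark scheme above
is easiest to see with numbers. A standalone sketch follows, using the
formulas from is_high_async_reclaim() and, below, async_reclaim_high(); all
values are invented.

/* Illustrative arithmetic only; mirrors the two watermarks in this patch. */
#include <stdio.h>

#define HIGH_ASYNC_RATIO_BASE	100
#define HIGH_ASYNC_RATIO_GAP	10

int main(void)
{
	unsigned long high = 100000;	/* memory.high, in pages (invented) */
	int ratio = 80;			/* memcg->high_async_ratio */
	unsigned long warning = high * ratio / HIGH_ASYNC_RATIO_BASE;
	unsigned long safe = high * (ratio - HIGH_ASYNC_RATIO_GAP) /
			     HIGH_ASYNC_RATIO_BASE;
	unsigned long usage = 85000;	/* current usage, above the warning mark */

	/* async reclaim starts above `warning` and aims back down to `safe` */
	printf("warning=%lu safe=%lu reclaim=%lu\n",
	       warning, safe, usage > safe ? usage - safe : 0);
	return 0;	/* prints: warning=80000 safe=70000 reclaim=15000 */
}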
@@ -2416,27 +2404,37 @@ static unsigned long reclaim_high(struct mem_cgroup *memcg,
	return nr_reclaimed;
 }
 
-static unsigned long get_reclaim_pages(struct mem_cgroup *memcg)
+static void async_reclaim_high(struct mem_cgroup *memcg)
 {
-	unsigned long nr_pages = page_counter_read(&memcg->memory);
-	int ratio = READ_ONCE(memcg->high_async_ratio);
-	unsigned long safe_pages;
+	unsigned long nr_pages, pflags;
+	unsigned long memcg_high = READ_ONCE(memcg->memory.high);
+	unsigned long memcg_usage = page_counter_read(&memcg->memory);
+	int ratio = READ_ONCE(memcg->high_async_ratio) - HIGH_ASYNC_RATIO_GAP;
+	unsigned long safe_pages = memcg_high * ratio / HIGH_ASYNC_RATIO_BASE;
 
-	ratio = ratio < HIGH_ASYNC_RATIO_GAP ? 0 : ratio - HIGH_ASYNC_RATIO_GAP;
-	safe_pages = READ_ONCE(memcg->memory.high) * ratio /
-		     HIGH_ASYNC_RATIO_BASE;
+	if (!is_high_async_reclaim(memcg)) {
+		WRITE_ONCE(memcg->high_async_reclaim, false);
+		return;
+	}
 
-	return (nr_pages > safe_pages) ? (nr_pages - safe_pages) :
-	       MEMCG_CHARGE_BATCH;
+#ifdef CONFIG_PSI_FINE_GRAINED
+	pflags = PSI_ASYNC_MEMCG_RECLAIM;
+#endif
+	psi_memstall_enter(&pflags);
+	nr_pages = memcg_usage > safe_pages ? memcg_usage - safe_pages :
+		   MEMCG_CHARGE_BATCH;
+	try_to_free_mem_cgroup_pages(memcg, nr_pages, GFP_KERNEL, true);
+	psi_memstall_leave(&pflags);
+	WRITE_ONCE(memcg->high_async_reclaim, false);
 }
 
 static void high_work_func(struct work_struct *work)
 {
-	struct mem_cgroup *memcg;
+	struct mem_cgroup *memcg = container_of(work, struct mem_cgroup,
+						high_work);
 
-	memcg = container_of(work, struct mem_cgroup, high_work);
-	if (memcg->high_async_reclaim)
-		reclaim_high(memcg, get_reclaim_pages(memcg), GFP_KERNEL);
+	if (READ_ONCE(memcg->high_async_reclaim))
+		async_reclaim_high(memcg);
	else
		reclaim_high(memcg, MEMCG_CHARGE_BATCH, GFP_KERNEL);
 }
@@ -2653,6 +2651,9 @@ void mem_cgroup_handle_over_high(void)
	 * schedule_timeout_killable sets TASK_KILLABLE). This means we don't
	 * need to account for any ill-begotten jiffies to pay them off later.
	 */
+#ifdef CONFIG_PSI_FINE_GRAINED
+	pflags = PSI_MEMCG_RECLAIM;
+#endif
	psi_memstall_enter(&pflags);
	schedule_timeout_killable(penalty_jiffies);
	psi_memstall_leave(&pflags);
@@ -2723,7 +2724,9 @@ static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
		goto nomem;
 
	memcg_memory_event(mem_over_limit, MEMCG_MAX);
-
+#ifdef CONFIG_PSI_FINE_GRAINED
+	pflags = PSI_MEMCG_RECLAIM;
+#endif
	psi_memstall_enter(&pflags);
	nr_reclaimed = try_to_free_mem_cgroup_pages(mem_over_limit, nr_pages,
						    gfp_mask, reclaim_options);
@@ -2825,9 +2828,10 @@ static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
			continue;
		}
 
-		if (is_high_async_reclaim(memcg)) {
+		if (is_high_async_reclaim(memcg) && !mem_high) {
			WRITE_ONCE(memcg->high_async_reclaim, true);
			schedule_work(&memcg->high_work);
+			break;
		}
 
		if (mem_high || swap_high) {
@@ -5737,7 +5741,7 @@ static ssize_t memcg_high_async_ratio_write(struct kernfs_open_file *of,
		return ret;
 
	if (high_async_ratio >= HIGH_ASYNC_RATIO_BASE ||
-	    high_async_ratio < HIGH_ASYNC_RATIO_DEFAULT)
+	    high_async_ratio < HIGH_ASYNC_RATIO_GAP)
		return -EINVAL;
 
	WRITE_ONCE(memcg->high_async_ratio, high_async_ratio);
@@ -6359,7 +6363,7 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
 
	page_counter_set_high(&memcg->memory, PAGE_COUNTER_MAX);
	memcg->soft_limit = PAGE_COUNTER_MAX;
-	memcg->high_async_ratio = HIGH_ASYNC_RATIO_DEFAULT;
+	memcg->high_async_ratio = HIGH_ASYNC_RATIO_BASE;
	page_counter_set_high(&memcg->swap, PAGE_COUNTER_MAX);
	if (parent) {
		memcg->swappiness = mem_cgroup_swappiness(parent);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index f21365c92a98309024ae4809418f52a15d327098..d2a8ec19315124c18c1214a22f94a0fc1b3f63a1 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -4178,6 +4178,9 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
	if (!order)
		return NULL;
 
+#ifdef CONFIG_PSI_FINE_GRAINED
+	pflags = PSI_COMPACT;
+#endif
	psi_memstall_enter(&pflags);
	noreclaim_flag = memalloc_noreclaim_save();
 
@@ -4447,6 +4450,9 @@ __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
	unsigned long pflags;
	bool drained = false;
 
+#ifdef CONFIG_PSI_FINE_GRAINED
+	pflags = PSI_GLOBAL_RECLAIM;
+#endif
	psi_memstall_enter(&pflags);
	*did_some_progress = __perform_reclaim(gfp_mask, order, ac);
	if (unlikely(!(*did_some_progress)))
diff --git a/mm/page_io.c b/mm/page_io.c
index ee28c39e566e48d5e1e72e723209582367790827..78de95b9ef5aa7e61eb5b4cc3d58b332fb6100ec 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -341,6 +341,9 @@ int swap_readpage(struct page *page, bool synchronous)
	 * or the submitting cgroup IO-throttled, submission can be a
	 * significant part of overall IO time.
	 */
+#ifdef CONFIG_PSI_FINE_GRAINED
+	pflags = PSI_SWAP;
+#endif
	psi_memstall_enter(&pflags);
 
	if (frontswap_load(page) == 0) {
diff --git a/mm/vmscan.c b/mm/vmscan.c
index dbd0757dd5a13d021d188c72498e8c5229559499..3d383c7126e3f464554d9a2e2351ad7e6e160450 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -3802,7 +3802,7 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int highest_zoneidx)
	int i;
	unsigned long nr_soft_reclaimed;
	unsigned long nr_soft_scanned;
-	unsigned long pflags;
+	unsigned long pflags = 0;
	unsigned long nr_boost_reclaim;
	unsigned long zone_boosts[MAX_NR_ZONES] = { 0, };
	bool boosted;
@@ -4448,6 +4448,9 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order)
					   sc.gfp_mask);
 
	cond_resched();
+#ifdef CONFIG_PSI_FINE_GRAINED
+	pflags = PSI_GLOBAL_RECLAIM;
+#endif
	psi_memstall_enter(&pflags);
	fs_reclaim_acquire(sc.gfp_mask);
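NOTE (review annotation, not part of the patch): the mm/ call sites in this
series all follow the same convention: the caller preloads pflags with a
psi_memstall_type before psi_memstall_enter(), which (per the
psi_memstall_enter() hunk earlier in this patch) stores it in
current->memstall_type under CONFIG_PSI_FINE_GRAINED. A condensed sketch of
that calling convention follows; my_reclaim_path() is hypothetical, the PSI
calls and the flag value are from the patch.

/* Hypothetical caller showing the annotation pattern used across mm/. */
static void my_reclaim_path(void)
{
	unsigned long pflags = 0;	/* 0 = untyped memstall */

#ifdef CONFIG_PSI_FINE_GRAINED
	pflags = PSI_GLOBAL_RECLAIM;	/* tag the stall source for pressure.stat */
#endif
	psi_memstall_enter(&pflags);
	/* ... do the actual reclaim work ... */
	psi_memstall_leave(&pflags);
}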