From cb5caa3470bf1707710fe7ed36c4f0a91445e2bb Mon Sep 17 00:00:00 2001 From: Yang Shi Date: Wed, 14 Aug 2019 03:11:42 +0800 Subject: [PATCH 1/8] anolis: mm: memcontrol: support background async page reclaim ANBZ: #34087 Currently when memory usage exceeds memory cgroup limit, memory cgroup just can do sync direct reclaim. This may incur unexpected stall on some applications which are sensitive to latency. Introduce background async page reclaim mechanism, like what kswapd does. Define memcg memory usage water mark by introducing wmark_ratio interface, which is from 0 to 100 and represents percentage of max limit. The wmark_high is calculated by (max * wmark_ratio / 100), the wmark_low is (wmark_high - wmark_high >> 8), which is an empirical value. If wmark_ratio is 0, it means water mark is disabled, both wmark_low and wmark_high is max, which is the default value. If wmark_ratio is setup, when charging page, if usage is greater than wmark_high, which means the available memory of memcg is low, a work would be scheduled to do background page reclaim until memory usage is reduced to wmark_low if possible. Define a dedicated unbound workqueue for scheduling water mark reclaim works. [ kun: addjust memcg->wmark_ratio into {READ,WRITE}_ONCE. ] [ kun: remove PF_SWAPWRITE according to b698f0a1773f7 ("mm/fs: delete PF_SWAPWRITE"). ] [ kun: Add setup_memcg_wmark(memcg) in mem_cgroup_css_alloc() !parent branch. ] [Zelin Deng: Make common functions which are going to be used by memcg v2 be external.] 
Reviewed-by: Gavin Shan Reviewed-by: Xunlei Pang Signed-off-by: Yang Shi Signed-off-by: zhongjiang-ali Signed-off-by: Kun(llfl) Reviewed-by: Baolin Wang Reviewed-by: Xu Yu Link: https://gitee.com/anolis/cloud-kernel/pulls/4154 Signed-off-by: Zelin Deng --- .../admin-guide/cgroup-v1/memory.rst | 20 ++- include/linux/memcontrol.h | 38 ++++++ include/linux/page_counter.h | 8 ++ mm/memcontrol-v1.c | 35 +++++- mm/memcontrol.c | 115 ++++++++++++++++++ mm/page_counter.c | 12 ++ 6 files changed, 225 insertions(+), 3 deletions(-) diff --git a/Documentation/admin-guide/cgroup-v1/memory.rst b/Documentation/admin-guide/cgroup-v1/memory.rst index 29dfd77db68d..eeb1ec5e3704 100644 --- a/Documentation/admin-guide/cgroup-v1/memory.rst +++ b/Documentation/admin-guide/cgroup-v1/memory.rst @@ -122,6 +122,10 @@ Brief summary of control files. memory.kmem.tcp.max_usage_in_bytes show max tcp buf memory usage recorded This knob is deprecated and shouldn't be used. + memory.wmark_ratio set/show water mark ratio + memory.wmark_low low limit (memory usage low water mark, + read-only) + memory.wmark_high high limit (memory usge high water mark, ==================================== ========================================== 1. History @@ -963,7 +967,21 @@ Meanwhile, we provide the interface memory.use_prioprity_oom to decide whether t enable/disable the feature in each memcg. Write "1" to enable the priority oom and "0" to disable it. -13. TODO +13. Background reclaim +====================== + +The user could setup memory usage water mark by echoing a value to +memory.wmark_ratio. Valid value is from 0 to 100, which represents percentage +of max limit. The wmark_low and wmark_high would be calculated by max limit +and wmark_ratio. 0 means water mark is disabled, both wmark_low and wmark_high +would be max, which is the default value. 
+ +Once water mark is setup correctly, when charging pages to memcg, if the usage +exceeds wmark_high, which means available memory is low, a work would be +scheduled to reclaim pages in background to try to reduce memory usage to +wmark_low if possible. + +14. TODO ======== 1. Make per-cgroup scanner reclaim not-shared pages first diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 8951111c8418..28e82b11745c 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -361,6 +361,9 @@ struct mem_cgroup { spinlock_t event_list_lock; #endif /* CONFIG_MEMCG_V1 */ + unsigned int wmark_ratio; + struct work_struct wmark_work; + #ifdef CONFIG_MEMSLI struct mem_cgroup_lat_stat_cpu __percpu *lat_stat_cpu; #ifdef CONFIG_MEMCG_V1 @@ -395,6 +398,8 @@ struct mem_cgroup { extern struct mem_cgroup *root_mem_cgroup; +extern struct workqueue_struct *memcg_wmark_wq; + enum page_memcg_data_flags { /* page->memcg_data is a pointer to an slabobj_ext vector */ MEMCG_DATA_OBJEXTS = (1UL << 0), @@ -1877,6 +1882,18 @@ static inline bool memcg_is_dying(struct mem_cgroup *memcg) return memcg ? 
css_is_dying(&memcg->css) : false; } +static inline bool is_wmark_ok(struct mem_cgroup *memcg, bool high) +{ + if (high) + return page_counter_read(&memcg->memory) < memcg->memory.wmark_high; + + return page_counter_read(&memcg->memory) < memcg->memory.wmark_low; +} + +void setup_memcg_wmark(struct mem_cgroup *memcg); +int memory_wmark_ratio_show(struct seq_file *m, void *v); +ssize_t memory_wmark_ratio_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off); #else static inline bool mem_cgroup_kmem_disabled(void) { @@ -1952,6 +1969,27 @@ static inline bool memcg_is_dying(struct mem_cgroup *memcg) { return false; } + +static inline bool is_wmark_ok(struct mem_cgroup *memcg, bool low) +{ + return false; +} + +static inline void setup_memcg_wmark(struct mem_cgroup *memcg) +{ +} + +static inline int memory_wmark_ratio_show(struct seq_file *m, void *v) +{ + return 0; +} + +static inline ssize_t memory_wmark_ratio_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, + loff_t off) +{ + return 0; +} #endif /* CONFIG_MEMCG */ #if defined(CONFIG_MEMCG) && defined(CONFIG_ZSWAP) diff --git a/include/linux/page_counter.h b/include/linux/page_counter.h index d649b6bbbc87..a04f19532239 100644 --- a/include/linux/page_counter.h +++ b/include/linux/page_counter.h @@ -27,6 +27,10 @@ struct page_counter { atomic_long_t low_usage; atomic_long_t children_low_usage; + /* water mark low and high */ + unsigned long wmark_low; + unsigned long wmark_high; + unsigned long watermark; /* Latest cg2 reset watermark */ unsigned long local_watermark; @@ -76,6 +80,10 @@ bool page_counter_try_charge(struct page_counter *counter, void page_counter_uncharge(struct page_counter *counter, unsigned long nr_pages); void page_counter_set_min(struct page_counter *counter, unsigned long nr_pages); void page_counter_set_low(struct page_counter *counter, unsigned long nr_pages); +void page_counter_set_wmark_high(struct page_counter *counter, + unsigned long nr_pages); +void 
page_counter_set_wmark_low(struct page_counter *counter, + unsigned long nr_pages); static inline void page_counter_set_high(struct page_counter *counter, unsigned long nr_pages) diff --git a/mm/memcontrol-v1.c b/mm/memcontrol-v1.c index fb072008ea4e..7a28c1e1ae4f 100644 --- a/mm/memcontrol-v1.c +++ b/mm/memcontrol-v1.c @@ -97,6 +97,8 @@ enum { RES_MAX_USAGE, RES_FAILCNT, RES_SOFT_LIMIT, + WMARK_HIGH_LIMIT, + WMARK_LOW_LIMIT, }; #ifdef CONFIG_LOCKDEP @@ -1605,8 +1607,15 @@ static int mem_cgroup_resize_max(struct mem_cgroup *memcg, } } while (true); - if (!ret && enlarge) - memcg1_oom_recover(memcg); + if (!ret) { + setup_memcg_wmark(memcg); + + if (!is_wmark_ok(memcg, true)) + queue_work(memcg_wmark_wq, &memcg->wmark_work); + + if (enlarge) + memcg1_oom_recover(memcg); + } return ret; } @@ -1706,6 +1715,10 @@ static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css, return counter->failcnt; case RES_SOFT_LIMIT: return (u64)READ_ONCE(memcg->soft_limit) * PAGE_SIZE; + case WMARK_HIGH_LIMIT: + return (u64)counter->wmark_high * PAGE_SIZE; + case WMARK_LOW_LIMIT: + return (u64)counter->wmark_low * PAGE_SIZE; default: BUG(); } @@ -2178,6 +2191,24 @@ struct cftype mem_cgroup_legacy_files[] = { .name = "stat", .seq_show = memory_stat_show, }, + { + .name = "wmark_ratio", + .flags = CFTYPE_NOT_ON_ROOT, + .seq_show = memory_wmark_ratio_show, + .write = memory_wmark_ratio_write, + }, + { + .name = "wmark_high", + .flags = CFTYPE_NOT_ON_ROOT, + .private = MEMFILE_PRIVATE(_MEM, WMARK_HIGH_LIMIT), + .read_u64 = mem_cgroup_read_u64, + }, + { + .name = "wmark_low", + .flags = CFTYPE_NOT_ON_ROOT, + .private = MEMFILE_PRIVATE(_MEM, WMARK_LOW_LIMIT), + .read_u64 = mem_cgroup_read_u64, + }, #ifdef CONFIG_MEMSLI { .name = "direct_reclaim_global_latency", diff --git a/mm/memcontrol.c b/mm/memcontrol.c index eb6df08089ae..46d8774e11fa 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -111,6 +111,8 @@ static struct kmem_cache *memcg_pn_cachep; static 
DECLARE_WAIT_QUEUE_HEAD(memcg_cgwb_frn_waitq); #endif +struct workqueue_struct *memcg_wmark_wq; + static inline bool task_is_dying(void) { return tsk_is_oom_victim(current) || fatal_signal_pending(current) || @@ -2500,6 +2502,37 @@ static int memcg_hotplug_cpu_dead(unsigned int cpu) return 0; } +static void reclaim_wmark(struct mem_cgroup *memcg) +{ + long nr_pages; + + if (is_wmark_ok(memcg, false)) + return; + + nr_pages = page_counter_read(&memcg->memory) - + memcg->memory.wmark_low; + if (nr_pages <= 0) + return; + + nr_pages = max_t(unsigned long, SWAP_CLUSTER_MAX, nr_pages); + + try_to_free_mem_cgroup_pages(memcg, nr_pages, + GFP_KERNEL, + MEMCG_RECLAIM_MAY_SWAP, + NULL); +} + +static void wmark_work_func(struct work_struct *work) +{ + struct mem_cgroup *memcg; + + memcg = container_of(work, struct mem_cgroup, wmark_work); + + current->flags |= PF_MEMALLOC; + reclaim_wmark(memcg); + current->flags &= ~PF_MEMALLOC; +} + static unsigned long reclaim_high(struct mem_cgroup *memcg, unsigned int nr_pages, gfp_t gfp_mask) @@ -2925,6 +2958,11 @@ static int try_charge_memcg(struct mem_cgroup *memcg, gfp_t gfp_mask, do { bool mem_high, swap_high; + if (!is_wmark_ok(memcg, true)) { + queue_work(memcg_wmark_wq, &memcg->wmark_work); + break; + } + mem_high = page_counter_read(&memcg->memory) > READ_ONCE(memcg->memory.high); swap_high = page_counter_read(&memcg->swap) > @@ -4242,6 +4280,7 @@ static struct mem_cgroup *mem_cgroup_alloc(struct mem_cgroup *parent) goto fail; INIT_WORK(&memcg->high_work, high_work_func); + INIT_WORK(&memcg->wmark_work, wmark_work_func); vmpressure_init(&memcg->vmpressure); INIT_LIST_HEAD(&memcg->memory_peaks); INIT_LIST_HEAD(&memcg->swap_peaks); @@ -4302,6 +4341,7 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) page_counter_init(&memcg->kmem, &parent->kmem, false); page_counter_init(&memcg->tcpmem, &parent->tcpmem, false); #endif + WRITE_ONCE(memcg->wmark_ratio, READ_ONCE(parent->wmark_ratio)); } else { init_memcg_stats(); 
init_memcg_events(); @@ -4311,10 +4351,16 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) page_counter_init(&memcg->kmem, NULL, false); page_counter_init(&memcg->tcpmem, NULL, false); #endif + + /* initializing memcg wmark */ + setup_memcg_wmark(memcg); + root_mem_cgroup = memcg; return &memcg->css; } + setup_memcg_wmark(memcg); + if (memcg_on_dfl && !cgroup_memory_nosocket) static_branch_inc(&memcg_sockets_enabled_key); @@ -4410,6 +4456,9 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css) page_counter_set_min(&memcg->memory, 0); page_counter_set_low(&memcg->memory, 0); + page_counter_set_wmark_low(&memcg->memory, PAGE_COUNTER_MAX); + page_counter_set_wmark_high(&memcg->memory, PAGE_COUNTER_MAX); + zswap_memcg_offline_cleanup(memcg); memcg_offline_kmem(memcg); @@ -4457,6 +4506,7 @@ static void mem_cgroup_css_free(struct cgroup_subsys_state *css) vmpressure_cleanup(&memcg->vmpressure); cancel_work_sync(&memcg->high_work); + cancel_work_sync(&memcg->wmark_work); memcg1_remove_from_trees(memcg); free_shrinker_info(memcg); mem_cgroup_free(memcg); @@ -4488,6 +4538,8 @@ static void mem_cgroup_css_reset(struct cgroup_subsys_state *css) page_counter_set_min(&memcg->memory, 0); page_counter_set_low(&memcg->memory, 0); page_counter_set_high(&memcg->memory, PAGE_COUNTER_MAX); + page_counter_set_wmark_low(&memcg->memory, PAGE_COUNTER_MAX); + page_counter_set_wmark_high(&memcg->memory, PAGE_COUNTER_MAX); memcg1_soft_limit_reset(memcg); page_counter_set_high(&memcg->swap, PAGE_COUNTER_MAX); memcg_wb_domain_size_changed(memcg); @@ -4706,6 +4758,62 @@ static void mem_cgroup_lru_gen_attach(struct cgroup_taskset *tset) static void mem_cgroup_lru_gen_attach(struct cgroup_taskset *tset) {} #endif /* CONFIG_LRU_GEN */ +void setup_memcg_wmark(struct mem_cgroup *memcg) +{ + unsigned long high_wmark; + unsigned long low_wmark; + unsigned long max = memcg->memory.max; + unsigned int wmark_ratio = memcg->wmark_ratio; + + if (wmark_ratio) { + high_wmark = 
(max * wmark_ratio) / 100; + low_wmark = high_wmark - (high_wmark >> 8); + + page_counter_set_wmark_low(&memcg->memory, low_wmark); + page_counter_set_wmark_high(&memcg->memory, high_wmark); + } else { + page_counter_set_wmark_low(&memcg->memory, PAGE_COUNTER_MAX); + page_counter_set_wmark_high(&memcg->memory, PAGE_COUNTER_MAX); + } +} + +int memory_wmark_ratio_show(struct seq_file *m, void *v) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); + unsigned int wmark_ratio = READ_ONCE(memcg->wmark_ratio); + + seq_printf(m, "%d\n", wmark_ratio); + + return 0; +} + +ssize_t memory_wmark_ratio_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); + int ret, wmark_ratio; + + buf = strstrip(buf); + if (!buf) + return -EINVAL; + + ret = kstrtouint(buf, 0, &wmark_ratio); + if (ret) + return ret; + + if (wmark_ratio > 100) + return -EINVAL; + + xchg(&memcg->wmark_ratio, wmark_ratio); + + setup_memcg_wmark(memcg); + + if (!is_wmark_ok(memcg, true)) + queue_work(memcg_wmark_wq, &memcg->wmark_work); + + return nbytes; +} + static void mem_cgroup_kmem_attach(struct cgroup_taskset *tset) { struct task_struct *task; @@ -5773,6 +5881,13 @@ int __init mem_cgroup_init(void) */ BUILD_BUG_ON(MEMCG_CHARGE_BATCH > S32_MAX / PAGE_SIZE); + memcg_wmark_wq = alloc_workqueue("memcg_wmark", WQ_MEM_RECLAIM | + WQ_UNBOUND | WQ_FREEZABLE, + WQ_UNBOUND_MAX_ACTIVE); + + if (!memcg_wmark_wq) + return -ENOMEM; + cpuhp_setup_state_nocalls(CPUHP_MM_MEMCQ_DEAD, "mm/memctrl:dead", NULL, memcg_hotplug_cpu_dead); diff --git a/mm/page_counter.c b/mm/page_counter.c index 661e0f2a5127..95baabc0dc36 100644 --- a/mm/page_counter.c +++ b/mm/page_counter.c @@ -260,6 +260,18 @@ void page_counter_set_low(struct page_counter *counter, unsigned long nr_pages) propagate_protected_usage(c, atomic_long_read(&c->usage)); } +void page_counter_set_wmark_high(struct page_counter *counter, + unsigned long nr_pages) +{ + 
xchg(&counter->wmark_high, nr_pages); +} + +void page_counter_set_wmark_low(struct page_counter *counter, + unsigned long nr_pages) +{ + xchg(&counter->wmark_low, nr_pages); +} + /** * page_counter_memparse - memparse() for page counter limits * @buf: string to parse -- Gitee From 95ed81700b7b282b08b1945bded4132926fc5541 Mon Sep 17 00:00:00 2001 From: Yang Shi Date: Wed, 14 Aug 2019 05:45:18 +0800 Subject: [PATCH 2/8] anolis: mm: memcontrol: add background reclaim support for cgroupv2 ANBZ: #34087 Like v1, add background reclaim support for cgroup v2. The interfaces are exactly same with v1. However, if high limit is setup for v2, the water mark would be calculated by high limit instead of max limit. [Zelin Deng: see upstream commit c8e6002bd611c, non-blocking write also changes the limites, should update watermark anyway.] Reviewed-by: Gavin Shan Reviewed-by: Xunlei Pang Signed-off-by: Yang Shi Signed-off-by: zhongjiang-ali Signed-off-by: Kun(llfl) Reviewed-by: Baolin Wang Reviewed-by: Xu Yu Link: https://gitee.com/anolis/cloud-kernel/pulls/4154 Signed-off-by: Zelin Deng --- Documentation/admin-guide/cgroup-v2.rst | 32 ++++++++++++++ mm/memcontrol.c | 55 ++++++++++++++++++++++++- 2 files changed, 86 insertions(+), 1 deletion(-) diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst index 8ad0b2781317..bac83711dbc9 100644 --- a/Documentation/admin-guide/cgroup-v2.rst +++ b/Documentation/admin-guide/cgroup-v2.rst @@ -1447,6 +1447,38 @@ The following nested keys are defined. current memory usage for subsequent reads through the same file descriptor. + memory.wmark_ratio + A read-write single value file which exists on non-root + cgroups. The default is 0. + + Memory usage water mark. Valid value is from 0 to 100, which + represents percentage of max limit or high limit if high is setup. + The wmark_low and wmark_high would be calculated by max limit and + wmark_ratio. 
0 means water mark is disabled, both wmark_low and + wmark_high would be max, which is the default value. + + Once water mark is setup correctly, when charging pages to memcg, + if the usage exceeds wmark_high, which means available memory is low, + a work would be scheduled to reclaim pages in background to try to + reduce memory usage to wmark_low if possible. + + If memory.low is greater than memory.wmark_high, back ground reclaim + may not take effect at all due to low protection. + + memory.wmark_high + A read-only single value file which exists on non-root cgroups. + The default is max. + + Memory usage high water mark, which means the available memory is low. + For details, please refer to the above wmark_ratio section. + + memory.wmark_low + A read-only single value file which exists on non-root cgroups. + The default is max. + + Memory usage low water mark, which means the available memory is ok. + For details, please refer to the above wmark_ratio section. + memory.oom.group A read-write single value file which exists on non-root cgroups. The default value is "0". diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 46d8774e11fa..126f8d2ea163 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -4762,7 +4762,8 @@ void setup_memcg_wmark(struct mem_cgroup *memcg) { unsigned long high_wmark; unsigned long low_wmark; - unsigned long max = memcg->memory.max; + unsigned long max = memcg->memory.high > memcg->memory.max ? 
+ memcg->memory.max : memcg->memory.high; unsigned int wmark_ratio = memcg->wmark_ratio; if (wmark_ratio) { @@ -5028,10 +5029,41 @@ static ssize_t memory_high_write(struct kernfs_open_file *of, break; } out: + setup_memcg_wmark(memcg); + + if (!is_wmark_ok(memcg, true)) + queue_work(memcg_wmark_wq, &memcg->wmark_work); + memcg_wb_domain_size_changed(memcg); return nbytes; } +static int memory_wmark_low_show(struct seq_file *m, void *v) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); + unsigned long wmark_low = READ_ONCE(memcg->memory.wmark_low); + + if (wmark_low == PAGE_COUNTER_MAX) + seq_puts(m, "max\n"); + else + seq_printf(m, "%llu\n", (u64)wmark_low * PAGE_SIZE); + + return 0; +} + +static int memory_wmark_high_show(struct seq_file *m, void *v) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); + unsigned long wmark_high = READ_ONCE(memcg->memory.wmark_high); + + if (wmark_high == PAGE_COUNTER_MAX) + seq_puts(m, "max\n"); + else + seq_printf(m, "%llu\n", (u64)wmark_high * PAGE_SIZE); + + return 0; +} + static int memory_max_show(struct seq_file *m, void *v) { return seq_puts_memcg_tunable(m, @@ -5085,6 +5117,11 @@ static ssize_t memory_max_write(struct kernfs_open_file *of, cond_resched(); } out: + setup_memcg_wmark(memcg); + + if (!is_wmark_ok(memcg, true)) + queue_work(memcg_wmark_wq, &memcg->wmark_work); + memcg_wb_domain_size_changed(memcg); return nbytes; } @@ -5269,6 +5306,22 @@ static struct cftype memory_files[] = { .write_u64 = mem_cgroup_priority_oom_write, .read_u64 = mem_cgroup_priority_oom_read, }, + { + .name = "wmark_ratio", + .flags = CFTYPE_NOT_ON_ROOT, + .seq_show = memory_wmark_ratio_show, + .write = memory_wmark_ratio_write, + }, + { + .name = "wmark_high", + .flags = CFTYPE_NOT_ON_ROOT, + .seq_show = memory_wmark_high_show, + }, + { + .name = "wmark_low", + .flags = CFTYPE_NOT_ON_ROOT, + .seq_show = memory_wmark_low_show, + }, { .name = "events", .flags = CFTYPE_NOT_ON_ROOT, -- Gitee From 
52244e057091ce5f64a5fb251a2b37ad572ccd35 Mon Sep 17 00:00:00 2001 From: Yang Shi Date: Fri, 2 Aug 2019 02:01:40 +0800 Subject: [PATCH 3/8] anolis: mm: memcontrol: treat memcg wmark reclaim work as kswapd ANBZ: #34087 Since background water mark reclaim is scheduled by workqueue, it could do more work than direct reclaim, i.e. write out dirty page, etc. So, add PF_KSWAPD flag, so that current_is_kswapd() would return true for memcg background reclaim. The condition "current_is_kswapd() && !global_reclaim(sc)" is good enough to tell current is global kswapd or memcg background reclaim. And, kswapd is not allowed to break memory.low protection for now, memcg kswapd should not break it either. [Zelin Deng: Partial shrink returns first rather than async reclaim.] Reviewed-by: Gavin Shan Reviewed-by: Xunlei Pang Signed-off-by: Yang Shi Signed-off-by: zhongjiang-ali Signed-off-by: Kun(llfl) Reviewed-by: Baolin Wang Reviewed-by: Xu Yu Link: https://gitee.com/anolis/cloud-kernel/pulls/4154 Signed-off-by: Zelin Deng --- mm/memcontrol.c | 4 ++-- mm/vmscan.c | 24 +++++++++++++++++++++--- 2 files changed, 23 insertions(+), 5 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 126f8d2ea163..4110b7de854f 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2528,9 +2528,9 @@ static void wmark_work_func(struct work_struct *work) memcg = container_of(work, struct mem_cgroup, wmark_work); - current->flags |= PF_MEMALLOC; + current->flags |= PF_MEMALLOC | PF_KSWAPD; reclaim_wmark(memcg); - current->flags &= ~PF_MEMALLOC; + current->flags &= ~(PF_MEMALLOC | PF_KSWAPD); } static unsigned long reclaim_high(struct mem_cgroup *memcg, diff --git a/mm/vmscan.c b/mm/vmscan.c index 2f7330b43139..1974b16dad4d 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -6149,6 +6149,16 @@ static void shrink_node_memcgs(pg_data_t *pgdat, struct scan_control *sc) mem_cgroup_iter_break(target_memcg, memcg); break; } + + /* + * Memcg background reclaim would break iter once water + * mark is 
satisfied. + */ + if (cgroup_reclaim(sc) && current_is_kswapd() && + is_wmark_ok(target_memcg, false)) { + mem_cgroup_iter_break(target_memcg, memcg); + break; + } } while ((memcg = mem_cgroup_iter(target_memcg, memcg, partial))); } @@ -6191,7 +6201,7 @@ static void shrink_node(pg_data_t *pgdat, struct scan_control *sc) if (nr_node_reclaimed) reclaimable = true; - if (current_is_kswapd()) { + if (current_is_kswapd() && !cgroup_reclaim(sc)) { /* * If reclaim is isolating dirty pages under writeback, * it implies that the long-lived page allocation rate @@ -6474,6 +6484,10 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, __count_zid_vm_events(ALLOCSTALL, sc->reclaim_idx, 1); do { + if (current_is_kswapd() && cgroup_reclaim(sc) && + is_wmark_ok(sc->target_mem_cgroup, false)) + break; + if (!sc->proactive) vmpressure_prio(sc->gfp_mask, sc->target_mem_cgroup, sc->priority); @@ -6545,8 +6559,12 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, goto retry; } - /* Untapped cgroup reserves? Don't OOM, retry. */ - if (sc->memcg_low_skipped) { + /* + * Untapped cgroup reserves? Don't OOM, retry. + * + * Memcg kswapd should not break low protection. + */ + if (sc->memcg_low_skipped && !current_is_kswapd()) { sc->priority = initial_priority; sc->force_deactivate = 0; sc->memcg_low_reclaim = 1; -- Gitee From 987b93d20de7a9cb00466c16df6fb77e89a5cdd7 Mon Sep 17 00:00:00 2001 From: Yang Shi Date: Fri, 2 Aug 2019 11:47:44 +0800 Subject: [PATCH 4/8] anolis: mm: vmscan: make memcg kswapd set memcg state to dirty or writeback ANBZ: #34087 The global kswapd could set memory node to dirty or writeback if current scan find all pages are unqueued dirty or writeback. Then kswapd would write out dirty pages or wait for writeback done. The memcg kswapd behaves like global kswapd, and it should set dirty or writeback state to memcg too if the same condition is met. 
Since direct reclaim can't write out page caches, the system depends on kswapd to write out dirty pages if scan finds too many dirty pages in order to avoid pre-mature OOM. But, if page cache is dirtied too fast, writing out pages definitely can't catch up with dirtying pages. It is the responsibility of dirty page balance to throttle dirtying pages. [Zelin Deng: dirty flag has been removed, adapt for it.] Reviewed-by: Gavin Shan Reviewed-by: Xunlei Pang Signed-off-by: Yang Shi Signed-off-by: zhongjiang-ali Signed-off-by: Kun(llfl) Reviewed-by: Baolin Wang Reviewed-by: Xu Yu Link: https://gitee.com/anolis/cloud-kernel/pulls/4154 Signed-off-by: Zelin Deng --- include/linux/mmzone.h | 6 +++--- mm/vmscan.c | 17 +++++++++++------ 2 files changed, 14 insertions(+), 9 deletions(-) diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 03c5cd62dcb8..1998250938f0 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -435,6 +435,9 @@ enum lruvec_flags { */ LRUVEC_CGROUP_CONGESTED, LRUVEC_NODE_CONGESTED, + LRUVEC_WRITEBACK, /* reclaim scanning has recently found + * many pages under writeback + */ }; #endif /* !__GENERATING_BOUNDS_H */ @@ -1161,9 +1164,6 @@ struct zone { } ____cacheline_internodealigned_in_smp; enum pgdat_flags { - PGDAT_WRITEBACK, /* reclaim scanning has recently found - * many pages under writeback - */ PGDAT_RECLAIM_LOCKED, /* prevents concurrent reclaim */ }; diff --git a/mm/vmscan.c b/mm/vmscan.c index 1974b16dad4d..e09f5f6669dc 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1066,6 +1066,9 @@ static unsigned int shrink_folio_list(struct list_head *folio_list, u64 start = 0; bool do_demote_pass; struct swap_iocb *plug = NULL; + struct lruvec *target_lruvec; + + target_lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, pgdat); folio_batch_init(&free_folios); memset(stat, 0, sizeof(*stat)); @@ -1189,7 +1192,7 @@ static unsigned int shrink_folio_list(struct list_head *folio_list, /* Case 1 above */ if (current_is_kswapd() && 
folio_test_reclaim(folio) && - test_bit(PGDAT_WRITEBACK, &pgdat->flags)) { + test_bit(LRUVEC_WRITEBACK, &target_lruvec->flags)) { stat->nr_immediate += nr_pages; goto activate_locked; @@ -6201,7 +6204,7 @@ static void shrink_node(pg_data_t *pgdat, struct scan_control *sc) if (nr_node_reclaimed) reclaimable = true; - if (current_is_kswapd() && !cgroup_reclaim(sc)) { + if (current_is_kswapd()) { /* * If reclaim is isolating dirty pages under writeback, * it implies that the long-lived page allocation rate @@ -6214,13 +6217,13 @@ static void shrink_node(pg_data_t *pgdat, struct scan_control *sc) * the dirtying process is throttled in the same way * balance_dirty_pages() manages. * - * Once a node is flagged PGDAT_WRITEBACK, kswapd will + * Once a node is flagged LRUVEC_WRITEBACK, kswapd will * count the number of pages under pages flagged for * immediate reclaim and stall if any are encountered * in the nr_immediate check below. */ if (sc->nr.writeback && sc->nr.writeback == sc->nr.taken) - set_bit(PGDAT_WRITEBACK, &pgdat->flags); + set_bit(LRUVEC_WRITEBACK, &target_lruvec->flags); /* * If kswapd scans pages marked for immediate @@ -6244,7 +6247,7 @@ static void shrink_node(pg_data_t *pgdat, struct scan_control *sc) if (cgroup_reclaim(sc) && writeback_throttling_sane(sc)) set_bit(LRUVEC_CGROUP_CONGESTED, &target_lruvec->flags); - if (current_is_kswapd()) + if (current_is_kswapd() && !cgroup_reclaim(sc)) set_bit(LRUVEC_NODE_CONGESTED, &target_lruvec->flags); } @@ -6516,6 +6519,8 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, zone->zone_pgdat); clear_bit(LRUVEC_CGROUP_CONGESTED, &lruvec->flags); + if (current_is_kswapd()) + clear_bit(LRUVEC_WRITEBACK, &lruvec->flags); } } @@ -6967,7 +6972,7 @@ static void clear_pgdat_congested(pg_data_t *pgdat) clear_bit(LRUVEC_NODE_CONGESTED, &lruvec->flags); clear_bit(LRUVEC_CGROUP_CONGESTED, &lruvec->flags); - clear_bit(PGDAT_WRITEBACK, &pgdat->flags); + 
clear_bit(LRUVEC_WRITEBACK, &lruvec->flags); } /* -- Gitee From 9ac6b5251a7d8d92daac4e320c85fda720cf924b Mon Sep 17 00:00:00 2001 From: Yang Shi Date: Sat, 17 Aug 2019 08:04:03 +0800 Subject: [PATCH 5/8] anolis: mm: memcontrol: make distance between wmark_low and wmark_high configurable ANBZ: #34087 Introduce a new interface, wmark_scale_factor, which defines the distance between wmark_high and wmark_low. The unit is in fractions of 10,000. The default value of 50 means the distance between wmark_high and wmark_low is 0.5% of the max limit of the cgroup. The maximum value is 1000, or 10% of the max limit. The distance between wmark_low and wmark_high have impact on how hard memcg kswapd would reclaim. Reviewed-by: Gavin Shan Reviewed-by: Xunlei Pang Signed-off-by: Yang Shi Signed-off-by: zhongjiang-ali Signed-off-by: Kun(llfl) Reviewed-by: Baolin Wang Reviewed-by: Xu Yu Link: https://gitee.com/anolis/cloud-kernel/pulls/4154 Signed-off-by: Zelin Deng --- .../admin-guide/cgroup-v1/memory.rst | 3 + Documentation/admin-guide/cgroup-v2.rst | 9 +++ include/linux/memcontrol.h | 16 ++++++ mm/memcontrol-v1.c | 6 ++ mm/memcontrol.c | 55 ++++++++++++++++++- 5 files changed, 88 insertions(+), 1 deletion(-) diff --git a/Documentation/admin-guide/cgroup-v1/memory.rst b/Documentation/admin-guide/cgroup-v1/memory.rst index eeb1ec5e3704..f65efbc7619e 100644 --- a/Documentation/admin-guide/cgroup-v1/memory.rst +++ b/Documentation/admin-guide/cgroup-v1/memory.rst @@ -126,6 +126,9 @@ Brief summary of control files. memory.wmark_low low limit (memory usage low water mark, read-only) memory.wmark_high high limit (memory usge high water mark, + memory.wmark_scale_factor the gap between wmark_low and wmark_high, + percentage of max limit, default is 50 or 0.5% of max limit. + The max value is 1000 or 10% of max limit. ==================================== ========================================== 1. 
History diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst index bac83711dbc9..d54681134323 100644 --- a/Documentation/admin-guide/cgroup-v2.rst +++ b/Documentation/admin-guide/cgroup-v2.rst @@ -1479,6 +1479,15 @@ The following nested keys are defined. Memory usage low water mark, which means the available memory is ok. For details, please refer to the above wmark_ratio section. + memory.wmark_scale_factor + A read-write single value file which exists on non-root cgroups. + The default is 50. + + The gap between wmark_low and wmark_high. The unit is in fractions + of 10,000. The default value of 50 means the distance between wmark_high + and wmark_low is 0.5% of the max limit of the cgroup. The maximum value + is 1000, or 10% of max limit. + memory.oom.group A read-write single value file which exists on non-root cgroups. The default value is "0". diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 28e82b11745c..711f7394b0f7 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -363,6 +363,7 @@ struct mem_cgroup { unsigned int wmark_ratio; struct work_struct wmark_work; + unsigned int wmark_scale_factor; #ifdef CONFIG_MEMSLI struct mem_cgroup_lat_stat_cpu __percpu *lat_stat_cpu; @@ -1894,6 +1895,9 @@ void setup_memcg_wmark(struct mem_cgroup *memcg); int memory_wmark_ratio_show(struct seq_file *m, void *v); ssize_t memory_wmark_ratio_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off); +int memory_wmark_scale_factor_show(struct seq_file *m, void *v); +ssize_t memory_wmark_scale_factor_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off); #else static inline bool mem_cgroup_kmem_disabled(void) { @@ -1990,6 +1994,18 @@ static inline ssize_t memory_wmark_ratio_write(struct kernfs_open_file *of, { return 0; } + +static inline int memory_wmark_scale_factor_show(struct seq_file *m, void *v) +{ + return 0; +} + +static inline ssize_t 
memory_wmark_scale_factor_write(struct kernfs_open_file *of, + char *buf, + size_t nbytes, loff_t off) +{ + return 0; +} #endif /* CONFIG_MEMCG */ #if defined(CONFIG_MEMCG) && defined(CONFIG_ZSWAP) diff --git a/mm/memcontrol-v1.c b/mm/memcontrol-v1.c index 7a28c1e1ae4f..403cecfd1d0f 100644 --- a/mm/memcontrol-v1.c +++ b/mm/memcontrol-v1.c @@ -2209,6 +2209,12 @@ struct cftype mem_cgroup_legacy_files[] = { .private = MEMFILE_PRIVATE(_MEM, WMARK_LOW_LIMIT), .read_u64 = mem_cgroup_read_u64, }, + { + .name = "wmark_scale_factor", + .flags = CFTYPE_NOT_ON_ROOT, + .seq_show = memory_wmark_scale_factor_show, + .write = memory_wmark_scale_factor_write, + }, #ifdef CONFIG_MEMSLI { .name = "direct_reclaim_global_latency", diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 4110b7de854f..4b3b2f1ba342 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -4342,6 +4342,9 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) page_counter_init(&memcg->tcpmem, &parent->tcpmem, false); #endif WRITE_ONCE(memcg->wmark_ratio, READ_ONCE(parent->wmark_ratio)); + /* Default gap is 0.5% max limit */ + memcg->wmark_scale_factor = parent->wmark_scale_factor ? + : 50; } else { init_memcg_stats(); init_memcg_events(); @@ -4765,10 +4768,18 @@ void setup_memcg_wmark(struct mem_cgroup *memcg) unsigned long max = memcg->memory.high > memcg->memory.max ? memcg->memory.max : memcg->memory.high; unsigned int wmark_ratio = memcg->wmark_ratio; + unsigned int wmark_scale_factor = memcg->wmark_scale_factor; + unsigned long gap; if (wmark_ratio) { high_wmark = (max * wmark_ratio) / 100; - low_wmark = high_wmark - (high_wmark >> 8); + + /* + * Set the memcg watermark distance according to the + * scale factor in proportion to max limit. 
+ */ + gap = mult_frac(max, wmark_scale_factor, 10000); + low_wmark = high_wmark - gap; page_counter_set_wmark_low(&memcg->memory, low_wmark); page_counter_set_wmark_high(&memcg->memory, high_wmark); @@ -4815,6 +4826,42 @@ ssize_t memory_wmark_ratio_write(struct kernfs_open_file *of, return nbytes; } +int memory_wmark_scale_factor_show(struct seq_file *m, void *v) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); + unsigned int wmark_scale_factor; + + wmark_scale_factor = READ_ONCE(memcg->wmark_scale_factor); + + seq_printf(m, "%d\n", wmark_scale_factor); + + return 0; +} + +ssize_t memory_wmark_scale_factor_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); + int ret, wmark_scale_factor; + + buf = strstrip(buf); + if (!buf) + return -EINVAL; + + ret = kstrtouint(buf, 0, &wmark_scale_factor); + if (ret) + return ret; + + if (wmark_scale_factor > 1000 || wmark_scale_factor < 1) + return -EINVAL; + + xchg(&memcg->wmark_scale_factor, wmark_scale_factor); + + setup_memcg_wmark(memcg); + + return nbytes; +} + static void mem_cgroup_kmem_attach(struct cgroup_taskset *tset) { struct task_struct *task; @@ -5322,6 +5369,12 @@ static struct cftype memory_files[] = { .flags = CFTYPE_NOT_ON_ROOT, .seq_show = memory_wmark_low_show, }, + { + .name = "wmark_scale_factor", + .flags = CFTYPE_NOT_ON_ROOT, + .seq_show = memory_wmark_scale_factor_show, + .write = memory_wmark_scale_factor_write, + }, { .name = "events", .flags = CFTYPE_NOT_ON_ROOT, -- Gitee From 0593bbbaebef9fd4f5ece7653fdb1a2af3580e76 Mon Sep 17 00:00:00 2001 From: zhongjiang-ali Date: Mon, 1 Feb 2021 12:00:23 +0800 Subject: [PATCH 6/8] anolis: mm: count the memory pressure when wmark meets. ANBZ: #34087 It will reclaim the memory since we introduce the memcg kswapd, hence the memory pressure should be counted. 
Reviewed-by: Xunlei Pang
Signed-off-by: zhongjiang-ali
Signed-off-by: Kun(llfl)
Reviewed-by: Baolin Wang
Reviewed-by: Xu Yu
Link: https://gitee.com/anolis/cloud-kernel/pulls/4154
Signed-off-by: Zelin Deng
---
 mm/memcontrol.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 4b3b2f1ba342..1f7ae58368e5 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -2505,6 +2505,7 @@ static int memcg_hotplug_cpu_dead(unsigned int cpu)
 static void reclaim_wmark(struct mem_cgroup *memcg)
 {
 	long nr_pages;
+	unsigned long pflags;
 
 	if (is_wmark_ok(memcg, false))
 		return;
@@ -2516,10 +2517,12 @@ static void reclaim_wmark(struct mem_cgroup *memcg)
 
 	nr_pages = max_t(unsigned long, SWAP_CLUSTER_MAX, nr_pages);
 
+	psi_memstall_enter(&pflags);
 	try_to_free_mem_cgroup_pages(memcg, nr_pages,
 				GFP_KERNEL,
 				MEMCG_RECLAIM_MAY_SWAP, NULL);
+	psi_memstall_leave(&pflags);
 }
 
 static void wmark_work_func(struct work_struct *work)
-- 
Gitee

From bf7232792cd2ecfb04851537a4ed6b0640313338 Mon Sep 17 00:00:00 2001
From: Xu Yu
Date: Mon, 1 Feb 2021 12:00:23 +0800
Subject: [PATCH 7/8] anolis: mm,memcg: record latency of memcg wmark reclaim

ANBZ: #34087

The memcg background async page reclaim, a.k.a., memcg kswapd, is
implemented with a dedicated unbound workqueue in 4.19, eliminating
the original kthreads.

However, memcg kswapd will run too frequently, resulting in high
overhead, page cache thrashing, frequent dirty page writeback, etc.,
due to improper memcg memory.wmark_ratio, unreasonable memcg memory
capacity, or even abnormal memcg memory usage.

We need to find out the problematic memcg(s) where memcg kswapd
introduces significant overhead.

This records the latency of each run of memcg kswapd work, and then
aggregates it into the per-memcg exstat.
Signed-off-by: Xu Yu Reviewed-by: Xunlei Pang Signed-off-by: zhongjiang-ali Signed-off-by: Kun(llfl) Reviewed-by: Baolin Wang Link: https://gitee.com/anolis/cloud-kernel/pulls/4154 Signed-off-by: Zelin Deng --- include/linux/memcontrol.h | 19 +++++++++++++++ mm/memcontrol-v1.c | 4 ++++ mm/memcontrol.c | 47 ++++++++++++++++++++++++++++++++++++++ 3 files changed, 70 insertions(+) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 711f7394b0f7..ef14f295653f 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -46,6 +46,16 @@ enum memcg_stat_item { MEMCG_NR_STAT, }; +enum memcg_exstat_item { + MEMCG_WMARK_RECLAIM, + MEMCG_NR_EXSTAT, +}; + +/* Only care about 64bit using "long" */ +struct mem_cgroup_exstat_cpu { + unsigned long item[MEMCG_NR_EXSTAT]; +}; + enum memcg_memory_event { MEMCG_LOW, MEMCG_HIGH, @@ -361,6 +371,9 @@ struct mem_cgroup { spinlock_t event_list_lock; #endif /* CONFIG_MEMCG_V1 */ + /* memory.exstat */ + struct mem_cgroup_exstat_cpu __percpu *exstat_cpu; + unsigned int wmark_ratio; struct work_struct wmark_work; unsigned int wmark_scale_factor; @@ -1898,6 +1911,7 @@ ssize_t memory_wmark_ratio_write(struct kernfs_open_file *of, int memory_wmark_scale_factor_show(struct seq_file *m, void *v); ssize_t memory_wmark_scale_factor_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off); +int memcg_exstat_show(struct seq_file *m, void *v); #else static inline bool mem_cgroup_kmem_disabled(void) { @@ -2006,6 +2020,11 @@ static inline ssize_t memory_wmark_scale_factor_write(struct kernfs_open_file *o { return 0; } + +static inline int memcg_exstat_show(struct seq_file *m, void *v) +{ + return 0; +} #endif /* CONFIG_MEMCG */ #if defined(CONFIG_MEMCG) && defined(CONFIG_ZSWAP) diff --git a/mm/memcontrol-v1.c b/mm/memcontrol-v1.c index 403cecfd1d0f..a9c5b92975f0 100644 --- a/mm/memcontrol-v1.c +++ b/mm/memcontrol-v1.c @@ -2191,6 +2191,10 @@ struct cftype mem_cgroup_legacy_files[] = { .name = "stat", 
.seq_show = memory_stat_show, }, + { + .name = "exstat", + .seq_show = memcg_exstat_show, + }, { .name = "wmark_ratio", .flags = CFTYPE_NOT_ON_ROOT, diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 1f7ae58368e5..707089f8afa6 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2506,6 +2506,8 @@ static void reclaim_wmark(struct mem_cgroup *memcg) { long nr_pages; unsigned long pflags; + struct mem_cgroup *iter; + u64 start, duration; if (is_wmark_ok(memcg, false)) return; @@ -2517,12 +2519,26 @@ static void reclaim_wmark(struct mem_cgroup *memcg) nr_pages = max_t(unsigned long, SWAP_CLUSTER_MAX, nr_pages); + /* + * Typically, we would like to record the actual cpu% of reclaim_wmark + * work, excluding any sleep/resched time. However, currently we just + * simply record the whole duration of reclaim_wmark work for the + * overhead-accuracy trade-off. + */ + start = ktime_get_ns(); psi_memstall_enter(&pflags); try_to_free_mem_cgroup_pages(memcg, nr_pages, GFP_KERNEL, MEMCG_RECLAIM_MAY_SWAP, NULL); psi_memstall_leave(&pflags); + duration = ktime_get_ns() - start; + + css_get(&memcg->css); + for (iter = memcg; iter; iter = parent_mem_cgroup(iter)) + this_cpu_add(iter->exstat_cpu->item[MEMCG_WMARK_RECLAIM], + duration); + css_put(&memcg->css); } static void wmark_work_func(struct work_struct *work) @@ -2536,6 +2552,28 @@ static void wmark_work_func(struct work_struct *work) current->flags &= ~(PF_MEMALLOC | PF_KSWAPD); } +static u64 memcg_exstat_gather(struct mem_cgroup *memcg, + enum memcg_exstat_item idx) +{ + u64 sum = 0; + int cpu; + + for_each_online_cpu(cpu) + sum += per_cpu_ptr(memcg->exstat_cpu, cpu)->item[idx]; + + return sum; +} + +int memcg_exstat_show(struct seq_file *m, void *v) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); + + seq_printf(m, "wmark_reclaim_work_ms %llu\n", + memcg_exstat_gather(memcg, MEMCG_WMARK_RECLAIM) >> 20); + + return 0; +} + static unsigned long reclaim_high(struct mem_cgroup *memcg, unsigned int nr_pages, 
gfp_t gfp_mask) @@ -4211,6 +4249,7 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg) memcg1_free_events(memcg); kfree(memcg->vmstats); free_percpu(memcg->vmstats_percpu); + free_percpu(memcg->exstat_cpu); #ifdef CONFIG_MEMSLI free_percpu(memcg->lat_stat_cpu); #endif @@ -4252,6 +4291,10 @@ static struct mem_cgroup *mem_cgroup_alloc(struct mem_cgroup *parent) if (!memcg->vmstats_percpu) goto fail; + memcg->exstat_cpu = alloc_percpu(struct mem_cgroup_exstat_cpu); + if (!memcg->exstat_cpu) + goto fail; + #ifdef CONFIG_MEMSLI memcg->lat_stat_cpu = alloc_percpu_gfp(struct mem_cgroup_lat_stat_cpu, GFP_KERNEL_ACCOUNT); @@ -5438,6 +5481,10 @@ static struct cftype memory_files[] = { .seq_show = memcg_lat_stat_show, }, #endif /* CONFIG_MEMSLI */ + { + .name = "exstat", + .seq_show = memcg_exstat_show, + }, #ifdef CONFIG_NUMA { .name = "numa_stat", -- Gitee From fad216346524fccac88261a998875055540e6346 Mon Sep 17 00:00:00 2001 From: Wenwu Hou Date: Thu, 18 Dec 2025 20:51:03 +0800 Subject: [PATCH 8/8] anolis: mm: vmscan: Fix memleak in cgroup wmark reclaim when MGLRU is enabled ANBZ: #34087 ANBZ: #28271 A memory leak occurs in cgroup wmark reclaim when Multi-Gen LRU (MGLRU) is enabled. 
The leak originates from the following call path: kmalloc_trace+124 set_mm_walk+100 try_to_inc_max_seq.isra.0+82 try_to_shrink_lruvec+553 lru_gen_shrink_lruvec+71 shrink_node_memcgs+386 shrink_node+390 shrink_zones.constprop.0+133 do_try_to_free_pages+157 try_to_free_mem_cgroup_pages+263 wmark_work_func+153 process_one_work+397 worker_thread+631 kthread+228 ret_from_fork+48 ret_from_fork_asm+27 Root cause: MGLRU uses different allocation strategies for the `walk` structure: - Direct reclaim: allocates via kzalloc() in set_mm_walk(), frees in clear_mm_walk() - kswapd context: uses static allocation, no free needed Cgroup wmark reclaim invokes direct reclaim from a kswapd context, causing set_mm_walk() to allocate memory via kzalloc() while clear_mm_walk() skips freeing when current_is_kswapd() is true. Fix: Prevent set_mm_walk() from calling kzalloc() when in kswapd context, matching the allocation strategy expected by clear_mm_walk(). Side effect: With this fix, MGLRU generation aging will fall back to rmap-based scanning in cgroup wmark reclaim contexts, which may have a minor performance impact but prevents the memory leak. Benchmark: Following ac35a4902374 ("mm: multi-gen LRU: minimal implementation"), there is no significant regression introduced by this patch. 
fio (buffered I/O): [-1.05, +1.07]% IOPS BW Before patch: 287.8k 10.98GiB/s After patch: 290.9k 11.09GiB/s Fixes: 5dc304916bc2 ("anolis: mm: memcontrol: treat memcg wmark reclaim work as kswapd") Signed-off-by: Wenwu Hou Reviewed-by: Baolin Wang Link: https://gitee.com/anolis/cloud-kernel/pulls/6224 Signed-off-by: Zelin Deng --- mm/vmscan.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index e09f5f6669dc..fbc7dc8d0c1f 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -3809,12 +3809,9 @@ static struct lru_gen_mm_walk *set_mm_walk(struct pglist_data *pgdat, bool force VM_WARN_ON_ONCE(walk); walk = &pgdat->mm_walk; - } else if (!walk && force_alloc) { - VM_WARN_ON_ONCE(current_is_kswapd()); - + } else if (!walk && force_alloc && !current_is_kswapd()) walk = kzalloc_obj(*walk, __GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN); - } current->reclaim_state->mm_walk = walk; -- Gitee