From 5204214f2ee8f0361ec88fa000f6d0343bf0d7d7 Mon Sep 17 00:00:00 2001 From: Zefan Li Date: Fri, 22 Dec 2023 09:24:50 +0000 Subject: [PATCH 1/4] cgroup: check if cgroup root is alive in cgroupstats_show() euler inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I8QLND CVE: N/A ------------------------------------------------- If a cgroup root is dying, show its hierarchy_id and num_cgroups as 0. Signed-off-by: Zefan Li Signed-off-by: chenridong --- kernel/cgroup/cgroup-v1.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index 134a15e1d83a..622398d492b6 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -675,6 +675,7 @@ int proc_cgroupstats_show(struct seq_file *m, void *v) { struct cgroup_subsys *ss; int i; + bool dead; seq_puts(m, "#subsys_name\thierarchy\tnum_cgroups\tenabled\n"); /* @@ -682,11 +683,13 @@ int proc_cgroupstats_show(struct seq_file *m, void *v) * cgroup_mutex contention. */ - for_each_subsys(ss, i) + for_each_subsys(ss, i) { + dead = percpu_ref_is_dying(&ss->root->cgrp.self.refcnt); seq_printf(m, "%s\t%d\t%d\t%d\n", - ss->legacy_name, ss->root->hierarchy_id, - atomic_read(&ss->root->nr_cgrps), + ss->legacy_name, dead ? 0 : ss->root->hierarchy_id, + dead ? 0 : atomic_read(&ss->root->nr_cgrps), cgroup_ssid_enabled(i)); + } return 0; } -- Gitee From 53e871a365e495e13fb06b8695e132615410ba40 Mon Sep 17 00:00:00 2001 From: Zefan Li Date: Fri, 22 Dec 2023 09:24:51 +0000 Subject: [PATCH 2/4] cgroup: wait for cgroup destruction to complete when umount hulk inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I8QLND CVE: N/A ------------------------------------------------- Since commit 3c606d35fe97 ("cgroup: prevent mount hang due to memory controller lifetime"), a cgroup root won't be destroyed if there are any child cgroups, dead or alive. This introduced a small regression. 
# cat test.sh mount -t cgroup -o cpuset xxx /cgroup mkdir /cgroup/tmp rmdir /cgroup/tmp umount /cgroup After running this script, you'll probably find the cgroup hierarchy is still active. # cat /proc/cgroups | grep cpuset #subsys_name hierarchy num_cgroups enabled cpuset 1 1 1 ... Fix this by waiting for a while when umount. Now run the script again and you'll see: # cat /proc/cgroups | grep cpuset #subsys_name hierarchy num_cgroups enabled cpuset 0 1 1 ... Cc: stable@vger.kernel.org # 3.19+ Signed-off-by: Zefan Li Signed-off-by: chenridong --- include/linux/cgroup-defs.h | 3 +++ kernel/cgroup/cgroup.c | 15 +++++++++++++++ 2 files changed, 18 insertions(+) diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index 265da00a1a8b..a864bcba7147 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -556,6 +556,9 @@ struct cgroup_root { /* Number of cgroups in the hierarchy, used only for /proc/cgroups */ atomic_t nr_cgrps; + /* Wait while cgroups are being destroyed */ + wait_queue_head_t wait; + /* A list running through the active hierarchies */ struct list_head root_list; diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 41b16ce99f54..e0304d9bc338 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -2018,6 +2018,7 @@ void init_cgroup_root(struct cgroup_fs_context *ctx) atomic_set(&root->nr_cgrps, 1); cgrp->root = root; init_cgroup_housekeeping(cgrp); + init_waitqueue_head(&root->wait); /* DYNMODS must be modified through cgroup_favor_dynmods() */ root->flags = ctx->flags & ~CGRP_ROOT_FAVOR_DYNMODS; @@ -2254,6 +2255,17 @@ static void cgroup_kill_sb(struct super_block *sb) struct kernfs_root *kf_root = kernfs_root_from_sb(sb); struct cgroup_root *root = cgroup_root_from_kf(kf_root); + /* + * Wait if there are cgroups being destroyed, because the destruction + * is asynchronous. On the other hand some controllers like memcg + * may pin cgroups for a very long time, so don't wait forever. 
+ */ + if (root != &cgrp_dfl_root) { + wait_event_timeout(root->wait, + list_empty(&root->cgrp.self.children), + msecs_to_jiffies(500)); + } + /* * If @root doesn't have any children, start killing it. * This prevents new mounts by disabling percpu_ref_tryget_live(). @@ -5445,6 +5457,9 @@ static void css_release_work_fn(struct work_struct *work) if (cgrp->kn) RCU_INIT_POINTER(*(void __rcu __force **)&cgrp->kn->priv, NULL); + if (css->parent && !css->parent->parent && + list_empty(&css->parent->children)) + wake_up(&cgrp->root->wait); } cgroup_unlock(); -- Gitee From d62e6d48028cee20e337182e278553898db5913d Mon Sep 17 00:00:00 2001 From: Lu Jialin Date: Fri, 22 Dec 2023 09:24:52 +0000 Subject: [PATCH 3/4] cgroup: Return ESRCH when add Z process into task hulk inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I8QLND CVE: NA -------------------------- When a Z (zombie) process is echoed into tasks, the write should return -ESRCH instead of 0. Signed-off-by: Lu Jialin Signed-off-by: Zheng Zengkai Signed-off-by: chenridong --- kernel/cgroup/cgroup.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index e0304d9bc338..f1ae39e38755 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -2818,6 +2818,7 @@ int cgroup_migrate(struct task_struct *leader, bool threadgroup, struct cgroup_mgctx *mgctx) { struct task_struct *task; + int err = 0; /* * The following thread iteration should be inside an RCU critical @@ -2828,12 +2829,15 @@ int cgroup_migrate(struct task_struct *leader, bool threadgroup, task = leader; do { cgroup_migrate_add_task(task, mgctx); - if (!threadgroup) + if (!threadgroup) { + if (task->flags & PF_EXITING) + err = -ESRCH; break; + } } while_each_thread(leader, task); spin_unlock_irq(&css_set_lock); - return cgroup_migrate_execute(mgctx); + return err ? 
err : cgroup_migrate_execute(mgctx); } /** -- Gitee From 4a57426e875098fb12680a0ddf76c3be8df82aec Mon Sep 17 00:00:00 2001 From: Yang Yingliang Date: Fri, 22 Dec 2023 09:24:53 +0000 Subject: [PATCH 4/4] cgroup: disable kernel memory accounting for all memory cgroups by default hulk inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I8QLND CVE: NA ---------------------------------------- The kernel memory accounting for all memory cgroups is not stable, and it will cause a 100% regression in hackbench compared with kernel-4.19, so disable it by default. We can use the following command line to enable or disable it: cgroup.memory=kmem or cgroup.memory=nokmem. Signed-off-by: Yang Yingliang Signed-off-by: Zheng Zengkai Signed-off-by: chenridong --- Documentation/admin-guide/cgroup-v1/memory.rst | 6 +++--- Documentation/admin-guide/kernel-parameters.txt | 1 + mm/memcontrol.c | 4 +++- 3 files changed, 7 insertions(+), 4 deletions(-) diff --git a/Documentation/admin-guide/cgroup-v1/memory.rst b/Documentation/admin-guide/cgroup-v1/memory.rst index ff456871bf4b..edd795855d68 100644 --- a/Documentation/admin-guide/cgroup-v1/memory.rst +++ b/Documentation/admin-guide/cgroup-v1/memory.rst @@ -320,9 +320,9 @@ the amount of kernel memory used by the system. Kernel memory is fundamentally different than user memory, since it can't be swapped out, which makes it possible to DoS the system by consuming too much of this precious resource. -Kernel memory accounting is enabled for all memory cgroups by default. But -it can be disabled system-wide by passing cgroup.memory=nokmem to the kernel -at boot time. In this case, kernel memory will not be accounted at all. +Kernel memory accounting is disabled for all memory cgroups by default. But +it can be enabled system-wide by passing cgroup.memory=kmem to the kernel +at boot time. In this case, kernel memory will all be accounted. Kernel memory limits are not imposed for the root cgroup. 
Usage for the root cgroup may or may not be accounted. The memory used is accumulated into diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 41644336e358..022a06068306 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -584,6 +584,7 @@ Format: nosocket -- Disable socket memory accounting. nokmem -- Disable kernel memory accounting. + kmem -- Enable kernel memory accounting. nobpf -- Disable BPF memory accounting. checkreqprot= [SELINUX] Set initial checkreqprot flag value. diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 2d9a873e5522..a16ea4fa154a 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -87,7 +87,7 @@ EXPORT_PER_CPU_SYMBOL_GPL(int_active_memcg); static bool cgroup_memory_nosocket __ro_after_init; /* Kernel memory accounting disabled? */ -static bool cgroup_memory_nokmem __ro_after_init; +static bool cgroup_memory_nokmem __ro_after_init = true; /* BPF memory accounting disabled? */ static bool cgroup_memory_nobpf __ro_after_init; @@ -7563,6 +7563,8 @@ static int __init cgroup_memory(char *s) cgroup_memory_nosocket = true; if (!strcmp(token, "nokmem")) cgroup_memory_nokmem = true; + else if (!strcmp(token, "kmem")) + cgroup_memory_nokmem = false; if (!strcmp(token, "nobpf")) cgroup_memory_nobpf = true; } -- Gitee