From 0fc920124ec498f5aa23297f06ccb8c25eaad58f Mon Sep 17 00:00:00 2001 From: Zefan Li Date: Fri, 22 Dec 2023 09:24:50 +0000 Subject: [PATCH 1/4] cgroup: check if cgroup root is alive in cgroupstats_show() euler inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I8QLND CVE: N/A ------------------------------------------------- If a cgroup root is dying, show its hierarchy_id and num_cgroups as 0. Signed-off-by: Zefan Li Signed-off-by: chenridong --- kernel/cgroup/cgroup-v1.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index 134a15e1d83a..f4d9745a48d9 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -675,6 +675,7 @@ int proc_cgroupstats_show(struct seq_file *m, void *v) { struct cgroup_subsys *ss; int i; + bool dead; seq_puts(m, "#subsys_name\thierarchy\tnum_cgroups\tenabled\n"); /* @@ -682,11 +683,13 @@ int proc_cgroupstats_show(struct seq_file *m, void *v) * cgroup_mutex contention. */ - for_each_subsys(ss, i) + for_each_subsys(ss, i) { + dead = percpu_ref_is_dying(&ss->root->cgrp.self.refcnt); seq_printf(m, "%s\t%d\t%d\t%d\n", - ss->legacy_name, ss->root->hierarchy_id, - atomic_read(&ss->root->nr_cgrps), - cgroup_ssid_enabled(i)); + ss->legacy_name, dead ? 0 : ss->root->hierarchy_id, + dead ? 0 : atomic_read(&ss->root->nr_cgrps), + cgroup_ssid_enabled(i)); + } return 0; } -- Gitee From c314228a77f0938d4bfa9585ccb00c176b469dc5 Mon Sep 17 00:00:00 2001 From: chenridong Date: Fri, 22 Dec 2023 09:24:51 +0000 Subject: [PATCH 2/4] cgroup: wait for cgroup destruction to complete when umount hulk inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I8QLND CVE: N/A ------------------------------------------------- Since commit 3c606d35fe97 ("cgroup: prevent mount hang due to memory controller lifetime"), a cgroup root won't be destroyed if there are any child cgroups, dead or alive. 
This introduced a small regression. # cat test.sh mount -t cgroup -o cpuset xxx /cgroup mkdir /cgroup/tmp rmdir /cgroup/tmp umount /cgroup After running this script, you'll probably find the cgroup hierarchy is still active. # cat /proc/cgroups | grep cpuset #subsys_name hierarchy num_cgroups enabled cpuset 1 1 1 ... Fix this by waiting for a while when umount. Now run the script again and you'll see: # cat /proc/cgroups | grep cpuset #subsys_name hierarchy num_cgroups enabled cpuset 0 1 1 ... Cc: stable@vger.kernel.org # 3.19+ Signed-off-by: chenridong --- include/linux/cgroup-defs.h | 3 +++ kernel/cgroup/cgroup.c | 15 +++++++++++++++ 2 files changed, 18 insertions(+) diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index 265da00a1a8b..a864bcba7147 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -556,6 +556,9 @@ struct cgroup_root { /* Number of cgroups in the hierarchy, used only for /proc/cgroups */ atomic_t nr_cgrps; + /* Wait while cgroups are being destroyed */ + wait_queue_head_t wait; + /* A list running through the active hierarchies */ struct list_head root_list; diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 41b16ce99f54..3b5d1999f4fd 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -2018,6 +2018,7 @@ void init_cgroup_root(struct cgroup_fs_context *ctx) atomic_set(&root->nr_cgrps, 1); cgrp->root = root; init_cgroup_housekeeping(cgrp); + init_waitqueue_head(&root->wait); /* DYNMODS must be modified through cgroup_favor_dynmods() */ root->flags = ctx->flags & ~CGRP_ROOT_FAVOR_DYNMODS; @@ -2254,6 +2255,17 @@ static void cgroup_kill_sb(struct super_block *sb) struct kernfs_root *kf_root = kernfs_root_from_sb(sb); struct cgroup_root *root = cgroup_root_from_kf(kf_root); + /* + * Wait if there are cgroups being destroyed, because the destruction + * is asynchronous. 
On the other hand some controllers like memcg + * may pin cgroups for a very long time, so don't wait forever. + */ + if (root != &cgrp_dfl_root) { + wait_event_timeout(root->wait, + list_empty(&root->cgrp.self.children), + msecs_to_jiffies(500)); + } + /* * If @root doesn't have any children, start killing it. * This prevents new mounts by disabling percpu_ref_tryget_live(). @@ -5445,6 +5457,9 @@ static void css_release_work_fn(struct work_struct *work) if (cgrp->kn) RCU_INIT_POINTER(*(void __rcu __force **)&cgrp->kn->priv, NULL); + if (css->parent && !css->parent->parent && + list_empty(&css->parent->children)) + wake_up(&cgrp->root->wait); } cgroup_unlock(); -- Gitee From 9b79fa9bedcaa9405291dda04d294733a73c867b Mon Sep 17 00:00:00 2001 From: Lu Jialin Date: Fri, 22 Dec 2023 09:24:52 +0000 Subject: [PATCH 3/4] cgroup: Return ESRCH when add Z process into task hulk inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I8QLND CVE: NA -------------------------- When a Z (zombie) process is echoed into tasks, the write should return -ESRCH instead of 0. Signed-off-by: Lu Jialin Signed-off-by: Zheng Zengkai Signed-off-by: chenridong --- kernel/cgroup/cgroup.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 3b5d1999f4fd..6b47164ad9b4 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -2818,6 +2818,7 @@ int cgroup_migrate(struct task_struct *leader, bool threadgroup, struct cgroup_mgctx *mgctx) { struct task_struct *task; + int err = 0; /* * The following thread iteration should be inside an RCU critical @@ -2828,12 +2829,15 @@ int cgroup_migrate(struct task_struct *leader, bool threadgroup, task = leader; do { cgroup_migrate_add_task(task, mgctx); - if (!threadgroup) + if (!threadgroup) { + if (task->flags & PF_EXITING) + err = -ESRCH; break; + } } while_each_thread(leader, task); spin_unlock_irq(&css_set_lock); - return cgroup_migrate_execute(mgctx); + return err ? 
err : cgroup_migrate_execute(mgctx); } /** -- Gitee From 494beb72270ede16efe4addc9d776d84871544f3 Mon Sep 17 00:00:00 2001 From: chenridong Date: Fri, 22 Dec 2023 09:24:53 +0000 Subject: [PATCH 4/4] cgroup: disable kernel memory accounting for all memory cgroups by default hulk inclusion category: bugfix Bugzilla: https://gitee.com/openeuler/kernel/issues/I8QLND CVE: NA ---------------------------------------- The kernel memory accounting for all memory cgroups is not stable, and it will cause a 100% regression in hackbench compared with kernel-4.19, so disable it by default. We can use the following command line to enable or disable it: cgroup.memory=kmem or cgroup.memory=nokmem. Signed-off-by: Yang Yingliang Signed-off-by: Zheng Zengkai Signed-off-by: chenridong --- Documentation/admin-guide/cgroup-v1/memory.rst | 6 +++--- Documentation/admin-guide/kernel-parameters.txt | 1 + mm/memcontrol.c | 4 +++- 3 files changed, 7 insertions(+), 4 deletions(-) diff --git a/Documentation/admin-guide/cgroup-v1/memory.rst b/Documentation/admin-guide/cgroup-v1/memory.rst index ff456871bf4b..edd795855d68 100644 --- a/Documentation/admin-guide/cgroup-v1/memory.rst +++ b/Documentation/admin-guide/cgroup-v1/memory.rst @@ -320,9 +320,9 @@ the amount of kernel memory used by the system. Kernel memory is fundamentally different than user memory, since it can't be swapped out, which makes it possible to DoS the system by consuming too much of this precious resource. -Kernel memory accounting is enabled for all memory cgroups by default. But -it can be disabled system-wide by passing cgroup.memory=nokmem to the kernel -at boot time. In this case, kernel memory will not be accounted at all. +Kernel memory accounting is disabled for all memory cgroups by default. But +it can be enabled system-wide by passing cgroup.memory=kmem to the kernel +at boot time. In this case, kernel memory will all be accounted. Kernel memory limits are not imposed for the root cgroup. 
Usage for the root cgroup may or may not be accounted. The memory used is accumulated into diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 41644336e358..022a06068306 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -584,6 +584,7 @@ Format: nosocket -- Disable socket memory accounting. nokmem -- Disable kernel memory accounting. + kmem -- Enable kernel memory accounting. nobpf -- Disable BPF memory accounting. checkreqprot= [SELINUX] Set initial checkreqprot flag value. diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 2d9a873e5522..a16ea4fa154a 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -87,7 +87,7 @@ EXPORT_PER_CPU_SYMBOL_GPL(int_active_memcg); static bool cgroup_memory_nosocket __ro_after_init; /* Kernel memory accounting disabled? */ -static bool cgroup_memory_nokmem __ro_after_init; +static bool cgroup_memory_nokmem __ro_after_init = true; /* BPF memory accounting disabled? */ static bool cgroup_memory_nobpf __ro_after_init; @@ -7563,6 +7563,8 @@ static int __init cgroup_memory(char *s) cgroup_memory_nosocket = true; if (!strcmp(token, "nokmem")) cgroup_memory_nokmem = true; + else if (!strcmp(token, "kmem")) + cgroup_memory_nokmem = false; if (!strcmp(token, "nobpf")) cgroup_memory_nobpf = true; } -- Gitee