diff --git a/Documentation/admin-guide/cgroup-v1/memory.rst b/Documentation/admin-guide/cgroup-v1/memory.rst
index ff456871bf4b8b74a7e74bea664e0e6fa5e3910b..edd795855d68dc3b55d2917f78dd88b83877c964 100644
--- a/Documentation/admin-guide/cgroup-v1/memory.rst
+++ b/Documentation/admin-guide/cgroup-v1/memory.rst
@@ -320,9 +320,9 @@ the amount of kernel memory used by the system. Kernel memory is fundamentally
 different than user memory, since it can't be swapped out, which makes it
 possible to DoS the system by consuming too much of this precious resource.
 
-Kernel memory accounting is enabled for all memory cgroups by default. But
-it can be disabled system-wide by passing cgroup.memory=nokmem to the kernel
-at boot time. In this case, kernel memory will not be accounted at all.
+Kernel memory accounting is disabled for all memory cgroups by default. But
+it can be enabled system-wide by passing cgroup.memory=kmem to the kernel
+at boot time. In this case, all kernel memory will be accounted.
 
 Kernel memory limits are not imposed for the root cgroup. Usage for the root
 cgroup may or may not be accounted. The memory used is accumulated into
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 41644336e358727b5b9b184761e1d11b1332fcca..022a06068306b6997ac93bce8448a573b1f760fe 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -584,6 +584,7 @@
 			Format: <string>
 			nosocket -- Disable socket memory accounting.
 			nokmem -- Disable kernel memory accounting.
+			kmem -- Enable kernel memory accounting.
 			nobpf -- Disable BPF memory accounting.
 
 	checkreqprot=	[SELINUX] Set initial checkreqprot flag value.
diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
index 265da00a1a8b1b34c2405a6ae1e31d18cadb389f..a864bcba71478ddb063ac4dfd9193fca73c4be10 100644
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -556,6 +556,9 @@ struct cgroup_root {
 	/* Number of cgroups in the hierarchy, used only for /proc/cgroups */
 	atomic_t nr_cgrps;
 
+	/* Wait while cgroups are being destroyed */
+	wait_queue_head_t wait;
+
 	/* A list running through the active hierarchies */
 	struct list_head root_list;
 
diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c
index 134a15e1d83a9af2af2c8174e7ff3ac9dca6a751..f4d9745a48d9f5a947e6df064acfdab21f2169c5 100644
--- a/kernel/cgroup/cgroup-v1.c
+++ b/kernel/cgroup/cgroup-v1.c
@@ -675,6 +675,7 @@ int proc_cgroupstats_show(struct seq_file *m, void *v)
 {
 	struct cgroup_subsys *ss;
 	int i;
+	bool dead;
 
 	seq_puts(m, "#subsys_name\thierarchy\tnum_cgroups\tenabled\n");
 	/*
@@ -682,11 +683,13 @@ int proc_cgroupstats_show(struct seq_file *m, void *v)
 	 * cgroup_mutex contention.
 	 */
 
-	for_each_subsys(ss, i)
+	for_each_subsys(ss, i) {
+		dead = percpu_ref_is_dying(&ss->root->cgrp.self.refcnt);
 		seq_printf(m, "%s\t%d\t%d\t%d\n",
-			   ss->legacy_name, ss->root->hierarchy_id,
-			   atomic_read(&ss->root->nr_cgrps),
-			   cgroup_ssid_enabled(i));
+			   ss->legacy_name, dead ? 0 : ss->root->hierarchy_id,
+			   dead ? 0 : atomic_read(&ss->root->nr_cgrps),
+			   cgroup_ssid_enabled(i));
+	}
 
 	return 0;
 }
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 41b16ce99f54d63c7b7c77edf3cd8e98ad49fba5..6b47164ad9b4a1a2e9d7f5c391360f102db887b2 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -2018,6 +2018,7 @@ void init_cgroup_root(struct cgroup_fs_context *ctx)
 	atomic_set(&root->nr_cgrps, 1);
 	cgrp->root = root;
 	init_cgroup_housekeeping(cgrp);
+	init_waitqueue_head(&root->wait);
 
 	/* DYNMODS must be modified through cgroup_favor_dynmods() */
 	root->flags = ctx->flags & ~CGRP_ROOT_FAVOR_DYNMODS;
@@ -2254,6 +2255,17 @@ static void cgroup_kill_sb(struct super_block *sb)
 	struct kernfs_root *kf_root = kernfs_root_from_sb(sb);
 	struct cgroup_root *root = cgroup_root_from_kf(kf_root);
 
+	/*
+	 * Wait if there are cgroups being destroyed, because the destruction
+	 * is asynchronous. On the other hand, some controllers like memcg
+	 * may pin cgroups for a very long time, so don't wait forever.
+	 */
+	if (root != &cgrp_dfl_root) {
+		wait_event_timeout(root->wait,
+				   list_empty(&root->cgrp.self.children),
+				   msecs_to_jiffies(500));
+	}
+
 	/*
 	 * If @root doesn't have any children, start killing it.
 	 * This prevents new mounts by disabling percpu_ref_tryget_live().
@@ -2806,6 +2818,7 @@ int cgroup_migrate(struct task_struct *leader, bool threadgroup,
 		   struct cgroup_mgctx *mgctx)
 {
 	struct task_struct *task;
+	int err = 0;
 
 	/*
 	 * The following thread iteration should be inside an RCU critical
@@ -2816,12 +2829,15 @@ int cgroup_migrate(struct task_struct *leader, bool threadgroup,
 	task = leader;
 	do {
 		cgroup_migrate_add_task(task, mgctx);
-		if (!threadgroup)
+		if (!threadgroup) {
+			if (task->flags & PF_EXITING)
+				err = -ESRCH;
 			break;
+		}
 	} while_each_thread(leader, task);
 	spin_unlock_irq(&css_set_lock);
 
-	return cgroup_migrate_execute(mgctx);
+	return err ? err : cgroup_migrate_execute(mgctx);
 }
 
 /**
@@ -5445,6 +5461,9 @@ static void css_release_work_fn(struct work_struct *work)
 		if (cgrp->kn)
 			RCU_INIT_POINTER(*(void __rcu __force **)&cgrp->kn->priv,
 					 NULL);
+		if (css->parent && !css->parent->parent &&
+		    list_empty(&css->parent->children))
+			wake_up(&cgrp->root->wait);
 	}
 
 	cgroup_unlock();
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 2d9a873e552244d8f210d189c4e1ae53254d8b4f..a16ea4fa154adf5ba2efee8925c313680ccccced 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -87,7 +87,7 @@ EXPORT_PER_CPU_SYMBOL_GPL(int_active_memcg);
 static bool cgroup_memory_nosocket __ro_after_init;
 
 /* Kernel memory accounting disabled? */
-static bool cgroup_memory_nokmem __ro_after_init;
+static bool cgroup_memory_nokmem __ro_after_init = true;
 
 /* BPF memory accounting disabled? */
 static bool cgroup_memory_nobpf __ro_after_init;
@@ -7563,6 +7563,8 @@ static int __init cgroup_memory(char *s)
 			cgroup_memory_nosocket = true;
 		if (!strcmp(token, "nokmem"))
 			cgroup_memory_nokmem = true;
+		else if (!strcmp(token, "kmem"))
+			cgroup_memory_nokmem = false;
 		if (!strcmp(token, "nobpf"))
 			cgroup_memory_nobpf = true;
 	}
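Note (illustrative, not part of the patch): with the memcontrol.c default flipped above, kernel memory accounting stays off until the kernel is booted with the new cgroup.memory=kmem option; cgroup.memory=nokmem keeps its old meaning. The userspace sketch below assumes only the /proc/cgroups line format emitted by the seq_printf() in proc_cgroupstats_show(), and shows how the zeroed hierarchy/num_cgroups columns for a dying v1 root would look to a reader; the 0/0 check is just one way to surface that case.

/*
 * Minimal reader for /proc/cgroups, whose line format is set by the
 * seq_printf() in proc_cgroupstats_show() above:
 *     "#subsys_name\thierarchy\tnum_cgroups\tenabled\n"
 * With this patch, a controller whose v1 hierarchy root is still being
 * destroyed reports 0 for both hierarchy and num_cgroups.
 */
#include <stdio.h>

int main(void)
{
	char line[256], name[64];
	int hierarchy, num_cgroups, enabled;
	FILE *fp = fopen("/proc/cgroups", "r");

	if (!fp) {
		perror("fopen /proc/cgroups");
		return 1;
	}

	/* Skip the "#subsys_name ..." header line. */
	if (!fgets(line, sizeof(line), fp)) {
		fclose(fp);
		return 1;
	}

	while (fscanf(fp, "%63s %d %d %d", name, &hierarchy,
		      &num_cgroups, &enabled) == 4) {
		if (hierarchy == 0 && num_cgroups == 0)
			printf("%-16s root gone or being destroyed\n", name);
		else
			printf("%-16s hierarchy %d, %d cgroups, %s\n", name,
			       hierarchy, num_cgroups,
			       enabled ? "enabled" : "disabled");
	}

	fclose(fp);
	return 0;
}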