diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index 6e3227a688deb736f0012834ff883428dab8d470..f3fd0407d346f34996741aef5afc19bbaf517e6d 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -325,6 +325,8 @@ struct cgroup_base_stat { #ifdef CONFIG_SCHED_CORE u64 forceidle_sum; #endif + KABI_RESERVE(1) + KABI_RESERVE(2) }; /* @@ -555,6 +557,9 @@ struct cgroup { KABI_RESERVE(3) KABI_RESERVE(4) KABI_RESERVE(5) + KABI_RESERVE(6) + KABI_RESERVE(7) + KABI_RESERVE(8) /* All ancestors including self */ struct cgroup *ancestors[]; }; @@ -573,6 +578,10 @@ struct cgroup_root { /* Unique id for this hierarchy. */ int hierarchy_id; + /* A list running through the active hierarchies */ + struct list_head root_list; + struct rcu_head rcu; /* Must be near the top */ + /* * The root cgroup. The containing cgroup_root will be destroyed on its * release. cgrp->ancestors[0] will be used overflowing into the @@ -589,9 +598,6 @@ struct cgroup_root { /* Wait while cgroups are being destroyed */ wait_queue_head_t wait; - /* A list running through the active hierarchies */ - struct list_head root_list; - /* Hierarchy-specific flags */ unsigned int flags; @@ -605,6 +611,8 @@ struct cgroup_root { KABI_RESERVE(2) KABI_RESERVE(3) KABI_RESERVE(4) + KABI_RESERVE(5) + KABI_RESERVE(6) }; /* diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index b2a80e089a0acf9fd1b73b84da93a7b8c8e0f8c6..abe236201e68fdf2e8cc029499962e6684f5ee85 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -429,6 +429,14 @@ struct mem_cgroup { KABI_RESERVE(6) KABI_RESERVE(7) KABI_RESERVE(8) + KABI_RESERVE(9) + KABI_RESERVE(10) + KABI_RESERVE(11) + KABI_RESERVE(12) + KABI_RESERVE(13) + KABI_RESERVE(14) + KABI_RESERVE(15) + KABI_RESERVE(16) struct mem_cgroup_per_node *nodeinfo[]; }; diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h index 96a9bd2c26f0a6570d9e03c35a311472549aec3a..f5fb12890645c41b581d6d241e7eeb0070f7acf1 100644 --- a/kernel/cgroup/cgroup-internal.h +++ b/kernel/cgroup/cgroup-internal.h @@ -170,7 +170,8 @@ extern struct list_head cgroup_roots; /* iterate across the hierarchies */ #define for_each_root(root) \ - list_for_each_entry((root), &cgroup_roots, root_list) + list_for_each_entry_rcu((root), &cgroup_roots, root_list, \ + lockdep_is_held(&cgroup_mutex)) /** * for_each_subsys - iterate all enabled cgroup subsystems diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 52fe6ba2fefd28b62c41d12d58870ead0033e878..c26a9b3a35768df637484f1549f5ca3165255af7 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -1315,7 +1315,7 @@ static void cgroup_exit_root_id(struct cgroup_root *root) void cgroup_free_root(struct cgroup_root *root) { - kfree(root); + kfree_rcu(root, rcu); } static void cgroup_destroy_root(struct cgroup_root *root) @@ -1348,7 +1348,7 @@ static void cgroup_destroy_root(struct cgroup_root *root) spin_unlock_irq(&css_set_lock); if (!list_empty(&root->root_list)) { - list_del(&root->root_list); + list_del_rcu(&root->root_list); cgroup_root_count--; } @@ -1388,7 +1388,15 @@ static inline struct cgroup *__cset_cgroup_from_root(struct css_set *cset, } } - BUG_ON(!res_cgroup); + /* + * If cgroup_mutex is not held, the cgrp_cset_link will be freed + * before we remove the cgroup root from the root_list. Consequently, + * when accessing a cgroup root, the cset_link may have already been + * freed, resulting in a NULL res_cgroup. However, by holding the + * cgroup_mutex, we ensure that res_cgroup can't be NULL. + * If we don't hold cgroup_mutex in the caller, we must do the NULL + * check. + */ return res_cgroup; } @@ -1447,7 +1455,6 @@ static struct cgroup *current_cgns_cgroup_dfl(void) static struct cgroup *cset_cgroup_from_root(struct css_set *cset, struct cgroup_root *root) { - lockdep_assert_held(&cgroup_mutex); lockdep_assert_held(&css_set_lock); return __cset_cgroup_from_root(cset, root); @@ -1455,7 +1462,9 @@ static struct cgroup *cset_cgroup_from_root(struct css_set *cset, /* * Return the cgroup for "task" from the given hierarchy. Must be - * called with cgroup_mutex and css_set_lock held. + * called with css_set_lock held to prevent task's groups from being modified. + * Must be called with either cgroup_mutex or rcu read lock to prevent the + * cgroup root from being destroyed. */ struct cgroup *task_cgroup_from_root(struct task_struct *task, struct cgroup_root *root) @@ -2030,7 +2039,7 @@ void init_cgroup_root(struct cgroup_fs_context *ctx) struct cgroup_root *root = ctx->root; struct cgroup *cgrp = &root->cgrp; - INIT_LIST_HEAD(&root->root_list); + INIT_LIST_HEAD_RCU(&root->root_list); atomic_set(&root->nr_cgrps, 1); cgrp->root = root; init_cgroup_housekeeping(cgrp); @@ -2114,7 +2123,7 @@ int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask) * care of subsystems' refcounts, which are explicitly dropped in * the failure exit path. */ - list_add(&root->root_list, &cgroup_roots); + list_add_rcu(&root->root_list, &cgroup_roots); cgroup_root_count++; /* diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 2c9e50f09fc1dcd0bb992814552a27568b9f80f3..7ea0a6d0051920af059c3c61af71f59c1b000642 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -21,6 +21,7 @@ * License. See the file COPYING in the main directory of the Linux * distribution for more details. */ +#include "cgroup-internal.h" #include #include @@ -210,11 +211,6 @@ struct cpuset { /* Remote partition silbling list anchored at remote_children */ struct list_head remote_sibling; - - KABI_RESERVE(1) - KABI_RESERVE(2) - KABI_RESERVE(3) - KABI_RESERVE(4) }; /* @@ -5185,40 +5181,20 @@ int proc_cpuset_show(struct seq_file *m, struct pid_namespace *ns, char *buf; struct cgroup_subsys_state *css; int retval; - struct cgroup *root_cgroup = NULL; retval = -ENOMEM; buf = kmalloc(PATH_MAX, GFP_KERNEL); if (!buf) goto out; - css = task_get_css(tsk, cpuset_cgrp_id); rcu_read_lock(); - /* - * When the cpuset subsystem is mounted on the legacy hierarchy, - * the top_cpuset.css->cgroup does not hold a reference count of - * cgroup_root.cgroup. This makes accessing css->cgroup very - * dangerous because when the cpuset subsystem is remounted to the - * default hierarchy, the cgroup_root.cgroup that css->cgroup points - * to will be released, leading to a UAF issue. To avoid this problem, - * get the reference count of top_cpuset.css->cgroup first. - * - * This is ugly!! - */ - if (css == &top_cpuset.css) { - root_cgroup = css->cgroup; - if (!css_tryget_online(&root_cgroup->self)) { - rcu_read_unlock(); - retval = -EBUSY; - goto out_free; - } - } + spin_lock_irq(&css_set_lock); + css = task_css(tsk, cpuset_cgrp_id); + retval = cgroup_path_ns_locked(css->cgroup, buf, PATH_MAX, + current->nsproxy->cgroup_ns); + spin_unlock_irq(&css_set_lock); rcu_read_unlock(); - retval = cgroup_path_ns(css->cgroup, buf, PATH_MAX, - current->nsproxy->cgroup_ns); - css_put(css); - if (root_cgroup) - css_put(&root_cgroup->self); + if (retval >= PATH_MAX) retval = -ENAMETOOLONG; if (retval < 0)