diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
index 09f2d58d119b168555d1d83082b1f56c7b9dfad2..36103ca580dc90c0548ed4246537eee13bff5d46 100644
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -556,6 +556,15 @@ struct cgroup_root {
 	KABI_RESERVE(4)
 };
 
+/*
+ * To keep kABI unchanged, add cgroup_root_ext with an rcu_head to make
+ * operations on the cgroup root_list RCU safe.
+ */
+struct cgroup_root_ext {
+	struct rcu_head rcu;	/* Must be near the top */
+	struct cgroup_root root;
+};
+
 /*
  * struct cftype: handler definitions for cgroup control files
  *
diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h
index 096cee0e111a4fd4f5aaf7ce544c76e0f3d7e324..aabc2a89d6b52823e55333a135a4785b0330acee 100644
--- a/kernel/cgroup/cgroup-internal.h
+++ b/kernel/cgroup/cgroup-internal.h
@@ -173,7 +173,8 @@ extern struct list_head cgroup_roots;
 
 /* iterate across the hierarchies */
 #define for_each_root(root)						\
-	list_for_each_entry((root), &cgroup_roots, root_list)
+	list_for_each_entry_rcu((root), &cgroup_roots, root_list,	\
+				lockdep_is_held(&cgroup_mutex))
 
 /**
  * for_each_subsys - iterate all enabled cgroup subsystems
diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c
index 8798131175560fef40c728927b2b2fa92558a72b..c8500c3a93405943c67ee1111926e4178e77cdc2 100644
--- a/kernel/cgroup/cgroup-v1.c
+++ b/kernel/cgroup/cgroup-v1.c
@@ -1143,6 +1143,7 @@ static int cgroup1_root_to_use(struct fs_context *fc)
 {
 	struct cgroup_fs_context *ctx = cgroup_fc2context(fc);
 	struct cgroup_root *root;
+	struct cgroup_root_ext *root_ext;
 	struct cgroup_subsys *ss;
 	int i, ret;
 
@@ -1215,10 +1216,11 @@ static int cgroup1_root_to_use(struct fs_context *fc)
 	if (ctx->ns != &init_cgroup_ns)
 		return -EPERM;
 
-	root = kzalloc(sizeof(*root), GFP_KERNEL);
-	if (!root)
+	root_ext = kzalloc(sizeof(struct cgroup_root_ext), GFP_KERNEL);
+	if (!root_ext)
 		return -ENOMEM;
 
+	root = &root_ext->root;
 	ctx->root = root;
 	init_cgroup_root(ctx);
 
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 69b6bbaf28a31d75ba3484ba2ca1b463b296bde1..34647f8d677869b309560cdc069fd37e67dcceee 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -1310,7 +1310,10 @@ static void cgroup_exit_root_id(struct cgroup_root *root)
 
 void cgroup_free_root(struct cgroup_root *root)
 {
-	kfree(root);
+	struct cgroup_root_ext *root_ext;
+
+	root_ext = container_of(root, struct cgroup_root_ext, root);
+	kfree_rcu(root_ext, rcu);
 }
 
 static void cgroup_destroy_root(struct cgroup_root *root)
@@ -1343,7 +1346,7 @@ static void cgroup_destroy_root(struct cgroup_root *root)
 	spin_unlock_irq(&css_set_lock);
 
 	if (!list_empty(&root->root_list)) {
-		list_del(&root->root_list);
+		list_del_rcu(&root->root_list);
 		cgroup_root_count--;
 	}
 
@@ -1389,7 +1392,6 @@ current_cgns_cgroup_from_root(struct cgroup_root *root)
 	}
 	rcu_read_unlock();
 
-	BUG_ON(!res);
 	return res;
 }
 
@@ -1399,7 +1401,6 @@ static struct cgroup *cset_cgroup_from_root(struct css_set *cset,
 {
 	struct cgroup *res = NULL;
 
-	lockdep_assert_held(&cgroup_mutex);
 	lockdep_assert_held(&css_set_lock);
 
 	if (cset == &init_css_set) {
@@ -1425,7 +1426,9 @@ static struct cgroup *cset_cgroup_from_root(struct css_set *cset,
 
 /*
  * Return the cgroup for "task" from the given hierarchy. Must be
- * called with cgroup_mutex and css_set_lock held.
+ * called with css_set_lock held to prevent task's groups from being modified.
+ * Must be called with either cgroup_mutex or rcu read lock to prevent the
+ * cgroup root from being destroyed.
  */
 struct cgroup *task_cgroup_from_root(struct task_struct *task,
 				     struct cgroup_root *root)
@@ -1964,7 +1967,7 @@ void init_cgroup_root(struct cgroup_fs_context *ctx)
 	struct cgroup_root *root = ctx->root;
 	struct cgroup *cgrp = &root->cgrp;
 
-	INIT_LIST_HEAD(&root->root_list);
+	INIT_LIST_HEAD_RCU(&root->root_list);
 	atomic_set(&root->nr_cgrps, 1);
 	cgrp->root = root;
 	init_cgroup_housekeeping(cgrp);
@@ -2047,7 +2050,7 @@ int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask)
 	 * care of subsystems' refcounts, which are explicitly dropped in
 	 * the failure exit path.
 	 */
-	list_add(&root->root_list, &cgroup_roots);
+	list_add_rcu(&root->root_list, &cgroup_roots);
 	cgroup_root_count++;
 
 	/*
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index f127b7569c367fb9ca62a282d52110ce24080a6d..7ecff06d202695862d705b609ef84d95f2c8bce8 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -22,6 +22,7 @@
  *  distribution for more details.
  */
+#include "cgroup-internal.h"
 
 #include <linux/cpu.h>
 #include <linux/cpumask.h>
 #include <linux/cpuset.h>
@@ -3887,40 +3888,20 @@ int proc_cpuset_show(struct seq_file *m, struct pid_namespace *ns,
 	char *buf;
 	struct cgroup_subsys_state *css;
 	int retval;
-	struct cgroup *root_cgroup = NULL;
 
 	retval = -ENOMEM;
 	buf = kmalloc(PATH_MAX, GFP_KERNEL);
 	if (!buf)
 		goto out;
 
-	css = task_get_css(tsk, cpuset_cgrp_id);
 	rcu_read_lock();
-	/*
-	 * When the cpuset subsystem is mounted on the legacy hierarchy,
-	 * the top_cpuset.css->cgroup does not hold a reference count of
-	 * cgroup_root.cgroup. This makes accessing css->cgroup very
-	 * dangerous because when the cpuset subsystem is remounted to the
-	 * default hierarchy, the cgroup_root.cgroup that css->cgroup points
-	 * to will be released, leading to a UAF issue. To avoid this problem,
-	 * get the reference count of top_cpuset.css->cgroup first.
-	 *
-	 * This is ugly!!
-	 */
-	if (css == &top_cpuset.css) {
-		root_cgroup = css->cgroup;
-		if (!css_tryget_online(&root_cgroup->self)) {
-			rcu_read_unlock();
-			retval = -EBUSY;
-			goto out_free;
-		}
-	}
+	spin_lock_irq(&css_set_lock);
+	css = task_css(tsk, cpuset_cgrp_id);
+	retval = cgroup_path_ns_locked(css->cgroup, buf, PATH_MAX,
+				       current->nsproxy->cgroup_ns);
+	spin_unlock_irq(&css_set_lock);
 	rcu_read_unlock();
-	retval = cgroup_path_ns(css->cgroup, buf, PATH_MAX,
-				current->nsproxy->cgroup_ns);
-	css_put(css);
-	if (root_cgroup)
-		css_put(&root_cgroup->self);
+
 	if (retval >= PATH_MAX)
 		retval = -ENAMETOOLONG;
 	if (retval < 0)
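
Editor's note: the core trick in this backport is the kABI-preserving wrapper. Instead of growing struct cgroup_root with an rcu_head (which would change its layout), the patch embeds it in cgroup_root_ext, allocates the wrapper, hands out only the embedded pointer, and recovers the wrapper with container_of() on the free path so it can use kfree_rcu(). Because the size and field offsets of struct cgroup_root are untouched, existing kABI consumers keep working while the free of the root can now be deferred past RCU readers walking cgroup_roots. The sketch below is a minimal userspace illustration of that wrapper pattern, not kernel code; apart from container_of(), every name in it (struct base, struct base_ext, base_alloc(), base_free()) is hypothetical.

#include <stdio.h>
#include <stdlib.h>
#include <stddef.h>

/* simplified container_of(), same idea as the kernel macro */
#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct base {				/* stands in for struct cgroup_root */
	int id;
};

struct base_ext {			/* stands in for struct cgroup_root_ext */
	void *deferred_free_token;	/* stands in for struct rcu_head */
	struct base base;		/* embedded; callers only ever see &ext->base */
};

static struct base *base_alloc(void)
{
	/* allocate the wrapper, expose only the embedded base struct */
	struct base_ext *ext = calloc(1, sizeof(*ext));

	return ext ? &ext->base : NULL;
}

static void base_free(struct base *b)
{
	/* recover the wrapper; the kernel patch does kfree_rcu(root_ext, rcu) here */
	struct base_ext *ext = container_of(b, struct base_ext, base);

	free(ext);
}

int main(void)
{
	struct base *b = base_alloc();

	if (!b)
		return 1;
	b->id = 42;
	printf("id=%d\n", b->id);
	base_free(b);
	return 0;
}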