diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
index 47263cecb12f4c099829c162ef6d28cda488d943..b3ab93dc5f35289aca6eb4ef3c5b3bf8d8999408 100644
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -556,6 +556,15 @@ struct cgroup_root {
 	KABI_RESERVE(4)
 };
 
+/*
+ * To keep kabi unchanged, add cgroup_root_ext with an rcu_head to make
+ * operations on the cgroup root_list RCU safe
+ */
+struct cgroup_root_ext {
+	struct rcu_head rcu;	/* Must be near the top */
+	struct cgroup_root root;
+};
+
 /*
  * struct cftype: handler definitions for cgroup control files
  *
diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h
index 241a2d8c16e722ccaeeae14c3581205ac111507c..5e23b75f226f2a26938c4c16afa35a9b0fb36fd3 100644
--- a/kernel/cgroup/cgroup-internal.h
+++ b/kernel/cgroup/cgroup-internal.h
@@ -173,7 +173,8 @@ extern struct list_head cgroup_roots;
 
 /* iterate across the hierarchies */
 #define for_each_root(root)						\
-	list_for_each_entry((root), &cgroup_roots, root_list)
+	list_for_each_entry_rcu((root), &cgroup_roots, root_list,	\
+				lockdep_is_held(&cgroup_mutex))
 
 /**
  * for_each_subsys - iterate all enabled cgroup subsystems
diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c
index 647d0891cff68c37a59703473da3336e8f661f7b..248e5e0fbe4f79e711986b497557bc1d609b2cbf 100644
--- a/kernel/cgroup/cgroup-v1.c
+++ b/kernel/cgroup/cgroup-v1.c
@@ -1145,6 +1145,7 @@ static int cgroup1_root_to_use(struct fs_context *fc)
 {
 	struct cgroup_fs_context *ctx = cgroup_fc2context(fc);
 	struct cgroup_root *root;
+	struct cgroup_root_ext *root_ext;
 	struct cgroup_subsys *ss;
 	int i, ret;
 
@@ -1217,10 +1218,11 @@ static int cgroup1_root_to_use(struct fs_context *fc)
 	if (ctx->ns != &init_cgroup_ns)
 		return -EPERM;
 
-	root = kzalloc(sizeof(*root), GFP_KERNEL);
-	if (!root)
+	root_ext = kzalloc(sizeof(struct cgroup_root_ext), GFP_KERNEL);
+	if (!root_ext)
 		return -ENOMEM;
 
+	root = &root_ext->root;
 	ctx->root = root;
 	init_cgroup_root(ctx);
 
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 1cd8deb876db5544eb6cf914551b79be2ce19aed..ce4ad748d2c9cb331d1bf892a922761a3c02a3c9 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -1303,7 +1303,10 @@ static void cgroup_exit_root_id(struct cgroup_root *root)
 
 void cgroup_free_root(struct cgroup_root *root)
 {
-	kfree(root);
+	struct cgroup_root_ext *root_ext;
+
+	root_ext = container_of(root, struct cgroup_root_ext, root);
+	kfree_rcu(root_ext, rcu);
 }
 
 static void cgroup_destroy_root(struct cgroup_root *root)
@@ -1336,7 +1339,7 @@ static void cgroup_destroy_root(struct cgroup_root *root)
 	spin_unlock_irq(&css_set_lock);
 
 	if (!list_empty(&root->root_list)) {
-		list_del(&root->root_list);
+		list_del_rcu(&root->root_list);
 		cgroup_root_count--;
 	}
 
@@ -1382,7 +1385,6 @@ current_cgns_cgroup_from_root(struct cgroup_root *root)
 	}
 	rcu_read_unlock();
 
-	BUG_ON(!res);
 	return res;
 }
 
@@ -1392,7 +1394,6 @@ static struct cgroup *cset_cgroup_from_root(struct css_set *cset,
 {
 	struct cgroup *res = NULL;
 
-	lockdep_assert_held(&cgroup_mutex);
 	lockdep_assert_held(&css_set_lock);
 
 	if (cset == &init_css_set) {
@@ -1418,7 +1419,9 @@ static struct cgroup *cset_cgroup_from_root(struct css_set *cset,
 
 /*
  * Return the cgroup for "task" from the given hierarchy. Must be
- * called with cgroup_mutex and css_set_lock held.
+ * called with css_set_lock held to prevent task's groups from being modified.
+ * Must be called with either cgroup_mutex or rcu read lock to prevent the
+ * cgroup root from being destroyed.
  */
 struct cgroup *task_cgroup_from_root(struct task_struct *task,
 				     struct cgroup_root *root)
@@ -1957,7 +1960,7 @@ void init_cgroup_root(struct cgroup_fs_context *ctx)
 	struct cgroup_root *root = ctx->root;
 	struct cgroup *cgrp = &root->cgrp;
 
-	INIT_LIST_HEAD(&root->root_list);
+	INIT_LIST_HEAD_RCU(&root->root_list);
 	atomic_set(&root->nr_cgrps, 1);
 	cgrp->root = root;
 	init_cgroup_housekeeping(cgrp);
@@ -2040,7 +2043,7 @@ int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask)
 	 * care of subsystems' refcounts, which are explicitly dropped in
 	 * the failure exit path.
 	 */
-	list_add(&root->root_list, &cgroup_roots);
+	list_add_rcu(&root->root_list, &cgroup_roots);
 	cgroup_root_count++;
 
 	/*
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index ba8332982d51f52db8f56091f8d3d5cdf3df485f..3467abdd1d72b0fd670f3c5528dcd4651d539e32 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -22,6 +22,7 @@
  *  distribution for more details.
  */
 
+#include "cgroup-internal.h"
 #include <linux/cpu.h>
 #include <linux/cpumask.h>
 #include <linux/cpuset.h>
@@ -3661,40 +3662,20 @@ int proc_cpuset_show(struct seq_file *m, struct pid_namespace *ns,
 	char *buf;
 	struct cgroup_subsys_state *css;
 	int retval;
-	struct cgroup *root_cgroup = NULL;
 
 	retval = -ENOMEM;
 	buf = kmalloc(PATH_MAX, GFP_KERNEL);
 	if (!buf)
 		goto out;
 
-	css = task_get_css(tsk, cpuset_cgrp_id);
 	rcu_read_lock();
-	/*
-	 * When the cpuset subsystem is mounted on the legacy hierarchy,
-	 * the top_cpuset.css->cgroup does not hold a reference count of
-	 * cgroup_root.cgroup. This makes accessing css->cgroup very
-	 * dangerous because when the cpuset subsystem is remounted to the
-	 * default hierarchy, the cgroup_root.cgroup that css->cgroup points
-	 * to will be released, leading to a UAF issue. To avoid this problem,
-	 * get the reference count of top_cpuset.css->cgroup first.
-	 *
-	 * This is ugly!!
-	 */
-	if (css == &top_cpuset.css) {
-		root_cgroup = css->cgroup;
-		if (!css_tryget_online(&root_cgroup->self)) {
-			rcu_read_unlock();
-			retval = -EBUSY;
-			goto out_free;
-		}
-	}
+	spin_lock_irq(&css_set_lock);
+	css = task_css(tsk, cpuset_cgrp_id);
+	retval = cgroup_path_ns_locked(css->cgroup, buf, PATH_MAX,
+				       current->nsproxy->cgroup_ns);
+	spin_unlock_irq(&css_set_lock);
 	rcu_read_unlock();
-	retval = cgroup_path_ns(css->cgroup, buf, PATH_MAX,
-				current->nsproxy->cgroup_ns);
-	css_put(css);
-	if (root_cgroup)
-		css_put(&root_cgroup->self);
+
 	if (retval >= PATH_MAX)
 		retval = -ENAMETOOLONG;
 	if (retval < 0)