diff --git a/arch/arm64/configs/openeuler_defconfig b/arch/arm64/configs/openeuler_defconfig
index 63abdb3f8c635e5ac12a3fcd9fcc71aca2c2f5e5..eaeb0628fc21a754ae12b358adbedee37ab7fa5b 100644
--- a/arch/arm64/configs/openeuler_defconfig
+++ b/arch/arm64/configs/openeuler_defconfig
@@ -177,6 +177,7 @@ CONFIG_CGROUP_BPF=y
 # CONFIG_CGROUP_MISC is not set
 # CONFIG_CGROUP_DEBUG is not set
 CONFIG_SOCK_CGROUP_DATA=y
+CONFIG_CGROUP_FILES=y
 CONFIG_NAMESPACES=y
 CONFIG_UTS_NS=y
 CONFIG_TIME_NS=y
diff --git a/arch/x86/configs/openeuler_defconfig b/arch/x86/configs/openeuler_defconfig
index 0e05e7a15fdb6c360159b05f7e42db4523d910c5..380f45a73254285b77a2e87791b862ec394c8419 100644
--- a/arch/x86/configs/openeuler_defconfig
+++ b/arch/x86/configs/openeuler_defconfig
@@ -200,6 +200,7 @@ CONFIG_CGROUP_BPF=y
 # CONFIG_CGROUP_MISC is not set
 # CONFIG_CGROUP_DEBUG is not set
 CONFIG_SOCK_CGROUP_DATA=y
+CONFIG_CGROUP_FILES=y
 CONFIG_NAMESPACES=y
 CONFIG_UTS_NS=y
 CONFIG_TIME_NS=y
diff --git a/fs/Makefile b/fs/Makefile
index 5bfdbf0d70373e015af5fe037dde7c0e5e8291e9..5030041ea469abda36f42a86905a7170f37dd1d1 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -52,6 +52,7 @@ obj-$(CONFIG_COREDUMP)	+= coredump.o
 obj-$(CONFIG_SYSCTL)		+= drop_caches.o sysctls.o
 
 obj-$(CONFIG_FHANDLE)		+= fhandle.o
+obj-$(CONFIG_CGROUP_FILES)	+= filescontrol.o
 obj-y				+= iomap/
 
 obj-y				+= quota/
diff --git a/fs/dcache.c b/fs/dcache.c
index 52e6d5fdab6bdf868f63e61c69c84ee74aa98f72..988f763bda7846b1b8e51f7f25c5666402984d68 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -1835,6 +1835,18 @@ static struct dentry *__d_alloc(struct super_block *sb, const struct qstr *name)
 	return dentry;
 }
 
+static inline bool d_forbid_overflow(struct dentry *dentry)
+{
+	/* Returns true while the count of @dentry is below D_COUNT_MAX. */
+	if (likely(d_count(dentry) < D_COUNT_MAX))
+		return true;
+
+	/* At the cap: shrink the dcache below @dentry, then re-check. */
+	shrink_dcache_parent(dentry);
+
+	return d_count(dentry) < D_COUNT_MAX;
+}
+
 /**
  * d_alloc - allocate a dcache entry
  * @parent: parent of entry to allocate
@@ -1846,9 +1858,15 @@ static struct dentry
*__d_alloc(struct super_block *sb, const struct qstr *name) */ struct dentry *d_alloc(struct dentry * parent, const struct qstr *name) { - struct dentry *dentry = __d_alloc(parent->d_sb, name); + struct dentry *dentry = NULL; + + if (unlikely(!d_forbid_overflow(parent))) + goto out; + + dentry = __d_alloc(parent->d_sb, name); if (!dentry) - return NULL; + goto out; + spin_lock(&parent->d_lock); /* * don't need child lock because it is not subject @@ -1858,7 +1876,7 @@ struct dentry *d_alloc(struct dentry * parent, const struct qstr *name) dentry->d_parent = parent; list_add(&dentry->d_child, &parent->d_subdirs); spin_unlock(&parent->d_lock); - +out: return dentry; } EXPORT_SYMBOL(d_alloc); @@ -1871,11 +1889,17 @@ EXPORT_SYMBOL(d_alloc_anon); struct dentry *d_alloc_cursor(struct dentry * parent) { - struct dentry *dentry = d_alloc_anon(parent->d_sb); + struct dentry *dentry = NULL; + + if (unlikely(!d_forbid_overflow(parent))) + goto out; + + dentry = d_alloc_anon(parent->d_sb); if (dentry) { dentry->d_flags |= DCACHE_DENTRY_CURSOR; dentry->d_parent = dget(parent); } +out: return dentry; } diff --git a/fs/file.c b/fs/file.c index 7893ea161d77075ca0ab524d7c8021ab668c0978..68f5c936a236a37a583de563909269bc38504e89 100644 --- a/fs/file.c +++ b/fs/file.c @@ -19,6 +19,7 @@ #include #include #include +#include #include #include @@ -337,6 +338,9 @@ struct files_struct *dup_fd(struct files_struct *oldf, unsigned int max_fds, int new_fdt->open_fds = newf->open_fds_init; new_fdt->full_fds_bits = newf->full_fds_bits_init; new_fdt->fd = &newf->fd_array[0]; +#ifdef CONFIG_CGROUP_FILES + files_cgroup_assign(newf); +#endif spin_lock(&oldf->file_lock); old_fdt = files_fdtable(oldf); @@ -400,10 +404,29 @@ struct files_struct *dup_fd(struct files_struct *oldf, unsigned int max_fds, int memset(new_fds, 0, (new_fdt->max_fds - open_files) * sizeof(struct file *)); rcu_assign_pointer(newf->fdt, new_fdt); +#ifdef CONFIG_CGROUP_FILES + if (!files_cgroup_alloc_fd(newf, 
files_cgroup_count_fds(newf))) + return newf; + +/* could not get enough FD resources. Need to clean up. */ + new_fds = new_fdt->fd; + for (i = open_files; i != 0; i--) { + struct file *f = *new_fds++; + if (f) + fput(f); + } + if (new_fdt != &newf->fdtab) + __free_fdtable(new_fdt); + *errorp = -EMFILE; +#else return newf; +#endif out_release: +#ifdef CONFIG_CGROUP_FILES + files_cgroup_remove(newf); +#endif kmem_cache_free(files_cachep, newf); out: return NULL; @@ -429,6 +452,9 @@ static struct fdtable *close_files(struct files_struct * files) if (set & 1) { struct file * file = xchg(&fdt->fd[i], NULL); if (file) { +#ifdef CONFIG_CGROUP_FILES + files_cgroup_unalloc_fd(files, 1); +#endif filp_close(file, files); cond_resched(); } @@ -531,6 +557,12 @@ static int alloc_fd(unsigned start, unsigned end, unsigned flags) */ if (error) goto repeat; +#ifdef CONFIG_CGROUP_FILES + if (files_cgroup_alloc_fd(files, 1)) { + error = -EMFILE; + goto out; + } +#endif if (start <= files->next_fd) files->next_fd = fd + 1; @@ -568,6 +600,10 @@ EXPORT_SYMBOL(get_unused_fd_flags); static void __put_unused_fd(struct files_struct *files, unsigned int fd) { struct fdtable *fdt = files_fdtable(files); +#ifdef CONFIG_CGROUP_FILES + if (test_bit(fd, fdt->open_fds)) + files_cgroup_unalloc_fd(files, 1); +#endif __clear_open_fd(fd, fdt); if (fd < files->next_fd) files->next_fd = fd; @@ -1090,6 +1126,7 @@ static int do_dup2(struct files_struct *files, struct file *file, unsigned fd, unsigned flags) __releases(&files->file_lock) { + int err; struct file *tofree; struct fdtable *fdt; @@ -1109,8 +1146,16 @@ __releases(&files->file_lock) */ fdt = files_fdtable(files); tofree = fdt->fd[fd]; - if (!tofree && fd_is_open(fd, fdt)) - goto Ebusy; + if (!tofree && fd_is_open(fd, fdt)) { + err = -EBUSY; + goto out; + } +#ifdef CONFIG_CGROUP_FILES + if (!tofree && files_cgroup_alloc_fd(files, 1)) { + err = -EMFILE; + goto out; + } +#endif get_file(file); rcu_assign_pointer(fdt->fd[fd], file); 
__set_open_fd(fd, fdt);
@@ -1125,9 +1170,9 @@ __releases(&files->file_lock)
 
 	return fd;
 
-Ebusy:
+out:
 	spin_unlock(&files->file_lock);
-	return -EBUSY;
+	return err;
 }
 
 int replace_fd(unsigned fd, struct file *file, unsigned flags)
diff --git a/fs/filescontrol.c b/fs/filescontrol.c
new file mode 100644
index 0000000000000000000000000000000000000000..fdd557a246be1541494f9cfa38c3b2e243180132
--- /dev/null
+++ b/fs/filescontrol.c
@@ -0,0 +1,338 @@
+// SPDX-License-Identifier: GPL-2.0
+/* filescontrol.c - Cgroup controller for open file handles.
+ *
+ * Copyright 2014 Google Inc.
+ * Author: Brian Makin
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+#include <linux/bitmap.h>
+#include <linux/page_counter.h>
+#include <linux/filescontrol.h>
+#include <linux/cgroup.h>
+#include <linux/export.h>
+#include <linux/printk.h>
+#include <linux/slab.h>
+#include <linux/fs.h>
+#include <linux/seq_file.h>
+#include <linux/fdtable.h>
+#include <linux/sched/signal.h>
+#include <linux/module.h>
+
+/* Upper bound for a files limit; limits at or above it read back as "max". */
+#define FILES_MAX D_COUNT_MAX
+#define FILES_MAX_STR "max"
+
+/* When true, fd charging and uncharging are skipped (see files.no_acct). */
+static bool no_acct;
+struct cgroup_subsys files_cgrp_subsys __read_mostly;
+EXPORT_SYMBOL(files_cgrp_subsys);
+
+module_param(no_acct, bool, 0444);
+
+/* Per-cgroup state: the css plus a hierarchical counter of open fds. */
+struct files_cgroup {
+	struct cgroup_subsys_state css;
+	struct page_counter open_handles;
+};
+
+/* Map a css to its containing files_cgroup; a NULL css maps to NULL. */
+static inline struct files_cgroup *css_fcg(struct cgroup_subsys_state *css)
+{
+	return css ? container_of(css, struct files_cgroup, css) : NULL;
+}
+
+/* Return the open-handles counter of @css; @css must not be NULL. */
+static inline struct page_counter *
+css_res_open_handles(struct cgroup_subsys_state *css)
+{
+	return &css_fcg(css)->open_handles;
+}
+
+/* Return the files_cgroup that @files is currently bound to. */
+static inline struct files_cgroup *
+files_cgroup_from_files(struct files_struct *files)
+{
+	return files->files_cgroup;
+}
+
+/*
+ * Allocate a files_cgroup whose counter is chained to the parent's counter
+ * (or has no parent counter for the root); the limit starts at FILES_MAX,
+ * which reads back as "max".
+ */
+static struct cgroup_subsys_state *
+files_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
+{
+	struct files_cgroup *parent_fcg = css_fcg(parent_css);
+	struct files_cgroup *fcg;
+
+	fcg = kzalloc(sizeof(*fcg), GFP_KERNEL);
+	if (!fcg)
+		return ERR_PTR(-ENOMEM);
+
+	page_counter_init(&fcg->open_handles,
+			  parent_fcg ? &parent_fcg->open_handles : NULL);
+	page_counter_set_max(&fcg->open_handles, FILES_MAX);
+
+	return &fcg->css;
+}
+
+static void files_cgroup_css_free(struct cgroup_subsys_state *css)
+{
+	kfree(css_fcg(css));
+}
+
+/*
+ * Count the descriptors marked in the open_fds bitmap of @files.
+ *
+ * files_fdtable() is an RCU dereference, so take the RCU read lock here
+ * rather than relying on every caller holding files->file_lock.
+ */
+u64 files_cgroup_count_fds(struct files_struct *files)
+{
+	struct fdtable *fdt;
+	u64 nr_open;
+
+	rcu_read_lock();
+	fdt = files_fdtable(files);
+	nr_open = bitmap_weight(fdt->open_fds, fdt->max_fds);
+	rcu_read_unlock();
+
+	return nr_open;
+}
+
+/*
+ * If attaching this cgroup would overcommit the resource then deny
+ * the attach. If not, move the file charges into the new cgroup.
+ *
+ * NOTE(review): only the first task of @tset is examined and there is no
+ * cancel_attach callback, so the transfer is not undone if the migration
+ * is rejected later on - confirm this is acceptable.
+ */
+static int files_cgroup_can_attach(struct cgroup_taskset *tset)
+{
+	struct cgroup_subsys_state *to_css;
+	struct cgroup_subsys_state *from_css;
+	struct page_counter *from_res;
+	struct page_counter *to_res;
+	struct page_counter *fail_res;
+	struct files_struct *files;
+	struct task_struct *task = cgroup_taskset_first(tset, &to_css);
+	u64 num_files;
+	int ret = 0;
+
+	to_res = css_res_open_handles(to_css);
+
+	task_lock(task);
+	files = task->files;
+	if (!files || files == &init_files)
+		goto out_unlock;
+
+	from_css = &files_cgroup_from_files(files)->css;
+	from_res = css_res_open_handles(from_css);
+
+	spin_lock(&files->file_lock);
+	num_files = files_cgroup_count_fds(files);
+	page_counter_uncharge(from_res, num_files);
+
+	if (!page_counter_try_charge(to_res, num_files, &fail_res)) {
+		/* Destination is full: put the charge back where it was. */
+		page_counter_charge(from_res, num_files);
+		pr_err("Open files limit overcommited\n");
+		ret = -ENOSPC;
+	} else {
+		css_put(from_css);
+		css_get(to_css);
+		files->files_cgroup = css_fcg(to_css);
+	}
+	spin_unlock(&files->file_lock);
+out_unlock:
+	task_unlock(task);
+	return ret;
+}
+
+/*
+ * Charge @n descriptors to the cgroup of @files.
+ * Returns 0 on success or -ENOMEM when the charge is refused.
+ */
+int files_cgroup_alloc_fd(struct files_struct *files, u64 n)
+{
+	/*
+	 * Kernel threads forked by kthreadd inherit the constant
+	 * files_struct 'init_files', which is not wrapped, so there is
+	 * no associated files_cgroup.
+	 *
+	 * Kernel threads always stay in the root cgroup, and the root
+	 * files cgroup has no limit, so it does not hurt to leave their
+	 * fds uncharged; the only issue is that files.usage will not be
+	 * accurate in the root files cgroup.
+	 */
+	if (!no_acct && files != &init_files) {
+		struct page_counter *fail_res;
+		struct files_cgroup *files_cgroup =
+			files_cgroup_from_files(files);
+		if (!page_counter_try_charge(&files_cgroup->open_handles,
+					     n, &fail_res))
+			return -ENOMEM;
+	}
+	return 0;
+}
+EXPORT_SYMBOL(files_cgroup_alloc_fd);
+
+/* Return @n previously charged descriptors to the cgroup of @files. */
+void files_cgroup_unalloc_fd(struct files_struct *files, u64 n)
+{
+	/*
+	 * It's not charged so no need to uncharge, see comments in
+	 * files_cgroup_alloc_fd.
+	 */
+	if (!no_acct && files != &init_files) {
+		struct files_cgroup *files_cgroup =
+			files_cgroup_from_files(files);
+		page_counter_uncharge(&files_cgroup->open_handles, n);
+	}
+}
+EXPORT_SYMBOL(files_cgroup_unalloc_fd);
+
+static u64 files_disabled_read(struct cgroup_subsys_state *css,
+			       struct cftype *cft)
+{
+	return no_acct;
+}
+
+/* Accounting can only be switched off; writing 0 is rejected. */
+static int files_disabled_write(struct cgroup_subsys_state *css,
+				struct cftype *cft, u64 val)
+{
+	if (!val)
+		return -EINVAL;
+	no_acct = true;
+
+	return 0;
+}
+
+static int files_limit_read(struct seq_file *sf, void *v)
+{
+	struct files_cgroup *fcg = css_fcg(seq_css(sf));
+	struct page_counter *counter = &fcg->open_handles;
+	u64 limit = counter->max;
+
+	if (limit >= FILES_MAX)
+		seq_printf(sf, "%s\n", FILES_MAX_STR);
+	else
+		seq_printf(sf, "%llu\n", limit);
+
+	return 0;
+}
+
+/*
+ * Set the limit from "max" or a number.  Values above FILES_MAX are
+ * clamped, and a failure to apply the limit is reported to the writer.
+ */
+static ssize_t files_limit_write(struct kernfs_open_file *of,
+				 char *buf, size_t nbytes, loff_t off)
+{
+	struct files_cgroup *fcg = css_fcg(of_css(of));
+	u64 limit;
+	int err;
+
+	buf = strstrip(buf);
+	if (!strcmp(buf, FILES_MAX_STR)) {
+		limit = FILES_MAX;
+	} else {
+		err = kstrtoull(buf, 0, &limit);
+		if (err)
+			return err;
+		/* Values above FILES_MAX mean "max" anyway; clamp them. */
+		if (limit > FILES_MAX)
+			limit = FILES_MAX;
+	}
+
+	/*
+	 * Limit updates don't need to be mutex'd, since it isn't
+	 * critical that any racing fork()s follow the new limit.
+	 */
+	err = page_counter_set_max(&fcg->open_handles, limit);
+	if (err)
+		return err;
+
+	return nbytes;
+}
+
+static u64 files_usage_read(struct cgroup_subsys_state *css,
+			    struct cftype *cft)
+{
+	struct files_cgroup *fcg = css_fcg(css);
+
+	return page_counter_read(&fcg->open_handles);
+}
+
+static struct cftype files_cftypes[] = {
+	{
+		.name = "limit",
+		.seq_show = files_limit_read,
+		.write = files_limit_write,
+		.flags = CFTYPE_NOT_ON_ROOT,
+	},
+	{
+		.name = "usage",
+		.read_u64 = files_usage_read,
+	},
+	{
+		.name = "no_acct",
+		.flags = CFTYPE_ONLY_ON_ROOT,
+		.read_u64 = files_disabled_read,
+		.write_u64 = files_disabled_write,
+	},
+	{ }
+};
+
+struct cgroup_subsys files_cgrp_subsys = {
+	.css_alloc = files_cgroup_css_alloc,
+	.css_free = files_cgroup_css_free,
+	.can_attach = files_cgroup_can_attach,
+	.legacy_cftypes = files_cftypes,
+	.dfl_cftypes = files_cftypes,
+};
+
+/*
+ * This could race against cgroup migration of the current task, so
+ * task_get_css() is used to get a valid css.
+ */
+void files_cgroup_assign(struct files_struct *files)
+{
+	struct cgroup_subsys_state *css;
+
+	if (files == &init_files)
+		return;
+
+	css = task_get_css(current, files_cgrp_id);
+	files->files_cgroup = container_of(css, struct files_cgroup, css);
+}
+
+/* Put the css that @files points at; init_files is never bound to one. */
+void files_cgroup_remove(struct files_struct *files)
+{
+	struct task_struct *tsk = current;
+	struct files_cgroup *fcg;
+
+	if (files == &init_files)
+		return;
+
+	task_lock(tsk);
+	spin_lock(&files->file_lock);
+	fcg = files_cgroup_from_files(files);
+	css_put(&fcg->css);
+	spin_unlock(&files->file_lock);
+	task_unlock(tsk);
+}
diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
index 8a0d5466c7be1533d29d8d5d5ece2cec919b2886..844ccf4187ed8931afcd7ce85cf358d217437604 100644
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -639,6 +639,12 @@ struct cftype {
 	ssize_t (*write)(struct kernfs_open_file *of,
 			 char *buf, size_t nbytes, loff_t off);
 
+	int (*read_seq_string)(struct cgroup *cont, struct cftype *cft,
+			       struct seq_file *m);
+
int (*write_string)(struct cgroup *cgrp, struct cftype *cft,
+			    const char *buffer);
+
 	__poll_t (*poll)(struct kernfs_open_file *of,
 			 struct poll_table_struct *pt);
 
@@ -726,7 +732,7 @@ struct cgroup_subsys {
 	 */
 	struct cftype *dfl_cftypes;	/* for the default hierarchy */
 	struct cftype *legacy_cftypes;	/* for the legacy hierarchies */
-
+	struct cftype *base_cftypes;
 	/*
 	 * A subsystem may depend on other subsystems.  When such subsystem
 	 * is enabled on a cgroup, the depended-upon subsystems are enabled
diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h
index 445235487230739c5c0ac0d581f5b5930f97c238..f57184cad7f68c8005bc6dfbdb691e7110957560 100644
--- a/include/linux/cgroup_subsys.h
+++ b/include/linux/cgroup_subsys.h
@@ -72,6 +72,10 @@ SUBSYS(misc)
 SUBSYS(debug)
 #endif
 
+#if IS_ENABLED(CONFIG_CGROUP_FILES)
+SUBSYS(files)
+#endif
+
 /*
  * DO NOT ADD ANY SUBSYSTEM WITHOUT EXPLICIT ACKS FROM CGROUP MAINTAINERS.
  */
diff --git a/include/linux/fdtable.h b/include/linux/fdtable.h
index e066816f3519fbf2babb6c04573fa4783ebba7e2..22b8b03fef6d419b06b549703f30a8c8ea09adce 100644
--- a/include/linux/fdtable.h
+++ b/include/linux/fdtable.h
@@ -65,6 +65,7 @@ struct files_struct {
 	unsigned long open_fds_init[1];
 	unsigned long full_fds_bits_init[1];
 	struct file __rcu * fd_array[NR_OPEN_DEFAULT];
+	struct files_cgroup *files_cgroup;
 };
 
 struct file_operations;
diff --git a/include/linux/filescontrol.h b/include/linux/filescontrol.h
new file mode 100644
index 0000000000000000000000000000000000000000..49dc620cf64ec7fca7385cf8b544b4c6062c79f1
--- /dev/null
+++ b/include/linux/filescontrol.h
@@ -0,0 +1,39 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* filescontrol.h - Files Controller
+ *
+ * Copyright 2014 Google Inc.
+ * Author: Brian Makin
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+#ifndef _LINUX_FILESCONTROL_H
+#define _LINUX_FILESCONTROL_H
+
+#include <linux/fdtable.h>
+
+#ifdef CONFIG_CGROUP_FILES
+
+/*
+ * Charge (returns 0 or a negative errno) / uncharge @n descriptors against
+ * the files cgroup that @files is bound to.
+ */
+extern int files_cgroup_alloc_fd(struct files_struct *files, u64 n);
+extern void files_cgroup_unalloc_fd(struct files_struct *files, u64 n);
+/* Number of descriptors marked open in the fd table of @files. */
+extern u64 files_cgroup_count_fds(struct files_struct *files);
+extern struct files_struct init_files;
+
+/* Bind @files to the files cgroup of the current task / drop that binding. */
+void files_cgroup_assign(struct files_struct *files);
+void files_cgroup_remove(struct files_struct *files);
+
+#endif /* CONFIG_CGROUP_FILES */
+#endif /* _LINUX_FILESCONTROL_H */
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 133f0640fb2411478a2dbaacbbee0fec9336c27b..2921f5d65b625749d7906e4e0524c69bdfab621c 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -47,6 +47,8 @@
 #include
 #include
 
+#define D_COUNT_MAX (INT_MAX / 2)
+
 struct backing_dev_info;
 struct bdi_writeback;
 struct bio;
diff --git a/init/Kconfig b/init/Kconfig
index 32c24950c4ced953390fb2b0975d085c813525c7..bec1cba111c5fa35747ad61353e437c4a2506240 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1174,6 +1174,16 @@ config SOCK_CGROUP_DATA
 	bool
 	default n
 
+config CGROUP_FILES
+	bool "Files Resource Controller for Control Groups"
+	select PAGE_COUNTER
+	default n
+	help
+	  Provides a cgroup resource controller that limits the number of
+	  open file handles within a cgroup.
+	  This helps catch misbehaving processes: they get EMFILE from the
+	  files limit instead of ENOMEM from the kernel memory limits.
+
 endif # CGROUPS
 
 menuconfig NAMESPACES