[linux-yocto] [PATCH] [RFC] cgroups: Resource controller for open files

Tue Jul 29 20:06:50 PDT 2014

On 2014-07-29, 1:58 AM, zhe.he at windriver.com wrote:
> From: He Zhe <zhe.he at windriver.com>

How did you extract this patch from the mailing list? The
From: field should not change, since you aren't the
original author of the patch.

>
> Add a resource controller for limiting the number of open
> file handles.  This allows us to catch misbehaving processes
> and return EMFILE instead of ENOMEM for kernel memory limits.
>
> Signed-off-by: Binder Makin <merimus at google.com>
>
> Port from lkml: https://lkml.org/lkml/2014/7/2/640
> Correct wrong handling in fs/file.c:do_dup2

This should be in a separate patch.

So let's see a resend of this, with patch 1/2 being the upstream
change (with the author information intact) and patch 2/2 being any
fixes you made to the original.

Bruce

>
> Signed-off-by: He Zhe <zhe.he at windriver.com>
> ---
>   fs/Makefile                   |    1 +
>   fs/file.c                     |   46 ++++++++
>   fs/filescontrol.c             |  249 +++++++++++++++++++++++++++++++++++++++++
>   include/linux/cgroup_subsys.h |    5 +
>   include/linux/fdtable.h       |    3 +
>   include/linux/filescontrol.h  |   32 ++++++
>   init/Kconfig                  |    7 ++
>   7 files changed, 343 insertions(+)
>   create mode 100644 fs/filescontrol.c
>   create mode 100644 include/linux/filescontrol.h
>
> diff --git a/fs/Makefile b/fs/Makefile
> index ebfe2ee..18eaee0 100644
> --- a/fs/Makefile
> +++ b/fs/Makefile
> @@ -48,6 +48,7 @@ obj-$(CONFIG_COREDUMP)		+= coredump.o
>   obj-$(CONFIG_SYSCTL)		+= drop_caches.o
>
>   obj-$(CONFIG_FHANDLE)		+= fhandle.o
> +obj-$(CONFIG_CGROUP_FILES)	+= filescontrol.o
>
>   obj-y				+= quota/
>
> diff --git a/fs/file.c b/fs/file.c
> index eb56a13..e615dc9 100644
> --- a/fs/file.c
> +++ b/fs/file.c
> @@ -22,6 +22,7 @@
>   #include <linux/spinlock.h>
>   #include <linux/rcupdate.h>
>   #include <linux/workqueue.h>
> +#include <linux/filescontrol.h>
>
>   int sysctl_nr_open __read_mostly = 1024*1024;
>   int sysctl_nr_open_min = BITS_PER_LONG;
> @@ -264,6 +265,9 @@ struct files_struct *dup_fd(struct files_struct *oldf, int *errorp)
>   	new_fdt->close_on_exec = newf->close_on_exec_init;
>   	new_fdt->open_fds = newf->open_fds_init;
>   	new_fdt->fd = &newf->fd_array[0];
> +#ifdef CONFIG_CGROUP_FILES
> +	files_cgroup_assign(newf);
> +#endif
>
>   	spin_lock(&oldf->file_lock);
>   	old_fdt = files_fdtable(oldf);
> @@ -340,9 +344,28 @@ struct files_struct *dup_fd(struct files_struct *oldf, int *errorp)
>
>   	rcu_assign_pointer(newf->fdt, new_fdt);
>
> +#ifdef CONFIG_CGROUP_FILES
> +	if (!files_cgroup_alloc_fd(newf, files_cgroup_count_fds(newf)))
> +		return newf;
> +
> +/* could not get enough FD resources.  Need to clean up. */
> +	new_fds = new_fdt->fd;
> +	for (i = open_files; i != 0; i--) {
> +		struct file *f = *new_fds++;
> +		if (f)
> +			fput(f);
> +	}
> +	if (new_fdt != &newf->fdtab)
> +		__free_fdtable(new_fdt);
> +	*errorp = -EMFILE;
> +#else
>   	return newf;
> +#endif
>
>   out_release:
> +#ifdef CONFIG_CGROUP_FILES
> +	files_cgroup_remove(newf);
> +#endif
>   	kmem_cache_free(files_cachep, newf);
>   out:
>   	return NULL;
> @@ -368,6 +391,9 @@ static struct fdtable *close_files(struct files_struct * files)
>   			if (set & 1) {
>   				struct file * file = xchg(&fdt->fd[i], NULL);
>   				if (file) {
> +#ifdef CONFIG_CGROUP_FILES
> +					files_cgroup_unalloc_fd(files, 1);
> +#endif
>   					filp_close(file, files);
>   					cond_resched();
>   				}
> @@ -486,6 +512,13 @@ repeat:
>   	if (error)
>   		goto repeat;
>
> +#ifdef CONFIG_CGROUP_FILES
> +	if (files_cgroup_alloc_fd(files, 1)) {
> +		error = -EMFILE;
> +		goto out;
> +	}
> +#endif
> +
>   	if (start <= files->next_fd)
>   		files->next_fd = fd + 1;
>
> @@ -522,6 +555,10 @@ EXPORT_SYMBOL(get_unused_fd_flags);
>   static void __put_unused_fd(struct files_struct *files, unsigned int fd)
>   {
>   	struct fdtable *fdt = files_fdtable(files);
> +#ifdef CONFIG_CGROUP_FILES
> +	if (test_bit(fd, fdt->open_fds))
> +		files_cgroup_unalloc_fd(files, 1);
> +#endif
>   	__clear_open_fd(fd, fdt);
>   	if (fd < files->next_fd)
>   		files->next_fd = fd;
> @@ -780,6 +817,15 @@ static int do_dup2(struct files_struct *files,
>   	tofree = fdt->fd[fd];
>   	if (!tofree && fd_is_open(fd, fdt))
>   		goto Ebusy;
> +
> +#ifdef CONFIG_CGROUP_FILES
> +	if (!tofree)
> +		if (files_cgroup_alloc_fd(files, 1)) {
> +			spin_unlock(&files->file_lock);
> +			return -EMFILE;
> +		}
> +#endif
> +
>   	get_file(file);
>   	rcu_assign_pointer(fdt->fd[fd], file);
>   	__set_open_fd(fd, fdt);
> diff --git a/fs/filescontrol.c b/fs/filescontrol.c
> new file mode 100644
> index 0000000..0ba8ffa
> --- /dev/null
> +++ b/fs/filescontrol.c
> @@ -0,0 +1,249 @@
> +/* filescontrol.c - Cgroup controller for open file handles.
> + *
> + * Copyright 2014 Google Inc.
> + * Author: Brian Makin <merimus at google.com>
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License as published by
> + * the Free Software Foundation; either version 2 of the License, or
> + * (at your option) any later version.
> + *
> + * This program is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> + * GNU General Public License for more details.
> + */
> +
> +#include <linux/res_counter.h>
> +#include <linux/filescontrol.h>
> +#include <linux/cgroup.h>
> +#include <linux/export.h>
> +#include <linux/printk.h>
> +#include <linux/slab.h>
> +#include <linux/fs.h>
> +#include <linux/fdtable.h>
> +
> +/*
> + * Tentative definition of the "files" controller descriptor; the
> + * initialized definition appears further down in this file.  The symbol
> + * is exported.
> + */
> +struct cgroup_subsys files_subsys __read_mostly;
> +EXPORT_SYMBOL(files_subsys);
> +
> +/* Per-cgroup state of the files controller. */
> +struct files_cgroup {
> +	/* Embedded cgroup state; css_fcg() maps it back to this struct. */
> +	struct cgroup_subsys_state css;
> +	/* Counter charged and uncharged as descriptors come and go. */
> +	struct res_counter open_handles;
> +};
> +
> +/*
> + * Convert a cgroup_subsys_state pointer into the files_cgroup that embeds
> + * it.  A NULL css yields NULL.
> + */
> +static inline struct files_cgroup *css_fcg(struct cgroup_subsys_state *css)
> +{
> +	if (!css)
> +		return NULL;
> +
> +	return container_of(css, struct files_cgroup, css);
> +}
> +
> +/* Return the open-handles counter of the files cgroup behind css. */
> +static inline struct res_counter *
> +css_res_open_handles(struct cgroup_subsys_state *css)
> +{
> +	struct files_cgroup *fcg = css_fcg(css);
> +
> +	return &fcg->open_handles;
> +}
> +
> +/* Return the files cgroup currently recorded in a files_struct. */
> +static inline struct files_cgroup *
> +files_cgroup_from_files(struct files_struct *files)
> +{
> +	struct files_cgroup *fcg = files->files_cgroup;
> +
> +	return fcg;
> +}
> +
> +/*
> + * Allocate the state for a new files cgroup.
> + *
> + * Without a parent, the counter has no parent counter and its limit is
> + * set to get_max_files().  With a parent, the counter is parented to the
> + * parent's counter and starts out with the parent's current limit.
> + *
> + * Returns the embedded css, or ERR_PTR(-ENOMEM) if the allocation fails.
> + */
> +static struct cgroup_subsys_state *
> +files_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
> +{
> +	struct res_counter *parent_res = NULL;
> +	struct files_cgroup *fcg = kzalloc(sizeof(*fcg), GFP_KERNEL);
> +
> +	if (!fcg)
> +		return ERR_PTR(-ENOMEM);
> +
> +	if (parent_css)
> +		parent_res = &css_fcg(parent_css)->open_handles;
> +
> +	res_counter_init(&fcg->open_handles, parent_res);
> +	if (parent_res)
> +		res_counter_set_limit(&fcg->open_handles,
> +				res_counter_read_u64(parent_res, RES_LIMIT));
> +	else
> +		res_counter_set_limit(&fcg->open_handles, get_max_files());
> +
> +	return &fcg->css;
> +}
> +
> +/* Free the files_cgroup that embeds css. */
> +static void files_cgroup_css_free(struct cgroup_subsys_state *css)
> +{
> +	struct files_cgroup *fcg = css_fcg(css);
> +
> +	kfree(fcg);
> +}
> +
> +/*
> + * Count the descriptors marked open in a files_struct by summing the set
> + * bits of every word of its open_fds bitmap.
> + *
> + * The count is accumulated in a u64, matching the return type, and the
> + * word index is unsigned like the bound it is compared against.
> + *
> + * NOTE(review): the table is obtained through files_fdtable(); callers
> + * are presumably expected to keep it stable (e.g. by holding
> + * files->file_lock) - confirm against each call site.
> + */
> +u64 files_cgroup_count_fds(struct files_struct *files)
> +{
> +	struct fdtable *fdt = files_fdtable(files);
> +	unsigned int nr_words = DIV_ROUND_UP(fdt->max_fds, BITS_PER_LONG);
> +	unsigned int i;
> +	u64 count = 0;
> +
> +	for (i = 0; i < nr_words; i++)
> +		count += hweight64((__u64)fdt->open_fds[i]);
> +	return count;
> +}
> +
> +/*
> + * Sum the open descriptors of every thread-group leader in a taskset.
> + *
> + * Tasks that are not thread-group leaders are skipped.  A leader whose
> + * ->files is NULL contributes nothing; passing NULL on to
> + * files_cgroup_count_fds() would dereference it.  As in
> + * files_cgroup_attach(), the count is taken with task_lock() and then
> + * files->file_lock held.
> + */
> +static u64 files_in_taskset(struct cgroup_taskset *tset)
> +{
> +	struct cgroup_subsys_state *css = NULL;
> +	struct task_struct *task;
> +	u64 total = 0;
> +
> +	cgroup_taskset_for_each(task, css, tset) {
> +		struct files_struct *files;
> +
> +		if (!thread_group_leader(task))
> +			continue;
> +
> +		task_lock(task);
> +		files = task->files;
> +		if (files) {
> +			spin_lock(&files->file_lock);
> +			total += files_cgroup_count_fds(files);
> +			spin_unlock(&files->file_lock);
> +		}
> +		task_unlock(task);
> +	}
> +	return total;
> +}
> +
> +/*
> + * Refuse the attach when the destination cgroup does not have enough
> + * margin left for the descriptors held by the incoming tasks.
> + *
> + * Returns 0 if they fit, -ENOMEM otherwise.
> + */
> +static int files_cgroup_can_attach(struct cgroup_subsys_state *css,
> +				struct cgroup_taskset *tset)
> +{
> +	u64 needed = files_in_taskset(tset);
> +	u64 room = res_counter_margin(css_res_open_handles(css));
> +
> +	return room < needed ? -ENOMEM : 0;
> +}
> +
> +/*
> + * Move the descriptor charge of the taskset's first task from the cgroup
> + * recorded in its files_struct to to_css, and record to_css there.
> + *
> + * If resource counts have gone up between can_attach and attach then
> + * this may overcommit resources.  In that case just deny further allocation
> + * until the resource usage drops.
> + *
> + * The destination is charged with res_counter_charge_nofail() so that the
> + * usage is accounted even when it exceeds the limit.  A plain
> + * res_counter_charge() would leave the destination uncharged on failure
> + * while the files_struct is still switched over to it, so later
> + * uncharges would not match what was charged.
> + *
> + * NOTE(review): only cgroup_taskset_first() is handled here, whereas
> + * files_in_taskset() counts every thread-group leader in the set -
> + * confirm that ignoring the remaining tasks is intended.
> + */
> +static void files_cgroup_attach(struct cgroup_subsys_state *to_css,
> +				struct cgroup_taskset *tset)
> +{
> +	u64 num_files;
> +	struct task_struct *task = cgroup_taskset_first(tset);
> +	struct cgroup_subsys_state *from_css;
> +	struct res_counter *from_res;
> +	struct res_counter *to_res = css_res_open_handles(to_css);
> +	struct res_counter *fail_res;
> +	struct files_struct *files;
> +
> +	task_lock(task);
> +	files = task->files;
> +	if (!files) {
> +		/* No file table, so there is nothing to move. */
> +		task_unlock(task);
> +		return;
> +	}
> +
> +	from_css = &files_cgroup_from_files(files)->css;
> +	from_res = css_res_open_handles(from_css);
> +
> +	/* Count, move the charge and retarget the files_struct under file_lock. */
> +	spin_lock(&files->file_lock);
> +	num_files = files_cgroup_count_fds(files);
> +	res_counter_uncharge(from_res, num_files);
> +	css_put(from_css);
> +
> +	/* Always account the usage; an overrun of the limit is only logged. */
> +	if (res_counter_charge_nofail(to_res, num_files, &fail_res))
> +		pr_err("Open files limit overcommited\n");
> +	css_get(to_css);
> +
> +	files->files_cgroup = css_fcg(to_css);
> +	spin_unlock(&files->file_lock);
> +	task_unlock(task);
> +}
> +
> +/*
> + * Charge n descriptors to the cgroup recorded in files.
> + *
> + * Returns 0 on success, or -ENOMEM if the charge is refused.
> + */
> +int files_cgroup_alloc_fd(struct files_struct *files, u64 n)
> +{
> +	struct files_cgroup *fcg = files_cgroup_from_files(files);
> +	struct res_counter *over_limit;
> +	int ret = res_counter_charge(&fcg->open_handles, n, &over_limit);
> +
> +	return ret ? -ENOMEM : 0;
> +}
> +EXPORT_SYMBOL(files_cgroup_alloc_fd);
> +
> +/* Uncharge n descriptors from the cgroup recorded in files. */
> +void files_cgroup_unalloc_fd(struct files_struct *files, u64 n)
> +{
> +	struct res_counter *res = &files_cgroup_from_files(files)->open_handles;
> +
> +	res_counter_uncharge(res, n);
> +}
> +EXPORT_SYMBOL(files_cgroup_unalloc_fd);
> +
> +/* read_u64 handler of the "limit" file: report the counter's limit. */
> +static u64 files_limit_read(struct cgroup_subsys_state *css,
> +			struct cftype *cft)
> +{
> +	return res_counter_read_u64(css_res_open_handles(css), RES_LIMIT);
> +}
> +
> +/*
> + * write_u64 handler of the "limit" file: set the counter's limit to value
> + * and pass on the result of res_counter_set_limit().
> + */
> +static int files_limit_write(struct cgroup_subsys_state *css,
> +			struct cftype *cft, u64 value)
> +{
> +	return res_counter_set_limit(css_res_open_handles(css), value);
> +}
> +
> +/* read_u64 handler of the "usage" file: report the counter's usage. */
> +static u64 files_usage_read(struct cgroup_subsys_state *css,
> +			struct cftype *cft)
> +{
> +	return res_counter_read_u64(css_res_open_handles(css), RES_USAGE);
> +}
> +
> +/*
> + * Control files of the controller: "limit" has both a read and a write
> + * handler, "usage" has only a read handler.
> + */
> +static struct cftype files[] = {
> +	{
> +		.name = "limit",
> +		.read_u64 = files_limit_read,
> +		.write_u64 = files_limit_write,
> +	},
> +	{
> +		.name = "usage",
> +		.read_u64 = files_usage_read,
> +	},
> +	{ }	/* empty trailing entry; presumably the array terminator */
> +};
> +
> +/*
> + * Descriptor of the "files" controller: hooks up the css alloc/free and
> + * attach callbacks defined in this file and the files[] control files.
> + */
> +struct cgroup_subsys files_subsys = {
> +	.name = "files",
> +	.css_alloc = files_cgroup_css_alloc,
> +	.css_free = files_cgroup_css_free,
> +	.can_attach = files_cgroup_can_attach,
> +	.attach = files_cgroup_attach,
> +	.base_cftypes = files,
> +	.subsys_id = files_subsys_id,
> +};
> +
> +/*
> + * Record the current task's files cgroup in files and take a reference
> + * on its css.  Both steps run under task_lock(current).
> + */
> +void files_cgroup_assign(struct files_struct *files)
> +{
> +	struct cgroup_subsys_state *cur_css;
> +
> +	task_lock(current);
> +	cur_css = task_css(current, files_subsys_id);
> +	css_get(cur_css);
> +	files->files_cgroup = container_of(cur_css, struct files_cgroup, css);
> +	task_unlock(current);
> +}
> +
> +/*
> + * Drop the css reference of the cgroup recorded in files.  Runs with
> + * task_lock(current) and files->file_lock held, taken in that order.
> + */
> +void files_cgroup_remove(struct files_struct *files)
> +{
> +	task_lock(current);
> +	spin_lock(&files->file_lock);
> +	css_put(&files_cgroup_from_files(files)->css);
> +	spin_unlock(&files->file_lock);
> +	task_unlock(current);
> +}
> diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h
> index 7b99d71..defadc0 100644
> --- a/include/linux/cgroup_subsys.h
> +++ b/include/linux/cgroup_subsys.h
> @@ -50,6 +50,11 @@ SUBSYS(net_prio)
>   #if IS_SUBSYS_ENABLED(CONFIG_CGROUP_HUGETLB)
>   SUBSYS(hugetlb)
>   #endif
> +
> +#if IS_SUBSYS_ENABLED(CONFIG_CGROUP_FILES)
> +SUBSYS(files)
> +#endif
> +
>   /*
>    * DO NOT ADD ANY SUBSYSTEM WITHOUT EXPLICIT ACKS FROM CGROUP MAINTAINERS.
>    */
> diff --git a/include/linux/fdtable.h b/include/linux/fdtable.h
> index 70e8e21..fee8e45 100644
> --- a/include/linux/fdtable.h
> +++ b/include/linux/fdtable.h
> @@ -57,6 +57,9 @@ struct files_struct {
>   	unsigned long close_on_exec_init[1];
>   	unsigned long open_fds_init[1];
>   	struct file __rcu * fd_array[NR_OPEN_DEFAULT];
> +#ifdef CONFIG_CGROUP_FILES
> +	struct files_cgroup *files_cgroup;
> +#endif
>   };
>
>   struct file_operations;
> diff --git a/include/linux/filescontrol.h b/include/linux/filescontrol.h
> new file mode 100644
> index 0000000..e39ed2a
> --- /dev/null
> +++ b/include/linux/filescontrol.h
> @@ -0,0 +1,32 @@
> +/* filescontrol.h - Files Controller
> + *
> + * Copyright 2014 Google Inc.
> + * Author: Brian Makin <merimus at google.com>
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License as published by
> + * the Free Software Foundation; either version 2 of the License, or
> + * (at your option) any later version.
> + *
> + * This program is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> + * GNU General Public License for more details.
> + */
> +
> +#ifndef _LINUX_FILESCONTROL_H
> +#define _LINUX_FILESCONTROL_H
> +
> +#include <linux/fdtable.h>
> +
> +#ifdef CONFIG_CGROUP_FILES
> +
> +/* Charge n descriptors to the cgroup of files; non-zero means refused. */
> +extern int files_cgroup_alloc_fd(struct files_struct *files, u64 n);
> +/* Uncharge n descriptors from the cgroup of files. */
> +extern void files_cgroup_unalloc_fd(struct files_struct *files, u64 n);
> +/* Number of descriptors marked open in files. */
> +extern u64 files_cgroup_count_fds(struct files_struct *files);
> +
> +/* Bind files to the current task's files cgroup, taking a css reference. */
> +void files_cgroup_assign(struct files_struct *files);
> +/* Drop the css reference held for files. */
> +void files_cgroup_remove(struct files_struct *files);
> +
> +#endif /* CONFIG_CGROUP_FILES */
> +#endif /* _LINUX_FILESCONTROL_H */
> diff --git a/init/Kconfig b/init/Kconfig
> index a4b4209..a8c8392 100644
> --- a/init/Kconfig
> +++ b/init/Kconfig
> @@ -1075,6 +1075,13 @@ config DEBUG_BLK_CGROUP
>   	Enable some debugging help. Currently it exports additional stat
>   	files in a cgroup which can be useful for debugging.
>
> +config CGROUP_FILES
> +	bool "Files Resource Controller for Control Groups"
> +	depends on RESOURCE_COUNTERS
> +	default n
> +	help
> +	  Provides a cgroup resource controller that limits the number of
> +	  open file handles within a cgroup.  The controller is built on
> +	  resource counters.
> +
>   endif # CGROUPS
>
>   config CHECKPOINT_RESTORE
>