linux-mm.kvack.org archive mirror
 help / color / mirror / Atom feed
From: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
To: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: "linux-mm@kvack.org" <linux-mm@kvack.org>,
	"linux-kernel@vger.kernel.org" <linux-kernel@vger.kernel.org>,
	"balbir@linux.vnet.ibm.com" <balbir@linux.vnet.ibm.com>,
	"nishimura@mxp.nes.nec.co.jp" <nishimura@mxp.nes.nec.co.jp>,
	"lizf@cn.fujitsu.com" <lizf@cn.fujitsu.com>,
	"menage@google.com" <menage@google.com>
Subject: [RFC][PATCH 4/4] cgroup-memcg fix frequent EBUSY at rmdir
Date: Thu, 8 Jan 2009 18:35:29 +0900	[thread overview]
Message-ID: <20090108183529.b4fd99f4.kamezawa.hiroyu@jp.fujitsu.com> (raw)
In-Reply-To: <20090108182556.621e3ee6.kamezawa.hiroyu@jp.fujitsu.com>

From: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>

Experimental. you may think of better fix.

When trying following test under memcg.

	create memcg under
		/cgroup/A/  use_hierarchy=1, limit=20M
			 /B some tasks
			 /C empty
			 /D empty

And run make kernel under /B (for example). This will hit limit of 20M and
hierarchical memory reclaim will scan A->B->C->D.
(C,D have to be scanned because it may have some page caches.)

	Here, run following scipt.

	while true; do
		rmdir /cgroup/A/C
		mkdir /cgroup/A/C
	done

You'll see -EBUSY at rmdir very often. This is because of temporal refcnt of
memory reclaim for safe scanning under hierarchy.

In usual, considering shell script,
	"please try again if you see -EBUSY at rmdir.
	 You may see -EBUSY at rmdir even if it seems you can do it."
is very unuseful.

This patch tries to fix -EBUSY behavior. memcg's pre_destroy() works pretty
fine and retrying rmdir() is O.K.

Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>

---
 include/linux/cgroup.h |    5 +++++
 kernel/cgroup.c        |   32 +++++++++++++++++++++++++-------
 mm/memcontrol.c        |    9 ++++++---
 3 files changed, 36 insertions(+), 10 deletions(-)

Index: mmotm-2.6.28-Jan7/include/linux/cgroup.h
===================================================================
--- mmotm-2.6.28-Jan7.orig/include/linux/cgroup.h
+++ mmotm-2.6.28-Jan7/include/linux/cgroup.h
@@ -368,6 +368,11 @@ struct cgroup_subsys {
 	int disabled;
 	int early_init;
 	/*
+	 * set if subsys may retrun EBUSY while there are no tasks and
+	 * subsys knows it's very temporal reference.
+ 	 */
+	int retry_at_rmdir_failure;
+	/*
 	 * True if this subsys uses ID. ID is not available before cgroup_init()
 	 * (not available in early_init time.)
 	 */
Index: mmotm-2.6.28-Jan7/kernel/cgroup.c
===================================================================
--- mmotm-2.6.28-Jan7.orig/kernel/cgroup.c
+++ mmotm-2.6.28-Jan7/kernel/cgroup.c
@@ -2503,15 +2503,17 @@ static int cgroup_has_css_refs(struct cg
 
 /*
  * Atomically mark all (or else none) of the cgroup's CSS objects as
- * CSS_REMOVED. Return true on success, or false if the cgroup has
+ * CSS_REMOVED. Return 0 on success, or false error code if the cgroup has
  * busy subsystems. Call with cgroup_mutex held
  */
 
 static int cgroup_clear_css_refs(struct cgroup *cgrp)
 {
-	struct cgroup_subsys *ss;
+	struct cgroup_subsys *ss, *failedhere;
 	unsigned long flags;
 	bool failed = false;
+	int err = -EBUSY;
+
 	local_irq_save(flags);
 	for_each_subsys(cgrp->root, ss) {
 		struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
@@ -2521,6 +2523,7 @@ static int cgroup_clear_css_refs(struct 
 			refcnt = atomic_read(&css->refcnt);
 			if (refcnt > 1) {
 				failed = true;
+				failedhere = ss;
 				goto done;
 			}
 			BUG_ON(!refcnt);
@@ -2548,7 +2551,13 @@ static int cgroup_clear_css_refs(struct 
 		}
 	}
 	local_irq_restore(flags);
-	return !failed;
+
+	if (failed) {
+		if (failedhere->retry_at_rmdir_failure)
+			err = -EAGAIN;
+	} else
+		err = 0;
+	return err;
 }
 
 static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
@@ -2556,9 +2565,10 @@ static int cgroup_rmdir(struct inode *un
 	struct cgroup *cgrp = dentry->d_fsdata;
 	struct dentry *d;
 	struct cgroup *parent;
+	int ret;
 
 	/* the vfs holds both inode->i_mutex already */
-
+retry:
 	mutex_lock(&cgroup_mutex);
 	if (atomic_read(&cgrp->count) != 0) {
 		mutex_unlock(&cgroup_mutex);
@@ -2579,12 +2589,20 @@ static int cgroup_rmdir(struct inode *un
 	mutex_lock(&cgroup_mutex);
 	parent = cgrp->parent;
 
-	if (atomic_read(&cgrp->count)
-	    || !list_empty(&cgrp->children)
-	    || !cgroup_clear_css_refs(cgrp)) {
+	if (atomic_read(&cgrp->count) || !list_empty(&cgrp->children)) {
 		mutex_unlock(&cgroup_mutex);
 		return -EBUSY;
 	}
+	ret = cgroup_clear_css_refs(cgrp);
+	if (ret == -EBUSY) { /* really busy */
+		mutex_unlock(&cgroup_mutex);
+		return ret;
+	}
+	if (ret == -EAGAIN) { /* subsys asks us to retry later */
+		mutex_unlock(&cgroup_mutex);
+		cond_resched();
+		goto retry;
+	}
 
 	spin_lock(&release_list_lock);
 	set_bit(CGRP_REMOVED, &cgrp->flags);
Index: mmotm-2.6.28-Jan7/mm/memcontrol.c
===================================================================
--- mmotm-2.6.28-Jan7.orig/mm/memcontrol.c
+++ mmotm-2.6.28-Jan7/mm/memcontrol.c
@@ -723,7 +723,8 @@ static int mem_cgroup_hierarchical_recla
 {
 	struct mem_cgroup *victim;
 	unsigned long start_age;
-	int ret, total = 0;
+	int ret = 0;
+	int total = 0;
 	/*
 	 * Reclaim memory from cgroups under root_mem in round robin.
 	 */
@@ -732,8 +733,9 @@ static int mem_cgroup_hierarchical_recla
 	while (time_after((start_age + 2UL), root_mem->scan_age)) {
 		victim = mem_cgroup_select_victim(root_mem);
 		/* we use swappiness of local cgroup */
-		ret = try_to_free_mem_cgroup_pages(victim, gfp_mask, noswap,
-						   get_swappiness(victim));
+		if (victim->res.usage)
+			ret = try_to_free_mem_cgroup_pages(victim, gfp_mask,
+					noswap, get_swappiness(victim));
 		css_put(&victim->css);
 		total += ret;
 		if (mem_cgroup_check_under_limit(root_mem))
@@ -2261,6 +2263,7 @@ struct cgroup_subsys mem_cgroup_subsys =
 	.populate = mem_cgroup_populate,
 	.attach = mem_cgroup_move_task,
 	.early_init = 0,
+	.retry_at_rmdir_failure = 1,
 	.use_id = 1,
 };
 

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

  parent reply	other threads:[~2009-01-08  9:36 UTC|newest]

Thread overview: 26+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2009-01-08  9:25 [RFC][PATCH] cgroup and memcg updates 20090108 KAMEZAWA Hiroyuki
2009-01-08  9:28 ` [RFC][PATCH 1/4] cgroup: support per cgroup subsys state ID (CSS ID) KAMEZAWA Hiroyuki
2009-01-09  3:59   ` Li Zefan
2009-01-09  4:24     ` KAMEZAWA Hiroyuki
2009-01-10  0:23   ` Paul Menage
2009-01-10  0:49     ` KAMEZAWA Hiroyuki
2009-01-12  7:21   ` Balbir Singh
2009-01-15  6:12     ` KAMEZAWA Hiroyuki
2009-01-13  7:40   ` Li Zefan
2009-01-13  9:22     ` KAMEZAWA Hiroyuki
2009-01-08  9:30 ` [RFC][PATCH 2/4] memcg: use CSS ID in memcg KAMEZAWA Hiroyuki
2009-01-12 12:14   ` Balbir Singh
2009-01-15  6:19     ` KAMEZAWA Hiroyuki
2009-01-08  9:32 ` [RFC][PATCH 3/4] memcg: fix OOM KILL under hierarchy KAMEZAWA Hiroyuki
2009-01-13  8:33   ` Li Zefan
2009-01-13  9:25     ` KAMEZAWA Hiroyuki
2009-01-08  9:35 ` KAMEZAWA Hiroyuki [this message]
2009-01-14  2:48   ` [RFC][PATCH 4/4] cgroup-memcg fix frequent EBUSY at rmdir Paul Menage
2009-01-14  3:00     ` KAMEZAWA Hiroyuki
2009-01-14  3:05       ` Paul Menage
2009-01-14  3:12         ` KAMEZAWA Hiroyuki
2009-01-20 10:47           ` [RFC][PATCH 4/4] cgroup-memcg fix frequent EBUSY at rmdir v2 KAMEZAWA Hiroyuki
2009-01-21 10:00             ` Paul Menage
2009-01-21 10:32               ` KAMEZAWA Hiroyuki
2009-01-21 10:43                 ` Paul Menage
2009-01-21 10:45                   ` KAMEZAWA Hiroyuki

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20090108183529.b4fd99f4.kamezawa.hiroyu@jp.fujitsu.com \
    --to=kamezawa.hiroyu@jp.fujitsu.com \
    --cc=balbir@linux.vnet.ibm.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-mm@kvack.org \
    --cc=lizf@cn.fujitsu.com \
    --cc=menage@google.com \
    --cc=nishimura@mxp.nes.nec.co.jp \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox