[PATCH 3/3] memcg: change hierarchy management to use scan by cgroup ID

linux-mm.kvack.org archive mirror
 help / color / mirror / Atom feed

From: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
To: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: "linux-mm@kvack.org" <linux-mm@kvack.org>,
	"lizf@cn.fujitsu.com" <lizf@cn.fujitsu.com>,
	"menage@google.com" <menage@google.com>,
	"balbir@linux.vnet.ibm.com" <balbir@linux.vnet.ibm.com>,
	"nishimura@mxp.nes.nec.co.jp" <nishimura@mxp.nes.nec.co.jp>,
	"linux-kernel@vger.kernel.org" <linux-kernel@vger.kernel.org>,
	"akpm@linux-foundation.org" <akpm@linux-foundation.org>
Subject: [PATCH 3/3] memcg: change hierarchy management to use scan by cgroup ID
Date: Mon, 1 Dec 2008 15:04:06 +0900	[thread overview]
Message-ID: <20081201150406.81b10fc4.kamezawa.hiroyu@jp.fujitsu.com> (raw)
In-Reply-To: <20081201145907.e6d63d61.kamezawa.hiroyu@jp.fujitsu.com>

Implement hierarchy reclaim by cgroup_id.

TODO:
 	- memsw support isn't good. (maybe using Nishimura's patch is good.)

What changes:
	- reclaim is not done by tree-walk algorithm
	- mem_cgroup->last_scan_child is an ID, not a pointer.
	- no cgroup_lock.
	- scanning order is just defined by ID's order.
	  (Scan by round-robin logic.)

Changelog: v1 -> v2
	- make use of css_tryget();
	- count # of loops rather than remembering position.

Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>


 mm/memcontrol.c |  176 +++++++++++++++++---------------------------------------
 1 file changed, 55 insertions(+), 121 deletions(-)

Index: mmotm-2.6.28-Nov29/mm/memcontrol.c
===================================================================
--- mmotm-2.6.28-Nov29.orig/mm/memcontrol.c
+++ mmotm-2.6.28-Nov29/mm/memcontrol.c
@@ -146,9 +146,11 @@ struct mem_cgroup {
 
 	/*
 	 * While reclaiming in a hiearchy, we cache the last child we
-	 * reclaimed from. Protected by cgroup_lock()
+	 * reclaimed from.
 	 */
-	struct mem_cgroup *last_scanned_child;
+	spinlock_t	scan_lock;
+	int	last_scan_child;
+	unsigned long	scan_age;
 	/*
 	 * Should the accounting and control be hierarchical, per subtree?
 	 */
@@ -475,104 +477,44 @@ unsigned long mem_cgroup_isolate_pages(u
 	container_of(counter, struct mem_cgroup, member)
 
 /*
- * This routine finds the DFS walk successor. This routine should be
- * called with cgroup_mutex held
+ * This routine select next memcg by ID. Using RCU and tryget().
+ * No cgroup_mutex is required.
  */
 static struct mem_cgroup *
-mem_cgroup_get_next_node(struct mem_cgroup *curr, struct mem_cgroup *root_mem)
+mem_cgroup_select_victim(struct mem_cgroup *root_mem)
 {
-	struct cgroup *cgroup, *curr_cgroup, *root_cgroup;
-
-	curr_cgroup = curr->css.cgroup;
-	root_cgroup = root_mem->css.cgroup;
-
-	if (!list_empty(&curr_cgroup->children)) {
-		/*
-		 * Walk down to children
-		 */
-		mem_cgroup_put(curr);
-		cgroup = list_entry(curr_cgroup->children.next,
-						struct cgroup, sibling);
-		curr = mem_cgroup_from_cont(cgroup);
-		mem_cgroup_get(curr);
-		goto done;
-	}
-
-visit_parent:
-	if (curr_cgroup == root_cgroup) {
-		mem_cgroup_put(curr);
-		curr = root_mem;
-		mem_cgroup_get(curr);
-		goto done;
-	}
-
-	/*
-	 * Goto next sibling
-	 */
-	if (curr_cgroup->sibling.next != &curr_cgroup->parent->children) {
-		mem_cgroup_put(curr);
-		cgroup = list_entry(curr_cgroup->sibling.next, struct cgroup,
-						sibling);
-		curr = mem_cgroup_from_cont(cgroup);
-		mem_cgroup_get(curr);
-		goto done;
-	}
-
-	/*
-	 * Go up to next parent and next parent's sibling if need be
-	 */
-	curr_cgroup = curr_cgroup->parent;
-	goto visit_parent;
-
-done:
-	root_mem->last_scanned_child = curr;
-	return curr;
-}
-
-/*
- * Visit the first child (need not be the first child as per the ordering
- * of the cgroup list, since we track last_scanned_child) of @mem and use
- * that to reclaim free pages from.
- */
-static struct mem_cgroup *
-mem_cgroup_get_first_node(struct mem_cgroup *root_mem)
-{
-	struct cgroup *cgroup;
+	struct cgroup *cgroup, *root_cgroup;
 	struct mem_cgroup *ret;
-	struct mem_cgroup *last_scan = root_mem->last_scanned_child;
-	bool obsolete = false;
+	int nextid, rootid, depth, found;
+	unsigned long flags;
 
-	if (last_scan) {
-		if (css_under_removal(&last_scan->css))
-			obsolete = true;
-	} else
-		obsolete = true;
+	root_cgroup = root_mem->css.cgroup;
+	rootid = cgroup_id(root_cgroup);
+	depth = cgroup_depth(root_cgroup);
+	found = 0;
+	ret = NULL;
+	rcu_read_lock();
 
-	/*
-	 * Scan all children under the mem_cgroup mem
-	 */
-	cgroup_lock();
-	if (list_empty(&root_mem->css.cgroup->children)) {
-		ret = root_mem;
-		goto done;
+	while (!ret) {
+		/* ID:0 is not used by cgroup-id */
+		nextid = root_mem->last_scan_child + 1;
+		cgroup = cgroup_get_next(nextid, rootid, depth, &found);
+		if (cgroup) {
+			spin_lock_irqsave(&root_mem->scan_lock, flags);
+			root_mem->last_scan_child = found;
+			spin_unlock_irqrestore(&root_mem->scan_lock, flags);
+			ret = mem_cgroup_from_cont(cgroup);
+			if (!css_tryget(&ret->css))
+				ret = NULL;
+		} else {
+			spin_lock_irqsave(&root_mem->scan_lock, flags);
+			root_mem->scan_age++;
+			root_mem->last_scan_child = 0;
+			spin_unlock_irqrestore(&root_mem->scan_lock, flags);
+		}
 	}
+	rcu_read_unlock();
 
-	if (!root_mem->last_scanned_child || obsolete) {
-
-		if (obsolete)
-			mem_cgroup_put(root_mem->last_scanned_child);
-
-		cgroup = list_first_entry(&root_mem->css.cgroup->children,
-				struct cgroup, sibling);
-		ret = mem_cgroup_from_cont(cgroup);
-		mem_cgroup_get(ret);
-	} else
-		ret = mem_cgroup_get_next_node(root_mem->last_scanned_child,
-						root_mem);
-
-done:
-	root_mem->last_scanned_child = ret;
-	cgroup_unlock();
 	return ret;
 }
 
@@ -586,37 +528,25 @@ done:
 static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
 						gfp_t gfp_mask, bool noswap)
 {
-	struct mem_cgroup *next_mem;
+	struct mem_cgroup *victim;
+	unsigned long start_age;
 	int ret = 0;
+	int total = 0;
 
-	/*
-	 * Reclaim unconditionally and don't check for return value.
-	 * We need to reclaim in the current group and down the tree.
-	 * One might think about checking for children before reclaiming,
-	 * but there might be left over accounting, even after children
-	 * have left.
-	 */
-	ret = try_to_free_mem_cgroup_pages(root_mem, gfp_mask, noswap);
-	if (res_counter_check_under_limit(&root_mem->res))
-		return 0;
-
-	next_mem = mem_cgroup_get_first_node(root_mem);
-
-	while (next_mem != root_mem) {
-		if (css_under_removal(&next_mem->css)) {
-			mem_cgroup_put(next_mem);
-			cgroup_lock();
-			next_mem = mem_cgroup_get_first_node(root_mem);
-			cgroup_unlock();
-			continue;
-		}
-		ret = try_to_free_mem_cgroup_pages(next_mem, gfp_mask, noswap);
+	start_age = root_mem->scan_age;
+	/* allows 2 times of loops */
+	while (time_after((start_age + 2UL), root_mem->scan_age)) {
+		victim = mem_cgroup_select_victim(root_mem);
+		ret = try_to_free_mem_cgroup_pages(victim, gfp_mask, noswap);
+		css_put(&victim->css);
 		if (res_counter_check_under_limit(&root_mem->res))
-			return 0;
-		cgroup_lock();
-		next_mem = mem_cgroup_get_next_node(next_mem, root_mem);
-		cgroup_unlock();
+			return 1;
+		total += ret;
 	}
+	ret = total;
+	if (res_counter_check_under_limit(&root_mem->res))
+		ret = 1;
+
 	return ret;
 }
 
@@ -705,6 +635,8 @@ static int __mem_cgroup_try_charge(struc
 
 		ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, gfp_mask,
 							noswap);
+		if (ret)
+			continue;
 
 		/*
 		 * try_to_free_mem_cgroup_pages() might not give us a full
@@ -1981,8 +1913,9 @@ mem_cgroup_create(struct cgroup_subsys *
 		res_counter_init(&mem->res, NULL);
 		res_counter_init(&mem->memsw, NULL);
 	}
-
-	mem->last_scanned_child = NULL;
+	spin_lock_init(&mem->scan_lock);
+	mem->last_scan_child = 0;
+	mem->scan_age = 0;
 
 	return &mem->css;
 free_out:

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

next prev parent reply	other threads:[~2008-12-01  6:04 UTC|newest]

Thread overview: 14+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2008-12-01  5:59 [PATCH 0/3] cgroup id and scanning without cgroup_lock KAMEZAWA Hiroyuki
2008-12-01  6:02 ` [PATCH 1/3] cgroup: fix pre_destroy and semantics of css->refcnt KAMEZAWA Hiroyuki
2008-12-02  6:15   ` Li Zefan
2008-12-02  6:21     ` KAMEZAWA Hiroyuki
2008-12-02  6:56       ` Li Zefan
2008-12-02  7:13         ` KAMEZAWA Hiroyuki
2008-12-02  7:31           ` Li Zefan
2008-12-02  7:39             ` KAMEZAWA Hiroyuki
2008-12-03  3:44   ` Li Zefan
2008-12-03  3:54     ` KAMEZAWA Hiroyuki
2008-12-01  6:03 ` [PATCH 2/3] cgroup: cgroup ID and scanning under RCU KAMEZAWA Hiroyuki
2008-12-01  6:04 ` KAMEZAWA Hiroyuki [this message]
2008-12-01  6:24 ` [PATCH 0/3] cgroup id and scanning without cgroup_lock Balbir Singh
2008-12-01  7:52   ` KAMEZAWA Hiroyuki

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20081201150406.81b10fc4.kamezawa.hiroyu@jp.fujitsu.com \
    --to=kamezawa.hiroyu@jp.fujitsu.com \
    --cc=akpm@linux-foundation.org \
    --cc=balbir@linux.vnet.ibm.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-mm@kvack.org \
    --cc=lizf@cn.fujitsu.com \
    --cc=menage@google.com \
    --cc=nishimura@mxp.nes.nec.co.jp \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox