linux-mm.kvack.org archive mirror
 help / color / mirror / Atom feed
From: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
To: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: "linux-mm@kvack.org" <linux-mm@kvack.org>,
	"linux-kernel@vger.kernel.org" <linux-kernel@vger.kernel.org>,
	"akpm@linux-foundation.org" <akpm@linux-foundation.org>,
	Ying Han <yinghan@google.com>,
	"nishimura@mxp.nes.nec.co.jp" <nishimura@mxp.nes.nec.co.jp>,
	"balbir@linux.vnet.ibm.com" <balbir@linux.vnet.ibm.com>
Subject: [RFC][PATCH v3 1/10] check reclaimable in hierarchy walk
Date: Thu, 26 May 2011 14:15:29 +0900	[thread overview]
Message-ID: <20110526141529.53b70097.kamezawa.hiroyu@jp.fujitsu.com> (raw)
In-Reply-To: <20110526141047.dc828124.kamezawa.hiroyu@jp.fujitsu.com>


I may post this patch as stand alone, later.
==
Check whether a memcg has reclaimable pages in select_victim().

Now, with the help of the memcg->scan_nodes bitmap, we can check whether a
memcg has reclaimable pages with a simple test of nodes_empty(mem->scan_nodes).

mem->scan_nodes is a bitmap that shows which nodes contain reclaimable
memory for the memcg; it is updated periodically.

This patch makes use of scan_nodes and modifies the hierarchy walk during
memory shrinking in the following way.

  - check scan_nodes in mem_cgroup_select_victim()
  - mem_cgroup_select_victim() returns NULL if no memcg is reclaimable.
  - force update of scan_nodes.
  - rename mem_cgroup_select_victim() to be mem_cgroup_select_get_victim()
    to show refcnt is +1.

This will make hierarchy walk better.

This also allows us to remove the mem_cgroup_local_usage() check, which was
used for the same purpose. That function was wrong because it cannot account
for unevictable pages or for tmpfs pages on a swapless system.

Changelog:
 - added since v3.

Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
---
 mm/memcontrol.c |  165 +++++++++++++++++++++++++++++++++++++-------------------
 1 file changed, 110 insertions(+), 55 deletions(-)

Index: memcg_async/mm/memcontrol.c
===================================================================
--- memcg_async.orig/mm/memcontrol.c
+++ memcg_async/mm/memcontrol.c
@@ -584,15 +584,6 @@ static long mem_cgroup_read_stat(struct 
 	return val;
 }
 
-static long mem_cgroup_local_usage(struct mem_cgroup *mem)
-{
-	long ret;
-
-	ret = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_RSS);
-	ret += mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_CACHE);
-	return ret;
-}
-
 static void mem_cgroup_swap_statistics(struct mem_cgroup *mem,
 					 bool charge)
 {
@@ -1555,43 +1546,6 @@ u64 mem_cgroup_get_limit(struct mem_cgro
 	return min(limit, memsw);
 }
 
-/*
- * Visit the first child (need not be the first child as per the ordering
- * of the cgroup list, since we track last_scanned_child) of @mem and use
- * that to reclaim free pages from.
- */
-static struct mem_cgroup *
-mem_cgroup_select_victim(struct mem_cgroup *root_mem)
-{
-	struct mem_cgroup *ret = NULL;
-	struct cgroup_subsys_state *css;
-	int nextid, found;
-
-	if (!root_mem->use_hierarchy) {
-		css_get(&root_mem->css);
-		ret = root_mem;
-	}
-
-	while (!ret) {
-		rcu_read_lock();
-		nextid = root_mem->last_scanned_child + 1;
-		css = css_get_next(&mem_cgroup_subsys, nextid, &root_mem->css,
-				   &found);
-		if (css && css_tryget(css))
-			ret = container_of(css, struct mem_cgroup, css);
-
-		rcu_read_unlock();
-		/* Updates scanning parameter */
-		if (!css) {
-			/* this means start scan from ID:1 */
-			root_mem->last_scanned_child = 0;
-		} else
-			root_mem->last_scanned_child = found;
-	}
-
-	return ret;
-}
-
 #if MAX_NUMNODES > 1
 
 /*
@@ -1600,11 +1554,11 @@ mem_cgroup_select_victim(struct mem_cgro
  * nodes based on the zonelist. So update the list loosely once per 10 secs.
  *
  */
-static void mem_cgroup_may_update_nodemask(struct mem_cgroup *mem)
+static void mem_cgroup_may_update_nodemask(struct mem_cgroup *mem, bool force)
 {
 	int nid;
 
-	if (time_after(mem->next_scan_node_update, jiffies))
+	if (!force && time_after(mem->next_scan_node_update, jiffies))
 		return;
 
 	mem->next_scan_node_update = jiffies + 10*HZ;
@@ -1641,7 +1595,7 @@ int mem_cgroup_select_victim_node(struct
 {
 	int node;
 
-	mem_cgroup_may_update_nodemask(mem);
+	mem_cgroup_may_update_nodemask(mem, false);
 	node = mem->last_scanned_node;
 
 	node = next_node(node, mem->scan_nodes);
@@ -1660,13 +1614,117 @@ int mem_cgroup_select_victim_node(struct
 	return node;
 }
 
+/**
+ * mem_cgroup_has_reclaimable
+ * @memcg: the mem_cgroup to test
+ *
+ * The caller can test whether the memcg has reclaimable pages.
+ *
+ * This function checks memcg has reclaimable pages or not with bitmap of
+ * memcg->scan_nodes. This bitmap is updated periodically and indicates
+ * which node has reclaimable memcg memory or not.
+ * Although this is a rough test and result is not very precise but we don't
+ * have to scan all nodes and don't have to use locks.
+ *
+ * For non-NUMA, this checks reclaimable pages on zones because we don't
+ * update scan_nodes.(see below)
+ */
+static bool mem_cgroup_has_reclaimable(struct mem_cgroup *memcg)
+{
+	return !nodes_empty(memcg->scan_nodes);
+}
+
 #else
+
+static void mem_cgroup_may_update_nodemask(struct mem_cgroup *mem, bool force)
+{
+}
+
 int mem_cgroup_select_victim_node(struct mem_cgroup *mem)
 {
 	return 0;
 }
+
+static bool mem_cgroup_has_reclaimable(struct mem_cgroup *memcg)
+{
+	unsigned long nr;
+	int zid;
+
+	for (zid = NODE_DATA(0)->nr_zones - 1; zid >= 0; zid--)
+		if (mem_cgroup_zone_reclaimable_pages(memcg, 0, zid))
+			break;
+	if (zid < 0)
+		return false;
+	return true;
+}
 #endif
 
+/**
+ * mem_cgroup_select_get_victim
+ * @root_mem: the root memcg of hierarchy which should be shrinked.
+ *
+ * Visit children of root_mem one by one. If the routine finds a memcg
+ * which contains reclaimable pages, returns it with refcnt +1. The
+ * scan is done in round-robin and 'the next start point' is saved into
+ * mem->last_scanned_child. If no reclaimable memcg are found, returns NULL.
+ */
+static struct mem_cgroup *
+mem_cgroup_select_get_victim(struct mem_cgroup *root_mem)
+{
+	struct mem_cgroup *ret = NULL;
+	struct cgroup_subsys_state *css;
+	int nextid, found;
+	bool second_visit = false;
+
+	if (!root_mem->use_hierarchy)
+		goto return_root;
+
+	while (!ret) {
+		rcu_read_lock();
+		nextid = root_mem->last_scanned_child + 1;
+		css = css_get_next(&mem_cgroup_subsys, nextid, &root_mem->css,
+				   &found);
+		if (css && css_tryget(css))
+			ret = container_of(css, struct mem_cgroup, css);
+
+		rcu_read_unlock();
+		/* Updates scanning parameter */
+		if (!css) { /* Indicates we scanned the last node of tree */
+			/*
+			 * If all memcg has no reclaimable pages, we may enter
+			 * an infinite loop. Exit here if we reached the end
+			 * of hierarchy tree twice.
+			 */
+			if (second_visit)
+				return NULL;
+			/* this means start scan from ID:1 */
+			root_mem->last_scanned_child = 0;
+			second_visit = true;
+		} else
+			root_mem->last_scanned_child = found;
+		if (css && ret) {
+			/*
+ 			 * check memcg has reclaimable memory or not. Update
+ 			 * information carefully if we might fail with cached
+ 			 * bitmask information.
+ 			 */
+			if (second_visit)
+				mem_cgroup_may_update_nodemask(ret, true);
+
+			if (!mem_cgroup_has_reclaimable(ret)) {
+				css_put(css);
+				ret = NULL;
+			}
+		}
+	}
+
+	return ret;
+return_root:
+	css_get(&root_mem->css);
+	return root_mem;
+}
+
+
 /*
  * Scan the hierarchy if needed to reclaim memory. We remember the last child
  * we reclaimed from, so that we don't end up penalizing one child extensively
@@ -1705,7 +1763,9 @@ static int mem_cgroup_hierarchical_recla
 		is_kswapd = true;
 
 	while (1) {
-		victim = mem_cgroup_select_victim(root_mem);
+		victim = mem_cgroup_select_get_victim(root_mem);
+		if (!victim)
+			return total;
 		if (victim == root_mem) {
 			loop++;
 			if (loop >= 1)
@@ -1733,11 +1793,6 @@ static int mem_cgroup_hierarchical_recla
 				}
 			}
 		}
-		if (!mem_cgroup_local_usage(victim)) {
-			/* this cgroup's local usage == 0 */
-			css_put(&victim->css);
-			continue;
-		}
 		/* we use swappiness of local cgroup */
 		if (check_soft) {
 			ret = mem_cgroup_shrink_node_zone(victim, gfp_mask,

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

  reply	other threads:[~2011-05-26  5:22 UTC|newest]

Thread overview: 35+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2011-05-26  5:10 [RFC][PATCH v3 0/10] memcg async reclaim KAMEZAWA Hiroyuki
2011-05-26  5:15 ` KAMEZAWA Hiroyuki [this message]
2011-05-27  5:47   ` [RFC][PATCH v3 1/10] check reclaimable in hierarchy walk Ying Han
2011-05-26  5:18 ` [RFC][PATCH v3 2/10] memcg: fix cached charge drain ratio KAMEZAWA Hiroyuki
2011-05-26  5:19 ` [RFC][PATCH v3 3/10] memcg: a test whether zone is reclaimable or not KAMEZAWA Hiroyuki
2011-05-27  7:21   ` Ying Han
2011-05-27  8:25     ` KAMEZAWA Hiroyuki
2011-05-26  5:20 ` [RFC][PATCH v3 4/10] memcg: export swappiness KAMEZAWA Hiroyuki
2011-05-26  5:23 ` [RFC][PATCH v3 5/10] memcg keep margin to limit in background KAMEZAWA Hiroyuki
2011-05-26  5:24 ` [RFC][PATCH v3 6/10] memcg : auto keep margin in background , workqueue core KAMEZAWA Hiroyuki
2011-05-26  5:30 ` [RFC][PATCH v3 7/10] workqueue: add WQ_IDLEPRI KAMEZAWA Hiroyuki
2011-05-26  9:38   ` Tejun Heo
2011-05-26 10:30     ` KAMEZAWA Hiroyuki
2011-05-26 10:50       ` KAMEZAWA Hiroyuki
2011-05-26 11:44         ` Tejun Heo
2011-05-26 23:41           ` KAMEZAWA Hiroyuki
2011-05-27 20:20     ` Peter Zijlstra
2011-05-26  5:32 ` [RFC][PATCH v3 8/10] memcg: scan ratio calculation KAMEZAWA Hiroyuki
2011-05-26  5:35 ` [RFC][PATCH v3 9/10] memcg: scan limited memory reclaim KAMEZAWA Hiroyuki
2011-05-26  5:36 ` [RFC][PATCH v3 10/10] memcg : reclaim statistics KAMEZAWA Hiroyuki
2011-05-27  1:13   ` Ying Han
2011-05-27  1:17   ` Ying Han
2011-05-27  1:12     ` KAMEZAWA Hiroyuki
2011-05-27  1:14     ` KAMEZAWA Hiroyuki
2011-05-27  1:22       ` Ying Han
2011-05-27  1:49 ` [RFC][PATCH v3 0/10] memcg async reclaim Ying Han
2011-05-27  2:16   ` KAMEZAWA Hiroyuki
2011-05-27  4:33     ` Ying Han
2011-05-27  4:34       ` KAMEZAWA Hiroyuki
2011-05-27  4:49         ` Ying Han
2011-05-27  7:20           ` Ying Han
2011-05-31 17:04             ` Ying Han
2011-05-27  2:48   ` KAMEZAWA Hiroyuki
2011-05-27  3:05     ` KAMEZAWA Hiroyuki
2011-05-27  4:34       ` Ying Han

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20110526141529.53b70097.kamezawa.hiroyu@jp.fujitsu.com \
    --to=kamezawa.hiroyu@jp.fujitsu.com \
    --cc=akpm@linux-foundation.org \
    --cc=balbir@linux.vnet.ibm.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-mm@kvack.org \
    --cc=nishimura@mxp.nes.nec.co.jp \
    --cc=yinghan@google.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox