linux-mm.kvack.org archive mirror
 help / color / mirror / Atom feed
From: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
To: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: "linux-mm@kvack.org" <linux-mm@kvack.org>,
	"containers@lists.osdl.org" <containers@lists.osdl.org>,
	Andrew Morton <akpm@linux-foundation.org>,
	"balbir@linux.vnet.ibm.com" <balbir@linux.vnet.ibm.com>,
	"yamamoto@valinux.co.jp" <yamamoto@valinux.co.jp>,
	"riel@redhat.com" <riel@redhat.com>,
	xemul@openvz.org
Subject: [RFC][for -mm] memory controller enhancements for reclaiming take2 [7/8] bacground reclaim for memory controller
Date: Mon, 3 Dec 2007 18:42:44 +0900	[thread overview]
Message-ID: <20071203184244.200faee8.kamezawa.hiroyu@jp.fujitsu.com> (raw)
In-Reply-To: <20071203183355.0061ddeb.kamezawa.hiroyu@jp.fujitsu.com>

High Low watermark for page reclaiming in memory cgroup(1) and background
reclaim routine.

 - If USAGE is bigger than high watermark, background reclaim starts.
 - If USAGE is lower than low watermark, background reclaim stops.

Each value is represented in bytes (See by kernel/res_counter.c)

Changelog v1 -> v2:
  - merged high/low patch and background reclaim patch.
  - removed automatic adjustement of high/low value.
  - cleanups.
  - add schedule_timedwait() in background reclaim's busy loop.
    (LRU scan can be very heavy workload and cosume CPUS.)

Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
From: YAMAMOTO Takashi <yamamoto@valinux.co.jp>


 mm/memcontrol.c |   97 +++++++++++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 92 insertions(+), 5 deletions(-)

Index: linux-2.6.24-rc3-mm2/mm/memcontrol.c
===================================================================
--- linux-2.6.24-rc3-mm2.orig/mm/memcontrol.c
+++ linux-2.6.24-rc3-mm2/mm/memcontrol.c
@@ -30,6 +30,8 @@
 #include <linux/spinlock.h>
 #include <linux/fs.h>
 #include <linux/seq_file.h>
+#include <linux/kthread.h>
+#include <linux/freezer.h>
 
 #include <asm/uaccess.h>
 
@@ -117,11 +119,6 @@ struct mem_cgroup_lru_info {
  * page cache and RSS per cgroup. We would eventually like to provide
  * statistics based on the statistics developed by Rik Van Riel for clock-pro,
  * to help the administrator determine what knobs to tune.
- *
- * TODO: Add a water mark for the memory controller. Reclaim will begin when
- * we hit the water mark. May be even add a low water mark, such that
- * no reclaim occurs from a cgroup at it's low water mark, this is
- * a feature that will be implemented much later in the future.
  */
 struct mem_cgroup {
 	struct cgroup_subsys_state css;
@@ -130,6 +127,13 @@ struct mem_cgroup {
 	 */
 	struct res_counter res;
 	/*
+	 * background reclaim
+	 */
+	struct {
+		wait_queue_head_t waitq;
+		struct task_struct *thread;
+	} daemon;
+	/*
 	 * Per cgroup active and inactive list, similar to the
 	 * per zone LRU lists.
 	 */
@@ -401,6 +405,17 @@ static void __mem_cgroup_move_lists(stru
 	}
 }
 
+static void
+mem_cgroup_schedule_reclaim(struct mem_cgroup *mem)
+{
+	if (!unlikely(mem->daemon.thread))
+		return;
+	if (!waitqueue_active(&mem->daemon.waitq))
+		return;
+	wake_up_interruptible(&mem->daemon.waitq);
+}
+
+
 int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem)
 {
 	int ret;
@@ -708,6 +723,8 @@ noreclaim:
 			mem_cgroup_out_of_memory(mem, GFP_KERNEL);
 		goto free_pc;
 	}
+	if (res_counter_above_high_watermark(&mem->res))
+		mem_cgroup_schedule_reclaim(mem);
 
 	atomic_set(&pc->ref_cnt, 1);
 	pc->mem_cgroup = mem;
@@ -863,6 +880,42 @@ retry:
 }
 
 /*
+ * Background page reclaim daeom for memory controller.
+ */
+
+static int mem_cgroup_reclaim_daemon(void *data)
+{
+	DEFINE_WAIT(wait);
+	struct mem_cgroup *mem = data;
+
+	css_get(&mem->css);
+	current->flags |= PF_SWAPWRITE;
+	set_freezable();
+
+	while (!kthread_should_stop()) {
+		prepare_to_wait(&mem->daemon.waitq, &wait, TASK_INTERRUPTIBLE);
+		if (res_counter_below_low_watermark(&mem->res)) {
+			if (!kthread_should_stop()) {
+				schedule();
+				try_to_freeze();
+			}
+			finish_wait(&mem->daemon.waitq, &wait);
+			continue;
+		}
+		finish_wait(&mem->daemon.waitq, &wait);
+		try_to_free_mem_cgroup_pages(mem, GFP_HIGHUSER_MOVABLE);
+		/*
+		 * throttle LRU scan for moderate reclaiming.
+		 * not congestion wait.
+		 */
+		schedule_timeout(HZ/10);
+	}
+
+	css_put(&mem->css);
+	return 0;
+}
+
+/*
  * This routine traverse page_cgroup in given list and drop them all.
  * This routine ignores page_cgroup->ref_cnt.
  * *And* this routine doesn't reclaim page itself, just removes page_cgroup.
@@ -1136,6 +1189,18 @@ static struct cftype mem_cgroup_files[] 
 		.read = mem_control_type_read,
 	},
 	{
+		.name = "low_watermark_in_bytes",
+		.private = RES_LWMARK,
+		.write = mem_cgroup_write,
+		.read = mem_cgroup_read,
+	},
+	{
+		.name = "high_watermark_in_bytes",
+		.private = RES_HWMARK,
+		.write = mem_cgroup_write,
+		.read = mem_cgroup_read,
+	},
+	{
 		.name = "force_empty",
 		.write = mem_force_empty_write,
 		.read = mem_force_empty_read,
@@ -1186,6 +1251,16 @@ static void free_mem_cgroup_per_zone_inf
 
 static struct mem_cgroup init_mem_cgroup;
 
+static int __init mem_cgroup_reclaim_init(void)
+{
+	init_mem_cgroup.daemon.thread = kthread_run(mem_cgroup_reclaim_daemon,
+					&init_mem_cgroup, "memcontd");
+	if (IS_ERR(init_mem_cgroup.daemon.thread))
+		BUG();
+	return 0;
+}
+late_initcall(mem_cgroup_reclaim_init);
+
 static struct cgroup_subsys_state *
 mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
 {
@@ -1213,6 +1288,17 @@ mem_cgroup_create(struct cgroup_subsys *
 		if (alloc_mem_cgroup_per_zone_info(mem, node))
 			goto free_out;
 
+	/* Memory Reclaim Daemon per cgroup */
+	init_waitqueue_head(&mem->daemon.waitq);
+	if (mem != &init_mem_cgroup) {
+		/* Complicated...but we cannot call kthread create here..*/
+		/* init call will later assign kthread */
+		mem->daemon.thread = kthread_run(mem_cgroup_reclaim_daemon,
+					mem, "memcontd");
+		if (IS_ERR(mem->daemon.thread))
+			goto free_out;
+	}
+
 	return &mem->css;
 free_out:
 	for_each_node_state(node, N_POSSIBLE)
@@ -1227,6 +1313,7 @@ static void mem_cgroup_pre_destroy(struc
 {
 	struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
 	mem_cgroup_force_empty(mem);
+	kthread_stop(mem->daemon.thread);
 }
 
 static void mem_cgroup_destroy(struct cgroup_subsys *ss,

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

  parent reply	other threads:[~2007-12-03  9:42 UTC|newest]

Thread overview: 23+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2007-12-03  9:33 [RFC][for -mm] memory controller enhancements for reclaiming take2 [0/8] introduction KAMEZAWA Hiroyuki
2007-12-03  9:35 ` [RFC][for -mm] memory controller enhancements for reclaiming take2 [1/8] clean up : remove unused variable KAMEZAWA Hiroyuki
2007-12-04 15:55   ` Balbir Singh
2007-12-03  9:36 ` [RFC][for -mm] memory controller enhancements for reclaiming take2 [2/8] add BUG_ON() in mem_cgroup_zoneinfo KAMEZAWA Hiroyuki
2007-12-04 16:01   ` Balbir Singh
2007-12-03  9:37 ` [RFC][for -mm] memory controller enhancements for reclaiming take2 [3/8] define free_mem_cgroup_per_zone_info KAMEZAWA Hiroyuki
2007-12-04 16:32   ` Balbir Singh
2007-12-03  9:38 ` [RFC][for -mm] memory controller enhancements for reclaiming take2 [4/8] possible race fix in res_counter KAMEZAWA Hiroyuki
2007-12-04 19:02   ` Balbir Singh
2007-12-03  9:39 ` [RFC][for -mm] memory controller enhancements for reclaiming take2 [5/8] throttling simultaneous callers of try_to_free_mem_cgroup_pages KAMEZAWA Hiroyuki
2007-12-03 14:24   ` Rik van Riel
2007-12-04  1:33     ` KAMEZAWA Hiroyuki
2007-12-04 13:27       ` Balbir Singh
2007-12-05  0:26         ` KAMEZAWA Hiroyuki
2007-12-03  9:41 ` [RFC][for -mm] memory controller enhancements for reclaiming take2 [6/8] high_low watermark for res_counter KAMEZAWA Hiroyuki
2007-12-03  9:42 ` KAMEZAWA Hiroyuki [this message]
2007-12-04  3:07   ` [RFC][for -mm] memory controller enhancements for reclaiming take2 [7/8] bacground reclaim for memory controller YAMAMOTO Takashi
2007-12-04  3:18     ` KAMEZAWA Hiroyuki
2007-12-04  3:31       ` YAMAMOTO Takashi
2007-12-03  9:45 ` [RFC][for -mm] memory controller enhancements for reclaiming take2 [8/8] wake up waiters at unchage KAMEZAWA Hiroyuki
2007-12-04  6:46 ` [RFC][for -mm] memory controller enhancements for reclaiming take2 [0/8] introduction KAMEZAWA Hiroyuki
2007-12-04 14:25 ` Balbir Singh
2007-12-05  0:44   ` KAMEZAWA Hiroyuki

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20071203184244.200faee8.kamezawa.hiroyu@jp.fujitsu.com \
    --to=kamezawa.hiroyu@jp.fujitsu.com \
    --cc=akpm@linux-foundation.org \
    --cc=balbir@linux.vnet.ibm.com \
    --cc=containers@lists.osdl.org \
    --cc=linux-mm@kvack.org \
    --cc=riel@redhat.com \
    --cc=xemul@openvz.org \
    --cc=yamamoto@valinux.co.jp \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox