From: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
To: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: "linux-mm@kvack.org" <linux-mm@kvack.org>,
"balbir@linux.vnet.ibm.com" <balbir@linux.vnet.ibm.com>,
"xemul@openvz.org" <xemul@openvz.org>,
"nishimura@mxp.nes.nec.co.jp" <nishimura@mxp.nes.nec.co.jp>,
"yamamoto@valinux.co.jp" <yamamoto@valinux.co.jp>,
"hugh@veritas.com" <hugh@veritas.com>,
"kosaki.motohiro@jp.fujitsu.com" <kosaki.motohiro@jp.fujitsu.com>
Subject: [RFC][-mm] [7/7] background job for memcg
Date: Wed, 2 Jul 2008 21:17:38 +0900 [thread overview]
Message-ID: <20080702211738.f2ae389b.kamezawa.hiroyu@jp.fujitsu.com> (raw)
In-Reply-To: <20080702210322.518f6c43.kamezawa.hiroyu@jp.fujitsu.com>
Background reclaim for memcg.
This patch adds a daemon to do background page reclaim based on high/low
watermark for memcg. Almost all of the code has been rewritten since the
previous version, so the numbering starts again from v1.
Major changes from old one:
i) Reclaim is now started based on the distance to the limit,
not on the amount of pages.
By this, we don't need strict check against low < high < limit.
low and high are guaranteed to be below limit.
ii) A single daemon is used instead of one daemon per memcg.
This may be simpler than the previous design. (But it may be OK to start
a per-node thread; that is a TODO.)
iii) Because of ii), memcg->flags is added.
Note: I tried to use a workqueue, but it does not seem suitable for work
that loops, so I added a kthread.
BTW, if you think of better name, please tell me.
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
mm/memcontrol.c | 201 ++++++++++++++++++++++++++++++++++++++++++++++++++++++--
1 file changed, 196 insertions(+), 5 deletions(-)
Index: test-2.6.26-rc5-mm3++/mm/memcontrol.c
===================================================================
--- test-2.6.26-rc5-mm3++.orig/mm/memcontrol.c
+++ test-2.6.26-rc5-mm3++/mm/memcontrol.c
@@ -11,7 +11,7 @@
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
- * This program is distributed in the hope that it will be useful,
+ * This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
@@ -33,13 +33,31 @@
#include <linux/seq_file.h>
#include <linux/vmalloc.h>
#include <linux/mm_inline.h>
-
+#include <linux/list.h>
+#include <linux/wait.h>
+#include <linux/kthread.h>
+#include <linux/freezer.h>
#include <asm/uaccess.h>
struct cgroup_subsys mem_cgroup_subsys __read_mostly;
static struct kmem_cache *page_cgroup_cache __read_mostly;
#define MEM_CGROUP_RECLAIM_RETRIES 5
+/* Background reclaim stuff */
+static LIST_HEAD(memcg_global_mswapd_list);
+static DECLARE_WAIT_QUEUE_HEAD(memcg_mswapdq);
+static DEFINE_SPINLOCK(memcg_mswapd_lock);
+struct task_struct *mswapd_daemon __read_mostly;
+
+/*
+ * cftype->private ids of the two watermark control files.  They are
+ * handled directly in mem_cgroup_read()/mem_cgroup_write(); every other
+ * id is handed to the res_counter.
+ * NOTE(review): the values are presumably chosen so that they cannot
+ * collide with the res_counter member ids -- confirm.
+ */
+enum {
+ MEMCG_HWMARK = 10000, /* "start_reclaim_distance" -> highwmrk_distance */
+ MEMCG_LWMARK = 10001, /* "stop_reclaim_distance" -> lowwmrk_distance */
+};
+
+
+/* An interface for waiting the end of background reclaim */
+static DECLARE_WAIT_QUEUE_HEAD(memcg_destroy_waitq);
+
/*
* Statistics for memory cgroup.
*/
@@ -129,6 +147,11 @@ struct mem_cgroup {
struct mem_cgroup_lru_info info;
int prev_priority; /* for recording reclaim priority */
+ unsigned long flags;
+ /* background reclaim stuff */
+ unsigned long long highwmrk_distance;
+ unsigned long long lowwmrk_distance;
+ struct list_head mswapd_list;
/*
* statistics.
*/
@@ -136,6 +159,13 @@ struct mem_cgroup {
};
static struct mem_cgroup init_mem_cgroup;
+/* Flag bit for memcg itself */
+/* Bit numbers for mem_cgroup->flags (used with set_bit()/test_bit()). */
+enum {
+ /* set by mem_cgroup_schedule_reclaim(), cleared by memcg_mswapd() */
+ MEMCG_FLAG_IN_RECLAIM,
+ /* set by mem_cgroup_pre_destroy(); memcg_mswapd() then skips reclaim */
+ MEMCG_FLAG_OBSOLETE,
+};
+
+
/*
* We use the lower bit of the page->page_cgroup pointer as a bit spin
* lock. We need to ensure that page->page_cgroup is at least two
@@ -504,6 +534,37 @@ unsigned long mem_cgroup_isolate_pages(u
return nr_taken;
}
+
+/*
+ * Queue @memcg for the background reclaim daemon and kick the daemon.
+ *
+ * Nothing happens when the daemon is not running or when IN_RECLAIM is
+ * already set for @memcg.  Otherwise a css reference is taken on behalf
+ * of the queue entry; memcg_mswapd() drops it when it is done with the
+ * group.
+ */
+static void mem_cgroup_schedule_reclaim(struct mem_cgroup *memcg)
+{
+	unsigned long irqflags;
+
+	if (unlikely(!mswapd_daemon))
+		return;
+
+	/* Only the caller that flips IN_RECLAIM from 0 to 1 does the queueing. */
+	if (test_and_set_bit(MEMCG_FLAG_IN_RECLAIM, &memcg->flags))
+		return;
+
+	/* When OBSOLETE is marked, there is no thread in this group */
+	BUG_ON(test_bit(MEMCG_FLAG_OBSOLETE, &memcg->flags));
+
+	spin_lock_irqsave(&memcg_mswapd_lock, irqflags);
+	BUG_ON(!list_empty(&memcg->mswapd_list));
+	css_get(&memcg->css);
+	list_add_tail(&memcg->mswapd_list, &memcg_global_mswapd_list);
+	spin_unlock_irqrestore(&memcg_mswapd_lock, irqflags);
+
+	/* Skip the wakeup when nobody is sleeping on the queue. */
+	if (waitqueue_active(&memcg_mswapdq))
+		wake_up_interruptible(&memcg_mswapdq);
+}
+
+
+/*
+ * Called after a successful charge with the @distance reported by the
+ * res_counter: ask for background reclaim once @distance has dropped
+ * below memcg->highwmrk_distance.
+ */
+static void mem_cgroup_check_distance_to_limit(struct mem_cgroup *memcg,
+					unsigned long long distance)
+{
+	/* Still far enough from the limit: nothing to do. */
+	if (distance >= memcg->highwmrk_distance)
+		return;
+
+	mem_cgroup_schedule_reclaim(memcg);
+}
+
/*
* Charge the memory controller for page usage.
* Return
@@ -518,6 +579,7 @@ static int mem_cgroup_charge_common(stru
struct page_cgroup *pc;
unsigned long flags;
unsigned long nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
+ unsigned long long distance;
struct mem_cgroup_per_zone *mz;
pc = kmem_cache_alloc(page_cgroup_cache, gfp_mask);
@@ -543,7 +605,7 @@ static int mem_cgroup_charge_common(stru
css_get(&memcg->css);
}
- while (res_counter_charge(&mem->res, PAGE_SIZE)) {
+ while (res_counter_charge_distance(&mem->res, PAGE_SIZE, &distance)) {
if (!(gfp_mask & __GFP_WAIT))
goto out;
@@ -596,6 +658,9 @@ static int mem_cgroup_charge_common(stru
spin_unlock_irqrestore(&mz->lru_lock, flags);
unlock_page_cgroup(page);
+ /* O.K. we successfully charged. check thresholds...
+ should check gfp flag ? */
+ mem_cgroup_check_distance_to_limit(mem, distance);
done:
return 0;
out:
@@ -834,6 +899,68 @@ int mem_cgroup_shrink_usage(struct mm_st
return 0;
}
+/*
+ * A daemon that does background page shrinking within memcgs.
+ *
+ * It sleeps on memcg_mswapdq until mem_cgroup_schedule_reclaim() queues
+ * a memcg on memcg_global_mswapd_list, then reclaims from the queued
+ * groups one at a time.  A group is put back at the tail while its
+ * distance to the limit is still below lowwmrk_distance; otherwise
+ * MEMCG_FLAG_IN_RECLAIM and the queue's css reference are dropped and
+ * the waiters on memcg_destroy_waitq are woken.
+ */
+static int memcg_mswapd(void *data)
+{
+	DEFINE_WAIT(wait);
+	struct mem_cgroup *memcg;
+	/* Same width as memcg->lowwmrk_distance, which it is compared with. */
+	unsigned long long distance;
+	int requeue;
+
+	current->flags |= PF_SWAPWRITE;
+	set_user_nice(current, 0);
+	set_freezable();
+
+	while (!kthread_should_stop()) {
+		prepare_to_wait(&memcg_mswapdq, &wait, TASK_INTERRUPTIBLE);
+
+		/* Is there a scheduled one? */
+		spin_lock_irq(&memcg_mswapd_lock);
+		if (list_empty(&memcg_global_mswapd_list)) {
+			spin_unlock_irq(&memcg_mswapd_lock);
+			if (!kthread_should_stop()) {
+				schedule();
+				try_to_freeze();
+			}
+			finish_wait(&memcg_mswapdq, &wait);
+			continue;
+		}
+		memcg = container_of(memcg_global_mswapd_list.next,
+					struct mem_cgroup, mswapd_list);
+		list_del_init(&memcg->mswapd_list);
+		spin_unlock_irq(&memcg_mswapd_lock);
+
+		finish_wait(&memcg_mswapdq, &wait);
+
+		/*
+		 * A group that is being destroyed is neither reclaimed from
+		 * nor put back on the queue: mem_cgroup_pre_destroy() is
+		 * waiting for IN_RECLAIM to be cleared, so requeueing it
+		 * would keep that waiter asleep forever.
+		 */
+		requeue = 0;
+		if (!test_bit(MEMCG_FLAG_OBSOLETE, &memcg->flags)) {
+			try_to_free_mem_cgroup_pages(memcg,
+						GFP_HIGHUSER_MOVABLE);
+			distance = res_counter_distance_to_limit(&memcg->res);
+			requeue = distance < memcg->lowwmrk_distance;
+		}
+
+		if (requeue) {
+			/* Don't clear IN_RECLAIM flag and add to tail */
+			spin_lock_irq(&memcg_mswapd_lock);
+			list_add_tail(&memcg->mswapd_list,
+					&memcg_global_mswapd_list);
+			spin_unlock_irq(&memcg_mswapd_lock);
+		} else {
+			/*
+			 * Clear the flag while our css reference still pins
+			 * memcg; memcg must not be touched after css_put().
+			 */
+			clear_bit(MEMCG_FLAG_IN_RECLAIM, &memcg->flags);
+			css_put(&memcg->css);
+			wake_up_all(&memcg_destroy_waitq);
+		}
+		yield();
+	}
+	/* Stopping this thread is not implemented yet; maybe in future. */
+	BUG();
+	return 0;
+}
+
+
int mem_cgroup_resize_limit(struct mem_cgroup *memcg, unsigned long long val)
{
@@ -931,8 +1058,17 @@ out:
static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft)
{
- return res_counter_read_u64(&mem_cgroup_from_cont(cont)->res,
- cft->private);
+ struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
+
+ switch (cft->private) {
+ case MEMCG_HWMARK:
+ return memcg->highwmrk_distance;
+ case MEMCG_LWMARK:
+ return memcg->lowwmrk_distance;
+ default:
+ break;
+ }
+ return res_counter_read_u64(&memcg->res, cft->private);
}
/*
* The user of this function is...
@@ -952,6 +1088,24 @@ static int mem_cgroup_write(struct cgrou
if (!ret)
ret = mem_cgroup_resize_limit(memcg, val);
break;
+ case MEMCG_HWMARK:
+ ret = res_counter_memparse_write_strategy(buffer, &val);
+ if (!ret) {
+ if (val <= memcg->lowwmrk_distance)
+ memcg->highwmrk_distance = val;
+ else
+ ret = -EINVAL;
+ }
+ break;
+ case MEMCG_LWMARK:
+ ret = res_counter_memparse_write_strategy(buffer, &val);
+ if (!ret) {
+ if (val >= memcg->highwmrk_distance)
+ memcg->lowwmrk_distance = val;
+ else
+ ret = -EINVAL;
+ }
+ break;
default:
ret = -EINVAL; /* should be BUG() ? */
break;
@@ -1063,6 +1217,18 @@ static struct cftype mem_cgroup_files[]
.name = "stat",
.read_map = mem_control_stat_show,
},
+ {
+ .name = "start_reclaim_distance",
+ .private = MEMCG_HWMARK,
+ .write_string = mem_cgroup_write,
+ .read_u64 = mem_cgroup_read,
+ },
+ {
+ .name = "stop_reclaim_distance",
+ .private = MEMCG_LWMARK,
+ .write_string = mem_cgroup_write,
+ .read_u64 = mem_cgroup_read,
+ },
};
static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node)
@@ -1124,6 +1290,19 @@ static void mem_cgroup_free(struct mem_c
vfree(mem);
}
+/*
+ * Start the background reclaim daemon (run as a late initcall).
+ *
+ * On failure mswapd_daemon stays NULL, which turns
+ * mem_cgroup_schedule_reclaim() into a no-op, and the kthread_run()
+ * error code is returned instead of being reported as success.
+ */
+static int mem_cgroup_start_daemon(void)
+{
+	struct task_struct *task;
+
+	task = kthread_run(memcg_mswapd, NULL, "memcontrol");
+	if (IS_ERR(task)) {
+		printk(KERN_ERR "failed to start memory controller daemon\n");
+		mswapd_daemon = NULL;
+		return PTR_ERR(task);
+	}
+
+	mswapd_daemon = task;
+	return 0;
+}
+late_initcall(mem_cgroup_start_daemon);
static struct cgroup_subsys_state *
mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
@@ -1146,12 +1325,15 @@ mem_cgroup_create(struct cgroup_subsys *
if (alloc_mem_cgroup_per_zone_info(mem, node))
goto free_out;
+	/* mem->flags is cleared by memset() */
+	/*
+	 * A zeroed list_head is not an empty list: without this,
+	 * BUG_ON(!list_empty(&memcg->mswapd_list)) in
+	 * mem_cgroup_schedule_reclaim() fires on the first schedule.
+	 */
+	INIT_LIST_HEAD(&mem->mswapd_list);
+
return &mem->css;
free_out:
for_each_node_state(node, N_POSSIBLE)
free_mem_cgroup_per_zone_info(mem, node);
if (cont->parent != NULL)
mem_cgroup_free(mem);
+
return ERR_PTR(-ENOMEM);
}
@@ -1159,7 +1341,16 @@ static void mem_cgroup_pre_destroy(struc
struct cgroup *cont)
{
struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
+
+	/* Make memcg_mswapd() skip reclaim for this group from now on. */
+	set_bit(MEMCG_FLAG_OBSOLETE, &mem->flags);
+	/*
+	 * mswapd_daemon is a task_struct, not a wait queue head, so it has
+	 * to be woken with wake_up_process() rather than wake_up().
+	 */
+	if (mswapd_daemon)
+		wake_up_process(mswapd_daemon);
+	/*
+	 * Wait until the daemon has removed the group from the background
+	 * reclaim queue.
+	 * NOTE(review): the return value is ignored, so a signal ends the
+	 * wait early -- confirm that this is acceptable.
+	 */
+	wait_event_interruptible(memcg_destroy_waitq,
+		!(test_bit(MEMCG_FLAG_IN_RECLAIM, &mem->flags)));
+
mem_cgroup_force_empty(mem);
+
}
static void mem_cgroup_destroy(struct cgroup_subsys *ss,
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
next prev parent reply other threads:[~2008-07-02 12:17 UTC|newest]
Thread overview: 24+ messages / expand[flat|nested] mbox.gz Atom feed top
2008-07-02 12:03 [RFC][-mm] [0/7] misc memcg patch set KAMEZAWA Hiroyuki
2008-07-02 12:07 ` [RFC][-mm] [1/7] shmem swapcache fix KAMEZAWA Hiroyuki
2008-07-02 12:08 ` [RFC][-mm] [2/7] delete FLAG_CACHE KAMEZAWA Hiroyuki
2008-07-02 12:10 ` [RFC][-mm] [3/7] add shmem page to active list KAMEZAWA Hiroyuki
2008-07-03 0:11 ` KAMEZAWA Hiroyuki
2008-07-03 4:27 ` KAMEZAWA Hiroyuki
2008-07-03 7:03 ` Hugh Dickins
2008-07-03 7:43 ` KAMEZAWA Hiroyuki
2008-07-03 22:28 ` Hugh Dickins
2008-07-04 0:47 ` KAMEZAWA Hiroyuki
2008-07-02 12:12 ` [RFC][-mm] [4/7] handle limit change KAMEZAWA Hiroyuki
2008-07-02 12:13 ` [RFC][-mm] [5/7] reduce usage at " KAMEZAWA Hiroyuki
2008-07-02 12:15 ` [RFC][-mm] [6/7] res_counter distance to limit KAMEZAWA Hiroyuki
2008-07-02 19:19 ` Paul Menage
2008-07-02 23:57 ` KAMEZAWA Hiroyuki
2008-07-02 12:17 ` KAMEZAWA Hiroyuki [this message]
2008-07-02 14:23 ` [RFC][-mm] [0/7] misc memcg patch set Daisuke Nishimura
2008-07-02 16:31 ` KOSAKI Motohiro
2008-07-02 23:56 ` KAMEZAWA Hiroyuki
2008-07-03 2:38 ` KAMEZAWA Hiroyuki
2008-07-03 3:06 ` KAMEZAWA Hiroyuki
2008-07-03 9:07 ` Balbir Singh
2008-07-03 9:54 ` KAMEZAWA Hiroyuki
2008-07-11 8:38 ` KAMEZAWA Hiroyuki
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20080702211738.f2ae389b.kamezawa.hiroyu@jp.fujitsu.com \
--to=kamezawa.hiroyu@jp.fujitsu.com \
--cc=balbir@linux.vnet.ibm.com \
--cc=hugh@veritas.com \
--cc=kosaki.motohiro@jp.fujitsu.com \
--cc=linux-mm@kvack.org \
--cc=nishimura@mxp.nes.nec.co.jp \
--cc=xemul@openvz.org \
--cc=yamamoto@valinux.co.jp \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox