* [PATCH 1/8] memcg: introduce mem_cgroup_cancel_charge()
2009-09-17 2:23 [RFC][EXPERIMENTAL][PATCH 0/8] memcg: migrate charge at task move Daisuke Nishimura
@ 2009-09-17 2:24 ` Daisuke Nishimura
2009-09-17 4:12 ` KAMEZAWA Hiroyuki
2009-09-17 2:24 ` [PATCH 2/8] memcg: cleanup mem_cgroup_move_parent() Daisuke Nishimura
` (7 subsequent siblings)
8 siblings, 1 reply; 49+ messages in thread
From: Daisuke Nishimura @ 2009-09-17 2:24 UTC (permalink / raw)
To: linux-mm
Cc: Balbir Singh, KAMEZAWA Hiroyuki, Paul Menage, Li Zefan,
Daisuke Nishimura
There are several places that call both res_counter_uncharge() and css_put()
to cancel the charge and the refcnt obtained by mem_cgroup_try_charge().
This patch introduces mem_cgroup_cancel_charge() and calls it in those places.
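
As a rough sketch of the pairing this helper is meant for (not part of the
patch itself; the caller and its failing step are hypothetical, only the two
memcg calls are real):

	struct mem_cgroup *mem = mem_cgroup_from_task(p);
	int ret;

	/* charge PAGE_SIZE to "mem" and take an extra css reference */
	ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, &mem, false, page);
	if (ret || !mem)
		return ret;
	if (some_later_step_fails())
		/* undo both the res_counter charge and the css refcount */
		mem_cgroup_cancel_charge(mem);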
Signed-off-by: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
---
mm/memcontrol.c | 35 ++++++++++++++---------------------
1 files changed, 14 insertions(+), 21 deletions(-)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index e2b98a6..00f3f97 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1370,6 +1370,17 @@ nomem:
return -ENOMEM;
}
+/* A helper function to cancel the charge and refcnt by try_charge */
+static inline void mem_cgroup_cancel_charge(struct mem_cgroup *mem)
+{
+ if (!mem_cgroup_is_root(mem)) {
+ res_counter_uncharge(&mem->res, PAGE_SIZE, NULL);
+ if (do_swap_account)
+ res_counter_uncharge(&mem->memsw, PAGE_SIZE, NULL);
+ }
+ css_put(&mem->css);
+}
+
/*
* A helper function to get mem_cgroup from ID. must be called under
* rcu_read_lock(). The caller must check css_is_removed() or some if
@@ -1436,13 +1447,7 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
lock_page_cgroup(pc);
if (unlikely(PageCgroupUsed(pc))) {
unlock_page_cgroup(pc);
- if (!mem_cgroup_is_root(mem)) {
- res_counter_uncharge(&mem->res, PAGE_SIZE, NULL);
- if (do_swap_account)
- res_counter_uncharge(&mem->memsw, PAGE_SIZE,
- NULL);
- }
- css_put(&mem->css);
+ mem_cgroup_cancel_charge(mem);
return;
}
@@ -1606,14 +1611,7 @@ static int mem_cgroup_move_parent(struct page_cgroup *pc,
cancel:
put_page(page);
uncharge:
- /* drop extra refcnt by try_charge() */
- css_put(&parent->css);
- /* uncharge if move fails */
- if (!mem_cgroup_is_root(parent)) {
- res_counter_uncharge(&parent->res, PAGE_SIZE, NULL);
- if (do_swap_account)
- res_counter_uncharge(&parent->memsw, PAGE_SIZE, NULL);
- }
+ mem_cgroup_cancel_charge(parent);
return ret;
}
@@ -1830,12 +1828,7 @@ void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *mem)
return;
if (!mem)
return;
- if (!mem_cgroup_is_root(mem)) {
- res_counter_uncharge(&mem->res, PAGE_SIZE, NULL);
- if (do_swap_account)
- res_counter_uncharge(&mem->memsw, PAGE_SIZE, NULL);
- }
- css_put(&mem->css);
+ mem_cgroup_cancel_charge(mem);
}
* Re: [PATCH 1/8] memcg: introduce mem_cgroup_cancel_charge()
2009-09-17 2:24 ` [PATCH 1/8] memcg: introduce mem_cgroup_cancel_charge() Daisuke Nishimura
@ 2009-09-17 4:12 ` KAMEZAWA Hiroyuki
0 siblings, 0 replies; 49+ messages in thread
From: KAMEZAWA Hiroyuki @ 2009-09-17 4:12 UTC (permalink / raw)
To: Daisuke Nishimura; +Cc: linux-mm, Balbir Singh, Paul Menage, Li Zefan
On Thu, 17 Sep 2009 11:24:00 +0900
Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp> wrote:
> There are some places calling both res_counter_uncharge() and css_put()
> to cancel the charge and the refcnt we have got by mem_cgroup_try_charge().
>
> This patch introduces mem_cgroup_cancel_charge() and call it in those places.
>
> Signed-off-by: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Acked-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
> ---
> mm/memcontrol.c | 35 ++++++++++++++---------------------
> 1 files changed, 14 insertions(+), 21 deletions(-)
>
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index e2b98a6..00f3f97 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -1370,6 +1370,17 @@ nomem:
> return -ENOMEM;
> }
>
> +/* A helper function to cancel the charge and refcnt by try_charge */
> +static inline void mem_cgroup_cancel_charge(struct mem_cgroup *mem)
> +{
> + if (!mem_cgroup_is_root(mem)) {
> + res_counter_uncharge(&mem->res, PAGE_SIZE, NULL);
> + if (do_swap_account)
> + res_counter_uncharge(&mem->memsw, PAGE_SIZE, NULL);
> + }
> + css_put(&mem->css);
> +}
> +
> /*
> * A helper function to get mem_cgroup from ID. must be called under
> * rcu_read_lock(). The caller must check css_is_removed() or some if
> @@ -1436,13 +1447,7 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
> lock_page_cgroup(pc);
> if (unlikely(PageCgroupUsed(pc))) {
> unlock_page_cgroup(pc);
> - if (!mem_cgroup_is_root(mem)) {
> - res_counter_uncharge(&mem->res, PAGE_SIZE, NULL);
> - if (do_swap_account)
> - res_counter_uncharge(&mem->memsw, PAGE_SIZE,
> - NULL);
> - }
> - css_put(&mem->css);
> + mem_cgroup_cancel_charge(mem);
> return;
> }
>
> @@ -1606,14 +1611,7 @@ static int mem_cgroup_move_parent(struct page_cgroup *pc,
> cancel:
> put_page(page);
> uncharge:
> - /* drop extra refcnt by try_charge() */
> - css_put(&parent->css);
> - /* uncharge if move fails */
> - if (!mem_cgroup_is_root(parent)) {
> - res_counter_uncharge(&parent->res, PAGE_SIZE, NULL);
> - if (do_swap_account)
> - res_counter_uncharge(&parent->memsw, PAGE_SIZE, NULL);
> - }
> + mem_cgroup_cancel_charge(parent);
> return ret;
> }
>
> @@ -1830,12 +1828,7 @@ void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *mem)
> return;
> if (!mem)
> return;
> - if (!mem_cgroup_is_root(mem)) {
> - res_counter_uncharge(&mem->res, PAGE_SIZE, NULL);
> - if (do_swap_account)
> - res_counter_uncharge(&mem->memsw, PAGE_SIZE, NULL);
> - }
> - css_put(&mem->css);
> + mem_cgroup_cancel_charge(mem);
> }
>
>
>
* [PATCH 2/8] memcg: cleanup mem_cgroup_move_parent()
2009-09-17 2:23 [RFC][EXPERIMENTAL][PATCH 0/8] memcg: migrate charge at task move Daisuke Nishimura
2009-09-17 2:24 ` [PATCH 1/8] memcg: introduce mem_cgroup_cancel_charge() Daisuke Nishimura
@ 2009-09-17 2:24 ` Daisuke Nishimura
2009-09-17 4:15 ` KAMEZAWA Hiroyuki
2009-09-17 2:25 ` [PATCH 3/8] cgroup: introduce cancel_attach() Daisuke Nishimura
` (6 subsequent siblings)
8 siblings, 1 reply; 49+ messages in thread
From: Daisuke Nishimura @ 2009-09-17 2:24 UTC (permalink / raw)
To: linux-mm
Cc: Balbir Singh, KAMEZAWA Hiroyuki, Paul Menage, Li Zefan,
Daisuke Nishimura
mem_cgroup_move_parent() calls try_charge first and cancel_charge on failure.
IMHO, charge/uncharge (especially charge) is a high-cost operation, so we should
avoid it as much as possible.
This patch delays try_charge in mem_cgroup_move_parent() by re-ordering
the checks it does.
It also changes the return value of mem_cgroup_move_account() from int
to void. Callers should confirm all the conditions needed to move the account.
Signed-off-by: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
---
include/linux/page_cgroup.h | 2 +
mm/memcontrol.c | 77 ++++++++++++++++--------------------------
2 files changed, 31 insertions(+), 48 deletions(-)
diff --git a/include/linux/page_cgroup.h b/include/linux/page_cgroup.h
index 7a3627e..321f037 100644
--- a/include/linux/page_cgroup.h
+++ b/include/linux/page_cgroup.h
@@ -57,6 +57,8 @@ static inline void ClearPageCgroup##uname(struct page_cgroup *pc) \
static inline int TestClearPageCgroup##uname(struct page_cgroup *pc) \
{ return test_and_clear_bit(PCG_##lname, &pc->flags); }
+TESTPCGFLAG(Locked, LOCK)
+
/* Cache flag is set only once (at allocation) */
TESTPCGFLAG(Cache, CACHE)
CLEARPCGFLAG(Cache, CACHE)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 00f3f97..8b2bbbb 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1487,20 +1487,15 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
*
* The caller must confirm following.
* - page is not on LRU (isolate_page() is useful.)
- *
- * returns 0 at success,
- * returns -EBUSY when lock is busy or "pc" is unstable.
+ * - the pc is locked, used, and ->mem_cgroup points to @from.
*
* This function does "uncharge" from old cgroup but doesn't do "charge" to
* new cgroup. It should be done by a caller.
*/
-static int mem_cgroup_move_account(struct page_cgroup *pc,
+static void mem_cgroup_move_account(struct page_cgroup *pc,
struct mem_cgroup *from, struct mem_cgroup *to)
{
- struct mem_cgroup_per_zone *from_mz, *to_mz;
- int nid, zid;
- int ret = -EBUSY;
struct page *page;
int cpu;
struct mem_cgroup_stat *stat;
@@ -1508,20 +1503,9 @@ static int mem_cgroup_move_account(struct page_cgroup *pc,
VM_BUG_ON(from == to);
VM_BUG_ON(PageLRU(pc->page));
-
- nid = page_cgroup_nid(pc);
- zid = page_cgroup_zid(pc);
- from_mz = mem_cgroup_zoneinfo(from, nid, zid);
- to_mz = mem_cgroup_zoneinfo(to, nid, zid);
-
- if (!trylock_page_cgroup(pc))
- return ret;
-
- if (!PageCgroupUsed(pc))
- goto out;
-
- if (pc->mem_cgroup != from)
- goto out;
+ VM_BUG_ON(!PageCgroupLocked(pc));
+ VM_BUG_ON(!PageCgroupUsed(pc));
+ VM_BUG_ON(pc->mem_cgroup != from);
if (!mem_cgroup_is_root(from))
res_counter_uncharge(&from->res, PAGE_SIZE, NULL);
@@ -1550,16 +1534,12 @@ static int mem_cgroup_move_account(struct page_cgroup *pc,
css_get(&to->css);
pc->mem_cgroup = to;
mem_cgroup_charge_statistics(to, pc, true);
- ret = 0;
-out:
- unlock_page_cgroup(pc);
/*
* We charges against "to" which may not have any tasks. Then, "to"
* can be under rmdir(). But in current implementation, caller of
* this function is just force_empty() and it's garanteed that
* "to" is never removed. So, we don't check rmdir status here.
*/
- return ret;
}
/*
@@ -1580,38 +1560,39 @@ static int mem_cgroup_move_parent(struct page_cgroup *pc,
if (!pcg)
return -EINVAL;
+ ret = -EBUSY;
+ if (!get_page_unless_zero(page))
+ goto out;
+ if (isolate_lru_page(page))
+ goto put;
- parent = mem_cgroup_from_cont(pcg);
-
+ ret = -EINVAL;
+ lock_page_cgroup(pc);
+ if (!PageCgroupUsed(pc) || pc->mem_cgroup != child) /* early check */
+ goto unlock;
+ unlock_page_cgroup(pc);
+ parent = mem_cgroup_from_cont(pcg);
ret = __mem_cgroup_try_charge(NULL, gfp_mask, &parent, false, page);
if (ret || !parent)
- return ret;
-
- if (!get_page_unless_zero(page)) {
- ret = -EBUSY;
- goto uncharge;
- }
-
- ret = isolate_lru_page(page);
-
- if (ret)
- goto cancel;
+ goto put_back;
- ret = mem_cgroup_move_account(pc, child, parent);
-
- putback_lru_page(page);
- if (!ret) {
- put_page(page);
+ lock_page_cgroup(pc);
+ if (likely(PageCgroupUsed(pc) && pc->mem_cgroup == child)) {
+ mem_cgroup_move_account(pc, child, parent);
/* drop extra refcnt by try_charge() */
css_put(&parent->css);
- return 0;
+ } else {
+ ret = -EINVAL;
+ mem_cgroup_cancel_charge(parent); /* does css_put */
}
-
-cancel:
+unlock:
+ unlock_page_cgroup(pc);
+put_back:
+ putback_lru_page(page);
+put:
put_page(page);
-uncharge:
- mem_cgroup_cancel_charge(parent);
+out:
return ret;
}
* Re: [PATCH 2/8] memcg: cleanup mem_cgroup_move_parent()
2009-09-17 2:24 ` [PATCH 2/8] memcg: cleanup mem_cgroup_move_parent() Daisuke Nishimura
@ 2009-09-17 4:15 ` KAMEZAWA Hiroyuki
0 siblings, 0 replies; 49+ messages in thread
From: KAMEZAWA Hiroyuki @ 2009-09-17 4:15 UTC (permalink / raw)
To: Daisuke Nishimura; +Cc: linux-mm, Balbir Singh, Paul Menage, Li Zefan
On Thu, 17 Sep 2009 11:24:42 +0900
Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp> wrote:
> mem_cgroup_move_parent() calls try_charge first and cancel_charge on failure.
> IMHO, charge/uncharge(especially charge) is high cost operation, so we should
> avoid it as far as possible.
>
> This patch tries to delay try_charge in mem_cgroup_move_parent() by re-ordering
> checks it does.
> And this patch changes the return value of mem_cgroup_move_account() from int
> to void. Callers should confirm all of conditions needed to move account.
>
> Signed-off-by: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Acked-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
> ---
> include/linux/page_cgroup.h | 2 +
> mm/memcontrol.c | 77 ++++++++++++++++--------------------------
> 2 files changed, 31 insertions(+), 48 deletions(-)
>
> diff --git a/include/linux/page_cgroup.h b/include/linux/page_cgroup.h
> index 7a3627e..321f037 100644
> --- a/include/linux/page_cgroup.h
> +++ b/include/linux/page_cgroup.h
> @@ -57,6 +57,8 @@ static inline void ClearPageCgroup##uname(struct page_cgroup *pc) \
> static inline int TestClearPageCgroup##uname(struct page_cgroup *pc) \
> { return test_and_clear_bit(PCG_##lname, &pc->flags); }
>
> +TESTPCGFLAG(Locked, LOCK)
> +
> /* Cache flag is set only once (at allocation) */
> TESTPCGFLAG(Cache, CACHE)
> CLEARPCGFLAG(Cache, CACHE)
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index 00f3f97..8b2bbbb 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -1487,20 +1487,15 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
> *
> * The caller must confirm following.
> * - page is not on LRU (isolate_page() is useful.)
> - *
> - * returns 0 at success,
> - * returns -EBUSY when lock is busy or "pc" is unstable.
> + * - the pc is locked, used, and ->mem_cgroup points to @from.
> *
> * This function does "uncharge" from old cgroup but doesn't do "charge" to
> * new cgroup. It should be done by a caller.
> */
>
> -static int mem_cgroup_move_account(struct page_cgroup *pc,
> +static void mem_cgroup_move_account(struct page_cgroup *pc,
> struct mem_cgroup *from, struct mem_cgroup *to)
> {
> - struct mem_cgroup_per_zone *from_mz, *to_mz;
> - int nid, zid;
> - int ret = -EBUSY;
> struct page *page;
> int cpu;
> struct mem_cgroup_stat *stat;
> @@ -1508,20 +1503,9 @@ static int mem_cgroup_move_account(struct page_cgroup *pc,
>
> VM_BUG_ON(from == to);
> VM_BUG_ON(PageLRU(pc->page));
> -
> - nid = page_cgroup_nid(pc);
> - zid = page_cgroup_zid(pc);
> - from_mz = mem_cgroup_zoneinfo(from, nid, zid);
> - to_mz = mem_cgroup_zoneinfo(to, nid, zid);
> -
> - if (!trylock_page_cgroup(pc))
> - return ret;
> -
> - if (!PageCgroupUsed(pc))
> - goto out;
> -
> - if (pc->mem_cgroup != from)
> - goto out;
> + VM_BUG_ON(!PageCgroupLocked(pc));
> + VM_BUG_ON(!PageCgroupUsed(pc));
> + VM_BUG_ON(pc->mem_cgroup != from);
>
> if (!mem_cgroup_is_root(from))
> res_counter_uncharge(&from->res, PAGE_SIZE, NULL);
> @@ -1550,16 +1534,12 @@ static int mem_cgroup_move_account(struct page_cgroup *pc,
> css_get(&to->css);
> pc->mem_cgroup = to;
> mem_cgroup_charge_statistics(to, pc, true);
> - ret = 0;
> -out:
> - unlock_page_cgroup(pc);
> /*
> * We charges against "to" which may not have any tasks. Then, "to"
> * can be under rmdir(). But in current implementation, caller of
> * this function is just force_empty() and it's garanteed that
> * "to" is never removed. So, we don't check rmdir status here.
> */
> - return ret;
> }
>
> /*
> @@ -1580,38 +1560,39 @@ static int mem_cgroup_move_parent(struct page_cgroup *pc,
> if (!pcg)
> return -EINVAL;
>
> + ret = -EBUSY;
> + if (!get_page_unless_zero(page))
> + goto out;
> + if (isolate_lru_page(page))
> + goto put;
>
> - parent = mem_cgroup_from_cont(pcg);
> -
> + ret = -EINVAL;
> + lock_page_cgroup(pc);
> + if (!PageCgroupUsed(pc) || pc->mem_cgroup != child) /* early check */
> + goto unlock;
> + unlock_page_cgroup(pc);
>
> + parent = mem_cgroup_from_cont(pcg);
> ret = __mem_cgroup_try_charge(NULL, gfp_mask, &parent, false, page);
> if (ret || !parent)
> - return ret;
> -
> - if (!get_page_unless_zero(page)) {
> - ret = -EBUSY;
> - goto uncharge;
> - }
> -
> - ret = isolate_lru_page(page);
> -
> - if (ret)
> - goto cancel;
> + goto put_back;
>
> - ret = mem_cgroup_move_account(pc, child, parent);
> -
> - putback_lru_page(page);
> - if (!ret) {
> - put_page(page);
> + lock_page_cgroup(pc);
> + if (likely(PageCgroupUsed(pc) && pc->mem_cgroup == child)) {
> + mem_cgroup_move_account(pc, child, parent);
> /* drop extra refcnt by try_charge() */
> css_put(&parent->css);
> - return 0;
> + } else {
> + ret = -EINVAL;
> + mem_cgroup_cancel_charge(parent); /* does css_put */
> }
> -
> -cancel:
> +unlock:
> + unlock_page_cgroup(pc);
> +put_back:
> + putback_lru_page(page);
> +put:
> put_page(page);
> -uncharge:
> - mem_cgroup_cancel_charge(parent);
> +out:
> return ret;
> }
>
>
* [PATCH 3/8] cgroup: introduce cancel_attach()
2009-09-17 2:23 [RFC][EXPERIMENTAL][PATCH 0/8] memcg: migrate charge at task move Daisuke Nishimura
2009-09-17 2:24 ` [PATCH 1/8] memcg: introduce mem_cgroup_cancel_charge() Daisuke Nishimura
2009-09-17 2:24 ` [PATCH 2/8] memcg: cleanup mem_cgroup_move_parent() Daisuke Nishimura
@ 2009-09-17 2:25 ` Daisuke Nishimura
2009-09-17 2:26 ` [PATCH 4/8] memcg: add interface to migrate charge Daisuke Nishimura
` (5 subsequent siblings)
8 siblings, 0 replies; 49+ messages in thread
From: Daisuke Nishimura @ 2009-09-17 2:25 UTC (permalink / raw)
To: linux-mm
Cc: Balbir Singh, KAMEZAWA Hiroyuki, Paul Menage, Li Zefan,
Daisuke Nishimura
This patch adds a cancel_attach() operation to struct cgroup_subsys.
cancel_attach() can be used when the can_attach() operation prepares something
for the subsys that should be discarded if attach_task/proc fails afterwards.
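
As a rough illustration (not part of this patch; the "foo" subsystem and its
reserve/unreserve helpers are made up), a subsystem would pair the two
callbacks like this:

	static int foo_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
				  struct task_struct *tsk, bool threadgroup)
	{
		/* prepare/reserve what the later attach() will need; may fail */
		return foo_reserve(cgrp, tsk);
	}

	static void foo_cancel_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
				      struct task_struct *tsk, bool threadgroup)
	{
		/* called when a later can_attach() or the migration itself fails */
		foo_unreserve(cgrp, tsk);
	}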
Signed-off-by: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
---
include/linux/cgroup.h | 2 ++
kernel/cgroup.c | 37 +++++++++++++++++++++++++++++--------
2 files changed, 31 insertions(+), 8 deletions(-)
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 642a47f..a08edbc 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -429,6 +429,8 @@ struct cgroup_subsys {
void (*destroy)(struct cgroup_subsys *ss, struct cgroup *cgrp);
int (*can_attach)(struct cgroup_subsys *ss, struct cgroup *cgrp,
struct task_struct *tsk, bool threadgroup);
+ void (*cancel_attach)(struct cgroup_subsys *ss, struct cgroup *cgrp,
+ struct task_struct *tsk, bool threadgroup);
void (*attach)(struct cgroup_subsys *ss, struct cgroup *cgrp,
struct cgroup *old_cgrp, struct task_struct *tsk,
bool threadgroup);
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 7da6004..f27f28f 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -1700,7 +1700,7 @@ void threadgroup_fork_unlock(struct sighand_struct *sighand)
int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
{
int retval;
- struct cgroup_subsys *ss;
+ struct cgroup_subsys *ss, *fail = NULL;
struct cgroup *oldcgrp;
struct cgroupfs_root *root = cgrp->root;
@@ -1712,15 +1712,18 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
for_each_subsys(root, ss) {
if (ss->can_attach) {
retval = ss->can_attach(ss, cgrp, tsk, false);
- if (retval)
- return retval;
+ if (retval) {
+ fail = ss;
+ goto out;
+ }
}
}
retval = cgroup_task_migrate(cgrp, oldcgrp, tsk, 0);
if (retval)
- return retval;
+ goto out;
+ retval = 0;
for_each_subsys(root, ss) {
if (ss->attach)
ss->attach(ss, cgrp, oldcgrp, tsk, false);
@@ -1733,7 +1736,15 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
* is no longer empty.
*/
cgroup_wakeup_rmdir_waiter(cgrp);
- return 0;
+out:
+ if (retval)
+ for_each_subsys(root, ss) {
+ if (ss == fail)
+ break;
+ if (ss->cancel_attach)
+ ss->cancel_attach(ss, cgrp, tsk, false);
+ }
+ return retval;
}
/*
@@ -1813,7 +1824,7 @@ static int css_set_prefetch(struct cgroup *cgrp, struct css_set *cg,
int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
{
int retval;
- struct cgroup_subsys *ss;
+ struct cgroup_subsys *ss, *fail = NULL;
struct cgroup *oldcgrp;
struct css_set *oldcg;
struct cgroupfs_root *root = cgrp->root;
@@ -1839,8 +1850,10 @@ int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
for_each_subsys(root, ss) {
if (ss->can_attach) {
retval = ss->can_attach(ss, cgrp, leader, true);
- if (retval)
- return retval;
+ if (retval) {
+ fail = ss;
+ goto out;
+ }
}
}
@@ -1978,6 +1991,14 @@ list_teardown:
put_css_set(cg_entry->cg);
kfree(cg_entry);
}
+out:
+ if (retval)
+ for_each_subsys(root, ss) {
+ if (ss == fail)
+ break;
+ if (ss->cancel_attach)
+ ss->cancel_attach(ss, cgrp, tsk, true);
+ }
/* done! */
return retval;
}
* [PATCH 4/8] memcg: add interface to migrate charge
2009-09-17 2:23 [RFC][EXPERIMENTAL][PATCH 0/8] memcg: migrate charge at task move Daisuke Nishimura
` (2 preceding siblings ...)
2009-09-17 2:25 ` [PATCH 3/8] cgroup: introduce cancel_attach() Daisuke Nishimura
@ 2009-09-17 2:26 ` Daisuke Nishimura
2009-09-17 4:20 ` KAMEZAWA Hiroyuki
2009-09-17 2:26 ` [PATCH 5/8] memcg: migrate charge of anon Daisuke Nishimura
` (4 subsequent siblings)
8 siblings, 1 reply; 49+ messages in thread
From: Daisuke Nishimura @ 2009-09-17 2:26 UTC (permalink / raw)
To: linux-mm
Cc: Balbir Singh, KAMEZAWA Hiroyuki, Paul Menage, Li Zefan,
Daisuke Nishimura
This patch adds a "memory.migrate_charge" file and its handlers.
Signed-off-by: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
---
mm/memcontrol.c | 69 +++++++++++++++++++++++++++++++++++++++++++++++++++---
1 files changed, 65 insertions(+), 4 deletions(-)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 6d77c80..6466e3c 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -225,6 +225,8 @@ struct mem_cgroup {
/* set when res.limit == memsw.limit */
bool memsw_is_minimum;
+ unsigned int migrate_charge;
+
/*
* statistics. This must be placed at the end of memcg.
*/
@@ -2826,6 +2828,31 @@ static int mem_cgroup_swappiness_write(struct cgroup *cgrp, struct cftype *cft,
return 0;
}
+enum migrate_charge_type {
+ NR_MIGRATE_CHARGE_TYPE,
+};
+
+static u64 mem_cgroup_migrate_charge_read(struct cgroup *cgrp,
+ struct cftype *cft)
+{
+ return mem_cgroup_from_cont(cgrp)->migrate_charge;
+}
+
+static int mem_cgroup_migrate_charge_write(struct cgroup *cgrp,
+ struct cftype *cft, u64 val)
+{
+ struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp);
+
+ if (val >= (1 << NR_MIGRATE_CHARGE_TYPE))
+ return -EINVAL;
+
+ cgroup_lock();
+ mem->migrate_charge = val;
+ cgroup_unlock();
+
+ return 0;
+}
+
static struct cftype mem_cgroup_files[] = {
{
@@ -2875,6 +2902,11 @@ static struct cftype mem_cgroup_files[] = {
.read_u64 = mem_cgroup_swappiness_read,
.write_u64 = mem_cgroup_swappiness_write,
},
+ {
+ .name = "migrate_charge",
+ .read_u64 = mem_cgroup_migrate_charge_read,
+ .write_u64 = mem_cgroup_migrate_charge_write,
+ },
};
#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
@@ -3115,6 +3147,7 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
if (parent)
mem->swappiness = get_swappiness(parent);
atomic_set(&mem->refcnt, 1);
+ mem->migrate_charge = 0;
return &mem->css;
free_out:
__mem_cgroup_free(mem);
@@ -3151,6 +3184,35 @@ static int mem_cgroup_populate(struct cgroup_subsys *ss,
return ret;
}
+static int mem_cgroup_can_migrate_charge(struct mem_cgroup *mem,
+ struct task_struct *p)
+{
+ return 0;
+}
+
+static int mem_cgroup_can_attach(struct cgroup_subsys *ss,
+ struct cgroup *cont,
+ struct task_struct *p,
+ bool threadgroup)
+{
+ struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
+
+ if (mem->migrate_charge && thread_group_leader(p))
+ return mem_cgroup_can_migrate_charge(mem, p);
+ return 0;
+}
+
+static void mem_cgroup_cancel_attach(struct cgroup_subsys *ss,
+ struct cgroup *cont,
+ struct task_struct *p,
+ bool threadgroup)
+{
+}
+
+static void mem_cgroup_migrate_charge(void)
+{
+}
+
static void mem_cgroup_move_task(struct cgroup_subsys *ss,
struct cgroup *cont,
struct cgroup *old_cont,
@@ -3158,10 +3220,7 @@ static void mem_cgroup_move_task(struct cgroup_subsys *ss,
bool threadgroup)
{
mutex_lock(&memcg_tasklist);
- /*
- * FIXME: It's better to move charges of this process from old
- * memcg to new memcg. But it's just on TODO-List now.
- */
+ mem_cgroup_migrate_charge();
mutex_unlock(&memcg_tasklist);
}
@@ -3172,6 +3231,8 @@ struct cgroup_subsys mem_cgroup_subsys = {
.pre_destroy = mem_cgroup_pre_destroy,
.destroy = mem_cgroup_destroy,
.populate = mem_cgroup_populate,
+ .can_attach = mem_cgroup_can_attach,
+ .cancel_attach = mem_cgroup_cancel_attach,
.attach = mem_cgroup_move_task,
.early_init = 0,
.use_id = 1,
* Re: [PATCH 4/8] memcg: add interface to migrate charge
2009-09-17 2:26 ` [PATCH 4/8] memcg: add interface to migrate charge Daisuke Nishimura
@ 2009-09-17 4:20 ` KAMEZAWA Hiroyuki
2009-09-17 4:40 ` Daisuke Nishimura
0 siblings, 1 reply; 49+ messages in thread
From: KAMEZAWA Hiroyuki @ 2009-09-17 4:20 UTC (permalink / raw)
To: Daisuke Nishimura; +Cc: linux-mm, Balbir Singh, Paul Menage, Li Zefan
On Thu, 17 Sep 2009 11:26:02 +0900
Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp> wrote:
> This patch adds "memory.migrate_charge" file and handlers of it.
>
> Signed-off-by: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
> ---
> mm/memcontrol.c | 69 +++++++++++++++++++++++++++++++++++++++++++++++++++---
> 1 files changed, 65 insertions(+), 4 deletions(-)
>
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index 6d77c80..6466e3c 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -225,6 +225,8 @@ struct mem_cgroup {
> /* set when res.limit == memsw.limit */
> bool memsw_is_minimum;
>
> + unsigned int migrate_charge;
> +
> /*
> * statistics. This must be placed at the end of memcg.
> */
> @@ -2826,6 +2828,31 @@ static int mem_cgroup_swappiness_write(struct cgroup *cgrp, struct cftype *cft,
> return 0;
> }
>
> +enum migrate_charge_type {
> + NR_MIGRATE_CHARGE_TYPE,
> +};
> +
To be honest, I don't like this MIGRATE_CHARGE_TYPE.
Why does this need to be more complicated than a simple true/false here?
Is there much variation in the use cases?
Thanks,
-Kame
> +static u64 mem_cgroup_migrate_charge_read(struct cgroup *cgrp,
> + struct cftype *cft)
> +{
> + return mem_cgroup_from_cont(cgrp)->migrate_charge;
> +}
> +
> +static int mem_cgroup_migrate_charge_write(struct cgroup *cgrp,
> + struct cftype *cft, u64 val)
> +{
> + struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp);
> +
> + if (val >= (1 << NR_MIGRATE_CHARGE_TYPE))
> + return -EINVAL;
> +
> + cgroup_lock();
> + mem->migrate_charge = val;
> + cgroup_unlock();
> +
> + return 0;
> +}
> +
>
> static struct cftype mem_cgroup_files[] = {
> {
> @@ -2875,6 +2902,11 @@ static struct cftype mem_cgroup_files[] = {
> .read_u64 = mem_cgroup_swappiness_read,
> .write_u64 = mem_cgroup_swappiness_write,
> },
> + {
> + .name = "migrate_charge",
> + .read_u64 = mem_cgroup_migrate_charge_read,
> + .write_u64 = mem_cgroup_migrate_charge_write,
> + },
> };
>
> #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
> @@ -3115,6 +3147,7 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
> if (parent)
> mem->swappiness = get_swappiness(parent);
> atomic_set(&mem->refcnt, 1);
> + mem->migrate_charge = 0;
> return &mem->css;
> free_out:
> __mem_cgroup_free(mem);
> @@ -3151,6 +3184,35 @@ static int mem_cgroup_populate(struct cgroup_subsys *ss,
> return ret;
> }
>
> +static int mem_cgroup_can_migrate_charge(struct mem_cgroup *mem,
> + struct task_struct *p)
> +{
> + return 0;
> +}
> +
> +static int mem_cgroup_can_attach(struct cgroup_subsys *ss,
> + struct cgroup *cont,
> + struct task_struct *p,
> + bool threadgroup)
> +{
> + struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
> +
> + if (mem->migrate_charge && thread_group_leader(p))
> + return mem_cgroup_can_migrate_charge(mem, p);
> + return 0;
> +}
> +
> +static void mem_cgroup_cancel_attach(struct cgroup_subsys *ss,
> + struct cgroup *cont,
> + struct task_struct *p,
> + bool threadgroup)
> +{
> +}
> +
> +static void mem_cgroup_migrate_charge(void)
> +{
> +}
> +
> static void mem_cgroup_move_task(struct cgroup_subsys *ss,
> struct cgroup *cont,
> struct cgroup *old_cont,
> @@ -3158,10 +3220,7 @@ static void mem_cgroup_move_task(struct cgroup_subsys *ss,
> bool threadgroup)
> {
> mutex_lock(&memcg_tasklist);
> - /*
> - * FIXME: It's better to move charges of this process from old
> - * memcg to new memcg. But it's just on TODO-List now.
> - */
> + mem_cgroup_migrate_charge();
> mutex_unlock(&memcg_tasklist);
> }
>
> @@ -3172,6 +3231,8 @@ struct cgroup_subsys mem_cgroup_subsys = {
> .pre_destroy = mem_cgroup_pre_destroy,
> .destroy = mem_cgroup_destroy,
> .populate = mem_cgroup_populate,
> + .can_attach = mem_cgroup_can_attach,
> + .cancel_attach = mem_cgroup_cancel_attach,
> .attach = mem_cgroup_move_task,
> .early_init = 0,
> .use_id = 1,
>
* Re: [PATCH 4/8] memcg: add interface to migrate charge
2009-09-17 4:20 ` KAMEZAWA Hiroyuki
@ 2009-09-17 4:40 ` Daisuke Nishimura
0 siblings, 0 replies; 49+ messages in thread
From: Daisuke Nishimura @ 2009-09-17 4:40 UTC (permalink / raw)
To: KAMEZAWA Hiroyuki
Cc: linux-mm, Balbir Singh, Paul Menage, Li Zefan, Daisuke Nishimura
On Thu, 17 Sep 2009 13:20:07 +0900, KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> wrote:
> On Thu, 17 Sep 2009 11:26:02 +0900
> Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp> wrote:
>
> > This patch adds "memory.migrate_charge" file and handlers of it.
> >
> > Signed-off-by: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
> > ---
> > mm/memcontrol.c | 69 +++++++++++++++++++++++++++++++++++++++++++++++++++---
> > 1 files changed, 65 insertions(+), 4 deletions(-)
> >
> > diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> > index 6d77c80..6466e3c 100644
> > --- a/mm/memcontrol.c
> > +++ b/mm/memcontrol.c
> > @@ -225,6 +225,8 @@ struct mem_cgroup {
> > /* set when res.limit == memsw.limit */
> > bool memsw_is_minimum;
> >
> > + unsigned int migrate_charge;
> > +
> > /*
> > * statistics. This must be placed at the end of memcg.
> > */
> > @@ -2826,6 +2828,31 @@ static int mem_cgroup_swappiness_write(struct cgroup *cgrp, struct cftype *cft,
> > return 0;
> > }
> >
> > +enum migrate_charge_type {
> > + NR_MIGRATE_CHARGE_TYPE,
> > +};
> > +
>
> To be honest, I don't like this MIGRATE_CHARGE_TYPE.
> Why is this necessary to be complicated rather than true/false here ?
> Is there much variation of use-case ?
>
hmm, I introduced it just to implement and test this feature step by step.
But thinking about it more, I think you're right. It just complicates the code.
I'll change it to bool. It will make the code simpler.
Thanks,
Daisuke Nishimura.
* [PATCH 5/8] memcg: migrate charge of anon
2009-09-17 2:23 [RFC][EXPERIMENTAL][PATCH 0/8] memcg: migrate charge at task move Daisuke Nishimura
` (3 preceding siblings ...)
2009-09-17 2:26 ` [PATCH 4/8] memcg: add interface to migrate charge Daisuke Nishimura
@ 2009-09-17 2:26 ` Daisuke Nishimura
2009-09-17 4:57 ` KAMEZAWA Hiroyuki
2009-09-17 2:27 ` [PATCH 6/8] memcg: migrate charge of shmem Daisuke Nishimura
` (3 subsequent siblings)
8 siblings, 1 reply; 49+ messages in thread
From: Daisuke Nishimura @ 2009-09-17 2:26 UTC (permalink / raw)
To: linux-mm
Cc: Balbir Singh, KAMEZAWA Hiroyuki, Paul Menage, Li Zefan,
Daisuke Nishimura
This patch is the core part of the charge migration feature.
It adds functions to migrate the charges of a task's anonymous pages.
Implementation:
- define struct migrate_charge and a variable of it (mc) to remember
the target pages and other information.
- At can_attach(), isolate the target pages, call __mem_cgroup_try_charge(),
and move them to mc->list.
- At attach(), call mem_cgroup_move_account() for all pages on mc->list
after the necessary checks under the page_cgroup lock, and put them back to the LRU.
- Cancel the charges of all pages remaining on mc->list on failure or at the end
of charge migration, and put them back to the LRU.
Signed-off-by: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
---
mm/memcontrol.c | 196 ++++++++++++++++++++++++++++++++++++++++++++++++++++++-
1 files changed, 195 insertions(+), 1 deletions(-)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index a6b07f8..3a3f4ac 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -21,6 +21,8 @@
#include <linux/memcontrol.h>
#include <linux/cgroup.h>
#include <linux/mm.h>
+#include <linux/migrate.h>
+#include <linux/hugetlb.h>
#include <linux/pagemap.h>
#include <linux/smp.h>
#include <linux/page-flags.h>
@@ -274,6 +276,18 @@ enum charge_type {
#define MEM_CGROUP_RECLAIM_SOFT_BIT 0x2
#define MEM_CGROUP_RECLAIM_SOFT (1 << MEM_CGROUP_RECLAIM_SOFT_BIT)
+/*
+ * Stuffs for migrating charge at task move.
+ * mc and its members are protected by cgroup_lock
+ */
+struct migrate_charge {
+ struct task_struct *tsk;
+ struct mem_cgroup *from;
+ struct mem_cgroup *to;
+ struct list_head list;
+};
+static struct migrate_charge *mc;
+
static void mem_cgroup_get(struct mem_cgroup *mem);
static void mem_cgroup_put(struct mem_cgroup *mem);
static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem);
@@ -2829,6 +2843,7 @@ static int mem_cgroup_swappiness_write(struct cgroup *cgrp, struct cftype *cft,
}
enum migrate_charge_type {
+ MIGRATE_CHARGE_ANON,
NR_MIGRATE_CHARGE_TYPE,
};
@@ -3184,10 +3199,164 @@ static int mem_cgroup_populate(struct cgroup_subsys *ss,
return ret;
}
+static int migrate_charge_prepare_pte_range(pmd_t *pmd,
+ unsigned long addr, unsigned long end,
+ struct mm_walk *walk)
+{
+ int ret = 0;
+ struct page *page, *tmp;
+ LIST_HEAD(list);
+ struct vm_area_struct *vma = walk->private;
+ pte_t *pte, ptent;
+ spinlock_t *ptl;
+ bool move_anon = (mc->to->migrate_charge & (1 << MIGRATE_CHARGE_ANON));
+
+ lru_add_drain_all();
+ pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
+ for (; addr != end; pte++, addr += PAGE_SIZE) {
+ struct page_cgroup *pc;
+
+ ptent = *pte;
+ if (!pte_present(ptent))
+ continue;
+
+ page = vm_normal_page(vma, addr, ptent);
+ if (!page)
+ continue;
+
+ if (PageAnon(page) && move_anon)
+ ;
+ else
+ continue;
+
+ pc = lookup_page_cgroup(page);
+ lock_page_cgroup(pc);
+ if (!PageCgroupUsed(pc) || pc->mem_cgroup != mc->from) {
+ unlock_page_cgroup(pc);
+ continue;
+ }
+ unlock_page_cgroup(pc);
+
+ if (!get_page_unless_zero(page))
+ continue;
+
+ if (!isolate_lru_page(page))
+ list_add_tail(&page->lru, &list);
+ else
+ put_page(page);
+ }
+ pte_unmap_unlock(pte - 1, ptl);
+ cond_resched();
+
+ if (!list_empty(&list))
+ list_for_each_entry_safe(page, tmp, &list, lru) {
+ struct mem_cgroup *mem = mc->to;
+ ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, &mem,
+ false, page);
+ if (ret || !mem)
+ break;
+ list_move_tail(&page->lru, &mc->list);
+ cond_resched();
+ }
+
+ /*
+ * We should put back all pages which remain on "list".
+ * This means try_charge above has failed.
+ * Pages which have been moved to mc->list would be put back at
+ * clear_migrate_charge.
+ */
+ if (!list_empty(&list))
+ list_for_each_entry_safe(page, tmp, &list, lru) {
+ list_del(&page->lru);
+ putback_lru_page(page);
+ put_page(page);
+ }
+
+ return ret;
+}
+
+static int migrate_charge_prepare(void)
+{
+ int ret = 0;
+ struct mm_struct *mm;
+ struct vm_area_struct *vma;
+
+ mm = get_task_mm(mc->tsk);
+ if (!mm)
+ return 0;
+
+ down_read(&mm->mmap_sem);
+ for (vma = mm->mmap; vma; vma = vma->vm_next) {
+ struct mm_walk migrate_charge_walk = {
+ .pmd_entry = migrate_charge_prepare_pte_range,
+ .mm = mm,
+ .private = vma,
+ };
+ if (signal_pending(current)) {
+ ret = -EINTR;
+ break;
+ }
+ if (is_vm_hugetlb_page(vma))
+ continue;
+ /* We migrate charge of private pages for now */
+ if (vma->vm_flags & (VM_SHARED | VM_MAYSHARE))
+ continue;
+ if (mc->to->migrate_charge) {
+ ret = walk_page_range(vma->vm_start, vma->vm_end,
+ &migrate_charge_walk);
+ if (ret)
+ break;
+ }
+ }
+ up_read(&mm->mmap_sem);
+
+ mmput(mm);
+ return ret;
+}
+
+static void mem_cgroup_clear_migrate_charge(void)
+{
+ struct page *page, *tmp;
+
+ VM_BUG_ON(!mc);
+
+ if (!list_empty(&mc->list))
+ list_for_each_entry_safe(page, tmp, &mc->list, lru) {
+ mem_cgroup_cancel_charge(mc->to);
+ list_del(&page->lru);
+ putback_lru_page(page);
+ put_page(page);
+ }
+
+ kfree(mc);
+ mc = NULL;
+}
+
static int mem_cgroup_can_migrate_charge(struct mem_cgroup *mem,
struct task_struct *p)
{
- return 0;
+ int ret;
+ struct mem_cgroup *from = mem_cgroup_from_task(p);
+
+ VM_BUG_ON(mc);
+
+ if (from == mem)
+ return 0;
+
+ mc = kmalloc(sizeof(struct migrate_charge), GFP_KERNEL);
+ if (!mc)
+ return -ENOMEM;
+
+ mc->tsk = p;
+ mc->from = from;
+ mc->to = mem;
+ INIT_LIST_HEAD(&mc->list);
+
+ ret = migrate_charge_prepare();
+
+ if (ret)
+ mem_cgroup_clear_migrate_charge();
+ return ret;
}
static int mem_cgroup_can_attach(struct cgroup_subsys *ss,
@@ -3207,10 +3376,35 @@ static void mem_cgroup_cancel_attach(struct cgroup_subsys *ss,
struct task_struct *p,
bool threadgroup)
{
+ if (mc)
+ mem_cgroup_clear_migrate_charge();
}
static void mem_cgroup_migrate_charge(void)
{
+ struct page *page, *tmp;
+ struct page_cgroup *pc;
+
+ if (!mc)
+ return;
+
+ if (!list_empty(&mc->list))
+ list_for_each_entry_safe(page, tmp, &mc->list, lru) {
+ pc = lookup_page_cgroup(page);
+ lock_page_cgroup(pc);
+ if (PageCgroupUsed(pc) && pc->mem_cgroup == mc->from) {
+ mem_cgroup_move_account(pc, mc->from, mc->to);
+ /* drop extra refcnt by try_charge() */
+ css_put(&mc->to->css);
+ list_del(&page->lru);
+ putback_lru_page(page);
+ put_page(page);
+ }
+ unlock_page_cgroup(pc);
+ cond_resched();
+ }
+
+ mem_cgroup_clear_migrate_charge();
}
static void mem_cgroup_move_task(struct cgroup_subsys *ss,
* Re: [PATCH 5/8] memcg: migrate charge of anon
2009-09-17 2:26 ` [PATCH 5/8] memcg: migrate charge of anon Daisuke Nishimura
@ 2009-09-17 4:57 ` KAMEZAWA Hiroyuki
2009-09-17 5:56 ` Daisuke Nishimura
2009-09-17 23:52 ` KOSAKI Motohiro
0 siblings, 2 replies; 49+ messages in thread
From: KAMEZAWA Hiroyuki @ 2009-09-17 4:57 UTC (permalink / raw)
To: Daisuke Nishimura; +Cc: linux-mm, Balbir Singh, Paul Menage, Li Zefan
On Thu, 17 Sep 2009 11:26:56 +0900
Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp> wrote:
> This patch is the core part of this charge migration feature.
> It adds functions to migrate charge of anonymous pages of the task.
>
> Implementation:
> > - define struct migrate_charge and a variable of it (mc) to remember
> the target pages and other information.
> - At can_attach(), isolate the target pages, call __mem_cgroup_try_charge(),
> and move them to mc->list.
> - Call mem_cgroup_move_account() at attach() about all pages on mc->list
> after necessary checks under page_cgroup lock, and put back them to LRU.
> - Cancel charges about all pages remains on mc->list on failure or at the end
> of charge migration, and put back them to LRU.
>
>
> Signed-off-by: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
> ---
> mm/memcontrol.c | 196 ++++++++++++++++++++++++++++++++++++++++++++++++++++++-
> 1 files changed, 195 insertions(+), 1 deletions(-)
>
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index a6b07f8..3a3f4ac 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -21,6 +21,8 @@
> #include <linux/memcontrol.h>
> #include <linux/cgroup.h>
> #include <linux/mm.h>
> +#include <linux/migrate.h>
> +#include <linux/hugetlb.h>
> #include <linux/pagemap.h>
> #include <linux/smp.h>
> #include <linux/page-flags.h>
> @@ -274,6 +276,18 @@ enum charge_type {
> #define MEM_CGROUP_RECLAIM_SOFT_BIT 0x2
> #define MEM_CGROUP_RECLAIM_SOFT (1 << MEM_CGROUP_RECLAIM_SOFT_BIT)
>
> +/*
> + * Stuffs for migrating charge at task move.
> + * mc and its members are protected by cgroup_lock
> + */
> +struct migrate_charge {
> + struct task_struct *tsk;
> + struct mem_cgroup *from;
> + struct mem_cgroup *to;
> + struct list_head list;
> +};
> +static struct migrate_charge *mc;
> +
> static void mem_cgroup_get(struct mem_cgroup *mem);
> static void mem_cgroup_put(struct mem_cgroup *mem);
> static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem);
> @@ -2829,6 +2843,7 @@ static int mem_cgroup_swappiness_write(struct cgroup *cgrp, struct cftype *cft,
> }
>
> enum migrate_charge_type {
> + MIGRATE_CHARGE_ANON,
> NR_MIGRATE_CHARGE_TYPE,
> };
>
> @@ -3184,10 +3199,164 @@ static int mem_cgroup_populate(struct cgroup_subsys *ss,
> return ret;
> }
>
> +static int migrate_charge_prepare_pte_range(pmd_t *pmd,
> + unsigned long addr, unsigned long end,
> + struct mm_walk *walk)
> +{
> + int ret = 0;
> + struct page *page, *tmp;
> + LIST_HEAD(list);
> + struct vm_area_struct *vma = walk->private;
> + pte_t *pte, ptent;
> + spinlock_t *ptl;
> + bool move_anon = (mc->to->migrate_charge & (1 << MIGRATE_CHARGE_ANON));
> +
> + lru_add_drain_all();
plz call lru_add_drain_all() before taking mmap_sem.
It waits for the workqueue in a synchronous manner.
(I think KOSAKI-san is working on a better pagevec drain function.)
> + pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
> + for (; addr != end; pte++, addr += PAGE_SIZE) {
> + struct page_cgroup *pc;
> +
> + ptent = *pte;
> + if (!pte_present(ptent))
> + continue;
> +
> + page = vm_normal_page(vma, addr, ptent);
> + if (!page)
> + continue;
plz check
if (!page || !page_mapped(page))
continue;
> +
> + if (PageAnon(page) && move_anon)
> + ;
> + else
> + continue;
Bad if ;)
> +
> + pc = lookup_page_cgroup(page);
> + lock_page_cgroup(pc);
> + if (!PageCgroupUsed(pc) || pc->mem_cgroup != mc->from) {
> + unlock_page_cgroup(pc);
> + continue;
> + }
> + unlock_page_cgroup(pc);
> +
> + if (!get_page_unless_zero(page))
> + continue;
> +
> + if (!isolate_lru_page(page))
> + list_add_tail(&page->lru, &list);
> + else
> + put_page(page);
> + }
> + pte_unmap_unlock(pte - 1, ptl);
> + cond_resched();
> +
> + if (!list_empty(&list))
> + list_for_each_entry_safe(page, tmp, &list, lru) {
> + struct mem_cgroup *mem = mc->to;
> + ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, &mem,
> + false, page);
> + if (ret || !mem)
> + break;
> + list_move_tail(&page->lru, &mc->list);
> + cond_resched();
need cond_resched() here ?
> + }
> +
> + /*
> + * We should put back all pages which remain on "list".
> + * This means try_charge above has failed.
> + * Pages which have been moved to mc->list would be put back at
> + * clear_migrate_charge.
> + */
> + if (!list_empty(&list))
> + list_for_each_entry_safe(page, tmp, &list, lru) {
> + list_del(&page->lru);
> + putback_lru_page(page);
> + put_page(page);
I wonder whether this put_page() is really necessary.
> + }
> +
> + return ret;
> +}
> +
> +static int migrate_charge_prepare(void)
> +{
> + int ret = 0;
> + struct mm_struct *mm;
> + struct vm_area_struct *vma;
> +
> + mm = get_task_mm(mc->tsk);
> + if (!mm)
> + return 0;
> +
> + down_read(&mm->mmap_sem);
> + for (vma = mm->mmap; vma; vma = vma->vm_next) {
> + struct mm_walk migrate_charge_walk = {
> + .pmd_entry = migrate_charge_prepare_pte_range,
> + .mm = mm,
> + .private = vma,
> + };
> + if (signal_pending(current)) {
> + ret = -EINTR;
> + break;
> + }
> + if (is_vm_hugetlb_page(vma))
> + continue;
> + /* We migrate charge of private pages for now */
> + if (vma->vm_flags & (VM_SHARED | VM_MAYSHARE))
> + continue;
> + if (mc->to->migrate_charge) {
> + ret = walk_page_range(vma->vm_start, vma->vm_end,
> + &migrate_charge_walk);
> + if (ret)
> + break;
> + }
> + }
> + up_read(&mm->mmap_sem);
> +
> + mmput(mm);
Hmm, does this mean a thread which is being moved can continue its work and
newly allocated pages will remain in the old group?
> + return ret;
> +}
> +
> +static void mem_cgroup_clear_migrate_charge(void)
> +{
> + struct page *page, *tmp;
> +
> + VM_BUG_ON(!mc);
> +
> + if (!list_empty(&mc->list))
I think list_for_each_entry_safe() handles the empty case.
> + list_for_each_entry_safe(page, tmp, &mc->list, lru) {
> + mem_cgroup_cancel_charge(mc->to);
> + list_del(&page->lru);
> + putback_lru_page(page);
> + put_page(page);
> + }
> +
> + kfree(mc);
> + mc = NULL;
> +}
> +
> static int mem_cgroup_can_migrate_charge(struct mem_cgroup *mem,
> struct task_struct *p)
> {
> - return 0;
> + int ret;
> + struct mem_cgroup *from = mem_cgroup_from_task(p);
> +
> + VM_BUG_ON(mc);
> +
> + if (from == mem)
> + return 0;
> +
> + mc = kmalloc(sizeof(struct migrate_charge), GFP_KERNEL);
> + if (!mc)
> + return -ENOMEM;
> +
> + mc->tsk = p;
> + mc->from = from;
> + mc->to = mem;
> + INIT_LIST_HEAD(&mc->list);
> +
> + ret = migrate_charge_prepare();
> +
> + if (ret)
> + mem_cgroup_clear_migrate_charge();
> + return ret;
> }
>
> static int mem_cgroup_can_attach(struct cgroup_subsys *ss,
> @@ -3207,10 +3376,35 @@ static void mem_cgroup_cancel_attach(struct cgroup_subsys *ss,
> struct task_struct *p,
> bool threadgroup)
> {
> + if (mc)
> + mem_cgroup_clear_migrate_charge();
> }
>
> static void mem_cgroup_migrate_charge(void)
> {
> + struct page *page, *tmp;
> + struct page_cgroup *pc;
> +
> + if (!mc)
> + return;
> +
> + if (!list_empty(&mc->list))
> + list_for_each_entry_safe(page, tmp, &mc->list, lru) {
> + pc = lookup_page_cgroup(page);
> + lock_page_cgroup(pc);
> + if (PageCgroupUsed(pc) && pc->mem_cgroup == mc->from) {
> + mem_cgroup_move_account(pc, mc->from, mc->to);
> + /* drop extra refcnt by try_charge() */
> + css_put(&mc->to->css);
> + list_del(&page->lru);
> + putback_lru_page(page);
> + put_page(page);
> + }
> + unlock_page_cgroup(pc);
> + cond_resched();
> + }
> +
> + mem_cgroup_clear_migrate_charge();
> }
>
Okay, then, if another subsystem fails "can_attach()", migrated charges are not
moved back to the original group. Right?
My biggest concern with this implementation is that it isolates too many pages.
Could you modify the whole routine as follows...
struct migrate_charge {
struct task_struct *tsk;
struct mem_cgroup *from;
struct mem_cgroup *to;
struct list_head list;
long charged;
long committed;
};
- mem_cgroup_can_migrate_charge() ....
  count pages and do the "charge", with no page isolation,
  and remember the number of charges as mc->charged++;
- mem_cgroup_migrate_charge()
  - scan the vmas/page tables again, and isolate pages at a fine grain
    (256 pages per scan, or some other small number)
  - migrate pages; on success, mc->committed++
After the move, uncharge (mc->charged - mc->committed).
Maybe you can find a better one. But isolating all pages of a process at once is a
big hammer for the VM and can easily lead to OOM.
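
Something like the following, as a very rough sketch (the per-page helpers
below are only illustrative, not a concrete proposal; the memcg calls are
the ones already used in this series):

	/* pass 1: at can_attach(), only charge, don't isolate anything */
	static int migrate_charge_precharge_one(struct page *page)
	{
		struct mem_cgroup *mem = mc->to;
		int ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, &mem,
						  false, page);
		if (!ret && mem)
			mc->charged++;
		return ret;
	}

	/* pass 2: at attach(), isolate and move pages in small batches */
	static void migrate_charge_move_one(struct page *page)
	{
		struct page_cgroup *pc = lookup_page_cgroup(page);

		lock_page_cgroup(pc);
		if (PageCgroupUsed(pc) && pc->mem_cgroup == mc->from) {
			mem_cgroup_move_account(pc, mc->from, mc->to);
			/* drop the extra refcnt taken by try_charge() */
			css_put(&mc->to->css);
			mc->committed++;
		}
		unlock_page_cgroup(pc);
	}

	/* finally: cancel the charges that were never committed */
	static void migrate_charge_cancel_leftover(void)
	{
		while (mc->charged > mc->committed) {
			mem_cgroup_cancel_charge(mc->to);
			mc->charged--;
		}
	}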
Thanks,
-Kame
> static void mem_cgroup_move_task(struct cgroup_subsys *ss,
>
* Re: [PATCH 5/8] memcg: migrate charge of anon
2009-09-17 4:57 ` KAMEZAWA Hiroyuki
@ 2009-09-17 5:56 ` Daisuke Nishimura
2009-09-17 6:25 ` KAMEZAWA Hiroyuki
2009-09-17 23:52 ` KOSAKI Motohiro
1 sibling, 1 reply; 49+ messages in thread
From: Daisuke Nishimura @ 2009-09-17 5:56 UTC (permalink / raw)
To: KAMEZAWA Hiroyuki
Cc: linux-mm, Balbir Singh, Paul Menage, Li Zefan, Daisuke Nishimura
On Thu, 17 Sep 2009 13:57:37 +0900, KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> wrote:
> On Thu, 17 Sep 2009 11:26:56 +0900
> Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp> wrote:
>
> > This patch is the core part of this charge migration feature.
> > It adds functions to migrate charge of anonymous pages of the task.
> >
> > Implementation:
> > > - define struct migrate_charge and a variable of it (mc) to remember
> > the target pages and other information.
> > - At can_attach(), isolate the target pages, call __mem_cgroup_try_charge(),
> > and move them to mc->list.
> > - Call mem_cgroup_move_account() at attach() about all pages on mc->list
> > after necessary checks under page_cgroup lock, and put back them to LRU.
> > - Cancel charges about all pages remains on mc->list on failure or at the end
> > of charge migration, and put back them to LRU.
> >
> >
> > Signed-off-by: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
>
> > ---
> > mm/memcontrol.c | 196 ++++++++++++++++++++++++++++++++++++++++++++++++++++++-
> > 1 files changed, 195 insertions(+), 1 deletions(-)
> >
> > diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> > index a6b07f8..3a3f4ac 100644
> > --- a/mm/memcontrol.c
> > +++ b/mm/memcontrol.c
> > @@ -21,6 +21,8 @@
> > #include <linux/memcontrol.h>
> > #include <linux/cgroup.h>
> > #include <linux/mm.h>
> > +#include <linux/migrate.h>
> > +#include <linux/hugetlb.h>
> > #include <linux/pagemap.h>
> > #include <linux/smp.h>
> > #include <linux/page-flags.h>
> > @@ -274,6 +276,18 @@ enum charge_type {
> > #define MEM_CGROUP_RECLAIM_SOFT_BIT 0x2
> > #define MEM_CGROUP_RECLAIM_SOFT (1 << MEM_CGROUP_RECLAIM_SOFT_BIT)
> >
> > +/*
> > + * Stuffs for migrating charge at task move.
> > + * mc and its members are protected by cgroup_lock
> > + */
> > +struct migrate_charge {
> > + struct task_struct *tsk;
> > + struct mem_cgroup *from;
> > + struct mem_cgroup *to;
> > + struct list_head list;
> > +};
> > +static struct migrate_charge *mc;
> > +
> > static void mem_cgroup_get(struct mem_cgroup *mem);
> > static void mem_cgroup_put(struct mem_cgroup *mem);
> > static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem);
> > @@ -2829,6 +2843,7 @@ static int mem_cgroup_swappiness_write(struct cgroup *cgrp, struct cftype *cft,
> > }
> >
> > enum migrate_charge_type {
> > + MIGRATE_CHARGE_ANON,
> > NR_MIGRATE_CHARGE_TYPE,
> > };
> >
> > @@ -3184,10 +3199,164 @@ static int mem_cgroup_populate(struct cgroup_subsys *ss,
> > return ret;
> > }
> >
> > +static int migrate_charge_prepare_pte_range(pmd_t *pmd,
> > + unsigned long addr, unsigned long end,
> > + struct mm_walk *walk)
> > +{
> > + int ret = 0;
> > + struct page *page, *tmp;
> > + LIST_HEAD(list);
> > + struct vm_area_struct *vma = walk->private;
> > + pte_t *pte, ptent;
> > + spinlock_t *ptl;
> > + bool move_anon = (mc->to->migrate_charge & (1 << MIGRATE_CHARGE_ANON));
> > +
> > + lru_add_drain_all();
>
> plz call lru_add_drain_all() before taking mmap_sem().
> This waits for workqueue in synchronous manner.
> (I think KOSAKI-san is working for better pagevec drain function.)
>
O.K.
I'll move it.
>
> > + pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
> > + for (; addr != end; pte++, addr += PAGE_SIZE) {
> > + struct page_cgroup *pc;
> > +
> > + ptent = *pte;
> > + if (!pte_present(ptent))
> > + continue;
> > +
> > + page = vm_normal_page(vma, addr, ptent);
> > + if (!page)
> > + continue;
>
> plz check
> if (!page || !page_mapped(page))
> continue;
>
will fix.
> > +
> > + if (PageAnon(page) && move_anon)
> > + ;
> > + else
> > + continue;
> Bad if ;)
>
>
O.K.
This check isn't needed if we migrate charge including shmem/tmpfs and file cache.
> > +
> > + pc = lookup_page_cgroup(page);
> > + lock_page_cgroup(pc);
> > + if (!PageCgroupUsed(pc) || pc->mem_cgroup != mc->from) {
> > + unlock_page_cgroup(pc);
> > + continue;
> > + }
> > + unlock_page_cgroup(pc);
>
>
> > +
> > + if (!get_page_unless_zero(page))
> > + continue;
> > +
> > + if (!isolate_lru_page(page))
> > + list_add_tail(&page->lru, &list);
> > + else
> > + put_page(page);
> > + }
> > + pte_unmap_unlock(pte - 1, ptl);
> > + cond_resched();
> > +
> > + if (!list_empty(&list))
> > + list_for_each_entry_safe(page, tmp, &list, lru) {
> > + struct mem_cgroup *mem = mc->to;
> > + ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, &mem,
> > + false, page);
> > + if (ret || !mem)
> > + break;
> > + list_move_tail(&page->lru, &mc->list);
> > + cond_resched();
>
> need cond_resched() here ?
>
Ah, __mem_cgroup_try_charge itself can sleep if needed.
I'll remove it.
> > + }
> > +
> > + /*
> > + * We should put back all pages which remain on "list".
> > + * This means try_charge above has failed.
> > + * Pages which have been moved to mc->list would be put back at
> > + * clear_migrate_charge.
> > + */
> > + if (!list_empty(&list))
> > + list_for_each_entry_safe(page, tmp, &list, lru) {
> > + list_del(&page->lru);
> > + putback_lru_page(page);
> > + put_page(page);
>
> I wonder this put_page() is not necessary.
>
get_page_unless_zero(page) is called above.
putback_lru_page() will only decrease the refcount got by isolate_lru_page().
> > + }
> > +
> > + return ret;
> > +}
> > +
> > +static int migrate_charge_prepare(void)
> > +{
> > + int ret = 0;
> > + struct mm_struct *mm;
> > + struct vm_area_struct *vma;
> > +
> > + mm = get_task_mm(mc->tsk);
> > + if (!mm)
> > + return 0;
> > +
> > + down_read(&mm->mmap_sem);
> > + for (vma = mm->mmap; vma; vma = vma->vm_next) {
> > + struct mm_walk migrate_charge_walk = {
> > + .pmd_entry = migrate_charge_prepare_pte_range,
> > + .mm = mm,
> > + .private = vma,
> > + };
> > + if (signal_pending(current)) {
> > + ret = -EINTR;
> > + break;
> > + }
> > + if (is_vm_hugetlb_page(vma))
> > + continue;
> > + /* We migrate charge of private pages for now */
> > + if (vma->vm_flags & (VM_SHARED | VM_MAYSHARE))
> > + continue;
> > + if (mc->to->migrate_charge) {
> > + ret = walk_page_range(vma->vm_start, vma->vm_end,
> > + &migrate_charge_walk);
> > + if (ret)
> > + break;
> > + }
> > + }
> > + up_read(&mm->mmap_sem);
> > +
> > + mmput(mm);
>
> Hmm, Does this means a thread which is moved can continue its work and
> newly allocated pages will remain in old group ?
>
Yes.
Pages allocated between can_attach() and cgroup_task_migrate() will be
charged to the old group.
But, IIUC, we should release mmap_sem because cpuset's attach() tries to down_write
mmap_sem (update_tasks_nodemask -> cpuset_change_nodemask -> mpol_rebind_mm).
>
> > + return ret;
> > +}
> > +
> > +static void mem_cgroup_clear_migrate_charge(void)
> > +{
> > + struct page *page, *tmp;
> > +
> > + VM_BUG_ON(!mc);
> > +
> > + if (!list_empty(&mc->list))
>
> I think list_for_each_entry_safe() handles empty case.
>
#define list_for_each_entry_safe(pos, n, head, member) \
for (pos = list_entry((head)->next, typeof(*pos), member), \
n = list_entry(pos->member.next, typeof(*pos), member); \
&pos->member != (head); \
pos = n, n = list_entry(n->member.next, typeof(*n), member))
hmm, "pos = list_entry((head)->next, typeof(*pos), member)" points to proper pointer
in list_empty(i.e. head->next == head) case ?
"mc->list" is struct list_head, not struct page.
>
> > + list_for_each_entry_safe(page, tmp, &mc->list, lru) {
> > + mem_cgroup_cancel_charge(mc->to);
> > + list_del(&page->lru);
> > + putback_lru_page(page);
> > + put_page(page);
> > + }
> > +
> > + kfree(mc);
> > + mc = NULL;
> > +}
> > +
> > static int mem_cgroup_can_migrate_charge(struct mem_cgroup *mem,
> > struct task_struct *p)
> > {
> > - return 0;
> > + int ret;
> > + struct mem_cgroup *from = mem_cgroup_from_task(p);
> > +
> > + VM_BUG_ON(mc);
> > +
> > + if (from == mem)
> > + return 0;
> > +
> > + mc = kmalloc(sizeof(struct migrate_charge), GFP_KERNEL);
> > + if (!mc)
> > + return -ENOMEM;
> > +
> > + mc->tsk = p;
> > + mc->from = from;
> > + mc->to = mem;
> > + INIT_LIST_HEAD(&mc->list);
> > +
> > + ret = migrate_charge_prepare();
> > +
> > + if (ret)
> > + mem_cgroup_clear_migrate_charge();
> > + return ret;
> > }
> >
> > static int mem_cgroup_can_attach(struct cgroup_subsys *ss,
> > @@ -3207,10 +3376,35 @@ static void mem_cgroup_cancel_attach(struct cgroup_subsys *ss,
> > struct task_struct *p,
> > bool threadgroup)
> > {
> > + if (mc)
> > + mem_cgroup_clear_migrate_charge();
> > }
> >
> > static void mem_cgroup_migrate_charge(void)
> > {
> > + struct page *page, *tmp;
> > + struct page_cgroup *pc;
> > +
> > + if (!mc)
> > + return;
> > +
> > + if (!list_empty(&mc->list))
> > + list_for_each_entry_safe(page, tmp, &mc->list, lru) {
> > + pc = lookup_page_cgroup(page);
> > + lock_page_cgroup(pc);
> > + if (PageCgroupUsed(pc) && pc->mem_cgroup == mc->from) {
> > + mem_cgroup_move_account(pc, mc->from, mc->to);
> > + /* drop extra refcnt by try_charge() */
> > + css_put(&mc->to->css);
> > + list_del(&page->lru);
> > + putback_lru_page(page);
> > + put_page(page);
> > + }
> > + unlock_page_cgroup(pc);
> > + cond_resched();
> > + }
> > +
> > + mem_cgroup_clear_migrate_charge();
> > }
> >
>
> Okay, them, if other subsystem fails "can_attach()", migrated charges are not
> moved back to original group. Right ?
>
We haven't migrated any charges at the can_attach() stage; it only does try_charge
against the new group.
If can_attach() of another subsystem fails, we only cancel the result of try_charge.
>
>
> My biggest concern in this implementation is this "isolate" pages too much.
> Could you modify the whole routine as...
>
> struct migrate_charge {
> struct task_struct *tsk;
> struct mem_cgroup *from;
> struct mem_cgroup *to;
> struct list_head list;
> long charged;
> long committed;
> };
> - mem_cgroup_can_migrate_charge() ....
> count pages and do "charge", no page isolation.
> and remember the number of charges as mc->charged++;
> - mem_cgroup_migrate_charge()
> - scan vmas/page table again. And isolate pages in fine grain.
> (256 pages per scan..or some small number)
> - migrate pages if success mc->committed++
>
> after move, uncharge (mc->charged - mc->commited)
>
hmm, I chose the current implementation just to avoid parsing the page table twice.
But if you prefer the parse-again direction, I'll try it.
It would fix most of the problems the current implementation has.
Thanks,
Daisuke Nishimura.
> Maybe you can find better one. But isolating all pages of process at once is a
> big hammber for vm, OOM.
>
> Thanks,
> -Kame
>
>
>
> > static void mem_cgroup_move_task(struct cgroup_subsys *ss,
> >
>
* Re: [PATCH 5/8] memcg: migrate charge of anon
2009-09-17 5:56 ` Daisuke Nishimura
@ 2009-09-17 6:25 ` KAMEZAWA Hiroyuki
0 siblings, 0 replies; 49+ messages in thread
From: KAMEZAWA Hiroyuki @ 2009-09-17 6:25 UTC (permalink / raw)
To: Daisuke Nishimura; +Cc: linux-mm, Balbir Singh, Paul Menage, Li Zefan
On Thu, 17 Sep 2009 14:56:57 +0900
Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp> wrote:
> > > + /*
> > > + * We should put back all pages which remain on "list".
> > > + * This means try_charge above has failed.
> > > + * Pages which have been moved to mc->list would be put back at
> > > + * clear_migrate_charge.
> > > + */
> > > + if (!list_empty(&list))
> > > + list_for_each_entry_safe(page, tmp, &list, lru) {
> > > + list_del(&page->lru);
> > > + putback_lru_page(page);
> > > + put_page(page);
> >
> > I wonder this put_page() is not necessary.
> >
> get_page_unless_zero(page) is called above.
> putback_lru_page() will only decrease the refcount got by isolate_lru_page().
>
ok. I misunderstood.
> > > + }
> > > +
> > > + return ret;
> > > +}
> > > +
> > > +static int migrate_charge_prepare(void)
> > > +{
> > > + int ret = 0;
> > > + struct mm_struct *mm;
> > > + struct vm_area_struct *vma;
> > > +
> > > + mm = get_task_mm(mc->tsk);
> > > + if (!mm)
> > > + return 0;
> > > +
> > > + down_read(&mm->mmap_sem);
> > > + for (vma = mm->mmap; vma; vma = vma->vm_next) {
> > > + struct mm_walk migrate_charge_walk = {
> > > + .pmd_entry = migrate_charge_prepare_pte_range,
> > > + .mm = mm,
> > > + .private = vma,
> > > + };
> > > + if (signal_pending(current)) {
> > > + ret = -EINTR;
> > > + break;
> > > + }
> > > + if (is_vm_hugetlb_page(vma))
> > > + continue;
> > > + /* We migrate charge of private pages for now */
> > > + if (vma->vm_flags & (VM_SHARED | VM_MAYSHARE))
> > > + continue;
> > > + if (mc->to->migrate_charge) {
> > > + ret = walk_page_range(vma->vm_start, vma->vm_end,
> > > + &migrate_charge_walk);
> > > + if (ret)
> > > + break;
> > > + }
> > > + }
> > > + up_read(&mm->mmap_sem);
> > > +
> > > + mmput(mm);
> >
> > Hmm, Does this means a thread which is moved can continue its work and
> > newly allocated pages will remain in old group ?
> >
> Yes.
> Pages allocated between can_attach() and cgroup_task_migrate() will be
> charged to old group.
> But, IIUC, we should release mmap_sem because attach() of cpuset tries to down_write
> mmap_sem(update_tasks_nodemask -> cpuset_change_nodemask -> mpol_rebind_mm).
>
agreed.
> >
> > > + return ret;
> > > +}
> > > +
> > > +static void mem_cgroup_clear_migrate_charge(void)
> > > +{
> > > + struct page *page, *tmp;
> > > +
> > > + VM_BUG_ON(!mc);
> > > +
> > > + if (!list_empty(&mc->list))
> >
> > I think list_for_each_entry_safe() handles empty case.
> >
>
> #define list_for_each_entry_safe(pos, n, head, member) \
> for (pos = list_entry((head)->next, typeof(*pos), member), \
> n = list_entry(pos->member.next, typeof(*pos), member); \
> &pos->member != (head); \
> pos = n, n = list_entry(n->member.next, typeof(*n), member))
>
> hmm, "pos = list_entry((head)->next, typeof(*pos), member)" points to proper pointer
> in list_empty(i.e. head->next == head) case ?
I think so.
> "mc->list" is struct list_head, not struct page.
>
yes.
pos = list_entry((&mc->list)->next, struct page, lru);
here, mc->list.next == &mc->list if empty.
Then, pos is a "struct page" pointer, but it points to &mc->list minus some bytes.
Then, &pos->lru == &mc->list == head, so the loop exits immediately without running
the body.
But this point is a nitpick, please do as you want.
The if (!list_empty(&mc->list)) check does show the reader that mc->list can be empty.
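A minimal user-space sketch of this behavior (the list helpers below are simplified
copies of the kernel macros, built with gcc for typeof; none of this is part of the
patches): the loop body simply never runs for an empty list, so the !list_empty()
check is only cosmetic.

#include <stdio.h>
#include <stddef.h>

struct list_head { struct list_head *next, *prev; };

#define LIST_HEAD_INIT(name) { &(name), &(name) }
#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))
#define list_entry(ptr, type, member) container_of(ptr, type, member)
#define list_for_each_entry_safe(pos, n, head, member)			\
	for (pos = list_entry((head)->next, typeof(*pos), member),	\
	     n = list_entry(pos->member.next, typeof(*pos), member);	\
	     &pos->member != (head);					\
	     pos = n, n = list_entry(n->member.next, typeof(*n), member))

struct page { unsigned long flags; struct list_head lru; };

int main(void)
{
	struct list_head list = LIST_HEAD_INIT(list);
	struct page *page, *tmp;
	int iterations = 0;

	/*
	 * "page" is computed as ((char *)&list - offsetof(struct page, lru)),
	 * i.e. it points a few bytes before the head, exactly as discussed above.
	 * Initializing "tmp" reads page->lru.next; page->lru aliases the head here,
	 * so that read touches valid memory (list.next == &list).  The loop test
	 * &page->lru != &list then fails at once, so the body never runs.
	 */
	list_for_each_entry_safe(page, tmp, &list, lru)
		iterations++;

	printf("iterations on empty list: %d\n", iterations);	/* prints 0 */
	return 0;
}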
> > > + mem_cgroup_clear_migrate_charge();
> > > }
> > >
> >
> > Okay, them, if other subsystem fails "can_attach()", migrated charges are not
> > moved back to original group. Right ?
> >
> We haven't migrated charges at can_attach() stage. It only does try_charge to
> the new group.
> If can_attach() of other subsystem fails, we only cancel the result of try_charge.
>
> >
> >
> > My biggest concern in this implementation is this "isolate" pages too much.
> > Could you modify the whole routine as...
> >
> > struct migrate_charge {
> > struct task_struct *tsk;
> > struct mem_cgroup *from;
> > struct mem_cgroup *to;
> > struct list_head list;
> > long charged;
> > long committed;
> > };
> > - mem_cgroup_can_migrate_charge() ....
> > count pages and do "charge", no page isolation.
> > and remember the number of charges as mc->charged++;
> > - mem_cgroup_migrate_charge()
> > - scan vmas/page table again. And isolate pages in fine grain.
> > (256 pages per scan..or some small number)
> > - migrate pages if success mc->committed++
> >
> > after move, uncharge (mc->charged - mc->commited)
> >
> hmm, I selected current implementation just to prevent parsing page table twice.
> But if you prefer the parse-again direction, I'll try it.
> It will fix most of the problems that current implementation has.
>
What I'm afraid of is a corner case: migrating a "very big" process, the
INACTIVE/ACTIVE LRU of a zone goes down to nearly empty, and we hit OOM.
If you look into other callers of isolate_lru_page(), the number of
pages isolated per iteration is limited to some small value.
(Almost all callers use an array for limiting/batching.)
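To make the suggested bookkeeping concrete, here is a hedged, stand-alone user-space
sketch; try_charge()/move_account()/uncharge() are stand-ins for the real memcg
operations, and the batch size and failure pattern are made up for illustration only.

#include <stdio.h>

#define NR_TARGET_PAGES	1000
#define BATCH		256	/* small per-iteration isolation limit */

struct migrate_charge_sketch {
	long charged;		/* pre-charged against "to" in can_attach() */
	long committed;		/* actually moved in attach() */
};

static int try_charge(void)		{ return 0; }			/* 0 == success */
static int move_account(int page)	{ return page % 7 ? 0 : -1; }	/* fake failures */
static void uncharge(long nr)		{ printf("uncharge %ld pages\n", nr); }

int main(void)
{
	struct migrate_charge_sketch mc = { 0, 0 };
	int page;

	/* Pass 1 (can_attach): count target ptes and charge them, no isolation. */
	for (page = 0; page < NR_TARGET_PAGES; page++)
		if (!try_charge())
			mc.charged++;

	/* Pass 2 (attach): walk again, isolate/move at most BATCH pages at a time. */
	for (page = 0; page < NR_TARGET_PAGES; ) {
		int n;
		for (n = 0; page < NR_TARGET_PAGES && n < BATCH; page++, n++)
			if (!move_account(page))
				mc.committed++;
		/* putback_lru_page()/cond_resched() would go here in the kernel */
	}

	/* After the move, return whatever was charged but never moved. */
	uncharge(mc.charged - mc.committed);
	return 0;
}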
Thanks,
-Kame
>
> Thanks,
> Daisuke Nishimura.
>
> > Maybe you can find better one. But isolating all pages of process at once is a
> > big hammber for vm, OOM.
> >
> > Thanks,
> > -Kame
> >
> >
> >
> > > static void mem_cgroup_move_task(struct cgroup_subsys *ss,
> > >
> >
>
* Re: [PATCH 5/8] memcg: migrate charge of anon
2009-09-17 4:57 ` KAMEZAWA Hiroyuki
2009-09-17 5:56 ` Daisuke Nishimura
@ 2009-09-17 23:52 ` KOSAKI Motohiro
1 sibling, 0 replies; 49+ messages in thread
From: KOSAKI Motohiro @ 2009-09-17 23:52 UTC (permalink / raw)
To: KAMEZAWA Hiroyuki
Cc: kosaki.motohiro, Daisuke Nishimura, linux-mm, Balbir Singh,
Paul Menage, Li Zefan
> On Thu, 17 Sep 2009 11:26:56 +0900
> Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp> wrote:
>
> > This patch is the core part of this charge migration feature.
> > It adds functions to migrate charge of anonymous pages of the task.
> >
> > Implementation:
> > - define struct migrate_charge and a valuable of it(mc) to remember
> > the target pages and other information.
> > - At can_attach(), isolate the target pages, call __mem_cgroup_try_charge(),
> > and move them to mc->list.
> > - Call mem_cgroup_move_account() at attach() about all pages on mc->list
> > after necessary checks under page_cgroup lock, and put back them to LRU.
> > - Cancel charges about all pages remains on mc->list on failure or at the end
> > of charge migration, and put back them to LRU.
> >
> >
> > Signed-off-by: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
>
> > ---
> > mm/memcontrol.c | 196 ++++++++++++++++++++++++++++++++++++++++++++++++++++++-
> > 1 files changed, 195 insertions(+), 1 deletions(-)
> >
> > diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> > index a6b07f8..3a3f4ac 100644
> > --- a/mm/memcontrol.c
> > +++ b/mm/memcontrol.c
> > @@ -21,6 +21,8 @@
> > #include <linux/memcontrol.h>
> > #include <linux/cgroup.h>
> > #include <linux/mm.h>
> > +#include <linux/migrate.h>
> > +#include <linux/hugetlb.h>
> > #include <linux/pagemap.h>
> > #include <linux/smp.h>
> > #include <linux/page-flags.h>
> > @@ -274,6 +276,18 @@ enum charge_type {
> > #define MEM_CGROUP_RECLAIM_SOFT_BIT 0x2
> > #define MEM_CGROUP_RECLAIM_SOFT (1 << MEM_CGROUP_RECLAIM_SOFT_BIT)
> >
> > +/*
> > + * Stuffs for migrating charge at task move.
> > + * mc and its members are protected by cgroup_lock
> > + */
> > +struct migrate_charge {
> > + struct task_struct *tsk;
> > + struct mem_cgroup *from;
> > + struct mem_cgroup *to;
> > + struct list_head list;
> > +};
> > +static struct migrate_charge *mc;
> > +
> > static void mem_cgroup_get(struct mem_cgroup *mem);
> > static void mem_cgroup_put(struct mem_cgroup *mem);
> > static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem);
> > @@ -2829,6 +2843,7 @@ static int mem_cgroup_swappiness_write(struct cgroup *cgrp, struct cftype *cft,
> > }
> >
> > enum migrate_charge_type {
> > + MIGRATE_CHARGE_ANON,
> > NR_MIGRATE_CHARGE_TYPE,
> > };
> >
> > @@ -3184,10 +3199,164 @@ static int mem_cgroup_populate(struct cgroup_subsys *ss,
> > return ret;
> > }
> >
> > +static int migrate_charge_prepare_pte_range(pmd_t *pmd,
> > + unsigned long addr, unsigned long end,
> > + struct mm_walk *walk)
> > +{
> > + int ret = 0;
> > + struct page *page, *tmp;
> > + LIST_HEAD(list);
> > + struct vm_area_struct *vma = walk->private;
> > + pte_t *pte, ptent;
> > + spinlock_t *ptl;
> > + bool move_anon = (mc->to->migrate_charge & (1 << MIGRATE_CHARGE_ANON));
> > +
> > + lru_add_drain_all();
>
> plz call lru_add_drain_all() before taking mmap_sem().
> This waits for workqueue in synchronous manner.
> (I think KOSAKI-san is working for better pagevec drain function.)
FYI: I plan to submit the following patch after the merge window.
lru_add_drain_all_async() can be called in a region where mmap_sem is held.
==================================================
From d76f56718886b6dd7f77babddad45a33ccf668cd Mon Sep 17 00:00:00 2001
From: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Date: Thu, 17 Sep 2009 16:07:55 +0900
Subject: [PATCH 1/2] Implement lru_add_drain_all_async()
Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
---
include/linux/swap.h | 1 +
mm/swap.c | 24 ++++++++++++++++++++++++
2 files changed, 25 insertions(+), 0 deletions(-)
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 4ec9001..1f5772a 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -204,6 +204,7 @@ extern void activate_page(struct page *);
extern void mark_page_accessed(struct page *);
extern void lru_add_drain(void);
extern int lru_add_drain_all(void);
+extern int lru_add_drain_all_async(void);
extern void rotate_reclaimable_page(struct page *page);
extern void swap_setup(void);
diff --git a/mm/swap.c b/mm/swap.c
index 308e57d..e16cd40 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -38,6 +38,7 @@ int page_cluster;
static DEFINE_PER_CPU(struct pagevec[NR_LRU_LISTS], lru_add_pvecs);
static DEFINE_PER_CPU(struct pagevec, lru_rotate_pvecs);
+static DEFINE_PER_CPU(struct work_struct, lru_drain_work);
/*
* This path almost never happens for VM activity - pages are normally
@@ -312,6 +313,24 @@ int lru_add_drain_all(void)
}
/*
+ * Returns 0 for success
+ */
+int lru_add_drain_all_async(void)
+{
+ int cpu;
+
+ get_online_cpus();
+ for_each_online_cpu(cpu) {
+ struct work_struct *work = &per_cpu(lru_drain_work, cpu);
+ schedule_work_on(cpu, work);
+ }
+ put_online_cpus();
+
+ return 0;
+}
+
+
+/*
* Batched page_cache_release(). Decrement the reference count on all the
* passed pages. If it fell to zero then remove the page from the LRU and
* free it.
@@ -497,6 +516,7 @@ EXPORT_SYMBOL(pagevec_lookup_tag);
void __init swap_setup(void)
{
unsigned long megs = totalram_pages >> (20 - PAGE_SHIFT);
+ int cpu;
#ifdef CONFIG_SWAP
bdi_init(swapper_space.backing_dev_info);
@@ -511,4 +531,8 @@ void __init swap_setup(void)
* Right now other parts of the system means that we
* _really_ don't want to cluster much more
*/
+
+ for_each_possible_cpu(cpu) {
+ INIT_WORK(&per_cpu(lru_drain_work, cpu), lru_add_drain_per_cpu);
+ }
}
--
1.6.2.5
* [PATCH 6/8] memcg: migrate charge of shmem
2009-09-17 2:23 [RFC][EXPERIMENTAL][PATCH 0/8] memcg: migrate charge at task move Daisuke Nishimura
` (4 preceding siblings ...)
2009-09-17 2:26 ` [PATCH 5/8] memcg: migrate charge of anon Daisuke Nishimura
@ 2009-09-17 2:27 ` Daisuke Nishimura
2009-09-17 5:02 ` KAMEZAWA Hiroyuki
2009-09-17 2:28 ` [PATCH 7/8] memcg: migrate charge of swap Daisuke Nishimura
` (2 subsequent siblings)
8 siblings, 1 reply; 49+ messages in thread
From: Daisuke Nishimura @ 2009-09-17 2:27 UTC (permalink / raw)
To: linux-mm
Cc: Balbir Singh, KAMEZAWA Hiroyuki, Paul Menage, Li Zefan,
Daisuke Nishimura
This patch adds some checks to enable charge migration of shmem (and mmapped tmpfs files).
Signed-off-by: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
---
mm/memcontrol.c | 10 ++++++++--
1 files changed, 8 insertions(+), 2 deletions(-)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 830fa71..f46fd19 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -2844,6 +2844,7 @@ static int mem_cgroup_swappiness_write(struct cgroup *cgrp, struct cftype *cft,
enum migrate_charge_type {
MIGRATE_CHARGE_ANON,
+ MIGRATE_CHARGE_SHMEM,
NR_MIGRATE_CHARGE_TYPE,
};
@@ -3210,6 +3211,8 @@ static int migrate_charge_prepare_pte_range(pmd_t *pmd,
pte_t *pte, ptent;
spinlock_t *ptl;
bool move_anon = (mc->to->migrate_charge & (1 << MIGRATE_CHARGE_ANON));
+ bool move_shmem = (mc->to->migrate_charge &
+ (1 << MIGRATE_CHARGE_SHMEM));
lru_add_drain_all();
pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
@@ -3226,6 +3229,8 @@ static int migrate_charge_prepare_pte_range(pmd_t *pmd,
if (PageAnon(page) && move_anon)
;
+ else if (!PageAnon(page) && PageSwapBacked(page) && move_shmem)
+ ;
else
continue;
@@ -3281,6 +3286,8 @@ static int migrate_charge_prepare(void)
int ret = 0;
struct mm_struct *mm;
struct vm_area_struct *vma;
+ bool move_shmem = (mc->to->migrate_charge &
+ (1 << MIGRATE_CHARGE_SHMEM));
mm = get_task_mm(mc->tsk);
if (!mm)
@@ -3299,8 +3306,7 @@ static int migrate_charge_prepare(void)
}
if (is_vm_hugetlb_page(vma))
continue;
- /* We migrate charge of private pages for now */
- if (vma->vm_flags & (VM_SHARED | VM_MAYSHARE))
+ if (vma->vm_flags & (VM_SHARED | VM_MAYSHARE) && !move_shmem)
continue;
if (mc->to->migrate_charge) {
ret = walk_page_range(vma->vm_start, vma->vm_end,
* Re: [PATCH 6/8] memcg: migrate charge of shmem
2009-09-17 2:27 ` [PATCH 6/8] memcg: migrate charge of shmem Daisuke Nishimura
@ 2009-09-17 5:02 ` KAMEZAWA Hiroyuki
0 siblings, 0 replies; 49+ messages in thread
From: KAMEZAWA Hiroyuki @ 2009-09-17 5:02 UTC (permalink / raw)
To: Daisuke Nishimura; +Cc: linux-mm, Balbir Singh, Paul Menage, Li Zefan
On Thu, 17 Sep 2009 11:27:37 +0900
Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp> wrote:
> This patch adds some checks to enable migration charge of shmem(and mmapd tmpfs file).
>
> Signed-off-by: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Hmm... move shmem? Ah, maybe this patch itself is not bad.
(I don't like the move_shmem flag ;)
Thinking about it more, "file cache" should be able to be moved as well.
Shouldn't we implement
sys_madvise(REACCOUNT_PAGE_MEMCG)
or something?
Then we could isolate a big file cache/shmem.
Thanks.
-Kame
> ---
> mm/memcontrol.c | 10 ++++++++--
> 1 files changed, 8 insertions(+), 2 deletions(-)
>
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index 830fa71..f46fd19 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -2844,6 +2844,7 @@ static int mem_cgroup_swappiness_write(struct cgroup *cgrp, struct cftype *cft,
>
> enum migrate_charge_type {
> MIGRATE_CHARGE_ANON,
> + MIGRATE_CHARGE_SHMEM,
> NR_MIGRATE_CHARGE_TYPE,
> };
>
> @@ -3210,6 +3211,8 @@ static int migrate_charge_prepare_pte_range(pmd_t *pmd,
> pte_t *pte, ptent;
> spinlock_t *ptl;
> bool move_anon = (mc->to->migrate_charge & (1 << MIGRATE_CHARGE_ANON));
> + bool move_shmem = (mc->to->migrate_charge &
> + (1 << MIGRATE_CHARGE_SHMEM));
>
> lru_add_drain_all();
> pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
> @@ -3226,6 +3229,8 @@ static int migrate_charge_prepare_pte_range(pmd_t *pmd,
>
> if (PageAnon(page) && move_anon)
> ;
> + else if (!PageAnon(page) && PageSwapBacked(page) && move_shmem)
> + ;
> else
> continue;
>
> @@ -3281,6 +3286,8 @@ static int migrate_charge_prepare(void)
> int ret = 0;
> struct mm_struct *mm;
> struct vm_area_struct *vma;
> + bool move_shmem = (mc->to->migrate_charge &
> + (1 << MIGRATE_CHARGE_SHMEM));
>
> mm = get_task_mm(mc->tsk);
> if (!mm)
> @@ -3299,8 +3306,7 @@ static int migrate_charge_prepare(void)
> }
> if (is_vm_hugetlb_page(vma))
> continue;
> - /* We migrate charge of private pages for now */
> - if (vma->vm_flags & (VM_SHARED | VM_MAYSHARE))
> + if (vma->vm_flags & (VM_SHARED | VM_MAYSHARE) && !move_shmem)
> continue;
> if (mc->to->migrate_charge) {
> ret = walk_page_range(vma->vm_start, vma->vm_end,
>
* [PATCH 7/8] memcg: migrate charge of swap
2009-09-17 2:23 [RFC][EXPERIMENTAL][PATCH 0/8] memcg: migrate charge at task move Daisuke Nishimura
` (5 preceding siblings ...)
2009-09-17 2:27 ` [PATCH 6/8] memcg: migrate charge of shmem Daisuke Nishimura
@ 2009-09-17 2:28 ` Daisuke Nishimura
2009-09-17 5:25 ` KAMEZAWA Hiroyuki
2009-09-17 2:29 ` [PATCH 8/8] memcg: avoid oom during charge migration Daisuke Nishimura
2009-09-17 7:01 ` [RFC][EXPERIMENTAL][PATCH 0/8] memcg: migrate charge at task move Daisuke Nishimura
8 siblings, 1 reply; 49+ messages in thread
From: Daisuke Nishimura @ 2009-09-17 2:28 UTC (permalink / raw)
To: linux-mm
Cc: Balbir Singh, KAMEZAWA Hiroyuki, Paul Menage, Li Zefan,
Daisuke Nishimura
This patch is another core part of this charge migration feature.
It enables charge migration of swap.
Unlike mapped pages, swapped-out anonymous pages have their entries stored in the ptes.
So this patch calls read_swap_cache_async() and handles the swapped-in pages in
migrate_charge_prepare_pte_range() the same way as anonymous pages, and handles the
!PageCgroupUsed case in mem_cgroup_migrate_charge().
To exchange swap_cgroup's record safely, this patch changes swap_cgroup_record()
to use xchg, and defines a new function to cmpxchg swap_cgroup's record.
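For illustration only, a small user-space sketch of this record/exchange pattern:
C11 atomics stand in for the kernel's xchg/cmpxchg, and the single slot below is not
the real swap_cgroup layout.

#include <stdatomic.h>
#include <stdio.h>

static _Atomic unsigned short record;	/* one swap_cgroup slot: owning cgroup id */

/* swap_cgroup_record()-like: unconditionally install a new owner, return the old one. */
static unsigned short sketch_record(unsigned short id)
{
	return atomic_exchange(&record, id);
}

/* swap_cgroup_cmpxchg()-like: move ownership only if "old" still owns the slot. */
static unsigned short sketch_cmpxchg(unsigned short old, unsigned short new)
{
	unsigned short expected = old;

	if (atomic_compare_exchange_strong(&record, &expected, new))
		return old;	/* success: the caller may move the charge */
	return 0;		/* somebody else changed the record; do nothing */
}

int main(void)
{
	sketch_record(3);			/* entry charged to cgroup id 3 */
	printf("%u\n", sketch_cmpxchg(3, 5));	/* 3: moved from id 3 to id 5 */
	printf("%u\n", sketch_cmpxchg(3, 7));	/* 0: record is now 5, nothing moves */
	return 0;
}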
Signed-off-by: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
---
include/linux/page_cgroup.h | 2 +
mm/memcontrol.c | 127 +++++++++++++++++++++++++++++++++++++++++--
mm/page_cgroup.c | 35 +++++++++++-
3 files changed, 158 insertions(+), 6 deletions(-)
diff --git a/include/linux/page_cgroup.h b/include/linux/page_cgroup.h
index 321f037..6bf83f7 100644
--- a/include/linux/page_cgroup.h
+++ b/include/linux/page_cgroup.h
@@ -122,6 +122,8 @@ static inline void __init page_cgroup_init_flatmem(void)
#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
#include <linux/swap.h>
+extern unsigned short swap_cgroup_cmpxchg(swp_entry_t ent,
+ unsigned short old, unsigned short new);
extern unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id);
extern unsigned short lookup_swap_cgroup(swp_entry_t ent);
extern int swap_cgroup_swapon(int type, unsigned long max_pages);
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index f46fd19..c8542e7 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -34,6 +34,7 @@
#include <linux/rbtree.h>
#include <linux/slab.h>
#include <linux/swap.h>
+#include <linux/swapops.h>
#include <linux/spinlock.h>
#include <linux/fs.h>
#include <linux/seq_file.h>
@@ -1982,6 +1983,49 @@ void mem_cgroup_uncharge_swap(swp_entry_t ent)
}
rcu_read_unlock();
}
+
+/**
+ * mem_cgroup_move_swap_account - move swap charge and swap_cgroup's record.
+ * @entry: swap entry to be moved
+ * @from: mem_cgroup which the entry is moved from
+ * @to: mem_cgroup which the entry is moved to
+ *
+ * It succeeds only when the swap_cgroup's record for this entry is the same
+ * as the mem_cgroup's id of @from.
+ *
+ * Returns 0 on success, 1 on failure.
+ *
+ * The caller must have called __mem_cgroup_try_charge on @to.
+ */
+static int mem_cgroup_move_swap_account(swp_entry_t entry,
+ struct mem_cgroup *from, struct mem_cgroup *to)
+{
+ unsigned short old_id, new_id;
+
+ old_id = css_id(&from->css);
+ new_id = css_id(&to->css);
+
+ if (swap_cgroup_cmpxchg(entry, old_id, new_id) == old_id) {
+ if (!mem_cgroup_is_root(from))
+ res_counter_uncharge(&from->memsw, PAGE_SIZE, NULL);
+ mem_cgroup_swap_statistics(from, false);
+ mem_cgroup_put(from);
+
+ if (!mem_cgroup_is_root(to))
+ res_counter_uncharge(&to->res, PAGE_SIZE, NULL);
+ mem_cgroup_swap_statistics(to, true);
+ mem_cgroup_get(to);
+
+ return 0;
+ }
+ return 1;
+}
+#else
+static inline int mem_cgroup_move_swap_account(swp_entry_t entry,
+ struct mem_cgroup *from, struct mem_cgroup *to)
+{
+ return 0;
+}
#endif
/*
@@ -2845,6 +2889,7 @@ static int mem_cgroup_swappiness_write(struct cgroup *cgrp, struct cftype *cft,
enum migrate_charge_type {
MIGRATE_CHARGE_ANON,
MIGRATE_CHARGE_SHMEM,
+ MIGRATE_CHARGE_SWAP,
NR_MIGRATE_CHARGE_TYPE,
};
@@ -3213,6 +3258,17 @@ static int migrate_charge_prepare_pte_range(pmd_t *pmd,
bool move_anon = (mc->to->migrate_charge & (1 << MIGRATE_CHARGE_ANON));
bool move_shmem = (mc->to->migrate_charge &
(1 << MIGRATE_CHARGE_SHMEM));
+ bool move_swap = do_swap_account &&
+ (mc->to->migrate_charge & (1 << MIGRATE_CHARGE_SWAP));
+ swp_entry_t *table = NULL;
+ int idx = 0;
+
+ if (move_swap) {
+ table = kmalloc(sizeof(swp_entry_t) * PTRS_PER_PTE, GFP_KERNEL);
+ if (!table)
+ return -ENOMEM;
+ memset(table, 0, sizeof(swp_entry_t) * PTRS_PER_PTE);
+ }
lru_add_drain_all();
pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
@@ -3220,8 +3276,29 @@ static int migrate_charge_prepare_pte_range(pmd_t *pmd,
struct page_cgroup *pc;
ptent = *pte;
- if (!pte_present(ptent))
+ if (!pte_present(ptent)) {
+ swp_entry_t ent = { .val = 0 };
+
+ if (!move_swap)
+ continue;
+
+ /* TODO: handle swap of shmem/tmpfs */
+ if (pte_none(ptent) || pte_file(ptent))
+ continue;
+ else if (is_swap_pte(ptent) && move_anon)
+ ent = pte_to_swp_entry(ptent);
+
+ if (ent.val == 0)
+ continue;
+ if (is_migration_entry(ent))
+ continue;
+ if (css_id(&mc->from->css) != lookup_swap_cgroup(ent))
+ continue;
+
+ swap_duplicate(ent); /* freed later */
+ table[idx++] = ent;
continue;
+ }
page = vm_normal_page(vma, addr, ptent);
if (!page)
@@ -3253,6 +3330,24 @@ static int migrate_charge_prepare_pte_range(pmd_t *pmd,
pte_unmap_unlock(pte - 1, ptl);
cond_resched();
+ if (table) {
+ int i;
+ for (i = 0; i < idx; i++) {
+ page = read_swap_cache_async(table[i],
+ GFP_HIGHUSER_MOVABLE, vma, 0);
+ swap_free(table[i]);
+ if (!page) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ lru_add_drain();
+ if (!isolate_lru_page(page))
+ list_add_tail(&page->lru, &list);
+ else
+ put_page(page);
+ cond_resched();
+ }
+ }
if (!list_empty(&list))
list_for_each_entry_safe(page, tmp, &list, lru) {
struct mem_cgroup *mem = mc->to;
@@ -3263,10 +3358,10 @@ static int migrate_charge_prepare_pte_range(pmd_t *pmd,
list_move_tail(&page->lru, &mc->list);
cond_resched();
}
-
+out:
/*
* We should put back all pages which remain on "list".
- * This means try_charge above has failed.
+ * This means try_charge or read_swap_cache_async above has failed.
* Pages which have been moved to mc->list would be put back at
* clear_migrate_charge.
*/
@@ -3277,7 +3372,7 @@ static int migrate_charge_prepare_pte_range(pmd_t *pmd,
put_page(page);
}
}
-
+ kfree(table);
return ret;
}
@@ -3391,10 +3486,14 @@ static void mem_cgroup_migrate_charge(void)
{
struct page *page, *tmp;
struct page_cgroup *pc;
+ bool move_swap;
if (!mc)
return;
+ move_swap = do_swap_account &&
+ (mc->to->migrate_charge & (1 << MIGRATE_CHARGE_SWAP));
+
if (!list_empty(&mc->list))
list_for_each_entry_safe(page, tmp, &mc->list, lru) {
pc = lookup_page_cgroup(page);
@@ -3406,7 +3505,27 @@ static void mem_cgroup_migrate_charge(void)
list_del(&page->lru);
putback_lru_page(page);
put_page(page);
+ } else if (!PageCgroupUsed(pc) && move_swap) {
+ /*
+ * we can't call lock_page() under
+ * page_cgroup lock.
+ */
+ if (!trylock_page(page))
+ goto out;
+ if (PageSwapCache(page)) {
+ swp_entry_t ent;
+ ent.val = page_private(page);
+ if (!mem_cgroup_move_swap_account(ent,
+ mc->from, mc->to)) {
+ css_put(&mc->to->css);
+ list_del(&page->lru);
+ putback_lru_page(page);
+ put_page(page);
+ }
+ }
+ unlock_page(page);
}
+out:
unlock_page_cgroup(pc);
cond_resched();
}
diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c
index 3d535d5..9532169 100644
--- a/mm/page_cgroup.c
+++ b/mm/page_cgroup.c
@@ -9,6 +9,7 @@
#include <linux/vmalloc.h>
#include <linux/cgroup.h>
#include <linux/swapops.h>
+#include <asm/cmpxchg.h>
static void __meminit
__init_page_cgroup(struct page_cgroup *pc, unsigned long pfn)
@@ -335,6 +336,37 @@ not_enough_page:
}
/**
+ * swap_cgroup_cmpxchg - cmpxchg mem_cgroup's id for this swp_entry.
+ * @ent: swap entry to be cmpxchged
+ * @old: old id
+ * @new: new id
+ *
+ * Returns old id on success, 0 on failure.
+ * (There is no mem_cgroup using 0 as its id)
+ */
+unsigned short swap_cgroup_cmpxchg(swp_entry_t ent,
+ unsigned short old, unsigned short new)
+{
+ int type = swp_type(ent);
+ unsigned long offset = swp_offset(ent);
+ unsigned long idx = offset / SC_PER_PAGE;
+ unsigned long pos = offset & SC_POS_MASK;
+ struct swap_cgroup_ctrl *ctrl;
+ struct page *mappage;
+ struct swap_cgroup *sc;
+
+ ctrl = &swap_cgroup_ctrl[type];
+
+ mappage = ctrl->map[idx];
+ sc = page_address(mappage);
+ sc += pos;
+ if (cmpxchg(&sc->id, old, new) == old)
+ return old;
+ else
+ return 0;
+}
+
+/**
* swap_cgroup_record - record mem_cgroup for this swp_entry.
* @ent: swap entry to be recorded into
* @mem: mem_cgroup to be recorded
@@ -358,8 +390,7 @@ unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id)
mappage = ctrl->map[idx];
sc = page_address(mappage);
sc += pos;
- old = sc->id;
- sc->id = id;
+ old = xchg(&sc->id, id);
return old;
}
* Re: [PATCH 7/8] memcg: migrate charge of swap
2009-09-17 2:28 ` [PATCH 7/8] memcg: migrate charge of swap Daisuke Nishimura
@ 2009-09-17 5:25 ` KAMEZAWA Hiroyuki
2009-09-17 6:17 ` Daisuke Nishimura
0 siblings, 1 reply; 49+ messages in thread
From: KAMEZAWA Hiroyuki @ 2009-09-17 5:25 UTC (permalink / raw)
To: Daisuke Nishimura; +Cc: linux-mm, Balbir Singh, Paul Menage, Li Zefan
On Thu, 17 Sep 2009 11:28:17 +0900
Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp> wrote:
> This patch is another core part of this charge migration feature.
> It enables charge migration of swap.
>
> Unlike mapped page, swaps of anonymous pages have its entry stored in the pte.
> So this patch calls read_swap_cache_async() and do the same thing about the swap-in'ed
> page as anonymous pages in migrate_charge_prepare_pte_range(), and handles !PageCgroupUsed
> case in mem_cgroup_migrate_charge().
Hmmm.....do we really need to do swap-in ? I think no.
>
> To exchange swap_cgroup's record safely, this patch changes swap_cgroup_record()
> to use xchg, and define new function to cmpxchg swap_cgroup's record.
>
I think this is enough.
BTW, it's not that bad to do this exchange under swap_lock (if the charge is done).
Then the whole logic can be simpler.
> Signed-off-by: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
> ---
> include/linux/page_cgroup.h | 2 +
> mm/memcontrol.c | 127 +++++++++++++++++++++++++++++++++++++++++--
> mm/page_cgroup.c | 35 +++++++++++-
> 3 files changed, 158 insertions(+), 6 deletions(-)
>
> diff --git a/include/linux/page_cgroup.h b/include/linux/page_cgroup.h
> index 321f037..6bf83f7 100644
> --- a/include/linux/page_cgroup.h
> +++ b/include/linux/page_cgroup.h
> @@ -122,6 +122,8 @@ static inline void __init page_cgroup_init_flatmem(void)
>
> #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
> #include <linux/swap.h>
> +extern unsigned short swap_cgroup_cmpxchg(swp_entry_t ent,
> + unsigned short old, unsigned short new);
> extern unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id);
> extern unsigned short lookup_swap_cgroup(swp_entry_t ent);
> extern int swap_cgroup_swapon(int type, unsigned long max_pages);
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index f46fd19..c8542e7 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -34,6 +34,7 @@
> #include <linux/rbtree.h>
> #include <linux/slab.h>
> #include <linux/swap.h>
> +#include <linux/swapops.h>
> #include <linux/spinlock.h>
> #include <linux/fs.h>
> #include <linux/seq_file.h>
> @@ -1982,6 +1983,49 @@ void mem_cgroup_uncharge_swap(swp_entry_t ent)
> }
> rcu_read_unlock();
> }
> +
> +/**
> + * mem_cgroup_move_swap_account - move swap charge and swap_cgroup's record.
> + * @entry: swap entry to be moved
> + * @from: mem_cgroup which the entry is moved from
> + * @to: mem_cgroup which the entry is moved to
> + *
> + * It successes only when the swap_cgroup's record for this entry is the same
> + * as the mem_cgroup's id of @from.
> + *
> + * Returns 0 on success, 1 on failure.
> + *
> + * The caller must have called __mem_cgroup_try_charge on @to.
> + */
> +static int mem_cgroup_move_swap_account(swp_entry_t entry,
> + struct mem_cgroup *from, struct mem_cgroup *to)
> +{
> + unsigned short old_id, new_id;
> +
> + old_id = css_id(&from->css);
> + new_id = css_id(&to->css);
> +
> + if (swap_cgroup_cmpxchg(entry, old_id, new_id) == old_id) {
> + if (!mem_cgroup_is_root(from))
> + res_counter_uncharge(&from->memsw, PAGE_SIZE, NULL);
> + mem_cgroup_swap_statistics(from, false);
> + mem_cgroup_put(from);
> +
> + if (!mem_cgroup_is_root(to))
> + res_counter_uncharge(&to->res, PAGE_SIZE, NULL);
> + mem_cgroup_swap_statistics(to, true);
> + mem_cgroup_get(to);
> +
> + return 0;
> + }
> + return 1;
> +}
> +#else
> +static inline int mem_cgroup_move_swap_account(swp_entry_t entry,
> + struct mem_cgroup *from, struct mem_cgroup *to)
> +{
> + return 0;
> +}
> #endif
>
> /*
> @@ -2845,6 +2889,7 @@ static int mem_cgroup_swappiness_write(struct cgroup *cgrp, struct cftype *cft,
> enum migrate_charge_type {
> MIGRATE_CHARGE_ANON,
> MIGRATE_CHARGE_SHMEM,
> + MIGRATE_CHARGE_SWAP,
> NR_MIGRATE_CHARGE_TYPE,
> };
>
> @@ -3213,6 +3258,17 @@ static int migrate_charge_prepare_pte_range(pmd_t *pmd,
> bool move_anon = (mc->to->migrate_charge & (1 << MIGRATE_CHARGE_ANON));
> bool move_shmem = (mc->to->migrate_charge &
> (1 << MIGRATE_CHARGE_SHMEM));
> + bool move_swap = do_swap_account &&
> + (mc->to->migrate_charge & (1 << MIGRATE_CHARGE_SWAP));
> + swp_entry_t *table = NULL;
> + int idx = 0;
> +
> + if (move_swap) {
> + table = kmalloc(sizeof(swp_entry_t) * PTRS_PER_PTE, GFP_KERNEL);
> + if (!table)
> + return -ENOMEM;
> + memset(table, 0, sizeof(swp_entry_t) * PTRS_PER_PTE);
> + }
>
> lru_add_drain_all();
> pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
> @@ -3220,8 +3276,29 @@ static int migrate_charge_prepare_pte_range(pmd_t *pmd,
> struct page_cgroup *pc;
>
> ptent = *pte;
> - if (!pte_present(ptent))
> + if (!pte_present(ptent)) {
> + swp_entry_t ent = { .val = 0 };
> +
> + if (!move_swap)
> + continue;
> +
> + /* TODO: handle swap of shmes/tmpfs */
> + if (pte_none(ptent) || pte_file(ptent))
> + continue;
> + else if (is_swap_pte(ptent) && move_anon)
> + ent = pte_to_swp_entry(ptent);
> +
> + if (ent.val == 0)
> + continue;
> + if (is_migration_entry(ent))
> + continue;
> + if (css_id(&mc->from->css) != lookup_swap_cgroup(ent))
> + continue;
> +
> + swap_duplicate(ent); /* freed later */
> + table[idx++] = ent;
> continue;
> + }
>
> page = vm_normal_page(vma, addr, ptent);
> if (!page)
> @@ -3253,6 +3330,24 @@ static int migrate_charge_prepare_pte_range(pmd_t *pmd,
> pte_unmap_unlock(pte - 1, ptl);
> cond_resched();
>
> + if (table) {
> + int i;
> + for (i = 0; i < idx; i++) {
> + page = read_swap_cache_async(table[i],
> + GFP_HIGHUSER_MOVABLE, vma, 0);
Hmm...I think this should be..
page = lookup_swap_cache(table[i]);
..
if (page) {
isolate this page. and put into the list if PageCgroupUsed().
pc = lookup_page_cgroup(page);
lock_page_cgroup(pc);
if (PageCgroupUsed(pc) && pc->mem_cgroup )
charge against this.
mc->charged++;
}
/* If !page or page is not accounted */
if (!mem) {
id = lookup_swap_cgroup(ent);
account against "id"
mc->swap_charged++.
}
...
IIUC, if we do the exchange in swap_cgroup[] under swap_lock(), we can exchange the
account safely, because we can tell from swap_map whether a swap cache exists or not.
(see swap_has_cache())
Anyway, I never want to see this swapin ;(
Thanks,
-Kame
> + swap_free(table[i]);
> + if (!page) {
> + ret = -ENOMEM;
> + goto out;
> + }
> + lru_add_drain();
> + if (!isolate_lru_page(page))
> + list_add_tail(&page->lru, &list);
> + else
> + put_page(page);
> + cond_resched();
> + }
> + }
> if (!list_empty(&list))
> list_for_each_entry_safe(page, tmp, &list, lru) {
> struct mem_cgroup *mem = mc->to;
> @@ -3263,10 +3358,10 @@ static int migrate_charge_prepare_pte_range(pmd_t *pmd,
> list_move_tail(&page->lru, &mc->list);
> cond_resched();
> }
> -
> +out:
> /*
> * We should put back all pages which remain on "list".
> - * This means try_charge above has failed.
> + * This means try_charge or read_swap_cache_async above has failed.
> * Pages which have been moved to mc->list would be put back at
> * clear_migrate_charge.
> */
> @@ -3277,7 +3372,7 @@ static int migrate_charge_prepare_pte_range(pmd_t *pmd,
> put_page(page);
> }
> }
> -
> + kfree(table);
> return ret;
> }
>
> @@ -3391,10 +3486,14 @@ static void mem_cgroup_migrate_charge(void)
> {
> struct page *page, *tmp;
> struct page_cgroup *pc;
> + bool move_swap;
>
> if (!mc)
> return;
>
> + move_swap = do_swap_account &&
> + (mc->to->migrate_charge & (1 << MIGRATE_CHARGE_SWAP));
> +
> if (!list_empty(&mc->list))
> list_for_each_entry_safe(page, tmp, &mc->list, lru) {
> pc = lookup_page_cgroup(page);
> @@ -3406,7 +3505,27 @@ static void mem_cgroup_migrate_charge(void)
> list_del(&page->lru);
> putback_lru_page(page);
> put_page(page);
> + } else if (!PageCgroupUsed(pc) && move_swap) {
> + /*
> + * we can't call lock_page() under
> + * page_cgroup lock.
> + */
> + if (!trylock_page(page))
> + goto out;
> + if (PageSwapCache(page)) {
> + swp_entry_t ent;
> + ent.val = page_private(page);
> + if (!mem_cgroup_move_swap_account(ent,
> + mc->from, mc->to)) {
> + css_put(&mc->to->css);
> + list_del(&page->lru);
> + putback_lru_page(page);
> + put_page(page);
> + }
> + }
> + unlock_page(page);
> }
> +out:
> unlock_page_cgroup(pc);
> cond_resched();
> }
> diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c
> index 3d535d5..9532169 100644
> --- a/mm/page_cgroup.c
> +++ b/mm/page_cgroup.c
> @@ -9,6 +9,7 @@
> #include <linux/vmalloc.h>
> #include <linux/cgroup.h>
> #include <linux/swapops.h>
> +#include <asm/cmpxchg.h>
>
> static void __meminit
> __init_page_cgroup(struct page_cgroup *pc, unsigned long pfn)
> @@ -335,6 +336,37 @@ not_enough_page:
> }
>
> /**
> + * swap_cgroupo_cmpxchg - cmpxchg mem_cgroup's id for this swp_entry.
> + * @end: swap entry to be cmpxchged
> + * @old: old id
> + * @new: new id
> + *
> + * Returns old id at success, 0 at failure.
> + * (There is no mem_cgroup useing 0 as its id)
> + */
> +unsigned short swap_cgroup_cmpxchg(swp_entry_t ent,
> + unsigned short old, unsigned short new)
> +{
> + int type = swp_type(ent);
> + unsigned long offset = swp_offset(ent);
> + unsigned long idx = offset / SC_PER_PAGE;
> + unsigned long pos = offset & SC_POS_MASK;
> + struct swap_cgroup_ctrl *ctrl;
> + struct page *mappage;
> + struct swap_cgroup *sc;
> +
> + ctrl = &swap_cgroup_ctrl[type];
> +
> + mappage = ctrl->map[idx];
> + sc = page_address(mappage);
> + sc += pos;
> + if (cmpxchg(&sc->id, old, new) == old)
> + return old;
> + else
> + return 0;
> +}
> +
> +/**
> * swap_cgroup_record - record mem_cgroup for this swp_entry.
> * @ent: swap entry to be recorded into
> * @mem: mem_cgroup to be recorded
> @@ -358,8 +390,7 @@ unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id)
> mappage = ctrl->map[idx];
> sc = page_address(mappage);
> sc += pos;
> - old = sc->id;
> - sc->id = id;
> + old = xchg(&sc->id, id);
>
> return old;
> }
>
* Re: [PATCH 7/8] memcg: migrate charge of swap
2009-09-17 5:25 ` KAMEZAWA Hiroyuki
@ 2009-09-17 6:17 ` Daisuke Nishimura
2009-09-17 6:28 ` KAMEZAWA Hiroyuki
0 siblings, 1 reply; 49+ messages in thread
From: Daisuke Nishimura @ 2009-09-17 6:17 UTC (permalink / raw)
To: KAMEZAWA Hiroyuki
Cc: nishimura, linux-mm, Balbir Singh, Paul Menage, Li Zefan
On Thu, 17 Sep 2009 14:25:58 +0900, KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> wrote:
> On Thu, 17 Sep 2009 11:28:17 +0900
> Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp> wrote:
>
> > This patch is another core part of this charge migration feature.
> > It enables charge migration of swap.
> >
> > Unlike mapped page, swaps of anonymous pages have its entry stored in the pte.
> > So this patch calls read_swap_cache_async() and do the same thing about the swap-in'ed
> > page as anonymous pages in migrate_charge_prepare_pte_range(), and handles !PageCgroupUsed
> > case in mem_cgroup_migrate_charge().
>
> Hmmm.....do we really need to do swap-in ? I think no.
>
I do agree with you.
I did swap-in just to remember the target pages (I could not find a good way to
remember all of the target swap entries).
If we go in the parse-pagetable-twice direction mentioned in another mail,
the list would be unnecessary anyway.
> >
> > To exchange swap_cgroup's record safely, this patch changes swap_cgroup_record()
> > to use xchg, and define new function to cmpxchg swap_cgroup's record.
> >
> I think this is enough.
>
Agreed.
> BTW, it's not very bad to do this exchange under swap_lock. (if charge is done.)
> Then, the whole logic can be simple.
>
The current memcg code in mmotm calls swap_cgroup_record() under swap_lock except in
__mem_cgroup_commit_charge_swapin().
Instead of doing all of it under swap_lock, I chose a lockless (cmpxchg) implementation.
Thanks,
Daisuke Nishimura.
* Re: [PATCH 7/8] memcg: migrate charge of swap
2009-09-17 6:17 ` Daisuke Nishimura
@ 2009-09-17 6:28 ` KAMEZAWA Hiroyuki
0 siblings, 0 replies; 49+ messages in thread
From: KAMEZAWA Hiroyuki @ 2009-09-17 6:28 UTC (permalink / raw)
To: Daisuke Nishimura; +Cc: linux-mm, Balbir Singh, Paul Menage, Li Zefan
On Thu, 17 Sep 2009 15:17:38 +0900
Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp> wrote:
> > BTW, it's not very bad to do this exchange under swap_lock. (if charge is done.)
> > Then, the whole logic can be simple.
> >
> Current memcg in mmotm calls swap_cgroup_record() under swap_lock except
> __mem_cgroup_commit_charge_swapin().
> Instead of doing all of it under swap_lock, I choose lockless(cmpxchg) implementation.
>
>
Ah, sorry for my short words.
IIUC, we guarantee an atomic swap charge/uncharge operation by
lock_page() ..... if there is a swap cache, and
swap_lock() ..... if there is no swap cache.
Then, using swap_lock here can be a choice.
Thanks,
-Kame
* [PATCH 8/8] memcg: avoid oom during charge migration
2009-09-17 2:23 [RFC][EXPERIMENTAL][PATCH 0/8] memcg: migrate charge at task move Daisuke Nishimura
` (6 preceding siblings ...)
2009-09-17 2:28 ` [PATCH 7/8] memcg: migrate charge of swap Daisuke Nishimura
@ 2009-09-17 2:29 ` Daisuke Nishimura
2009-09-17 7:01 ` [RFC][EXPERIMENTAL][PATCH 0/8] memcg: migrate charge at task move Daisuke Nishimura
8 siblings, 0 replies; 49+ messages in thread
From: Daisuke Nishimura @ 2009-09-17 2:29 UTC (permalink / raw)
To: linux-mm
Cc: Balbir Singh, KAMEZAWA Hiroyuki, Paul Menage, Li Zefan,
Daisuke Nishimura
This charge migration feature holds double charges, on both the "from" and "to"
mem_cgroup, during charge migration.
This means an unnecessary OOM can happen because of charge migration.
This patch tries to avoid such an OOM.
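A user-space pthread sketch of that idea (a condition variable stands in for the
kernel waitqueue; the names are illustrative, not the patch's code): a charging task
that hits its limit while a migration is in flight waits for the migration to finish
instead of triggering OOM.

#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t mc_waitq = PTHREAD_COND_INITIALIZER;
static int migration_in_progress = 1;	/* plays the role of "mc != NULL" */

static void *migrating_task(void *arg)
{
	(void)arg;
	sleep(1);				/* pretend to move charges */
	pthread_mutex_lock(&lock);
	migration_in_progress = 0;		/* mem_cgroup_clear_migrate_charge() */
	pthread_cond_broadcast(&mc_waitq);	/* wake_up_all(&mc_waitq) */
	pthread_mutex_unlock(&lock);
	return NULL;
}

int main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, migrating_task, NULL);

	/* A charging task over its limit: wait for the migration, do not OOM. */
	pthread_mutex_lock(&lock);
	while (migration_in_progress)
		pthread_cond_wait(&mc_waitq, &lock);
	pthread_mutex_unlock(&lock);

	printf("migration finished, retry the charge\n");
	pthread_join(t, NULL);
	return 0;
}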
Signed-off-by: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
---
mm/memcontrol.c | 18 ++++++++++++++++++
1 files changed, 18 insertions(+), 0 deletions(-)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index c8542e7..73da7e7 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -288,6 +288,8 @@ struct migrate_charge {
struct list_head list;
};
static struct migrate_charge *mc;
+static struct task_struct *mc_task;
+static DECLARE_WAIT_QUEUE_HEAD(mc_waitq);
static void mem_cgroup_get(struct mem_cgroup *mem);
static void mem_cgroup_put(struct mem_cgroup *mem);
@@ -1318,6 +1320,7 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
while (1) {
int ret = 0;
unsigned long flags = 0;
+ DEFINE_WAIT(wait);
if (mem_cgroup_is_root(mem))
goto done;
@@ -1359,6 +1362,17 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
if (mem_cgroup_check_under_limit(mem_over_limit))
continue;
+ /* try to avoid oom while someone is migrating charge */
+ if (current != mc_task) {
+ prepare_to_wait(&mc_waitq, &wait, TASK_INTERRUPTIBLE);
+ if (mc) {
+ schedule();
+ finish_wait(&mc_waitq, &wait);
+ continue;
+ }
+ finish_wait(&mc_waitq, &wait);
+ }
+
if (!nr_retries--) {
if (oom) {
mutex_lock(&memcg_tasklist);
@@ -3432,6 +3446,8 @@ static void mem_cgroup_clear_migrate_charge(void)
kfree(mc);
mc = NULL;
+ mc_task = NULL;
+ wake_up_all(&mc_waitq);
}
static int mem_cgroup_can_migrate_charge(struct mem_cgroup *mem,
@@ -3441,6 +3457,7 @@ static int mem_cgroup_can_migrate_charge(struct mem_cgroup *mem,
struct mem_cgroup *from = mem_cgroup_from_task(p);
VM_BUG_ON(mc);
+ VM_BUG_ON(mc_task);
if (from == mem)
return 0;
@@ -3453,6 +3470,7 @@ static int mem_cgroup_can_migrate_charge(struct mem_cgroup *mem,
mc->from = from;
mc->to = mem;
INIT_LIST_HEAD(&mc->list);
+ mc_task = current;
ret = migrate_charge_prepare();
* Re: [RFC][EXPERIMENTAL][PATCH 0/8] memcg: migrate charge at task move
2009-09-17 2:23 [RFC][EXPERIMENTAL][PATCH 0/8] memcg: migrate charge at task move Daisuke Nishimura
` (7 preceding siblings ...)
2009-09-17 2:29 ` [PATCH 8/8] memcg: avoid oom during charge migration Daisuke Nishimura
@ 2009-09-17 7:01 ` Daisuke Nishimura
2009-09-24 5:42 ` [RFC][PATCH 0/8] memcg: migrate charge at task move (24/Sep) Daisuke Nishimura
8 siblings, 1 reply; 49+ messages in thread
From: Daisuke Nishimura @ 2009-09-17 7:01 UTC (permalink / raw)
To: linux-mm
Cc: Balbir Singh, KAMEZAWA Hiroyuki, Paul Menage, Li Zefan,
Daisuke Nishimura
On Thu, 17 Sep 2009 11:23:04 +0900, Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp> wrote:
> Hi.
>
> These are patches for migrating memcg's charge at task move.
>
> I know we should fix res_counter's scalability problem first,
> but this feature is also important for me, so I tried making patches
> and they seem to work basically.
> I post them(based on mmotm-2009-09-14-01-57) before going further to get some feedbacks.
>
>
> Basic design:
> - Add flag file "memory.migrate_charge" to determine whether charges should be
> migrated or not. Each bit of "memory.migrate_charge" has meaning(indicate the
> type of pages the charges of which should be migrated).
> - At can_attach(), isolate pages of the task, call __mem_cgroup_try_charge,
> and move them to a private list.
> - Call mem_cgroup_move_account() at attach() about all pages on the private list
> after necessary checks under page_cgroup lock, and put back them to LRU.
> - Cancel charges about all pages remains on the private list on failure or at the end
> of charge migration, and put back them to LRU.
>
>
> I think this design is simple but it has a problem when mounted on the same hierarchy
> with cpuset. This design isolate pages of the task at can_attach(), but attach() of cpuset
> also tries to isolate pages of the task and migrate them if cpuset.memory_migrate is set.
> As a result, pages cannot be memory migrated by cpuset if we set memory.migrate_charge.
> But I think this problem can be handled by user-space to some extent: for example,
> move the task back and move it again with memory.migrate_charge unset.
> So I went to this direction as a first step(I'm considering how to avoid this issue).
>
>
As I said in another mail, I'll change the design:
- At can_attach(), parse the page table of the task, do try_charge, and remember the
number of successful try_charge calls (no page isolation).
- At attach(), parse the page table again and do move_account.
So the problem above with cpuset disappears.
Thanks,
Daisuke Nishimura.
* [RFC][PATCH 0/8] memcg: migrate charge at task move (24/Sep)
2009-09-17 7:01 ` [RFC][EXPERIMENTAL][PATCH 0/8] memcg: migrate charge at task move Daisuke Nishimura
@ 2009-09-24 5:42 ` Daisuke Nishimura
2009-09-24 5:43 ` [RFC][PATCH 1/8] cgroup: introduce cancel_attach() Daisuke Nishimura
` (7 more replies)
0 siblings, 8 replies; 49+ messages in thread
From: Daisuke Nishimura @ 2009-09-24 5:42 UTC (permalink / raw)
To: linux-mm
Cc: Balbir Singh, KAMEZAWA Hiroyuki, Paul Menage, Li Zefan,
Daisuke Nishimura
I'm sending out the latest version just to share the current code.
The patches seem to work fine in my tests.
But I'm not in a hurry for now; please look at them when you have time.
Major differences from the previous version:
- changed the "migrate_charge" flag from "int" to "bool".
- In can_attach(), parse the page table of the task, count only the number of target
ptes, and call try_charge() repeatedly. No isolation at this phase.
- In attach(), parse the page table of the task again, then isolate the target pages
and call move_account() on them one by one.
- Do no swap-in when moving swap accounting.
- Add support for shmem/tmpfs's swap.
Thanks,
Daisuke Nishimura.
* [RFC][PATCH 1/8] cgroup: introduce cancel_attach()
2009-09-24 5:42 ` [RFC][PATCH 0/8] memcg: migrate charge at task move (24/Sep) Daisuke Nishimura
@ 2009-09-24 5:43 ` Daisuke Nishimura
2009-09-24 6:33 ` KAMEZAWA Hiroyuki
2009-09-24 5:44 ` [RFC][PATCH 2/8] memcg: introduce mem_cgroup_cancel_charge() Daisuke Nishimura
` (6 subsequent siblings)
7 siblings, 1 reply; 49+ messages in thread
From: Daisuke Nishimura @ 2009-09-24 5:43 UTC (permalink / raw)
To: linux-mm
Cc: Balbir Singh, KAMEZAWA Hiroyuki, Paul Menage, Li Zefan,
Daisuke Nishimura
This patch adds a cancel_attach() operation to struct cgroup_subsys.
cancel_attach() can be used when the can_attach() operation prepares something
for the subsystem, but we should discard what can_attach() has prepared
if attaching the task/proc fails afterwards.
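A stand-alone C sketch of the rollback rule this patch implements (the subsystem
table and handlers below are made up for illustration; this is not the cgroup code
itself): when the i-th can_attach() fails, cancel_attach() runs only for the
subsystems that had already succeeded.

#include <stdio.h>

struct subsys {
	const char *name;
	int  (*can_attach)(void);
	void (*cancel_attach)(void);
};

static int ok(void)		{ return 0; }
static int fail(void)		{ return -1; }
static void noop_cancel(void)	{ }

static struct subsys subsystems[] = {
	{ "memcg",  ok,   noop_cancel },	/* prepared, must be rolled back */
	{ "cpuset", ok,   noop_cancel },	/* prepared, must be rolled back */
	{ "broken", fail, noop_cancel },	/* fails: its cancel is NOT called */
};
#define NR_SUBSYS ((int)(sizeof(subsystems) / sizeof(subsystems[0])))

static int attach_task(void)
{
	int i, retval = 0;
	struct subsys *failed = NULL;

	for (i = 0; i < NR_SUBSYS; i++) {
		retval = subsystems[i].can_attach();
		if (retval) {
			failed = &subsystems[i];
			break;
		}
	}
	if (retval) {
		/* roll back, stopping before the subsystem that failed */
		for (i = 0; i < NR_SUBSYS && &subsystems[i] != failed; i++) {
			printf("cancel_attach(%s)\n", subsystems[i].name);
			subsystems[i].cancel_attach();
		}
	}
	return retval;
}

int main(void)
{
	printf("attach_task() returned %d\n", attach_task());
	return 0;
}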
Signed-off-by: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
---
Documentation/cgroups/cgroups.txt | 12 ++++++++++++
include/linux/cgroup.h | 2 ++
kernel/cgroup.c | 36 ++++++++++++++++++++++++++++--------
3 files changed, 42 insertions(+), 8 deletions(-)
diff --git a/Documentation/cgroups/cgroups.txt b/Documentation/cgroups/cgroups.txt
index 3df4b9a..07bb678 100644
--- a/Documentation/cgroups/cgroups.txt
+++ b/Documentation/cgroups/cgroups.txt
@@ -544,6 +544,18 @@ remain valid while the caller holds cgroup_mutex. If threadgroup is
true, then a successful result indicates that all threads in the given
thread's threadgroup can be moved together.
+void cancel_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
+ struct task_struct *task, bool threadgroup)
+(cgroup_mutex held by caller)
+
+Called when a task attach operation has failed after can_attach() has succeeded.
+For example, this will be called if several subsystems are mounted on the same
+hierarchy and the can_attach() operations of some of them have succeeded, but the
+can_attach() of a later subsystem has failed. It will be called only for the
+subsystems whose can_attach() operation has succeeded. This may be useful for
+subsystems which prepare something in their can_attach() operation but should
+discard what has been prepared on failure.
+
void attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
struct cgroup *old_cgrp, struct task_struct *task,
bool threadgroup)
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 642a47f..a08edbc 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -429,6 +429,8 @@ struct cgroup_subsys {
void (*destroy)(struct cgroup_subsys *ss, struct cgroup *cgrp);
int (*can_attach)(struct cgroup_subsys *ss, struct cgroup *cgrp,
struct task_struct *tsk, bool threadgroup);
+ void (*cancel_attach)(struct cgroup_subsys *ss, struct cgroup *cgrp,
+ struct task_struct *tsk, bool threadgroup);
void (*attach)(struct cgroup_subsys *ss, struct cgroup *cgrp,
struct cgroup *old_cgrp, struct task_struct *tsk,
bool threadgroup);
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 7da6004..2d9a808 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -1700,7 +1700,7 @@ void threadgroup_fork_unlock(struct sighand_struct *sighand)
int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
{
int retval;
- struct cgroup_subsys *ss;
+ struct cgroup_subsys *ss, *fail = NULL;
struct cgroup *oldcgrp;
struct cgroupfs_root *root = cgrp->root;
@@ -1712,14 +1712,16 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
for_each_subsys(root, ss) {
if (ss->can_attach) {
retval = ss->can_attach(ss, cgrp, tsk, false);
- if (retval)
- return retval;
+ if (retval) {
+ fail = ss;
+ goto out;
+ }
}
}
retval = cgroup_task_migrate(cgrp, oldcgrp, tsk, 0);
if (retval)
- return retval;
+ goto out;
for_each_subsys(root, ss) {
if (ss->attach)
@@ -1733,7 +1735,15 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
* is no longer empty.
*/
cgroup_wakeup_rmdir_waiter(cgrp);
- return 0;
+out:
+ if (retval)
+ for_each_subsys(root, ss) {
+ if (ss == fail)
+ break;
+ if (ss->cancel_attach)
+ ss->cancel_attach(ss, cgrp, tsk, false);
+ }
+ return retval;
}
/*
@@ -1813,7 +1823,7 @@ static int css_set_prefetch(struct cgroup *cgrp, struct css_set *cg,
int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
{
int retval;
- struct cgroup_subsys *ss;
+ struct cgroup_subsys *ss, *fail = NULL;
struct cgroup *oldcgrp;
struct css_set *oldcg;
struct cgroupfs_root *root = cgrp->root;
@@ -1839,8 +1849,10 @@ int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
for_each_subsys(root, ss) {
if (ss->can_attach) {
retval = ss->can_attach(ss, cgrp, leader, true);
- if (retval)
- return retval;
+ if (retval) {
+ fail = ss;
+ goto out;
+ }
}
}
@@ -1978,6 +1990,14 @@ list_teardown:
put_css_set(cg_entry->cg);
kfree(cg_entry);
}
+out:
+ if (retval)
+ for_each_subsys(root, ss) {
+ if (ss == fail)
+ break;
+ if (ss->cancel_attach)
+ ss->cancel_attach(ss, cgrp, leader, true);
+ }
/* done! */
return retval;
}
--
1.5.6.1
* Re: [RFC][PATCH 1/8] cgroup: introduce cancel_attach()
2009-09-24 5:43 ` [RFC][PATCH 1/8] cgroup: introduce cancel_attach() Daisuke Nishimura
@ 2009-09-24 6:33 ` KAMEZAWA Hiroyuki
2009-09-24 23:39 ` Daisuke Nishimura
0 siblings, 1 reply; 49+ messages in thread
From: KAMEZAWA Hiroyuki @ 2009-09-24 6:33 UTC (permalink / raw)
To: Daisuke Nishimura; +Cc: linux-mm, Balbir Singh, Paul Menage, Li Zefan
On Thu, 24 Sep 2009 14:43:27 +0900
Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp> wrote:
> This patch adds cancel_attach() operation to struct cgroup_subsys.
> cancel_attach() can be used when can_attach() operation prepares something
> for the subsys, but we should discard what can_attach() operation has prepared
> if attach task/proc fails afterwards.
>
> Signed-off-by: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
> ---
> Documentation/cgroups/cgroups.txt | 12 ++++++++++++
> include/linux/cgroup.h | 2 ++
> kernel/cgroup.c | 36 ++++++++++++++++++++++++++++--------
> 3 files changed, 42 insertions(+), 8 deletions(-)
>
> diff --git a/Documentation/cgroups/cgroups.txt b/Documentation/cgroups/cgroups.txt
> index 3df4b9a..07bb678 100644
> --- a/Documentation/cgroups/cgroups.txt
> +++ b/Documentation/cgroups/cgroups.txt
> @@ -544,6 +544,18 @@ remain valid while the caller holds cgroup_mutex. If threadgroup is
> true, then a successful result indicates that all threads in the given
> thread's threadgroup can be moved together.
>
> +void cancel_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
> + struct task_struct *task, bool threadgroup)
> +(cgroup_mutex held by caller)
> +
> +Called when a task attach operation has failed after can_attach() has succeeded.
> +For example, this will be called if some subsystems are mounted on the same
> +hierarchy, can_attach() operations have succeeded about part of the subsystems,
> +but has failed about next subsystem. This will be called only about subsystems
> +whose can_attach() operation has succeeded. This may be useful for subsystems
> +which prepare something in can_attach() operation but should discard what has
> +been prepared on failure.
> +
Hmm..I'd like to add a text like this ..
==
+A subsystem whose can_attach() has some side-effects should provide this function.
+Then, the subsystem can implement a rollback. If not, not necessary.
==
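To make the intent concrete, a subsystem whose can_attach() has a side effect
would pair it with cancel_attach() roughly like this (a minimal sketch; "foo"
and foo_reserved are made-up names, the signatures are the ones added by this
patch, and the other mandatory callbacks are omitted):

	static atomic_t foo_reserved = ATOMIC_INIT(0);

	static int foo_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
				  struct task_struct *tsk, bool threadgroup)
	{
		/* side effect that must be rolled back if the attach fails later */
		atomic_inc(&foo_reserved);
		return 0;
	}

	static void foo_cancel_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
				      struct task_struct *tsk, bool threadgroup)
	{
		/* called only when our can_attach() succeeded but the attach failed */
		atomic_dec(&foo_reserved);
	}

	/* in the subsystem's struct cgroup_subsys: */
	.can_attach	= foo_can_attach,
	.cancel_attach	= foo_cancel_attach,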
> void attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
> struct cgroup *old_cgrp, struct task_struct *task,
> bool threadgroup)
> diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
> index 642a47f..a08edbc 100644
> --- a/include/linux/cgroup.h
> +++ b/include/linux/cgroup.h
> @@ -429,6 +429,8 @@ struct cgroup_subsys {
> void (*destroy)(struct cgroup_subsys *ss, struct cgroup *cgrp);
> int (*can_attach)(struct cgroup_subsys *ss, struct cgroup *cgrp,
> struct task_struct *tsk, bool threadgroup);
> + void (*cancel_attach)(struct cgroup_subsys *ss, struct cgroup *cgrp,
> + struct task_struct *tsk, bool threadgroup);
> void (*attach)(struct cgroup_subsys *ss, struct cgroup *cgrp,
> struct cgroup *old_cgrp, struct task_struct *tsk,
> bool threadgroup);
> diff --git a/kernel/cgroup.c b/kernel/cgroup.c
> index 7da6004..2d9a808 100644
> --- a/kernel/cgroup.c
> +++ b/kernel/cgroup.c
> @@ -1700,7 +1700,7 @@ void threadgroup_fork_unlock(struct sighand_struct *sighand)
> int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
> {
> int retval;
> - struct cgroup_subsys *ss;
> + struct cgroup_subsys *ss, *fail = NULL;
> struct cgroup *oldcgrp;
> struct cgroupfs_root *root = cgrp->root;
>
> @@ -1712,14 +1712,16 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
> for_each_subsys(root, ss) {
> if (ss->can_attach) {
> retval = ss->can_attach(ss, cgrp, tsk, false);
> - if (retval)
> - return retval;
> + if (retval) {
> + fail = ss;
> + goto out;
> + }
> }
> }
>
> retval = cgroup_task_migrate(cgrp, oldcgrp, tsk, 0);
> if (retval)
> - return retval;
> + goto out;
>
Hmm...maybe we don't have this code in the latest tree.
Ah...ok, this is from
cgroups-add-ability-to-move-all-threads-in-a-process-to-a-new-cgroup-atomically.patch
which is now hidden.
> for_each_subsys(root, ss) {
> if (ss->attach)
> @@ -1733,7 +1735,15 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
> * is no longer empty.
> */
> cgroup_wakeup_rmdir_waiter(cgrp);
> - return 0;
> +out:
> + if (retval)
> + for_each_subsys(root, ss) {
> + if (ss == fail)
> + break;
> + if (ss->cancel_attach)
> + ss->cancel_attach(ss, cgrp, tsk, false);
> + }
> + return retval;
> }
>
> /*
> @@ -1813,7 +1823,7 @@ static int css_set_prefetch(struct cgroup *cgrp, struct css_set *cg,
> int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
> {
> int retval;
> - struct cgroup_subsys *ss;
> + struct cgroup_subsys *ss, *fail = NULL;
> struct cgroup *oldcgrp;
> struct css_set *oldcg;
> struct cgroupfs_root *root = cgrp->root;
> @@ -1839,8 +1849,10 @@ int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
> for_each_subsys(root, ss) {
> if (ss->can_attach) {
> retval = ss->can_attach(ss, cgrp, leader, true);
> - if (retval)
> - return retval;
> + if (retval) {
> + fail = ss;
> + goto out;
> + }
> }
> }
>
> @@ -1978,6 +1990,14 @@ list_teardown:
> put_css_set(cg_entry->cg);
> kfree(cg_entry);
> }
> +out:
> + if (retval)
> + for_each_subsys(root, ss) {
> + if (ss == fail)
> + break;
> + if (ss->cancel_attach)
> + ss->cancel_attach(ss, cgrp, tsk, true);
> + }
> /* done! */
> return retval;
> }
No objections from me. just wait for comments from Paul or Li.
I wonder if we add cancel_attach(), can_attach() should be renamed to
prepare_attach() or some. ;)
Thanks,
-Kame
* Re: [RFC][PATCH 1/8] cgroup: introduce cancel_attach()
2009-09-24 6:33 ` KAMEZAWA Hiroyuki
@ 2009-09-24 23:39 ` Daisuke Nishimura
0 siblings, 0 replies; 49+ messages in thread
From: Daisuke Nishimura @ 2009-09-24 23:39 UTC (permalink / raw)
To: KAMEZAWA Hiroyuki
Cc: nishimura, linux-mm, Balbir Singh, Paul Menage, Li Zefan
On Thu, 24 Sep 2009 15:33:09 +0900, KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> wrote:
> On Thu, 24 Sep 2009 14:43:27 +0900
> Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp> wrote:
>
> > This patch adds cancel_attach() operation to struct cgroup_subsys.
> > cancel_attach() can be used when can_attach() operation prepares something
> > for the subsys, but we should discard what can_attach() operation has prepared
> > if attach task/proc fails afterwards.
> >
> > Signed-off-by: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
> > ---
> > Documentation/cgroups/cgroups.txt | 12 ++++++++++++
> > include/linux/cgroup.h | 2 ++
> > kernel/cgroup.c | 36 ++++++++++++++++++++++++++++--------
> > 3 files changed, 42 insertions(+), 8 deletions(-)
> >
> > diff --git a/Documentation/cgroups/cgroups.txt b/Documentation/cgroups/cgroups.txt
> > index 3df4b9a..07bb678 100644
> > --- a/Documentation/cgroups/cgroups.txt
> > +++ b/Documentation/cgroups/cgroups.txt
> > @@ -544,6 +544,18 @@ remain valid while the caller holds cgroup_mutex. If threadgroup is
> > true, then a successful result indicates that all threads in the given
> > thread's threadgroup can be moved together.
> >
> > +void cancel_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
> > + struct task_struct *task, bool threadgroup)
> > +(cgroup_mutex held by caller)
> > +
> > +Called when a task attach operation has failed after can_attach() has succeeded.
> > +For example, this will be called if some subsystems are mounted on the same
> > +hierarchy, can_attach() operations have succeeded about part of the subsystems,
> > +but has failed about next subsystem. This will be called only about subsystems
> > +whose can_attach() operation has succeeded. This may be useful for subsystems
> > +which prepare something in can_attach() operation but should discard what has
> > +been prepared on failure.
> > +
>
> Hmm..I'd like to add a text like this ..
> ==
> +A subsystem whose can_attach() has some side-effects should provide this function.
> +Then, the subsytem can implement a rollback. If not, not necessary.
> ==
>
O.K.
will add in next post.
> > void attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
> > struct cgroup *old_cgrp, struct task_struct *task,
> > bool threadgroup)
> > diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
> > index 642a47f..a08edbc 100644
> > --- a/include/linux/cgroup.h
> > +++ b/include/linux/cgroup.h
> > @@ -429,6 +429,8 @@ struct cgroup_subsys {
> > void (*destroy)(struct cgroup_subsys *ss, struct cgroup *cgrp);
> > int (*can_attach)(struct cgroup_subsys *ss, struct cgroup *cgrp,
> > struct task_struct *tsk, bool threadgroup);
> > + void (*cancel_attach)(struct cgroup_subsys *ss, struct cgroup *cgrp,
> > + struct task_struct *tsk, bool threadgroup);
> > void (*attach)(struct cgroup_subsys *ss, struct cgroup *cgrp,
> > struct cgroup *old_cgrp, struct task_struct *tsk,
> > bool threadgroup);
> > diff --git a/kernel/cgroup.c b/kernel/cgroup.c
> > index 7da6004..2d9a808 100644
> > --- a/kernel/cgroup.c
> > +++ b/kernel/cgroup.c
> > @@ -1700,7 +1700,7 @@ void threadgroup_fork_unlock(struct sighand_struct *sighand)
> > int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
> > {
> > int retval;
> > - struct cgroup_subsys *ss;
> > + struct cgroup_subsys *ss, *fail = NULL;
> > struct cgroup *oldcgrp;
> > struct cgroupfs_root *root = cgrp->root;
> >
> > @@ -1712,14 +1712,16 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
> > for_each_subsys(root, ss) {
> > if (ss->can_attach) {
> > retval = ss->can_attach(ss, cgrp, tsk, false);
> > - if (retval)
> > - return retval;
> > + if (retval) {
> > + fail = ss;
> > + goto out;
> > + }
> > }
> > }
> >
> > retval = cgroup_task_migrate(cgrp, oldcgrp, tsk, 0);
> > if (retval)
> > - return retval;
> > + goto out;
> >
>
> Hmm...maybe we don't have this code in the latest tree.
> Ah...ok, this is from
> cgroups-add-ability-to-move-all-threads-in-a-process-to-a-new-cgroup-atomically
> .patch
> which is now hidden.
>
Indeed.. these parts are different now.
My patches are based on mmotm-2009-09-14-01-57; I'm sorry for confusing you.
Thanks,
Daisuke Nishimura.
>
>
> > for_each_subsys(root, ss) {
> > if (ss->attach)
> > @@ -1733,7 +1735,15 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
> > * is no longer empty.
> > */
> > cgroup_wakeup_rmdir_waiter(cgrp);
> > - return 0;
> > +out:
> > + if (retval)
> > + for_each_subsys(root, ss) {
> > + if (ss == fail)
> > + break;
> > + if (ss->cancel_attach)
> > + ss->cancel_attach(ss, cgrp, tsk, false);
> > + }
> > + return retval;
> > }
> >
> > /*
> > @@ -1813,7 +1823,7 @@ static int css_set_prefetch(struct cgroup *cgrp, struct css_set *cg,
> > int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
> > {
> > int retval;
> > - struct cgroup_subsys *ss;
> > + struct cgroup_subsys *ss, *fail = NULL;
> > struct cgroup *oldcgrp;
> > struct css_set *oldcg;
> > struct cgroupfs_root *root = cgrp->root;
> > @@ -1839,8 +1849,10 @@ int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
> > for_each_subsys(root, ss) {
> > if (ss->can_attach) {
> > retval = ss->can_attach(ss, cgrp, leader, true);
> > - if (retval)
> > - return retval;
> > + if (retval) {
> > + fail = ss;
> > + goto out;
> > + }
> > }
> > }
> >
> > @@ -1978,6 +1990,14 @@ list_teardown:
> > put_css_set(cg_entry->cg);
> > kfree(cg_entry);
> > }
> > +out:
> > + if (retval)
> > + for_each_subsys(root, ss) {
> > + if (ss == fail)
> > + break;
> > + if (ss->cancel_attach)
> > + ss->cancel_attach(ss, cgrp, tsk, true);
> > + }
> > /* done! */
> > return retval;
> > }
>
> No objections from me. just wait for comments from Paul or Li.
>
> I wonder if we add cancel_attach(), can_attach() should be renamed to
> prepare_attach() or some. ;)
>
>
> Thanks,
> -Kame
>
>
>
* [RFC][PATCH 2/8] memcg: introduce mem_cgroup_cancel_charge()
2009-09-24 5:42 ` [RFC][PATCH 0/8] memcg: migrate charge at task move (24/Sep) Daisuke Nishimura
2009-09-24 5:43 ` [RFC][PATCH 1/8] cgroup: introduce cancel_attach() Daisuke Nishimura
@ 2009-09-24 5:44 ` Daisuke Nishimura
2009-09-24 5:46 ` [RFC][PATCH 3/8] memcg: cleanup mem_cgroup_move_parent() Daisuke Nishimura
` (5 subsequent siblings)
7 siblings, 0 replies; 49+ messages in thread
From: Daisuke Nishimura @ 2009-09-24 5:44 UTC (permalink / raw)
To: linux-mm
Cc: Balbir Singh, KAMEZAWA Hiroyuki, Paul Menage, Li Zefan,
Daisuke Nishimura
(This is the same patch which is merged into KAMEZAWA-san's set)
There are some places calling both res_counter_uncharge() and css_put()
to cancel the charge and the refcnt we have got by mem_cgroup_try_charge().
This patch introduces mem_cgroup_cancel_charge() and calls it in those places.
Signed-off-by: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
---
mm/memcontrol.c | 39 ++++++++++++++++++---------------------
1 files changed, 18 insertions(+), 21 deletions(-)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index e2b98a6..b2b68b4 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1371,6 +1371,21 @@ nomem:
}
/*
+ * Sometimes we have to undo a charge we got by try_charge().
+ * This function is for that; it does the uncharge and puts the css refcnt
+ * gotten by try_charge().
+ */
+static void __mem_cgroup_cancel_charge(struct mem_cgroup *mem)
+{
+ if (!mem_cgroup_is_root(mem)) {
+ res_counter_uncharge(&mem->res, PAGE_SIZE, NULL);
+ if (do_swap_account)
+ res_counter_uncharge(&mem->memsw, PAGE_SIZE, NULL);
+ }
+ css_put(&mem->css);
+}
+
+/*
* A helper function to get mem_cgroup from ID. must be called under
* rcu_read_lock(). The caller must check css_is_removed() or some if
* it's concern. (dropping refcnt from swap can be called against removed
@@ -1436,13 +1451,7 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
lock_page_cgroup(pc);
if (unlikely(PageCgroupUsed(pc))) {
unlock_page_cgroup(pc);
- if (!mem_cgroup_is_root(mem)) {
- res_counter_uncharge(&mem->res, PAGE_SIZE, NULL);
- if (do_swap_account)
- res_counter_uncharge(&mem->memsw, PAGE_SIZE,
- NULL);
- }
- css_put(&mem->css);
+ __mem_cgroup_cancel_charge(mem);
return;
}
@@ -1606,14 +1615,7 @@ static int mem_cgroup_move_parent(struct page_cgroup *pc,
cancel:
put_page(page);
uncharge:
- /* drop extra refcnt by try_charge() */
- css_put(&parent->css);
- /* uncharge if move fails */
- if (!mem_cgroup_is_root(parent)) {
- res_counter_uncharge(&parent->res, PAGE_SIZE, NULL);
- if (do_swap_account)
- res_counter_uncharge(&parent->memsw, PAGE_SIZE, NULL);
- }
+ __mem_cgroup_cancel_charge(parent);
return ret;
}
@@ -1830,12 +1832,7 @@ void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *mem)
return;
if (!mem)
return;
- if (!mem_cgroup_is_root(mem)) {
- res_counter_uncharge(&mem->res, PAGE_SIZE, NULL);
- if (do_swap_account)
- res_counter_uncharge(&mem->memsw, PAGE_SIZE, NULL);
- }
- css_put(&mem->css);
+ __mem_cgroup_cancel_charge(mem);
}
--
1.5.6.1
* [RFC][PATCH 3/8] memcg: cleanup mem_cgroup_move_parent()
2009-09-24 5:42 ` [RFC][PATCH 0/8] memcg: migrate charge at task move (24/Sep) Daisuke Nishimura
2009-09-24 5:43 ` [RFC][PATCH 1/8] cgroup: introduce cancel_attach() Daisuke Nishimura
2009-09-24 5:44 ` [RFC][PATCH 2/8] memcg: introduce mem_cgroup_cancel_charge() Daisuke Nishimura
@ 2009-09-24 5:46 ` Daisuke Nishimura
2009-09-24 6:37 ` KAMEZAWA Hiroyuki
2009-09-24 5:47 ` [RFC][PATCH 4/8] memcg: add interface to migrate charge Daisuke Nishimura
` (4 subsequent siblings)
7 siblings, 1 reply; 49+ messages in thread
From: Daisuke Nishimura @ 2009-09-24 5:46 UTC (permalink / raw)
To: linux-mm
Cc: Balbir Singh, KAMEZAWA Hiroyuki, Paul Menage, Li Zefan,
Daisuke Nishimura
mem_cgroup_move_parent() calls try_charge first and cancel_charge on failure.
IMHO, charge/uncharge (especially charge) is a high-cost operation, so we should
avoid it as far as possible.
This patch tries to delay try_charge() in mem_cgroup_move_parent() by re-ordering
the checks it does.
And this patch renames mem_cgroup_move_account() to __mem_cgroup_move_account(),
changes the return value of __mem_cgroup_move_account() from int to void,
and adds a new wrapper(mem_cgroup_move_account()), which checks whether a @pc
is valid for account move and calls __mem_cgroup_move_account().
This patch removes the last caller of trylock_page_cgroup(), so removes its
definition too.
Signed-off-by: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
---
include/linux/page_cgroup.h | 7 +--
mm/memcontrol.c | 92 ++++++++++++++++++++-----------------------
2 files changed, 45 insertions(+), 54 deletions(-)
diff --git a/include/linux/page_cgroup.h b/include/linux/page_cgroup.h
index 7a3627e..fffa835 100644
--- a/include/linux/page_cgroup.h
+++ b/include/linux/page_cgroup.h
@@ -57,6 +57,8 @@ static inline void ClearPageCgroup##uname(struct page_cgroup *pc) \
static inline int TestClearPageCgroup##uname(struct page_cgroup *pc) \
{ return test_and_clear_bit(PCG_##lname, &pc->flags); }
+TESTPCGFLAG(Locked, LOCK)
+
/* Cache flag is set only once (at allocation) */
TESTPCGFLAG(Cache, CACHE)
CLEARPCGFLAG(Cache, CACHE)
@@ -86,11 +88,6 @@ static inline void lock_page_cgroup(struct page_cgroup *pc)
bit_spin_lock(PCG_LOCK, &pc->flags);
}
-static inline int trylock_page_cgroup(struct page_cgroup *pc)
-{
- return bit_spin_trylock(PCG_LOCK, &pc->flags);
-}
-
static inline void unlock_page_cgroup(struct page_cgroup *pc)
{
bit_spin_unlock(PCG_LOCK, &pc->flags);
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index b2b68b4..7e8874d 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1484,27 +1484,22 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
}
/**
- * mem_cgroup_move_account - move account of the page
+ * __mem_cgroup_move_account - move account of the page
* @pc: page_cgroup of the page.
* @from: mem_cgroup which the page is moved from.
* @to: mem_cgroup which the page is moved to. @from != @to.
*
* The caller must confirm following.
* - page is not on LRU (isolate_page() is useful.)
- *
- * returns 0 at success,
- * returns -EBUSY when lock is busy or "pc" is unstable.
+ * - the pc is locked, used, and ->mem_cgroup points to @from.
*
* This function does "uncharge" from old cgroup but doesn't do "charge" to
* new cgroup. It should be done by a caller.
*/
-static int mem_cgroup_move_account(struct page_cgroup *pc,
+static void __mem_cgroup_move_account(struct page_cgroup *pc,
struct mem_cgroup *from, struct mem_cgroup *to)
{
- struct mem_cgroup_per_zone *from_mz, *to_mz;
- int nid, zid;
- int ret = -EBUSY;
struct page *page;
int cpu;
struct mem_cgroup_stat *stat;
@@ -1512,20 +1507,9 @@ static int mem_cgroup_move_account(struct page_cgroup *pc,
VM_BUG_ON(from == to);
VM_BUG_ON(PageLRU(pc->page));
-
- nid = page_cgroup_nid(pc);
- zid = page_cgroup_zid(pc);
- from_mz = mem_cgroup_zoneinfo(from, nid, zid);
- to_mz = mem_cgroup_zoneinfo(to, nid, zid);
-
- if (!trylock_page_cgroup(pc))
- return ret;
-
- if (!PageCgroupUsed(pc))
- goto out;
-
- if (pc->mem_cgroup != from)
- goto out;
+ VM_BUG_ON(!PageCgroupLocked(pc));
+ VM_BUG_ON(!PageCgroupUsed(pc));
+ VM_BUG_ON(pc->mem_cgroup != from);
if (!mem_cgroup_is_root(from))
res_counter_uncharge(&from->res, PAGE_SIZE, NULL);
@@ -1554,15 +1538,28 @@ static int mem_cgroup_move_account(struct page_cgroup *pc,
css_get(&to->css);
pc->mem_cgroup = to;
mem_cgroup_charge_statistics(to, pc, true);
- ret = 0;
-out:
- unlock_page_cgroup(pc);
/*
* We charges against "to" which may not have any tasks. Then, "to"
* can be under rmdir(). But in current implementation, caller of
* this function is just force_empty() and it's garanteed that
* "to" is never removed. So, we don't check rmdir status here.
*/
+}
+
+/*
+ * check whether the @pc is valid for account move and call
+ * __mem_cgroup_move_account()
+ */
+static int mem_cgroup_move_account(struct page_cgroup *pc,
+ struct mem_cgroup *from, struct mem_cgroup *to)
+{
+ int ret = -EINVAL;
+ lock_page_cgroup(pc);
+ if (PageCgroupUsed(pc) && pc->mem_cgroup == from) {
+ __mem_cgroup_move_account(pc, from, to);
+ ret = 0;
+ }
+ unlock_page_cgroup(pc);
return ret;
}
@@ -1584,38 +1581,35 @@ static int mem_cgroup_move_parent(struct page_cgroup *pc,
if (!pcg)
return -EINVAL;
+ ret = -EBUSY;
+ if (!get_page_unless_zero(page))
+ goto out;
+ if (isolate_lru_page(page))
+ goto put;
- parent = mem_cgroup_from_cont(pcg);
-
+ ret = -EINVAL;
+ lock_page_cgroup(pc);
+ if (!PageCgroupUsed(pc) || pc->mem_cgroup != child) { /* early check */
+ unlock_page_cgroup(pc);
+ goto put_back;
+ }
+ unlock_page_cgroup(pc);
+ parent = mem_cgroup_from_cont(pcg);
ret = __mem_cgroup_try_charge(NULL, gfp_mask, &parent, false, page);
if (ret || !parent)
- return ret;
-
- if (!get_page_unless_zero(page)) {
- ret = -EBUSY;
- goto uncharge;
- }
-
- ret = isolate_lru_page(page);
-
- if (ret)
- goto cancel;
+ goto put_back;
ret = mem_cgroup_move_account(pc, child, parent);
-
+ if (!ret)
+ css_put(&parent->css); /* drop extra refcnt by try_charge() */
+ else
+ __mem_cgroup_cancel_charge(parent); /* does css_put */
+put_back:
putback_lru_page(page);
- if (!ret) {
- put_page(page);
- /* drop extra refcnt by try_charge() */
- css_put(&parent->css);
- return 0;
- }
-
-cancel:
+put:
put_page(page);
-uncharge:
- __mem_cgroup_cancel_charge(parent);
+out:
return ret;
}
--
1.5.6.1
* Re: [RFC][PATCH 3/8] memcg: cleanup mem_cgroup_move_parent()
2009-09-24 5:46 ` [RFC][PATCH 3/8] memcg: cleanup mem_cgroup_move_parent() Daisuke Nishimura
@ 2009-09-24 6:37 ` KAMEZAWA Hiroyuki
2009-09-24 6:54 ` Daisuke Nishimura
0 siblings, 1 reply; 49+ messages in thread
From: KAMEZAWA Hiroyuki @ 2009-09-24 6:37 UTC (permalink / raw)
To: Daisuke Nishimura; +Cc: linux-mm, Balbir Singh, Paul Menage, Li Zefan
On Thu, 24 Sep 2009 14:46:02 +0900
Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp> wrote:
> mem_cgroup_move_parent() calls try_charge first and cancel_charge on failure.
> IMHO, charge/uncharge(especially charge) is high cost operation, so we should
> avoid it as far as possible.
>
> This patch tries to delay try_charge in mem_cgroup_move_parent() by re-ordering
> checks it does.
>
> And this patch renames mem_cgroup_move_account() to __mem_cgroup_move_account(),
> changes the return value of __mem_cgroup_move_account() from int to void,
> and adds a new wrapper(mem_cgroup_move_account()), which checks whether a @pc
> is valid for account move and calls __mem_cgroup_move_account().
>
> This patch removes the last caller of trylock_page_cgroup(), so removes its
> definition too.
>
> Signed-off-by: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
> ---
> include/linux/page_cgroup.h | 7 +--
> mm/memcontrol.c | 92 ++++++++++++++++++++-----------------------
> 2 files changed, 45 insertions(+), 54 deletions(-)
>
> diff --git a/include/linux/page_cgroup.h b/include/linux/page_cgroup.h
> index 7a3627e..fffa835 100644
> --- a/include/linux/page_cgroup.h
> +++ b/include/linux/page_cgroup.h
> @@ -57,6 +57,8 @@ static inline void ClearPageCgroup##uname(struct page_cgroup *pc) \
> static inline int TestClearPageCgroup##uname(struct page_cgroup *pc) \
> { return test_and_clear_bit(PCG_##lname, &pc->flags); }
>
> +TESTPCGFLAG(Locked, LOCK)
> +
> /* Cache flag is set only once (at allocation) */
> TESTPCGFLAG(Cache, CACHE)
> CLEARPCGFLAG(Cache, CACHE)
> @@ -86,11 +88,6 @@ static inline void lock_page_cgroup(struct page_cgroup *pc)
> bit_spin_lock(PCG_LOCK, &pc->flags);
> }
>
> -static inline int trylock_page_cgroup(struct page_cgroup *pc)
> -{
> - return bit_spin_trylock(PCG_LOCK, &pc->flags);
> -}
> -
> static inline void unlock_page_cgroup(struct page_cgroup *pc)
> {
> bit_spin_unlock(PCG_LOCK, &pc->flags);
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index b2b68b4..7e8874d 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -1484,27 +1484,22 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
> }
>
> /**
> - * mem_cgroup_move_account - move account of the page
> + * __mem_cgroup_move_account - move account of the page
> * @pc: page_cgroup of the page.
> * @from: mem_cgroup which the page is moved from.
> * @to: mem_cgroup which the page is moved to. @from != @to.
> *
> * The caller must confirm following.
> * - page is not on LRU (isolate_page() is useful.)
> - *
> - * returns 0 at success,
> - * returns -EBUSY when lock is busy or "pc" is unstable.
> + * - the pc is locked, used, and ->mem_cgroup points to @from.
> *
> * This function does "uncharge" from old cgroup but doesn't do "charge" to
> * new cgroup. It should be done by a caller.
> */
>
> -static int mem_cgroup_move_account(struct page_cgroup *pc,
> +static void __mem_cgroup_move_account(struct page_cgroup *pc,
> struct mem_cgroup *from, struct mem_cgroup *to)
> {
> - struct mem_cgroup_per_zone *from_mz, *to_mz;
> - int nid, zid;
> - int ret = -EBUSY;
> struct page *page;
> int cpu;
> struct mem_cgroup_stat *stat;
> @@ -1512,20 +1507,9 @@ static int mem_cgroup_move_account(struct page_cgroup *pc,
>
> VM_BUG_ON(from == to);
> VM_BUG_ON(PageLRU(pc->page));
> -
> - nid = page_cgroup_nid(pc);
> - zid = page_cgroup_zid(pc);
> - from_mz = mem_cgroup_zoneinfo(from, nid, zid);
> - to_mz = mem_cgroup_zoneinfo(to, nid, zid);
> -
> - if (!trylock_page_cgroup(pc))
> - return ret;
> -
> - if (!PageCgroupUsed(pc))
> - goto out;
> -
> - if (pc->mem_cgroup != from)
> - goto out;
> + VM_BUG_ON(!PageCgroupLocked(pc));
> + VM_BUG_ON(!PageCgroupUsed(pc));
> + VM_BUG_ON(pc->mem_cgroup != from);
>
> if (!mem_cgroup_is_root(from))
> res_counter_uncharge(&from->res, PAGE_SIZE, NULL);
> @@ -1554,15 +1538,28 @@ static int mem_cgroup_move_account(struct page_cgroup *pc,
> css_get(&to->css);
> pc->mem_cgroup = to;
> mem_cgroup_charge_statistics(to, pc, true);
> - ret = 0;
> -out:
> - unlock_page_cgroup(pc);
> /*
> * We charges against "to" which may not have any tasks. Then, "to"
> * can be under rmdir(). But in current implementation, caller of
> * this function is just force_empty() and it's garanteed that
> * "to" is never removed. So, we don't check rmdir status here.
> */
> +}
> +
> +/*
> + * check whether the @pc is valid for account move and call
> + * __mem_cgroup_move_account()
> + */
> +static int mem_cgroup_move_account(struct page_cgroup *pc,
> + struct mem_cgroup *from, struct mem_cgroup *to)
> +{
> + int ret = -EINVAL;
> + lock_page_cgroup(pc);
> + if (PageCgroupUsed(pc) && pc->mem_cgroup == from) {
> + __mem_cgroup_move_account(pc, from, to);
> + ret = 0;
> + }
> + unlock_page_cgroup(pc);
> return ret;
> }
>
> @@ -1584,38 +1581,35 @@ static int mem_cgroup_move_parent(struct page_cgroup *pc,
> if (!pcg)
> return -EINVAL;
>
> + ret = -EBUSY;
> + if (!get_page_unless_zero(page))
> + goto out;
> + if (isolate_lru_page(page))
> + goto put;
>
> - parent = mem_cgroup_from_cont(pcg);
> -
> + ret = -EINVAL;
> + lock_page_cgroup(pc);
> + if (!PageCgroupUsed(pc) || pc->mem_cgroup != child) { /* early check */
> + unlock_page_cgroup(pc);
> + goto put_back;
> + }
I wonder...it's ok to remove this check. We'll do later and
racy case will be often. Then, the code will be simpler.
Any ideas ?
Thanks,
-Kame
> + unlock_page_cgroup(pc);
>
> + parent = mem_cgroup_from_cont(pcg);
> ret = __mem_cgroup_try_charge(NULL, gfp_mask, &parent, false, page);
> if (ret || !parent)
> - return ret;
> -
> - if (!get_page_unless_zero(page)) {
> - ret = -EBUSY;
> - goto uncharge;
> - }
> -
> - ret = isolate_lru_page(page);
> -
> - if (ret)
> - goto cancel;
> + goto put_back;
>
> ret = mem_cgroup_move_account(pc, child, parent);
> -
> + if (!ret)
> + css_put(&parent->css); /* drop extra refcnt by try_charge() */
> + else
> + __mem_cgroup_cancel_charge(parent); /* does css_put */
> +put_back:
> putback_lru_page(page);
> - if (!ret) {
> - put_page(page);
> - /* drop extra refcnt by try_charge() */
> - css_put(&parent->css);
> - return 0;
> - }
> -
> -cancel:
> +put:
> put_page(page);
> -uncharge:
> - __mem_cgroup_cancel_charge(parent);
> +out:
> return ret;
> }
>
> --
> 1.5.6.1
>
>
* Re: [RFC][PATCH 3/8] memcg: cleanup mem_cgroup_move_parent()
2009-09-24 6:37 ` KAMEZAWA Hiroyuki
@ 2009-09-24 6:54 ` Daisuke Nishimura
0 siblings, 0 replies; 49+ messages in thread
From: Daisuke Nishimura @ 2009-09-24 6:54 UTC (permalink / raw)
To: KAMEZAWA Hiroyuki
Cc: nishimura, linux-mm, Balbir Singh, Paul Menage, Li Zefan
> > @@ -1584,38 +1581,35 @@ static int mem_cgroup_move_parent(struct page_cgroup *pc,
> > if (!pcg)
> > return -EINVAL;
> >
> > + ret = -EBUSY;
> > + if (!get_page_unless_zero(page))
> > + goto out;
> > + if (isolate_lru_page(page))
> > + goto put;
> >
> > - parent = mem_cgroup_from_cont(pcg);
> > -
> > + ret = -EINVAL;
> > + lock_page_cgroup(pc);
> > + if (!PageCgroupUsed(pc) || pc->mem_cgroup != child) { /* early check */
> > + unlock_page_cgroup(pc);
> > + goto put_back;
> > + }
>
> I wonder...it's ok to remove this check. We'll do later and
> racy case will be often. Then, the codes will be simpler.
> Any ideas ?
>
Yes, it can be removed. mem_cgroup_move_account() called later will check it again.
It's just an early check to avoid try_charge() if possible, but it's O.K.
for me to remove it.
will fix in next post.
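With that check dropped, the middle of mem_cgroup_move_parent() would look
roughly like this (only a sketch put together from the hunk above, not the
actual repost):

	ret = -EBUSY;
	if (!get_page_unless_zero(page))
		goto out;
	if (isolate_lru_page(page))
		goto put;

	parent = mem_cgroup_from_cont(pcg);
	ret = __mem_cgroup_try_charge(NULL, gfp_mask, &parent, false, page);
	if (ret || !parent)
		goto put_back;

	/* Used/pc->mem_cgroup are re-checked under lock_page_cgroup() here */
	ret = mem_cgroup_move_account(pc, child, parent);
	if (!ret)
		css_put(&parent->css);	/* drop extra refcnt by try_charge() */
	else
		__mem_cgroup_cancel_charge(parent);	/* does css_put() */
put_back:
	putback_lru_page(page);
put:
	put_page(page);
out:
	return ret;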
Thanks,
Daisuke Nishimura.
> Thanks,
> -Kame
>
> > + unlock_page_cgroup(pc);
> >
> > + parent = mem_cgroup_from_cont(pcg);
> > ret = __mem_cgroup_try_charge(NULL, gfp_mask, &parent, false, page);
> > if (ret || !parent)
> > - return ret;
> > -
> > - if (!get_page_unless_zero(page)) {
> > - ret = -EBUSY;
> > - goto uncharge;
> > - }
> > -
> > - ret = isolate_lru_page(page);
> > -
> > - if (ret)
> > - goto cancel;
> > + goto put_back;
> >
> > ret = mem_cgroup_move_account(pc, child, parent);
> > -
> > + if (!ret)
> > + css_put(&parent->css); /* drop extra refcnt by try_charge() */
> > + else
> > + __mem_cgroup_cancel_charge(parent); /* does css_put */
> > +put_back:
> > putback_lru_page(page);
> > - if (!ret) {
> > - put_page(page);
> > - /* drop extra refcnt by try_charge() */
> > - css_put(&parent->css);
> > - return 0;
> > - }
> > -
> > -cancel:
> > +put:
> > put_page(page);
> > -uncharge:
> > - __mem_cgroup_cancel_charge(parent);
> > +out:
> > return ret;
> > }
> >
> > --
> > 1.5.6.1
> >
> >
>
* [RFC][PATCH 4/8] memcg: add interface to migrate charge
2009-09-24 5:42 ` [RFC][PATCH 0/8] memcg: migrate charge at task move (24/Sep) Daisuke Nishimura
` (2 preceding siblings ...)
2009-09-24 5:46 ` [RFC][PATCH 3/8] memcg: cleanup mem_cgroup_move_parent() Daisuke Nishimura
@ 2009-09-24 5:47 ` Daisuke Nishimura
2009-09-24 6:54 ` KAMEZAWA Hiroyuki
2009-09-24 5:48 ` [RFC][PATCH 5/8] memcg: migrate charge of mapped page Daisuke Nishimura
` (3 subsequent siblings)
7 siblings, 1 reply; 49+ messages in thread
From: Daisuke Nishimura @ 2009-09-24 5:47 UTC (permalink / raw)
To: linux-mm
Cc: Balbir Singh, KAMEZAWA Hiroyuki, Paul Menage, Li Zefan,
Daisuke Nishimura
This patch adds a "memory.migrate_charge" file and handlers for it.
Signed-off-by: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
---
mm/memcontrol.c | 65 +++++++++++++++++++++++++++++++++++++++++++++++++++---
1 files changed, 61 insertions(+), 4 deletions(-)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 7e8874d..30499d9 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -225,6 +225,8 @@ struct mem_cgroup {
/* set when res.limit == memsw.limit */
bool memsw_is_minimum;
+ bool migrate_charge;
+
/*
* statistics. This must be placed at the end of memcg.
*/
@@ -2843,6 +2845,27 @@ static int mem_cgroup_swappiness_write(struct cgroup *cgrp, struct cftype *cft,
return 0;
}
+static u64 mem_cgroup_migrate_charge_read(struct cgroup *cgrp,
+ struct cftype *cft)
+{
+ return mem_cgroup_from_cont(cgrp)->migrate_charge;
+}
+
+static int mem_cgroup_migrate_charge_write(struct cgroup *cgrp,
+ struct cftype *cft, u64 val)
+{
+ struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp);
+
+ if (val != 0 && val != 1)
+ return -EINVAL;
+
+ cgroup_lock();
+ mem->migrate_charge = val;
+ cgroup_unlock();
+
+ return 0;
+}
+
static struct cftype mem_cgroup_files[] = {
{
@@ -2892,6 +2915,11 @@ static struct cftype mem_cgroup_files[] = {
.read_u64 = mem_cgroup_swappiness_read,
.write_u64 = mem_cgroup_swappiness_write,
},
+ {
+ .name = "migrate_charge",
+ .read_u64 = mem_cgroup_migrate_charge_read,
+ .write_u64 = mem_cgroup_migrate_charge_write,
+ },
};
#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
@@ -3132,6 +3160,7 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
if (parent)
mem->swappiness = get_swappiness(parent);
atomic_set(&mem->refcnt, 1);
+ mem->migrate_charge = 0;
return &mem->css;
free_out:
__mem_cgroup_free(mem);
@@ -3168,6 +3197,35 @@ static int mem_cgroup_populate(struct cgroup_subsys *ss,
return ret;
}
+static int mem_cgroup_can_migrate_charge(struct mem_cgroup *mem,
+ struct task_struct *p)
+{
+ return 0;
+}
+
+static int mem_cgroup_can_attach(struct cgroup_subsys *ss,
+ struct cgroup *cont,
+ struct task_struct *p,
+ bool threadgroup)
+{
+ struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
+
+ if (mem->migrate_charge && thread_group_leader(p))
+ return mem_cgroup_can_migrate_charge(mem, p);
+ return 0;
+}
+
+static void mem_cgroup_cancel_attach(struct cgroup_subsys *ss,
+ struct cgroup *cont,
+ struct task_struct *p,
+ bool threadgroup)
+{
+}
+
+static void mem_cgroup_migrate_charge(void)
+{
+}
+
static void mem_cgroup_move_task(struct cgroup_subsys *ss,
struct cgroup *cont,
struct cgroup *old_cont,
@@ -3175,10 +3233,7 @@ static void mem_cgroup_move_task(struct cgroup_subsys *ss,
bool threadgroup)
{
mutex_lock(&memcg_tasklist);
- /*
- * FIXME: It's better to move charges of this process from old
- * memcg to new memcg. But it's just on TODO-List now.
- */
+ mem_cgroup_migrate_charge();
mutex_unlock(&memcg_tasklist);
}
@@ -3189,6 +3244,8 @@ struct cgroup_subsys mem_cgroup_subsys = {
.pre_destroy = mem_cgroup_pre_destroy,
.destroy = mem_cgroup_destroy,
.populate = mem_cgroup_populate,
+ .can_attach = mem_cgroup_can_attach,
+ .cancel_attach = mem_cgroup_cancel_attach,
.attach = mem_cgroup_move_task,
.early_init = 0,
.use_id = 1,
--
1.5.6.1
* Re: [RFC][PATCH 4/8] memcg: add interface to migrate charge
2009-09-24 5:47 ` [RFC][PATCH 4/8] memcg: add interface to migrate charge Daisuke Nishimura
@ 2009-09-24 6:54 ` KAMEZAWA Hiroyuki
2009-09-24 23:39 ` Daisuke Nishimura
0 siblings, 1 reply; 49+ messages in thread
From: KAMEZAWA Hiroyuki @ 2009-09-24 6:54 UTC (permalink / raw)
To: Daisuke Nishimura; +Cc: linux-mm, Balbir Singh, Paul Menage, Li Zefan
On Thu, 24 Sep 2009 14:47:18 +0900
Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp> wrote:
> This patch adds "memory.migrate_charge" file and handlers of it.
>
> Signed-off-by: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
> ---
> mm/memcontrol.c | 65 +++++++++++++++++++++++++++++++++++++++++++++++++++---
> 1 files changed, 61 insertions(+), 4 deletions(-)
>
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index 7e8874d..30499d9 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -225,6 +225,8 @@ struct mem_cgroup {
> /* set when res.limit == memsw.limit */
> bool memsw_is_minimum;
>
> + bool migrate_charge;
> +
> /*
> * statistics. This must be placed at the end of memcg.
> */
> @@ -2843,6 +2845,27 @@ static int mem_cgroup_swappiness_write(struct cgroup *cgrp, struct cftype *cft,
> return 0;
> }
>
> +static u64 mem_cgroup_migrate_charge_read(struct cgroup *cgrp,
> + struct cftype *cft)
> +{
> + return mem_cgroup_from_cont(cgrp)->migrate_charge;
> +}
> +
> +static int mem_cgroup_migrate_charge_write(struct cgroup *cgrp,
> + struct cftype *cft, u64 val)
> +{
> + struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp);
> +
> + if (val != 0 && val != 1)
> + return -EINVAL;
> +
> + cgroup_lock();
> + mem->migrate_charge = val;
> + cgroup_unlock();
> +
> + return 0;
> +}
Do we need cgroup_lock() here ?
Is this lock against a race with attach() ?
If so, adding a comment is better.
BTW, migrate_charge is an ambiguous name.
Does migrate_charge mean
(1) When a task is migrated "into" this cgroup, charges of that
will be moved from the original cgroup ?
(2) When a task is migrated "from" this cgroup, charges of that
will be moved to a destination cgroup ?
And I don't like using a word like "migrate" here because it is associated
with page-migration ;)
If you don't mind, how about "recharge_at_immigrate" or some ?
(But I believe there will be better words....)
Thanks,
-Kame
> static struct cftype mem_cgroup_files[] = {
> {
> @@ -2892,6 +2915,11 @@ static struct cftype mem_cgroup_files[] = {
> .read_u64 = mem_cgroup_swappiness_read,
> .write_u64 = mem_cgroup_swappiness_write,
> },
> + {
> + .name = "migrate_charge",
> + .read_u64 = mem_cgroup_migrate_charge_read,
> + .write_u64 = mem_cgroup_migrate_charge_write,
> + },
> };
>
> #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
> @@ -3132,6 +3160,7 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
> if (parent)
> mem->swappiness = get_swappiness(parent);
> atomic_set(&mem->refcnt, 1);
> + mem->migrate_charge = 0;
> return &mem->css;
> free_out:
> __mem_cgroup_free(mem);
> @@ -3168,6 +3197,35 @@ static int mem_cgroup_populate(struct cgroup_subsys *ss,
> return ret;
> }
>
> +static int mem_cgroup_can_migrate_charge(struct mem_cgroup *mem,
> + struct task_struct *p)
> +{
> + return 0;
> +}
> +
> +static int mem_cgroup_can_attach(struct cgroup_subsys *ss,
> + struct cgroup *cont,
> + struct task_struct *p,
> + bool threadgroup)
> +{
> + struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
> +
> + if (mem->migrate_charge && thread_group_leader(p))
> + return mem_cgroup_can_migrate_charge(mem, p);
> + return 0;
> +}
> +
> +static void mem_cgroup_cancel_attach(struct cgroup_subsys *ss,
> + struct cgroup *cont,
> + struct task_struct *p,
> + bool threadgroup)
> +{
> +}
> +
> +static void mem_cgroup_migrate_charge(void)
> +{
> +}
> +
> static void mem_cgroup_move_task(struct cgroup_subsys *ss,
> struct cgroup *cont,
> struct cgroup *old_cont,
> @@ -3175,10 +3233,7 @@ static void mem_cgroup_move_task(struct cgroup_subsys *ss,
> bool threadgroup)
> {
> mutex_lock(&memcg_tasklist);
> - /*
> - * FIXME: It's better to move charges of this process from old
> - * memcg to new memcg. But it's just on TODO-List now.
> - */
> + mem_cgroup_migrate_charge();
> mutex_unlock(&memcg_tasklist);
> }
>
> @@ -3189,6 +3244,8 @@ struct cgroup_subsys mem_cgroup_subsys = {
> .pre_destroy = mem_cgroup_pre_destroy,
> .destroy = mem_cgroup_destroy,
> .populate = mem_cgroup_populate,
> + .can_attach = mem_cgroup_can_attach,
> + .cancel_attach = mem_cgroup_cancel_attach,
> .attach = mem_cgroup_move_task,
> .early_init = 0,
> .use_id = 1,
> --
> 1.5.6.1
>
>
* Re: [RFC][PATCH 4/8] memcg: add interface to migrate charge
2009-09-24 6:54 ` KAMEZAWA Hiroyuki
@ 2009-09-24 23:39 ` Daisuke Nishimura
0 siblings, 0 replies; 49+ messages in thread
From: Daisuke Nishimura @ 2009-09-24 23:39 UTC (permalink / raw)
To: KAMEZAWA Hiroyuki
Cc: nishimura, linux-mm, Balbir Singh, Paul Menage, Li Zefan
On Thu, 24 Sep 2009 15:54:59 +0900, KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> wrote:
> On Thu, 24 Sep 2009 14:47:18 +0900
> Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp> wrote:
>
> > This patch adds "memory.migrate_charge" file and handlers of it.
> >
> > Signed-off-by: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
> > ---
> > mm/memcontrol.c | 65 +++++++++++++++++++++++++++++++++++++++++++++++++++---
> > 1 files changed, 61 insertions(+), 4 deletions(-)
> >
> > diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> > index 7e8874d..30499d9 100644
> > --- a/mm/memcontrol.c
> > +++ b/mm/memcontrol.c
> > @@ -225,6 +225,8 @@ struct mem_cgroup {
> > /* set when res.limit == memsw.limit */
> > bool memsw_is_minimum;
> >
> > + bool migrate_charge;
> > +
> > /*
> > * statistics. This must be placed at the end of memcg.
> > */
> > @@ -2843,6 +2845,27 @@ static int mem_cgroup_swappiness_write(struct cgroup *cgrp, struct cftype *cft,
> > return 0;
> > }
> >
> > +static u64 mem_cgroup_migrate_charge_read(struct cgroup *cgrp,
> > + struct cftype *cft)
> > +{
> > + return mem_cgroup_from_cont(cgrp)->migrate_charge;
> > +}
> > +
> > +static int mem_cgroup_migrate_charge_write(struct cgroup *cgrp,
> > + struct cftype *cft, u64 val)
> > +{
> > + struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp);
> > +
> > + if (val != 0 && val != 1)
> > + return -EINVAL;
> > +
> > + cgroup_lock();
> > + mem->migrate_charge = val;
> > + cgroup_unlock();
> > +
> > + return 0;
> > +}
>
> Do we need cgroup_lock() here ?
> Is this lock agaisnt race with attach() ?
> If so, adding commentary is better.
>
I thought so..., but, considering it more, it would be unnecessary because we check it
only once in mem_cgroup_can_attach() in the current implementation.
I'll add some comments anyway.
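i.e., something along these lines in mem_cgroup_migrate_charge_write() (just a
sketch of the kind of comment meant here, not the actual repost):

	/*
	 * "migrate_charge" is checked only once, in mem_cgroup_can_attach(),
	 * so no strict synchronization against task move is needed here.
	 */
	mem->migrate_charge = val;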
> BTW, migrate_charge is an ambiguous name.
>
> Does migrate_charge mean
> (1) When a task is migrated "into" this cgroup, charges of that
> will be moved from the orignal cgroup ?
> (2) When a task is migrated "from" this cgroup, charges of that
> will be moved to a destination cgroup ?
>
> And I don't like using word as "migrate" here because it is associated
> with page-migration ;)
>
Agreed.
> If you don't mind, how about "recharge_at_immigrate" or some ?
> (But I believe there will be better words....)
>
Seems good for me.
Thank you for your suggestion.
Thanks,
Daisuke Nishimura.
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 49+ messages in thread
* [RFC][PATCH 5/8] memcg: migrate charge of mapped page
2009-09-24 5:42 ` [RFC][PATCH 0/8] memcg: migrate charge at task move (24/Sep) Daisuke Nishimura
` (3 preceding siblings ...)
2009-09-24 5:47 ` [RFC][PATCH 4/8] memcg: add interface to migrate charge Daisuke Nishimura
@ 2009-09-24 5:48 ` Daisuke Nishimura
2009-09-24 7:22 ` KAMEZAWA Hiroyuki
2009-09-24 5:49 ` [RFC][PATCH 6/8] memcg: avoid oom during charge migration Daisuke Nishimura
` (2 subsequent siblings)
7 siblings, 1 reply; 49+ messages in thread
From: Daisuke Nishimura @ 2009-09-24 5:48 UTC (permalink / raw)
To: linux-mm
Cc: Balbir Singh, KAMEZAWA Hiroyuki, Paul Menage, Li Zefan,
Daisuke Nishimura
This patch is the core part of this charge migration feature.
It adds functions to migrate charge of pages mapped by the task.
Implementation:
- define struct migrate_charge and a variable for it (mc) to remember the count
of pre-charges and other information.
- At can_attach(), parse the page table of the task and count the number of
mapped pages which are charged to the source mem_cgroup, and call
__mem_cgroup_try_charge() repeatedly and count up mc->precharge.
- At attach(), parse the page table again, find a target page as we did in
can_attach(), and call mem_cgroup_move_account() about the page.
- Cancel all charges if mc->precharge > 0 on failure or at the end of charge
migration.
Signed-off-by: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
---
mm/memcontrol.c | 270 ++++++++++++++++++++++++++++++++++++++++++++++++++++++-
1 files changed, 268 insertions(+), 2 deletions(-)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 30499d9..fbcc195 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -21,6 +21,8 @@
#include <linux/memcontrol.h>
#include <linux/cgroup.h>
#include <linux/mm.h>
+#include <linux/migrate.h>
+#include <linux/hugetlb.h>
#include <linux/pagemap.h>
#include <linux/smp.h>
#include <linux/page-flags.h>
@@ -274,6 +276,18 @@ enum charge_type {
#define MEM_CGROUP_RECLAIM_SOFT_BIT 0x2
#define MEM_CGROUP_RECLAIM_SOFT (1 << MEM_CGROUP_RECLAIM_SOFT_BIT)
+/*
+ * Variables for charge migration at task move.
+ * mc and its members are protected by cgroup_lock
+ */
+struct migrate_charge {
+ struct task_struct *tsk;
+ struct mem_cgroup *from;
+ struct mem_cgroup *to;
+ unsigned long precharge;
+};
+static struct migrate_charge *mc;
+
static void mem_cgroup_get(struct mem_cgroup *mem);
static void mem_cgroup_put(struct mem_cgroup *mem);
static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem);
@@ -1362,7 +1376,7 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
if (soft_fail_res) {
mem_over_soft_limit =
mem_cgroup_from_res_counter(soft_fail_res, res);
- if (mem_cgroup_soft_limit_check(mem_over_soft_limit))
+ if (page && mem_cgroup_soft_limit_check(mem_over_soft_limit))
mem_cgroup_update_tree(mem_over_soft_limit, page);
}
done:
@@ -3197,10 +3211,167 @@ static int mem_cgroup_populate(struct cgroup_subsys *ss,
return ret;
}
+/* Handlers for charge migration at task move. */
+/**
+ * is_target_pte_for_migration - check a pte whether it is target for migration
+ * @vma: the vma the pte to be checked belongs
+ * @addr: the address corresponding to the pte to be checked
+ * @ptent: the pte to be checked
+ * @target: the pointer the target page will be stored(can be NULL)
+ *
+ * Returns
+ * 0(MIGRATION_TARGET_NONE): if the pte is not a target for charge migration.
+ * 1(MIGRATION_TARGET_PAGE): if the page corresponding to this pte is a target
+ * for charge migration. if @target is not NULL, the page is stored in
+ * target->page with extra refcnt got(Callers should handle it).
+ *
+ * Called with pte lock held.
+ */
+union migration_target {
+ struct page *page;
+};
+
+enum migration_target_type {
+ MIGRATION_TARGET_NONE, /* not used */
+ MIGRATION_TARGET_PAGE,
+};
+
+static int is_target_pte_for_migration(struct vm_area_struct *vma,
+ unsigned long addr, pte_t ptent, union migration_target *target)
+{
+ struct page *page;
+ struct page_cgroup *pc;
+ int ret = 0;
+
+ if (!pte_present(ptent))
+ return 0;
+
+ page = vm_normal_page(vma, addr, ptent);
+ if (!page || !page_mapped(page))
+ return 0;
+ if (!get_page_unless_zero(page))
+ return 0;
+
+ pc = lookup_page_cgroup(page);
+ lock_page_cgroup(pc);
+ if (PageCgroupUsed(pc) && pc->mem_cgroup == mc->from) {
+ ret = MIGRATION_TARGET_PAGE;
+ if (target)
+ target->page = page;
+ }
+ unlock_page_cgroup(pc);
+
+ if (!ret || !target)
+ put_page(page);
+
+ return ret;
+}
+
+static int migrate_charge_do_precharge(void)
+{
+ int ret = -ENOMEM;
+ struct mem_cgroup *mem = mc->to;
+
+ ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, &mem, false, NULL);
+ if (ret || !mem)
+ return -ENOMEM;
+
+ mc->precharge++;
+ return ret;
+}
+
+static int migrate_charge_prepare_pte_range(pmd_t *pmd,
+ unsigned long addr, unsigned long end,
+ struct mm_walk *walk)
+{
+ int ret = 0;
+ unsigned long count = 0;
+ struct vm_area_struct *vma = walk->private;
+ pte_t *pte;
+ spinlock_t *ptl;
+
+ pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
+ for (; addr != end; pte++, addr += PAGE_SIZE)
+ if (is_target_pte_for_migration(vma, addr, *pte, NULL))
+ count++;
+ pte_unmap_unlock(pte - 1, ptl);
+
+ while (count-- && !ret)
+ ret = migrate_charge_do_precharge();
+
+ return ret;
+}
+
+static int migrate_charge_prepare(void)
+{
+ int ret = 0;
+ struct mm_struct *mm;
+ struct vm_area_struct *vma;
+
+ mm = get_task_mm(mc->tsk);
+ if (!mm)
+ return 0;
+
+ down_read(&mm->mmap_sem);
+ for (vma = mm->mmap; vma; vma = vma->vm_next) {
+ struct mm_walk migrate_charge_prepare_walk = {
+ .pmd_entry = migrate_charge_prepare_pte_range,
+ .mm = mm,
+ .private = vma,
+ };
+ if (signal_pending(current)) {
+ ret = -EINTR;
+ break;
+ }
+ if (is_vm_hugetlb_page(vma))
+ continue;
+ ret = walk_page_range(vma->vm_start, vma->vm_end,
+ &migrate_charge_prepare_walk);
+ if (ret)
+ break;
+ cond_resched();
+ }
+ up_read(&mm->mmap_sem);
+
+ mmput(mm);
+ return ret;
+}
+
+static void mem_cgroup_clear_migrate_charge(void)
+{
+ VM_BUG_ON(!mc);
+
+ while (mc->precharge--)
+ __mem_cgroup_cancel_charge(mc->to);
+ kfree(mc);
+ mc = NULL;
+}
+
static int mem_cgroup_can_migrate_charge(struct mem_cgroup *mem,
struct task_struct *p)
{
- return 0;
+ int ret;
+ struct mem_cgroup *from = mem_cgroup_from_task(p);
+
+ VM_BUG_ON(mc);
+
+ if (from == mem)
+ return 0;
+
+ mc = kmalloc(sizeof(struct migrate_charge), GFP_KERNEL);
+ if (!mc)
+ return -ENOMEM;
+
+ mc->tsk = p;
+ mc->from = from;
+ mc->to = mem;
+ mc->precharge = 0;
+
+ ret = migrate_charge_prepare();
+
+ if (ret)
+ mem_cgroup_clear_migrate_charge();
+ return ret;
}
static int mem_cgroup_can_attach(struct cgroup_subsys *ss,
@@ -3220,10 +3391,105 @@ static void mem_cgroup_cancel_attach(struct cgroup_subsys *ss,
struct task_struct *p,
bool threadgroup)
{
+ if (mc)
+ mem_cgroup_clear_migrate_charge();
+}
+
+static int migrate_charge_pte_range(pmd_t *pmd,
+ unsigned long addr, unsigned long end,
+ struct mm_walk *walk)
+{
+ int ret = 0;
+ struct vm_area_struct *vma = walk->private;
+ pte_t *pte;
+ spinlock_t *ptl;
+
+retry:
+ pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
+ for (; addr != end; addr += PAGE_SIZE) {
+ pte_t ptent = *(pte++);
+ union migration_target target;
+ int type;
+ struct page *page;
+ struct page_cgroup *pc;
+
+ if (!mc->precharge)
+ break;
+
+ type = is_target_pte_for_migration(vma, addr, ptent, &target);
+ switch (type) {
+ case MIGRATION_TARGET_PAGE:
+ page = target.page;
+ if (isolate_lru_page(page))
+ goto put;
+ pc = lookup_page_cgroup(page);
+ if (!mem_cgroup_move_account(pc, mc->from, mc->to)) {
+ css_put(&mc->to->css);
+ mc->precharge--;
+ }
+ putback_lru_page(page);
+put: /* is_target_pte_for_migration() gets the page */
+ put_page(page);
+ break;
+ default:
+ continue;
+ }
+ }
+ pte_unmap_unlock(pte - 1, ptl);
+
+ if (addr != end) {
+ /*
+ * We have consumed all precharges we got in can_attach().
+ * We try precharge one by one, but don't do any additional
+ * precharges nor charge migration if we have failed in
+ * precharge once in attach() phase.
+ */
+ ret = migrate_charge_do_precharge();
+ if (!ret)
+ goto retry;
+ }
+
+ return ret;
}
static void mem_cgroup_migrate_charge(void)
{
+ struct mm_struct *mm;
+ struct vm_area_struct *vma;
+
+ if (!mc)
+ return;
+
+ mm = get_task_mm(mc->tsk);
+ if (!mm)
+ goto out;
+
+ lru_add_drain_all();
+ down_read(&mm->mmap_sem);
+ for (vma = mm->mmap; vma; vma = vma->vm_next) {
+ int ret;
+ struct mm_walk migrate_charge_walk = {
+ .pmd_entry = migrate_charge_pte_range,
+ .mm = mm,
+ .private = vma,
+ };
+ if (is_vm_hugetlb_page(vma))
+ continue;
+ ret = walk_page_range(vma->vm_start, vma->vm_end,
+ &migrate_charge_walk);
+ if (ret)
+ /*
+ * means we have consumed all precharges and failed in
+ * doing additional precharge. Just abandon here.
+ */
+ break;
+ cond_resched();
+ }
+ up_read(&mm->mmap_sem);
+
+ mmput(mm);
+out:
+ mem_cgroup_clear_migrate_charge();
}
static void mem_cgroup_move_task(struct cgroup_subsys *ss,
--
1.5.6.1
* Re: [RFC][PATCH 5/8] memcg: migrate charge of mapped page
2009-09-24 5:48 ` [RFC][PATCH 5/8] memcg: migrate charge of mapped page Daisuke Nishimura
@ 2009-09-24 7:22 ` KAMEZAWA Hiroyuki
2009-09-24 8:00 ` Daisuke Nishimura
0 siblings, 1 reply; 49+ messages in thread
From: KAMEZAWA Hiroyuki @ 2009-09-24 7:22 UTC (permalink / raw)
To: Daisuke Nishimura; +Cc: linux-mm, Balbir Singh, Paul Menage, Li Zefan
On Thu, 24 Sep 2009 14:48:08 +0900
Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp> wrote:
> This patch is the core part of this charge migration feature.
> It adds functions to migrate charge of pages mapped by the task.
>
> Implementation:
> - define struct migrate_charge and a variable for it (mc) to remember the count
> of pre-charges and other information.
> - At can_attach(), parse the page table of the task and count the number of
> mapped pages which are charged to the source mem_cgroup, and call
> __mem_cgroup_try_charge() repeatedly and count up mc->precharge.
> - At attach(), parse the page table again, find a target page as we did in
> can_attach(), and call mem_cgroup_move_account() about the page.
> - Cancel all charges if mc->precharge > 0 on failure or at the end of charge
> migration.
>
At first, thank you for hearing my request :)
> Signed-off-by: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
> ---
> mm/memcontrol.c | 270 ++++++++++++++++++++++++++++++++++++++++++++++++++++++-
> 1 files changed, 268 insertions(+), 2 deletions(-)
>
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index 30499d9..fbcc195 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -21,6 +21,8 @@
> #include <linux/memcontrol.h>
> #include <linux/cgroup.h>
> #include <linux/mm.h>
> +#include <linux/migrate.h>
> +#include <linux/hugetlb.h>
> #include <linux/pagemap.h>
> #include <linux/smp.h>
> #include <linux/page-flags.h>
> @@ -274,6 +276,18 @@ enum charge_type {
> #define MEM_CGROUP_RECLAIM_SOFT_BIT 0x2
> #define MEM_CGROUP_RECLAIM_SOFT (1 << MEM_CGROUP_RECLAIM_SOFT_BIT)
>
> +/*
> + * Variables for charge migration at task move.
> + * mc and its members are protected by cgroup_lock
> + */
> +struct migrate_charge {
> + struct task_struct *tsk;
> + struct mem_cgroup *from;
> + struct mem_cgroup *to;
> + unsigned long precharge;
> +};
> +static struct migrate_charge *mc;
> +
I associate migrate with "page migration".
Then, hmm, I'd like recharge or move_charge or some other good word.
BTW, why is "mc" a global variable ?
IIUC, all of this migration is done under cgroup_lock, so mc can be
a global variable...right ?
But ah..ok, this is a similar thing around cpuset's migration..
When you remove the RFC tag, I'd like to see some performance cost
information around this recharge..
> static void mem_cgroup_get(struct mem_cgroup *mem);
> static void mem_cgroup_put(struct mem_cgroup *mem);
> static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem);
> @@ -1362,7 +1376,7 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
> if (soft_fail_res) {
> mem_over_soft_limit =
> mem_cgroup_from_res_counter(soft_fail_res, res);
> - if (mem_cgroup_soft_limit_check(mem_over_soft_limit))
> + if (page && mem_cgroup_soft_limit_check(mem_over_soft_limit))
> mem_cgroup_update_tree(mem_over_soft_limit, page);
> }
> done:
> @@ -3197,10 +3211,167 @@ static int mem_cgroup_populate(struct cgroup_subsys *ss,
> return ret;
> }
>
> +/* Handlers for charge migration at task move. */
> +/**
> + * is_target_pte_for_migration - check a pte whether it is target for migration
> + * @vma: the vma the pte to be checked belongs
> + * @addr: the address corresponding to the pte to be checked
> + * @ptent: the pte to be checked
> + * @target: the pointer the target page will be stored(can be NULL)
> + *
> + * Returns
> + * 0(MIGRATION_TARGET_NONE): if the pte is not a target for charge migration.
> + * 1(MIGRATION_TARGET_PAGE): if the page corresponding to this pte is a target
> + * for charge migration. if @target is not NULL, the page is stored in
> + * target->page with extra refcnt got(Callers should handle it).
Will these types increase more ? If not, a bool value is enough.
> + *
> + * Called with pte lock held.
> + */
> +union migration_target {
> + struct page *page;
> +};
> +
> +enum migration_target_type {
> + MIGRATION_TARGET_NONE, /* not used */
> + MIGRATION_TARGET_PAGE,
> +};
> +
> +static int is_target_pte_for_migration(struct vm_area_struct *vma,
> + unsigned long addr, pte_t ptent, union migration_target *target)
> +{
> + struct page *page;
> + struct page_cgroup *pc;
> + int ret = 0;
> +
> + if (!pte_present(ptent))
> + return 0;
> +
> + page = vm_normal_page(vma, addr, ptent);
> + if (!page || !page_mapped(page))
> + return 0;
> + if (!get_page_unless_zero(page))
> + return 0;
Is this necessary ? We're under the page table lock.
Then, no one can unmap this.
> +
> + pc = lookup_page_cgroup(page);
> + lock_page_cgroup(pc);
Hmm...we may even avoid this lock. Sounds tricky, but we're
under page_table_lock, so no one can unmap this.
> + if (PageCgroupUsed(pc) && pc->mem_cgroup == mc->from) {
> + ret = MIGRATION_TARGET_PAGE;
> + if (target)
> + target->page = page;
> + }
> + unlock_page_cgroup(pc);
> +
> + if (!ret || !target)
> + put_page(page);
> +
> + return ret;
> +}
> +
> +static int migrate_charge_do_precharge(void)
> +{
> + int ret = -ENOMEM;
> + struct mem_cgroup *mem = mc->to;
> +
> + ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, &mem, false, NULL);
> + if (ret || !mem)
> + return -ENOMEM;
> +
> + mc->precharge++;
> + return ret;
> +}
> +
> +static int migrate_charge_prepare_pte_range(pmd_t *pmd,
> + unsigned long addr, unsigned long end,
> + struct mm_walk *walk)
> +{
> + int ret = 0;
> + unsigned long count = 0;
> + struct vm_area_struct *vma = walk->private;
> + pte_t *pte;
> + spinlock_t *ptl;
> +
> + pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
> + for (; addr != end; pte++, addr += PAGE_SIZE)
> + if (is_target_pte_for_migration(vma, addr, *pte, NULL))
> + count++;
> + pte_unmap_unlock(pte - 1, ptl);
> +
> + while (count-- && !ret)
> + ret = migrate_charge_do_precharge();
> +
> + return ret;
> +}
> +
> +static int migrate_charge_prepare(void)
> +{
> + int ret = 0;
> + struct mm_struct *mm;
> + struct vm_area_struct *vma;
> +
> + mm = get_task_mm(mc->tsk);
> + if (!mm)
> + return 0;
> +
> + down_read(&mm->mmap_sem);
> + for (vma = mm->mmap; vma; vma = vma->vm_next) {
> + struct mm_walk migrate_charge_prepare_walk = {
> + .pmd_entry = migrate_charge_prepare_pte_range,
> + .mm = mm,
> + .private = vma,
> + };
> + if (signal_pending(current)) {
> + ret = -EINTR;
> + break;
> + }
> + if (is_vm_hugetlb_page(vma))
> + continue;
> + ret = walk_page_range(vma->vm_start, vma->vm_end,
> + &migrate_charge_prepare_walk);
> + if (ret)
> + break;
> + cond_resched();
> + }
> + up_read(&mm->mmap_sem);
> +
> + mmput(mm);
> + return ret;
> +}
> +
> +static void mem_cgroup_clear_migrate_charge(void)
> +{
> + VM_BUG_ON(!mc);
> +
> + while (mc->precharge--)
> + __mem_cgroup_cancel_charge(mc->to);
> + kfree(mc);
> + mc = NULL;
> +}
> +
> static int mem_cgroup_can_migrate_charge(struct mem_cgroup *mem,
> struct task_struct *p)
> {
> - return 0;
> + int ret;
> + struct mem_cgroup *from = mem_cgroup_from_task(p);
> +
> + VM_BUG_ON(mc);
> +
> + if (from == mem)
> + return 0;
> +
please VM_BUG_ON(!mc);
> + mc = kmalloc(sizeof(struct migrate_charge), GFP_KERNEL);
> + if (!mc)
> + return -ENOMEM;
> +
> + mc->tsk = p;
> + mc->from = from;
> + mc->to = mem;
> + mc->precharge = 0;
> +
> + ret = migrate_charge_prepare();
> +
> + if (ret)
> + mem_cgroup_clear_migrate_charge();
> + return ret;
> }
>
> static int mem_cgroup_can_attach(struct cgroup_subsys *ss,
> @@ -3220,10 +3391,105 @@ static void mem_cgroup_cancel_attach(struct cgroup_subsys *ss,
> struct task_struct *p,
> bool threadgroup)
> {
> + if (mc)
> + mem_cgroup_clear_migrate_charge();
> +}
> +
> +static int migrate_charge_pte_range(pmd_t *pmd,
> + unsigned long addr, unsigned long end,
> + struct mm_walk *walk)
> +{
> + int ret = 0;
> + struct vm_area_struct *vma = walk->private;
> + pte_t *pte;
> + spinlock_t *ptl;
> +
> +retry:
> + pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
> + for (; addr != end; addr += PAGE_SIZE) {
> + pte_t ptent = *(pte++);
> + union migration_target target;
> + int type;
> + struct page *page;
> + struct page_cgroup *pc;
> +
> + if (!mc->precharge)
> + break;
> +
> + type = is_target_pte_for_migration(vma, addr, ptent, &target);
> + switch (type) {
> + case MIGRATION_TARGET_PAGE:
> + page = target.page;
> + if (isolate_lru_page(page))
> + goto put;
> + pc = lookup_page_cgroup(page);
> + if (!mem_cgroup_move_account(pc, mc->from, mc->to)) {
> + css_put(&mc->to->css);
> + mc->precharge--;
> + }
> + putback_lru_page(page);
> +put: /* is_target_pte_for_migration() gets the page */
> + put_page(page);
> + break;
> + default:
> + continue;
> + }
> + }
> + pte_unmap_unlock(pte - 1, ptl);
> +
> + if (addr != end) {
> + /*
> + * We have consumed all precharges we got in can_attach().
> + * We try precharge one by one, but don't do any additional
> + * precharges nor charge migration if we have failed in
> + * precharge once in attach() phase.
> + */
> + ret = migrate_charge_do_precharge();
> + if (!ret)
> + goto retry;
I think this is a nice handling.
> + }
> +
> + return ret;
> }
>
> static void mem_cgroup_migrate_charge(void)
> {
> + struct mm_struct *mm;
> + struct vm_area_struct *vma;
> +
> + if (!mc)
> + return;
> +
> + mm = get_task_mm(mc->tsk);
> + if (!mm)
> + goto out;
> +
> + lru_add_drain_all();
> + down_read(&mm->mmap_sem);
> + for (vma = mm->mmap; vma; vma = vma->vm_next) {
> + int ret;
> + struct mm_walk migrate_charge_walk = {
> + .pmd_entry = migrate_charge_pte_range,
> + .mm = mm,
> + .private = vma,
> + };
> + if (is_vm_hugetlb_page(vma))
> + continue;
> + ret = walk_page_range(vma->vm_start, vma->vm_end,
> + &migrate_charge_walk);
> + if (ret)
> + /*
> + * means we have consumed all precharges and failed in
> + * doing additional precharge. Just abandon here.
> + */
> + break;
> + cond_resched();
> + }
> + up_read(&mm->mmap_sem);
> +
> + mmput(mm);
> +out:
> + mem_cgroup_clear_migrate_charge();
> }
>
> static void mem_cgroup_move_task(struct cgroup_subsys *ss,
> --
Hmm, I don't complain about this patch itself, but cgroup_lock() will be the
last wall to be overcome for production use...
Can't we just prevent rmdir/mkdir on a hierarchy while we move a task ?
fork() etc. can be stopped by this, and cpuset's code is not very good.
Thanks,
-Kame
^ permalink raw reply [flat|nested] 49+ messages in thread
* Re: [RFC][PATCH 5/8] memcg: migrate charge of mapped page
2009-09-24 7:22 ` KAMEZAWA Hiroyuki
@ 2009-09-24 8:00 ` Daisuke Nishimura
2009-09-25 0:28 ` Daisuke Nishimura
0 siblings, 1 reply; 49+ messages in thread
From: Daisuke Nishimura @ 2009-09-24 8:00 UTC (permalink / raw)
To: KAMEZAWA Hiroyuki
Cc: nishimura, linux-mm, Balbir Singh, Paul Menage, Li Zefan
On Thu, 24 Sep 2009 16:22:26 +0900, KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> wrote:
> On Thu, 24 Sep 2009 14:48:08 +0900
> Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp> wrote:
>
> > This patch is the core part of this charge migration feature.
> > It adds functions to migrate charge of pages mapped by the task.
> >
> > Implementation:
> > - define struct migrate_charge and a valuable of it(mc) to remember the count
> > of pre-charges and other information.
> > - At can_attach(), parse the page table of the task and count the number of
> > mapped pages which are charged to the source mem_cgroup, and call
> > __mem_cgroup_try_charge() repeatedly and count up mc->precharge.
> > - At attach(), parse the page table again, find a target page as we did in
> > can_attach(), and call mem_cgroup_move_account() about the page.
> > - Cancel all charges if mc->precharge > 0 on failure or at the end of charge
> > migration.
> >
>
> At first, thank you for hearing my request :)
>
>
> > Signed-off-by: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
> > ---
> > mm/memcontrol.c | 270 ++++++++++++++++++++++++++++++++++++++++++++++++++++++-
> > 1 files changed, 268 insertions(+), 2 deletions(-)
> >
> > diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> > index 30499d9..fbcc195 100644
> > --- a/mm/memcontrol.c
> > +++ b/mm/memcontrol.c
> > @@ -21,6 +21,8 @@
> > #include <linux/memcontrol.h>
> > #include <linux/cgroup.h>
> > #include <linux/mm.h>
> > +#include <linux/migrate.h>
> > +#include <linux/hugetlb.h>
> > #include <linux/pagemap.h>
> > #include <linux/smp.h>
> > #include <linux/page-flags.h>
> > @@ -274,6 +276,18 @@ enum charge_type {
> > #define MEM_CGROUP_RECLAIM_SOFT_BIT 0x2
> > #define MEM_CGROUP_RECLAIM_SOFT (1 << MEM_CGROUP_RECLAIM_SOFT_BIT)
> >
> > +/*
> > + * Variables for charge migration at task move.
> > + * mc and its members are protected by cgroup_lock
> > + */
> > +struct migrate_charge {
> > + struct task_struct *tsk;
> > + struct mem_cgroup *from;
> > + struct mem_cgroup *to;
> > + unsigned long precharge;
> > +};
> > +static struct migrate_charge *mc;
> > +
>
> I associate migrate with "page migration".
> Then, hmm, recharge or move_charge or some other good word, I like.
>
O.K.
Will change.
> BTW, why "mc" is a global vairable ?
>
Just because __mem_cgroup_try_charge() will use it in the next patch.
> IIUC, this all migration is done under cgroup_lock, mc can be
> global variable...right ?
>
Yes.
> But ah..ok, this is a similar thing around cpuset's migration..
> When you removing RFC, I'd like to see some performance cost
> information around this recharge..
>
>
>
> > static void mem_cgroup_get(struct mem_cgroup *mem);
> > static void mem_cgroup_put(struct mem_cgroup *mem);
> > static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem);
> > @@ -1362,7 +1376,7 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
> > if (soft_fail_res) {
> > mem_over_soft_limit =
> > mem_cgroup_from_res_counter(soft_fail_res, res);
> > - if (mem_cgroup_soft_limit_check(mem_over_soft_limit))
> > + if (page && mem_cgroup_soft_limit_check(mem_over_soft_limit))
> > mem_cgroup_update_tree(mem_over_soft_limit, page);
> > }
> > done:
> > @@ -3197,10 +3211,167 @@ static int mem_cgroup_populate(struct cgroup_subsys *ss,
> > return ret;
> > }
> >
> > +/* Handlers for charge migration at task move. */
> > +/**
> > + * is_target_pte_for_migration - check a pte whether it is target for migration
> > + * @vma: the vma the pte to be checked belongs
> > + * @addr: the address corresponding to the pte to be checked
> > + * @ptent: the pte to be checked
> > + * @target: the pointer the target page will be stored(can be NULL)
> > + *
> > + * Returns
> > + * 0(MIGRATION_TARGET_NONE): if the pte is not a target for charge migration.
> > + * 1(MIGRATION_TARGET_PAGE): if the page corresponding to this pte is a target
> > + * for charge migration. if @target is not NULL, the page is stored in
> > + * target->page with extra refcnt got(Callers should handle it).
>
> Will these type incrase more ? If not, bool value is enough.
>
Yes.
Please see [7/8] :)
>
>
> > + *
> > + * Called with pte lock held.
> > + */
> > +union migration_target {
> > + struct page *page;
> > +};
> > +
> > +enum migration_target_type {
> > + MIGRATION_TARGET_NONE, /* not used */
> > + MIGRATION_TARGET_PAGE,
> > +};
> > +
> > +static int is_target_pte_for_migration(struct vm_area_struct *vma,
> > + unsigned long addr, pte_t ptent, union migration_target *target)
> > +{
> > + struct page *page;
> > + struct page_cgroup *pc;
> > + int ret = 0;
> > +
> > + if (!pte_present(ptent))
> > + return 0;
> > +
> > + page = vm_normal_page(vma, addr, ptent);
> > + if (!page || !page_mapped(page))
> > + return 0;
> > + if (!get_page_unless_zero(page))
> > + return 0;
>
> Is this necessary ? We're udner page table lock.
> Then, no one can unmap this.
>
I get the page just to keep consistency with [7/8].
[7/8] calls find_get_page(&swapper_space, ent.val).
> > +
> > + pc = lookup_page_cgroup(page);
> > + lock_page_cgroup(pc);
>
> Hmm...we may even avoid this lock. sounds tricy but we're
> under page_table_lock, no one can numap this.
>
>
But I think it's not good behavior to check PageCgroupUsed() and pc->mem_cgroup below without the lock.
And I'll handle unmapped pages (swap cache) too in a later patch.
> > + if (PageCgroupUsed(pc) && pc->mem_cgroup == mc->from) {
> > + ret = MIGRATION_TARGET_PAGE;
> > + if (target)
> > + target->page = page;
> > + }
> > + unlock_page_cgroup(pc);
> > +
> > + if (!ret || !target)
> > + put_page(page);
> > +
> > + return ret;
> > +}
> > +
> > +static int migrate_charge_do_precharge(void)
> > +{
> > + int ret = -ENOMEM;
> > + struct mem_cgroup *mem = mc->to;
>
>
>
> > +
> > + ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, &mem, false, NULL);
> > + if (ret || !mem)
> > + return -ENOMEM;
> > +
> > + mc->precharge++;
> > + return ret;
> > +}
> > +
> > +static int migrate_charge_prepare_pte_range(pmd_t *pmd,
> > + unsigned long addr, unsigned long end,
> > + struct mm_walk *walk)
> > +{
> > + int ret = 0;
> > + unsigned long count = 0;
> > + struct vm_area_struct *vma = walk->private;
> > + pte_t *pte;
> > + spinlock_t *ptl;
> > +
> > + pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
> > + for (; addr != end; pte++, addr += PAGE_SIZE)
> > + if (is_target_pte_for_migration(vma, addr, *pte, NULL))
> > + count++;
> > + pte_unmap_unlock(pte - 1, ptl);
> > +
> > + while (count-- && !ret)
> > + ret = migrate_charge_do_precharge();
> > +
> > + return ret;
> > +}
> > +
> > +static int migrate_charge_prepare(void)
> > +{
> > + int ret = 0;
> > + struct mm_struct *mm;
> > + struct vm_area_struct *vma;
> > +
> > + mm = get_task_mm(mc->tsk);
> > + if (!mm)
> > + return 0;
> > +
> > + down_read(&mm->mmap_sem);
> > + for (vma = mm->mmap; vma; vma = vma->vm_next) {
> > + struct mm_walk migrate_charge_prepare_walk = {
> > + .pmd_entry = migrate_charge_prepare_pte_range,
> > + .mm = mm,
> > + .private = vma,
> > + };
> > + if (signal_pending(current)) {
> > + ret = -EINTR;
> > + break;
> > + }
> > + if (is_vm_hugetlb_page(vma))
> > + continue;
> > + ret = walk_page_range(vma->vm_start, vma->vm_end,
> > + &migrate_charge_prepare_walk);
> > + if (ret)
> > + break;
> > + cond_resched();
> > + }
> > + up_read(&mm->mmap_sem);
> > +
> > + mmput(mm);
> > + return ret;
> > +}
> > +
> > +static void mem_cgroup_clear_migrate_charge(void)
> > +{
> > + VM_BUG_ON(!mc);
> > +
> > + while (mc->precharge--)
> > + __mem_cgroup_cancel_charge(mc->to);
> > + kfree(mc);
> > + mc = NULL;
> > +}
> > +
> > static int mem_cgroup_can_migrate_charge(struct mem_cgroup *mem,
> > struct task_struct *p)
> > {
> > - return 0;
> > + int ret;
> > + struct mem_cgroup *from = mem_cgroup_from_task(p);
> > +
> > + VM_BUG_ON(mc);
> > +
> > + if (from == mem)
> > + return 0;
> > +
> please VM_BUG_ON(!mc);
>
hmm?
I think mc should not be set here, and that's why I added VM_BUG_ON(mc) above.
> > + mc = kmalloc(sizeof(struct migrate_charge), GFP_KERNEL);
> > + if (!mc)
> > + return -ENOMEM;
> > +
> > + mc->tsk = p;
> > + mc->from = from;
> > + mc->to = mem;
> > + mc->precharge = 0;
> > +
> > + ret = migrate_charge_prepare();
> > +
> > + if (ret)
> > + mem_cgroup_clear_migrate_charge();
> > + return ret;
> > }
> >
> > static int mem_cgroup_can_attach(struct cgroup_subsys *ss,
> > @@ -3220,10 +3391,105 @@ static void mem_cgroup_cancel_attach(struct cgroup_subsys *ss,
> > struct task_struct *p,
> > bool threadgroup)
> > {
> > + if (mc)
> > + mem_cgroup_clear_migrate_charge();
> > +}
> > +
> > +static int migrate_charge_pte_range(pmd_t *pmd,
> > + unsigned long addr, unsigned long end,
> > + struct mm_walk *walk)
> > +{
> > + int ret = 0;
> > + struct vm_area_struct *vma = walk->private;
> > + pte_t *pte;
> > + spinlock_t *ptl;
> > +
> > +retry:
> > + pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
> > + for (; addr != end; addr += PAGE_SIZE) {
> > + pte_t ptent = *(pte++);
> > + union migration_target target;
> > + int type;
> > + struct page *page;
> > + struct page_cgroup *pc;
> > +
> > + if (!mc->precharge)
> > + break;
> > +
> > + type = is_target_pte_for_migration(vma, addr, ptent, &target);
> > + switch (type) {
> > + case MIGRATION_TARGET_PAGE:
> > + page = target.page;
> > + if (isolate_lru_page(page))
> > + goto put;
> > + pc = lookup_page_cgroup(page);
> > + if (!mem_cgroup_move_account(pc, mc->from, mc->to)) {
> > + css_put(&mc->to->css);
> > + mc->precharge--;
> > + }
> > + putback_lru_page(page);
> > +put: /* is_target_pte_for_migration() gets the page */
> > + put_page(page);
> > + break;
> > + default:
> > + continue;
> > + }
> > + }
> > + pte_unmap_unlock(pte - 1, ptl);
> > +
> > + if (addr != end) {
> > + /*
> > + * We have consumed all precharges we got in can_attach().
> > + * We try precharge one by one, but don't do any additional
> > + * precharges nor charge migration if we have failed in
> > + * precharge once in attach() phase.
> > + */
> > + ret = migrate_charge_do_precharge();
> > + if (!ret)
> > + goto retry;
>
> I think this is a nice handling.
>
Thanks :)
>
>
> > + }
> > +
> > + return ret;
> > }
> >
> > static void mem_cgroup_migrate_charge(void)
> > {
> > + struct mm_struct *mm;
> > + struct vm_area_struct *vma;
> > +
> > + if (!mc)
> > + return;
> > +
> > + mm = get_task_mm(mc->tsk);
> > + if (!mm)
> > + goto out;
> > +
> > + lru_add_drain_all();
> > + down_read(&mm->mmap_sem);
> > + for (vma = mm->mmap; vma; vma = vma->vm_next) {
> > + int ret;
> > + struct mm_walk migrate_charge_walk = {
> > + .pmd_entry = migrate_charge_pte_range,
> > + .mm = mm,
> > + .private = vma,
> > + };
> > + if (is_vm_hugetlb_page(vma))
> > + continue;
> > + ret = walk_page_range(vma->vm_start, vma->vm_end,
> > + &migrate_charge_walk);
> > + if (ret)
> > + /*
> > + * means we have consumed all precharges and failed in
> > + * doing additional precharge. Just abandon here.
> > + */
> > + break;
> > + cond_resched();
> > + }
> > + up_read(&mm->mmap_sem);
> > +
> > + mmput(mm);
> > +out:
> > + mem_cgroup_clear_migrate_charge();
> > }
> >
> > static void mem_cgroup_move_task(struct cgroup_subsys *ss,
> > --
>
> Hmm, I don't complain to this patch itself but cgroup_lock() will be the
> last wall to be overcomed for production use...
>
> Can't we just prevent rmdir/mkdir on a hierarchy and move a task ?
> fork() etc..can be stopped by this and cpuset's code is not very good.
>
I agree.
I need to consider it more.
Thanks,
Daisuke Nishimura.
^ permalink raw reply [flat|nested] 49+ messages in thread
* Re: [RFC][PATCH 5/8] memcg: migrate charge of mapped page
2009-09-24 8:00 ` Daisuke Nishimura
@ 2009-09-25 0:28 ` Daisuke Nishimura
2009-09-25 0:49 ` KAMEZAWA Hiroyuki
0 siblings, 1 reply; 49+ messages in thread
From: Daisuke Nishimura @ 2009-09-25 0:28 UTC (permalink / raw)
To: KAMEZAWA Hiroyuki
Cc: nishimura, linux-mm, Balbir Singh, Paul Menage, Li Zefan
> > Hmm, I don't complain to this patch itself but cgroup_lock() will be the
> > last wall to be overcomed for production use...
> >
> > Can't we just prevent rmdir/mkdir on a hierarchy and move a task ?
> > fork() etc..can be stopped by this and cpuset's code is not very good.
> >
hmm, could you explain more about why fork() can be stopped by cgroup_lock (or cgroup_mutex)?
Thanks,
Daisuke Nishimura.
^ permalink raw reply [flat|nested] 49+ messages in thread
* Re: [RFC][PATCH 5/8] memcg: migrate charge of mapped page
2009-09-25 0:28 ` Daisuke Nishimura
@ 2009-09-25 0:49 ` KAMEZAWA Hiroyuki
0 siblings, 0 replies; 49+ messages in thread
From: KAMEZAWA Hiroyuki @ 2009-09-25 0:49 UTC (permalink / raw)
To: Daisuke Nishimura; +Cc: linux-mm, Balbir Singh, Paul Menage, Li Zefan
On Fri, 25 Sep 2009 09:28:37 +0900
Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp> wrote:
> > > Hmm, I don't complain to this patch itself but cgroup_lock() will be the
> > > last wall to be overcomed for production use...
> > >
> > > Can't we just prevent rmdir/mkdir on a hierarchy and move a task ?
> > > fork() etc..can be stopped by this and cpuset's code is not very good.
> > >
> hmm, could you explan more why fork() can be stopped by cgroup_lock(or cgroup_mutex)?
>
Sorry, I misunderstood cgroup_fork/clone.
Anyway, while a migration is in progress, libcgroup's daemon cannot move
a task to a group.
Thanks,
-Kame
> Thanks,
> Daisuke Nishimura.
>
^ permalink raw reply [flat|nested] 49+ messages in thread
* [RFC][PATCH 6/8] memcg: avoid oom during charge migration
2009-09-24 5:42 ` [RFC][PATCH 0/8] memcg: migrate charge at task move (24/Sep) Daisuke Nishimura
` (4 preceding siblings ...)
2009-09-24 5:48 ` [RFC][PATCH 5/8] memcg: migrate charge of mapped page Daisuke Nishimura
@ 2009-09-24 5:49 ` Daisuke Nishimura
2009-09-24 7:34 ` KAMEZAWA Hiroyuki
2009-09-24 5:49 ` [RFC][PATCH 7/8] memcg: migrate charge of swap Daisuke Nishimura
2009-09-24 5:50 ` [RFC][PATCH 8/8] memcg: migrate charge of shmem swap Daisuke Nishimura
7 siblings, 1 reply; 49+ messages in thread
From: Daisuke Nishimura @ 2009-09-24 5:49 UTC (permalink / raw)
To: linux-mm
Cc: Balbir Singh, KAMEZAWA Hiroyuki, Paul Menage, Li Zefan,
Daisuke Nishimura
This charge migration feature has double charges on both "from" and "to"
mem_cgroup during charge migration.
This means unnecessary oom can happen because of charge migration.
This patch tries to avoid such oom.
Signed-off-by: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
---
mm/memcontrol.c | 19 +++++++++++++++++++
1 files changed, 19 insertions(+), 0 deletions(-)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index fbcc195..25de11c 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -287,6 +287,8 @@ struct migrate_charge {
unsigned long precharge;
};
static struct migrate_charge *mc;
+static struct task_struct *mc_task;
+static DECLARE_WAIT_QUEUE_HEAD(mc_waitq);
static void mem_cgroup_get(struct mem_cgroup *mem);
static void mem_cgroup_put(struct mem_cgroup *mem);
@@ -1317,6 +1319,7 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
while (1) {
int ret = 0;
unsigned long flags = 0;
+ DEFINE_WAIT(wait);
if (mem_cgroup_is_root(mem))
goto done;
@@ -1358,6 +1361,17 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
if (mem_cgroup_check_under_limit(mem_over_limit))
continue;
+ /* try to avoid oom while someone is migrating charge */
+ if (mc_task && current != mc_task) {
+ prepare_to_wait(&mc_waitq, &wait, TASK_INTERRUPTIBLE);
+ if (mc) {
+ schedule();
+ finish_wait(&mc_waitq, &wait);
+ continue;
+ }
+ finish_wait(&mc_waitq, &wait);
+ }
+
if (!nr_retries--) {
if (oom) {
mutex_lock(&memcg_tasklist);
@@ -3345,6 +3359,8 @@ static void mem_cgroup_clear_migrate_charge(void)
__mem_cgroup_cancel_charge(mc->to);
kfree(mc);
mc = NULL;
+ mc_task = NULL;
+ wake_up_all(&mc_waitq);
}
static int mem_cgroup_can_migrate_charge(struct mem_cgroup *mem,
@@ -3354,6 +3370,7 @@ static int mem_cgroup_can_migrate_charge(struct mem_cgroup *mem,
struct mem_cgroup *from = mem_cgroup_from_task(p);
VM_BUG_ON(mc);
+ VM_BUG_ON(mc_task);
if (from == mem)
return 0;
@@ -3367,6 +3384,8 @@ static int mem_cgroup_can_migrate_charge(struct mem_cgroup *mem,
mc->to = mem;
mc->precharge = 0;
+ mc_task = current;
+
ret = migrate_charge_prepare();
if (ret)
--
1.5.6.1
^ permalink raw reply [flat|nested] 49+ messages in thread
* Re: [RFC][PATCH 6/8] memcg: avoid oom during charge migration
2009-09-24 5:49 ` [RFC][PATCH 6/8] memcg: avoid oom during charge migration Daisuke Nishimura
@ 2009-09-24 7:34 ` KAMEZAWA Hiroyuki
2009-09-25 1:44 ` Daisuke Nishimura
0 siblings, 1 reply; 49+ messages in thread
From: KAMEZAWA Hiroyuki @ 2009-09-24 7:34 UTC (permalink / raw)
To: Daisuke Nishimura; +Cc: linux-mm, Balbir Singh, Paul Menage, Li Zefan
On Thu, 24 Sep 2009 14:49:02 +0900
Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp> wrote:
> This charge migration feature has double charges on both "from" and "to"
> mem_cgroup during charge migration.
> This means unnecessary oom can happen because of charge migration.
>
> This patch tries to avoid such oom.
>
> Signed-off-by: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
> ---
> mm/memcontrol.c | 19 +++++++++++++++++++
> 1 files changed, 19 insertions(+), 0 deletions(-)
>
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index fbcc195..25de11c 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -287,6 +287,8 @@ struct migrate_charge {
> unsigned long precharge;
> };
> static struct migrate_charge *mc;
> +static struct task_struct *mc_task;
> +static DECLARE_WAIT_QUEUE_HEAD(mc_waitq);
>
> static void mem_cgroup_get(struct mem_cgroup *mem);
> static void mem_cgroup_put(struct mem_cgroup *mem);
> @@ -1317,6 +1319,7 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
> while (1) {
> int ret = 0;
> unsigned long flags = 0;
> + DEFINE_WAIT(wait);
>
> if (mem_cgroup_is_root(mem))
> goto done;
> @@ -1358,6 +1361,17 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
> if (mem_cgroup_check_under_limit(mem_over_limit))
> continue;
>
> + /* try to avoid oom while someone is migrating charge */
> + if (mc_task && current != mc_task) {
Hmm, I like
==
	if (mc && mc->to == mem)
or
	if (mc) {
		if (mem is ancestor of mc->to)
			wait for a while
	}
==
?
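IOW, something along these lines (just a rough sketch, not tested; how to
dereference mc->to safely here is a separate question):
==
	/* in the retry loop of __mem_cgroup_try_charge() */
	if (mc && mc_task && current != mc_task &&
	    css_is_ancestor(&mc->to->css, &mem_over_limit->css)) {
		/* our pressure may come from the charges being moved */
		prepare_to_wait(&mc_waitq, &wait, TASK_INTERRUPTIBLE);
		if (mc)
			schedule();
		finish_wait(&mc_waitq, &wait);
		continue;
	}
==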
> + prepare_to_wait(&mc_waitq, &wait, TASK_INTERRUPTIBLE);
> + if (mc) {
> + schedule();
> + finish_wait(&mc_waitq, &wait);
> + continue;
> + }
> + finish_wait(&mc_waitq, &wait);
> + }
> +
> if (!nr_retries--) {
> if (oom) {
> mutex_lock(&memcg_tasklist);
> @@ -3345,6 +3359,8 @@ static void mem_cgroup_clear_migrate_charge(void)
> __mem_cgroup_cancel_charge(mc->to);
> kfree(mc);
> mc = NULL;
> + mc_task = NULL;
> + wake_up_all(&mc_waitq);
> }
Hmm. I think this wake_up is too late.
How about waking up when we release page_table_lock(), or
once per vma ?
Or, just skip nr_retries-- like
	if (mc && mc->to_is_not_ancestor(mem) && nr_retries--) {
	}
?
I think the page-reclaim war itself is not bad.
(Anyway, we'll have to fix cgroup_lock if the move cost is a problem.)
Thanks,
-Kame
>
> static int mem_cgroup_can_migrate_charge(struct mem_cgroup *mem,
> @@ -3354,6 +3370,7 @@ static int mem_cgroup_can_migrate_charge(struct mem_cgroup *mem,
> struct mem_cgroup *from = mem_cgroup_from_task(p);
>
> VM_BUG_ON(mc);
> + VM_BUG_ON(mc_task);
>
> if (from == mem)
> return 0;
> @@ -3367,6 +3384,8 @@ static int mem_cgroup_can_migrate_charge(struct mem_cgroup *mem,
> mc->to = mem;
> mc->precharge = 0;
>
> + mc_task = current;
> +
> ret = migrate_charge_prepare();
>
> if (ret)
> --
> 1.5.6.1
>
>
^ permalink raw reply [flat|nested] 49+ messages in thread
* Re: [RFC][PATCH 6/8] memcg: avoid oom during charge migration
2009-09-24 7:34 ` KAMEZAWA Hiroyuki
@ 2009-09-25 1:44 ` Daisuke Nishimura
2009-09-25 1:55 ` KAMEZAWA Hiroyuki
0 siblings, 1 reply; 49+ messages in thread
From: Daisuke Nishimura @ 2009-09-25 1:44 UTC (permalink / raw)
To: KAMEZAWA Hiroyuki
Cc: nishimura, linux-mm, Balbir Singh, Paul Menage, Li Zefan
On Thu, 24 Sep 2009 16:34:40 +0900, KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> wrote:
> On Thu, 24 Sep 2009 14:49:02 +0900
> Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp> wrote:
>
> > This charge migration feature has double charges on both "from" and "to"
> > mem_cgroup during charge migration.
> > This means unnecessary oom can happen because of charge migration.
> >
> > This patch tries to avoid such oom.
> >
> > Signed-off-by: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
> > ---
> > mm/memcontrol.c | 19 +++++++++++++++++++
> > 1 files changed, 19 insertions(+), 0 deletions(-)
> >
> > diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> > index fbcc195..25de11c 100644
> > --- a/mm/memcontrol.c
> > +++ b/mm/memcontrol.c
> > @@ -287,6 +287,8 @@ struct migrate_charge {
> > unsigned long precharge;
> > };
> > static struct migrate_charge *mc;
> > +static struct task_struct *mc_task;
> > +static DECLARE_WAIT_QUEUE_HEAD(mc_waitq);
> >
> > static void mem_cgroup_get(struct mem_cgroup *mem);
> > static void mem_cgroup_put(struct mem_cgroup *mem);
> > @@ -1317,6 +1319,7 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
> > while (1) {
> > int ret = 0;
> > unsigned long flags = 0;
> > + DEFINE_WAIT(wait);
> >
> > if (mem_cgroup_is_root(mem))
> > goto done;
> > @@ -1358,6 +1361,17 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
> > if (mem_cgroup_check_under_limit(mem_over_limit))
> > continue;
> >
> > + /* try to avoid oom while someone is migrating charge */
> > + if (mc_task && current != mc_task) {
>
> Hmm, I like
>
> ==
> if (mc && mc->to == mem)
> or
> if (mc) {
> if (mem is ancestor of mc->to)
> wait for a while
> ==
>
> ?
>
I think we cannot safely access mc->to w/o cgroup_lock,
and we cannot hold cgroup_lock in __mem_cgroup_try_charge().
And I think we need to check "current != mc_task" anyway, to prevent
the process which is moving the task from being stopped itself.
(That's why I defined mc_task.)
>
> > + prepare_to_wait(&mc_waitq, &wait, TASK_INTERRUPTIBLE);
> > + if (mc) {
> > + schedule();
> > + finish_wait(&mc_waitq, &wait);
> > + continue;
> > + }
> > + finish_wait(&mc_waitq, &wait);
> > + }
> > +
> > if (!nr_retries--) {
> > if (oom) {
> > mutex_lock(&memcg_tasklist);
> > @@ -3345,6 +3359,8 @@ static void mem_cgroup_clear_migrate_charge(void)
> > __mem_cgroup_cancel_charge(mc->to);
> > kfree(mc);
> > mc = NULL;
> > + mc_task = NULL;
> > + wake_up_all(&mc_waitq);
> > }
>
> Hmm. I think this wake_up is too late.
> How about waking up when we release page_table_lock() or
> once per vma ?
>
> Or, just skip nr_retries-- like
>
> if (mc && mc->to_is_not_ancestor(mem) && nr_retries--) {
> }
>
> ?
>
I used a waitq to prevent busy retries, but O.K., I'll try something.
Anyway, as I said above, we need to check "current != mc_task" and do something
to access mc->to safely.
Thanks,
Daisuke Nishimura.
> I think page-reclaim war itself is not bad.
> (Anyway, we'll have to fix cgroup_lock if the move cost is a problem.)
>
>
> Thanks,
> -Kame
>
> >
> > static int mem_cgroup_can_migrate_charge(struct mem_cgroup *mem,
> > @@ -3354,6 +3370,7 @@ static int mem_cgroup_can_migrate_charge(struct mem_cgroup *mem,
> > struct mem_cgroup *from = mem_cgroup_from_task(p);
> >
> > VM_BUG_ON(mc);
> > + VM_BUG_ON(mc_task);
> >
> > if (from == mem)
> > return 0;
> > @@ -3367,6 +3384,8 @@ static int mem_cgroup_can_migrate_charge(struct mem_cgroup *mem,
> > mc->to = mem;
> > mc->precharge = 0;
> >
> > + mc_task = current;
> > +
> > ret = migrate_charge_prepare();
> >
> > if (ret)
> > --
> > 1.5.6.1
> >
> >
>
^ permalink raw reply [flat|nested] 49+ messages in thread
* Re: [RFC][PATCH 6/8] memcg: avoid oom during charge migration
2009-09-25 1:44 ` Daisuke Nishimura
@ 2009-09-25 1:55 ` KAMEZAWA Hiroyuki
2009-09-25 4:51 ` Daisuke Nishimura
0 siblings, 1 reply; 49+ messages in thread
From: KAMEZAWA Hiroyuki @ 2009-09-25 1:55 UTC (permalink / raw)
To: Daisuke Nishimura; +Cc: linux-mm, Balbir Singh, Paul Menage, Li Zefan
On Fri, 25 Sep 2009 10:44:09 +0900
Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp> wrote:
> On Thu, 24 Sep 2009 16:34:40 +0900, KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> wrote:
> > On Thu, 24 Sep 2009 14:49:02 +0900
> > Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp> wrote:
> >
> > > This charge migration feature has double charges on both "from" and "to"
> > > mem_cgroup during charge migration.
> > > This means unnecessary oom can happen because of charge migration.
> > >
> > > This patch tries to avoid such oom.
> > >
> > > Signed-off-by: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
> > > ---
> > > mm/memcontrol.c | 19 +++++++++++++++++++
> > > 1 files changed, 19 insertions(+), 0 deletions(-)
> > >
> > > diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> > > index fbcc195..25de11c 100644
> > > --- a/mm/memcontrol.c
> > > +++ b/mm/memcontrol.c
> > > @@ -287,6 +287,8 @@ struct migrate_charge {
> > > unsigned long precharge;
> > > };
> > > static struct migrate_charge *mc;
> > > +static struct task_struct *mc_task;
> > > +static DECLARE_WAIT_QUEUE_HEAD(mc_waitq);
> > >
> > > static void mem_cgroup_get(struct mem_cgroup *mem);
> > > static void mem_cgroup_put(struct mem_cgroup *mem);
> > > @@ -1317,6 +1319,7 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
> > > while (1) {
> > > int ret = 0;
> > > unsigned long flags = 0;
> > > + DEFINE_WAIT(wait);
> > >
> > > if (mem_cgroup_is_root(mem))
> > > goto done;
> > > @@ -1358,6 +1361,17 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
> > > if (mem_cgroup_check_under_limit(mem_over_limit))
> > > continue;
> > >
> > > + /* try to avoid oom while someone is migrating charge */
> > > + if (mc_task && current != mc_task) {
> >
> > Hmm, I like
> >
> > ==
> > if (mc && mc->to == mem)
> > or
> > if (mc) {
> > if (mem is ancestor of mc->to)
> > wait for a while
> > ==
> >
> > ?
> >
> I think we cannot access safely to mc->to w/o cgroup_lock,
> and we cannot hold cgroup_lock in __mem_cgroup_try_charge.
>
> And I think we need to check "current != mc_task" anyway to prevent
> the process itself which is moving a task from being stopped.
> (Thas's why I defined mc_task.)
I think
==
static struct migrate_charge *mc;
==
should be
==
static struct migrate_charge mc;
==
Then, mc_task can be a field of mc.
And, it seems ok to "not stop a task which is under migration".
Agreed.
Thanks,
-Kame
^ permalink raw reply [flat|nested] 49+ messages in thread
* Re: [RFC][PATCH 6/8] memcg: avoid oom during charge migration
2009-09-25 1:55 ` KAMEZAWA Hiroyuki
@ 2009-09-25 4:51 ` Daisuke Nishimura
2009-09-25 5:36 ` Daisuke Nishimura
0 siblings, 1 reply; 49+ messages in thread
From: Daisuke Nishimura @ 2009-09-25 4:51 UTC (permalink / raw)
To: KAMEZAWA Hiroyuki
Cc: nishimura, linux-mm, Balbir Singh, Paul Menage, Li Zefan
On Fri, 25 Sep 2009 10:55:47 +0900, KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> wrote:
> On Fri, 25 Sep 2009 10:44:09 +0900
> Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp> wrote:
>
> > On Thu, 24 Sep 2009 16:34:40 +0900, KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> wrote:
> > > On Thu, 24 Sep 2009 14:49:02 +0900
> > > Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp> wrote:
> > >
> > > > This charge migration feature has double charges on both "from" and "to"
> > > > mem_cgroup during charge migration.
> > > > This means unnecessary oom can happen because of charge migration.
> > > >
> > > > This patch tries to avoid such oom.
> > > >
> > > > Signed-off-by: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
> > > > ---
> > > > mm/memcontrol.c | 19 +++++++++++++++++++
> > > > 1 files changed, 19 insertions(+), 0 deletions(-)
> > > >
> > > > diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> > > > index fbcc195..25de11c 100644
> > > > --- a/mm/memcontrol.c
> > > > +++ b/mm/memcontrol.c
> > > > @@ -287,6 +287,8 @@ struct migrate_charge {
> > > > unsigned long precharge;
> > > > };
> > > > static struct migrate_charge *mc;
> > > > +static struct task_struct *mc_task;
> > > > +static DECLARE_WAIT_QUEUE_HEAD(mc_waitq);
> > > >
> > > > static void mem_cgroup_get(struct mem_cgroup *mem);
> > > > static void mem_cgroup_put(struct mem_cgroup *mem);
> > > > @@ -1317,6 +1319,7 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
> > > > while (1) {
> > > > int ret = 0;
> > > > unsigned long flags = 0;
> > > > + DEFINE_WAIT(wait);
> > > >
> > > > if (mem_cgroup_is_root(mem))
> > > > goto done;
> > > > @@ -1358,6 +1361,17 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
> > > > if (mem_cgroup_check_under_limit(mem_over_limit))
> > > > continue;
> > > >
> > > > + /* try to avoid oom while someone is migrating charge */
> > > > + if (mc_task && current != mc_task) {
> > >
> > > Hmm, I like
> > >
> > > ==
> > > if (mc && mc->to == mem)
> > > or
> > > if (mc) {
> > > if (mem is ancestor of mc->to)
> > > wait for a while
> > > ==
> > >
> > > ?
> > >
> > I think we cannot access safely to mc->to w/o cgroup_lock,
> > and we cannot hold cgroup_lock in __mem_cgroup_try_charge.
> >
> > And I think we need to check "current != mc_task" anyway to prevent
> > the process itself which is moving a task from being stopped.
> > (Thas's why I defined mc_task.)
>
> I think
> ==
> static struct migrate_charge *mc;
> ==
> should be
> ==
> static struct migrate_charge mc;
> ==
> Then, mc_task can be a field of mc.
>
> And, it seems ok "don't stop a task which is under migration" .
> I agreed.
>
Thank you for your suggestion.
I'm now thinking of something like the following.
struct move_charge {
	struct mem_cgroup *from;
	struct mem_cgroup *to;
	struct task_struct *target;	/* the target task being moved */
	struct task_struct *working;	/* a task moving the target task */
	unsigned long precharge;
};
static struct move_charge mc;

__mem_cgroup_try_charge
{
	:
	if (mc.working && current != mc.working) {
(1)		struct mem_cgroup *dest = mc.to;
(2)		if (dest && css_is_ancestor(&dest->css, &mem_over_limit->css))
			continue;
	}

	if (!nr_retries--) {
		:
	}
But thinking about it more, there is a very small race where "dest" can be freed by rmdir
between (1) and (2), IIUC.
Do you have any ideas ?
Thanks,
Daisuke Nishimura.
^ permalink raw reply [flat|nested] 49+ messages in thread
* Re: [RFC][PATCH 6/8] memcg: avoid oom during charge migration
2009-09-25 4:51 ` Daisuke Nishimura
@ 2009-09-25 5:36 ` Daisuke Nishimura
2009-09-25 5:52 ` KAMEZAWA Hiroyuki
0 siblings, 1 reply; 49+ messages in thread
From: Daisuke Nishimura @ 2009-09-25 5:36 UTC (permalink / raw)
To: KAMEZAWA Hiroyuki
Cc: nishimura, linux-mm, Balbir Singh, Paul Menage, Li Zefan
On Fri, 25 Sep 2009 13:51:28 +0900, Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp> wrote:
> On Fri, 25 Sep 2009 10:55:47 +0900, KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> wrote:
> > On Fri, 25 Sep 2009 10:44:09 +0900
> > Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp> wrote:
> >
> > > On Thu, 24 Sep 2009 16:34:40 +0900, KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> wrote:
> > > > On Thu, 24 Sep 2009 14:49:02 +0900
> > > > Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp> wrote:
> > > >
> > > > > This charge migration feature has double charges on both "from" and "to"
> > > > > mem_cgroup during charge migration.
> > > > > This means unnecessary oom can happen because of charge migration.
> > > > >
> > > > > This patch tries to avoid such oom.
> > > > >
> > > > > Signed-off-by: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
> > > > > ---
> > > > > mm/memcontrol.c | 19 +++++++++++++++++++
> > > > > 1 files changed, 19 insertions(+), 0 deletions(-)
> > > > >
> > > > > diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> > > > > index fbcc195..25de11c 100644
> > > > > --- a/mm/memcontrol.c
> > > > > +++ b/mm/memcontrol.c
> > > > > @@ -287,6 +287,8 @@ struct migrate_charge {
> > > > > unsigned long precharge;
> > > > > };
> > > > > static struct migrate_charge *mc;
> > > > > +static struct task_struct *mc_task;
> > > > > +static DECLARE_WAIT_QUEUE_HEAD(mc_waitq);
> > > > >
> > > > > static void mem_cgroup_get(struct mem_cgroup *mem);
> > > > > static void mem_cgroup_put(struct mem_cgroup *mem);
> > > > > @@ -1317,6 +1319,7 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
> > > > > while (1) {
> > > > > int ret = 0;
> > > > > unsigned long flags = 0;
> > > > > + DEFINE_WAIT(wait);
> > > > >
> > > > > if (mem_cgroup_is_root(mem))
> > > > > goto done;
> > > > > @@ -1358,6 +1361,17 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
> > > > > if (mem_cgroup_check_under_limit(mem_over_limit))
> > > > > continue;
> > > > >
> > > > > + /* try to avoid oom while someone is migrating charge */
> > > > > + if (mc_task && current != mc_task) {
> > > >
> > > > Hmm, I like
> > > >
> > > > ==
> > > > if (mc && mc->to == mem)
> > > > or
> > > > if (mc) {
> > > > if (mem is ancestor of mc->to)
> > > > wait for a while
> > > > ==
> > > >
> > > > ?
> > > >
> > > I think we cannot access safely to mc->to w/o cgroup_lock,
> > > and we cannot hold cgroup_lock in __mem_cgroup_try_charge.
> > >
> > > And I think we need to check "current != mc_task" anyway to prevent
> > > the process itself which is moving a task from being stopped.
> > > (Thas's why I defined mc_task.)
> >
> > I think
> > ==
> > static struct migrate_charge *mc;
> > ==
> > should be
> > ==
> > static struct migrate_charge mc;
> > ==
> > Then, mc_task can be a field of mc.
> >
> > And, it seems ok "don't stop a task which is under migration" .
> > I agreed.
> >
> Thank you for your suggestion.
>
> I'm now thinking as follwing.
>
>
> struct move_charge {
> struct mem_cgroup *from;
> struct mem_cgroup *to;
> struct task_struct *target; /* the target task being moved */
> struct task_struct *working; /* a task moving the target task */
> unsigned long precharge;
> };
> static struct move_charge mc;
>
> __mem_cgroup_try_charge
> {
> :
> if (mc.working && current != mc.working) {
> (1) struct mem_cgroup *dest = mc.to;
> (2) if (dest && css_is_ancestor(&dest->css, &mem_over_limit->css)
> continue;
> }
>
> if (!nr_retries--) {
> :
> }
>
>
> But considering more, there is very small race that "dest" can be freed by rmdir
> between (1) or (2), IIUC.
>
> Do you have any ideas ?
>
IIUC, calling css_tryget() under rcu_read_lock() would work.
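Something like this is what I have in mind (just a sketch, untested):
==
	if (mc.working && current != mc.working) {
		struct mem_cgroup *dest;
		bool busy = false;

		rcu_read_lock();
		dest = mc.to;
		if (dest && css_tryget(&dest->css)) {
			/* now "dest" cannot be freed by rmdir */
			busy = css_is_ancestor(&dest->css,
					       &mem_over_limit->css);
			css_put(&dest->css);
		}
		rcu_read_unlock();
		if (busy)
			continue;
	}
==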
Thanks,
Daisuke Nishimura.
^ permalink raw reply [flat|nested] 49+ messages in thread
* Re: [RFC][PATCH 6/8] memcg: avoid oom during charge migration
2009-09-25 5:36 ` Daisuke Nishimura
@ 2009-09-25 5:52 ` KAMEZAWA Hiroyuki
0 siblings, 0 replies; 49+ messages in thread
From: KAMEZAWA Hiroyuki @ 2009-09-25 5:52 UTC (permalink / raw)
To: Daisuke Nishimura; +Cc: linux-mm, Balbir Singh, Paul Menage, Li Zefan
On Fri, 25 Sep 2009 14:36:09 +0900
Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp> wrote:
> On Fri, 25 Sep 2009 13:51:28 +0900, Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp> wrote:
> > On Fri, 25 Sep 2009 10:55:47 +0900, KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> wrote:
> > > On Fri, 25 Sep 2009 10:44:09 +0900
> > > Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp> wrote:
> > >
> > > > On Thu, 24 Sep 2009 16:34:40 +0900, KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> wrote:
> > > > > On Thu, 24 Sep 2009 14:49:02 +0900
> > > > > Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp> wrote:
> > > > >
> > > > > > This charge migration feature has double charges on both "from" and "to"
> > > > > > mem_cgroup during charge migration.
> > > > > > This means unnecessary oom can happen because of charge migration.
> > > > > >
> > > > > > This patch tries to avoid such oom.
> > > > > >
> > > > > > Signed-off-by: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
> > > > > > ---
> > > > > > mm/memcontrol.c | 19 +++++++++++++++++++
> > > > > > 1 files changed, 19 insertions(+), 0 deletions(-)
> > > > > >
> > > > > > diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> > > > > > index fbcc195..25de11c 100644
> > > > > > --- a/mm/memcontrol.c
> > > > > > +++ b/mm/memcontrol.c
> > > > > > @@ -287,6 +287,8 @@ struct migrate_charge {
> > > > > > unsigned long precharge;
> > > > > > };
> > > > > > static struct migrate_charge *mc;
> > > > > > +static struct task_struct *mc_task;
> > > > > > +static DECLARE_WAIT_QUEUE_HEAD(mc_waitq);
> > > > > >
> > > > > > static void mem_cgroup_get(struct mem_cgroup *mem);
> > > > > > static void mem_cgroup_put(struct mem_cgroup *mem);
> > > > > > @@ -1317,6 +1319,7 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
> > > > > > while (1) {
> > > > > > int ret = 0;
> > > > > > unsigned long flags = 0;
> > > > > > + DEFINE_WAIT(wait);
> > > > > >
> > > > > > if (mem_cgroup_is_root(mem))
> > > > > > goto done;
> > > > > > @@ -1358,6 +1361,17 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
> > > > > > if (mem_cgroup_check_under_limit(mem_over_limit))
> > > > > > continue;
> > > > > >
> > > > > > + /* try to avoid oom while someone is migrating charge */
> > > > > > + if (mc_task && current != mc_task) {
> > > > >
> > > > > Hmm, I like
> > > > >
> > > > > ==
> > > > > if (mc && mc->to == mem)
> > > > > or
> > > > > if (mc) {
> > > > > if (mem is ancestor of mc->to)
> > > > > wait for a while
> > > > > ==
> > > > >
> > > > > ?
> > > > >
> > > > I think we cannot access safely to mc->to w/o cgroup_lock,
> > > > and we cannot hold cgroup_lock in __mem_cgroup_try_charge.
> > > >
> > > > And I think we need to check "current != mc_task" anyway to prevent
> > > > the process itself which is moving a task from being stopped.
> > > > (Thas's why I defined mc_task.)
> > >
> > > I think
> > > ==
> > > static struct migrate_charge *mc;
> > > ==
> > > should be
> > > ==
> > > static struct migrate_charge mc;
> > > ==
> > > Then, mc_task can be a field of mc.
> > >
> > > And, it seems ok "don't stop a task which is under migration" .
> > > I agreed.
> > >
> > Thank you for your suggestion.
> >
> > I'm now thinking as follwing.
> >
> >
> > struct move_charge {
> > struct mem_cgroup *from;
> > struct mem_cgroup *to;
> > struct task_struct *target; /* the target task being moved */
> > struct task_struct *working; /* a task moving the target task */
> > unsigned long precharge;
> > };
> > static struct move_charge mc;
> >
> > __mem_cgroup_try_charge
> > {
> > :
> > if (mc.working && current != mc.working) {
> > (1) struct mem_cgroup *dest = mc.to;
> > (2) if (dest && css_is_ancestor(&dest->css, &mem_over_limit->css)
> > continue;
> > }
> >
> > if (!nr_retries--) {
> > :
> > }
> >
> >
> > But considering more, there is very small race that "dest" can be freed by rmdir
> > between (1) or (2), IIUC.
> >
> > Do you have any ideas ?
> >
> IIUC, calling css_tryget() under rcu_read_lock() would work.
>
Ah, yes. I think it will work.
Or, add a field
	mem_cgroup.under_immigration
and
	set mem_cgroup.under_immigration = true (for all ancestors)
before starting the immigration, and check this flag in try_charge().
Anyway, only one immigration can run at the same time.
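For example, something like this (only a rough sketch, not tested; the field
name and the ancestor walk are just illustrations):
==
	/* when the move starts, under cgroup_lock */
	for (iter = mc.to; iter; iter = parent_mem_cgroup(iter))
		iter->under_immigration = true;

	/* in the retry loop of __mem_cgroup_try_charge() */
	if (mem_over_limit->under_immigration && current != mc.working)
		continue;	/* don't count this round as an oom retry */

	/* and clear the flags again when the move finishes */
==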
Thanks,
-Kame
^ permalink raw reply [flat|nested] 49+ messages in thread
* [RFC][PATCH 7/8] memcg: migrate charge of swap
2009-09-24 5:42 ` [RFC][PATCH 0/8] memcg: migrate charge at task move (24/Sep) Daisuke Nishimura
` (5 preceding siblings ...)
2009-09-24 5:49 ` [RFC][PATCH 6/8] memcg: avoid oom during charge migration Daisuke Nishimura
@ 2009-09-24 5:49 ` Daisuke Nishimura
2009-09-24 5:50 ` [RFC][PATCH 8/8] memcg: migrate charge of shmem swap Daisuke Nishimura
7 siblings, 0 replies; 49+ messages in thread
From: Daisuke Nishimura @ 2009-09-24 5:49 UTC (permalink / raw)
To: linux-mm
Cc: Balbir Singh, KAMEZAWA Hiroyuki, Paul Menage, Li Zefan,
Daisuke Nishimura
This patch is another core part of this charge migration feature.
It enables charge migration of swap.
To move the charge of swap, we need to exchange swap_cgroup's record.
In the current implementation, swap_cgroup's record is protected by:
- page lock: if the entry is on the swap cache.
- swap_lock: if the entry is not on the swap cache.
This works well for the usual swap-in/out activity.
But this behavior makes charge migration of swap check many conditions to
exchange swap_cgroup's record safely.
So I changed the modification of swap_cgroup's record (swap_cgroup_record())
to use xchg, and defined a new function to cmpxchg swap_cgroup's record.
Signed-off-by: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
---
include/linux/page_cgroup.h | 2 +
mm/memcontrol.c | 104 ++++++++++++++++++++++++++++++++++++++-----
mm/page_cgroup.c | 35 ++++++++++++++-
3 files changed, 128 insertions(+), 13 deletions(-)
diff --git a/include/linux/page_cgroup.h b/include/linux/page_cgroup.h
index fffa835..72f548e 100644
--- a/include/linux/page_cgroup.h
+++ b/include/linux/page_cgroup.h
@@ -117,6 +117,8 @@ static inline void __init page_cgroup_init_flatmem(void)
#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
#include <linux/swap.h>
+extern unsigned short swap_cgroup_cmpxchg(swp_entry_t ent,
+ unsigned short old, unsigned short new);
extern unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id);
extern unsigned short lookup_swap_cgroup(swp_entry_t ent);
extern int swap_cgroup_swapon(int type, unsigned long max_pages);
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 25de11c..fe0902c 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -34,6 +34,7 @@
#include <linux/rbtree.h>
#include <linux/slab.h>
#include <linux/swap.h>
+#include <linux/swapops.h>
#include <linux/spinlock.h>
#include <linux/fs.h>
#include <linux/seq_file.h>
@@ -2013,6 +2014,49 @@ void mem_cgroup_uncharge_swap(swp_entry_t ent)
}
rcu_read_unlock();
}
+
+/**
+ * mem_cgroup_move_swap_account - move swap charge and swap_cgroup's record.
+ * @entry: swap entry to be moved
+ * @from: mem_cgroup which the entry is moved from
+ * @to: mem_cgroup which the entry is moved to
+ *
+ * It succeeds only when the swap_cgroup's record for this entry is the same
+ * as the mem_cgroup's id of @from.
+ *
+ * Returns 0 on success, 1 on failure.
+ *
+ * The caller must have called __mem_cgroup_try_charge on @to.
+ */
+static int mem_cgroup_move_swap_account(swp_entry_t entry,
+ struct mem_cgroup *from, struct mem_cgroup *to)
+{
+ unsigned short old_id, new_id;
+
+ old_id = css_id(&from->css);
+ new_id = css_id(&to->css);
+
+ if (swap_cgroup_cmpxchg(entry, old_id, new_id) == old_id) {
+ if (!mem_cgroup_is_root(from))
+ res_counter_uncharge(&from->memsw, PAGE_SIZE, NULL);
+ mem_cgroup_swap_statistics(from, false);
+ mem_cgroup_put(from);
+
+ if (!mem_cgroup_is_root(to))
+ res_counter_uncharge(&to->res, PAGE_SIZE, NULL);
+ mem_cgroup_swap_statistics(to, true);
+ mem_cgroup_get(to);
+
+ return 0;
+ }
+ return 1;
+}
+#else
+static inline int mem_cgroup_move_swap_account(swp_entry_t entry,
+ struct mem_cgroup *from, struct mem_cgroup *to)
+{
+ return 1;
+}
#endif
/*
@@ -3231,41 +3275,62 @@ static int mem_cgroup_populate(struct cgroup_subsys *ss,
* @vma: the vma the pte to be checked belongs
* @addr: the address corresponding to the pte to be checked
* @ptent: the pte to be checked
- * @target: the pointer the target page will be stored(can be NULL)
+ * @target: the pointer the target page or entry will be stored(can be NULL)
*
* Returns
* 0(MIGRATION_TARGET_NONE): if the pte is not a target for charge migration.
* 1(MIGRATION_TARGET_PAGE): if the page corresponding to this pte is a target
* for charge migration. if @target is not NULL, the page is stored in
* target->page with extra refcnt got(Callers should handle it).
+ * 2(MIGRATION_TARGET_SWAP): if the swap entry corresponding to this pte is a
+ * target for charge migration. if @target is not NULL, the entry is stored
+ * in target->ent.
*
* Called with pte lock held.
*/
union migration_target {
struct page *page;
+ swp_entry_t ent;
};
enum migration_target_type {
MIGRATION_TARGET_NONE, /* not used */
MIGRATION_TARGET_PAGE,
+ MIGRATION_TARGET_SWAP,
};
static int is_target_pte_for_migration(struct vm_area_struct *vma,
unsigned long addr, pte_t ptent, union migration_target *target)
{
- struct page *page;
+ struct page *page = NULL;
struct page_cgroup *pc;
+ swp_entry_t ent = { .val = 0 };
int ret = 0;
- if (!pte_present(ptent))
- return 0;
-
- page = vm_normal_page(vma, addr, ptent);
- if (!page || !page_mapped(page))
- return 0;
- if (!get_page_unless_zero(page))
- return 0;
-
+ if (!pte_present(ptent)) {
+ if (!do_swap_account)
+ return 0;
+ /* TODO: handle swap of shmes/tmpfs */
+ if (pte_none(ptent) || pte_file(ptent))
+ return 0;
+ else if (is_swap_pte(ptent)) {
+ ent = pte_to_swp_entry(ptent);
+ if (is_migration_entry(ent))
+ return 0;
+ page = find_get_page(&swapper_space, ent.val);
+ }
+ if (page)
+ goto check_page;
+ else
+ goto check_swap;
+ } else {
+ page = vm_normal_page(vma, addr, ptent);
+ if (!page || !page_mapped(page))
+ return 0;
+ if (!get_page_unless_zero(page))
+ return 0;
+ }
+check_page:
pc = lookup_page_cgroup(page);
lock_page_cgroup(pc);
if (PageCgroupUsed(pc) && pc->mem_cgroup == mc->from) {
@@ -3277,6 +3342,14 @@ static int is_target_pte_for_migration(struct vm_area_struct *vma,
if (!ret || !target)
put_page(page);
+ /* fall throught */
+check_swap:
+ if (ent.val)
+ if (!ret && css_id(&mc->from->css) == lookup_swap_cgroup(ent)) {
+ ret = MIGRATION_TARGET_SWAP;
+ if (target)
+ target->ent = ent;
+ }
return ret;
}
@@ -3431,6 +3504,7 @@ retry:
int type;
struct page *page;
struct page_cgroup *pc;
+ swp_entry_t ent;
if (!mc->precharge)
break;
@@ -3450,6 +3524,14 @@ retry:
put: /* is_target_pte_for_migration() gets the page */
put_page(page);
break;
+ case MIGRATION_TARGET_SWAP:
+ ent = target.ent;
+ if (!mem_cgroup_move_swap_account(ent,
+ mc->from, mc->to)) {
+ css_put(&mc->to->css);
+ mc->precharge--;
+ }
+ break;
default:
continue;
}
diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c
index 3d535d5..9532169 100644
--- a/mm/page_cgroup.c
+++ b/mm/page_cgroup.c
@@ -9,6 +9,7 @@
#include <linux/vmalloc.h>
#include <linux/cgroup.h>
#include <linux/swapops.h>
+#include <asm/cmpxchg.h>
static void __meminit
__init_page_cgroup(struct page_cgroup *pc, unsigned long pfn)
@@ -335,6 +336,37 @@ not_enough_page:
}
/**
+ * swap_cgroup_cmpxchg - cmpxchg mem_cgroup's id for this swp_entry.
+ * @ent: swap entry to be cmpxchged
+ * @old: old id
+ * @new: new id
+ *
+ * Returns the old id on success, 0 on failure.
+ * (There is no mem_cgroup using 0 as its id)
+ */
+unsigned short swap_cgroup_cmpxchg(swp_entry_t ent,
+ unsigned short old, unsigned short new)
+{
+ int type = swp_type(ent);
+ unsigned long offset = swp_offset(ent);
+ unsigned long idx = offset / SC_PER_PAGE;
+ unsigned long pos = offset & SC_POS_MASK;
+ struct swap_cgroup_ctrl *ctrl;
+ struct page *mappage;
+ struct swap_cgroup *sc;
+
+ ctrl = &swap_cgroup_ctrl[type];
+
+ mappage = ctrl->map[idx];
+ sc = page_address(mappage);
+ sc += pos;
+ if (cmpxchg(&sc->id, old, new) == old)
+ return old;
+ else
+ return 0;
+}
+
+/**
* swap_cgroup_record - record mem_cgroup for this swp_entry.
* @ent: swap entry to be recorded into
* @mem: mem_cgroup to be recorded
@@ -358,8 +390,7 @@ unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id)
mappage = ctrl->map[idx];
sc = page_address(mappage);
sc += pos;
- old = sc->id;
- sc->id = id;
+ old = xchg(&sc->id, id);
return old;
}
--
1.5.6.1
^ permalink raw reply [flat|nested] 49+ messages in thread
* [RFC][PATCH 8/8] memcg: migrate charge of shmem swap
2009-09-24 5:42 ` [RFC][PATCH 0/8] memcg: migrate charge at task move (24/Sep) Daisuke Nishimura
` (6 preceding siblings ...)
2009-09-24 5:49 ` [RFC][PATCH 7/8] memcg: migrate charge of swap Daisuke Nishimura
@ 2009-09-24 5:50 ` Daisuke Nishimura
2009-09-24 7:41 ` KAMEZAWA Hiroyuki
7 siblings, 1 reply; 49+ messages in thread
From: Daisuke Nishimura @ 2009-09-24 5:50 UTC (permalink / raw)
To: linux-mm
Cc: Balbir Singh, KAMEZAWA Hiroyuki, Paul Menage, Li Zefan,
Daisuke Nishimura
This patch enables charge migration of shmem's swap.
To find the shmem's page or swap entry corresponding to a !pte_present pte,
this patch adds a function to look them up from the inode and the offset.
Signed-off-by: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
---
include/linux/swap.h | 4 ++++
mm/memcontrol.c | 21 +++++++++++++++++----
mm/shmem.c | 37 +++++++++++++++++++++++++++++++++++++
3 files changed, 58 insertions(+), 4 deletions(-)
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 4ec9001..e232653 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -278,6 +278,10 @@ extern int kswapd_run(int nid);
/* linux/mm/shmem.c */
extern int shmem_unuse(swp_entry_t entry, struct page *page);
#endif /* CONFIG_MMU */
+#ifdef CONFIG_CGROUP_MEM_RES_CTLR
+extern void mem_cgroup_get_shmem_target(struct inode *inode, pgoff_t pgoff,
+ struct page **pagep, swp_entry_t *ent);
+#endif
extern void swap_unplug_io_fn(struct backing_dev_info *, struct page *);
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index fe0902c..1c674b0 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -3310,10 +3310,23 @@ static int is_target_pte_for_migration(struct vm_area_struct *vma,
if (!pte_present(ptent)) {
if (!do_swap_account)
return 0;
- /* TODO: handle swap of shmes/tmpfs */
- if (pte_none(ptent) || pte_file(ptent))
- return 0;
- else if (is_swap_pte(ptent)) {
+ if (pte_none(ptent) || pte_file(ptent)) {
+ if (!vma->vm_file)
+ return 0;
+ if (mapping_cap_swap_backed(vma->vm_file->f_mapping)) {
+ struct inode *inode;
+ pgoff_t pgoff = 0;
+
+ inode = vma->vm_file->f_path.dentry->d_inode;
+ if (pte_none(ptent))
+ pgoff = linear_page_index(vma, addr);
+ if (pte_file(ptent))
+ pgoff = pte_to_pgoff(ptent);
+
+ mem_cgroup_get_shmem_target(inode, pgoff,
+ &page, &ent);
+ }
+ } else if (is_swap_pte(ptent)) {
ent = pte_to_swp_entry(ptent);
if (is_migration_entry(ent))
return 0;
diff --git a/mm/shmem.c b/mm/shmem.c
index 10b7f37..96bc1b7 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -2714,3 +2714,40 @@ int shmem_zero_setup(struct vm_area_struct *vma)
vma->vm_ops = &shmem_vm_ops;
return 0;
}
+
+#ifdef CONFIG_CGROUP_MEM_RES_CTLR
+/**
+ * mem_cgroup_get_shmem_target - find a page or entry assigned to the shmem file
+ * @inode: the inode to be searched
+ * @pgoff: the offset to be searched
+ * @pagep: the pointer for the found page to be stored
+ * @ent: the pointer for the found swap entry to be stored
+ *
+ * If a page is found, its refcount is incremented. The caller should drop
+ * that refcount when done with the page.
+ */
+void mem_cgroup_get_shmem_target(struct inode *inode, pgoff_t pgoff,
+ struct page **pagep, swp_entry_t *ent)
+{
+ swp_entry_t entry = { .val = 0 }, *ptr;
+ struct page *page = NULL;
+ struct shmem_inode_info *info = SHMEM_I(inode);
+
+ if ((pgoff << PAGE_CACHE_SHIFT) >= i_size_read(inode))
+ goto out;
+
+ spin_lock(&info->lock);
+ ptr = shmem_swp_entry(info, pgoff, NULL);
+ if (ptr && ptr->val) {
+ entry.val = ptr->val;
+ page = find_get_page(&swapper_space, entry.val);
+ } else
+ page = find_get_page(inode->i_mapping, pgoff);
+ if (ptr)
+ shmem_swp_unmap(ptr);
+ spin_unlock(&info->lock);
+out:
+ *pagep = page;
+ *ent = entry;
+}
+#endif
--
1.5.6.1
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 49+ messages in thread

* Re: [RFC][PATCH 8/8] memcg: migrate charge of shmem swap
2009-09-24 5:50 ` [RFC][PATCH 8/8] memcg: migrate charge of shmem swap Daisuke Nishimura
@ 2009-09-24 7:41 ` KAMEZAWA Hiroyuki
2009-09-25 0:28 ` Daisuke Nishimura
0 siblings, 1 reply; 49+ messages in thread
From: KAMEZAWA Hiroyuki @ 2009-09-24 7:41 UTC (permalink / raw)
To: Daisuke Nishimura; +Cc: linux-mm, Balbir Singh, Paul Menage, Li Zefan
On Thu, 24 Sep 2009 14:50:41 +0900
Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp> wrote:
> This patch enables charge migration of shmem's swap.
>
> To find the shmem's page or swap entry corresponding to a !pte_present pte,
> this patch add a function to search them from the inode and the offset.
>
> Signed-off-by: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
I think it's not good to recharge shmem pages based on tasks while
we don't do it for file caches.
I recommend you implement madvise() for recharging file caches or
shmem. Maybe there will be use cases for isolating some files'/shmems'
charges to some special groups.
Thanks,
-Kame
> ---
> include/linux/swap.h | 4 ++++
> mm/memcontrol.c | 21 +++++++++++++++++----
> mm/shmem.c | 37 +++++++++++++++++++++++++++++++++++++
> 3 files changed, 58 insertions(+), 4 deletions(-)
>
> diff --git a/include/linux/swap.h b/include/linux/swap.h
> index 4ec9001..e232653 100644
> --- a/include/linux/swap.h
> +++ b/include/linux/swap.h
> @@ -278,6 +278,10 @@ extern int kswapd_run(int nid);
> /* linux/mm/shmem.c */
> extern int shmem_unuse(swp_entry_t entry, struct page *page);
> #endif /* CONFIG_MMU */
> +#ifdef CONFIG_CGROUP_MEM_RES_CTLR
> +extern void mem_cgroup_get_shmem_target(struct inode *inode, pgoff_t pgoff,
> + struct page **pagep, swp_entry_t *ent);
> +#endif
>
> extern void swap_unplug_io_fn(struct backing_dev_info *, struct page *);
>
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index fe0902c..1c674b0 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -3310,10 +3310,23 @@ static int is_target_pte_for_migration(struct vm_area_struct *vma,
> if (!pte_present(ptent)) {
> if (!do_swap_account)
> return 0;
> - /* TODO: handle swap of shmes/tmpfs */
> - if (pte_none(ptent) || pte_file(ptent))
> - return 0;
> - else if (is_swap_pte(ptent)) {
> + if (pte_none(ptent) || pte_file(ptent)) {
> + if (!vma->vm_file)
> + return 0;
> + if (mapping_cap_swap_backed(vma->vm_file->f_mapping)) {
> + struct inode *inode;
> + pgoff_t pgoff = 0;
> +
> + inode = vma->vm_file->f_path.dentry->d_inode;
> + if (pte_none(ptent))
> + pgoff = linear_page_index(vma, addr);
> + if (pte_file(ptent))
> + pgoff = pte_to_pgoff(ptent);
> +
> + mem_cgroup_get_shmem_target(inode, pgoff,
> + &page, &ent);
> + }
> + } else if (is_swap_pte(ptent)) {
> ent = pte_to_swp_entry(ptent);
> if (is_migration_entry(ent))
> return 0;
> diff --git a/mm/shmem.c b/mm/shmem.c
> index 10b7f37..96bc1b7 100644
> --- a/mm/shmem.c
> +++ b/mm/shmem.c
> @@ -2714,3 +2714,40 @@ int shmem_zero_setup(struct vm_area_struct *vma)
> vma->vm_ops = &shmem_vm_ops;
> return 0;
> }
> +
> +#ifdef CONFIG_CGROUP_MEM_RES_CTLR
> +/**
> + * mem_cgroup_get_shmem_target - find a page or entry assigned to the shmem file
> + * @inode: the inode to be searched
> + * @pgoff: the offset to be searched
> + * @pagep: the pointer for the found page to be stored
> + * @ent: the pointer for the found swap entry to be stored
> + *
> + * If a page is found, refcount of it is incremented. Callers should handle
> + * these refcount.
> + */
> +void mem_cgroup_get_shmem_target(struct inode *inode, pgoff_t pgoff,
> + struct page **pagep, swp_entry_t *ent)
> +{
> + swp_entry_t entry = { .val = 0 }, *ptr;
> + struct page *page = NULL;
> + struct shmem_inode_info *info = SHMEM_I(inode);
> +
> + if ((pgoff << PAGE_CACHE_SHIFT) >= i_size_read(inode))
> + goto out;
> +
> + spin_lock(&info->lock);
> + ptr = shmem_swp_entry(info, pgoff, NULL);
> + if (ptr && ptr->val) {
> + entry.val = ptr->val;
> + page = find_get_page(&swapper_space, entry.val);
> + } else
> + page = find_get_page(inode->i_mapping, pgoff);
> + if (ptr)
> + shmem_swp_unmap(ptr);
> + spin_unlock(&info->lock);
> +out:
> + *pagep = page;
> + *ent = entry;
> +}
> +#endif
> --
> 1.5.6.1
>
> --
> To unsubscribe, send a message with 'unsubscribe linux-mm' in
> the body to majordomo@kvack.org. For more info on Linux MM,
> see: http://www.linux-mm.org/ .
> Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
>
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 49+ messages in thread

* Re: [RFC][PATCH 8/8] memcg: migrate charge of shmem swap
2009-09-24 7:41 ` KAMEZAWA Hiroyuki
@ 2009-09-25 0:28 ` Daisuke Nishimura
0 siblings, 0 replies; 49+ messages in thread
From: Daisuke Nishimura @ 2009-09-25 0:28 UTC (permalink / raw)
To: KAMEZAWA Hiroyuki
Cc: nishimura, linux-mm, Balbir Singh, Paul Menage, Li Zefan
On Thu, 24 Sep 2009 16:41:31 +0900, KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> wrote:
> On Thu, 24 Sep 2009 14:50:41 +0900
> Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp> wrote:
>
> > This patch enables charge migration of shmem's swap.
> >
> > To find the shmem's page or swap entry corresponding to a !pte_present pte,
> > this patch add a function to search them from the inode and the offset.
> >
> > Signed-off-by: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
>
> I think it's not good to recharge shmem pages based on tasks while
> we don't do it against file caches.
>
In the current implementation, shmem pages or file caches which are on a
present pte (pte_present()) will be recharged.
But in the !pte_present case, hmm, you're right. File caches are not
handled by this patch.
I think I can handle it by doing something like:

	if (pte_none(ptent))
		pgoff = linear_page_index(vma, addr);
	if (pte_file(ptent))
		pgoff = pte_to_pgoff(ptent);
	page = find_get_page(vma->vm_file->f_mapping, pgoff);

as mincore does.
Or, I'll change this patch (perhaps mem_cgroup_get_shmem_target()) to
handle only swap.
I prefer the former.
At least, I don't want to ignore shmem's swap.
> I recommend you to implement madivce() for recharging file caches or
> shmem. Maybe there will use cases to isoalte some files/shmems's charge
> to some special groups.
>
Do you mean something like MADV_MEMCG_DO/DONT_RECHARGE?
If so, I think it would make sense (I think it might be used for anon pages too).
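Just to make the idea concrete, a userspace sketch with a purely
hypothetical advice value (no such MADV_* flag exists today; the name
and number below are made up for illustration):

	#include <sys/mman.h>
	#include <stdio.h>

	/* hypothetical advice value: "don't recharge this mapping" */
	#define MADV_MEMCG_DONT_RECHARGE	100

	static void dont_recharge(void *addr, size_t length)
	{
		/* opt this (file-backed) mapping out of recharge-at-task-move */
		if (madvise(addr, length, MADV_MEMCG_DONT_RECHARGE) < 0)
			perror("madvise");
	}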
Thanks,
Daisuke Nishimura.
> Thanks,
> -Kame
>
>
> > ---
> > include/linux/swap.h | 4 ++++
> > mm/memcontrol.c | 21 +++++++++++++++++----
> > mm/shmem.c | 37 +++++++++++++++++++++++++++++++++++++
> > 3 files changed, 58 insertions(+), 4 deletions(-)
> >
> > diff --git a/include/linux/swap.h b/include/linux/swap.h
> > index 4ec9001..e232653 100644
> > --- a/include/linux/swap.h
> > +++ b/include/linux/swap.h
> > @@ -278,6 +278,10 @@ extern int kswapd_run(int nid);
> > /* linux/mm/shmem.c */
> > extern int shmem_unuse(swp_entry_t entry, struct page *page);
> > #endif /* CONFIG_MMU */
> > +#ifdef CONFIG_CGROUP_MEM_RES_CTLR
> > +extern void mem_cgroup_get_shmem_target(struct inode *inode, pgoff_t pgoff,
> > + struct page **pagep, swp_entry_t *ent);
> > +#endif
> >
> > extern void swap_unplug_io_fn(struct backing_dev_info *, struct page *);
> >
> > diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> > index fe0902c..1c674b0 100644
> > --- a/mm/memcontrol.c
> > +++ b/mm/memcontrol.c
> > @@ -3310,10 +3310,23 @@ static int is_target_pte_for_migration(struct vm_area_struct *vma,
> > if (!pte_present(ptent)) {
> > if (!do_swap_account)
> > return 0;
> > - /* TODO: handle swap of shmes/tmpfs */
> > - if (pte_none(ptent) || pte_file(ptent))
> > - return 0;
> > - else if (is_swap_pte(ptent)) {
> > + if (pte_none(ptent) || pte_file(ptent)) {
> > + if (!vma->vm_file)
> > + return 0;
> > + if (mapping_cap_swap_backed(vma->vm_file->f_mapping)) {
> > + struct inode *inode;
> > + pgoff_t pgoff = 0;
> > +
> > + inode = vma->vm_file->f_path.dentry->d_inode;
> > + if (pte_none(ptent))
> > + pgoff = linear_page_index(vma, addr);
> > + if (pte_file(ptent))
> > + pgoff = pte_to_pgoff(ptent);
> > +
> > + mem_cgroup_get_shmem_target(inode, pgoff,
> > + &page, &ent);
> > + }
> > + } else if (is_swap_pte(ptent)) {
> > ent = pte_to_swp_entry(ptent);
> > if (is_migration_entry(ent))
> > return 0;
> > diff --git a/mm/shmem.c b/mm/shmem.c
> > index 10b7f37..96bc1b7 100644
> > --- a/mm/shmem.c
> > +++ b/mm/shmem.c
> > @@ -2714,3 +2714,40 @@ int shmem_zero_setup(struct vm_area_struct *vma)
> > vma->vm_ops = &shmem_vm_ops;
> > return 0;
> > }
> > +
> > +#ifdef CONFIG_CGROUP_MEM_RES_CTLR
> > +/**
> > + * mem_cgroup_get_shmem_target - find a page or entry assigned to the shmem file
> > + * @inode: the inode to be searched
> > + * @pgoff: the offset to be searched
> > + * @pagep: the pointer for the found page to be stored
> > + * @ent: the pointer for the found swap entry to be stored
> > + *
> > + * If a page is found, refcount of it is incremented. Callers should handle
> > + * these refcount.
> > + */
> > +void mem_cgroup_get_shmem_target(struct inode *inode, pgoff_t pgoff,
> > + struct page **pagep, swp_entry_t *ent)
> > +{
> > + swp_entry_t entry = { .val = 0 }, *ptr;
> > + struct page *page = NULL;
> > + struct shmem_inode_info *info = SHMEM_I(inode);
> > +
> > + if ((pgoff << PAGE_CACHE_SHIFT) >= i_size_read(inode))
> > + goto out;
> > +
> > + spin_lock(&info->lock);
> > + ptr = shmem_swp_entry(info, pgoff, NULL);
> > + if (ptr && ptr->val) {
> > + entry.val = ptr->val;
> > + page = find_get_page(&swapper_space, entry.val);
> > + } else
> > + page = find_get_page(inode->i_mapping, pgoff);
> > + if (ptr)
> > + shmem_swp_unmap(ptr);
> > + spin_unlock(&info->lock);
> > +out:
> > + *pagep = page;
> > + *ent = entry;
> > +}
> > +#endif
> > --
> > 1.5.6.1
> >
> > --
> > To unsubscribe, send a message with 'unsubscribe linux-mm' in
> > the body to majordomo@kvack.org. For more info on Linux MM,
> > see: http://www.linux-mm.org/ .
> > Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
> >
>
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 49+ messages in thread